{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 563148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026635982015384943, "grad_norm": 0.7201167941093445, "learning_rate": 0.0001996, "loss": 9.4233, "step": 500 }, { "epoch": 0.005327196403076989, "grad_norm": 0.15531601011753082, "learning_rate": 0.0003996, "loss": 7.4925, "step": 1000 }, { "epoch": 0.007990794604615483, "grad_norm": 0.2483946532011032, "learning_rate": 0.0005996, "loss": 7.4229, "step": 1500 }, { "epoch": 0.010654392806153977, "grad_norm": 0.5883714556694031, "learning_rate": 0.0007996, "loss": 7.2323, "step": 2000 }, { "epoch": 0.013317991007692471, "grad_norm": 0.7867951989173889, "learning_rate": 0.0009996, "loss": 7.0605, "step": 2500 }, { "epoch": 0.015981589209230967, "grad_norm": 0.8444465398788452, "learning_rate": 0.0009991117421269675, "loss": 6.9306, "step": 3000 }, { "epoch": 0.01864518741076946, "grad_norm": 0.6867188215255737, "learning_rate": 0.00099821991695324, "loss": 6.8518, "step": 3500 }, { "epoch": 0.021308785612307955, "grad_norm": 0.5377506017684937, "learning_rate": 0.0009973280917795124, "loss": 6.7872, "step": 4000 }, { "epoch": 0.02397238381384645, "grad_norm": 0.6717762351036072, "learning_rate": 0.0009964362666057848, "loss": 6.7506, "step": 4500 }, { "epoch": 0.026635982015384942, "grad_norm": 1.001440167427063, "learning_rate": 0.0009955444414320573, "loss": 6.7159, "step": 5000 }, { "epoch": 0.029299580216923436, "grad_norm": 0.5917439460754395, "learning_rate": 0.0009946526162583297, "loss": 6.6817, "step": 5500 }, { "epoch": 0.031963178418461934, "grad_norm": 0.6403864026069641, "learning_rate": 0.0009937607910846021, "loss": 6.6561, "step": 6000 }, { "epoch": 0.034626776620000424, "grad_norm": 0.6477270126342773, "learning_rate": 0.0009928689659108746, "loss": 6.626, "step": 6500 }, { "epoch": 
0.03729037482153892, "grad_norm": 0.8317912817001343, "learning_rate": 0.0009919789243874944, "loss": 6.6155, "step": 7000 }, { "epoch": 0.03995397302307741, "grad_norm": 0.81658536195755, "learning_rate": 0.0009910870992137668, "loss": 6.5983, "step": 7500 }, { "epoch": 0.04261757122461591, "grad_norm": 0.8080710768699646, "learning_rate": 0.0009901952740400395, "loss": 6.5712, "step": 8000 }, { "epoch": 0.045281169426154406, "grad_norm": 0.7330273985862732, "learning_rate": 0.000989303448866312, "loss": 6.5671, "step": 8500 }, { "epoch": 0.0479447676276929, "grad_norm": 0.5048246383666992, "learning_rate": 0.0009884134073429318, "loss": 6.5566, "step": 9000 }, { "epoch": 0.050608365829231394, "grad_norm": 0.60006183385849, "learning_rate": 0.0009875215821692042, "loss": 6.5299, "step": 9500 }, { "epoch": 0.053271964030769885, "grad_norm": 0.7553561329841614, "learning_rate": 0.0009866297569954767, "loss": 6.4984, "step": 10000 }, { "epoch": 0.05593556223230838, "grad_norm": 0.6969451904296875, "learning_rate": 0.000985737931821749, "loss": 6.4697, "step": 10500 }, { "epoch": 0.05859916043384687, "grad_norm": 0.8137800097465515, "learning_rate": 0.0009848461066480215, "loss": 6.4535, "step": 11000 }, { "epoch": 0.06126275863538537, "grad_norm": 0.6285300850868225, "learning_rate": 0.000983954281474294, "loss": 6.4259, "step": 11500 }, { "epoch": 0.06392635683692387, "grad_norm": 0.6301620006561279, "learning_rate": 0.0009830624563005664, "loss": 6.4174, "step": 12000 }, { "epoch": 0.06658995503846236, "grad_norm": 0.49541255831718445, "learning_rate": 0.0009821706311268388, "loss": 6.4134, "step": 12500 }, { "epoch": 0.06925355324000085, "grad_norm": 0.8492177128791809, "learning_rate": 0.000981280589603459, "loss": 6.394, "step": 13000 }, { "epoch": 0.07191715144153935, "grad_norm": 0.6284229755401611, "learning_rate": 0.0009803887644297313, "loss": 6.3861, "step": 13500 }, { "epoch": 0.07458074964307784, "grad_norm": 0.7854110598564148, "learning_rate": 
0.0009794969392560038, "loss": 6.3795, "step": 14000 }, { "epoch": 0.07724434784461634, "grad_norm": 0.6952440738677979, "learning_rate": 0.0009786051140822762, "loss": 6.3679, "step": 14500 }, { "epoch": 0.07990794604615482, "grad_norm": null, "learning_rate": 0.0009777132889085486, "loss": 6.363, "step": 15000 }, { "epoch": 0.08257154424769332, "grad_norm": 0.6554950475692749, "learning_rate": 0.0009768232473851685, "loss": 6.3597, "step": 15500 }, { "epoch": 0.08523514244923182, "grad_norm": 0.6918802261352539, "learning_rate": 0.000975931422211441, "loss": 6.3536, "step": 16000 }, { "epoch": 0.08789874065077032, "grad_norm": 0.749622642993927, "learning_rate": 0.0009750395970377135, "loss": 6.3438, "step": 16500 }, { "epoch": 0.09056233885230881, "grad_norm": 0.7492349743843079, "learning_rate": 0.000974147771863986, "loss": 6.3332, "step": 17000 }, { "epoch": 0.0932259370538473, "grad_norm": 0.6446586847305298, "learning_rate": 0.000973257730340606, "loss": 6.3241, "step": 17500 }, { "epoch": 0.0958895352553858, "grad_norm": 0.8464730978012085, "learning_rate": 0.0009723659051668784, "loss": 6.3194, "step": 18000 }, { "epoch": 0.09855313345692429, "grad_norm": 0.6281186938285828, "learning_rate": 0.0009714740799931508, "loss": 6.309, "step": 18500 }, { "epoch": 0.10121673165846279, "grad_norm": 0.8605656027793884, "learning_rate": 0.0009705822548194233, "loss": 6.2991, "step": 19000 }, { "epoch": 0.10388032986000127, "grad_norm": 0.7788176536560059, "learning_rate": 0.0009696922132960431, "loss": 6.3005, "step": 19500 }, { "epoch": 0.10654392806153977, "grad_norm": 0.6075990200042725, "learning_rate": 0.0009688003881223157, "loss": 6.2843, "step": 20000 }, { "epoch": 0.10920752626307827, "grad_norm": 0.7577124238014221, "learning_rate": 0.0009679085629485881, "loss": 6.2759, "step": 20500 }, { "epoch": 0.11187112446461676, "grad_norm": 0.8228011727333069, "learning_rate": 0.0009670167377748605, "loss": 6.2599, "step": 21000 }, { "epoch": 
0.11453472266615526, "grad_norm": 0.7447388172149658, "learning_rate": 0.0009661266962514804, "loss": 6.2513, "step": 21500 }, { "epoch": 0.11719832086769374, "grad_norm": 0.9003899097442627, "learning_rate": 0.0009652348710777528, "loss": 6.2279, "step": 22000 }, { "epoch": 0.11986191906923224, "grad_norm": 1.0574650764465332, "learning_rate": 0.0009643430459040254, "loss": 6.2027, "step": 22500 }, { "epoch": 0.12252551727077074, "grad_norm": 0.9610631465911865, "learning_rate": 0.0009634512207302978, "loss": 6.1742, "step": 23000 }, { "epoch": 0.12518911547230924, "grad_norm": 1.1535989046096802, "learning_rate": 0.0009625611792069178, "loss": 6.1294, "step": 23500 }, { "epoch": 0.12785271367384773, "grad_norm": 1.1773658990859985, "learning_rate": 0.0009616711376835376, "loss": 6.097, "step": 24000 }, { "epoch": 0.13051631187538623, "grad_norm": 1.2815760374069214, "learning_rate": 0.0009607793125098101, "loss": 6.0634, "step": 24500 }, { "epoch": 0.13317991007692473, "grad_norm": 1.4569323062896729, "learning_rate": 0.0009598874873360826, "loss": 6.0457, "step": 25000 }, { "epoch": 0.1358435082784632, "grad_norm": 1.506204605102539, "learning_rate": 0.000958995662162355, "loss": 6.0186, "step": 25500 }, { "epoch": 0.1385071064800017, "grad_norm": 1.3472563028335571, "learning_rate": 0.0009581056206389749, "loss": 6.0086, "step": 26000 }, { "epoch": 0.1411707046815402, "grad_norm": 1.4809520244598389, "learning_rate": 0.0009572137954652473, "loss": 5.9898, "step": 26500 }, { "epoch": 0.1438343028830787, "grad_norm": 1.5233690738677979, "learning_rate": 0.0009563219702915198, "loss": 5.9781, "step": 27000 }, { "epoch": 0.1464979010846172, "grad_norm": 1.5101710557937622, "learning_rate": 0.0009554301451177923, "loss": 5.9561, "step": 27500 }, { "epoch": 0.14916149928615569, "grad_norm": 1.612731695175171, "learning_rate": 0.0009545401035944123, "loss": 5.9526, "step": 28000 }, { "epoch": 0.15182509748769418, "grad_norm": 1.7018260955810547, "learning_rate": 
0.0009536482784206847, "loss": 5.9338, "step": 28500 }, { "epoch": 0.15448869568923268, "grad_norm": 1.7604913711547852, "learning_rate": 0.0009527564532469571, "loss": 5.9321, "step": 29000 }, { "epoch": 0.15715229389077118, "grad_norm": 1.721969485282898, "learning_rate": 0.0009518646280732296, "loss": 5.9175, "step": 29500 }, { "epoch": 0.15981589209230965, "grad_norm": 1.5823644399642944, "learning_rate": 0.0009509745865498494, "loss": 5.9153, "step": 30000 }, { "epoch": 0.16247949029384814, "grad_norm": 1.7854641675949097, "learning_rate": 0.000950082761376122, "loss": 5.9072, "step": 30500 }, { "epoch": 0.16514308849538664, "grad_norm": 1.7369080781936646, "learning_rate": 0.0009491909362023944, "loss": 5.9029, "step": 31000 }, { "epoch": 0.16780668669692514, "grad_norm": 1.674492597579956, "learning_rate": 0.0009482991110286668, "loss": 5.8841, "step": 31500 }, { "epoch": 0.17047028489846364, "grad_norm": 1.7058457136154175, "learning_rate": 0.0009474072858549393, "loss": 5.8883, "step": 32000 }, { "epoch": 0.17313388310000213, "grad_norm": 1.5853819847106934, "learning_rate": 0.0009465172443315591, "loss": 5.8775, "step": 32500 }, { "epoch": 0.17579748130154063, "grad_norm": 1.7525198459625244, "learning_rate": 0.0009456254191578317, "loss": 5.8717, "step": 33000 }, { "epoch": 0.17846107950307913, "grad_norm": 1.9233468770980835, "learning_rate": 0.0009447335939841041, "loss": 5.8608, "step": 33500 }, { "epoch": 0.18112467770461763, "grad_norm": 1.637522578239441, "learning_rate": 0.0009438417688103765, "loss": 5.8658, "step": 34000 }, { "epoch": 0.1837882759061561, "grad_norm": 1.8892813920974731, "learning_rate": 0.000942949943636649, "loss": 5.8523, "step": 34500 }, { "epoch": 0.1864518741076946, "grad_norm": 1.9510762691497803, "learning_rate": 0.0009420599021132689, "loss": 5.8404, "step": 35000 }, { "epoch": 0.1891154723092331, "grad_norm": 1.7907196283340454, "learning_rate": 0.0009411680769395415, "loss": 5.8396, "step": 35500 }, { "epoch": 
0.1917790705107716, "grad_norm": 1.8805279731750488, "learning_rate": 0.0009402762517658139, "loss": 5.8293, "step": 36000 }, { "epoch": 0.19444266871231008, "grad_norm": 1.7272233963012695, "learning_rate": 0.0009393844265920863, "loss": 5.8268, "step": 36500 }, { "epoch": 0.19710626691384858, "grad_norm": 2.035203695297241, "learning_rate": 0.0009384926014183588, "loss": 5.8209, "step": 37000 }, { "epoch": 0.19976986511538708, "grad_norm": 1.8728936910629272, "learning_rate": 0.0009376007762446312, "loss": 5.8165, "step": 37500 }, { "epoch": 0.20243346331692558, "grad_norm": 1.9231390953063965, "learning_rate": 0.0009367089510709037, "loss": 5.8149, "step": 38000 }, { "epoch": 0.20509706151846407, "grad_norm": 1.7793642282485962, "learning_rate": 0.0009358171258971762, "loss": 5.8132, "step": 38500 }, { "epoch": 0.20776065972000254, "grad_norm": 1.7759062051773071, "learning_rate": 0.000934927084373796, "loss": 5.8065, "step": 39000 }, { "epoch": 0.21042425792154104, "grad_norm": 1.7528033256530762, "learning_rate": 0.0009340352592000685, "loss": 5.8023, "step": 39500 }, { "epoch": 0.21308785612307954, "grad_norm": 1.8702290058135986, "learning_rate": 0.0009331434340263409, "loss": 5.7909, "step": 40000 }, { "epoch": 0.21575145432461804, "grad_norm": 1.9332852363586426, "learning_rate": 0.000932253392502961, "loss": 5.7937, "step": 40500 }, { "epoch": 0.21841505252615653, "grad_norm": 1.8513240814208984, "learning_rate": 0.0009313615673292334, "loss": 5.7865, "step": 41000 }, { "epoch": 0.22107865072769503, "grad_norm": 1.8357592821121216, "learning_rate": 0.0009304697421555058, "loss": 5.7859, "step": 41500 }, { "epoch": 0.22374224892923353, "grad_norm": 1.7558057308197021, "learning_rate": 0.0009295779169817783, "loss": 5.7781, "step": 42000 }, { "epoch": 0.22640584713077203, "grad_norm": 1.7014683485031128, "learning_rate": 0.0009286860918080507, "loss": 5.7703, "step": 42500 }, { "epoch": 0.22906944533231052, "grad_norm": 1.8377306461334229, "learning_rate": 
0.0009277942666343233, "loss": 5.7775, "step": 43000 }, { "epoch": 0.231733043533849, "grad_norm": 1.7670570611953735, "learning_rate": 0.0009269024414605957, "loss": 5.7606, "step": 43500 }, { "epoch": 0.2343966417353875, "grad_norm": 1.907322883605957, "learning_rate": 0.0009260106162868681, "loss": 5.7595, "step": 44000 }, { "epoch": 0.237060239936926, "grad_norm": 1.9192357063293457, "learning_rate": 0.000925120574763488, "loss": 5.7574, "step": 44500 }, { "epoch": 0.23972383813846448, "grad_norm": 1.801256775856018, "learning_rate": 0.0009242287495897604, "loss": 5.7623, "step": 45000 }, { "epoch": 0.24238743634000298, "grad_norm": 1.7864599227905273, "learning_rate": 0.000923336924416033, "loss": 5.7464, "step": 45500 }, { "epoch": 0.24505103454154148, "grad_norm": 2.0881760120391846, "learning_rate": 0.0009224450992423054, "loss": 5.7492, "step": 46000 }, { "epoch": 0.24771463274307998, "grad_norm": 2.0729496479034424, "learning_rate": 0.0009215550577189252, "loss": 5.7464, "step": 46500 }, { "epoch": 0.2503782309446185, "grad_norm": 1.807739496231079, "learning_rate": 0.0009206632325451977, "loss": 5.7391, "step": 47000 }, { "epoch": 0.25304182914615697, "grad_norm": 1.7898356914520264, "learning_rate": 0.0009197731910218176, "loss": 5.7399, "step": 47500 }, { "epoch": 0.25570542734769547, "grad_norm": 1.6668163537979126, "learning_rate": 0.0009188813658480901, "loss": 5.7316, "step": 48000 }, { "epoch": 0.25836902554923397, "grad_norm": 1.743788242340088, "learning_rate": 0.0009179895406743626, "loss": 5.7251, "step": 48500 }, { "epoch": 0.26103262375077246, "grad_norm": 1.7427009344100952, "learning_rate": 0.000917097715500635, "loss": 5.7231, "step": 49000 }, { "epoch": 0.26369622195231096, "grad_norm": 1.8911422491073608, "learning_rate": 0.0009162058903269075, "loss": 5.7272, "step": 49500 }, { "epoch": 0.26635982015384946, "grad_norm": 1.7783831357955933, "learning_rate": 0.0009153140651531799, "loss": 5.7193, "step": 50000 }, { "epoch": 
0.2690234183553879, "grad_norm": 1.75882089138031, "learning_rate": 0.0009144222399794523, "loss": 5.7233, "step": 50500 }, { "epoch": 0.2716870165569264, "grad_norm": 1.8454984426498413, "learning_rate": 0.0009135304148057249, "loss": 5.7163, "step": 51000 }, { "epoch": 0.2743506147584649, "grad_norm": 1.8908592462539673, "learning_rate": 0.0009126403732823447, "loss": 5.7175, "step": 51500 }, { "epoch": 0.2770142129600034, "grad_norm": 1.6938859224319458, "learning_rate": 0.0009117485481086172, "loss": 5.7113, "step": 52000 }, { "epoch": 0.2796778111615419, "grad_norm": 1.8087745904922485, "learning_rate": 0.0009108567229348896, "loss": 5.7104, "step": 52500 }, { "epoch": 0.2823414093630804, "grad_norm": 1.9441509246826172, "learning_rate": 0.000909964897761162, "loss": 5.7006, "step": 53000 }, { "epoch": 0.2850050075646189, "grad_norm": 2.016289710998535, "learning_rate": 0.000909074856237782, "loss": 5.7084, "step": 53500 }, { "epoch": 0.2876686057661574, "grad_norm": 1.7924542427062988, "learning_rate": 0.0009081830310640544, "loss": 5.6967, "step": 54000 }, { "epoch": 0.2903322039676959, "grad_norm": 1.8578925132751465, "learning_rate": 0.0009072912058903269, "loss": 5.7058, "step": 54500 }, { "epoch": 0.2929958021692344, "grad_norm": 1.8592642545700073, "learning_rate": 0.0009063993807165993, "loss": 5.699, "step": 55000 }, { "epoch": 0.2956594003707729, "grad_norm": 1.726891040802002, "learning_rate": 0.0009055075555428717, "loss": 5.6873, "step": 55500 }, { "epoch": 0.29832299857231137, "grad_norm": 1.8885732889175415, "learning_rate": 0.0009046175140194918, "loss": 5.6859, "step": 56000 }, { "epoch": 0.30098659677384987, "grad_norm": 1.6777235269546509, "learning_rate": 0.0009037256888457643, "loss": 5.6843, "step": 56500 }, { "epoch": 0.30365019497538837, "grad_norm": 1.824777364730835, "learning_rate": 0.0009028338636720367, "loss": 5.6865, "step": 57000 }, { "epoch": 0.30631379317692686, "grad_norm": 1.6151602268218994, "learning_rate": 
0.0009019420384983091, "loss": 5.6864, "step": 57500 }, { "epoch": 0.30897739137846536, "grad_norm": 1.7518750429153442, "learning_rate": 0.0009010502133245816, "loss": 5.6835, "step": 58000 }, { "epoch": 0.31164098958000386, "grad_norm": 1.9652341604232788, "learning_rate": 0.0009001583881508541, "loss": 5.6778, "step": 58500 }, { "epoch": 0.31430458778154235, "grad_norm": 1.8396164178848267, "learning_rate": 0.0008992665629771265, "loss": 5.6805, "step": 59000 }, { "epoch": 0.3169681859830808, "grad_norm": 1.7397726774215698, "learning_rate": 0.000898374737803399, "loss": 5.6809, "step": 59500 }, { "epoch": 0.3196317841846193, "grad_norm": 1.6550874710083008, "learning_rate": 0.0008974846962800188, "loss": 5.6713, "step": 60000 }, { "epoch": 0.3222953823861578, "grad_norm": 1.7428010702133179, "learning_rate": 0.0008965928711062913, "loss": 5.6777, "step": 60500 }, { "epoch": 0.3249589805876963, "grad_norm": 1.7465174198150635, "learning_rate": 0.0008957028295829112, "loss": 5.6668, "step": 61000 }, { "epoch": 0.3276225787892348, "grad_norm": 1.719190239906311, "learning_rate": 0.0008948110044091838, "loss": 5.6736, "step": 61500 }, { "epoch": 0.3302861769907733, "grad_norm": 1.6879175901412964, "learning_rate": 0.0008939191792354562, "loss": 5.6585, "step": 62000 }, { "epoch": 0.3329497751923118, "grad_norm": 1.6741931438446045, "learning_rate": 0.0008930273540617286, "loss": 5.6584, "step": 62500 }, { "epoch": 0.3356133733938503, "grad_norm": 1.8733186721801758, "learning_rate": 0.0008921355288880011, "loss": 5.6655, "step": 63000 }, { "epoch": 0.3382769715953888, "grad_norm": 1.8366929292678833, "learning_rate": 0.0008912454873646209, "loss": 5.6551, "step": 63500 }, { "epoch": 0.3409405697969273, "grad_norm": 1.7783548831939697, "learning_rate": 0.0008903536621908935, "loss": 5.6598, "step": 64000 }, { "epoch": 0.34360416799846577, "grad_norm": 1.739394187927246, "learning_rate": 0.0008894618370171659, "loss": 5.6568, "step": 64500 }, { "epoch": 
0.34626776620000427, "grad_norm": 1.706986427307129, "learning_rate": 0.0008885700118434383, "loss": 5.6577, "step": 65000 }, { "epoch": 0.34893136440154277, "grad_norm": 1.7595592737197876, "learning_rate": 0.0008876781866697108, "loss": 5.6504, "step": 65500 }, { "epoch": 0.35159496260308126, "grad_norm": 1.7445604801177979, "learning_rate": 0.0008867863614959832, "loss": 5.6457, "step": 66000 }, { "epoch": 0.35425856080461976, "grad_norm": 1.7039164304733276, "learning_rate": 0.0008858945363222557, "loss": 5.652, "step": 66500 }, { "epoch": 0.35692215900615826, "grad_norm": 1.7117230892181396, "learning_rate": 0.0008850027111485282, "loss": 5.6456, "step": 67000 }, { "epoch": 0.35958575720769675, "grad_norm": 1.8759076595306396, "learning_rate": 0.000884112669625148, "loss": 5.6504, "step": 67500 }, { "epoch": 0.36224935540923525, "grad_norm": 1.5524253845214844, "learning_rate": 0.0008832208444514205, "loss": 5.6426, "step": 68000 }, { "epoch": 0.36491295361077375, "grad_norm": 1.648575782775879, "learning_rate": 0.0008823290192776929, "loss": 5.6401, "step": 68500 }, { "epoch": 0.3675765518123122, "grad_norm": 1.6062759160995483, "learning_rate": 0.0008814371941039654, "loss": 5.6466, "step": 69000 }, { "epoch": 0.3702401500138507, "grad_norm": 1.5237386226654053, "learning_rate": 0.0008805471525805854, "loss": 5.6381, "step": 69500 }, { "epoch": 0.3729037482153892, "grad_norm": 1.7291427850723267, "learning_rate": 0.0008796553274068578, "loss": 5.6337, "step": 70000 }, { "epoch": 0.3755673464169277, "grad_norm": 1.875213623046875, "learning_rate": 0.0008787635022331303, "loss": 5.6356, "step": 70500 }, { "epoch": 0.3782309446184662, "grad_norm": 1.8453514575958252, "learning_rate": 0.0008778716770594027, "loss": 5.6348, "step": 71000 }, { "epoch": 0.3808945428200047, "grad_norm": 1.725234866142273, "learning_rate": 0.0008769816355360227, "loss": 5.6318, "step": 71500 }, { "epoch": 0.3835581410215432, "grad_norm": 1.7739455699920654, "learning_rate": 
0.0008760898103622951, "loss": 5.6296, "step": 72000 }, { "epoch": 0.3862217392230817, "grad_norm": 1.683827519416809, "learning_rate": 0.0008751979851885675, "loss": 5.6357, "step": 72500 }, { "epoch": 0.38888533742462017, "grad_norm": 1.5576590299606323, "learning_rate": 0.00087430616001484, "loss": 5.63, "step": 73000 }, { "epoch": 0.39154893562615867, "grad_norm": 1.666030764579773, "learning_rate": 0.0008734161184914598, "loss": 5.6178, "step": 73500 }, { "epoch": 0.39421253382769716, "grad_norm": 1.618916392326355, "learning_rate": 0.0008725242933177324, "loss": 5.6273, "step": 74000 }, { "epoch": 0.39687613202923566, "grad_norm": 1.69428551197052, "learning_rate": 0.0008716324681440048, "loss": 5.6188, "step": 74500 }, { "epoch": 0.39953973023077416, "grad_norm": 1.8516380786895752, "learning_rate": 0.0008707406429702772, "loss": 5.6235, "step": 75000 }, { "epoch": 0.40220332843231266, "grad_norm": 1.505953311920166, "learning_rate": 0.0008698506014468972, "loss": 5.6175, "step": 75500 }, { "epoch": 0.40486692663385115, "grad_norm": 1.5639010667800903, "learning_rate": 0.0008689587762731696, "loss": 5.6213, "step": 76000 }, { "epoch": 0.40753052483538965, "grad_norm": 1.7431727647781372, "learning_rate": 0.0008680669510994421, "loss": 5.6198, "step": 76500 }, { "epoch": 0.41019412303692815, "grad_norm": 1.676757574081421, "learning_rate": 0.0008671751259257146, "loss": 5.6252, "step": 77000 }, { "epoch": 0.41285772123846665, "grad_norm": 1.6216061115264893, "learning_rate": 0.0008662850844023345, "loss": 5.6211, "step": 77500 }, { "epoch": 0.4155213194400051, "grad_norm": 1.6766453981399536, "learning_rate": 0.0008653932592286069, "loss": 5.62, "step": 78000 }, { "epoch": 0.4181849176415436, "grad_norm": 1.6790215969085693, "learning_rate": 0.0008645014340548793, "loss": 5.6093, "step": 78500 }, { "epoch": 0.4208485158430821, "grad_norm": 1.8037434816360474, "learning_rate": 0.0008636096088811518, "loss": 5.6085, "step": 79000 }, { "epoch": 
0.4235121140446206, "grad_norm": 1.6324502229690552, "learning_rate": 0.0008627195673577717, "loss": 5.6031, "step": 79500 }, { "epoch": 0.4261757122461591, "grad_norm": 1.6987981796264648, "learning_rate": 0.0008618277421840443, "loss": 5.6116, "step": 80000 }, { "epoch": 0.4288393104476976, "grad_norm": 1.6692321300506592, "learning_rate": 0.0008609359170103167, "loss": 5.6062, "step": 80500 }, { "epoch": 0.43150290864923607, "grad_norm": 1.6387773752212524, "learning_rate": 0.0008600440918365891, "loss": 5.6087, "step": 81000 }, { "epoch": 0.43416650685077457, "grad_norm": 1.792861819267273, "learning_rate": 0.000859154050313209, "loss": 5.608, "step": 81500 }, { "epoch": 0.43683010505231307, "grad_norm": 1.676076889038086, "learning_rate": 0.0008582622251394815, "loss": 5.6056, "step": 82000 }, { "epoch": 0.43949370325385156, "grad_norm": 1.772159218788147, "learning_rate": 0.000857370399965754, "loss": 5.6015, "step": 82500 }, { "epoch": 0.44215730145539006, "grad_norm": 1.7022145986557007, "learning_rate": 0.0008564785747920264, "loss": 5.6056, "step": 83000 }, { "epoch": 0.44482089965692856, "grad_norm": 1.6428086757659912, "learning_rate": 0.0008555885332686463, "loss": 5.596, "step": 83500 }, { "epoch": 0.44748449785846706, "grad_norm": 1.6144286394119263, "learning_rate": 0.0008546967080949187, "loss": 5.5974, "step": 84000 }, { "epoch": 0.45014809606000555, "grad_norm": 1.5918573141098022, "learning_rate": 0.0008538048829211912, "loss": 5.604, "step": 84500 }, { "epoch": 0.45281169426154405, "grad_norm": 1.7871578931808472, "learning_rate": 0.0008529130577474637, "loss": 5.5951, "step": 85000 }, { "epoch": 0.45547529246308255, "grad_norm": 1.6631501913070679, "learning_rate": 0.0008520230162240836, "loss": 5.6014, "step": 85500 }, { "epoch": 0.45813889066462105, "grad_norm": 1.6243520975112915, "learning_rate": 0.0008511311910503561, "loss": 5.5942, "step": 86000 }, { "epoch": 0.46080248886615954, "grad_norm": 1.5686520338058472, "learning_rate": 
0.0008502393658766285, "loss": 5.5981, "step": 86500 }, { "epoch": 0.463466087067698, "grad_norm": 1.7691351175308228, "learning_rate": 0.0008493475407029009, "loss": 5.5984, "step": 87000 }, { "epoch": 0.4661296852692365, "grad_norm": 1.6885465383529663, "learning_rate": 0.0008484574991795209, "loss": 5.5851, "step": 87500 }, { "epoch": 0.468793283470775, "grad_norm": 1.6488664150238037, "learning_rate": 0.0008475656740057933, "loss": 5.5831, "step": 88000 }, { "epoch": 0.4714568816723135, "grad_norm": 1.5736653804779053, "learning_rate": 0.0008466738488320658, "loss": 5.582, "step": 88500 }, { "epoch": 0.474120479873852, "grad_norm": 1.7857962846755981, "learning_rate": 0.0008457820236583382, "loss": 5.5901, "step": 89000 }, { "epoch": 0.47678407807539047, "grad_norm": 1.7936720848083496, "learning_rate": 0.0008448919821349581, "loss": 5.5822, "step": 89500 }, { "epoch": 0.47944767627692897, "grad_norm": 1.546919345855713, "learning_rate": 0.0008440001569612306, "loss": 5.581, "step": 90000 }, { "epoch": 0.48211127447846747, "grad_norm": 1.778827428817749, "learning_rate": 0.000843108331787503, "loss": 5.5922, "step": 90500 }, { "epoch": 0.48477487268000596, "grad_norm": 1.495205044746399, "learning_rate": 0.0008422165066137755, "loss": 5.5821, "step": 91000 }, { "epoch": 0.48743847088154446, "grad_norm": 1.6151823997497559, "learning_rate": 0.0008413264650903954, "loss": 5.5801, "step": 91500 }, { "epoch": 0.49010206908308296, "grad_norm": 1.7652384042739868, "learning_rate": 0.0008404346399166679, "loss": 5.5785, "step": 92000 }, { "epoch": 0.49276566728462146, "grad_norm": 1.7062280178070068, "learning_rate": 0.0008395428147429404, "loss": 5.5784, "step": 92500 }, { "epoch": 0.49542926548615995, "grad_norm": 1.5986762046813965, "learning_rate": 0.0008386509895692128, "loss": 5.5814, "step": 93000 }, { "epoch": 0.49809286368769845, "grad_norm": 1.672861933708191, "learning_rate": 0.0008377609480458327, "loss": 5.5743, "step": 93500 }, { "epoch": 
0.500756461889237, "grad_norm": 1.8104331493377686, "learning_rate": 0.0008368691228721051, "loss": 5.5709, "step": 94000 }, { "epoch": 0.5034200600907754, "grad_norm": 1.8253047466278076, "learning_rate": 0.0008359772976983776, "loss": 5.5642, "step": 94500 }, { "epoch": 0.5060836582923139, "grad_norm": 1.604465126991272, "learning_rate": 0.0008350854725246501, "loss": 5.5691, "step": 95000 }, { "epoch": 0.5087472564938524, "grad_norm": 1.7985742092132568, "learning_rate": 0.00083419543100127, "loss": 5.5611, "step": 95500 }, { "epoch": 0.5114108546953909, "grad_norm": 1.652733325958252, "learning_rate": 0.0008333036058275424, "loss": 5.5577, "step": 96000 }, { "epoch": 0.5140744528969294, "grad_norm": 1.8247016668319702, "learning_rate": 0.0008324117806538148, "loss": 5.5557, "step": 96500 }, { "epoch": 0.5167380510984679, "grad_norm": 1.784303069114685, "learning_rate": 0.0008315199554800873, "loss": 5.5554, "step": 97000 }, { "epoch": 0.5194016493000064, "grad_norm": 1.705725073814392, "learning_rate": 0.0008306299139567072, "loss": 5.5545, "step": 97500 }, { "epoch": 0.5220652475015449, "grad_norm": 1.8760724067687988, "learning_rate": 0.0008297380887829798, "loss": 5.5512, "step": 98000 }, { "epoch": 0.5247288457030834, "grad_norm": 1.7412986755371094, "learning_rate": 0.0008288462636092522, "loss": 5.5522, "step": 98500 }, { "epoch": 0.5273924439046219, "grad_norm": 2.0051610469818115, "learning_rate": 0.0008279544384355246, "loss": 5.5403, "step": 99000 }, { "epoch": 0.5300560421061604, "grad_norm": 1.6867221593856812, "learning_rate": 0.0008270643969121445, "loss": 5.544, "step": 99500 }, { "epoch": 0.5327196403076989, "grad_norm": 1.838189721107483, "learning_rate": 0.0008261725717384169, "loss": 5.5396, "step": 100000 }, { "epoch": 0.5353832385092374, "grad_norm": 1.655271291732788, "learning_rate": 0.0008252807465646895, "loss": 5.5358, "step": 100500 }, { "epoch": 0.5380468367107758, "grad_norm": 1.8378669023513794, "learning_rate": 
0.0008243889213909619, "loss": 5.5419, "step": 101000 }, { "epoch": 0.5407104349123143, "grad_norm": 1.7509022951126099, "learning_rate": 0.0008234988798675818, "loss": 5.523, "step": 101500 }, { "epoch": 0.5433740331138528, "grad_norm": 1.9558390378952026, "learning_rate": 0.0008226070546938542, "loss": 5.5322, "step": 102000 }, { "epoch": 0.5460376313153913, "grad_norm": 2.0113561153411865, "learning_rate": 0.0008217152295201266, "loss": 5.5303, "step": 102500 }, { "epoch": 0.5487012295169298, "grad_norm": 1.989725112915039, "learning_rate": 0.0008208234043463993, "loss": 5.5257, "step": 103000 }, { "epoch": 0.5513648277184683, "grad_norm": 1.702812671661377, "learning_rate": 0.0008199315791726717, "loss": 5.5327, "step": 103500 }, { "epoch": 0.5540284259200068, "grad_norm": 1.8519411087036133, "learning_rate": 0.0008190397539989441, "loss": 5.5272, "step": 104000 }, { "epoch": 0.5566920241215453, "grad_norm": 1.856350064277649, "learning_rate": 0.0008181479288252166, "loss": 5.5211, "step": 104500 }, { "epoch": 0.5593556223230838, "grad_norm": 1.7010074853897095, "learning_rate": 0.000817256103651489, "loss": 5.5287, "step": 105000 }, { "epoch": 0.5620192205246223, "grad_norm": 1.6479413509368896, "learning_rate": 0.000816366062128109, "loss": 5.5279, "step": 105500 }, { "epoch": 0.5646828187261608, "grad_norm": 1.9108966588974, "learning_rate": 0.0008154742369543814, "loss": 5.5203, "step": 106000 }, { "epoch": 0.5673464169276993, "grad_norm": 1.9142667055130005, "learning_rate": 0.0008145824117806538, "loss": 5.5189, "step": 106500 }, { "epoch": 0.5700100151292378, "grad_norm": 1.8495519161224365, "learning_rate": 0.0008136905866069263, "loss": 5.5196, "step": 107000 }, { "epoch": 0.5726736133307763, "grad_norm": 2.063087224960327, "learning_rate": 0.0008128005450835461, "loss": 5.5132, "step": 107500 }, { "epoch": 0.5753372115323148, "grad_norm": 2.0009357929229736, "learning_rate": 0.0008119087199098186, "loss": 5.5177, "step": 108000 }, { "epoch": 
0.5780008097338533, "grad_norm": 2.0125739574432373, "learning_rate": 0.0008110168947360911, "loss": 5.5112, "step": 108500 }, { "epoch": 0.5806644079353918, "grad_norm": 1.8415509462356567, "learning_rate": 0.0008101250695623635, "loss": 5.509, "step": 109000 }, { "epoch": 0.5833280061369303, "grad_norm": 1.7688753604888916, "learning_rate": 0.0008092350280389835, "loss": 5.5032, "step": 109500 }, { "epoch": 0.5859916043384688, "grad_norm": 1.8354215621948242, "learning_rate": 0.000808343202865256, "loss": 5.5129, "step": 110000 }, { "epoch": 0.5886552025400072, "grad_norm": 2.036357879638672, "learning_rate": 0.0008074513776915284, "loss": 5.5043, "step": 110500 }, { "epoch": 0.5913188007415457, "grad_norm": 1.8382165431976318, "learning_rate": 0.0008065595525178009, "loss": 5.5065, "step": 111000 }, { "epoch": 0.5939823989430842, "grad_norm": 2.001885175704956, "learning_rate": 0.0008056695109944208, "loss": 5.507, "step": 111500 }, { "epoch": 0.5966459971446227, "grad_norm": 1.872819423675537, "learning_rate": 0.0008047776858206932, "loss": 5.5081, "step": 112000 }, { "epoch": 0.5993095953461612, "grad_norm": 1.8629109859466553, "learning_rate": 0.0008038858606469656, "loss": 5.5078, "step": 112500 }, { "epoch": 0.6019731935476997, "grad_norm": 2.0044994354248047, "learning_rate": 0.0008029940354732381, "loss": 5.498, "step": 113000 }, { "epoch": 0.6046367917492382, "grad_norm": 1.9607182741165161, "learning_rate": 0.000802103993949858, "loss": 5.5092, "step": 113500 }, { "epoch": 0.6073003899507767, "grad_norm": 1.9605486392974854, "learning_rate": 0.0008012121687761305, "loss": 5.5013, "step": 114000 }, { "epoch": 0.6099639881523152, "grad_norm": 1.999872088432312, "learning_rate": 0.0008003203436024029, "loss": 5.497, "step": 114500 }, { "epoch": 0.6126275863538537, "grad_norm": 1.7834984064102173, "learning_rate": 0.0007994285184286753, "loss": 5.5001, "step": 115000 }, { "epoch": 0.6152911845553922, "grad_norm": 1.9666252136230469, "learning_rate": 
0.0007985384769052953, "loss": 5.5004, "step": 115500 }, { "epoch": 0.6179547827569307, "grad_norm": 1.810936450958252, "learning_rate": 0.0007976484353819152, "loss": 5.4934, "step": 116000 }, { "epoch": 0.6206183809584692, "grad_norm": 1.8183609247207642, "learning_rate": 0.0007967566102081877, "loss": 5.4999, "step": 116500 }, { "epoch": 0.6232819791600077, "grad_norm": 2.1452646255493164, "learning_rate": 0.0007958647850344601, "loss": 5.4937, "step": 117000 }, { "epoch": 0.6259455773615462, "grad_norm": 1.984305739402771, "learning_rate": 0.0007949729598607326, "loss": 5.494, "step": 117500 }, { "epoch": 0.6286091755630847, "grad_norm": 2.1507790088653564, "learning_rate": 0.000794081134687005, "loss": 5.4915, "step": 118000 }, { "epoch": 0.6312727737646232, "grad_norm": 1.821390151977539, "learning_rate": 0.0007931910931636249, "loss": 5.4948, "step": 118500 }, { "epoch": 0.6339363719661616, "grad_norm": 1.901696801185608, "learning_rate": 0.0007922992679898974, "loss": 5.4944, "step": 119000 }, { "epoch": 0.6365999701677001, "grad_norm": 2.214447259902954, "learning_rate": 0.0007914074428161698, "loss": 5.4901, "step": 119500 }, { "epoch": 0.6392635683692386, "grad_norm": 1.8764078617095947, "learning_rate": 0.0007905156176424423, "loss": 5.4837, "step": 120000 }, { "epoch": 0.6419271665707771, "grad_norm": 1.9411547183990479, "learning_rate": 0.0007896237924687147, "loss": 5.4889, "step": 120500 }, { "epoch": 0.6445907647723156, "grad_norm": 1.8323979377746582, "learning_rate": 0.0007887319672949871, "loss": 5.49, "step": 121000 }, { "epoch": 0.6472543629738541, "grad_norm": 1.8666421175003052, "learning_rate": 0.0007878401421212597, "loss": 5.4911, "step": 121500 }, { "epoch": 0.6499179611753926, "grad_norm": 2.0501484870910645, "learning_rate": 0.0007869483169475321, "loss": 5.4894, "step": 122000 }, { "epoch": 0.6525815593769311, "grad_norm": 1.8784074783325195, "learning_rate": 0.0007860600590744995, "loss": 5.4911, "step": 122500 }, { "epoch": 
0.6552451575784696, "grad_norm": 1.9021259546279907, "learning_rate": 0.000785168233900772, "loss": 5.4844, "step": 123000 }, { "epoch": 0.6579087557800081, "grad_norm": 2.053755283355713, "learning_rate": 0.0007842764087270444, "loss": 5.4884, "step": 123500 }, { "epoch": 0.6605723539815466, "grad_norm": 1.9320204257965088, "learning_rate": 0.0007833845835533169, "loss": 5.4822, "step": 124000 }, { "epoch": 0.6632359521830851, "grad_norm": 1.793219804763794, "learning_rate": 0.0007824945420299368, "loss": 5.4834, "step": 124500 }, { "epoch": 0.6658995503846236, "grad_norm": 2.0100185871124268, "learning_rate": 0.0007816027168562092, "loss": 5.4872, "step": 125000 }, { "epoch": 0.6685631485861621, "grad_norm": 2.0543274879455566, "learning_rate": 0.0007807108916824816, "loss": 5.4826, "step": 125500 }, { "epoch": 0.6712267467877006, "grad_norm": 1.9622262716293335, "learning_rate": 0.0007798190665087542, "loss": 5.4809, "step": 126000 }, { "epoch": 0.673890344989239, "grad_norm": 1.918966293334961, "learning_rate": 0.0007789272413350267, "loss": 5.4823, "step": 126500 }, { "epoch": 0.6765539431907776, "grad_norm": 1.8516751527786255, "learning_rate": 0.0007780354161612992, "loss": 5.4786, "step": 127000 }, { "epoch": 0.679217541392316, "grad_norm": 1.8985280990600586, "learning_rate": 0.000777145374637919, "loss": 5.4762, "step": 127500 }, { "epoch": 0.6818811395938545, "grad_norm": 2.030210018157959, "learning_rate": 0.0007762535494641915, "loss": 5.4786, "step": 128000 }, { "epoch": 0.684544737795393, "grad_norm": 1.9270013570785522, "learning_rate": 0.0007753617242904639, "loss": 5.4801, "step": 128500 }, { "epoch": 0.6872083359969315, "grad_norm": 1.7799612283706665, "learning_rate": 0.0007744698991167364, "loss": 5.4715, "step": 129000 }, { "epoch": 0.68987193419847, "grad_norm": 2.1841835975646973, "learning_rate": 0.0007735780739430089, "loss": 5.4726, "step": 129500 }, { "epoch": 0.6925355324000085, "grad_norm": 1.970680594444275, "learning_rate": 
0.0007726862487692813, "loss": 5.4751, "step": 130000 }, { "epoch": 0.695199130601547, "grad_norm": 2.1457014083862305, "learning_rate": 0.0007717944235955537, "loss": 5.4754, "step": 130500 }, { "epoch": 0.6978627288030855, "grad_norm": 1.8095160722732544, "learning_rate": 0.0007709025984218262, "loss": 5.4723, "step": 131000 }, { "epoch": 0.700526327004624, "grad_norm": 1.8374313116073608, "learning_rate": 0.000770012556898446, "loss": 5.4774, "step": 131500 }, { "epoch": 0.7031899252061625, "grad_norm": 1.8603581190109253, "learning_rate": 0.0007691207317247186, "loss": 5.477, "step": 132000 }, { "epoch": 0.705853523407701, "grad_norm": 1.9838221073150635, "learning_rate": 0.0007682306902013385, "loss": 5.4732, "step": 132500 }, { "epoch": 0.7085171216092395, "grad_norm": 1.9500114917755127, "learning_rate": 0.000767338865027611, "loss": 5.4742, "step": 133000 }, { "epoch": 0.711180719810778, "grad_norm": 1.9748975038528442, "learning_rate": 0.0007664470398538834, "loss": 5.4675, "step": 133500 }, { "epoch": 0.7138443180123165, "grad_norm": 1.7860807180404663, "learning_rate": 0.0007655552146801558, "loss": 5.4711, "step": 134000 }, { "epoch": 0.716507916213855, "grad_norm": 2.076504945755005, "learning_rate": 0.0007646633895064284, "loss": 5.4691, "step": 134500 }, { "epoch": 0.7191715144153935, "grad_norm": 2.1392953395843506, "learning_rate": 0.0007637715643327008, "loss": 5.4763, "step": 135000 }, { "epoch": 0.721835112616932, "grad_norm": 1.7750567197799683, "learning_rate": 0.0007628797391589732, "loss": 5.4624, "step": 135500 }, { "epoch": 0.7244987108184705, "grad_norm": 2.1746318340301514, "learning_rate": 0.0007619879139852457, "loss": 5.4632, "step": 136000 }, { "epoch": 0.727162309020009, "grad_norm": 1.9568692445755005, "learning_rate": 0.0007610978724618655, "loss": 5.4702, "step": 136500 }, { "epoch": 0.7298259072215475, "grad_norm": 1.940618634223938, "learning_rate": 0.0007602060472881381, "loss": 5.4682, "step": 137000 }, { "epoch": 
0.7324895054230859, "grad_norm": 2.0432674884796143, "learning_rate": 0.0007593142221144105, "loss": 5.4661, "step": 137500 }, { "epoch": 0.7351531036246244, "grad_norm": 1.989637017250061, "learning_rate": 0.0007584223969406829, "loss": 5.4643, "step": 138000 }, { "epoch": 0.7378167018261629, "grad_norm": 1.7842735052108765, "learning_rate": 0.0007575305717669554, "loss": 5.4633, "step": 138500 }, { "epoch": 0.7404803000277014, "grad_norm": 2.000488519668579, "learning_rate": 0.0007566405302435752, "loss": 5.4645, "step": 139000 }, { "epoch": 0.7431438982292399, "grad_norm": 1.9219857454299927, "learning_rate": 0.0007557487050698478, "loss": 5.4587, "step": 139500 }, { "epoch": 0.7458074964307784, "grad_norm": 1.8964563608169556, "learning_rate": 0.0007548568798961202, "loss": 5.4594, "step": 140000 }, { "epoch": 0.7484710946323169, "grad_norm": 2.0744431018829346, "learning_rate": 0.0007539650547223926, "loss": 5.4677, "step": 140500 }, { "epoch": 0.7511346928338554, "grad_norm": 2.0807344913482666, "learning_rate": 0.0007530732295486651, "loss": 5.4594, "step": 141000 }, { "epoch": 0.7537982910353939, "grad_norm": 1.9063740968704224, "learning_rate": 0.0007521814043749375, "loss": 5.4614, "step": 141500 }, { "epoch": 0.7564618892369324, "grad_norm": 1.8823788166046143, "learning_rate": 0.0007512913628515576, "loss": 5.4612, "step": 142000 }, { "epoch": 0.7591254874384709, "grad_norm": 2.027939558029175, "learning_rate": 0.00075039953767783, "loss": 5.457, "step": 142500 }, { "epoch": 0.7617890856400094, "grad_norm": 1.956814169883728, "learning_rate": 0.0007495077125041024, "loss": 5.4561, "step": 143000 }, { "epoch": 0.7644526838415479, "grad_norm": 1.8203577995300293, "learning_rate": 0.0007486158873303749, "loss": 5.4612, "step": 143500 }, { "epoch": 0.7671162820430864, "grad_norm": 2.0049407482147217, "learning_rate": 0.0007477240621566473, "loss": 5.4572, "step": 144000 }, { "epoch": 0.7697798802446248, "grad_norm": 2.0092926025390625, "learning_rate": 
0.0007468322369829198, "loss": 5.4566, "step": 144500 }, { "epoch": 0.7724434784461633, "grad_norm": 1.9448853731155396, "learning_rate": 0.0007459421954595397, "loss": 5.4567, "step": 145000 }, { "epoch": 0.7751070766477018, "grad_norm": 1.9080660343170166, "learning_rate": 0.0007450503702858121, "loss": 5.4529, "step": 145500 }, { "epoch": 0.7777706748492403, "grad_norm": 2.0922887325286865, "learning_rate": 0.0007441585451120846, "loss": 5.4594, "step": 146000 }, { "epoch": 0.7804342730507788, "grad_norm": 2.102870464324951, "learning_rate": 0.000743266719938357, "loss": 5.4533, "step": 146500 }, { "epoch": 0.7830978712523173, "grad_norm": 1.8905880451202393, "learning_rate": 0.0007423748947646295, "loss": 5.4512, "step": 147000 }, { "epoch": 0.7857614694538558, "grad_norm": 1.937587857246399, "learning_rate": 0.000741483069590902, "loss": 5.4577, "step": 147500 }, { "epoch": 0.7884250676553943, "grad_norm": 2.2599427700042725, "learning_rate": 0.0007405912444171744, "loss": 5.4545, "step": 148000 }, { "epoch": 0.7910886658569328, "grad_norm": 2.1247055530548096, "learning_rate": 0.0007396994192434468, "loss": 5.4552, "step": 148500 }, { "epoch": 0.7937522640584713, "grad_norm": 1.8920656442642212, "learning_rate": 0.0007388093777200668, "loss": 5.4551, "step": 149000 }, { "epoch": 0.7964158622600098, "grad_norm": 2.05411696434021, "learning_rate": 0.0007379175525463393, "loss": 5.4581, "step": 149500 }, { "epoch": 0.7990794604615483, "grad_norm": 2.1096110343933105, "learning_rate": 0.0007370257273726118, "loss": 5.4553, "step": 150000 }, { "epoch": 0.8017430586630868, "grad_norm": 2.060760736465454, "learning_rate": 0.0007361339021988842, "loss": 5.4557, "step": 150500 }, { "epoch": 0.8044066568646253, "grad_norm": 1.7533081769943237, "learning_rate": 0.0007352438606755041, "loss": 5.4596, "step": 151000 }, { "epoch": 0.8070702550661638, "grad_norm": 1.948110580444336, "learning_rate": 0.0007343520355017765, "loss": 5.4581, "step": 151500 }, { "epoch": 
0.8097338532677023, "grad_norm": 2.0876693725585938, "learning_rate": 0.000733460210328049, "loss": 5.4517, "step": 152000 }, { "epoch": 0.8123974514692408, "grad_norm": 1.8972123861312866, "learning_rate": 0.0007325701688046689, "loss": 5.4529, "step": 152500 }, { "epoch": 0.8150610496707793, "grad_norm": 2.0049657821655273, "learning_rate": 0.0007316783436309413, "loss": 5.4506, "step": 153000 }, { "epoch": 0.8177246478723178, "grad_norm": 1.9599244594573975, "learning_rate": 0.0007307865184572138, "loss": 5.4503, "step": 153500 }, { "epoch": 0.8203882460738563, "grad_norm": 2.090162992477417, "learning_rate": 0.0007298946932834862, "loss": 5.4487, "step": 154000 }, { "epoch": 0.8230518442753948, "grad_norm": 1.9685425758361816, "learning_rate": 0.0007290028681097586, "loss": 5.4459, "step": 154500 }, { "epoch": 0.8257154424769333, "grad_norm": 2.0231292247772217, "learning_rate": 0.0007281110429360312, "loss": 5.4519, "step": 155000 }, { "epoch": 0.8283790406784717, "grad_norm": 1.824242353439331, "learning_rate": 0.0007272192177623036, "loss": 5.4495, "step": 155500 }, { "epoch": 0.8310426388800102, "grad_norm": 1.8740367889404297, "learning_rate": 0.000726327392588576, "loss": 5.4514, "step": 156000 }, { "epoch": 0.8337062370815487, "grad_norm": 1.898790955543518, "learning_rate": 0.000725437351065196, "loss": 5.4442, "step": 156500 }, { "epoch": 0.8363698352830872, "grad_norm": 1.9713107347488403, "learning_rate": 0.0007245455258914684, "loss": 5.4481, "step": 157000 }, { "epoch": 0.8390334334846257, "grad_norm": 1.892471432685852, "learning_rate": 0.000723653700717741, "loss": 5.4514, "step": 157500 }, { "epoch": 0.8416970316861642, "grad_norm": 2.0477683544158936, "learning_rate": 0.0007227618755440134, "loss": 5.4402, "step": 158000 }, { "epoch": 0.8443606298877027, "grad_norm": 1.9651503562927246, "learning_rate": 0.0007218736176709807, "loss": 5.439, "step": 158500 }, { "epoch": 0.8470242280892412, "grad_norm": 1.9664440155029297, "learning_rate": 
0.0007209817924972531, "loss": 5.4512, "step": 159000 }, { "epoch": 0.8496878262907797, "grad_norm": 1.9268772602081299, "learning_rate": 0.0007200899673235256, "loss": 5.4445, "step": 159500 }, { "epoch": 0.8523514244923182, "grad_norm": 2.0761542320251465, "learning_rate": 0.0007191981421497981, "loss": 5.4476, "step": 160000 }, { "epoch": 0.8550150226938567, "grad_norm": 2.080336570739746, "learning_rate": 0.0007183063169760705, "loss": 5.4472, "step": 160500 }, { "epoch": 0.8576786208953951, "grad_norm": 1.8157365322113037, "learning_rate": 0.000717414491802343, "loss": 5.4471, "step": 161000 }, { "epoch": 0.8603422190969336, "grad_norm": 1.7620859146118164, "learning_rate": 0.0007165226666286154, "loss": 5.4486, "step": 161500 }, { "epoch": 0.8630058172984721, "grad_norm": 1.8530540466308594, "learning_rate": 0.0007156326251052354, "loss": 5.4403, "step": 162000 }, { "epoch": 0.8656694155000106, "grad_norm": 1.91478431224823, "learning_rate": 0.0007147407999315079, "loss": 5.4453, "step": 162500 }, { "epoch": 0.8683330137015491, "grad_norm": 1.944806456565857, "learning_rate": 0.0007138489747577804, "loss": 5.4438, "step": 163000 }, { "epoch": 0.8709966119030876, "grad_norm": 1.941565752029419, "learning_rate": 0.0007129571495840528, "loss": 5.4403, "step": 163500 }, { "epoch": 0.8736602101046261, "grad_norm": 1.8101640939712524, "learning_rate": 0.0007120653244103252, "loss": 5.4352, "step": 164000 }, { "epoch": 0.8763238083061646, "grad_norm": 2.391594171524048, "learning_rate": 0.0007111752828869451, "loss": 5.4379, "step": 164500 }, { "epoch": 0.8789874065077031, "grad_norm": 1.946295142173767, "learning_rate": 0.0007102834577132175, "loss": 5.4385, "step": 165000 }, { "epoch": 0.8816510047092416, "grad_norm": 2.1615066528320312, "learning_rate": 0.00070939163253949, "loss": 5.4439, "step": 165500 }, { "epoch": 0.8843146029107801, "grad_norm": 2.0320687294006348, "learning_rate": 0.0007084998073657625, "loss": 5.4434, "step": 166000 }, { "epoch": 
0.8869782011123186, "grad_norm": 1.8692481517791748, "learning_rate": 0.0007076079821920349, "loss": 5.437, "step": 166500 }, { "epoch": 0.8896417993138571, "grad_norm": 2.007511854171753, "learning_rate": 0.0007067161570183073, "loss": 5.4327, "step": 167000 }, { "epoch": 0.8923053975153956, "grad_norm": 2.02004337310791, "learning_rate": 0.0007058243318445799, "loss": 5.4393, "step": 167500 }, { "epoch": 0.8949689957169341, "grad_norm": 1.7644096612930298, "learning_rate": 0.0007049325066708523, "loss": 5.4304, "step": 168000 }, { "epoch": 0.8976325939184726, "grad_norm": 2.0698578357696533, "learning_rate": 0.0007040424651474723, "loss": 5.4301, "step": 168500 }, { "epoch": 0.9002961921200111, "grad_norm": 1.881465196609497, "learning_rate": 0.0007031506399737447, "loss": 5.4399, "step": 169000 }, { "epoch": 0.9029597903215496, "grad_norm": 2.0607750415802, "learning_rate": 0.0007022588148000172, "loss": 5.4311, "step": 169500 }, { "epoch": 0.9056233885230881, "grad_norm": 2.1066737174987793, "learning_rate": 0.0007013669896262897, "loss": 5.4348, "step": 170000 }, { "epoch": 0.9082869867246266, "grad_norm": 2.0234835147857666, "learning_rate": 0.0007004769481029096, "loss": 5.4337, "step": 170500 }, { "epoch": 0.9109505849261651, "grad_norm": 1.8877592086791992, "learning_rate": 0.000699585122929182, "loss": 5.4389, "step": 171000 }, { "epoch": 0.9136141831277036, "grad_norm": 2.117302417755127, "learning_rate": 0.0006986932977554544, "loss": 5.4333, "step": 171500 }, { "epoch": 0.9162777813292421, "grad_norm": 2.073172092437744, "learning_rate": 0.0006978014725817269, "loss": 5.4318, "step": 172000 }, { "epoch": 0.9189413795307806, "grad_norm": 2.064408540725708, "learning_rate": 0.0006969114310583467, "loss": 5.431, "step": 172500 }, { "epoch": 0.9216049777323191, "grad_norm": 1.9481194019317627, "learning_rate": 0.0006960196058846193, "loss": 5.4321, "step": 173000 }, { "epoch": 0.9242685759338576, "grad_norm": 2.010923147201538, "learning_rate": 
0.0006951277807108917, "loss": 5.4342, "step": 173500 }, { "epoch": 0.926932174135396, "grad_norm": 1.9323519468307495, "learning_rate": 0.0006942359555371641, "loss": 5.4303, "step": 174000 }, { "epoch": 0.9295957723369345, "grad_norm": 2.2859385013580322, "learning_rate": 0.0006933459140137841, "loss": 5.4352, "step": 174500 }, { "epoch": 0.932259370538473, "grad_norm": 2.055107593536377, "learning_rate": 0.000692455872490404, "loss": 5.4352, "step": 175000 }, { "epoch": 0.9349229687400115, "grad_norm": 1.9875715970993042, "learning_rate": 0.0006915640473166765, "loss": 5.4392, "step": 175500 }, { "epoch": 0.93758656694155, "grad_norm": 2.097477912902832, "learning_rate": 0.0006906722221429489, "loss": 5.4291, "step": 176000 }, { "epoch": 0.9402501651430885, "grad_norm": 1.8664289712905884, "learning_rate": 0.0006897803969692214, "loss": 5.423, "step": 176500 }, { "epoch": 0.942913763344627, "grad_norm": 2.0907797813415527, "learning_rate": 0.0006888885717954938, "loss": 5.4322, "step": 177000 }, { "epoch": 0.9455773615461655, "grad_norm": 1.9234920740127563, "learning_rate": 0.0006879967466217662, "loss": 5.4303, "step": 177500 }, { "epoch": 0.948240959747704, "grad_norm": 2.0696797370910645, "learning_rate": 0.0006871049214480388, "loss": 5.4251, "step": 178000 }, { "epoch": 0.9509045579492424, "grad_norm": 2.0838043689727783, "learning_rate": 0.0006862130962743112, "loss": 5.4244, "step": 178500 }, { "epoch": 0.9535681561507809, "grad_norm": 2.1029279232025146, "learning_rate": 0.0006853230547509311, "loss": 5.4323, "step": 179000 }, { "epoch": 0.9562317543523194, "grad_norm": 2.1586649417877197, "learning_rate": 0.000684433013227551, "loss": 5.4329, "step": 179500 }, { "epoch": 0.9588953525538579, "grad_norm": 1.8636375665664673, "learning_rate": 0.0006835411880538235, "loss": 5.43, "step": 180000 }, { "epoch": 0.9615589507553964, "grad_norm": 1.9289181232452393, "learning_rate": 0.0006826493628800959, "loss": 5.4193, "step": 180500 }, { "epoch": 
0.9642225489569349, "grad_norm": 1.9578914642333984, "learning_rate": 0.0006817575377063684, "loss": 5.4298, "step": 181000 }, { "epoch": 0.9668861471584734, "grad_norm": 2.0745270252227783, "learning_rate": 0.0006808657125326409, "loss": 5.4315, "step": 181500 }, { "epoch": 0.9695497453600119, "grad_norm": 1.9545907974243164, "learning_rate": 0.0006799738873589133, "loss": 5.425, "step": 182000 }, { "epoch": 0.9722133435615504, "grad_norm": 1.9709100723266602, "learning_rate": 0.0006790820621851857, "loss": 5.425, "step": 182500 }, { "epoch": 0.9748769417630889, "grad_norm": 1.8214976787567139, "learning_rate": 0.0006781902370114582, "loss": 5.4307, "step": 183000 }, { "epoch": 0.9775405399646274, "grad_norm": 1.8456212282180786, "learning_rate": 0.0006773001954880781, "loss": 5.4277, "step": 183500 }, { "epoch": 0.9802041381661659, "grad_norm": 2.0278677940368652, "learning_rate": 0.0006764083703143506, "loss": 5.425, "step": 184000 }, { "epoch": 0.9828677363677044, "grad_norm": 1.8401942253112793, "learning_rate": 0.000675516545140623, "loss": 5.4228, "step": 184500 }, { "epoch": 0.9855313345692429, "grad_norm": 2.0018155574798584, "learning_rate": 0.0006746247199668954, "loss": 5.4272, "step": 185000 }, { "epoch": 0.9881949327707814, "grad_norm": 1.9544193744659424, "learning_rate": 0.0006737346784435153, "loss": 5.4297, "step": 185500 }, { "epoch": 0.9908585309723199, "grad_norm": 1.8701244592666626, "learning_rate": 0.0006728428532697878, "loss": 5.4305, "step": 186000 }, { "epoch": 0.9935221291738584, "grad_norm": 1.9702414274215698, "learning_rate": 0.0006719510280960603, "loss": 5.4272, "step": 186500 }, { "epoch": 0.9961857273753969, "grad_norm": 2.005018472671509, "learning_rate": 0.0006710592029223327, "loss": 5.4259, "step": 187000 }, { "epoch": 0.9988493255769354, "grad_norm": 1.9745688438415527, "learning_rate": 0.0006701691613989527, "loss": 5.4255, "step": 187500 }, { "epoch": 1.001512923778474, "grad_norm": 2.119936466217041, "learning_rate": 
0.0006692773362252251, "loss": 5.4282, "step": 188000 }, { "epoch": 1.0041765219800123, "grad_norm": 1.8192147016525269, "learning_rate": 0.0006683855110514976, "loss": 5.4272, "step": 188500 }, { "epoch": 1.006840120181551, "grad_norm": 2.0825536251068115, "learning_rate": 0.0006674936858777701, "loss": 5.4191, "step": 189000 }, { "epoch": 1.0095037183830893, "grad_norm": 2.034301519393921, "learning_rate": 0.0006666036443543899, "loss": 5.4212, "step": 189500 }, { "epoch": 1.0121673165846279, "grad_norm": 2.013160467147827, "learning_rate": 0.0006657118191806624, "loss": 5.4216, "step": 190000 }, { "epoch": 1.0148309147861663, "grad_norm": 1.9328818321228027, "learning_rate": 0.0006648199940069348, "loss": 5.4286, "step": 190500 }, { "epoch": 1.0174945129877049, "grad_norm": 2.011674642562866, "learning_rate": 0.0006639281688332073, "loss": 5.426, "step": 191000 }, { "epoch": 1.0201581111892433, "grad_norm": 2.1039912700653076, "learning_rate": 0.0006630381273098273, "loss": 5.4261, "step": 191500 }, { "epoch": 1.0228217093907819, "grad_norm": 1.8038475513458252, "learning_rate": 0.0006621480857864472, "loss": 5.4201, "step": 192000 }, { "epoch": 1.0254853075923203, "grad_norm": 1.8866719007492065, "learning_rate": 0.0006612562606127196, "loss": 5.4156, "step": 192500 }, { "epoch": 1.0281489057938589, "grad_norm": 1.9180611371994019, "learning_rate": 0.000660364435438992, "loss": 5.4219, "step": 193000 }, { "epoch": 1.0308125039953973, "grad_norm": 1.83159339427948, "learning_rate": 0.0006594726102652645, "loss": 5.4158, "step": 193500 }, { "epoch": 1.0334761021969359, "grad_norm": 1.8638277053833008, "learning_rate": 0.000658580785091537, "loss": 5.4196, "step": 194000 }, { "epoch": 1.0361397003984743, "grad_norm": 1.8679394721984863, "learning_rate": 0.0006576889599178094, "loss": 5.4221, "step": 194500 }, { "epoch": 1.0388032986000129, "grad_norm": 1.8080953359603882, "learning_rate": 0.0006567971347440819, "loss": 5.4168, "step": 195000 }, { "epoch": 
1.0414668968015512, "grad_norm": 2.044064521789551, "learning_rate": 0.0006559053095703543, "loss": 5.4152, "step": 195500 }, { "epoch": 1.0441304950030899, "grad_norm": 2.067416191101074, "learning_rate": 0.0006550152680469742, "loss": 5.4197, "step": 196000 }, { "epoch": 1.0467940932046282, "grad_norm": 1.8547744750976562, "learning_rate": 0.0006541234428732467, "loss": 5.416, "step": 196500 }, { "epoch": 1.0494576914061668, "grad_norm": 2.1002390384674072, "learning_rate": 0.0006532316176995191, "loss": 5.414, "step": 197000 }, { "epoch": 1.0521212896077052, "grad_norm": 1.8542534112930298, "learning_rate": 0.0006523397925257916, "loss": 5.4176, "step": 197500 }, { "epoch": 1.0547848878092438, "grad_norm": 1.8873697519302368, "learning_rate": 0.000651447967352064, "loss": 5.4155, "step": 198000 }, { "epoch": 1.0574484860107822, "grad_norm": 2.0172159671783447, "learning_rate": 0.0006505561421783364, "loss": 5.4234, "step": 198500 }, { "epoch": 1.0601120842123208, "grad_norm": 1.9374735355377197, "learning_rate": 0.000649664317004609, "loss": 5.4131, "step": 199000 }, { "epoch": 1.0627756824138592, "grad_norm": 2.141655921936035, "learning_rate": 0.0006487724918308814, "loss": 5.4134, "step": 199500 }, { "epoch": 1.0654392806153978, "grad_norm": 1.9056235551834106, "learning_rate": 0.0006478824503075014, "loss": 5.4173, "step": 200000 }, { "epoch": 1.0681028788169362, "grad_norm": 2.3003177642822266, "learning_rate": 0.0006469906251337738, "loss": 5.4049, "step": 200500 }, { "epoch": 1.0707664770184748, "grad_norm": 2.1843066215515137, "learning_rate": 0.0006460987999600462, "loss": 5.411, "step": 201000 }, { "epoch": 1.0734300752200132, "grad_norm": 2.0827953815460205, "learning_rate": 0.0006452069747863188, "loss": 5.4175, "step": 201500 }, { "epoch": 1.0760936734215516, "grad_norm": 2.02587890625, "learning_rate": 0.0006443169332629386, "loss": 5.4183, "step": 202000 }, { "epoch": 1.0787572716230902, "grad_norm": 1.8049343824386597, "learning_rate": 
0.0006434251080892111, "loss": 5.4142, "step": 202500 }, { "epoch": 1.0814208698246286, "grad_norm": 2.1238086223602295, "learning_rate": 0.0006425332829154835, "loss": 5.4155, "step": 203000 }, { "epoch": 1.0840844680261672, "grad_norm": 1.9311139583587646, "learning_rate": 0.0006416414577417559, "loss": 5.4132, "step": 203500 }, { "epoch": 1.0867480662277056, "grad_norm": 1.970428228378296, "learning_rate": 0.0006407514162183758, "loss": 5.4073, "step": 204000 }, { "epoch": 1.0894116644292442, "grad_norm": 1.7967313528060913, "learning_rate": 0.0006398595910446483, "loss": 5.4113, "step": 204500 }, { "epoch": 1.0920752626307826, "grad_norm": 1.7493606805801392, "learning_rate": 0.0006389677658709208, "loss": 5.4106, "step": 205000 }, { "epoch": 1.0947388608323212, "grad_norm": 1.868148922920227, "learning_rate": 0.0006380777243475407, "loss": 5.4125, "step": 205500 }, { "epoch": 1.0974024590338596, "grad_norm": 2.0261473655700684, "learning_rate": 0.0006371858991738132, "loss": 5.4119, "step": 206000 }, { "epoch": 1.1000660572353982, "grad_norm": 1.8863203525543213, "learning_rate": 0.0006362940740000856, "loss": 5.4085, "step": 206500 }, { "epoch": 1.1027296554369366, "grad_norm": 1.97225821018219, "learning_rate": 0.0006354022488263581, "loss": 5.4106, "step": 207000 }, { "epoch": 1.1053932536384752, "grad_norm": 2.2650508880615234, "learning_rate": 0.0006345104236526306, "loss": 5.4128, "step": 207500 }, { "epoch": 1.1080568518400136, "grad_norm": 1.9305511713027954, "learning_rate": 0.000633618598478903, "loss": 5.4084, "step": 208000 }, { "epoch": 1.1107204500415522, "grad_norm": 2.110548973083496, "learning_rate": 0.0006327285569555229, "loss": 5.4078, "step": 208500 }, { "epoch": 1.1133840482430906, "grad_norm": 2.0234880447387695, "learning_rate": 0.0006318367317817953, "loss": 5.4125, "step": 209000 }, { "epoch": 1.1160476464446292, "grad_norm": 1.8949861526489258, "learning_rate": 0.0006309449066080678, "loss": 5.4077, "step": 209500 }, { "epoch": 
1.1187112446461676, "grad_norm": 1.9646226167678833, "learning_rate": 0.0006300530814343403, "loss": 5.4112, "step": 210000 }, { "epoch": 1.1213748428477062, "grad_norm": 1.9960238933563232, "learning_rate": 0.0006291612562606127, "loss": 5.4062, "step": 210500 }, { "epoch": 1.1240384410492446, "grad_norm": 2.0510716438293457, "learning_rate": 0.0006282694310868851, "loss": 5.4094, "step": 211000 }, { "epoch": 1.1267020392507832, "grad_norm": 1.969011664390564, "learning_rate": 0.0006273776059131576, "loss": 5.4123, "step": 211500 }, { "epoch": 1.1293656374523215, "grad_norm": 2.0459535121917725, "learning_rate": 0.0006264857807394301, "loss": 5.4077, "step": 212000 }, { "epoch": 1.1320292356538602, "grad_norm": 2.093336343765259, "learning_rate": 0.0006255957392160501, "loss": 5.4107, "step": 212500 }, { "epoch": 1.1346928338553985, "grad_norm": 1.8615410327911377, "learning_rate": 0.0006247056976926699, "loss": 5.4078, "step": 213000 }, { "epoch": 1.1373564320569371, "grad_norm": 1.9422777891159058, "learning_rate": 0.0006238138725189424, "loss": 5.4115, "step": 213500 }, { "epoch": 1.1400200302584755, "grad_norm": 1.9412380456924438, "learning_rate": 0.0006229220473452148, "loss": 5.4013, "step": 214000 }, { "epoch": 1.1426836284600141, "grad_norm": 2.2532691955566406, "learning_rate": 0.0006220302221714873, "loss": 5.4061, "step": 214500 }, { "epoch": 1.1453472266615525, "grad_norm": 1.7372703552246094, "learning_rate": 0.0006211383969977598, "loss": 5.41, "step": 215000 }, { "epoch": 1.1480108248630911, "grad_norm": 1.9771249294281006, "learning_rate": 0.0006202465718240322, "loss": 5.4032, "step": 215500 }, { "epoch": 1.1506744230646295, "grad_norm": 1.802037000656128, "learning_rate": 0.0006193547466503046, "loss": 5.4026, "step": 216000 }, { "epoch": 1.1533380212661681, "grad_norm": 1.958177924156189, "learning_rate": 0.0006184629214765771, "loss": 5.4043, "step": 216500 }, { "epoch": 1.1560016194677065, "grad_norm": 1.9318652153015137, "learning_rate": 
0.000617572879953197, "loss": 5.4044, "step": 217000 }, { "epoch": 1.158665217669245, "grad_norm": 1.917920470237732, "learning_rate": 0.0006166810547794695, "loss": 5.4051, "step": 217500 }, { "epoch": 1.1613288158707835, "grad_norm": 1.9815441370010376, "learning_rate": 0.0006157892296057419, "loss": 5.4036, "step": 218000 }, { "epoch": 1.1639924140723221, "grad_norm": 2.0141518115997314, "learning_rate": 0.0006148974044320143, "loss": 5.4093, "step": 218500 }, { "epoch": 1.1666560122738605, "grad_norm": 2.0144686698913574, "learning_rate": 0.0006140073629086343, "loss": 5.3992, "step": 219000 }, { "epoch": 1.169319610475399, "grad_norm": 1.848953127861023, "learning_rate": 0.0006131155377349069, "loss": 5.4069, "step": 219500 }, { "epoch": 1.1719832086769375, "grad_norm": 1.8711676597595215, "learning_rate": 0.0006122237125611793, "loss": 5.4058, "step": 220000 }, { "epoch": 1.1746468068784761, "grad_norm": 2.1549181938171387, "learning_rate": 0.0006113318873874517, "loss": 5.4057, "step": 220500 }, { "epoch": 1.1773104050800145, "grad_norm": 2.136955738067627, "learning_rate": 0.0006104418458640716, "loss": 5.4047, "step": 221000 }, { "epoch": 1.1799740032815529, "grad_norm": 1.984183430671692, "learning_rate": 0.000609550020690344, "loss": 5.397, "step": 221500 }, { "epoch": 1.1826376014830915, "grad_norm": 2.173187732696533, "learning_rate": 0.0006086581955166164, "loss": 5.3996, "step": 222000 }, { "epoch": 1.1853011996846299, "grad_norm": 2.0700299739837646, "learning_rate": 0.000607766370342889, "loss": 5.3976, "step": 222500 }, { "epoch": 1.1879647978861685, "grad_norm": 2.1351547241210938, "learning_rate": 0.0006068763288195088, "loss": 5.4113, "step": 223000 }, { "epoch": 1.1906283960877069, "grad_norm": 1.9995781183242798, "learning_rate": 0.0006059845036457813, "loss": 5.4012, "step": 223500 }, { "epoch": 1.1932919942892455, "grad_norm": 2.2745988368988037, "learning_rate": 0.0006050926784720537, "loss": 5.4093, "step": 224000 }, { "epoch": 
1.1959555924907839, "grad_norm": 2.5383615493774414, "learning_rate": 0.0006042026369486737, "loss": 5.3934, "step": 224500 }, { "epoch": 1.1986191906923225, "grad_norm": 2.132570266723633, "learning_rate": 0.0006033108117749462, "loss": 5.4143, "step": 225000 }, { "epoch": 1.2012827888938609, "grad_norm": 1.9985568523406982, "learning_rate": 0.0006024189866012187, "loss": 5.3987, "step": 225500 }, { "epoch": 1.2039463870953995, "grad_norm": 1.9169471263885498, "learning_rate": 0.0006015271614274911, "loss": 5.4005, "step": 226000 }, { "epoch": 1.2066099852969379, "grad_norm": 1.9423543214797974, "learning_rate": 0.0006006353362537635, "loss": 5.4016, "step": 226500 }, { "epoch": 1.2092735834984765, "grad_norm": 2.0575485229492188, "learning_rate": 0.000599743511080036, "loss": 5.393, "step": 227000 }, { "epoch": 1.2119371817000149, "grad_norm": 2.034454584121704, "learning_rate": 0.0005988516859063085, "loss": 5.3946, "step": 227500 }, { "epoch": 1.2146007799015535, "grad_norm": 1.9063221216201782, "learning_rate": 0.0005979598607325809, "loss": 5.4005, "step": 228000 }, { "epoch": 1.2172643781030918, "grad_norm": 2.094717025756836, "learning_rate": 0.0005970698192092008, "loss": 5.3943, "step": 228500 }, { "epoch": 1.2199279763046305, "grad_norm": 1.9740791320800781, "learning_rate": 0.0005961779940354732, "loss": 5.399, "step": 229000 }, { "epoch": 1.2225915745061688, "grad_norm": 1.95699143409729, "learning_rate": 0.0005952861688617457, "loss": 5.3971, "step": 229500 }, { "epoch": 1.2252551727077075, "grad_norm": 1.9305535554885864, "learning_rate": 0.0005943943436880182, "loss": 5.399, "step": 230000 }, { "epoch": 1.2279187709092458, "grad_norm": 1.8926870822906494, "learning_rate": 0.000593504302164638, "loss": 5.3967, "step": 230500 }, { "epoch": 1.2305823691107844, "grad_norm": 1.91937255859375, "learning_rate": 0.0005926124769909105, "loss": 5.3966, "step": 231000 }, { "epoch": 1.2332459673123228, "grad_norm": 1.9494017362594604, "learning_rate": 
0.0005917224354675305, "loss": 5.3988, "step": 231500 }, { "epoch": 1.2359095655138614, "grad_norm": 1.7676622867584229, "learning_rate": 0.0005908306102938029, "loss": 5.3954, "step": 232000 }, { "epoch": 1.2385731637153998, "grad_norm": 1.9707027673721313, "learning_rate": 0.0005899387851200753, "loss": 5.3987, "step": 232500 }, { "epoch": 1.2412367619169384, "grad_norm": 1.8651105165481567, "learning_rate": 0.0005890469599463479, "loss": 5.3913, "step": 233000 }, { "epoch": 1.2439003601184768, "grad_norm": 2.2256948947906494, "learning_rate": 0.0005881551347726203, "loss": 5.4022, "step": 233500 }, { "epoch": 1.2465639583200154, "grad_norm": 2.0236611366271973, "learning_rate": 0.0005872633095988927, "loss": 5.3928, "step": 234000 }, { "epoch": 1.2492275565215538, "grad_norm": 2.07328724861145, "learning_rate": 0.0005863714844251652, "loss": 5.3964, "step": 234500 }, { "epoch": 1.2518911547230922, "grad_norm": 2.011497974395752, "learning_rate": 0.000585481442901785, "loss": 5.4, "step": 235000 }, { "epoch": 1.2545547529246308, "grad_norm": 1.891579270362854, "learning_rate": 0.0005845896177280576, "loss": 5.3931, "step": 235500 }, { "epoch": 1.2572183511261694, "grad_norm": 1.8369475603103638, "learning_rate": 0.00058369779255433, "loss": 5.388, "step": 236000 }, { "epoch": 1.2598819493277078, "grad_norm": 2.316582441329956, "learning_rate": 0.0005828059673806024, "loss": 5.3878, "step": 236500 }, { "epoch": 1.2625455475292462, "grad_norm": 1.8466497659683228, "learning_rate": 0.0005819141422068749, "loss": 5.3942, "step": 237000 }, { "epoch": 1.2652091457307848, "grad_norm": 1.9420734643936157, "learning_rate": 0.0005810223170331473, "loss": 5.3907, "step": 237500 }, { "epoch": 1.2678727439323234, "grad_norm": 1.9229456186294556, "learning_rate": 0.0005801304918594198, "loss": 5.394, "step": 238000 }, { "epoch": 1.2705363421338618, "grad_norm": 2.126213788986206, "learning_rate": 0.0005792386666856923, "loss": 5.3875, "step": 238500 }, { "epoch": 
1.2731999403354002, "grad_norm": 1.9714566469192505, "learning_rate": 0.0005783486251623122, "loss": 5.3938, "step": 239000 }, { "epoch": 1.2758635385369388, "grad_norm": 2.244844436645508, "learning_rate": 0.0005774567999885847, "loss": 5.3974, "step": 239500 }, { "epoch": 1.2785271367384774, "grad_norm": 2.083517551422119, "learning_rate": 0.0005765649748148571, "loss": 5.3827, "step": 240000 }, { "epoch": 1.2811907349400158, "grad_norm": 2.1155362129211426, "learning_rate": 0.0005756749332914771, "loss": 5.3908, "step": 240500 }, { "epoch": 1.2838543331415542, "grad_norm": 2.0415351390838623, "learning_rate": 0.0005747831081177495, "loss": 5.3904, "step": 241000 }, { "epoch": 1.2865179313430928, "grad_norm": 2.4744224548339844, "learning_rate": 0.0005738912829440219, "loss": 5.3825, "step": 241500 }, { "epoch": 1.2891815295446314, "grad_norm": 1.9680261611938477, "learning_rate": 0.0005729994577702944, "loss": 5.3915, "step": 242000 }, { "epoch": 1.2918451277461698, "grad_norm": 2.4636471271514893, "learning_rate": 0.0005721076325965668, "loss": 5.3946, "step": 242500 }, { "epoch": 1.2945087259477082, "grad_norm": 1.8884419202804565, "learning_rate": 0.0005712158074228393, "loss": 5.3905, "step": 243000 }, { "epoch": 1.2971723241492468, "grad_norm": 2.192204236984253, "learning_rate": 0.0005703257658994592, "loss": 5.3891, "step": 243500 }, { "epoch": 1.2998359223507852, "grad_norm": 1.963740587234497, "learning_rate": 0.0005694339407257316, "loss": 5.389, "step": 244000 }, { "epoch": 1.3024995205523238, "grad_norm": 2.2511630058288574, "learning_rate": 0.0005685421155520041, "loss": 5.3988, "step": 244500 }, { "epoch": 1.3051631187538622, "grad_norm": 1.8933221101760864, "learning_rate": 0.0005676502903782765, "loss": 5.39, "step": 245000 }, { "epoch": 1.3078267169554008, "grad_norm": 1.813040852546692, "learning_rate": 0.000566758465204549, "loss": 5.3884, "step": 245500 }, { "epoch": 1.3104903151569391, "grad_norm": 2.3987181186676025, "learning_rate": 
0.0005658666400308215, "loss": 5.3888, "step": 246000 }, { "epoch": 1.3131539133584778, "grad_norm": 2.0762851238250732, "learning_rate": 0.0005649748148570939, "loss": 5.3881, "step": 246500 }, { "epoch": 1.3158175115600161, "grad_norm": 2.3197662830352783, "learning_rate": 0.0005640829896833663, "loss": 5.3876, "step": 247000 }, { "epoch": 1.3184811097615547, "grad_norm": 1.9953910112380981, "learning_rate": 0.0005631929481599863, "loss": 5.3892, "step": 247500 }, { "epoch": 1.3211447079630931, "grad_norm": 2.20346999168396, "learning_rate": 0.0005623011229862588, "loss": 5.3844, "step": 248000 }, { "epoch": 1.3238083061646317, "grad_norm": 1.9688447713851929, "learning_rate": 0.0005614092978125313, "loss": 5.3924, "step": 248500 }, { "epoch": 1.3264719043661701, "grad_norm": 1.950621485710144, "learning_rate": 0.0005605174726388037, "loss": 5.382, "step": 249000 }, { "epoch": 1.3291355025677087, "grad_norm": 2.0261106491088867, "learning_rate": 0.0005596274311154236, "loss": 5.3889, "step": 249500 }, { "epoch": 1.3317991007692471, "grad_norm": 1.819598913192749, "learning_rate": 0.000558735605941696, "loss": 5.3879, "step": 250000 }, { "epoch": 1.3344626989707857, "grad_norm": 2.092658042907715, "learning_rate": 0.0005578437807679685, "loss": 5.3897, "step": 250500 }, { "epoch": 1.3371262971723241, "grad_norm": 1.8927563428878784, "learning_rate": 0.000556951955594241, "loss": 5.3888, "step": 251000 }, { "epoch": 1.3397898953738627, "grad_norm": 1.91410493850708, "learning_rate": 0.0005560619140708608, "loss": 5.3865, "step": 251500 }, { "epoch": 1.3424534935754011, "grad_norm": 1.923710584640503, "learning_rate": 0.0005551700888971333, "loss": 5.3831, "step": 252000 }, { "epoch": 1.3451170917769395, "grad_norm": 2.011301279067993, "learning_rate": 0.0005542782637234058, "loss": 5.3832, "step": 252500 }, { "epoch": 1.347780689978478, "grad_norm": 1.8271079063415527, "learning_rate": 0.0005533864385496783, "loss": 5.3843, "step": 253000 }, { "epoch": 
1.3504442881800167, "grad_norm": 2.0028188228607178, "learning_rate": 0.0005524963970262982, "loss": 5.383, "step": 253500 }, { "epoch": 1.353107886381555, "grad_norm": 1.8386844396591187, "learning_rate": 0.0005516045718525706, "loss": 5.3873, "step": 254000 }, { "epoch": 1.3557714845830935, "grad_norm": 1.8750890493392944, "learning_rate": 0.0005507127466788431, "loss": 5.3794, "step": 254500 }, { "epoch": 1.358435082784632, "grad_norm": 1.9305578470230103, "learning_rate": 0.0005498209215051155, "loss": 5.3863, "step": 255000 }, { "epoch": 1.3610986809861707, "grad_norm": 2.1922383308410645, "learning_rate": 0.0005489308799817354, "loss": 5.3889, "step": 255500 }, { "epoch": 1.363762279187709, "grad_norm": 2.006162405014038, "learning_rate": 0.0005480390548080079, "loss": 5.3793, "step": 256000 }, { "epoch": 1.3664258773892475, "grad_norm": 2.1891300678253174, "learning_rate": 0.0005471472296342803, "loss": 5.3805, "step": 256500 }, { "epoch": 1.369089475590786, "grad_norm": 2.036553144454956, "learning_rate": 0.0005462554044605528, "loss": 5.3809, "step": 257000 }, { "epoch": 1.3717530737923247, "grad_norm": 1.9189977645874023, "learning_rate": 0.0005453653629371727, "loss": 5.3766, "step": 257500 }, { "epoch": 1.374416671993863, "grad_norm": 1.98636794090271, "learning_rate": 0.0005444735377634452, "loss": 5.39, "step": 258000 }, { "epoch": 1.3770802701954015, "grad_norm": 1.897522211074829, "learning_rate": 0.0005435834962400651, "loss": 5.3839, "step": 258500 }, { "epoch": 1.37974386839694, "grad_norm": 2.0826635360717773, "learning_rate": 0.0005426916710663376, "loss": 5.383, "step": 259000 }, { "epoch": 1.3824074665984787, "grad_norm": 1.8267229795455933, "learning_rate": 0.00054179984589261, "loss": 5.3866, "step": 259500 }, { "epoch": 1.385071064800017, "grad_norm": 2.1117184162139893, "learning_rate": 0.0005409080207188824, "loss": 5.3787, "step": 260000 }, { "epoch": 1.3877346630015555, "grad_norm": 1.9132159948349, "learning_rate": 
0.0005400161955451549, "loss": 5.3812, "step": 260500 }, { "epoch": 1.390398261203094, "grad_norm": 1.9600298404693604, "learning_rate": 0.0005391243703714274, "loss": 5.381, "step": 261000 }, { "epoch": 1.3930618594046325, "grad_norm": 2.000422716140747, "learning_rate": 0.0005382325451976998, "loss": 5.3823, "step": 261500 }, { "epoch": 1.395725457606171, "grad_norm": 2.2225003242492676, "learning_rate": 0.0005373407200239723, "loss": 5.3776, "step": 262000 }, { "epoch": 1.3983890558077094, "grad_norm": 2.084779977798462, "learning_rate": 0.0005364506785005921, "loss": 5.3781, "step": 262500 }, { "epoch": 1.401052654009248, "grad_norm": 2.126775026321411, "learning_rate": 0.0005355588533268646, "loss": 5.3832, "step": 263000 }, { "epoch": 1.4037162522107864, "grad_norm": 1.9713746309280396, "learning_rate": 0.0005346670281531371, "loss": 5.3792, "step": 263500 }, { "epoch": 1.406379850412325, "grad_norm": 2.0785419940948486, "learning_rate": 0.0005337752029794095, "loss": 5.3825, "step": 264000 }, { "epoch": 1.4090434486138634, "grad_norm": 2.3811593055725098, "learning_rate": 0.0005328851614560295, "loss": 5.3826, "step": 264500 }, { "epoch": 1.411707046815402, "grad_norm": 2.1196324825286865, "learning_rate": 0.0005319933362823019, "loss": 5.3785, "step": 265000 }, { "epoch": 1.4143706450169404, "grad_norm": 2.06736421585083, "learning_rate": 0.0005311015111085744, "loss": 5.3796, "step": 265500 }, { "epoch": 1.417034243218479, "grad_norm": 2.1438751220703125, "learning_rate": 0.0005302096859348468, "loss": 5.3747, "step": 266000 }, { "epoch": 1.4196978414200174, "grad_norm": 2.0328142642974854, "learning_rate": 0.0005293196444114668, "loss": 5.3726, "step": 266500 }, { "epoch": 1.422361439621556, "grad_norm": 1.9709652662277222, "learning_rate": 0.0005284278192377392, "loss": 5.3835, "step": 267000 }, { "epoch": 1.4250250378230944, "grad_norm": 2.0982072353363037, "learning_rate": 0.0005275359940640116, "loss": 5.3719, "step": 267500 }, { "epoch": 
1.427688636024633, "grad_norm": 2.3335447311401367, "learning_rate": 0.0005266441688902841, "loss": 5.3824, "step": 268000 }, { "epoch": 1.4303522342261714, "grad_norm": 1.9240329265594482, "learning_rate": 0.0005257541273669039, "loss": 5.3754, "step": 268500 }, { "epoch": 1.43301583242771, "grad_norm": 2.0762813091278076, "learning_rate": 0.0005248623021931765, "loss": 5.3754, "step": 269000 }, { "epoch": 1.4356794306292484, "grad_norm": 1.9223084449768066, "learning_rate": 0.0005239704770194489, "loss": 5.3751, "step": 269500 }, { "epoch": 1.4383430288307868, "grad_norm": 1.9600517749786377, "learning_rate": 0.0005230786518457213, "loss": 5.3726, "step": 270000 }, { "epoch": 1.4410066270323254, "grad_norm": 2.0275826454162598, "learning_rate": 0.0005221886103223413, "loss": 5.3755, "step": 270500 }, { "epoch": 1.443670225233864, "grad_norm": 2.0879909992218018, "learning_rate": 0.0005212967851486137, "loss": 5.371, "step": 271000 }, { "epoch": 1.4463338234354024, "grad_norm": 2.2107584476470947, "learning_rate": 0.0005204049599748863, "loss": 5.3775, "step": 271500 }, { "epoch": 1.4489974216369408, "grad_norm": 1.9889525175094604, "learning_rate": 0.0005195131348011587, "loss": 5.369, "step": 272000 }, { "epoch": 1.4516610198384794, "grad_norm": 1.8878706693649292, "learning_rate": 0.0005186230932777786, "loss": 5.3762, "step": 272500 }, { "epoch": 1.454324618040018, "grad_norm": 2.0804665088653564, "learning_rate": 0.000517731268104051, "loss": 5.3731, "step": 273000 }, { "epoch": 1.4569882162415564, "grad_norm": 2.3155815601348877, "learning_rate": 0.0005168394429303234, "loss": 5.3696, "step": 273500 }, { "epoch": 1.4596518144430948, "grad_norm": 2.2707676887512207, "learning_rate": 0.000515947617756596, "loss": 5.3763, "step": 274000 }, { "epoch": 1.4623154126446334, "grad_norm": 1.947204828262329, "learning_rate": 0.0005150575762332158, "loss": 5.3689, "step": 274500 }, { "epoch": 1.464979010846172, "grad_norm": 1.9428602457046509, "learning_rate": 
0.0005141657510594883, "loss": 5.3797, "step": 275000 }, { "epoch": 1.4676426090477104, "grad_norm": 2.4003546237945557, "learning_rate": 0.0005132739258857608, "loss": 5.3672, "step": 275500 }, { "epoch": 1.4703062072492488, "grad_norm": 2.047048330307007, "learning_rate": 0.0005123821007120333, "loss": 5.3761, "step": 276000 }, { "epoch": 1.4729698054507874, "grad_norm": 2.0965404510498047, "learning_rate": 0.0005114920591886531, "loss": 5.3645, "step": 276500 }, { "epoch": 1.475633403652326, "grad_norm": 1.9648233652114868, "learning_rate": 0.0005106002340149257, "loss": 5.37, "step": 277000 }, { "epoch": 1.4782970018538644, "grad_norm": 1.8992446660995483, "learning_rate": 0.0005097084088411981, "loss": 5.3679, "step": 277500 }, { "epoch": 1.4809606000554028, "grad_norm": 2.125126838684082, "learning_rate": 0.0005088165836674705, "loss": 5.3702, "step": 278000 }, { "epoch": 1.4836241982569414, "grad_norm": 2.030409574508667, "learning_rate": 0.0005079265421440904, "loss": 5.3691, "step": 278500 }, { "epoch": 1.4862877964584797, "grad_norm": 1.9816679954528809, "learning_rate": 0.0005070347169703628, "loss": 5.3723, "step": 279000 }, { "epoch": 1.4889513946600184, "grad_norm": 2.032564401626587, "learning_rate": 0.0005061428917966354, "loss": 5.3695, "step": 279500 }, { "epoch": 1.4916149928615567, "grad_norm": 2.0342843532562256, "learning_rate": 0.0005052510666229078, "loss": 5.3681, "step": 280000 }, { "epoch": 1.4942785910630954, "grad_norm": 1.9113322496414185, "learning_rate": 0.0005043610250995278, "loss": 5.3713, "step": 280500 }, { "epoch": 1.4969421892646337, "grad_norm": 2.1201562881469727, "learning_rate": 0.0005034691999258002, "loss": 5.375, "step": 281000 }, { "epoch": 1.4996057874661723, "grad_norm": 2.1695244312286377, "learning_rate": 0.0005025773747520726, "loss": 5.3666, "step": 281500 }, { "epoch": 1.5022693856677107, "grad_norm": 2.2736222743988037, "learning_rate": 0.0005016873332286925, "loss": 5.3728, "step": 282000 }, { "epoch": 
1.5049329838692493, "grad_norm": 1.9306550025939941, "learning_rate": 0.000500795508054965, "loss": 5.3607, "step": 282500 }, { "epoch": 1.507596582070788, "grad_norm": 1.970550537109375, "learning_rate": 0.0004999036828812375, "loss": 5.372, "step": 283000 }, { "epoch": 1.5102601802723261, "grad_norm": 1.7387876510620117, "learning_rate": 0.0004990118577075099, "loss": 5.3728, "step": 283500 }, { "epoch": 1.5129237784738647, "grad_norm": 2.364816188812256, "learning_rate": 0.0004981200325337823, "loss": 5.3667, "step": 284000 }, { "epoch": 1.5155873766754033, "grad_norm": 1.959367036819458, "learning_rate": 0.0004972282073600549, "loss": 5.3672, "step": 284500 }, { "epoch": 1.5182509748769417, "grad_norm": 2.4462456703186035, "learning_rate": 0.0004963363821863273, "loss": 5.3669, "step": 285000 }, { "epoch": 1.52091457307848, "grad_norm": 1.949645757675171, "learning_rate": 0.0004954445570125997, "loss": 5.3669, "step": 285500 }, { "epoch": 1.5235781712800187, "grad_norm": 2.0255677700042725, "learning_rate": 0.0004945545154892197, "loss": 5.3689, "step": 286000 }, { "epoch": 1.5262417694815573, "grad_norm": 2.0761642456054688, "learning_rate": 0.0004936644739658396, "loss": 5.3633, "step": 286500 }, { "epoch": 1.5289053676830957, "grad_norm": 2.1219048500061035, "learning_rate": 0.000492772648792112, "loss": 5.3617, "step": 287000 }, { "epoch": 1.531568965884634, "grad_norm": 1.83650803565979, "learning_rate": 0.0004918808236183844, "loss": 5.3735, "step": 287500 }, { "epoch": 1.5342325640861727, "grad_norm": 2.0275492668151855, "learning_rate": 0.0004909889984446568, "loss": 5.3636, "step": 288000 }, { "epoch": 1.5368961622877113, "grad_norm": 1.9854780435562134, "learning_rate": 0.0004900971732709294, "loss": 5.3595, "step": 288500 }, { "epoch": 1.5395597604892497, "grad_norm": 2.282017707824707, "learning_rate": 0.0004892053480972018, "loss": 5.3673, "step": 289000 }, { "epoch": 1.542223358690788, "grad_norm": 2.0435492992401123, "learning_rate": 
0.0004883135229234743, "loss": 5.3771, "step": 289500 }, { "epoch": 1.5448869568923267, "grad_norm": 2.4702582359313965, "learning_rate": 0.0004874216977497467, "loss": 5.3592, "step": 290000 }, { "epoch": 1.5475505550938653, "grad_norm": 2.032315731048584, "learning_rate": 0.00048653165622636666, "loss": 5.3688, "step": 290500 }, { "epoch": 1.5502141532954037, "grad_norm": 2.13460636138916, "learning_rate": 0.0004856398310526391, "loss": 5.3624, "step": 291000 }, { "epoch": 1.552877751496942, "grad_norm": 1.9628610610961914, "learning_rate": 0.0004847480058789115, "loss": 5.3647, "step": 291500 }, { "epoch": 1.5555413496984807, "grad_norm": 1.8896455764770508, "learning_rate": 0.000483856180705184, "loss": 5.3693, "step": 292000 }, { "epoch": 1.5582049479000193, "grad_norm": 1.92352294921875, "learning_rate": 0.0004829661391818039, "loss": 5.3551, "step": 292500 }, { "epoch": 1.5608685461015577, "grad_norm": 2.061492919921875, "learning_rate": 0.0004820743140080764, "loss": 5.3618, "step": 293000 }, { "epoch": 1.563532144303096, "grad_norm": 2.0767364501953125, "learning_rate": 0.0004811842724846963, "loss": 5.3596, "step": 293500 }, { "epoch": 1.5661957425046347, "grad_norm": 2.103719472885132, "learning_rate": 0.00048029244731096876, "loss": 5.3547, "step": 294000 }, { "epoch": 1.5688593407061733, "grad_norm": 2.096832275390625, "learning_rate": 0.00047940062213724124, "loss": 5.3635, "step": 294500 }, { "epoch": 1.5715229389077117, "grad_norm": 2.053567409515381, "learning_rate": 0.0004785087969635137, "loss": 5.3683, "step": 295000 }, { "epoch": 1.57418653710925, "grad_norm": 2.040846586227417, "learning_rate": 0.00047761697178978616, "loss": 5.3623, "step": 295500 }, { "epoch": 1.5768501353107887, "grad_norm": 2.0361154079437256, "learning_rate": 0.0004767251466160586, "loss": 5.3572, "step": 296000 }, { "epoch": 1.5795137335123273, "grad_norm": 2.006989002227783, "learning_rate": 0.00047583332144233103, "loss": 5.3702, "step": 296500 }, { "epoch": 
1.5821773317138657, "grad_norm": 2.0891811847686768, "learning_rate": 0.0004749414962686035, "loss": 5.3664, "step": 297000 }, { "epoch": 1.584840929915404, "grad_norm": 2.023730754852295, "learning_rate": 0.0004740514547452234, "loss": 5.3668, "step": 297500 }, { "epoch": 1.5875045281169426, "grad_norm": 1.8560234308242798, "learning_rate": 0.0004731596295714958, "loss": 5.3688, "step": 298000 }, { "epoch": 1.5901681263184813, "grad_norm": 1.84561288356781, "learning_rate": 0.0004722678043977683, "loss": 5.3595, "step": 298500 }, { "epoch": 1.5928317245200196, "grad_norm": 2.0453810691833496, "learning_rate": 0.0004713759792240407, "loss": 5.3612, "step": 299000 }, { "epoch": 1.595495322721558, "grad_norm": 2.03952956199646, "learning_rate": 0.0004704859377006607, "loss": 5.3595, "step": 299500 }, { "epoch": 1.5981589209230966, "grad_norm": 2.175218343734741, "learning_rate": 0.00046959411252693313, "loss": 5.3599, "step": 300000 }, { "epoch": 1.6008225191246352, "grad_norm": 1.9432867765426636, "learning_rate": 0.00046870228735320556, "loss": 5.3579, "step": 300500 }, { "epoch": 1.6034861173261736, "grad_norm": 2.0046420097351074, "learning_rate": 0.00046781046217947805, "loss": 5.3506, "step": 301000 }, { "epoch": 1.606149715527712, "grad_norm": 1.9781187772750854, "learning_rate": 0.00046692042065609796, "loss": 5.3585, "step": 301500 }, { "epoch": 1.6088133137292506, "grad_norm": 2.0884523391723633, "learning_rate": 0.0004660285954823704, "loss": 5.36, "step": 302000 }, { "epoch": 1.611476911930789, "grad_norm": 2.0299806594848633, "learning_rate": 0.0004651367703086429, "loss": 5.3609, "step": 302500 }, { "epoch": 1.6141405101323274, "grad_norm": 2.0034475326538086, "learning_rate": 0.0004642449451349153, "loss": 5.3621, "step": 303000 }, { "epoch": 1.616804108333866, "grad_norm": 2.027804136276245, "learning_rate": 0.00046335490361153523, "loss": 5.3617, "step": 303500 }, { "epoch": 1.6194677065354046, "grad_norm": 2.2879958152770996, "learning_rate": 
0.0004624630784378077, "loss": 5.3597, "step": 304000 }, { "epoch": 1.622131304736943, "grad_norm": 2.0821385383605957, "learning_rate": 0.00046157125326408015, "loss": 5.3539, "step": 304500 }, { "epoch": 1.6247949029384814, "grad_norm": 2.0150811672210693, "learning_rate": 0.00046067942809035263, "loss": 5.3568, "step": 305000 }, { "epoch": 1.62745850114002, "grad_norm": 1.944470763206482, "learning_rate": 0.0004597893865669725, "loss": 5.3618, "step": 305500 }, { "epoch": 1.6301220993415586, "grad_norm": 1.8767342567443848, "learning_rate": 0.000458897561393245, "loss": 5.3572, "step": 306000 }, { "epoch": 1.632785697543097, "grad_norm": 2.100074291229248, "learning_rate": 0.0004580057362195174, "loss": 5.3557, "step": 306500 }, { "epoch": 1.6354492957446354, "grad_norm": 1.8953720331192017, "learning_rate": 0.00045711569469613733, "loss": 5.3603, "step": 307000 }, { "epoch": 1.638112893946174, "grad_norm": 2.099968433380127, "learning_rate": 0.0004562238695224098, "loss": 5.3459, "step": 307500 }, { "epoch": 1.6407764921477126, "grad_norm": 2.21608567237854, "learning_rate": 0.00045533204434868225, "loss": 5.3602, "step": 308000 }, { "epoch": 1.643440090349251, "grad_norm": 2.0884177684783936, "learning_rate": 0.0004544402191749547, "loss": 5.3538, "step": 308500 }, { "epoch": 1.6461036885507894, "grad_norm": 2.0560896396636963, "learning_rate": 0.00045354839400122717, "loss": 5.3618, "step": 309000 }, { "epoch": 1.648767286752328, "grad_norm": 2.3166544437408447, "learning_rate": 0.0004526565688274996, "loss": 5.3446, "step": 309500 }, { "epoch": 1.6514308849538666, "grad_norm": 1.9376626014709473, "learning_rate": 0.0004517647436537721, "loss": 5.3565, "step": 310000 }, { "epoch": 1.654094483155405, "grad_norm": 1.8356984853744507, "learning_rate": 0.0004508729184800445, "loss": 5.3585, "step": 310500 }, { "epoch": 1.6567580813569434, "grad_norm": 2.0316951274871826, "learning_rate": 0.00044998287695666443, "loss": 5.3615, "step": 311000 }, { "epoch": 
1.659421679558482, "grad_norm": 2.1165359020233154, "learning_rate": 0.00044909283543328435, "loss": 5.357, "step": 311500 }, { "epoch": 1.6620852777600206, "grad_norm": 2.1769607067108154, "learning_rate": 0.0004482010102595568, "loss": 5.3567, "step": 312000 }, { "epoch": 1.664748875961559, "grad_norm": 2.0454256534576416, "learning_rate": 0.0004473091850858292, "loss": 5.3573, "step": 312500 }, { "epoch": 1.6674124741630973, "grad_norm": 2.1431968212127686, "learning_rate": 0.0004464173599121017, "loss": 5.3509, "step": 313000 }, { "epoch": 1.670076072364636, "grad_norm": 2.0397841930389404, "learning_rate": 0.00044552553473837413, "loss": 5.3532, "step": 313500 }, { "epoch": 1.6727396705661746, "grad_norm": 2.080476999282837, "learning_rate": 0.0004446337095646467, "loss": 5.3558, "step": 314000 }, { "epoch": 1.675403268767713, "grad_norm": 1.9653671979904175, "learning_rate": 0.0004437418843909191, "loss": 5.3481, "step": 314500 }, { "epoch": 1.6780668669692513, "grad_norm": 2.2119712829589844, "learning_rate": 0.0004428500592171916, "loss": 5.3555, "step": 315000 }, { "epoch": 1.68073046517079, "grad_norm": 1.990404486656189, "learning_rate": 0.00044196001769381145, "loss": 5.3567, "step": 315500 }, { "epoch": 1.6833940633723286, "grad_norm": 2.0500054359436035, "learning_rate": 0.0004410681925200839, "loss": 5.3503, "step": 316000 }, { "epoch": 1.686057661573867, "grad_norm": 2.205277919769287, "learning_rate": 0.00044017636734635637, "loss": 5.3553, "step": 316500 }, { "epoch": 1.6887212597754053, "grad_norm": 1.9659850597381592, "learning_rate": 0.0004392845421726288, "loss": 5.3456, "step": 317000 }, { "epoch": 1.691384857976944, "grad_norm": 2.029604196548462, "learning_rate": 0.0004383927169989013, "loss": 5.3554, "step": 317500 }, { "epoch": 1.6940484561784825, "grad_norm": 2.041193723678589, "learning_rate": 0.0004375008918251737, "loss": 5.3534, "step": 318000 }, { "epoch": 1.696712054380021, "grad_norm": 2.068268299102783, "learning_rate": 
0.00043661085030179364, "loss": 5.3564, "step": 318500 }, { "epoch": 1.6993756525815593, "grad_norm": 2.0078883171081543, "learning_rate": 0.0004357190251280661, "loss": 5.3518, "step": 319000 }, { "epoch": 1.702039250783098, "grad_norm": 1.9186288118362427, "learning_rate": 0.00043482719995433856, "loss": 5.3471, "step": 319500 }, { "epoch": 1.7047028489846365, "grad_norm": 2.0289323329925537, "learning_rate": 0.000433935374780611, "loss": 5.3513, "step": 320000 }, { "epoch": 1.7073664471861747, "grad_norm": 1.69050133228302, "learning_rate": 0.0004330435496068835, "loss": 5.3513, "step": 320500 }, { "epoch": 1.7100300453877133, "grad_norm": 2.0047898292541504, "learning_rate": 0.0004321517244331559, "loss": 5.3531, "step": 321000 }, { "epoch": 1.712693643589252, "grad_norm": 2.1100831031799316, "learning_rate": 0.0004312616829097759, "loss": 5.3494, "step": 321500 }, { "epoch": 1.7153572417907903, "grad_norm": 2.053802013397217, "learning_rate": 0.0004303698577360483, "loss": 5.3573, "step": 322000 }, { "epoch": 1.7180208399923287, "grad_norm": 1.9370436668395996, "learning_rate": 0.00042947803256232074, "loss": 5.3457, "step": 322500 }, { "epoch": 1.7206844381938673, "grad_norm": 2.062244176864624, "learning_rate": 0.00042858620738859323, "loss": 5.3532, "step": 323000 }, { "epoch": 1.723348036395406, "grad_norm": 2.129863739013672, "learning_rate": 0.00042769438221486566, "loss": 5.3469, "step": 323500 }, { "epoch": 1.7260116345969443, "grad_norm": 2.1496474742889404, "learning_rate": 0.0004268043406914855, "loss": 5.3494, "step": 324000 }, { "epoch": 1.7286752327984827, "grad_norm": 2.0887863636016846, "learning_rate": 0.00042591251551775806, "loss": 5.3483, "step": 324500 }, { "epoch": 1.7313388310000213, "grad_norm": 2.4094293117523193, "learning_rate": 0.0004250206903440305, "loss": 5.3485, "step": 325000 }, { "epoch": 1.73400242920156, "grad_norm": 2.046931266784668, "learning_rate": 0.000424128865170303, "loss": 5.345, "step": 325500 }, { "epoch": 
1.7366660274030983, "grad_norm": 2.1520516872406006, "learning_rate": 0.0004232370399965754, "loss": 5.351, "step": 326000 }, { "epoch": 1.7393296256046367, "grad_norm": 2.006589651107788, "learning_rate": 0.0004223469984731953, "loss": 5.3511, "step": 326500 }, { "epoch": 1.7419932238061753, "grad_norm": 1.9035310745239258, "learning_rate": 0.00042145517329946776, "loss": 5.3457, "step": 327000 }, { "epoch": 1.7446568220077139, "grad_norm": 2.0777719020843506, "learning_rate": 0.0004205633481257402, "loss": 5.3519, "step": 327500 }, { "epoch": 1.7473204202092523, "grad_norm": 2.2958412170410156, "learning_rate": 0.0004196715229520127, "loss": 5.3455, "step": 328000 }, { "epoch": 1.7499840184107907, "grad_norm": 2.3482723236083984, "learning_rate": 0.0004187796977782851, "loss": 5.3513, "step": 328500 }, { "epoch": 1.7526476166123293, "grad_norm": 2.4552931785583496, "learning_rate": 0.00041788787260455755, "loss": 5.3496, "step": 329000 }, { "epoch": 1.7553112148138679, "grad_norm": 2.0816726684570312, "learning_rate": 0.00041699604743083003, "loss": 5.3434, "step": 329500 }, { "epoch": 1.7579748130154063, "grad_norm": 1.869194746017456, "learning_rate": 0.00041610600590744995, "loss": 5.349, "step": 330000 }, { "epoch": 1.7606384112169446, "grad_norm": 2.020172595977783, "learning_rate": 0.00041521418073372243, "loss": 5.3489, "step": 330500 }, { "epoch": 1.7633020094184833, "grad_norm": 2.1260483264923096, "learning_rate": 0.00041432235555999487, "loss": 5.3523, "step": 331000 }, { "epoch": 1.7659656076200219, "grad_norm": 2.1546857357025146, "learning_rate": 0.0004134305303862673, "loss": 5.3414, "step": 331500 }, { "epoch": 1.7686292058215602, "grad_norm": 2.2955052852630615, "learning_rate": 0.0004125387052125398, "loss": 5.3489, "step": 332000 }, { "epoch": 1.7712928040230986, "grad_norm": 2.0505149364471436, "learning_rate": 0.0004116468800388122, "loss": 5.3543, "step": 332500 }, { "epoch": 1.7739564022246372, "grad_norm": 1.9976879358291626, 
"learning_rate": 0.0004107550548650847, "loss": 5.3455, "step": 333000 }, { "epoch": 1.7766200004261758, "grad_norm": 2.1872785091400146, "learning_rate": 0.00040986322969135714, "loss": 5.345, "step": 333500 }, { "epoch": 1.7792835986277142, "grad_norm": 2.025681257247925, "learning_rate": 0.00040897318816797705, "loss": 5.3559, "step": 334000 }, { "epoch": 1.7819471968292526, "grad_norm": 2.051701307296753, "learning_rate": 0.00040808136299424954, "loss": 5.3424, "step": 334500 }, { "epoch": 1.7846107950307912, "grad_norm": 2.161292314529419, "learning_rate": 0.00040718953782052197, "loss": 5.3418, "step": 335000 }, { "epoch": 1.7872743932323298, "grad_norm": 2.1306283473968506, "learning_rate": 0.00040629771264679446, "loss": 5.352, "step": 335500 }, { "epoch": 1.7899379914338682, "grad_norm": 2.1994986534118652, "learning_rate": 0.00040540767112341437, "loss": 5.348, "step": 336000 }, { "epoch": 1.7926015896354066, "grad_norm": 2.3227968215942383, "learning_rate": 0.00040451762960003423, "loss": 5.3444, "step": 336500 }, { "epoch": 1.7952651878369452, "grad_norm": 2.1397862434387207, "learning_rate": 0.0004036258044263067, "loss": 5.3556, "step": 337000 }, { "epoch": 1.7979287860384838, "grad_norm": 2.0676870346069336, "learning_rate": 0.00040273397925257915, "loss": 5.3471, "step": 337500 }, { "epoch": 1.8005923842400222, "grad_norm": 2.2523062229156494, "learning_rate": 0.0004018421540788516, "loss": 5.3431, "step": 338000 }, { "epoch": 1.8032559824415606, "grad_norm": 2.1115000247955322, "learning_rate": 0.00040095211255547155, "loss": 5.3467, "step": 338500 }, { "epoch": 1.8059195806430992, "grad_norm": 2.0157132148742676, "learning_rate": 0.000400060287381744, "loss": 5.3462, "step": 339000 }, { "epoch": 1.8085831788446376, "grad_norm": 2.1384365558624268, "learning_rate": 0.0003991684622080165, "loss": 5.3381, "step": 339500 }, { "epoch": 1.811246777046176, "grad_norm": 2.016707420349121, "learning_rate": 0.0003982766370342889, "loss": 5.3424, "step": 
340000 }, { "epoch": 1.8139103752477146, "grad_norm": 1.9890104532241821, "learning_rate": 0.00039738481186056134, "loss": 5.3459, "step": 340500 }, { "epoch": 1.8165739734492532, "grad_norm": 1.997981309890747, "learning_rate": 0.0003964947703371813, "loss": 5.3415, "step": 341000 }, { "epoch": 1.8192375716507916, "grad_norm": 2.077340602874756, "learning_rate": 0.00039560294516345374, "loss": 5.3401, "step": 341500 }, { "epoch": 1.82190116985233, "grad_norm": 1.9495571851730347, "learning_rate": 0.00039471111998972617, "loss": 5.3461, "step": 342000 }, { "epoch": 1.8245647680538686, "grad_norm": 2.086167097091675, "learning_rate": 0.00039381929481599866, "loss": 5.3457, "step": 342500 }, { "epoch": 1.8272283662554072, "grad_norm": 1.9157156944274902, "learning_rate": 0.0003929274696422711, "loss": 5.3374, "step": 343000 }, { "epoch": 1.8298919644569456, "grad_norm": 2.2283830642700195, "learning_rate": 0.0003920356444685436, "loss": 5.3403, "step": 343500 }, { "epoch": 1.832555562658484, "grad_norm": 2.155780553817749, "learning_rate": 0.00039114560294516344, "loss": 5.3403, "step": 344000 }, { "epoch": 1.8352191608600226, "grad_norm": 2.0122015476226807, "learning_rate": 0.00039025377777143587, "loss": 5.3485, "step": 344500 }, { "epoch": 1.8378827590615612, "grad_norm": 2.1252944469451904, "learning_rate": 0.00038936195259770836, "loss": 5.3534, "step": 345000 }, { "epoch": 1.8405463572630996, "grad_norm": 2.16573166847229, "learning_rate": 0.00038847012742398084, "loss": 5.3407, "step": 345500 }, { "epoch": 1.843209955464638, "grad_norm": 2.043785810470581, "learning_rate": 0.0003875800859006007, "loss": 5.3441, "step": 346000 }, { "epoch": 1.8458735536661766, "grad_norm": 2.0578818321228027, "learning_rate": 0.0003866882607268732, "loss": 5.344, "step": 346500 }, { "epoch": 1.8485371518677152, "grad_norm": 2.344649076461792, "learning_rate": 0.0003857964355531456, "loss": 5.3401, "step": 347000 }, { "epoch": 1.8512007500692536, "grad_norm": 
2.2246205806732178, "learning_rate": 0.0003849046103794181, "loss": 5.3474, "step": 347500 }, { "epoch": 1.853864348270792, "grad_norm": 2.3041775226593018, "learning_rate": 0.00038401278520569054, "loss": 5.3403, "step": 348000 }, { "epoch": 1.8565279464723305, "grad_norm": 2.0579144954681396, "learning_rate": 0.00038312096003196303, "loss": 5.3388, "step": 348500 }, { "epoch": 1.8591915446738692, "grad_norm": 2.1944098472595215, "learning_rate": 0.00038223091850858294, "loss": 5.3412, "step": 349000 }, { "epoch": 1.8618551428754075, "grad_norm": 2.0834217071533203, "learning_rate": 0.0003813390933348554, "loss": 5.3465, "step": 349500 }, { "epoch": 1.864518741076946, "grad_norm": 1.9777040481567383, "learning_rate": 0.00038044726816112786, "loss": 5.3394, "step": 350000 }, { "epoch": 1.8671823392784845, "grad_norm": 2.341625690460205, "learning_rate": 0.0003795554429874003, "loss": 5.3414, "step": 350500 }, { "epoch": 1.8698459374800231, "grad_norm": 1.9645224809646606, "learning_rate": 0.0003786636178136728, "loss": 5.3429, "step": 351000 }, { "epoch": 1.8725095356815615, "grad_norm": 2.217845916748047, "learning_rate": 0.0003777717926399452, "loss": 5.3485, "step": 351500 }, { "epoch": 1.8751731338831, "grad_norm": 2.2836930751800537, "learning_rate": 0.00037687996746621765, "loss": 5.3369, "step": 352000 }, { "epoch": 1.8778367320846385, "grad_norm": 2.1809890270233154, "learning_rate": 0.00037598814229249013, "loss": 5.3375, "step": 352500 }, { "epoch": 1.8805003302861771, "grad_norm": 2.4111125469207764, "learning_rate": 0.00037509810076911005, "loss": 5.3453, "step": 353000 }, { "epoch": 1.8831639284877155, "grad_norm": 2.264157295227051, "learning_rate": 0.0003742062755953825, "loss": 5.3412, "step": 353500 }, { "epoch": 1.885827526689254, "grad_norm": 2.232529878616333, "learning_rate": 0.00037331445042165497, "loss": 5.3481, "step": 354000 }, { "epoch": 1.8884911248907925, "grad_norm": 2.0301549434661865, "learning_rate": 0.00037242440889827483, "loss": 
5.3351, "step": 354500 }, { "epoch": 1.8911547230923311, "grad_norm": 2.040621757507324, "learning_rate": 0.0003715325837245473, "loss": 5.3442, "step": 355000 }, { "epoch": 1.8938183212938695, "grad_norm": 2.085535764694214, "learning_rate": 0.0003706407585508198, "loss": 5.3302, "step": 355500 }, { "epoch": 1.896481919495408, "grad_norm": 2.1077394485473633, "learning_rate": 0.00036974893337709223, "loss": 5.3383, "step": 356000 }, { "epoch": 1.8991455176969465, "grad_norm": 2.242241621017456, "learning_rate": 0.0003688571082033647, "loss": 5.3315, "step": 356500 }, { "epoch": 1.901809115898485, "grad_norm": 2.2890877723693848, "learning_rate": 0.00036796528302963715, "loss": 5.3378, "step": 357000 }, { "epoch": 1.9044727141000233, "grad_norm": 2.3517234325408936, "learning_rate": 0.000367075241506257, "loss": 5.3369, "step": 357500 }, { "epoch": 1.9071363123015619, "grad_norm": 2.3767483234405518, "learning_rate": 0.0003661834163325295, "loss": 5.3365, "step": 358000 }, { "epoch": 1.9097999105031005, "grad_norm": 2.2238335609436035, "learning_rate": 0.00036529159115880193, "loss": 5.3353, "step": 358500 }, { "epoch": 1.9124635087046389, "grad_norm": 2.0594356060028076, "learning_rate": 0.0003643997659850744, "loss": 5.3346, "step": 359000 }, { "epoch": 1.9151271069061773, "grad_norm": 2.1106550693511963, "learning_rate": 0.00036350794081134685, "loss": 5.3317, "step": 359500 }, { "epoch": 1.9177907051077159, "grad_norm": 2.0819623470306396, "learning_rate": 0.00036261611563761934, "loss": 5.332, "step": 360000 }, { "epoch": 1.9204543033092545, "grad_norm": 1.9421486854553223, "learning_rate": 0.00036172607411423925, "loss": 5.3425, "step": 360500 }, { "epoch": 1.9231179015107929, "grad_norm": 2.304370641708374, "learning_rate": 0.0003608342489405117, "loss": 5.3278, "step": 361000 }, { "epoch": 1.9257814997123313, "grad_norm": 1.9409058094024658, "learning_rate": 0.00035994242376678417, "loss": 5.3364, "step": 361500 }, { "epoch": 1.9284450979138699, 
"grad_norm": 2.199068307876587, "learning_rate": 0.0003590505985930566, "loss": 5.3375, "step": 362000 }, { "epoch": 1.9311086961154085, "grad_norm": 2.4809699058532715, "learning_rate": 0.0003581587734193291, "loss": 5.3304, "step": 362500 }, { "epoch": 1.9337722943169469, "grad_norm": 1.8762375116348267, "learning_rate": 0.000357268731895949, "loss": 5.3396, "step": 363000 }, { "epoch": 1.9364358925184852, "grad_norm": 2.14876651763916, "learning_rate": 0.00035637690672222144, "loss": 5.3295, "step": 363500 }, { "epoch": 1.9390994907200239, "grad_norm": 2.0710737705230713, "learning_rate": 0.0003554850815484939, "loss": 5.3319, "step": 364000 }, { "epoch": 1.9417630889215625, "grad_norm": 2.1879022121429443, "learning_rate": 0.00035459325637476636, "loss": 5.3353, "step": 364500 }, { "epoch": 1.9444266871231008, "grad_norm": 2.2101471424102783, "learning_rate": 0.0003537014312010388, "loss": 5.3365, "step": 365000 }, { "epoch": 1.9470902853246392, "grad_norm": 2.1538619995117188, "learning_rate": 0.0003528113896776587, "loss": 5.3345, "step": 365500 }, { "epoch": 1.9497538835261778, "grad_norm": 2.3958141803741455, "learning_rate": 0.0003519195645039312, "loss": 5.3298, "step": 366000 }, { "epoch": 1.9524174817277165, "grad_norm": 2.2059667110443115, "learning_rate": 0.0003510277393302037, "loss": 5.3228, "step": 366500 }, { "epoch": 1.9550810799292548, "grad_norm": 2.0048577785491943, "learning_rate": 0.0003501359141564761, "loss": 5.3336, "step": 367000 }, { "epoch": 1.9577446781307932, "grad_norm": 2.0165789127349854, "learning_rate": 0.00034924408898274854, "loss": 5.3342, "step": 367500 }, { "epoch": 1.9604082763323318, "grad_norm": 2.2053885459899902, "learning_rate": 0.00034835226380902103, "loss": 5.3359, "step": 368000 }, { "epoch": 1.9630718745338704, "grad_norm": 2.316288948059082, "learning_rate": 0.0003474622222856409, "loss": 5.3344, "step": 368500 }, { "epoch": 1.9657354727354088, "grad_norm": 2.385871410369873, "learning_rate": 
0.0003465703971119133, "loss": 5.3364, "step": 369000 }, { "epoch": 1.9683990709369472, "grad_norm": 2.3206396102905273, "learning_rate": 0.0003456785719381858, "loss": 5.3309, "step": 369500 }, { "epoch": 1.9710626691384858, "grad_norm": 2.172229766845703, "learning_rate": 0.00034478674676445824, "loss": 5.3338, "step": 370000 }, { "epoch": 1.9737262673400244, "grad_norm": 2.3812954425811768, "learning_rate": 0.0003438967052410782, "loss": 5.3306, "step": 370500 }, { "epoch": 1.9763898655415628, "grad_norm": 2.1423757076263428, "learning_rate": 0.00034300488006735064, "loss": 5.3406, "step": 371000 }, { "epoch": 1.9790534637431012, "grad_norm": 2.2044973373413086, "learning_rate": 0.0003421130548936231, "loss": 5.3371, "step": 371500 }, { "epoch": 1.9817170619446398, "grad_norm": 1.944014549255371, "learning_rate": 0.00034122122971989556, "loss": 5.3348, "step": 372000 }, { "epoch": 1.9843806601461784, "grad_norm": 2.3091371059417725, "learning_rate": 0.000340329404546168, "loss": 5.3283, "step": 372500 }, { "epoch": 1.9870442583477168, "grad_norm": 2.600417137145996, "learning_rate": 0.0003394375793724405, "loss": 5.3292, "step": 373000 }, { "epoch": 1.9897078565492552, "grad_norm": 2.0236728191375732, "learning_rate": 0.0003385457541987129, "loss": 5.3353, "step": 373500 }, { "epoch": 1.9923714547507938, "grad_norm": 2.298342227935791, "learning_rate": 0.00033765392902498535, "loss": 5.3355, "step": 374000 }, { "epoch": 1.9950350529523324, "grad_norm": 1.945620059967041, "learning_rate": 0.0003367638875016053, "loss": 5.3302, "step": 374500 }, { "epoch": 1.9976986511538706, "grad_norm": 2.1642651557922363, "learning_rate": 0.0003358738459782252, "loss": 5.3259, "step": 375000 }, { "epoch": 2.000362249355409, "grad_norm": 2.149771213531494, "learning_rate": 0.0003349820208044976, "loss": 5.3347, "step": 375500 }, { "epoch": 2.003025847556948, "grad_norm": 2.2164316177368164, "learning_rate": 0.0003340901956307701, "loss": 5.3308, "step": 376000 }, { "epoch": 
2.0056894457584864, "grad_norm": 2.2055323123931885, "learning_rate": 0.0003331983704570426, "loss": 5.332, "step": 376500 }, { "epoch": 2.0083530439600246, "grad_norm": 2.1814560890197754, "learning_rate": 0.00033230654528331507, "loss": 5.3239, "step": 377000 }, { "epoch": 2.011016642161563, "grad_norm": 2.1237363815307617, "learning_rate": 0.0003314147201095875, "loss": 5.3364, "step": 377500 }, { "epoch": 2.013680240363102, "grad_norm": 2.1073851585388184, "learning_rate": 0.00033052467858620736, "loss": 5.3209, "step": 378000 }, { "epoch": 2.0163438385646404, "grad_norm": 1.9759477376937866, "learning_rate": 0.00032963285341247985, "loss": 5.3272, "step": 378500 }, { "epoch": 2.0190074367661786, "grad_norm": 2.100966691970825, "learning_rate": 0.0003287410282387523, "loss": 5.3226, "step": 379000 }, { "epoch": 2.021671034967717, "grad_norm": 2.141537666320801, "learning_rate": 0.00032784920306502477, "loss": 5.3305, "step": 379500 }, { "epoch": 2.0243346331692558, "grad_norm": 2.2714550495147705, "learning_rate": 0.0003269573778912972, "loss": 5.3335, "step": 380000 }, { "epoch": 2.0269982313707944, "grad_norm": 2.1945018768310547, "learning_rate": 0.0003260673363679171, "loss": 5.3267, "step": 380500 }, { "epoch": 2.0296618295723325, "grad_norm": 2.269015312194824, "learning_rate": 0.0003251755111941896, "loss": 5.3346, "step": 381000 }, { "epoch": 2.032325427773871, "grad_norm": 2.194460391998291, "learning_rate": 0.00032428368602046203, "loss": 5.3216, "step": 381500 }, { "epoch": 2.0349890259754098, "grad_norm": 2.1248984336853027, "learning_rate": 0.0003233918608467345, "loss": 5.3294, "step": 382000 }, { "epoch": 2.0376526241769484, "grad_norm": 2.213801622390747, "learning_rate": 0.00032250003567300695, "loss": 5.3282, "step": 382500 }, { "epoch": 2.0403162223784865, "grad_norm": 2.0801334381103516, "learning_rate": 0.0003216082104992794, "loss": 5.3293, "step": 383000 }, { "epoch": 2.042979820580025, "grad_norm": 2.191882371902466, "learning_rate": 
0.00032071816897589935, "loss": 5.3297, "step": 383500 }, { "epoch": 2.0456434187815637, "grad_norm": 2.238471031188965, "learning_rate": 0.0003198263438021718, "loss": 5.3274, "step": 384000 }, { "epoch": 2.0483070169831024, "grad_norm": 2.0454585552215576, "learning_rate": 0.0003189345186284443, "loss": 5.3335, "step": 384500 }, { "epoch": 2.0509706151846405, "grad_norm": 2.449857473373413, "learning_rate": 0.0003180426934547167, "loss": 5.3243, "step": 385000 }, { "epoch": 2.053634213386179, "grad_norm": 2.182969331741333, "learning_rate": 0.00031715265193133657, "loss": 5.3239, "step": 385500 }, { "epoch": 2.0562978115877177, "grad_norm": 2.3800108432769775, "learning_rate": 0.00031626082675760905, "loss": 5.3263, "step": 386000 }, { "epoch": 2.058961409789256, "grad_norm": 2.4917428493499756, "learning_rate": 0.0003153690015838815, "loss": 5.3252, "step": 386500 }, { "epoch": 2.0616250079907945, "grad_norm": 2.25253963470459, "learning_rate": 0.00031447717641015397, "loss": 5.3323, "step": 387000 }, { "epoch": 2.064288606192333, "grad_norm": 2.1959807872772217, "learning_rate": 0.00031358535123642646, "loss": 5.3257, "step": 387500 }, { "epoch": 2.0669522043938717, "grad_norm": 2.202449321746826, "learning_rate": 0.0003126935260626989, "loss": 5.3256, "step": 388000 }, { "epoch": 2.06961580259541, "grad_norm": 2.093303918838501, "learning_rate": 0.0003118017008889714, "loss": 5.3302, "step": 388500 }, { "epoch": 2.0722794007969485, "grad_norm": 2.139282464981079, "learning_rate": 0.0003109098757152438, "loss": 5.3298, "step": 389000 }, { "epoch": 2.074942998998487, "grad_norm": 2.004852533340454, "learning_rate": 0.00031001983419186367, "loss": 5.3329, "step": 389500 }, { "epoch": 2.0776065972000257, "grad_norm": 2.385274648666382, "learning_rate": 0.00030912800901813616, "loss": 5.3266, "step": 390000 }, { "epoch": 2.080270195401564, "grad_norm": 2.218735456466675, "learning_rate": 0.0003082361838444086, "loss": 5.329, "step": 390500 }, { "epoch": 
2.0829337936031025, "grad_norm": 2.271380662918091, "learning_rate": 0.0003073443586706811, "loss": 5.3239, "step": 391000 }, { "epoch": 2.085597391804641, "grad_norm": 2.526583433151245, "learning_rate": 0.000306454317147301, "loss": 5.3287, "step": 391500 }, { "epoch": 2.0882609900061797, "grad_norm": 2.1075544357299805, "learning_rate": 0.0003055624919735734, "loss": 5.3264, "step": 392000 }, { "epoch": 2.090924588207718, "grad_norm": 2.0297112464904785, "learning_rate": 0.0003046706667998459, "loss": 5.3279, "step": 392500 }, { "epoch": 2.0935881864092565, "grad_norm": 2.0166475772857666, "learning_rate": 0.00030377884162611834, "loss": 5.3279, "step": 393000 }, { "epoch": 2.096251784610795, "grad_norm": 2.398573398590088, "learning_rate": 0.00030288880010273826, "loss": 5.325, "step": 393500 }, { "epoch": 2.0989153828123337, "grad_norm": 2.2096564769744873, "learning_rate": 0.00030199697492901075, "loss": 5.3241, "step": 394000 }, { "epoch": 2.101578981013872, "grad_norm": 2.2474560737609863, "learning_rate": 0.0003011051497552832, "loss": 5.3232, "step": 394500 }, { "epoch": 2.1042425792154105, "grad_norm": 2.2487635612487793, "learning_rate": 0.00030021332458155566, "loss": 5.3191, "step": 395000 }, { "epoch": 2.106906177416949, "grad_norm": 2.094921112060547, "learning_rate": 0.0002993214994078281, "loss": 5.3354, "step": 395500 }, { "epoch": 2.1095697756184877, "grad_norm": 2.2288858890533447, "learning_rate": 0.00029843145788444796, "loss": 5.3254, "step": 396000 }, { "epoch": 2.112233373820026, "grad_norm": 2.166731595993042, "learning_rate": 0.00029753963271072044, "loss": 5.3239, "step": 396500 }, { "epoch": 2.1148969720215645, "grad_norm": 2.05653715133667, "learning_rate": 0.00029664780753699293, "loss": 5.3305, "step": 397000 }, { "epoch": 2.117560570223103, "grad_norm": 2.08963942527771, "learning_rate": 0.0002957559823632654, "loss": 5.3255, "step": 397500 }, { "epoch": 2.1202241684246417, "grad_norm": 2.268559217453003, "learning_rate": 
0.0002948659408398853, "loss": 5.3238, "step": 398000 }, { "epoch": 2.12288776662618, "grad_norm": 2.9195141792297363, "learning_rate": 0.0002939741156661577, "loss": 5.3211, "step": 398500 }, { "epoch": 2.1255513648277184, "grad_norm": 2.2552900314331055, "learning_rate": 0.0002930822904924302, "loss": 5.3251, "step": 399000 }, { "epoch": 2.128214963029257, "grad_norm": 2.294832706451416, "learning_rate": 0.00029219046531870263, "loss": 5.32, "step": 399500 }, { "epoch": 2.1308785612307957, "grad_norm": 2.3486320972442627, "learning_rate": 0.0002912986401449751, "loss": 5.3197, "step": 400000 }, { "epoch": 2.133542159432334, "grad_norm": 2.497387647628784, "learning_rate": 0.00029040681497124755, "loss": 5.3235, "step": 400500 }, { "epoch": 2.1362057576338724, "grad_norm": 2.3829433917999268, "learning_rate": 0.00028951498979752, "loss": 5.3145, "step": 401000 }, { "epoch": 2.138869355835411, "grad_norm": 2.064811944961548, "learning_rate": 0.00028862316462379247, "loss": 5.3168, "step": 401500 }, { "epoch": 2.1415329540369497, "grad_norm": 2.194028377532959, "learning_rate": 0.0002877331231004124, "loss": 5.3221, "step": 402000 }, { "epoch": 2.144196552238488, "grad_norm": 2.1182937622070312, "learning_rate": 0.0002868412979266848, "loss": 5.321, "step": 402500 }, { "epoch": 2.1468601504400264, "grad_norm": 2.3992223739624023, "learning_rate": 0.0002859494727529573, "loss": 5.3237, "step": 403000 }, { "epoch": 2.149523748641565, "grad_norm": 2.256955623626709, "learning_rate": 0.00028505764757922973, "loss": 5.3144, "step": 403500 }, { "epoch": 2.152187346843103, "grad_norm": 2.3727059364318848, "learning_rate": 0.0002841658224055022, "loss": 5.3238, "step": 404000 }, { "epoch": 2.154850945044642, "grad_norm": 2.1184160709381104, "learning_rate": 0.00028327399723177465, "loss": 5.3196, "step": 404500 }, { "epoch": 2.1575145432461804, "grad_norm": 2.1502108573913574, "learning_rate": 0.00028238217205804714, "loss": 5.3141, "step": 405000 }, { "epoch": 
2.160178141447719, "grad_norm": 2.176964521408081, "learning_rate": 0.00028149034688431957, "loss": 5.3187, "step": 405500 }, { "epoch": 2.162841739649257, "grad_norm": 2.144890069961548, "learning_rate": 0.0002806003053609395, "loss": 5.3199, "step": 406000 }, { "epoch": 2.165505337850796, "grad_norm": 2.17976975440979, "learning_rate": 0.000279708480187212, "loss": 5.318, "step": 406500 }, { "epoch": 2.1681689360523344, "grad_norm": 2.181568145751953, "learning_rate": 0.0002788166550134844, "loss": 5.3214, "step": 407000 }, { "epoch": 2.170832534253873, "grad_norm": 2.299090623855591, "learning_rate": 0.0002779248298397569, "loss": 5.3225, "step": 407500 }, { "epoch": 2.173496132455411, "grad_norm": 2.189419746398926, "learning_rate": 0.0002770347883163768, "loss": 5.3193, "step": 408000 }, { "epoch": 2.17615973065695, "grad_norm": 2.274648904800415, "learning_rate": 0.00027614296314264924, "loss": 5.3218, "step": 408500 }, { "epoch": 2.1788233288584884, "grad_norm": 2.1534972190856934, "learning_rate": 0.0002752511379689217, "loss": 5.3173, "step": 409000 }, { "epoch": 2.181486927060027, "grad_norm": 2.3284084796905518, "learning_rate": 0.00027435931279519416, "loss": 5.3126, "step": 409500 }, { "epoch": 2.184150525261565, "grad_norm": 2.286384344100952, "learning_rate": 0.0002734674876214666, "loss": 5.3232, "step": 410000 }, { "epoch": 2.1868141234631038, "grad_norm": 2.111091375350952, "learning_rate": 0.0002725774460980865, "loss": 5.3163, "step": 410500 }, { "epoch": 2.1894777216646424, "grad_norm": 2.361741304397583, "learning_rate": 0.00027168562092435894, "loss": 5.3212, "step": 411000 }, { "epoch": 2.192141319866181, "grad_norm": 2.497840642929077, "learning_rate": 0.0002707937957506314, "loss": 5.3238, "step": 411500 }, { "epoch": 2.194804918067719, "grad_norm": 2.227203607559204, "learning_rate": 0.00026990197057690386, "loss": 5.323, "step": 412000 }, { "epoch": 2.1974685162692578, "grad_norm": 2.2768001556396484, "learning_rate": 
0.00026901192905352377, "loss": 5.3182, "step": 412500 }, { "epoch": 2.2001321144707964, "grad_norm": 2.157787799835205, "learning_rate": 0.00026812010387979626, "loss": 5.3246, "step": 413000 }, { "epoch": 2.202795712672335, "grad_norm": 2.3759965896606445, "learning_rate": 0.0002672282787060687, "loss": 5.3207, "step": 413500 }, { "epoch": 2.205459310873873, "grad_norm": 2.210963487625122, "learning_rate": 0.0002663364535323411, "loss": 5.3155, "step": 414000 }, { "epoch": 2.2081229090754118, "grad_norm": 2.265197277069092, "learning_rate": 0.0002654446283586136, "loss": 5.3194, "step": 414500 }, { "epoch": 2.2107865072769504, "grad_norm": 2.110173225402832, "learning_rate": 0.0002645545868352335, "loss": 5.3144, "step": 415000 }, { "epoch": 2.213450105478489, "grad_norm": 2.235196590423584, "learning_rate": 0.000263662761661506, "loss": 5.323, "step": 415500 }, { "epoch": 2.216113703680027, "grad_norm": 2.305601119995117, "learning_rate": 0.00026277093648777844, "loss": 5.3187, "step": 416000 }, { "epoch": 2.2187773018815657, "grad_norm": 2.401959180831909, "learning_rate": 0.0002618791113140509, "loss": 5.3175, "step": 416500 }, { "epoch": 2.2214409000831044, "grad_norm": 2.163121223449707, "learning_rate": 0.0002609890697906708, "loss": 5.3169, "step": 417000 }, { "epoch": 2.224104498284643, "grad_norm": 2.265998363494873, "learning_rate": 0.0002600972446169432, "loss": 5.3173, "step": 417500 }, { "epoch": 2.226768096486181, "grad_norm": 2.236154317855835, "learning_rate": 0.00025920541944321577, "loss": 5.3167, "step": 418000 }, { "epoch": 2.2294316946877197, "grad_norm": 2.1707651615142822, "learning_rate": 0.0002583135942694882, "loss": 5.3184, "step": 418500 }, { "epoch": 2.2320952928892583, "grad_norm": 2.121073007583618, "learning_rate": 0.00025742355274610806, "loss": 5.3171, "step": 419000 }, { "epoch": 2.234758891090797, "grad_norm": 2.2292840480804443, "learning_rate": 0.00025653172757238055, "loss": 5.3185, "step": 419500 }, { "epoch": 
2.237422489292335, "grad_norm": 2.2376914024353027, "learning_rate": 0.000255639902398653, "loss": 5.3143, "step": 420000 }, { "epoch": 2.2400860874938737, "grad_norm": 2.2844974994659424, "learning_rate": 0.0002547480772249254, "loss": 5.3039, "step": 420500 }, { "epoch": 2.2427496856954123, "grad_norm": 2.278136968612671, "learning_rate": 0.0002538562520511979, "loss": 5.3159, "step": 421000 }, { "epoch": 2.2454132838969505, "grad_norm": 2.3182220458984375, "learning_rate": 0.00025296442687747033, "loss": 5.319, "step": 421500 }, { "epoch": 2.248076882098489, "grad_norm": 2.5095927715301514, "learning_rate": 0.0002520743853540903, "loss": 5.3174, "step": 422000 }, { "epoch": 2.2507404803000277, "grad_norm": 2.3167264461517334, "learning_rate": 0.00025118256018036273, "loss": 5.3131, "step": 422500 }, { "epoch": 2.2534040785015663, "grad_norm": 2.211766481399536, "learning_rate": 0.00025029073500663516, "loss": 5.325, "step": 423000 }, { "epoch": 2.256067676703105, "grad_norm": 2.1502010822296143, "learning_rate": 0.00024939890983290765, "loss": 5.3139, "step": 423500 }, { "epoch": 2.258731274904643, "grad_norm": 2.1429567337036133, "learning_rate": 0.00024850886830952756, "loss": 5.3147, "step": 424000 }, { "epoch": 2.2613948731061817, "grad_norm": 2.272367238998413, "learning_rate": 0.0002476170431358, "loss": 5.3128, "step": 424500 }, { "epoch": 2.2640584713077203, "grad_norm": 2.6372079849243164, "learning_rate": 0.00024672521796207243, "loss": 5.3134, "step": 425000 }, { "epoch": 2.2667220695092585, "grad_norm": 2.4213263988494873, "learning_rate": 0.0002458333927883449, "loss": 5.3142, "step": 425500 }, { "epoch": 2.269385667710797, "grad_norm": 2.2919113636016846, "learning_rate": 0.0002449415676146174, "loss": 5.3199, "step": 426000 }, { "epoch": 2.2720492659123357, "grad_norm": 2.1887030601501465, "learning_rate": 0.0002440515260912373, "loss": 5.3168, "step": 426500 }, { "epoch": 2.2747128641138743, "grad_norm": 2.2401158809661865, "learning_rate": 
0.00024315970091750975, "loss": 5.3142, "step": 427000 }, { "epoch": 2.2773764623154125, "grad_norm": 2.264155864715576, "learning_rate": 0.0002422678757437822, "loss": 5.3063, "step": 427500 }, { "epoch": 2.280040060516951, "grad_norm": 2.372823476791382, "learning_rate": 0.00024137605057005467, "loss": 5.3146, "step": 428000 }, { "epoch": 2.2827036587184897, "grad_norm": 2.5441572666168213, "learning_rate": 0.00024048600904667456, "loss": 5.3129, "step": 428500 }, { "epoch": 2.2853672569200283, "grad_norm": 2.107741594314575, "learning_rate": 0.00023959418387294702, "loss": 5.3112, "step": 429000 }, { "epoch": 2.2880308551215665, "grad_norm": 2.1812095642089844, "learning_rate": 0.00023870235869921948, "loss": 5.3144, "step": 429500 }, { "epoch": 2.290694453323105, "grad_norm": 2.3959500789642334, "learning_rate": 0.00023781053352549194, "loss": 5.3108, "step": 430000 }, { "epoch": 2.2933580515246437, "grad_norm": 2.3315865993499756, "learning_rate": 0.00023691870835176437, "loss": 5.3093, "step": 430500 }, { "epoch": 2.2960216497261823, "grad_norm": 2.0199296474456787, "learning_rate": 0.00023602688317803685, "loss": 5.3209, "step": 431000 }, { "epoch": 2.2986852479277204, "grad_norm": 2.2393200397491455, "learning_rate": 0.00023513684165465677, "loss": 5.3074, "step": 431500 }, { "epoch": 2.301348846129259, "grad_norm": 2.4474637508392334, "learning_rate": 0.00023424501648092923, "loss": 5.3158, "step": 432000 }, { "epoch": 2.3040124443307977, "grad_norm": 2.3248863220214844, "learning_rate": 0.00023335319130720166, "loss": 5.3157, "step": 432500 }, { "epoch": 2.3066760425323363, "grad_norm": 2.4158935546875, "learning_rate": 0.00023246136613347412, "loss": 5.3092, "step": 433000 }, { "epoch": 2.3093396407338744, "grad_norm": 2.084850549697876, "learning_rate": 0.00023156954095974658, "loss": 5.3178, "step": 433500 }, { "epoch": 2.312003238935413, "grad_norm": 2.319776773452759, "learning_rate": 0.0002306794994363665, "loss": 5.3074, "step": 434000 }, { 
"epoch": 2.3146668371369516, "grad_norm": 2.2137837409973145, "learning_rate": 0.00022978767426263893, "loss": 5.3073, "step": 434500 }, { "epoch": 2.31733043533849, "grad_norm": 2.4062960147857666, "learning_rate": 0.0002288958490889114, "loss": 5.3112, "step": 435000 }, { "epoch": 2.3199940335400284, "grad_norm": 2.27229380607605, "learning_rate": 0.00022800402391518385, "loss": 5.3114, "step": 435500 }, { "epoch": 2.322657631741567, "grad_norm": 2.499032974243164, "learning_rate": 0.00022711219874145633, "loss": 5.3132, "step": 436000 }, { "epoch": 2.3253212299431056, "grad_norm": 2.071829080581665, "learning_rate": 0.00022622215721807625, "loss": 5.3181, "step": 436500 }, { "epoch": 2.3279848281446442, "grad_norm": 2.4178686141967773, "learning_rate": 0.00022533033204434868, "loss": 5.3079, "step": 437000 }, { "epoch": 2.3306484263461824, "grad_norm": 2.431913375854492, "learning_rate": 0.00022443850687062114, "loss": 5.311, "step": 437500 }, { "epoch": 2.333312024547721, "grad_norm": 2.3519508838653564, "learning_rate": 0.0002235466816968936, "loss": 5.3149, "step": 438000 }, { "epoch": 2.3359756227492596, "grad_norm": 2.286878824234009, "learning_rate": 0.00022265485652316606, "loss": 5.312, "step": 438500 }, { "epoch": 2.338639220950798, "grad_norm": 2.3200433254241943, "learning_rate": 0.00022176481499978595, "loss": 5.2989, "step": 439000 }, { "epoch": 2.3413028191523364, "grad_norm": 2.165735960006714, "learning_rate": 0.0002208729898260584, "loss": 5.3169, "step": 439500 }, { "epoch": 2.343966417353875, "grad_norm": 2.0269339084625244, "learning_rate": 0.00021998116465233087, "loss": 5.3088, "step": 440000 }, { "epoch": 2.3466300155554136, "grad_norm": 2.2074029445648193, "learning_rate": 0.00021908933947860333, "loss": 5.3096, "step": 440500 }, { "epoch": 2.3492936137569522, "grad_norm": 2.7109835147857666, "learning_rate": 0.0002181975143048758, "loss": 5.3089, "step": 441000 }, { "epoch": 2.3519572119584904, "grad_norm": 2.2240071296691895, 
"learning_rate": 0.0002173074727814957, "loss": 5.3148, "step": 441500 }, { "epoch": 2.354620810160029, "grad_norm": 2.26788330078125, "learning_rate": 0.00021641564760776816, "loss": 5.3113, "step": 442000 }, { "epoch": 2.3572844083615676, "grad_norm": 2.389122486114502, "learning_rate": 0.00021552382243404062, "loss": 5.3133, "step": 442500 }, { "epoch": 2.3599480065631058, "grad_norm": 2.382267475128174, "learning_rate": 0.00021463199726031308, "loss": 5.3129, "step": 443000 }, { "epoch": 2.3626116047646444, "grad_norm": 2.411574363708496, "learning_rate": 0.00021374195573693297, "loss": 5.3022, "step": 443500 }, { "epoch": 2.365275202966183, "grad_norm": 2.348522424697876, "learning_rate": 0.00021285013056320543, "loss": 5.3137, "step": 444000 }, { "epoch": 2.3679388011677216, "grad_norm": 2.3230319023132324, "learning_rate": 0.00021195830538947789, "loss": 5.3059, "step": 444500 }, { "epoch": 2.3706023993692598, "grad_norm": 2.2816174030303955, "learning_rate": 0.00021106648021575035, "loss": 5.3117, "step": 445000 }, { "epoch": 2.3732659975707984, "grad_norm": 2.400097370147705, "learning_rate": 0.0002101746550420228, "loss": 5.3095, "step": 445500 }, { "epoch": 2.375929595772337, "grad_norm": 2.470815896987915, "learning_rate": 0.00020928461351864272, "loss": 5.3027, "step": 446000 }, { "epoch": 2.3785931939738756, "grad_norm": 2.1947262287139893, "learning_rate": 0.00020839278834491518, "loss": 5.3031, "step": 446500 }, { "epoch": 2.3812567921754138, "grad_norm": 2.3549935817718506, "learning_rate": 0.00020750096317118764, "loss": 5.3083, "step": 447000 }, { "epoch": 2.3839203903769524, "grad_norm": 2.457932949066162, "learning_rate": 0.0002066091379974601, "loss": 5.3052, "step": 447500 }, { "epoch": 2.386583988578491, "grad_norm": 2.2867889404296875, "learning_rate": 0.00020571909647407999, "loss": 5.3155, "step": 448000 }, { "epoch": 2.3892475867800296, "grad_norm": 2.061497688293457, "learning_rate": 0.00020482727130035245, "loss": 5.3087, "step": 
448500 }, { "epoch": 2.3919111849815677, "grad_norm": 2.2757697105407715, "learning_rate": 0.0002039354461266249, "loss": 5.3095, "step": 449000 }, { "epoch": 2.3945747831831063, "grad_norm": 2.4835853576660156, "learning_rate": 0.00020304362095289736, "loss": 5.3091, "step": 449500 }, { "epoch": 2.397238381384645, "grad_norm": 2.2896037101745605, "learning_rate": 0.00020215357942951728, "loss": 5.3124, "step": 450000 }, { "epoch": 2.3999019795861836, "grad_norm": 2.31545090675354, "learning_rate": 0.00020126175425578974, "loss": 5.31, "step": 450500 }, { "epoch": 2.4025655777877217, "grad_norm": 2.296827554702759, "learning_rate": 0.0002003699290820622, "loss": 5.3027, "step": 451000 }, { "epoch": 2.4052291759892603, "grad_norm": 2.60396671295166, "learning_rate": 0.00019947810390833466, "loss": 5.312, "step": 451500 }, { "epoch": 2.407892774190799, "grad_norm": 2.500142812728882, "learning_rate": 0.00019858627873460712, "loss": 5.2995, "step": 452000 }, { "epoch": 2.4105563723923376, "grad_norm": 2.179241180419922, "learning_rate": 0.000197696237211227, "loss": 5.3034, "step": 452500 }, { "epoch": 2.4132199705938757, "grad_norm": 2.5400588512420654, "learning_rate": 0.00019680441203749947, "loss": 5.3074, "step": 453000 }, { "epoch": 2.4158835687954143, "grad_norm": 2.4482738971710205, "learning_rate": 0.00019591258686377192, "loss": 5.301, "step": 453500 }, { "epoch": 2.418547166996953, "grad_norm": 2.3452165126800537, "learning_rate": 0.00019502076169004438, "loss": 5.311, "step": 454000 }, { "epoch": 2.4212107651984915, "grad_norm": 2.1771457195281982, "learning_rate": 0.0001941307201666643, "loss": 5.3035, "step": 454500 }, { "epoch": 2.4238743634000297, "grad_norm": 2.195034980773926, "learning_rate": 0.00019323889499293676, "loss": 5.3069, "step": 455000 }, { "epoch": 2.4265379616015683, "grad_norm": 2.3099453449249268, "learning_rate": 0.00019234706981920922, "loss": 5.3075, "step": 455500 }, { "epoch": 2.429201559803107, "grad_norm": 2.5112428665161133, 
"learning_rate": 0.00019145524464548168, "loss": 5.3093, "step": 456000 }, { "epoch": 2.431865158004645, "grad_norm": 2.470879316329956, "learning_rate": 0.00019056520312210157, "loss": 5.3021, "step": 456500 }, { "epoch": 2.4345287562061837, "grad_norm": 2.381201982498169, "learning_rate": 0.00018967337794837403, "loss": 5.304, "step": 457000 }, { "epoch": 2.4371923544077223, "grad_norm": 2.30584454536438, "learning_rate": 0.00018878155277464648, "loss": 5.3063, "step": 457500 }, { "epoch": 2.439855952609261, "grad_norm": 2.1264095306396484, "learning_rate": 0.00018788972760091894, "loss": 5.303, "step": 458000 }, { "epoch": 2.4425195508107995, "grad_norm": 2.5097908973693848, "learning_rate": 0.0001869979024271914, "loss": 5.3028, "step": 458500 }, { "epoch": 2.4451831490123377, "grad_norm": 2.1753334999084473, "learning_rate": 0.00018610786090381132, "loss": 5.303, "step": 459000 }, { "epoch": 2.4478467472138763, "grad_norm": 2.393508195877075, "learning_rate": 0.00018521603573008378, "loss": 5.3065, "step": 459500 }, { "epoch": 2.450510345415415, "grad_norm": 2.4845023155212402, "learning_rate": 0.00018432421055635624, "loss": 5.3055, "step": 460000 }, { "epoch": 2.453173943616953, "grad_norm": 2.286433458328247, "learning_rate": 0.0001834323853826287, "loss": 5.3093, "step": 460500 }, { "epoch": 2.4558375418184917, "grad_norm": 2.3205184936523438, "learning_rate": 0.00018254056020890113, "loss": 5.3046, "step": 461000 }, { "epoch": 2.4585011400200303, "grad_norm": 2.2458608150482178, "learning_rate": 0.00018165051868552104, "loss": 5.3034, "step": 461500 }, { "epoch": 2.461164738221569, "grad_norm": 2.4838719367980957, "learning_rate": 0.0001807586935117935, "loss": 5.3067, "step": 462000 }, { "epoch": 2.463828336423107, "grad_norm": 2.363417148590088, "learning_rate": 0.00017986686833806596, "loss": 5.3075, "step": 462500 }, { "epoch": 2.4664919346246457, "grad_norm": 2.1464176177978516, "learning_rate": 0.0001789750431643384, "loss": 5.2936, "step": 463000 
}, { "epoch": 2.4691555328261843, "grad_norm": 2.1444778442382812, "learning_rate": 0.00017808321799061086, "loss": 5.3012, "step": 463500 }, { "epoch": 2.471819131027723, "grad_norm": 2.1136202812194824, "learning_rate": 0.0001771931764672308, "loss": 5.2991, "step": 464000 }, { "epoch": 2.474482729229261, "grad_norm": 2.325840950012207, "learning_rate": 0.00017630135129350326, "loss": 5.3005, "step": 464500 }, { "epoch": 2.4771463274307997, "grad_norm": 2.1854569911956787, "learning_rate": 0.00017540952611977572, "loss": 5.3041, "step": 465000 }, { "epoch": 2.4798099256323383, "grad_norm": 2.247187614440918, "learning_rate": 0.00017451770094604815, "loss": 5.3038, "step": 465500 }, { "epoch": 2.482473523833877, "grad_norm": 2.3324661254882812, "learning_rate": 0.0001736258757723206, "loss": 5.2999, "step": 466000 }, { "epoch": 2.485137122035415, "grad_norm": 2.3304693698883057, "learning_rate": 0.00017273583424894052, "loss": 5.3022, "step": 466500 }, { "epoch": 2.4878007202369536, "grad_norm": 2.5459063053131104, "learning_rate": 0.00017184400907521298, "loss": 5.3082, "step": 467000 }, { "epoch": 2.4904643184384923, "grad_norm": 2.280992031097412, "learning_rate": 0.00017095218390148542, "loss": 5.3027, "step": 467500 }, { "epoch": 2.493127916640031, "grad_norm": 2.204409599304199, "learning_rate": 0.00017006035872775787, "loss": 5.3056, "step": 468000 }, { "epoch": 2.495791514841569, "grad_norm": 2.7257113456726074, "learning_rate": 0.0001691703172043778, "loss": 5.3043, "step": 468500 }, { "epoch": 2.4984551130431076, "grad_norm": 2.262225866317749, "learning_rate": 0.00016827849203065025, "loss": 5.3022, "step": 469000 }, { "epoch": 2.5011187112446462, "grad_norm": 2.167947769165039, "learning_rate": 0.0001673866668569227, "loss": 5.2977, "step": 469500 }, { "epoch": 2.5037823094461844, "grad_norm": 2.434269428253174, "learning_rate": 0.00016649484168319517, "loss": 5.3003, "step": 470000 }, { "epoch": 2.506445907647723, "grad_norm": 2.2088136672973633, 
"learning_rate": 0.00016560480015981508, "loss": 5.3048, "step": 470500 }, { "epoch": 2.5091095058492616, "grad_norm": 2.268261194229126, "learning_rate": 0.00016471297498608754, "loss": 5.3048, "step": 471000 }, { "epoch": 2.5117731040508002, "grad_norm": 2.462432384490967, "learning_rate": 0.00016382114981235998, "loss": 5.305, "step": 471500 }, { "epoch": 2.514436702252339, "grad_norm": 2.6072680950164795, "learning_rate": 0.00016292932463863243, "loss": 5.2986, "step": 472000 }, { "epoch": 2.517100300453877, "grad_norm": 2.600860118865967, "learning_rate": 0.0001620374994649049, "loss": 5.299, "step": 472500 }, { "epoch": 2.5197638986554156, "grad_norm": 2.3521888256073, "learning_rate": 0.00016114567429117735, "loss": 5.2936, "step": 473000 }, { "epoch": 2.522427496856954, "grad_norm": 2.712414026260376, "learning_rate": 0.00016025563276779724, "loss": 5.303, "step": 473500 }, { "epoch": 2.5250910950584924, "grad_norm": 2.267749071121216, "learning_rate": 0.0001593638075940697, "loss": 5.3026, "step": 474000 }, { "epoch": 2.527754693260031, "grad_norm": 2.206207275390625, "learning_rate": 0.0001584719824203422, "loss": 5.2967, "step": 474500 }, { "epoch": 2.5304182914615696, "grad_norm": 2.3536181449890137, "learning_rate": 0.00015758015724661465, "loss": 5.2971, "step": 475000 }, { "epoch": 2.533081889663108, "grad_norm": 2.229966163635254, "learning_rate": 0.00015669011572323456, "loss": 5.3002, "step": 475500 }, { "epoch": 2.535745487864647, "grad_norm": 2.391902208328247, "learning_rate": 0.000155798290549507, "loss": 5.3046, "step": 476000 }, { "epoch": 2.538409086066185, "grad_norm": 2.367274522781372, "learning_rate": 0.00015490646537577945, "loss": 5.3014, "step": 476500 }, { "epoch": 2.5410726842677236, "grad_norm": 2.398796319961548, "learning_rate": 0.00015401464020205191, "loss": 5.3012, "step": 477000 }, { "epoch": 2.543736282469262, "grad_norm": 2.2506918907165527, "learning_rate": 0.00015312281502832437, "loss": 5.3034, "step": 477500 }, { 
"epoch": 2.5463998806708004, "grad_norm": 2.4038991928100586, "learning_rate": 0.00015223277350494426, "loss": 5.298, "step": 478000 }, { "epoch": 2.549063478872339, "grad_norm": 2.2355668544769287, "learning_rate": 0.00015134094833121672, "loss": 5.2999, "step": 478500 }, { "epoch": 2.5517270770738776, "grad_norm": 2.312537908554077, "learning_rate": 0.00015044912315748918, "loss": 5.2987, "step": 479000 }, { "epoch": 2.554390675275416, "grad_norm": 2.4338889122009277, "learning_rate": 0.00014955729798376164, "loss": 5.2936, "step": 479500 }, { "epoch": 2.557054273476955, "grad_norm": 2.303349018096924, "learning_rate": 0.00014866725646038155, "loss": 5.2941, "step": 480000 }, { "epoch": 2.559717871678493, "grad_norm": 2.27744197845459, "learning_rate": 0.00014777543128665401, "loss": 5.2961, "step": 480500 }, { "epoch": 2.5623814698800316, "grad_norm": 2.364135265350342, "learning_rate": 0.00014688360611292647, "loss": 5.2982, "step": 481000 }, { "epoch": 2.56504506808157, "grad_norm": 2.652825355529785, "learning_rate": 0.00014599178093919893, "loss": 5.3072, "step": 481500 }, { "epoch": 2.5677086662831083, "grad_norm": 2.2864181995391846, "learning_rate": 0.00014510173941581882, "loss": 5.3027, "step": 482000 }, { "epoch": 2.570372264484647, "grad_norm": 2.1780378818511963, "learning_rate": 0.00014420991424209128, "loss": 5.2988, "step": 482500 }, { "epoch": 2.5730358626861856, "grad_norm": 2.4762122631073, "learning_rate": 0.00014331808906836374, "loss": 5.2963, "step": 483000 }, { "epoch": 2.575699460887724, "grad_norm": 2.3064920902252197, "learning_rate": 0.0001424262638946362, "loss": 5.304, "step": 483500 }, { "epoch": 2.5783630590892628, "grad_norm": 2.17753529548645, "learning_rate": 0.00014153443872090866, "loss": 5.2909, "step": 484000 }, { "epoch": 2.581026657290801, "grad_norm": 2.442643404006958, "learning_rate": 0.00014064439719752857, "loss": 5.3033, "step": 484500 }, { "epoch": 2.5836902554923395, "grad_norm": 2.5781943798065186, 
"learning_rate": 0.00013975257202380103, "loss": 5.2969, "step": 485000 }, { "epoch": 2.586353853693878, "grad_norm": 2.1409718990325928, "learning_rate": 0.0001388607468500735, "loss": 5.2987, "step": 485500 }, { "epoch": 2.5890174518954163, "grad_norm": 2.23543381690979, "learning_rate": 0.00013796892167634595, "loss": 5.2989, "step": 486000 }, { "epoch": 2.591681050096955, "grad_norm": 2.418957233428955, "learning_rate": 0.00013707888015296584, "loss": 5.2972, "step": 486500 }, { "epoch": 2.5943446482984935, "grad_norm": 2.292370080947876, "learning_rate": 0.0001361870549792383, "loss": 5.3005, "step": 487000 }, { "epoch": 2.5970082465000317, "grad_norm": 2.360339403152466, "learning_rate": 0.00013529522980551076, "loss": 5.2974, "step": 487500 }, { "epoch": 2.5996718447015703, "grad_norm": 2.2026000022888184, "learning_rate": 0.00013440340463178322, "loss": 5.3012, "step": 488000 }, { "epoch": 2.602335442903109, "grad_norm": 2.273235559463501, "learning_rate": 0.00013351336310840313, "loss": 5.2955, "step": 488500 }, { "epoch": 2.6049990411046475, "grad_norm": 2.349081516265869, "learning_rate": 0.0001326215379346756, "loss": 5.3026, "step": 489000 }, { "epoch": 2.607662639306186, "grad_norm": 2.4691007137298584, "learning_rate": 0.00013172971276094805, "loss": 5.2999, "step": 489500 }, { "epoch": 2.6103262375077243, "grad_norm": 2.3375978469848633, "learning_rate": 0.0001308378875872205, "loss": 5.2849, "step": 490000 }, { "epoch": 2.612989835709263, "grad_norm": 2.3784444332122803, "learning_rate": 0.0001299478460638404, "loss": 5.2937, "step": 490500 }, { "epoch": 2.6156534339108015, "grad_norm": 2.4842257499694824, "learning_rate": 0.00012905602089011286, "loss": 5.2919, "step": 491000 }, { "epoch": 2.6183170321123397, "grad_norm": 2.2826011180877686, "learning_rate": 0.00012816419571638532, "loss": 5.2902, "step": 491500 }, { "epoch": 2.6209806303138783, "grad_norm": 2.300616979598999, "learning_rate": 0.00012727237054265778, "loss": 5.3024, "step": 492000 
}, { "epoch": 2.623644228515417, "grad_norm": 2.4524025917053223, "learning_rate": 0.00012638232901927772, "loss": 5.2908, "step": 492500 }, { "epoch": 2.6263078267169555, "grad_norm": 2.3518335819244385, "learning_rate": 0.00012549050384555015, "loss": 5.2977, "step": 493000 }, { "epoch": 2.628971424918494, "grad_norm": 2.5559749603271484, "learning_rate": 0.0001245986786718226, "loss": 5.2933, "step": 493500 }, { "epoch": 2.6316350231200323, "grad_norm": 2.32487416267395, "learning_rate": 0.00012370685349809507, "loss": 5.2941, "step": 494000 }, { "epoch": 2.634298621321571, "grad_norm": 2.384162187576294, "learning_rate": 0.000122816811974715, "loss": 5.2978, "step": 494500 }, { "epoch": 2.6369622195231095, "grad_norm": 2.7350683212280273, "learning_rate": 0.00012192498680098743, "loss": 5.3015, "step": 495000 }, { "epoch": 2.6396258177246477, "grad_norm": 2.5397427082061768, "learning_rate": 0.00012103316162725988, "loss": 5.2924, "step": 495500 }, { "epoch": 2.6422894159261863, "grad_norm": 2.4719595909118652, "learning_rate": 0.00012014133645353234, "loss": 5.2982, "step": 496000 }, { "epoch": 2.644953014127725, "grad_norm": 2.7110893726348877, "learning_rate": 0.0001192495112798048, "loss": 5.2908, "step": 496500 }, { "epoch": 2.6476166123292635, "grad_norm": 2.5090041160583496, "learning_rate": 0.00011835946975642471, "loss": 5.2939, "step": 497000 }, { "epoch": 2.650280210530802, "grad_norm": 2.5113580226898193, "learning_rate": 0.00011746764458269717, "loss": 5.2935, "step": 497500 }, { "epoch": 2.6529438087323403, "grad_norm": 2.4266409873962402, "learning_rate": 0.00011657581940896962, "loss": 5.2931, "step": 498000 }, { "epoch": 2.655607406933879, "grad_norm": 2.4426701068878174, "learning_rate": 0.00011568399423524208, "loss": 5.2909, "step": 498500 }, { "epoch": 2.6582710051354175, "grad_norm": 2.5790412425994873, "learning_rate": 0.00011479216906151452, "loss": 5.2919, "step": 499000 }, { "epoch": 2.6609346033369556, "grad_norm": 2.309144973754883, 
"learning_rate": 0.00011390212753813445, "loss": 5.2967, "step": 499500 }, { "epoch": 2.6635982015384942, "grad_norm": 2.297360420227051, "learning_rate": 0.0001130103023644069, "loss": 5.2918, "step": 500000 }, { "epoch": 2.666261799740033, "grad_norm": 2.539792776107788, "learning_rate": 0.00011211847719067936, "loss": 5.2914, "step": 500500 }, { "epoch": 2.6689253979415715, "grad_norm": 2.246025800704956, "learning_rate": 0.00011122665201695182, "loss": 5.2968, "step": 501000 }, { "epoch": 2.67158899614311, "grad_norm": 2.34342885017395, "learning_rate": 0.00011033661049357173, "loss": 5.2885, "step": 501500 }, { "epoch": 2.6742525943446482, "grad_norm": 2.4776382446289062, "learning_rate": 0.00010944478531984418, "loss": 5.2944, "step": 502000 }, { "epoch": 2.676916192546187, "grad_norm": 2.583674907684326, "learning_rate": 0.00010855296014611664, "loss": 5.2907, "step": 502500 }, { "epoch": 2.6795797907477255, "grad_norm": 2.3661584854125977, "learning_rate": 0.0001076611349723891, "loss": 5.2969, "step": 503000 }, { "epoch": 2.6822433889492636, "grad_norm": 2.3716771602630615, "learning_rate": 0.00010677109344900901, "loss": 5.2993, "step": 503500 }, { "epoch": 2.6849069871508022, "grad_norm": 2.3315460681915283, "learning_rate": 0.00010587926827528146, "loss": 5.2914, "step": 504000 }, { "epoch": 2.687570585352341, "grad_norm": 2.2361655235290527, "learning_rate": 0.00010498744310155392, "loss": 5.288, "step": 504500 }, { "epoch": 2.690234183553879, "grad_norm": 2.3718972206115723, "learning_rate": 0.00010409561792782638, "loss": 5.2933, "step": 505000 }, { "epoch": 2.6928977817554176, "grad_norm": 2.414783477783203, "learning_rate": 0.0001032055764044463, "loss": 5.2905, "step": 505500 }, { "epoch": 2.695561379956956, "grad_norm": 2.5909764766693115, "learning_rate": 0.00010231375123071875, "loss": 5.2889, "step": 506000 }, { "epoch": 2.698224978158495, "grad_norm": 2.2361748218536377, "learning_rate": 0.0001014219260569912, "loss": 5.2884, "step": 506500 
}, { "epoch": 2.7008885763600334, "grad_norm": 2.3554787635803223, "learning_rate": 0.00010053010088326366, "loss": 5.283, "step": 507000 }, { "epoch": 2.7035521745615716, "grad_norm": 2.4235968589782715, "learning_rate": 9.96382757095361e-05, "loss": 5.2991, "step": 507500 }, { "epoch": 2.70621577276311, "grad_norm": 2.334272861480713, "learning_rate": 9.874645053580856e-05, "loss": 5.2921, "step": 508000 }, { "epoch": 2.708879370964649, "grad_norm": 2.443535566329956, "learning_rate": 9.785640901242848e-05, "loss": 5.2934, "step": 508500 }, { "epoch": 2.711542969166187, "grad_norm": 2.4466655254364014, "learning_rate": 9.696458383870094e-05, "loss": 5.2915, "step": 509000 }, { "epoch": 2.7142065673677256, "grad_norm": 2.1013219356536865, "learning_rate": 9.60727586649734e-05, "loss": 5.2942, "step": 509500 }, { "epoch": 2.716870165569264, "grad_norm": 2.486953020095825, "learning_rate": 9.518093349124584e-05, "loss": 5.2948, "step": 510000 }, { "epoch": 2.719533763770803, "grad_norm": 2.246967077255249, "learning_rate": 9.429089196786576e-05, "loss": 5.288, "step": 510500 }, { "epoch": 2.7221973619723414, "grad_norm": 2.308177947998047, "learning_rate": 9.339906679413822e-05, "loss": 5.2925, "step": 511000 }, { "epoch": 2.7248609601738796, "grad_norm": 2.3832600116729736, "learning_rate": 9.250724162041068e-05, "loss": 5.2925, "step": 511500 }, { "epoch": 2.727524558375418, "grad_norm": 2.2219245433807373, "learning_rate": 9.161541644668312e-05, "loss": 5.294, "step": 512000 }, { "epoch": 2.730188156576957, "grad_norm": 2.4265191555023193, "learning_rate": 9.072537492330303e-05, "loss": 5.2875, "step": 512500 }, { "epoch": 2.732851754778495, "grad_norm": 2.553427219390869, "learning_rate": 8.98335497495755e-05, "loss": 5.2984, "step": 513000 }, { "epoch": 2.7355153529800336, "grad_norm": 2.3475024700164795, "learning_rate": 8.894172457584796e-05, "loss": 5.2827, "step": 513500 }, { "epoch": 2.738178951181572, "grad_norm": 2.5305187702178955, "learning_rate": 
8.80498994021204e-05, "loss": 5.2937, "step": 514000 }, { "epoch": 2.740842549383111, "grad_norm": 2.4398436546325684, "learning_rate": 8.71598578787403e-05, "loss": 5.2948, "step": 514500 }, { "epoch": 2.7435061475846494, "grad_norm": 2.4077444076538086, "learning_rate": 8.626803270501276e-05, "loss": 5.2882, "step": 515000 }, { "epoch": 2.7461697457861876, "grad_norm": 2.346778392791748, "learning_rate": 8.537620753128524e-05, "loss": 5.2875, "step": 515500 }, { "epoch": 2.748833343987726, "grad_norm": 2.4900453090667725, "learning_rate": 8.448438235755768e-05, "loss": 5.2835, "step": 516000 }, { "epoch": 2.7514969421892648, "grad_norm": 2.4355154037475586, "learning_rate": 8.359255718383014e-05, "loss": 5.29, "step": 516500 }, { "epoch": 2.754160540390803, "grad_norm": 2.18061900138855, "learning_rate": 8.270251566045004e-05, "loss": 5.288, "step": 517000 }, { "epoch": 2.7568241385923415, "grad_norm": 2.3646693229675293, "learning_rate": 8.18106904867225e-05, "loss": 5.2789, "step": 517500 }, { "epoch": 2.75948773679388, "grad_norm": 2.369717836380005, "learning_rate": 8.091886531299498e-05, "loss": 5.2901, "step": 518000 }, { "epoch": 2.7621513349954188, "grad_norm": 2.4666647911071777, "learning_rate": 8.002704013926742e-05, "loss": 5.2858, "step": 518500 }, { "epoch": 2.7648149331969574, "grad_norm": 2.3375349044799805, "learning_rate": 7.913699861588732e-05, "loss": 5.2854, "step": 519000 }, { "epoch": 2.7674785313984955, "grad_norm": 2.2538347244262695, "learning_rate": 7.824517344215978e-05, "loss": 5.2899, "step": 519500 }, { "epoch": 2.770142129600034, "grad_norm": 2.5232772827148438, "learning_rate": 7.735334826843224e-05, "loss": 5.2948, "step": 520000 }, { "epoch": 2.7728057278015728, "grad_norm": 2.3963685035705566, "learning_rate": 7.646152309470469e-05, "loss": 5.2919, "step": 520500 }, { "epoch": 2.775469326003111, "grad_norm": 2.0667736530303955, "learning_rate": 7.55714815713246e-05, "loss": 5.2825, "step": 521000 }, { "epoch": 
2.7781329242046495, "grad_norm": 2.421602725982666, "learning_rate": 7.467965639759706e-05, "loss": 5.2949, "step": 521500 }, { "epoch": 2.780796522406188, "grad_norm": 2.3447656631469727, "learning_rate": 7.378783122386952e-05, "loss": 5.2871, "step": 522000 }, { "epoch": 2.7834601206077263, "grad_norm": 2.1411802768707275, "learning_rate": 7.289600605014197e-05, "loss": 5.2861, "step": 522500 }, { "epoch": 2.786123718809265, "grad_norm": 2.5163323879241943, "learning_rate": 7.200418087641443e-05, "loss": 5.286, "step": 523000 }, { "epoch": 2.7887873170108035, "grad_norm": 2.482067108154297, "learning_rate": 7.111413935303434e-05, "loss": 5.2863, "step": 523500 }, { "epoch": 2.791450915212342, "grad_norm": 2.3614418506622314, "learning_rate": 7.02223141793068e-05, "loss": 5.2799, "step": 524000 }, { "epoch": 2.7941145134138807, "grad_norm": 2.333521842956543, "learning_rate": 6.933048900557925e-05, "loss": 5.2873, "step": 524500 }, { "epoch": 2.796778111615419, "grad_norm": 2.2536137104034424, "learning_rate": 6.843866383185171e-05, "loss": 5.2909, "step": 525000 }, { "epoch": 2.7994417098169575, "grad_norm": 2.516286849975586, "learning_rate": 6.754862230847162e-05, "loss": 5.2944, "step": 525500 }, { "epoch": 2.802105308018496, "grad_norm": 2.361598253250122, "learning_rate": 6.665679713474408e-05, "loss": 5.2872, "step": 526000 }, { "epoch": 2.8047689062200343, "grad_norm": 2.387085199356079, "learning_rate": 6.576497196101654e-05, "loss": 5.291, "step": 526500 }, { "epoch": 2.807432504421573, "grad_norm": 2.2874443531036377, "learning_rate": 6.487314678728899e-05, "loss": 5.29, "step": 527000 }, { "epoch": 2.8100961026231115, "grad_norm": 2.4107890129089355, "learning_rate": 6.39831052639089e-05, "loss": 5.2781, "step": 527500 }, { "epoch": 2.81275970082465, "grad_norm": 2.3214197158813477, "learning_rate": 6.309128009018136e-05, "loss": 5.2851, "step": 528000 }, { "epoch": 2.8154232990261887, "grad_norm": 2.3806910514831543, "learning_rate": 
6.219945491645382e-05, "loss": 5.2824, "step": 528500 }, { "epoch": 2.818086897227727, "grad_norm": 2.4679012298583984, "learning_rate": 6.130762974272627e-05, "loss": 5.291, "step": 529000 }, { "epoch": 2.8207504954292655, "grad_norm": 2.30574631690979, "learning_rate": 6.041758821934619e-05, "loss": 5.2901, "step": 529500 }, { "epoch": 2.823414093630804, "grad_norm": 2.309056043624878, "learning_rate": 5.9525763045618644e-05, "loss": 5.2778, "step": 530000 }, { "epoch": 2.8260776918323423, "grad_norm": 2.378755569458008, "learning_rate": 5.8633937871891097e-05, "loss": 5.2815, "step": 530500 }, { "epoch": 2.828741290033881, "grad_norm": 2.6057322025299072, "learning_rate": 5.7742112698163556e-05, "loss": 5.2866, "step": 531000 }, { "epoch": 2.8314048882354195, "grad_norm": 2.3079919815063477, "learning_rate": 5.685028752443601e-05, "loss": 5.2791, "step": 531500 }, { "epoch": 2.834068486436958, "grad_norm": 2.2242472171783447, "learning_rate": 5.5960246001055924e-05, "loss": 5.2865, "step": 532000 }, { "epoch": 2.8367320846384967, "grad_norm": 2.3489010334014893, "learning_rate": 5.5068420827328383e-05, "loss": 5.2872, "step": 532500 }, { "epoch": 2.839395682840035, "grad_norm": 2.9294140338897705, "learning_rate": 5.4176595653600836e-05, "loss": 5.2796, "step": 533000 }, { "epoch": 2.8420592810415735, "grad_norm": 2.325824499130249, "learning_rate": 5.328477047987329e-05, "loss": 5.2878, "step": 533500 }, { "epoch": 2.844722879243112, "grad_norm": 2.3206863403320312, "learning_rate": 5.23947289564932e-05, "loss": 5.2827, "step": 534000 }, { "epoch": 2.8473864774446502, "grad_norm": 2.241338014602661, "learning_rate": 5.150290378276566e-05, "loss": 5.2862, "step": 534500 }, { "epoch": 2.850050075646189, "grad_norm": 2.3662049770355225, "learning_rate": 5.0611078609038116e-05, "loss": 5.2868, "step": 535000 }, { "epoch": 2.8527136738477274, "grad_norm": 2.0729544162750244, "learning_rate": 4.971925343531057e-05, "loss": 5.2851, "step": 535500 }, { "epoch": 
2.855377272049266, "grad_norm": 2.1059601306915283, "learning_rate": 4.8829211911930484e-05, "loss": 5.2809, "step": 536000 }, { "epoch": 2.8580408702508047, "grad_norm": 2.70766282081604, "learning_rate": 4.793738673820294e-05, "loss": 5.2896, "step": 536500 }, { "epoch": 2.860704468452343, "grad_norm": 2.526292562484741, "learning_rate": 4.704556156447539e-05, "loss": 5.2828, "step": 537000 }, { "epoch": 2.8633680666538814, "grad_norm": 2.246443510055542, "learning_rate": 4.6153736390747856e-05, "loss": 5.2847, "step": 537500 }, { "epoch": 2.86603166485542, "grad_norm": 2.5226643085479736, "learning_rate": 4.5263694867367764e-05, "loss": 5.2871, "step": 538000 }, { "epoch": 2.868695263056958, "grad_norm": 2.416816473007202, "learning_rate": 4.437186969364022e-05, "loss": 5.2825, "step": 538500 }, { "epoch": 2.871358861258497, "grad_norm": 2.5631511211395264, "learning_rate": 4.348004451991267e-05, "loss": 5.2815, "step": 539000 }, { "epoch": 2.8740224594600354, "grad_norm": 2.2883377075195312, "learning_rate": 4.258821934618513e-05, "loss": 5.2824, "step": 539500 }, { "epoch": 2.8766860576615736, "grad_norm": 2.4545071125030518, "learning_rate": 4.1698177822805044e-05, "loss": 5.278, "step": 540000 }, { "epoch": 2.879349655863112, "grad_norm": 2.2015092372894287, "learning_rate": 4.08063526490775e-05, "loss": 5.2806, "step": 540500 }, { "epoch": 2.882013254064651, "grad_norm": 2.7558255195617676, "learning_rate": 3.9914527475349956e-05, "loss": 5.2857, "step": 541000 }, { "epoch": 2.8846768522661894, "grad_norm": 2.376549005508423, "learning_rate": 3.902270230162241e-05, "loss": 5.2792, "step": 541500 }, { "epoch": 2.887340450467728, "grad_norm": 2.3727259635925293, "learning_rate": 3.813266077824232e-05, "loss": 5.2843, "step": 542000 }, { "epoch": 2.890004048669266, "grad_norm": 2.3833839893341064, "learning_rate": 3.724083560451478e-05, "loss": 5.2785, "step": 542500 }, { "epoch": 2.892667646870805, "grad_norm": 2.4702396392822266, "learning_rate": 
3.6349010430787236e-05, "loss": 5.2785, "step": 543000 }, { "epoch": 2.8953312450723434, "grad_norm": 2.54264497756958, "learning_rate": 3.545718525705969e-05, "loss": 5.2813, "step": 543500 }, { "epoch": 2.8979948432738816, "grad_norm": 2.356501579284668, "learning_rate": 3.456536008333214e-05, "loss": 5.2886, "step": 544000 }, { "epoch": 2.90065844147542, "grad_norm": 2.546325445175171, "learning_rate": 3.367531855995206e-05, "loss": 5.2778, "step": 544500 }, { "epoch": 2.903322039676959, "grad_norm": 2.3812687397003174, "learning_rate": 3.2783493386224516e-05, "loss": 5.284, "step": 545000 }, { "epoch": 2.9059856378784974, "grad_norm": 2.3538711071014404, "learning_rate": 3.189166821249697e-05, "loss": 5.2755, "step": 545500 }, { "epoch": 2.908649236080036, "grad_norm": 2.2477262020111084, "learning_rate": 3.099984303876943e-05, "loss": 5.2876, "step": 546000 }, { "epoch": 2.911312834281574, "grad_norm": 2.2652475833892822, "learning_rate": 3.0109801515389333e-05, "loss": 5.2777, "step": 546500 }, { "epoch": 2.9139764324831128, "grad_norm": 2.468841791152954, "learning_rate": 2.9217976341661793e-05, "loss": 5.2779, "step": 547000 }, { "epoch": 2.9166400306846514, "grad_norm": 2.151130437850952, "learning_rate": 2.832615116793425e-05, "loss": 5.2883, "step": 547500 }, { "epoch": 2.9193036288861895, "grad_norm": 2.464799404144287, "learning_rate": 2.74343259942067e-05, "loss": 5.2843, "step": 548000 }, { "epoch": 2.921967227087728, "grad_norm": 2.6122734546661377, "learning_rate": 2.6544284470826617e-05, "loss": 5.2854, "step": 548500 }, { "epoch": 2.9246308252892668, "grad_norm": 2.257554769515991, "learning_rate": 2.565245929709907e-05, "loss": 5.277, "step": 549000 }, { "epoch": 2.9272944234908054, "grad_norm": 2.2422280311584473, "learning_rate": 2.476063412337153e-05, "loss": 5.2804, "step": 549500 }, { "epoch": 2.929958021692344, "grad_norm": 2.4912326335906982, "learning_rate": 2.3868808949643985e-05, "loss": 5.2758, "step": 550000 }, { "epoch": 
2.932621619893882, "grad_norm": 2.305392265319824, "learning_rate": 2.2978767426263897e-05, "loss": 5.2831, "step": 550500 }, { "epoch": 2.9352852180954208, "grad_norm": 2.699528217315674, "learning_rate": 2.2086942252536353e-05, "loss": 5.2841, "step": 551000 }, { "epoch": 2.9379488162969594, "grad_norm": 2.3196749687194824, "learning_rate": 2.1195117078808806e-05, "loss": 5.2792, "step": 551500 }, { "epoch": 2.9406124144984975, "grad_norm": 2.134294033050537, "learning_rate": 2.0303291905081265e-05, "loss": 5.2845, "step": 552000 }, { "epoch": 2.943276012700036, "grad_norm": 2.25675892829895, "learning_rate": 1.941146673135372e-05, "loss": 5.2778, "step": 552500 }, { "epoch": 2.9459396109015747, "grad_norm": 2.141127824783325, "learning_rate": 1.852142520797363e-05, "loss": 5.2738, "step": 553000 }, { "epoch": 2.9486032091031134, "grad_norm": 2.3503618240356445, "learning_rate": 1.762960003424609e-05, "loss": 5.277, "step": 553500 }, { "epoch": 2.951266807304652, "grad_norm": 2.2987284660339355, "learning_rate": 1.673777486051854e-05, "loss": 5.2864, "step": 554000 }, { "epoch": 2.95393040550619, "grad_norm": 2.384070873260498, "learning_rate": 1.5845949686791e-05, "loss": 5.2798, "step": 554500 }, { "epoch": 2.9565940037077287, "grad_norm": 2.272744655609131, "learning_rate": 1.4954124513063455e-05, "loss": 5.2806, "step": 555000 }, { "epoch": 2.9592576019092673, "grad_norm": 2.2945611476898193, "learning_rate": 1.4064082989683367e-05, "loss": 5.2852, "step": 555500 }, { "epoch": 2.9619212001108055, "grad_norm": 2.5340495109558105, "learning_rate": 1.3172257815955822e-05, "loss": 5.2764, "step": 556000 }, { "epoch": 2.964584798312344, "grad_norm": 2.3637685775756836, "learning_rate": 1.228043264222828e-05, "loss": 5.28, "step": 556500 }, { "epoch": 2.9672483965138827, "grad_norm": 2.401252031326294, "learning_rate": 1.1388607468500735e-05, "loss": 5.2809, "step": 557000 }, { "epoch": 2.9699119947154213, "grad_norm": 2.256577253341675, "learning_rate": 
1.0498565945120647e-05, "loss": 5.2798, "step": 557500 }, { "epoch": 2.9725755929169595, "grad_norm": 2.1444365978240967, "learning_rate": 9.606740771393103e-06, "loss": 5.2761, "step": 558000 }, { "epoch": 2.975239191118498, "grad_norm": 2.325979471206665, "learning_rate": 8.714915597665558e-06, "loss": 5.2804, "step": 558500 }, { "epoch": 2.9779027893200367, "grad_norm": 2.1250107288360596, "learning_rate": 7.823090423938014e-06, "loss": 5.2767, "step": 559000 }, { "epoch": 2.9805663875215753, "grad_norm": 2.4525716304779053, "learning_rate": 6.933048900557926e-06, "loss": 5.2805, "step": 559500 }, { "epoch": 2.9832299857231135, "grad_norm": 2.176084041595459, "learning_rate": 6.0412237268303826e-06, "loss": 5.278, "step": 560000 }, { "epoch": 2.985893583924652, "grad_norm": 2.607921600341797, "learning_rate": 5.149398553102839e-06, "loss": 5.2778, "step": 560500 }, { "epoch": 2.9885571821261907, "grad_norm": 2.287775993347168, "learning_rate": 4.257573379375294e-06, "loss": 5.2721, "step": 561000 }, { "epoch": 2.991220780327729, "grad_norm": 2.258080005645752, "learning_rate": 3.3657482056477507e-06, "loss": 5.2754, "step": 561500 }, { "epoch": 2.9938843785292675, "grad_norm": 2.214787244796753, "learning_rate": 2.475706682267662e-06, "loss": 5.2853, "step": 562000 }, { "epoch": 2.996547976730806, "grad_norm": 2.4470176696777344, "learning_rate": 1.583881508540118e-06, "loss": 5.2732, "step": 562500 }, { "epoch": 2.9992115749323447, "grad_norm": 2.2027597427368164, "learning_rate": 6.920563348125741e-07, "loss": 5.2723, "step": 563000 }, { "epoch": 3.0, "step": 563148, "total_flos": 1.7947279651188326e+17, "train_loss": 5.458770358526144, "train_runtime": 36904.8634, "train_samples_per_second": 976.604, "train_steps_per_second": 15.259 } ], "logging_steps": 500, "max_steps": 563148, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, 
"should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7947279651188326e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }