{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.700154030463803, "eval_steps": 877, "global_step": 15777, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017114495978093444, "grad_norm": NaN, "learning_rate": 0.0, "loss": 9.5502, "step": 1 }, { "epoch": 0.0003422899195618689, "grad_norm": NaN, "learning_rate": 0.0, "loss": 17.5546, "step": 2 }, { "epoch": 0.0005134348793428033, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 25.9988, "step": 3 }, { "epoch": 0.0006845798391237378, "grad_norm": 60.18782043457031, "learning_rate": 5.704506560182544e-09, "loss": 9.6042, "step": 4 }, { "epoch": 0.0008557247989046722, "grad_norm": Infinity, "learning_rate": 5.704506560182544e-09, "loss": 17.9169, "step": 5 }, { "epoch": 0.0010268697586856067, "grad_norm": 84.36212158203125, "learning_rate": 1.1409013120365088e-08, "loss": 10.1633, "step": 6 }, { "epoch": 0.001198014718466541, "grad_norm": 82.20382690429688, "learning_rate": 1.711351968054763e-08, "loss": 8.4392, "step": 7 }, { "epoch": 0.0013691596782474755, "grad_norm": 16.687606811523438, "learning_rate": 2.2818026240730176e-08, "loss": 6.4113, "step": 8 }, { "epoch": 0.00154030463802841, "grad_norm": NaN, "learning_rate": 2.2818026240730176e-08, "loss": 17.185, "step": 9 }, { "epoch": 0.0017114495978093444, "grad_norm": 48.527183532714844, "learning_rate": 2.852253280091272e-08, "loss": 7.868, "step": 10 }, { "epoch": 0.0018825945575902789, "grad_norm": 76.19969177246094, "learning_rate": 3.422703936109526e-08, "loss": 25.5097, "step": 11 }, { "epoch": 0.0020537395173712133, "grad_norm": 44.624080657958984, "learning_rate": 3.9931545921277814e-08, "loss": 8.533, "step": 12 }, { "epoch": 0.002224884477152148, "grad_norm": 68.30242156982422, "learning_rate": 4.563605248146035e-08, "loss": 12.1618, "step": 13 }, { "epoch": 0.002396029436933082, "grad_norm": 77.02323913574219, "learning_rate": 5.1340559041642904e-08, 
"loss": 10.1531, "step": 14 }, { "epoch": 0.002567174396714017, "grad_norm": 37.27943420410156, "learning_rate": 5.704506560182544e-08, "loss": 8.2236, "step": 15 }, { "epoch": 0.002738319356494951, "grad_norm": 47.14276123046875, "learning_rate": 6.274957216200798e-08, "loss": 6.6764, "step": 16 }, { "epoch": 0.0029094643162758858, "grad_norm": 68.288818359375, "learning_rate": 6.845407872219053e-08, "loss": 9.6404, "step": 17 }, { "epoch": 0.00308060927605682, "grad_norm": 72.5563735961914, "learning_rate": 7.415858528237308e-08, "loss": 7.2553, "step": 18 }, { "epoch": 0.0032517542358377546, "grad_norm": 133.2047576904297, "learning_rate": 7.986309184255563e-08, "loss": 17.6761, "step": 19 }, { "epoch": 0.003422899195618689, "grad_norm": 96.22279357910156, "learning_rate": 8.556759840273816e-08, "loss": 23.6993, "step": 20 }, { "epoch": 0.0035940441553996235, "grad_norm": 72.40164947509766, "learning_rate": 9.12721049629207e-08, "loss": 12.5069, "step": 21 }, { "epoch": 0.0037651891151805577, "grad_norm": 58.689125061035156, "learning_rate": 9.697661152310325e-08, "loss": 10.4915, "step": 22 }, { "epoch": 0.003936334074961492, "grad_norm": 41.391170501708984, "learning_rate": 1.0268111808328581e-07, "loss": 8.2323, "step": 23 }, { "epoch": 0.004107479034742427, "grad_norm": 60.9368896484375, "learning_rate": 1.0838562464346835e-07, "loss": 9.4007, "step": 24 }, { "epoch": 0.004278623994523361, "grad_norm": 132.59597778320312, "learning_rate": 1.1409013120365088e-07, "loss": 16.9119, "step": 25 }, { "epoch": 0.004449768954304296, "grad_norm": 72.2205581665039, "learning_rate": 1.1979463776383346e-07, "loss": 12.5137, "step": 26 }, { "epoch": 0.00462091391408523, "grad_norm": 69.2486572265625, "learning_rate": 1.2549914432401596e-07, "loss": 10.2477, "step": 27 }, { "epoch": 0.004792058873866164, "grad_norm": 30.01091194152832, "learning_rate": 1.3120365088419852e-07, "loss": 6.6456, "step": 28 }, { "epoch": 0.0049632038336470995, "grad_norm": 75.28530883789062, 
"learning_rate": 1.3690815744438105e-07, "loss": 9.7946, "step": 29 }, { "epoch": 0.005134348793428034, "grad_norm": 70.37921142578125, "learning_rate": 1.426126640045636e-07, "loss": 12.4969, "step": 30 }, { "epoch": 0.005305493753208968, "grad_norm": 90.83671569824219, "learning_rate": 1.4831717056474617e-07, "loss": 7.6589, "step": 31 }, { "epoch": 0.005476638712989902, "grad_norm": 65.92588806152344, "learning_rate": 1.540216771249287e-07, "loss": 9.7764, "step": 32 }, { "epoch": 0.005647783672770837, "grad_norm": 86.14967346191406, "learning_rate": 1.5972618368511126e-07, "loss": 8.2129, "step": 33 }, { "epoch": 0.0058189286325517715, "grad_norm": 145.2432098388672, "learning_rate": 1.654306902452938e-07, "loss": 17.7377, "step": 34 }, { "epoch": 0.005990073592332706, "grad_norm": 64.69364929199219, "learning_rate": 1.7113519680547632e-07, "loss": 9.9994, "step": 35 }, { "epoch": 0.00616121855211364, "grad_norm": 58.662803649902344, "learning_rate": 1.7683970336565888e-07, "loss": 11.459, "step": 36 }, { "epoch": 0.006332363511894575, "grad_norm": 123.47699737548828, "learning_rate": 1.825442099258414e-07, "loss": 16.278, "step": 37 }, { "epoch": 0.006503508471675509, "grad_norm": 40.24553680419922, "learning_rate": 1.8824871648602397e-07, "loss": 8.1959, "step": 38 }, { "epoch": 0.0066746534314564435, "grad_norm": 71.55098724365234, "learning_rate": 1.939532230462065e-07, "loss": 12.4429, "step": 39 }, { "epoch": 0.006845798391237378, "grad_norm": 73.94329833984375, "learning_rate": 1.9965772960638906e-07, "loss": 12.4672, "step": 40 }, { "epoch": 0.007016943351018313, "grad_norm": 78.10585021972656, "learning_rate": 2.0536223616657162e-07, "loss": 12.6372, "step": 41 }, { "epoch": 0.007188088310799247, "grad_norm": 154.09982299804688, "learning_rate": 2.1106674272675415e-07, "loss": 18.2102, "step": 42 }, { "epoch": 0.007359233270580181, "grad_norm": 69.93523406982422, "learning_rate": 2.167712492869367e-07, "loss": 10.2355, "step": 43 }, { "epoch": 
0.0075303782303611155, "grad_norm": 53.971500396728516, "learning_rate": 2.224757558471192e-07, "loss": 11.0821, "step": 44 }, { "epoch": 0.007701523190142051, "grad_norm": 79.0840835571289, "learning_rate": 2.2818026240730177e-07, "loss": 12.838, "step": 45 }, { "epoch": 0.007872668149922985, "grad_norm": 68.8064956665039, "learning_rate": 2.3388476896748433e-07, "loss": 11.9172, "step": 46 }, { "epoch": 0.00804381310970392, "grad_norm": 114.59835815429688, "learning_rate": 2.395892755276669e-07, "loss": 16.4603, "step": 47 }, { "epoch": 0.008214958069484853, "grad_norm": 76.0896987915039, "learning_rate": 2.452937820878494e-07, "loss": 12.7007, "step": 48 }, { "epoch": 0.008386103029265788, "grad_norm": 45.47982406616211, "learning_rate": 2.509982886480319e-07, "loss": 10.2213, "step": 49 }, { "epoch": 0.008557247989046722, "grad_norm": 136.47836303710938, "learning_rate": 2.567027952082145e-07, "loss": 17.2777, "step": 50 }, { "epoch": 0.008728392948827657, "grad_norm": 57.07033920288086, "learning_rate": 2.6240730176839704e-07, "loss": 10.8704, "step": 51 }, { "epoch": 0.008899537908608592, "grad_norm": 50.97236633300781, "learning_rate": 2.681118083285796e-07, "loss": 10.8266, "step": 52 }, { "epoch": 0.009070682868389525, "grad_norm": 166.14840698242188, "learning_rate": 2.738163148887621e-07, "loss": 17.2345, "step": 53 }, { "epoch": 0.00924182782817046, "grad_norm": 154.5965576171875, "learning_rate": 2.795208214489447e-07, "loss": 18.5922, "step": 54 }, { "epoch": 0.009412972787951395, "grad_norm": 61.19700622558594, "learning_rate": 2.852253280091272e-07, "loss": 9.3514, "step": 55 }, { "epoch": 0.009584117747732329, "grad_norm": 64.54351806640625, "learning_rate": 2.909298345693098e-07, "loss": 7.0867, "step": 56 }, { "epoch": 0.009755262707513264, "grad_norm": 41.97494888305664, "learning_rate": 2.9663434112949233e-07, "loss": 8.319, "step": 57 }, { "epoch": 0.009926407667294199, "grad_norm": 158.86936950683594, "learning_rate": 3.023388476896748e-07, 
"loss": 18.7347, "step": 58 }, { "epoch": 0.010097552627075132, "grad_norm": 93.42990112304688, "learning_rate": 3.080433542498574e-07, "loss": 24.4121, "step": 59 }, { "epoch": 0.010268697586856067, "grad_norm": 40.078529357910156, "learning_rate": 3.1374786081003993e-07, "loss": 8.4254, "step": 60 }, { "epoch": 0.010439842546637, "grad_norm": 172.25357055664062, "learning_rate": 3.194523673702225e-07, "loss": 17.8112, "step": 61 }, { "epoch": 0.010610987506417936, "grad_norm": 64.05378723144531, "learning_rate": 3.2515687393040504e-07, "loss": 11.648, "step": 62 }, { "epoch": 0.010782132466198871, "grad_norm": 31.915542602539062, "learning_rate": 3.308613804905876e-07, "loss": 6.576, "step": 63 }, { "epoch": 0.010953277425979804, "grad_norm": 97.90230560302734, "learning_rate": 3.365658870507701e-07, "loss": 24.4094, "step": 64 }, { "epoch": 0.01112442238576074, "grad_norm": 49.965126037597656, "learning_rate": 3.4227039361095264e-07, "loss": 10.7296, "step": 65 }, { "epoch": 0.011295567345541675, "grad_norm": 103.74539947509766, "learning_rate": 3.479749001711352e-07, "loss": 24.3446, "step": 66 }, { "epoch": 0.011466712305322608, "grad_norm": 100.20292663574219, "learning_rate": 3.5367940673131776e-07, "loss": 24.4799, "step": 67 }, { "epoch": 0.011637857265103543, "grad_norm": 87.23709869384766, "learning_rate": 3.593839132915003e-07, "loss": 7.8589, "step": 68 }, { "epoch": 0.011809002224884476, "grad_norm": 105.97203063964844, "learning_rate": 3.650884198516828e-07, "loss": 24.5711, "step": 69 }, { "epoch": 0.011980147184665411, "grad_norm": 139.98709106445312, "learning_rate": 3.707929264118654e-07, "loss": 18.0228, "step": 70 }, { "epoch": 0.012151292144446347, "grad_norm": 32.724159240722656, "learning_rate": 3.7649743297204793e-07, "loss": 8.2503, "step": 71 }, { "epoch": 0.01232243710422728, "grad_norm": 55.843509674072266, "learning_rate": 3.822019395322305e-07, "loss": 7.1578, "step": 72 }, { "epoch": 0.012493582064008215, "grad_norm": 
183.3024444580078, "learning_rate": 3.87906446092413e-07, "loss": 18.0044, "step": 73 }, { "epoch": 0.01266472702378915, "grad_norm": 118.88136291503906, "learning_rate": 3.9361095265259553e-07, "loss": 15.586, "step": 74 }, { "epoch": 0.012835871983570083, "grad_norm": 64.66754150390625, "learning_rate": 3.993154592127781e-07, "loss": 12.3289, "step": 75 }, { "epoch": 0.013007016943351019, "grad_norm": 62.58869552612305, "learning_rate": 4.0501996577296065e-07, "loss": 8.9949, "step": 76 }, { "epoch": 0.013178161903131952, "grad_norm": 66.82809448242188, "learning_rate": 4.1072447233314323e-07, "loss": 12.1411, "step": 77 }, { "epoch": 0.013349306862912887, "grad_norm": 144.4269256591797, "learning_rate": 4.164289788933257e-07, "loss": 18.5364, "step": 78 }, { "epoch": 0.013520451822693822, "grad_norm": 60.626834869384766, "learning_rate": 4.221334854535083e-07, "loss": 11.9262, "step": 79 }, { "epoch": 0.013691596782474755, "grad_norm": 139.23158264160156, "learning_rate": 4.278379920136908e-07, "loss": 17.5537, "step": 80 }, { "epoch": 0.01386274174225569, "grad_norm": 74.82394409179688, "learning_rate": 4.335424985738734e-07, "loss": 12.6047, "step": 81 }, { "epoch": 0.014033886702036626, "grad_norm": 128.57716369628906, "learning_rate": 4.3924700513405594e-07, "loss": 17.2671, "step": 82 }, { "epoch": 0.014205031661817559, "grad_norm": 50.15757369995117, "learning_rate": 4.449515116942384e-07, "loss": 9.8308, "step": 83 }, { "epoch": 0.014376176621598494, "grad_norm": 46.99615478515625, "learning_rate": 4.50656018254421e-07, "loss": 8.6575, "step": 84 }, { "epoch": 0.01454732158137943, "grad_norm": 61.31080627441406, "learning_rate": 4.5636052481460354e-07, "loss": 6.8446, "step": 85 }, { "epoch": 0.014718466541160363, "grad_norm": 60.068443298339844, "learning_rate": 4.620650313747861e-07, "loss": 11.2968, "step": 86 }, { "epoch": 0.014889611500941298, "grad_norm": 67.9156265258789, "learning_rate": 4.6776953793496865e-07, "loss": 12.672, "step": 87 }, { 
"epoch": 0.015060756460722231, "grad_norm": 49.77825164794922, "learning_rate": 4.734740444951512e-07, "loss": 9.5329, "step": 88 }, { "epoch": 0.015231901420503166, "grad_norm": 57.99795913696289, "learning_rate": 4.791785510553338e-07, "loss": 11.3883, "step": 89 }, { "epoch": 0.015403046380284101, "grad_norm": 54.070491790771484, "learning_rate": 4.848830576155162e-07, "loss": 11.282, "step": 90 }, { "epoch": 0.015574191340065035, "grad_norm": 46.44948959350586, "learning_rate": 4.905875641756988e-07, "loss": 10.4933, "step": 91 }, { "epoch": 0.01574533629984597, "grad_norm": 91.90750885009766, "learning_rate": 4.962920707358814e-07, "loss": 14.1568, "step": 92 }, { "epoch": 0.015916481259626903, "grad_norm": 45.950286865234375, "learning_rate": 5.019965772960638e-07, "loss": 8.6589, "step": 93 }, { "epoch": 0.01608762621940784, "grad_norm": 172.71951293945312, "learning_rate": 5.077010838562465e-07, "loss": 16.7382, "step": 94 }, { "epoch": 0.016258771179188773, "grad_norm": 54.993247985839844, "learning_rate": 5.13405590416429e-07, "loss": 11.4407, "step": 95 }, { "epoch": 0.016429916138969707, "grad_norm": 110.25296020507812, "learning_rate": 5.191100969766115e-07, "loss": 23.8689, "step": 96 }, { "epoch": 0.016601061098750643, "grad_norm": 43.9764289855957, "learning_rate": 5.248146035367941e-07, "loss": 7.3984, "step": 97 }, { "epoch": 0.016772206058531577, "grad_norm": 72.5784912109375, "learning_rate": 5.305191100969766e-07, "loss": 7.2256, "step": 98 }, { "epoch": 0.01694335101831251, "grad_norm": 35.58343505859375, "learning_rate": 5.362236166571592e-07, "loss": 7.8513, "step": 99 }, { "epoch": 0.017114495978093443, "grad_norm": 69.7222900390625, "learning_rate": 5.419281232173417e-07, "loss": 11.8726, "step": 100 }, { "epoch": 0.01728564093787438, "grad_norm": 54.240943908691406, "learning_rate": 5.476326297775242e-07, "loss": 11.6433, "step": 101 }, { "epoch": 0.017456785897655314, "grad_norm": 40.061763763427734, "learning_rate": 
5.533371363377068e-07, "loss": 6.452, "step": 102 }, { "epoch": 0.017627930857436247, "grad_norm": 43.3102912902832, "learning_rate": 5.590416428978894e-07, "loss": 10.9229, "step": 103 }, { "epoch": 0.017799075817217184, "grad_norm": 48.96671676635742, "learning_rate": 5.647461494580719e-07, "loss": 10.9523, "step": 104 }, { "epoch": 0.017970220776998117, "grad_norm": 107.66687774658203, "learning_rate": 5.704506560182544e-07, "loss": 15.756, "step": 105 }, { "epoch": 0.01814136573677905, "grad_norm": 50.87533950805664, "learning_rate": 5.76155162578437e-07, "loss": 9.8941, "step": 106 }, { "epoch": 0.018312510696559987, "grad_norm": 142.70115661621094, "learning_rate": 5.818596691386196e-07, "loss": 16.205, "step": 107 }, { "epoch": 0.01848365565634092, "grad_norm": 62.69704818725586, "learning_rate": 5.87564175698802e-07, "loss": 9.7933, "step": 108 }, { "epoch": 0.018654800616121854, "grad_norm": 52.710227966308594, "learning_rate": 5.932686822589847e-07, "loss": 10.7189, "step": 109 }, { "epoch": 0.01882594557590279, "grad_norm": 131.87474060058594, "learning_rate": 5.989731888191672e-07, "loss": 24.6335, "step": 110 }, { "epoch": 0.018997090535683724, "grad_norm": 105.79902648925781, "learning_rate": 6.046776953793496e-07, "loss": 16.133, "step": 111 }, { "epoch": 0.019168235495464658, "grad_norm": 56.011474609375, "learning_rate": 6.103822019395323e-07, "loss": 11.9402, "step": 112 }, { "epoch": 0.019339380455245594, "grad_norm": 97.4761962890625, "learning_rate": 6.160867084997148e-07, "loss": 14.1356, "step": 113 }, { "epoch": 0.019510525415026528, "grad_norm": 52.7200813293457, "learning_rate": 6.217912150598974e-07, "loss": 11.4511, "step": 114 }, { "epoch": 0.01968167037480746, "grad_norm": 42.6909065246582, "learning_rate": 6.274957216200799e-07, "loss": 8.5194, "step": 115 }, { "epoch": 0.019852815334588398, "grad_norm": 44.77908706665039, "learning_rate": 6.332002281802624e-07, "loss": 11.437, "step": 116 }, { "epoch": 0.02002396029436933, 
"grad_norm": 136.7108612060547, "learning_rate": 6.38904734740445e-07, "loss": 23.712, "step": 117 }, { "epoch": 0.020195105254150265, "grad_norm": 44.484893798828125, "learning_rate": 6.446092413006275e-07, "loss": 10.4753, "step": 118 }, { "epoch": 0.020366250213931198, "grad_norm": 57.66374206542969, "learning_rate": 6.503137478608101e-07, "loss": 11.2803, "step": 119 }, { "epoch": 0.020537395173712135, "grad_norm": 110.59872436523438, "learning_rate": 6.560182544209926e-07, "loss": 15.7956, "step": 120 }, { "epoch": 0.02070854013349307, "grad_norm": 33.806732177734375, "learning_rate": 6.617227609811752e-07, "loss": 10.327, "step": 121 }, { "epoch": 0.020879685093274, "grad_norm": 52.41442108154297, "learning_rate": 6.674272675413577e-07, "loss": 10.5716, "step": 122 }, { "epoch": 0.02105083005305494, "grad_norm": 40.95213317871094, "learning_rate": 6.731317741015402e-07, "loss": 11.6353, "step": 123 }, { "epoch": 0.021221975012835872, "grad_norm": 43.268775939941406, "learning_rate": 6.788362806617229e-07, "loss": 11.2876, "step": 124 }, { "epoch": 0.021393119972616805, "grad_norm": 102.84829711914062, "learning_rate": 6.845407872219053e-07, "loss": 15.5444, "step": 125 }, { "epoch": 0.021564264932397742, "grad_norm": 56.55605697631836, "learning_rate": 6.902452937820878e-07, "loss": 10.2011, "step": 126 }, { "epoch": 0.021735409892178675, "grad_norm": 37.294795989990234, "learning_rate": 6.959498003422704e-07, "loss": 8.1014, "step": 127 }, { "epoch": 0.02190655485195961, "grad_norm": 55.67061233520508, "learning_rate": 7.01654306902453e-07, "loss": 11.638, "step": 128 }, { "epoch": 0.022077699811740546, "grad_norm": 67.4786605834961, "learning_rate": 7.073588134626355e-07, "loss": 6.8779, "step": 129 }, { "epoch": 0.02224884477152148, "grad_norm": 30.9260196685791, "learning_rate": 7.13063320022818e-07, "loss": 9.4357, "step": 130 }, { "epoch": 0.022419989731302412, "grad_norm": 100.22219848632812, "learning_rate": 7.187678265830006e-07, "loss": 14.6476, 
"step": 131 }, { "epoch": 0.02259113469108335, "grad_norm": 29.24936294555664, "learning_rate": 7.244723331431832e-07, "loss": 6.5599, "step": 132 }, { "epoch": 0.022762279650864282, "grad_norm": 31.59239959716797, "learning_rate": 7.301768397033656e-07, "loss": 6.6734, "step": 133 }, { "epoch": 0.022933424610645216, "grad_norm": 42.93860626220703, "learning_rate": 7.358813462635483e-07, "loss": 9.125, "step": 134 }, { "epoch": 0.023104569570426153, "grad_norm": 46.8751335144043, "learning_rate": 7.415858528237308e-07, "loss": 10.3878, "step": 135 }, { "epoch": 0.023275714530207086, "grad_norm": 106.5069351196289, "learning_rate": 7.472903593839132e-07, "loss": 14.3693, "step": 136 }, { "epoch": 0.02344685948998802, "grad_norm": 126.05512237548828, "learning_rate": 7.529948659440959e-07, "loss": 22.8488, "step": 137 }, { "epoch": 0.023618004449768953, "grad_norm": 89.99185180664062, "learning_rate": 7.586993725042784e-07, "loss": 13.3307, "step": 138 }, { "epoch": 0.02378914940954989, "grad_norm": 35.95622253417969, "learning_rate": 7.64403879064461e-07, "loss": 9.7673, "step": 139 }, { "epoch": 0.023960294369330823, "grad_norm": 31.504724502563477, "learning_rate": 7.701083856246435e-07, "loss": 9.0915, "step": 140 }, { "epoch": 0.024131439329111756, "grad_norm": 74.6131591796875, "learning_rate": 7.75812892184826e-07, "loss": 12.7378, "step": 141 }, { "epoch": 0.024302584288892693, "grad_norm": 40.63880920410156, "learning_rate": 7.815173987450086e-07, "loss": 8.9323, "step": 142 }, { "epoch": 0.024473729248673626, "grad_norm": 101.69804382324219, "learning_rate": 7.872219053051911e-07, "loss": 14.547, "step": 143 }, { "epoch": 0.02464487420845456, "grad_norm": 131.046142578125, "learning_rate": 7.929264118653737e-07, "loss": 23.0012, "step": 144 }, { "epoch": 0.024816019168235497, "grad_norm": 74.94658660888672, "learning_rate": 7.986309184255562e-07, "loss": 12.8286, "step": 145 }, { "epoch": 0.02498716412801643, "grad_norm": 50.227718353271484, 
"learning_rate": 8.043354249857388e-07, "loss": 10.9684, "step": 146 }, { "epoch": 0.025158309087797363, "grad_norm": 162.12603759765625, "learning_rate": 8.100399315459213e-07, "loss": 23.0181, "step": 147 }, { "epoch": 0.0253294540475783, "grad_norm": 34.11660385131836, "learning_rate": 8.157444381061038e-07, "loss": 9.2808, "step": 148 }, { "epoch": 0.025500599007359234, "grad_norm": 34.07155990600586, "learning_rate": 8.214489446662865e-07, "loss": 11.3059, "step": 149 }, { "epoch": 0.025671743967140167, "grad_norm": 42.31085968017578, "learning_rate": 8.271534512264689e-07, "loss": 8.7746, "step": 150 }, { "epoch": 0.025842888926921104, "grad_norm": 132.6522216796875, "learning_rate": 8.328579577866514e-07, "loss": 16.6408, "step": 151 }, { "epoch": 0.026014033886702037, "grad_norm": 21.736328125, "learning_rate": 8.385624643468341e-07, "loss": 9.6907, "step": 152 }, { "epoch": 0.02618517884648297, "grad_norm": 64.54568481445312, "learning_rate": 8.442669709070166e-07, "loss": 12.8573, "step": 153 }, { "epoch": 0.026356323806263904, "grad_norm": 32.484703063964844, "learning_rate": 8.499714774671991e-07, "loss": 7.4299, "step": 154 }, { "epoch": 0.02652746876604484, "grad_norm": 86.55378723144531, "learning_rate": 8.556759840273817e-07, "loss": 13.347, "step": 155 }, { "epoch": 0.026698613725825774, "grad_norm": 32.97962188720703, "learning_rate": 8.613804905875642e-07, "loss": 9.2004, "step": 156 }, { "epoch": 0.026869758685606707, "grad_norm": 66.87654113769531, "learning_rate": 8.670849971477468e-07, "loss": 6.6107, "step": 157 }, { "epoch": 0.027040903645387644, "grad_norm": 56.53002166748047, "learning_rate": 8.727895037079292e-07, "loss": 6.0957, "step": 158 }, { "epoch": 0.027212048605168578, "grad_norm": 37.223453521728516, "learning_rate": 8.784940102681119e-07, "loss": 8.9968, "step": 159 }, { "epoch": 0.02738319356494951, "grad_norm": 30.637619018554688, "learning_rate": 8.841985168282944e-07, "loss": 8.9597, "step": 160 }, { "epoch": 
0.027554338524730448, "grad_norm": 22.8154354095459, "learning_rate": 8.899030233884768e-07, "loss": 9.2794, "step": 161 }, { "epoch": 0.02772548348451138, "grad_norm": 64.24419403076172, "learning_rate": 8.956075299486595e-07, "loss": 12.9209, "step": 162 }, { "epoch": 0.027896628444292314, "grad_norm": 27.159826278686523, "learning_rate": 9.01312036508842e-07, "loss": 10.7092, "step": 163 }, { "epoch": 0.02806777340407325, "grad_norm": 29.741992950439453, "learning_rate": 9.070165430690246e-07, "loss": 10.1098, "step": 164 }, { "epoch": 0.028238918363854185, "grad_norm": 61.4916877746582, "learning_rate": 9.127210496292071e-07, "loss": 12.5023, "step": 165 }, { "epoch": 0.028410063323635118, "grad_norm": 21.36608123779297, "learning_rate": 9.184255561893896e-07, "loss": 7.2161, "step": 166 }, { "epoch": 0.028581208283416055, "grad_norm": 51.13070297241211, "learning_rate": 9.241300627495722e-07, "loss": 5.5324, "step": 167 }, { "epoch": 0.028752353243196988, "grad_norm": 27.232070922851562, "learning_rate": 9.298345693097547e-07, "loss": 9.3162, "step": 168 }, { "epoch": 0.02892349820297792, "grad_norm": 51.84492111206055, "learning_rate": 9.355390758699373e-07, "loss": 6.0306, "step": 169 }, { "epoch": 0.02909464316275886, "grad_norm": 24.21738052368164, "learning_rate": 9.412435824301197e-07, "loss": 6.6994, "step": 170 }, { "epoch": 0.02926578812253979, "grad_norm": 27.428897857666016, "learning_rate": 9.469480889903024e-07, "loss": 10.5412, "step": 171 }, { "epoch": 0.029436933082320725, "grad_norm": 123.71875762939453, "learning_rate": 9.526525955504849e-07, "loss": 15.9849, "step": 172 }, { "epoch": 0.02960807804210166, "grad_norm": 34.90501403808594, "learning_rate": 9.583571021106676e-07, "loss": 9.2574, "step": 173 }, { "epoch": 0.029779223001882595, "grad_norm": 26.623390197753906, "learning_rate": 9.6406160867085e-07, "loss": 8.7904, "step": 174 }, { "epoch": 0.02995036796166353, "grad_norm": 21.868566513061523, "learning_rate": 9.697661152310325e-07, 
"loss": 9.2638, "step": 175 }, { "epoch": 0.030121512921444462, "grad_norm": 28.389110565185547, "learning_rate": 9.754706217912152e-07, "loss": 5.9086, "step": 176 }, { "epoch": 0.0302926578812254, "grad_norm": 51.29762649536133, "learning_rate": 9.811751283513976e-07, "loss": 5.9646, "step": 177 }, { "epoch": 0.030463802841006332, "grad_norm": 28.91325569152832, "learning_rate": 9.8687963491158e-07, "loss": 6.0877, "step": 178 }, { "epoch": 0.030634947800787266, "grad_norm": 66.74105834960938, "learning_rate": 9.925841414717628e-07, "loss": 12.4348, "step": 179 }, { "epoch": 0.030806092760568202, "grad_norm": 19.138124465942383, "learning_rate": 9.982886480319452e-07, "loss": 9.5496, "step": 180 }, { "epoch": 0.030977237720349136, "grad_norm": 43.17308044433594, "learning_rate": 1.0039931545921277e-06, "loss": 5.5641, "step": 181 }, { "epoch": 0.03114838268013007, "grad_norm": 32.97599411010742, "learning_rate": 1.0096976611523104e-06, "loss": 9.0529, "step": 182 }, { "epoch": 0.031319527639911006, "grad_norm": 56.315521240234375, "learning_rate": 1.015402167712493e-06, "loss": 12.0747, "step": 183 }, { "epoch": 0.03149067259969194, "grad_norm": 76.77662658691406, "learning_rate": 1.0211066742726755e-06, "loss": 13.0892, "step": 184 }, { "epoch": 0.03166181755947287, "grad_norm": 25.544397354125977, "learning_rate": 1.026811180832858e-06, "loss": 7.7117, "step": 185 }, { "epoch": 0.031832962519253806, "grad_norm": 24.205764770507812, "learning_rate": 1.0325156873930406e-06, "loss": 6.6426, "step": 186 }, { "epoch": 0.03200410747903474, "grad_norm": 25.586280822753906, "learning_rate": 1.038220193953223e-06, "loss": 10.4785, "step": 187 }, { "epoch": 0.03217525243881568, "grad_norm": 68.83911895751953, "learning_rate": 1.0439247005134056e-06, "loss": 12.2132, "step": 188 }, { "epoch": 0.03234639739859661, "grad_norm": 24.825489044189453, "learning_rate": 1.0496292070735881e-06, "loss": 6.3336, "step": 189 }, { "epoch": 0.032517542358377546, "grad_norm": 
28.293699264526367, "learning_rate": 1.0553337136337707e-06, "loss": 8.5374, "step": 190 }, { "epoch": 0.03268868731815848, "grad_norm": 28.26664924621582, "learning_rate": 1.0610382201939532e-06, "loss": 9.7218, "step": 191 }, { "epoch": 0.03285983227793941, "grad_norm": 84.32862854003906, "learning_rate": 1.0667427267541357e-06, "loss": 12.782, "step": 192 }, { "epoch": 0.033030977237720346, "grad_norm": 26.818071365356445, "learning_rate": 1.0724472333143185e-06, "loss": 7.3125, "step": 193 }, { "epoch": 0.03320212219750129, "grad_norm": 16.650196075439453, "learning_rate": 1.0781517398745008e-06, "loss": 9.0232, "step": 194 }, { "epoch": 0.03337326715728222, "grad_norm": 22.659135818481445, "learning_rate": 1.0838562464346833e-06, "loss": 6.2787, "step": 195 }, { "epoch": 0.03354441211706315, "grad_norm": 24.644168853759766, "learning_rate": 1.089560752994866e-06, "loss": 6.0047, "step": 196 }, { "epoch": 0.03371555707684409, "grad_norm": 32.078712463378906, "learning_rate": 1.0952652595550484e-06, "loss": 7.5748, "step": 197 }, { "epoch": 0.03388670203662502, "grad_norm": 55.345855712890625, "learning_rate": 1.1009697661152311e-06, "loss": 11.8703, "step": 198 }, { "epoch": 0.034057846996405954, "grad_norm": 70.49486541748047, "learning_rate": 1.1066742726754137e-06, "loss": 11.7983, "step": 199 }, { "epoch": 0.03422899195618689, "grad_norm": 29.946758270263672, "learning_rate": 1.112378779235596e-06, "loss": 7.6286, "step": 200 }, { "epoch": 0.03440013691596783, "grad_norm": 278.3395080566406, "learning_rate": 1.1180832857957787e-06, "loss": 17.6192, "step": 201 }, { "epoch": 0.03457128187574876, "grad_norm": 310.682861328125, "learning_rate": 1.1237877923559613e-06, "loss": 17.6315, "step": 202 }, { "epoch": 0.034742426835529694, "grad_norm": 46.159568786621094, "learning_rate": 1.1294922989161438e-06, "loss": 11.6001, "step": 203 }, { "epoch": 0.03491357179531063, "grad_norm": 20.635892868041992, "learning_rate": 1.1351968054763263e-06, "loss": 9.4128, 
"step": 204 }, { "epoch": 0.03508471675509156, "grad_norm": 143.4097137451172, "learning_rate": 1.1409013120365089e-06, "loss": 16.3943, "step": 205 }, { "epoch": 0.035255861714872494, "grad_norm": 265.5577087402344, "learning_rate": 1.1466058185966914e-06, "loss": 18.6869, "step": 206 }, { "epoch": 0.035427006674653434, "grad_norm": 19.766063690185547, "learning_rate": 1.152310325156874e-06, "loss": 8.6515, "step": 207 }, { "epoch": 0.03559815163443437, "grad_norm": 43.8801383972168, "learning_rate": 1.1580148317170565e-06, "loss": 11.424, "step": 208 }, { "epoch": 0.0357692965942153, "grad_norm": 12.928386688232422, "learning_rate": 1.1637193382772392e-06, "loss": 5.5902, "step": 209 }, { "epoch": 0.035940441553996234, "grad_norm": 123.55076599121094, "learning_rate": 1.1694238448374215e-06, "loss": 15.6958, "step": 210 }, { "epoch": 0.03611158651377717, "grad_norm": 44.79010772705078, "learning_rate": 1.175128351397604e-06, "loss": 11.1894, "step": 211 }, { "epoch": 0.0362827314735581, "grad_norm": 26.461137771606445, "learning_rate": 1.1808328579577868e-06, "loss": 7.3237, "step": 212 }, { "epoch": 0.03645387643333904, "grad_norm": 24.63947296142578, "learning_rate": 1.1865373645179693e-06, "loss": 5.7252, "step": 213 }, { "epoch": 0.036625021393119975, "grad_norm": 17.151113510131836, "learning_rate": 1.1922418710781517e-06, "loss": 9.0419, "step": 214 }, { "epoch": 0.03679616635290091, "grad_norm": 26.69593620300293, "learning_rate": 1.1979463776383344e-06, "loss": 9.4836, "step": 215 }, { "epoch": 0.03696731131268184, "grad_norm": 50.901573181152344, "learning_rate": 1.203650884198517e-06, "loss": 11.2858, "step": 216 }, { "epoch": 0.037138456272462775, "grad_norm": 48.110328674316406, "learning_rate": 1.2093553907586992e-06, "loss": 11.5594, "step": 217 }, { "epoch": 0.03730960123224371, "grad_norm": 51.77389907836914, "learning_rate": 1.215059897318882e-06, "loss": 11.6974, "step": 218 }, { "epoch": 0.03748074619202464, "grad_norm": 23.52347183227539, 
"learning_rate": 1.2207644038790645e-06, "loss": 9.5737, "step": 219 }, { "epoch": 0.03765189115180558, "grad_norm": 20.402074813842773, "learning_rate": 1.2264689104392468e-06, "loss": 6.1995, "step": 220 }, { "epoch": 0.037823036111586515, "grad_norm": 18.76962661743164, "learning_rate": 1.2321734169994296e-06, "loss": 7.1013, "step": 221 }, { "epoch": 0.03799418107136745, "grad_norm": 21.817501068115234, "learning_rate": 1.2378779235596121e-06, "loss": 9.3332, "step": 222 }, { "epoch": 0.03816532603114838, "grad_norm": 11.452000617980957, "learning_rate": 1.2435824301197949e-06, "loss": 6.2887, "step": 223 }, { "epoch": 0.038336470990929315, "grad_norm": 22.69776153564453, "learning_rate": 1.2492869366799772e-06, "loss": 7.9947, "step": 224 }, { "epoch": 0.03850761595071025, "grad_norm": 25.39488410949707, "learning_rate": 1.2549914432401597e-06, "loss": 5.1894, "step": 225 }, { "epoch": 0.03867876091049119, "grad_norm": 17.65719223022461, "learning_rate": 1.2606959498003425e-06, "loss": 7.4931, "step": 226 }, { "epoch": 0.03884990587027212, "grad_norm": 23.45711898803711, "learning_rate": 1.2664004563605248e-06, "loss": 9.6157, "step": 227 }, { "epoch": 0.039021050830053056, "grad_norm": 29.114194869995117, "learning_rate": 1.2721049629207073e-06, "loss": 10.4857, "step": 228 }, { "epoch": 0.03919219578983399, "grad_norm": 46.365013122558594, "learning_rate": 1.27780946948089e-06, "loss": 11.9216, "step": 229 }, { "epoch": 0.03936334074961492, "grad_norm": 23.066879272460938, "learning_rate": 1.2835139760410724e-06, "loss": 9.4344, "step": 230 }, { "epoch": 0.039534485709395856, "grad_norm": 15.414644241333008, "learning_rate": 1.289218482601255e-06, "loss": 6.4409, "step": 231 }, { "epoch": 0.039705630669176796, "grad_norm": 16.58795166015625, "learning_rate": 1.2949229891614376e-06, "loss": 7.3307, "step": 232 }, { "epoch": 0.03987677562895773, "grad_norm": 36.44779968261719, "learning_rate": 1.3006274957216202e-06, "loss": 11.1388, "step": 233 }, { "epoch": 
0.04004792058873866, "grad_norm": 20.902912139892578, "learning_rate": 1.3063320022818027e-06, "loss": 7.378, "step": 234 }, { "epoch": 0.040219065548519596, "grad_norm": 20.50259017944336, "learning_rate": 1.3120365088419852e-06, "loss": 6.156, "step": 235 }, { "epoch": 0.04039021050830053, "grad_norm": 22.57229995727539, "learning_rate": 1.3177410154021678e-06, "loss": 7.0029, "step": 236 }, { "epoch": 0.04056135546808146, "grad_norm": 25.610868453979492, "learning_rate": 1.3234455219623503e-06, "loss": 8.7721, "step": 237 }, { "epoch": 0.040732500427862396, "grad_norm": 278.795654296875, "learning_rate": 1.3291500285225328e-06, "loss": 15.9633, "step": 238 }, { "epoch": 0.040903645387643336, "grad_norm": 11.644048690795898, "learning_rate": 1.3348545350827154e-06, "loss": 6.3707, "step": 239 }, { "epoch": 0.04107479034742427, "grad_norm": 36.32057189941406, "learning_rate": 1.340559041642898e-06, "loss": 10.7901, "step": 240 }, { "epoch": 0.0412459353072052, "grad_norm": 22.911476135253906, "learning_rate": 1.3462635482030804e-06, "loss": 9.4097, "step": 241 }, { "epoch": 0.04141708026698614, "grad_norm": 24.35552406311035, "learning_rate": 1.351968054763263e-06, "loss": 9.081, "step": 242 }, { "epoch": 0.04158822522676707, "grad_norm": 18.466432571411133, "learning_rate": 1.3576725613234457e-06, "loss": 7.4805, "step": 243 }, { "epoch": 0.041759370186548, "grad_norm": 44.41029357910156, "learning_rate": 1.363377067883628e-06, "loss": 11.2131, "step": 244 }, { "epoch": 0.041930515146328944, "grad_norm": 15.328824043273926, "learning_rate": 1.3690815744438106e-06, "loss": 8.2706, "step": 245 }, { "epoch": 0.04210166010610988, "grad_norm": 274.3642578125, "learning_rate": 1.3747860810039933e-06, "loss": 15.8791, "step": 246 }, { "epoch": 0.04227280506589081, "grad_norm": 18.105318069458008, "learning_rate": 1.3804905875641756e-06, "loss": 8.9079, "step": 247 }, { "epoch": 0.042443950025671744, "grad_norm": 22.90168571472168, "learning_rate": 
1.3861950941243584e-06, "loss": 6.6905, "step": 248 }, { "epoch": 0.04261509498545268, "grad_norm": 16.96687126159668, "learning_rate": 1.391899600684541e-06, "loss": 8.5567, "step": 249 }, { "epoch": 0.04278623994523361, "grad_norm": 283.76409912109375, "learning_rate": 1.3976041072447232e-06, "loss": 14.4204, "step": 250 }, { "epoch": 0.04295738490501455, "grad_norm": 22.41378402709961, "learning_rate": 1.403308613804906e-06, "loss": 9.6063, "step": 251 }, { "epoch": 0.043128529864795484, "grad_norm": 23.26137924194336, "learning_rate": 1.4090131203650885e-06, "loss": 9.9569, "step": 252 }, { "epoch": 0.04329967482457642, "grad_norm": 19.40400505065918, "learning_rate": 1.414717626925271e-06, "loss": 6.4322, "step": 253 }, { "epoch": 0.04347081978435735, "grad_norm": 21.541933059692383, "learning_rate": 1.4204221334854536e-06, "loss": 4.5325, "step": 254 }, { "epoch": 0.043641964744138284, "grad_norm": 17.52275276184082, "learning_rate": 1.426126640045636e-06, "loss": 8.3479, "step": 255 }, { "epoch": 0.04381310970391922, "grad_norm": 125.6756591796875, "learning_rate": 1.4318311466058186e-06, "loss": 15.4145, "step": 256 }, { "epoch": 0.04398425466370015, "grad_norm": 18.166152954101562, "learning_rate": 1.4375356531660011e-06, "loss": 4.2531, "step": 257 }, { "epoch": 0.04415539962348109, "grad_norm": 25.4247989654541, "learning_rate": 1.4432401597261837e-06, "loss": 10.4856, "step": 258 }, { "epoch": 0.044326544583262024, "grad_norm": 17.259897232055664, "learning_rate": 1.4489446662863664e-06, "loss": 8.6032, "step": 259 }, { "epoch": 0.04449768954304296, "grad_norm": 23.197059631347656, "learning_rate": 1.4546491728465487e-06, "loss": 7.8062, "step": 260 }, { "epoch": 0.04466883450282389, "grad_norm": 43.4500617980957, "learning_rate": 1.4603536794067313e-06, "loss": 11.1986, "step": 261 }, { "epoch": 0.044839979462604825, "grad_norm": 122.06368255615234, "learning_rate": 1.466058185966914e-06, "loss": 15.5832, "step": 262 }, { "epoch": 0.04501112442238576, 
"grad_norm": 16.506317138671875, "learning_rate": 1.4717626925270965e-06, "loss": 8.7747, "step": 263 }, { "epoch": 0.0451822693821667, "grad_norm": 19.03982162475586, "learning_rate": 1.4774671990872789e-06, "loss": 7.6134, "step": 264 }, { "epoch": 0.04535341434194763, "grad_norm": 33.20307540893555, "learning_rate": 1.4831717056474616e-06, "loss": 10.3325, "step": 265 }, { "epoch": 0.045524559301728565, "grad_norm": 16.946876525878906, "learning_rate": 1.4888762122076441e-06, "loss": 8.2866, "step": 266 }, { "epoch": 0.0456957042615095, "grad_norm": 25.170318603515625, "learning_rate": 1.4945807187678265e-06, "loss": 10.2958, "step": 267 }, { "epoch": 0.04586684922129043, "grad_norm": 16.860721588134766, "learning_rate": 1.5002852253280092e-06, "loss": 8.5198, "step": 268 }, { "epoch": 0.046037994181071365, "grad_norm": 18.003284454345703, "learning_rate": 1.5059897318881917e-06, "loss": 8.8484, "step": 269 }, { "epoch": 0.046209139140852305, "grad_norm": 17.796016693115234, "learning_rate": 1.511694238448374e-06, "loss": 6.2495, "step": 270 }, { "epoch": 0.04638028410063324, "grad_norm": 23.97182846069336, "learning_rate": 1.5173987450085568e-06, "loss": 7.0879, "step": 271 }, { "epoch": 0.04655142906041417, "grad_norm": 213.1482696533203, "learning_rate": 1.5231032515687393e-06, "loss": 12.8754, "step": 272 }, { "epoch": 0.046722574020195105, "grad_norm": 25.503662109375, "learning_rate": 1.528807758128922e-06, "loss": 7.9125, "step": 273 }, { "epoch": 0.04689371897997604, "grad_norm": 19.832860946655273, "learning_rate": 1.5345122646891044e-06, "loss": 9.0794, "step": 274 }, { "epoch": 0.04706486393975697, "grad_norm": 32.311920166015625, "learning_rate": 1.540216771249287e-06, "loss": 10.648, "step": 275 }, { "epoch": 0.047236008899537905, "grad_norm": 39.916603088378906, "learning_rate": 1.5459212778094697e-06, "loss": 10.9246, "step": 276 }, { "epoch": 0.047407153859318846, "grad_norm": 21.337602615356445, "learning_rate": 1.551625784369652e-06, "loss": 
9.2191, "step": 277 }, { "epoch": 0.04757829881909978, "grad_norm": 25.114675521850586, "learning_rate": 1.5573302909298345e-06, "loss": 10.2576, "step": 278 }, { "epoch": 0.04774944377888071, "grad_norm": 14.945568084716797, "learning_rate": 1.5630347974900173e-06, "loss": 8.4857, "step": 279 }, { "epoch": 0.047920588738661646, "grad_norm": 33.542449951171875, "learning_rate": 1.5687393040501996e-06, "loss": 10.9193, "step": 280 }, { "epoch": 0.04809173369844258, "grad_norm": 27.331628799438477, "learning_rate": 1.5744438106103821e-06, "loss": 9.9441, "step": 281 }, { "epoch": 0.04826287865822351, "grad_norm": 17.784677505493164, "learning_rate": 1.5801483171705649e-06, "loss": 6.4105, "step": 282 }, { "epoch": 0.04843402361800445, "grad_norm": 46.38033676147461, "learning_rate": 1.5858528237307474e-06, "loss": 10.5075, "step": 283 }, { "epoch": 0.048605168577785386, "grad_norm": 13.535309791564941, "learning_rate": 1.59155733029093e-06, "loss": 4.4568, "step": 284 }, { "epoch": 0.04877631353756632, "grad_norm": 27.45166015625, "learning_rate": 1.5972618368511125e-06, "loss": 10.2344, "step": 285 }, { "epoch": 0.04894745849734725, "grad_norm": 16.50087547302246, "learning_rate": 1.602966343411295e-06, "loss": 8.5428, "step": 286 }, { "epoch": 0.049118603457128186, "grad_norm": 42.31341552734375, "learning_rate": 1.6086708499714775e-06, "loss": 10.0868, "step": 287 }, { "epoch": 0.04928974841690912, "grad_norm": 17.977153778076172, "learning_rate": 1.61437535653166e-06, "loss": 9.012, "step": 288 }, { "epoch": 0.04946089337669006, "grad_norm": 104.7464828491211, "learning_rate": 1.6200798630918426e-06, "loss": 14.6671, "step": 289 }, { "epoch": 0.04963203833647099, "grad_norm": 17.432056427001953, "learning_rate": 1.6257843696520251e-06, "loss": 6.8872, "step": 290 }, { "epoch": 0.04980318329625193, "grad_norm": 242.7275390625, "learning_rate": 1.6314888762122076e-06, "loss": 11.2526, "step": 291 }, { "epoch": 0.04997432825603286, "grad_norm": 15.779862403869629, 
"learning_rate": 1.6371933827723902e-06, "loss": 8.7887, "step": 292 }, { "epoch": 0.05014547321581379, "grad_norm": 13.621806144714355, "learning_rate": 1.642897889332573e-06, "loss": 7.0578, "step": 293 }, { "epoch": 0.05031661817559473, "grad_norm": 14.4631986618042, "learning_rate": 1.6486023958927552e-06, "loss": 8.2147, "step": 294 }, { "epoch": 0.05048776313537566, "grad_norm": 18.11038589477539, "learning_rate": 1.6543069024529378e-06, "loss": 6.4308, "step": 295 }, { "epoch": 0.0506589080951566, "grad_norm": 16.797258377075195, "learning_rate": 1.6600114090131205e-06, "loss": 6.3738, "step": 296 }, { "epoch": 0.050830053054937534, "grad_norm": 17.457462310791016, "learning_rate": 1.6657159155733028e-06, "loss": 6.3681, "step": 297 }, { "epoch": 0.05100119801471847, "grad_norm": 14.502140045166016, "learning_rate": 1.6714204221334856e-06, "loss": 7.1297, "step": 298 }, { "epoch": 0.0511723429744994, "grad_norm": 14.4544677734375, "learning_rate": 1.6771249286936681e-06, "loss": 8.5584, "step": 299 }, { "epoch": 0.051343487934280334, "grad_norm": 13.313618659973145, "learning_rate": 1.6828294352538504e-06, "loss": 8.1348, "step": 300 }, { "epoch": 0.05151463289406127, "grad_norm": 91.8434829711914, "learning_rate": 1.6885339418140332e-06, "loss": 13.9421, "step": 301 }, { "epoch": 0.05168577785384221, "grad_norm": 39.31818389892578, "learning_rate": 1.6942384483742157e-06, "loss": 10.3291, "step": 302 }, { "epoch": 0.05185692281362314, "grad_norm": 16.320667266845703, "learning_rate": 1.6999429549343982e-06, "loss": 4.6866, "step": 303 }, { "epoch": 0.052028067773404074, "grad_norm": 13.367071151733398, "learning_rate": 1.7056474614945808e-06, "loss": 8.1535, "step": 304 }, { "epoch": 0.05219921273318501, "grad_norm": 186.96824645996094, "learning_rate": 1.7113519680547633e-06, "loss": 10.6341, "step": 305 }, { "epoch": 0.05237035769296594, "grad_norm": 28.400169372558594, "learning_rate": 1.7170564746149458e-06, "loss": 9.7369, "step": 306 }, { "epoch": 
0.052541502652746874, "grad_norm": 15.559652328491211, "learning_rate": 1.7227609811751284e-06, "loss": 7.1427, "step": 307 }, { "epoch": 0.05271264761252781, "grad_norm": 5.730342864990234, "learning_rate": 1.728465487735311e-06, "loss": 5.4861, "step": 308 }, { "epoch": 0.05288379257230875, "grad_norm": 19.06242561340332, "learning_rate": 1.7341699942954936e-06, "loss": 9.0657, "step": 309 }, { "epoch": 0.05305493753208968, "grad_norm": 18.580720901489258, "learning_rate": 1.739874500855676e-06, "loss": 5.9947, "step": 310 }, { "epoch": 0.053226082491870615, "grad_norm": 13.939530372619629, "learning_rate": 1.7455790074158585e-06, "loss": 7.1715, "step": 311 }, { "epoch": 0.05339722745165155, "grad_norm": 12.347646713256836, "learning_rate": 1.7512835139760412e-06, "loss": 4.5087, "step": 312 }, { "epoch": 0.05356837241143248, "grad_norm": 16.251863479614258, "learning_rate": 1.7569880205362238e-06, "loss": 8.7544, "step": 313 }, { "epoch": 0.053739517371213415, "grad_norm": 18.887571334838867, "learning_rate": 1.762692527096406e-06, "loss": 7.1006, "step": 314 }, { "epoch": 0.053910662330994355, "grad_norm": 29.57771873474121, "learning_rate": 1.7683970336565888e-06, "loss": 10.2554, "step": 315 }, { "epoch": 0.05408180729077529, "grad_norm": 215.26080322265625, "learning_rate": 1.7741015402167714e-06, "loss": 10.6589, "step": 316 }, { "epoch": 0.05425295225055622, "grad_norm": 6.18715763092041, "learning_rate": 1.7798060467769537e-06, "loss": 5.3794, "step": 317 }, { "epoch": 0.054424097210337155, "grad_norm": 30.351348876953125, "learning_rate": 1.7855105533371364e-06, "loss": 10.3749, "step": 318 }, { "epoch": 0.05459524217011809, "grad_norm": 16.978347778320312, "learning_rate": 1.791215059897319e-06, "loss": 6.2012, "step": 319 }, { "epoch": 0.05476638712989902, "grad_norm": 19.239072799682617, "learning_rate": 1.7969195664575015e-06, "loss": 9.1925, "step": 320 }, { "epoch": 0.05493753208967996, "grad_norm": 20.378984451293945, "learning_rate": 
1.802624073017684e-06, "loss": 8.7484, "step": 321 }, { "epoch": 0.055108677049460895, "grad_norm": 11.863981246948242, "learning_rate": 1.8083285795778666e-06, "loss": 6.308, "step": 322 }, { "epoch": 0.05527982200924183, "grad_norm": 15.815791130065918, "learning_rate": 1.8140330861380493e-06, "loss": 8.9935, "step": 323 }, { "epoch": 0.05545096696902276, "grad_norm": 31.865665435791016, "learning_rate": 1.8197375926982316e-06, "loss": 10.1397, "step": 324 }, { "epoch": 0.055622111928803696, "grad_norm": 160.87301635742188, "learning_rate": 1.8254420992584141e-06, "loss": 9.2965, "step": 325 }, { "epoch": 0.05579325688858463, "grad_norm": 16.763856887817383, "learning_rate": 1.8311466058185969e-06, "loss": 6.6638, "step": 326 }, { "epoch": 0.05596440184836556, "grad_norm": 12.291769981384277, "learning_rate": 1.8368511123787792e-06, "loss": 8.2182, "step": 327 }, { "epoch": 0.0561355468081465, "grad_norm": 20.839473724365234, "learning_rate": 1.8425556189389617e-06, "loss": 5.9446, "step": 328 }, { "epoch": 0.056306691767927436, "grad_norm": 41.371337890625, "learning_rate": 1.8482601254991445e-06, "loss": 10.0738, "step": 329 }, { "epoch": 0.05647783672770837, "grad_norm": 12.416519165039062, "learning_rate": 1.8539646320593268e-06, "loss": 7.9372, "step": 330 }, { "epoch": 0.0566489816874893, "grad_norm": 12.856998443603516, "learning_rate": 1.8596691386195093e-06, "loss": 8.5894, "step": 331 }, { "epoch": 0.056820126647270236, "grad_norm": 28.67165184020996, "learning_rate": 1.865373645179692e-06, "loss": 9.856, "step": 332 }, { "epoch": 0.05699127160705117, "grad_norm": 17.425006866455078, "learning_rate": 1.8710781517398746e-06, "loss": 7.6487, "step": 333 }, { "epoch": 0.05716241656683211, "grad_norm": 29.102951049804688, "learning_rate": 1.8767826583000571e-06, "loss": 9.7985, "step": 334 }, { "epoch": 0.05733356152661304, "grad_norm": 15.120597839355469, "learning_rate": 1.8824871648602395e-06, "loss": 8.856, "step": 335 }, { "epoch": 
0.057504706486393976, "grad_norm": 188.02642822265625, "learning_rate": 1.8881916714204222e-06, "loss": 9.3274, "step": 336 }, { "epoch": 0.05767585144617491, "grad_norm": 14.4713134765625, "learning_rate": 1.8938961779806047e-06, "loss": 8.8408, "step": 337 }, { "epoch": 0.05784699640595584, "grad_norm": 27.848546981811523, "learning_rate": 1.8996006845407875e-06, "loss": 10.1598, "step": 338 }, { "epoch": 0.058018141365736776, "grad_norm": 12.024163246154785, "learning_rate": 1.9053051911009698e-06, "loss": 6.2088, "step": 339 }, { "epoch": 0.05818928632551772, "grad_norm": 11.968954086303711, "learning_rate": 1.9110096976611523e-06, "loss": 7.3791, "step": 340 }, { "epoch": 0.05836043128529865, "grad_norm": 27.01519775390625, "learning_rate": 1.9167142042213353e-06, "loss": 10.5111, "step": 341 }, { "epoch": 0.05853157624507958, "grad_norm": 13.136455535888672, "learning_rate": 1.9224187107815174e-06, "loss": 4.512, "step": 342 }, { "epoch": 0.05870272120486052, "grad_norm": 16.26902198791504, "learning_rate": 1.9281232173417e-06, "loss": 6.8285, "step": 343 }, { "epoch": 0.05887386616464145, "grad_norm": 16.47487449645996, "learning_rate": 1.933827723901883e-06, "loss": 8.8793, "step": 344 }, { "epoch": 0.059045011124422384, "grad_norm": 32.750850677490234, "learning_rate": 1.939532230462065e-06, "loss": 10.0536, "step": 345 }, { "epoch": 0.05921615608420332, "grad_norm": 18.996196746826172, "learning_rate": 1.9452367370222475e-06, "loss": 6.1966, "step": 346 }, { "epoch": 0.05938730104398426, "grad_norm": 24.546964645385742, "learning_rate": 1.9509412435824305e-06, "loss": 9.4677, "step": 347 }, { "epoch": 0.05955844600376519, "grad_norm": 84.20301055908203, "learning_rate": 1.9566457501426126e-06, "loss": 14.0007, "step": 348 }, { "epoch": 0.059729590963546124, "grad_norm": 18.845518112182617, "learning_rate": 1.962350256702795e-06, "loss": 6.1715, "step": 349 }, { "epoch": 0.05990073592332706, "grad_norm": 32.177085876464844, "learning_rate": 
1.968054763262978e-06, "loss": 10.5934, "step": 350 }, { "epoch": 0.06007188088310799, "grad_norm": 24.051923751831055, "learning_rate": 1.97375926982316e-06, "loss": 9.6501, "step": 351 }, { "epoch": 0.060243025842888924, "grad_norm": 13.522736549377441, "learning_rate": 1.9794637763833427e-06, "loss": 8.8967, "step": 352 }, { "epoch": 0.060414170802669864, "grad_norm": 21.437868118286133, "learning_rate": 1.9851682829435257e-06, "loss": 9.8243, "step": 353 }, { "epoch": 0.0605853157624508, "grad_norm": 30.177589416503906, "learning_rate": 1.9908727895037078e-06, "loss": 9.5401, "step": 354 }, { "epoch": 0.06075646072223173, "grad_norm": 12.939532279968262, "learning_rate": 1.9965772960638903e-06, "loss": 6.4143, "step": 355 }, { "epoch": 0.060927605682012664, "grad_norm": 18.022136688232422, "learning_rate": 2.0022818026240733e-06, "loss": 9.6522, "step": 356 }, { "epoch": 0.0610987506417936, "grad_norm": 12.483067512512207, "learning_rate": 2.0079863091842554e-06, "loss": 9.2254, "step": 357 }, { "epoch": 0.06126989560157453, "grad_norm": 19.432615280151367, "learning_rate": 2.0136908157444383e-06, "loss": 6.2483, "step": 358 }, { "epoch": 0.06144104056135547, "grad_norm": 177.3258819580078, "learning_rate": 2.019395322304621e-06, "loss": 9.2975, "step": 359 }, { "epoch": 0.061612185521136405, "grad_norm": 14.458636283874512, "learning_rate": 2.025099828864803e-06, "loss": 8.5874, "step": 360 }, { "epoch": 0.06178333048091734, "grad_norm": 21.112350463867188, "learning_rate": 2.030804335424986e-06, "loss": 9.4896, "step": 361 }, { "epoch": 0.06195447544069827, "grad_norm": 15.956084251403809, "learning_rate": 2.0365088419851685e-06, "loss": 9.3311, "step": 362 }, { "epoch": 0.062125620400479205, "grad_norm": 11.96216869354248, "learning_rate": 2.042213348545351e-06, "loss": 8.2885, "step": 363 }, { "epoch": 0.06229676536026014, "grad_norm": 16.588687896728516, "learning_rate": 2.0479178551055335e-06, "loss": 8.5745, "step": 364 }, { "epoch": 0.06246791032004107, 
"grad_norm": 20.95501708984375, "learning_rate": 2.053622361665716e-06, "loss": 9.5327, "step": 365 }, { "epoch": 0.06263905527982201, "grad_norm": 14.255351066589355, "learning_rate": 2.0593268682258986e-06, "loss": 9.1372, "step": 366 }, { "epoch": 0.06281020023960295, "grad_norm": 17.529571533203125, "learning_rate": 2.065031374786081e-06, "loss": 6.9098, "step": 367 }, { "epoch": 0.06298134519938388, "grad_norm": 23.381641387939453, "learning_rate": 2.0707358813462636e-06, "loss": 9.4994, "step": 368 }, { "epoch": 0.06315249015916481, "grad_norm": 152.30535888671875, "learning_rate": 2.076440387906446e-06, "loss": 8.5952, "step": 369 }, { "epoch": 0.06332363511894575, "grad_norm": 15.447931289672852, "learning_rate": 2.0821448944666287e-06, "loss": 7.1287, "step": 370 }, { "epoch": 0.06349478007872668, "grad_norm": 13.553053855895996, "learning_rate": 2.0878494010268112e-06, "loss": 8.0622, "step": 371 }, { "epoch": 0.06366592503850761, "grad_norm": 13.198517799377441, "learning_rate": 2.0935539075869938e-06, "loss": 8.3527, "step": 372 }, { "epoch": 0.06383706999828855, "grad_norm": 21.851369857788086, "learning_rate": 2.0992584141471763e-06, "loss": 6.7771, "step": 373 }, { "epoch": 0.06400821495806948, "grad_norm": 30.56134605407715, "learning_rate": 2.104962920707359e-06, "loss": 9.666, "step": 374 }, { "epoch": 0.06417935991785043, "grad_norm": 18.76494026184082, "learning_rate": 2.1106674272675414e-06, "loss": 9.3941, "step": 375 }, { "epoch": 0.06435050487763136, "grad_norm": 19.92658805847168, "learning_rate": 2.116371933827724e-06, "loss": 9.3741, "step": 376 }, { "epoch": 0.06452164983741229, "grad_norm": 10.430363655090332, "learning_rate": 2.1220764403879064e-06, "loss": 7.8113, "step": 377 }, { "epoch": 0.06469279479719323, "grad_norm": 18.093847274780273, "learning_rate": 2.1277809469480894e-06, "loss": 6.1706, "step": 378 }, { "epoch": 0.06486393975697416, "grad_norm": 21.807714462280273, "learning_rate": 2.1334854535082715e-06, "loss": 9.471, 
"step": 379 }, { "epoch": 0.06503508471675509, "grad_norm": 10.38511848449707, "learning_rate": 2.139189960068454e-06, "loss": 4.2784, "step": 380 }, { "epoch": 0.06520622967653603, "grad_norm": 18.564613342285156, "learning_rate": 2.144894466628637e-06, "loss": 9.548, "step": 381 }, { "epoch": 0.06537737463631696, "grad_norm": 13.890935897827148, "learning_rate": 2.150598973188819e-06, "loss": 7.9354, "step": 382 }, { "epoch": 0.06554851959609789, "grad_norm": 18.593252182006836, "learning_rate": 2.1563034797490016e-06, "loss": 6.34, "step": 383 }, { "epoch": 0.06571966455587883, "grad_norm": 10.455931663513184, "learning_rate": 2.1620079863091846e-06, "loss": 6.2716, "step": 384 }, { "epoch": 0.06589080951565976, "grad_norm": 21.231943130493164, "learning_rate": 2.1677124928693667e-06, "loss": 5.9761, "step": 385 }, { "epoch": 0.06606195447544069, "grad_norm": 11.568195343017578, "learning_rate": 2.173416999429549e-06, "loss": 4.4776, "step": 386 }, { "epoch": 0.06623309943522163, "grad_norm": 23.829204559326172, "learning_rate": 2.179121505989732e-06, "loss": 9.648, "step": 387 }, { "epoch": 0.06640424439500257, "grad_norm": 10.398987770080566, "learning_rate": 2.1848260125499147e-06, "loss": 4.7062, "step": 388 }, { "epoch": 0.0665753893547835, "grad_norm": 11.396307945251465, "learning_rate": 2.190530519110097e-06, "loss": 8.1087, "step": 389 }, { "epoch": 0.06674653431456444, "grad_norm": 18.780866622924805, "learning_rate": 2.1962350256702798e-06, "loss": 6.196, "step": 390 }, { "epoch": 0.06691767927434537, "grad_norm": 18.36736488342285, "learning_rate": 2.2019395322304623e-06, "loss": 6.2459, "step": 391 }, { "epoch": 0.0670888242341263, "grad_norm": 18.681446075439453, "learning_rate": 2.2076440387906444e-06, "loss": 9.4161, "step": 392 }, { "epoch": 0.06725996919390724, "grad_norm": 15.113629341125488, "learning_rate": 2.2133485453508274e-06, "loss": 6.3517, "step": 393 }, { "epoch": 0.06743111415368817, "grad_norm": 11.273137092590332, "learning_rate": 
2.21905305191101e-06, "loss": 8.0886, "step": 394 }, { "epoch": 0.06760225911346911, "grad_norm": 17.580646514892578, "learning_rate": 2.224757558471192e-06, "loss": 6.5059, "step": 395 }, { "epoch": 0.06777340407325004, "grad_norm": 15.864416122436523, "learning_rate": 2.230462065031375e-06, "loss": 8.5624, "step": 396 }, { "epoch": 0.06794454903303097, "grad_norm": 11.407431602478027, "learning_rate": 2.2361665715915575e-06, "loss": 7.853, "step": 397 }, { "epoch": 0.06811569399281191, "grad_norm": 28.192079544067383, "learning_rate": 2.24187107815174e-06, "loss": 9.4467, "step": 398 }, { "epoch": 0.06828683895259284, "grad_norm": 19.4180965423584, "learning_rate": 2.2475755847119225e-06, "loss": 5.6605, "step": 399 }, { "epoch": 0.06845798391237377, "grad_norm": 19.75929069519043, "learning_rate": 2.253280091272105e-06, "loss": 9.4512, "step": 400 }, { "epoch": 0.06862912887215472, "grad_norm": 10.311906814575195, "learning_rate": 2.2589845978322876e-06, "loss": 7.9644, "step": 401 }, { "epoch": 0.06880027383193565, "grad_norm": 20.4741268157959, "learning_rate": 2.26468910439247e-06, "loss": 9.4577, "step": 402 }, { "epoch": 0.06897141879171659, "grad_norm": 25.65606117248535, "learning_rate": 2.2703936109526527e-06, "loss": 9.6371, "step": 403 }, { "epoch": 0.06914256375149752, "grad_norm": 26.26441192626953, "learning_rate": 2.276098117512835e-06, "loss": 9.5365, "step": 404 }, { "epoch": 0.06931370871127845, "grad_norm": 14.249612808227539, "learning_rate": 2.2818026240730177e-06, "loss": 8.5897, "step": 405 }, { "epoch": 0.06948485367105939, "grad_norm": 17.306989669799805, "learning_rate": 2.2875071306332003e-06, "loss": 6.9065, "step": 406 }, { "epoch": 0.06965599863084032, "grad_norm": 10.925597190856934, "learning_rate": 2.293211637193383e-06, "loss": 4.4132, "step": 407 }, { "epoch": 0.06982714359062125, "grad_norm": 20.995426177978516, "learning_rate": 2.2989161437535653e-06, "loss": 9.6018, "step": 408 }, { "epoch": 0.06999828855040219, "grad_norm": 
13.343510627746582, "learning_rate": 2.304620650313748e-06, "loss": 8.3354, "step": 409 }, { "epoch": 0.07016943351018312, "grad_norm": 21.461809158325195, "learning_rate": 2.3103251568739304e-06, "loss": 9.3101, "step": 410 }, { "epoch": 0.07034057846996405, "grad_norm": 25.428903579711914, "learning_rate": 2.316029663434113e-06, "loss": 9.4155, "step": 411 }, { "epoch": 0.07051172342974499, "grad_norm": 22.469390869140625, "learning_rate": 2.3217341699942955e-06, "loss": 6.4331, "step": 412 }, { "epoch": 0.07068286838952594, "grad_norm": 157.02752685546875, "learning_rate": 2.3274386765544784e-06, "loss": 7.6313, "step": 413 }, { "epoch": 0.07085401334930687, "grad_norm": 12.20741081237793, "learning_rate": 2.3331431831146605e-06, "loss": 4.2273, "step": 414 }, { "epoch": 0.0710251583090878, "grad_norm": 19.81876564025879, "learning_rate": 2.338847689674843e-06, "loss": 9.5364, "step": 415 }, { "epoch": 0.07119630326886874, "grad_norm": 17.362276077270508, "learning_rate": 2.344552196235026e-06, "loss": 9.4605, "step": 416 }, { "epoch": 0.07136744822864967, "grad_norm": 22.898147583007812, "learning_rate": 2.350256702795208e-06, "loss": 9.5846, "step": 417 }, { "epoch": 0.0715385931884306, "grad_norm": 17.685535430908203, "learning_rate": 2.3559612093553906e-06, "loss": 8.0604, "step": 418 }, { "epoch": 0.07170973814821154, "grad_norm": 16.97225570678711, "learning_rate": 2.3616657159155736e-06, "loss": 9.0822, "step": 419 }, { "epoch": 0.07188088310799247, "grad_norm": 21.690431594848633, "learning_rate": 2.3673702224757557e-06, "loss": 5.9587, "step": 420 }, { "epoch": 0.0720520280677734, "grad_norm": 20.209810256958008, "learning_rate": 2.3730747290359387e-06, "loss": 6.1507, "step": 421 }, { "epoch": 0.07222317302755434, "grad_norm": 19.15233039855957, "learning_rate": 2.378779235596121e-06, "loss": 9.5222, "step": 422 }, { "epoch": 0.07239431798733527, "grad_norm": 15.19393539428711, "learning_rate": 2.3844837421563033e-06, "loss": 8.3063, "step": 423 }, { 
"epoch": 0.0725654629471162, "grad_norm": 14.138923645019531, "learning_rate": 2.3901882487164863e-06, "loss": 8.3802, "step": 424 }, { "epoch": 0.07273660790689714, "grad_norm": 23.83425521850586, "learning_rate": 2.395892755276669e-06, "loss": 9.554, "step": 425 }, { "epoch": 0.07290775286667808, "grad_norm": 19.778850555419922, "learning_rate": 2.401597261836851e-06, "loss": 6.0866, "step": 426 }, { "epoch": 0.07307889782645902, "grad_norm": 12.418360710144043, "learning_rate": 2.407301768397034e-06, "loss": 7.7723, "step": 427 }, { "epoch": 0.07325004278623995, "grad_norm": 21.105587005615234, "learning_rate": 2.4130062749572164e-06, "loss": 6.011, "step": 428 }, { "epoch": 0.07342118774602088, "grad_norm": 18.78055763244629, "learning_rate": 2.4187107815173985e-06, "loss": 6.4389, "step": 429 }, { "epoch": 0.07359233270580182, "grad_norm": 17.227916717529297, "learning_rate": 2.4244152880775814e-06, "loss": 6.6973, "step": 430 }, { "epoch": 0.07376347766558275, "grad_norm": 21.845876693725586, "learning_rate": 2.430119794637764e-06, "loss": 5.9158, "step": 431 }, { "epoch": 0.07393462262536368, "grad_norm": 14.355096817016602, "learning_rate": 2.435824301197946e-06, "loss": 8.6576, "step": 432 }, { "epoch": 0.07410576758514462, "grad_norm": 149.28054809570312, "learning_rate": 2.441528807758129e-06, "loss": 7.7649, "step": 433 }, { "epoch": 0.07427691254492555, "grad_norm": 18.152389526367188, "learning_rate": 2.4472333143183116e-06, "loss": 6.6434, "step": 434 }, { "epoch": 0.07444805750470648, "grad_norm": 17.05584716796875, "learning_rate": 2.4529378208784937e-06, "loss": 9.1462, "step": 435 }, { "epoch": 0.07461920246448742, "grad_norm": 11.82278060913086, "learning_rate": 2.4586423274386766e-06, "loss": 8.2832, "step": 436 }, { "epoch": 0.07479034742426835, "grad_norm": 17.951648712158203, "learning_rate": 2.464346833998859e-06, "loss": 8.4052, "step": 437 }, { "epoch": 0.07496149238404928, "grad_norm": 31.258188247680664, "learning_rate": 
2.4700513405590417e-06, "loss": 9.4477, "step": 438 }, { "epoch": 0.07513263734383023, "grad_norm": 138.91761779785156, "learning_rate": 2.4757558471192242e-06, "loss": 8.3869, "step": 439 }, { "epoch": 0.07530378230361116, "grad_norm": 17.930551528930664, "learning_rate": 2.4814603536794068e-06, "loss": 9.1768, "step": 440 }, { "epoch": 0.0754749272633921, "grad_norm": 10.999883651733398, "learning_rate": 2.4871648602395897e-06, "loss": 4.1341, "step": 441 }, { "epoch": 0.07564607222317303, "grad_norm": 19.707490921020508, "learning_rate": 2.492869366799772e-06, "loss": 6.0241, "step": 442 }, { "epoch": 0.07581721718295396, "grad_norm": 19.63069725036621, "learning_rate": 2.4985738733599544e-06, "loss": 9.5659, "step": 443 }, { "epoch": 0.0759883621427349, "grad_norm": 19.783658981323242, "learning_rate": 2.5042783799201373e-06, "loss": 6.632, "step": 444 }, { "epoch": 0.07615950710251583, "grad_norm": 11.193924903869629, "learning_rate": 2.5099828864803194e-06, "loss": 4.213, "step": 445 }, { "epoch": 0.07633065206229676, "grad_norm": 65.09992218017578, "learning_rate": 2.515687393040502e-06, "loss": 13.1721, "step": 446 }, { "epoch": 0.0765017970220777, "grad_norm": 19.081214904785156, "learning_rate": 2.521391899600685e-06, "loss": 8.7605, "step": 447 }, { "epoch": 0.07667294198185863, "grad_norm": 17.08602523803711, "learning_rate": 2.527096406160867e-06, "loss": 8.4352, "step": 448 }, { "epoch": 0.07684408694163956, "grad_norm": 11.796391487121582, "learning_rate": 2.5328009127210495e-06, "loss": 7.9838, "step": 449 }, { "epoch": 0.0770152319014205, "grad_norm": 17.306316375732422, "learning_rate": 2.5385054192812325e-06, "loss": 7.9123, "step": 450 }, { "epoch": 0.07718637686120144, "grad_norm": 11.991724014282227, "learning_rate": 2.5442099258414146e-06, "loss": 7.904, "step": 451 }, { "epoch": 0.07735752182098238, "grad_norm": 18.394563674926758, "learning_rate": 2.549914432401597e-06, "loss": 6.7541, "step": 452 }, { "epoch": 0.07752866678076331, 
"grad_norm": 21.436811447143555, "learning_rate": 2.55561893896178e-06, "loss": 5.5488, "step": 453 }, { "epoch": 0.07769981174054424, "grad_norm": 15.822162628173828, "learning_rate": 2.561323445521962e-06, "loss": 7.7392, "step": 454 }, { "epoch": 0.07787095670032518, "grad_norm": 19.68645668029785, "learning_rate": 2.5670279520821447e-06, "loss": 6.6529, "step": 455 }, { "epoch": 0.07804210166010611, "grad_norm": 18.808198928833008, "learning_rate": 2.5727324586423277e-06, "loss": 8.784, "step": 456 }, { "epoch": 0.07821324661988704, "grad_norm": 131.1753692626953, "learning_rate": 2.57843696520251e-06, "loss": 7.8706, "step": 457 }, { "epoch": 0.07838439157966798, "grad_norm": 11.708639144897461, "learning_rate": 2.5841414717626923e-06, "loss": 7.7402, "step": 458 }, { "epoch": 0.07855553653944891, "grad_norm": 15.965631484985352, "learning_rate": 2.5898459783228753e-06, "loss": 8.301, "step": 459 }, { "epoch": 0.07872668149922984, "grad_norm": 14.710309982299805, "learning_rate": 2.5955504848830574e-06, "loss": 7.9566, "step": 460 }, { "epoch": 0.07889782645901078, "grad_norm": 15.00783634185791, "learning_rate": 2.6012549914432404e-06, "loss": 8.488, "step": 461 }, { "epoch": 0.07906897141879171, "grad_norm": 13.231627464294434, "learning_rate": 2.606959498003423e-06, "loss": 8.1184, "step": 462 }, { "epoch": 0.07924011637857264, "grad_norm": 170.4566192626953, "learning_rate": 2.6126640045636054e-06, "loss": 8.1805, "step": 463 }, { "epoch": 0.07941126133835359, "grad_norm": 23.66990089416504, "learning_rate": 2.618368511123788e-06, "loss": 9.2852, "step": 464 }, { "epoch": 0.07958240629813453, "grad_norm": 20.218496322631836, "learning_rate": 2.6240730176839705e-06, "loss": 6.264, "step": 465 }, { "epoch": 0.07975355125791546, "grad_norm": 27.905323028564453, "learning_rate": 2.629777524244153e-06, "loss": 10.0002, "step": 466 }, { "epoch": 0.07992469621769639, "grad_norm": 22.043649673461914, "learning_rate": 2.6354820308043355e-06, "loss": 8.6303, "step": 
467 }, { "epoch": 0.08009584117747733, "grad_norm": 20.095890045166016, "learning_rate": 2.641186537364518e-06, "loss": 8.7857, "step": 468 }, { "epoch": 0.08026698613725826, "grad_norm": 30.715435028076172, "learning_rate": 2.6468910439247006e-06, "loss": 9.6486, "step": 469 }, { "epoch": 0.08043813109703919, "grad_norm": 18.83611488342285, "learning_rate": 2.652595550484883e-06, "loss": 7.9544, "step": 470 }, { "epoch": 0.08060927605682013, "grad_norm": 20.929931640625, "learning_rate": 2.6583000570450657e-06, "loss": 6.2772, "step": 471 }, { "epoch": 0.08078042101660106, "grad_norm": 18.414594650268555, "learning_rate": 2.664004563605248e-06, "loss": 6.1477, "step": 472 }, { "epoch": 0.08095156597638199, "grad_norm": 18.188846588134766, "learning_rate": 2.6697090701654307e-06, "loss": 7.099, "step": 473 }, { "epoch": 0.08112271093616293, "grad_norm": 8.666217803955078, "learning_rate": 2.6754135767256133e-06, "loss": 5.0929, "step": 474 }, { "epoch": 0.08129385589594386, "grad_norm": 15.457167625427246, "learning_rate": 2.681118083285796e-06, "loss": 7.8706, "step": 475 }, { "epoch": 0.08146500085572479, "grad_norm": 17.11892318725586, "learning_rate": 2.6868225898459783e-06, "loss": 8.5293, "step": 476 }, { "epoch": 0.08163614581550574, "grad_norm": 28.18759536743164, "learning_rate": 2.692527096406161e-06, "loss": 5.7448, "step": 477 }, { "epoch": 0.08180729077528667, "grad_norm": 19.842830657958984, "learning_rate": 2.6982316029663434e-06, "loss": 8.5854, "step": 478 }, { "epoch": 0.0819784357350676, "grad_norm": 59.76820373535156, "learning_rate": 2.703936109526526e-06, "loss": 12.4879, "step": 479 }, { "epoch": 0.08214958069484854, "grad_norm": 15.530830383300781, "learning_rate": 2.7096406160867085e-06, "loss": 8.191, "step": 480 }, { "epoch": 0.08232072565462947, "grad_norm": 21.211435317993164, "learning_rate": 2.7153451226468914e-06, "loss": 9.4326, "step": 481 }, { "epoch": 0.0824918706144104, "grad_norm": 16.38536834716797, "learning_rate": 
2.7210496292070735e-06, "loss": 6.3342, "step": 482 }, { "epoch": 0.08266301557419134, "grad_norm": 30.17742919921875, "learning_rate": 2.726754135767256e-06, "loss": 5.5068, "step": 483 }, { "epoch": 0.08283416053397227, "grad_norm": 27.44713020324707, "learning_rate": 2.732458642327439e-06, "loss": 9.4586, "step": 484 }, { "epoch": 0.0830053054937532, "grad_norm": 59.46120071411133, "learning_rate": 2.738163148887621e-06, "loss": 12.3889, "step": 485 }, { "epoch": 0.08317645045353414, "grad_norm": 26.801589965820312, "learning_rate": 2.7438676554478036e-06, "loss": 5.3141, "step": 486 }, { "epoch": 0.08334759541331507, "grad_norm": 32.20411682128906, "learning_rate": 2.7495721620079866e-06, "loss": 5.4274, "step": 487 }, { "epoch": 0.083518740373096, "grad_norm": 16.14412498474121, "learning_rate": 2.755276668568169e-06, "loss": 8.3447, "step": 488 }, { "epoch": 0.08368988533287694, "grad_norm": 16.79600715637207, "learning_rate": 2.7609811751283512e-06, "loss": 7.7737, "step": 489 }, { "epoch": 0.08386103029265789, "grad_norm": 171.59872436523438, "learning_rate": 2.766685681688534e-06, "loss": 8.277, "step": 490 }, { "epoch": 0.08403217525243882, "grad_norm": 29.80289649963379, "learning_rate": 2.7723901882487167e-06, "loss": 5.273, "step": 491 }, { "epoch": 0.08420332021221975, "grad_norm": 15.38176155090332, "learning_rate": 2.778094694808899e-06, "loss": 7.8611, "step": 492 }, { "epoch": 0.08437446517200069, "grad_norm": 19.766082763671875, "learning_rate": 2.783799201369082e-06, "loss": 7.7926, "step": 493 }, { "epoch": 0.08454561013178162, "grad_norm": 13.274962425231934, "learning_rate": 2.7895037079292643e-06, "loss": 4.1215, "step": 494 }, { "epoch": 0.08471675509156255, "grad_norm": 29.015403747558594, "learning_rate": 2.7952082144894464e-06, "loss": 5.4146, "step": 495 }, { "epoch": 0.08488790005134349, "grad_norm": 22.243703842163086, "learning_rate": 2.8009127210496294e-06, "loss": 5.753, "step": 496 }, { "epoch": 0.08505904501112442, "grad_norm": 
23.75475311279297, "learning_rate": 2.806617227609812e-06, "loss": 5.7119, "step": 497 }, { "epoch": 0.08523018997090535, "grad_norm": 19.524032592773438, "learning_rate": 2.812321734169994e-06, "loss": 8.9719, "step": 498 }, { "epoch": 0.08540133493068629, "grad_norm": 22.207155227661133, "learning_rate": 2.818026240730177e-06, "loss": 8.5433, "step": 499 }, { "epoch": 0.08557247989046722, "grad_norm": 20.369564056396484, "learning_rate": 2.8237307472903595e-06, "loss": 9.2212, "step": 500 }, { "epoch": 0.08574362485024815, "grad_norm": 12.617632865905762, "learning_rate": 2.829435253850542e-06, "loss": 7.5878, "step": 501 }, { "epoch": 0.0859147698100291, "grad_norm": 16.92389678955078, "learning_rate": 2.8351397604107246e-06, "loss": 8.1394, "step": 502 }, { "epoch": 0.08608591476981003, "grad_norm": 52.22781753540039, "learning_rate": 2.840844266970907e-06, "loss": 11.9304, "step": 503 }, { "epoch": 0.08625705972959097, "grad_norm": 19.299196243286133, "learning_rate": 2.8465487735310896e-06, "loss": 7.5487, "step": 504 }, { "epoch": 0.0864282046893719, "grad_norm": 25.007366180419922, "learning_rate": 2.852253280091272e-06, "loss": 6.4953, "step": 505 }, { "epoch": 0.08659934964915283, "grad_norm": 44.58477020263672, "learning_rate": 2.8579577866514547e-06, "loss": 11.543, "step": 506 }, { "epoch": 0.08677049460893377, "grad_norm": 18.95302963256836, "learning_rate": 2.8636622932116372e-06, "loss": 7.7713, "step": 507 }, { "epoch": 0.0869416395687147, "grad_norm": 15.56648063659668, "learning_rate": 2.8693667997718198e-06, "loss": 8.5567, "step": 508 }, { "epoch": 0.08711278452849563, "grad_norm": 20.78284454345703, "learning_rate": 2.8750713063320023e-06, "loss": 7.7135, "step": 509 }, { "epoch": 0.08728392948827657, "grad_norm": 23.176607131958008, "learning_rate": 2.880775812892185e-06, "loss": 8.2685, "step": 510 }, { "epoch": 0.0874550744480575, "grad_norm": 25.212718963623047, "learning_rate": 2.8864803194523674e-06, "loss": 8.9983, "step": 511 }, { 
"epoch": 0.08762621940783843, "grad_norm": 27.220836639404297, "learning_rate": 2.89218482601255e-06, "loss": 6.6334, "step": 512 }, { "epoch": 0.08779736436761937, "grad_norm": 13.128168106079102, "learning_rate": 2.897889332572733e-06, "loss": 3.847, "step": 513 }, { "epoch": 0.0879685093274003, "grad_norm": 19.84160614013672, "learning_rate": 2.903593839132915e-06, "loss": 8.0045, "step": 514 }, { "epoch": 0.08813965428718125, "grad_norm": 15.77076530456543, "learning_rate": 2.9092983456930975e-06, "loss": 7.8019, "step": 515 }, { "epoch": 0.08831079924696218, "grad_norm": 158.41465759277344, "learning_rate": 2.9150028522532804e-06, "loss": 8.6448, "step": 516 }, { "epoch": 0.08848194420674312, "grad_norm": 23.563339233398438, "learning_rate": 2.9207073588134625e-06, "loss": 8.8163, "step": 517 }, { "epoch": 0.08865308916652405, "grad_norm": 30.82549476623535, "learning_rate": 2.926411865373645e-06, "loss": 8.627, "step": 518 }, { "epoch": 0.08882423412630498, "grad_norm": 24.138612747192383, "learning_rate": 2.932116371933828e-06, "loss": 6.2112, "step": 519 }, { "epoch": 0.08899537908608592, "grad_norm": 42.6961784362793, "learning_rate": 2.93782087849401e-06, "loss": 11.5101, "step": 520 }, { "epoch": 0.08916652404586685, "grad_norm": 16.58330726623535, "learning_rate": 2.943525385054193e-06, "loss": 7.885, "step": 521 }, { "epoch": 0.08933766900564778, "grad_norm": 17.490467071533203, "learning_rate": 2.9492298916143756e-06, "loss": 7.6631, "step": 522 }, { "epoch": 0.08950881396542872, "grad_norm": 24.303665161132812, "learning_rate": 2.9549343981745577e-06, "loss": 8.5512, "step": 523 }, { "epoch": 0.08967995892520965, "grad_norm": 14.5447416305542, "learning_rate": 2.9606389047347407e-06, "loss": 3.9844, "step": 524 }, { "epoch": 0.08985110388499058, "grad_norm": 28.421756744384766, "learning_rate": 2.9663434112949232e-06, "loss": 9.2179, "step": 525 }, { "epoch": 0.09002224884477152, "grad_norm": 20.097034454345703, "learning_rate": 
2.9720479178551053e-06, "loss": 9.2422, "step": 526 }, { "epoch": 0.09019339380455245, "grad_norm": 20.862869262695312, "learning_rate": 2.9777524244152883e-06, "loss": 8.2303, "step": 527 }, { "epoch": 0.0903645387643334, "grad_norm": 30.980390548706055, "learning_rate": 2.983456930975471e-06, "loss": 9.1253, "step": 528 }, { "epoch": 0.09053568372411433, "grad_norm": 29.973567962646484, "learning_rate": 2.989161437535653e-06, "loss": 4.8928, "step": 529 }, { "epoch": 0.09070682868389526, "grad_norm": 35.399349212646484, "learning_rate": 2.994865944095836e-06, "loss": 4.5559, "step": 530 }, { "epoch": 0.0908779736436762, "grad_norm": 21.178098678588867, "learning_rate": 3.0005704506560184e-06, "loss": 8.8876, "step": 531 }, { "epoch": 0.09104911860345713, "grad_norm": 24.755205154418945, "learning_rate": 3.0062749572162005e-06, "loss": 7.5809, "step": 532 }, { "epoch": 0.09122026356323806, "grad_norm": 23.76934051513672, "learning_rate": 3.0119794637763835e-06, "loss": 8.5706, "step": 533 }, { "epoch": 0.091391408523019, "grad_norm": 40.431190490722656, "learning_rate": 3.017683970336566e-06, "loss": 11.3342, "step": 534 }, { "epoch": 0.09156255348279993, "grad_norm": 22.674354553222656, "learning_rate": 3.023388476896748e-06, "loss": 8.8756, "step": 535 }, { "epoch": 0.09173369844258086, "grad_norm": 33.92606735229492, "learning_rate": 3.029092983456931e-06, "loss": 4.738, "step": 536 }, { "epoch": 0.0919048434023618, "grad_norm": 27.1170711517334, "learning_rate": 3.0347974900171136e-06, "loss": 6.4223, "step": 537 }, { "epoch": 0.09207598836214273, "grad_norm": 25.11066246032715, "learning_rate": 3.040501996577296e-06, "loss": 9.0946, "step": 538 }, { "epoch": 0.09224713332192366, "grad_norm": 23.894901275634766, "learning_rate": 3.0462065031374787e-06, "loss": 6.3395, "step": 539 }, { "epoch": 0.09241827828170461, "grad_norm": 20.199861526489258, "learning_rate": 3.051911009697661e-06, "loss": 6.0524, "step": 540 }, { "epoch": 0.09258942324148554, "grad_norm": 
22.757362365722656, "learning_rate": 3.057615516257844e-06, "loss": 7.1293, "step": 541 }, { "epoch": 0.09276056820126648, "grad_norm": 22.62543487548828, "learning_rate": 3.0633200228180263e-06, "loss": 7.5053, "step": 542 }, { "epoch": 0.09293171316104741, "grad_norm": 16.598411560058594, "learning_rate": 3.069024529378209e-06, "loss": 7.8322, "step": 543 }, { "epoch": 0.09310285812082834, "grad_norm": 20.656627655029297, "learning_rate": 3.0747290359383917e-06, "loss": 8.8013, "step": 544 }, { "epoch": 0.09327400308060928, "grad_norm": 20.95423126220703, "learning_rate": 3.080433542498574e-06, "loss": 6.1923, "step": 545 }, { "epoch": 0.09344514804039021, "grad_norm": 175.26722717285156, "learning_rate": 3.0861380490587564e-06, "loss": 10.2252, "step": 546 }, { "epoch": 0.09361629300017114, "grad_norm": 21.737558364868164, "learning_rate": 3.0918425556189393e-06, "loss": 7.7486, "step": 547 }, { "epoch": 0.09378743795995208, "grad_norm": 41.67558288574219, "learning_rate": 3.0975470621791215e-06, "loss": 11.1347, "step": 548 }, { "epoch": 0.09395858291973301, "grad_norm": 24.20724868774414, "learning_rate": 3.103251568739304e-06, "loss": 8.1228, "step": 549 }, { "epoch": 0.09412972787951394, "grad_norm": 23.995750427246094, "learning_rate": 3.108956075299487e-06, "loss": 8.1871, "step": 550 }, { "epoch": 0.09430087283929488, "grad_norm": 18.58646583557129, "learning_rate": 3.114660581859669e-06, "loss": 7.4311, "step": 551 }, { "epoch": 0.09447201779907581, "grad_norm": 26.01420021057129, "learning_rate": 3.1203650884198516e-06, "loss": 9.3426, "step": 552 }, { "epoch": 0.09464316275885676, "grad_norm": 18.335588455200195, "learning_rate": 3.1260695949800345e-06, "loss": 8.9575, "step": 553 }, { "epoch": 0.09481430771863769, "grad_norm": 21.414621353149414, "learning_rate": 3.1317741015402166e-06, "loss": 7.744, "step": 554 }, { "epoch": 0.09498545267841862, "grad_norm": 15.28297233581543, "learning_rate": 3.137478608100399e-06, "loss": 8.0683, "step": 555 }, { 
"epoch": 0.09515659763819956, "grad_norm": 20.182992935180664, "learning_rate": 3.143183114660582e-06, "loss": 7.5161, "step": 556 }, { "epoch": 0.09532774259798049, "grad_norm": 22.94892120361328, "learning_rate": 3.1488876212207642e-06, "loss": 7.728, "step": 557 }, { "epoch": 0.09549888755776142, "grad_norm": 16.93927764892578, "learning_rate": 3.1545921277809468e-06, "loss": 7.7731, "step": 558 }, { "epoch": 0.09567003251754236, "grad_norm": 21.27535629272461, "learning_rate": 3.1602966343411297e-06, "loss": 8.7629, "step": 559 }, { "epoch": 0.09584117747732329, "grad_norm": 20.056377410888672, "learning_rate": 3.166001140901312e-06, "loss": 7.3943, "step": 560 }, { "epoch": 0.09601232243710422, "grad_norm": 37.84750747680664, "learning_rate": 3.1717056474614948e-06, "loss": 10.5613, "step": 561 }, { "epoch": 0.09618346739688516, "grad_norm": 19.577177047729492, "learning_rate": 3.1774101540216773e-06, "loss": 7.6761, "step": 562 }, { "epoch": 0.09635461235666609, "grad_norm": 22.209712982177734, "learning_rate": 3.18311466058186e-06, "loss": 6.9584, "step": 563 }, { "epoch": 0.09652575731644703, "grad_norm": 25.258302688598633, "learning_rate": 3.1888191671420424e-06, "loss": 8.0565, "step": 564 }, { "epoch": 0.09669690227622796, "grad_norm": 15.993329048156738, "learning_rate": 3.194523673702225e-06, "loss": 4.2519, "step": 565 }, { "epoch": 0.0968680472360089, "grad_norm": 18.609046936035156, "learning_rate": 3.2002281802624074e-06, "loss": 8.0901, "step": 566 }, { "epoch": 0.09703919219578984, "grad_norm": 39.24065017700195, "learning_rate": 3.20593268682259e-06, "loss": 4.2716, "step": 567 }, { "epoch": 0.09721033715557077, "grad_norm": 158.3350067138672, "learning_rate": 3.2116371933827725e-06, "loss": 9.5332, "step": 568 }, { "epoch": 0.0973814821153517, "grad_norm": 59.29450607299805, "learning_rate": 3.217341699942955e-06, "loss": 10.343, "step": 569 }, { "epoch": 0.09755262707513264, "grad_norm": 16.113664627075195, "learning_rate": 
3.2230462065031376e-06, "loss": 3.4944, "step": 570 }, { "epoch": 0.09772377203491357, "grad_norm": 23.105350494384766, "learning_rate": 3.22875071306332e-06, "loss": 8.2848, "step": 571 }, { "epoch": 0.0978949169946945, "grad_norm": 21.425796508789062, "learning_rate": 3.2344552196235026e-06, "loss": 8.7655, "step": 572 }, { "epoch": 0.09806606195447544, "grad_norm": 22.587278366088867, "learning_rate": 3.240159726183685e-06, "loss": 5.394, "step": 573 }, { "epoch": 0.09823720691425637, "grad_norm": 37.69017028808594, "learning_rate": 3.2458642327438677e-06, "loss": 9.9902, "step": 574 }, { "epoch": 0.0984083518740373, "grad_norm": 25.255393981933594, "learning_rate": 3.2515687393040502e-06, "loss": 8.3951, "step": 575 }, { "epoch": 0.09857949683381824, "grad_norm": 18.790040969848633, "learning_rate": 3.2572732458642328e-06, "loss": 8.2647, "step": 576 }, { "epoch": 0.09875064179359917, "grad_norm": 18.215757369995117, "learning_rate": 3.2629777524244153e-06, "loss": 7.8027, "step": 577 }, { "epoch": 0.09892178675338012, "grad_norm": 17.263294219970703, "learning_rate": 3.268682258984598e-06, "loss": 7.7728, "step": 578 }, { "epoch": 0.09909293171316105, "grad_norm": 18.384496688842773, "learning_rate": 3.2743867655447804e-06, "loss": 7.9191, "step": 579 }, { "epoch": 0.09926407667294199, "grad_norm": 32.68930435180664, "learning_rate": 3.280091272104963e-06, "loss": 7.8591, "step": 580 }, { "epoch": 0.09943522163272292, "grad_norm": 31.514266967773438, "learning_rate": 3.285795778665146e-06, "loss": 5.5621, "step": 581 }, { "epoch": 0.09960636659250385, "grad_norm": 18.912736892700195, "learning_rate": 3.291500285225328e-06, "loss": 7.6952, "step": 582 }, { "epoch": 0.09977751155228479, "grad_norm": 37.68309783935547, "learning_rate": 3.2972047917855105e-06, "loss": 4.1392, "step": 583 }, { "epoch": 0.09994865651206572, "grad_norm": 31.56082534790039, "learning_rate": 3.3029092983456934e-06, "loss": 4.3801, "step": 584 }, { "epoch": 0.10011980147184665, 
"grad_norm": 24.57911491394043, "learning_rate": 3.3086138049058755e-06, "loss": 8.6316, "step": 585 }, { "epoch": 0.10029094643162759, "grad_norm": 23.80208396911621, "learning_rate": 3.314318311466058e-06, "loss": 7.9077, "step": 586 }, { "epoch": 0.10046209139140852, "grad_norm": 16.849803924560547, "learning_rate": 3.320022818026241e-06, "loss": 7.6992, "step": 587 }, { "epoch": 0.10063323635118945, "grad_norm": 18.981300354003906, "learning_rate": 3.3257273245864236e-06, "loss": 8.6023, "step": 588 }, { "epoch": 0.10080438131097039, "grad_norm": 24.398378372192383, "learning_rate": 3.3314318311466057e-06, "loss": 7.0319, "step": 589 }, { "epoch": 0.10097552627075132, "grad_norm": 14.639533996582031, "learning_rate": 3.3371363377067886e-06, "loss": 4.7698, "step": 590 }, { "epoch": 0.10114667123053227, "grad_norm": 25.046255111694336, "learning_rate": 3.342840844266971e-06, "loss": 7.1782, "step": 591 }, { "epoch": 0.1013178161903132, "grad_norm": 20.012542724609375, "learning_rate": 3.3485453508271533e-06, "loss": 7.3167, "step": 592 }, { "epoch": 0.10148896115009413, "grad_norm": 27.766891479492188, "learning_rate": 3.3542498573873362e-06, "loss": 4.6521, "step": 593 }, { "epoch": 0.10166010610987507, "grad_norm": 30.79694175720215, "learning_rate": 3.3599543639475188e-06, "loss": 5.1196, "step": 594 }, { "epoch": 0.101831251069656, "grad_norm": 24.9854736328125, "learning_rate": 3.365658870507701e-06, "loss": 7.8056, "step": 595 }, { "epoch": 0.10200239602943693, "grad_norm": 30.63117218017578, "learning_rate": 3.371363377067884e-06, "loss": 4.7903, "step": 596 }, { "epoch": 0.10217354098921787, "grad_norm": 34.852256774902344, "learning_rate": 3.3770678836280663e-06, "loss": 4.1994, "step": 597 }, { "epoch": 0.1023446859489988, "grad_norm": 26.979557037353516, "learning_rate": 3.3827723901882485e-06, "loss": 8.5955, "step": 598 }, { "epoch": 0.10251583090877973, "grad_norm": 21.797626495361328, "learning_rate": 3.3884768967484314e-06, "loss": 7.5743, 
"step": 599 }, { "epoch": 0.10268697586856067, "grad_norm": 37.774139404296875, "learning_rate": 3.394181403308614e-06, "loss": 6.7935, "step": 600 }, { "epoch": 0.1028581208283416, "grad_norm": 27.917823791503906, "learning_rate": 3.3998859098687965e-06, "loss": 7.9018, "step": 601 }, { "epoch": 0.10302926578812253, "grad_norm": 28.479934692382812, "learning_rate": 3.405590416428979e-06, "loss": 4.566, "step": 602 }, { "epoch": 0.10320041074790347, "grad_norm": 33.35675811767578, "learning_rate": 3.4112949229891615e-06, "loss": 4.1986, "step": 603 }, { "epoch": 0.10337155570768441, "grad_norm": 17.17736053466797, "learning_rate": 3.416999429549344e-06, "loss": 4.4831, "step": 604 }, { "epoch": 0.10354270066746535, "grad_norm": 33.52507781982422, "learning_rate": 3.4227039361095266e-06, "loss": 5.2033, "step": 605 }, { "epoch": 0.10371384562724628, "grad_norm": 38.001678466796875, "learning_rate": 3.428408442669709e-06, "loss": 4.2796, "step": 606 }, { "epoch": 0.10388499058702722, "grad_norm": 27.487375259399414, "learning_rate": 3.4341129492298917e-06, "loss": 4.3677, "step": 607 }, { "epoch": 0.10405613554680815, "grad_norm": 43.33926010131836, "learning_rate": 3.439817455790074e-06, "loss": 8.1044, "step": 608 }, { "epoch": 0.10422728050658908, "grad_norm": 19.231143951416016, "learning_rate": 3.4455219623502567e-06, "loss": 3.8195, "step": 609 }, { "epoch": 0.10439842546637002, "grad_norm": 51.54021453857422, "learning_rate": 3.4512264689104393e-06, "loss": 8.4844, "step": 610 }, { "epoch": 0.10456957042615095, "grad_norm": 18.752532958984375, "learning_rate": 3.456930975470622e-06, "loss": 4.7959, "step": 611 }, { "epoch": 0.10474071538593188, "grad_norm": 31.644916534423828, "learning_rate": 3.4626354820308043e-06, "loss": 3.5838, "step": 612 }, { "epoch": 0.10491186034571282, "grad_norm": 29.887203216552734, "learning_rate": 3.4683399885909873e-06, "loss": 8.1693, "step": 613 }, { "epoch": 0.10508300530549375, "grad_norm": 26.583890914916992, 
"learning_rate": 3.4740444951511694e-06, "loss": 6.6237, "step": 614 }, { "epoch": 0.10525415026527468, "grad_norm": 30.845338821411133, "learning_rate": 3.479749001711352e-06, "loss": 3.4824, "step": 615 }, { "epoch": 0.10542529522505562, "grad_norm": 38.40910339355469, "learning_rate": 3.485453508271535e-06, "loss": 3.9889, "step": 616 }, { "epoch": 0.10559644018483656, "grad_norm": 39.16193389892578, "learning_rate": 3.491158014831717e-06, "loss": 8.4244, "step": 617 }, { "epoch": 0.1057675851446175, "grad_norm": 17.97920036315918, "learning_rate": 3.4968625213918995e-06, "loss": 3.2418, "step": 618 }, { "epoch": 0.10593873010439843, "grad_norm": 176.26966857910156, "learning_rate": 3.5025670279520825e-06, "loss": 7.2639, "step": 619 }, { "epoch": 0.10610987506417936, "grad_norm": 31.25491714477539, "learning_rate": 3.5082715345122646e-06, "loss": 7.428, "step": 620 }, { "epoch": 0.1062810200239603, "grad_norm": 291.8013916015625, "learning_rate": 3.5139760410724475e-06, "loss": 12.9756, "step": 621 }, { "epoch": 0.10645216498374123, "grad_norm": 34.713497161865234, "learning_rate": 3.51968054763263e-06, "loss": 4.2338, "step": 622 }, { "epoch": 0.10662330994352216, "grad_norm": 30.151296615600586, "learning_rate": 3.525385054192812e-06, "loss": 8.7988, "step": 623 }, { "epoch": 0.1067944549033031, "grad_norm": 36.128414154052734, "learning_rate": 3.531089560752995e-06, "loss": 8.0574, "step": 624 }, { "epoch": 0.10696559986308403, "grad_norm": 15.85501480102539, "learning_rate": 3.5367940673131777e-06, "loss": 3.4627, "step": 625 }, { "epoch": 0.10713674482286496, "grad_norm": 296.1280212402344, "learning_rate": 3.5424985738733598e-06, "loss": 11.8753, "step": 626 }, { "epoch": 0.1073078897826459, "grad_norm": 30.480113983154297, "learning_rate": 3.5482030804335427e-06, "loss": 6.966, "step": 627 }, { "epoch": 0.10747903474242683, "grad_norm": 34.42314529418945, "learning_rate": 3.5539075869937253e-06, "loss": 6.8004, "step": 628 }, { "epoch": 
0.10765017970220778, "grad_norm": 239.69007873535156, "learning_rate": 3.5596120935539074e-06, "loss": 10.727, "step": 629 }, { "epoch": 0.10782132466198871, "grad_norm": 42.74559783935547, "learning_rate": 3.5653166001140903e-06, "loss": 7.948, "step": 630 }, { "epoch": 0.10799246962176964, "grad_norm": 24.176240921020508, "learning_rate": 3.571021106674273e-06, "loss": 5.4348, "step": 631 }, { "epoch": 0.10816361458155058, "grad_norm": 32.64130783081055, "learning_rate": 3.576725613234455e-06, "loss": 8.4, "step": 632 }, { "epoch": 0.10833475954133151, "grad_norm": 32.354248046875, "learning_rate": 3.582430119794638e-06, "loss": 6.4397, "step": 633 }, { "epoch": 0.10850590450111244, "grad_norm": 25.767475128173828, "learning_rate": 3.5881346263548204e-06, "loss": 5.7136, "step": 634 }, { "epoch": 0.10867704946089338, "grad_norm": 28.90591812133789, "learning_rate": 3.593839132915003e-06, "loss": 5.9131, "step": 635 }, { "epoch": 0.10884819442067431, "grad_norm": 32.62278747558594, "learning_rate": 3.5995436394751855e-06, "loss": 7.6547, "step": 636 }, { "epoch": 0.10901933938045524, "grad_norm": 30.387760162353516, "learning_rate": 3.605248146035368e-06, "loss": 4.7063, "step": 637 }, { "epoch": 0.10919048434023618, "grad_norm": 33.034420013427734, "learning_rate": 3.6109526525955506e-06, "loss": 6.8851, "step": 638 }, { "epoch": 0.10936162930001711, "grad_norm": 31.42691421508789, "learning_rate": 3.616657159155733e-06, "loss": 4.1348, "step": 639 }, { "epoch": 0.10953277425979804, "grad_norm": 32.439395904541016, "learning_rate": 3.6223616657159156e-06, "loss": 6.3162, "step": 640 }, { "epoch": 0.10970391921957898, "grad_norm": 26.49324607849121, "learning_rate": 3.6280661722760986e-06, "loss": 5.1818, "step": 641 }, { "epoch": 0.10987506417935992, "grad_norm": 25.558427810668945, "learning_rate": 3.6337706788362807e-06, "loss": 6.7631, "step": 642 }, { "epoch": 0.11004620913914086, "grad_norm": 24.655729293823242, "learning_rate": 3.6394751853964632e-06, 
"loss": 7.4925, "step": 643 }, { "epoch": 0.11021735409892179, "grad_norm": 28.129770278930664, "learning_rate": 3.645179691956646e-06, "loss": 7.4969, "step": 644 }, { "epoch": 0.11038849905870272, "grad_norm": 14.367050170898438, "learning_rate": 3.6508841985168283e-06, "loss": 2.9942, "step": 645 }, { "epoch": 0.11055964401848366, "grad_norm": 17.681976318359375, "learning_rate": 3.656588705077011e-06, "loss": 3.4156, "step": 646 }, { "epoch": 0.11073078897826459, "grad_norm": 16.25703239440918, "learning_rate": 3.6622932116371938e-06, "loss": 4.095, "step": 647 }, { "epoch": 0.11090193393804552, "grad_norm": 26.604623794555664, "learning_rate": 3.667997718197376e-06, "loss": 7.7521, "step": 648 }, { "epoch": 0.11107307889782646, "grad_norm": 24.250492095947266, "learning_rate": 3.6737022247575584e-06, "loss": 5.5501, "step": 649 }, { "epoch": 0.11124422385760739, "grad_norm": 31.94316864013672, "learning_rate": 3.6794067313177414e-06, "loss": 6.24, "step": 650 }, { "epoch": 0.11141536881738832, "grad_norm": 18.14836883544922, "learning_rate": 3.6851112378779235e-06, "loss": 3.024, "step": 651 }, { "epoch": 0.11158651377716926, "grad_norm": 25.239274978637695, "learning_rate": 3.690815744438106e-06, "loss": 7.4699, "step": 652 }, { "epoch": 0.11175765873695019, "grad_norm": 66.97354125976562, "learning_rate": 3.696520250998289e-06, "loss": 11.3565, "step": 653 }, { "epoch": 0.11192880369673112, "grad_norm": 30.029356002807617, "learning_rate": 3.702224757558471e-06, "loss": 7.0383, "step": 654 }, { "epoch": 0.11209994865651207, "grad_norm": 22.021820068359375, "learning_rate": 3.7079292641186536e-06, "loss": 5.1655, "step": 655 }, { "epoch": 0.112271093616293, "grad_norm": 40.79402160644531, "learning_rate": 3.7136337706788366e-06, "loss": 7.3738, "step": 656 }, { "epoch": 0.11244223857607394, "grad_norm": 49.726810455322266, "learning_rate": 3.7193382772390187e-06, "loss": 10.1751, "step": 657 }, { "epoch": 0.11261338353585487, "grad_norm": 34.322078704833984, 
"learning_rate": 3.725042783799201e-06, "loss": 8.127, "step": 658 }, { "epoch": 0.1127845284956358, "grad_norm": 31.094890594482422, "learning_rate": 3.730747290359384e-06, "loss": 7.2578, "step": 659 }, { "epoch": 0.11295567345541674, "grad_norm": 17.61489486694336, "learning_rate": 3.7364517969195667e-06, "loss": 2.9985, "step": 660 }, { "epoch": 0.11312681841519767, "grad_norm": 31.467206954956055, "learning_rate": 3.7421563034797492e-06, "loss": 3.7049, "step": 661 }, { "epoch": 0.1132979633749786, "grad_norm": 24.94162368774414, "learning_rate": 3.7478608100399318e-06, "loss": 5.2144, "step": 662 }, { "epoch": 0.11346910833475954, "grad_norm": 61.16570281982422, "learning_rate": 3.7535653166001143e-06, "loss": 10.6124, "step": 663 }, { "epoch": 0.11364025329454047, "grad_norm": 30.18357276916504, "learning_rate": 3.7592698231602964e-06, "loss": 7.2241, "step": 664 }, { "epoch": 0.1138113982543214, "grad_norm": 40.68777847290039, "learning_rate": 3.764974329720479e-06, "loss": 7.604, "step": 665 }, { "epoch": 0.11398254321410234, "grad_norm": 24.30128288269043, "learning_rate": 3.7706788362806623e-06, "loss": 4.6842, "step": 666 }, { "epoch": 0.11415368817388329, "grad_norm": 33.77325439453125, "learning_rate": 3.7763833428408444e-06, "loss": 7.3903, "step": 667 }, { "epoch": 0.11432483313366422, "grad_norm": 30.10031509399414, "learning_rate": 3.782087849401027e-06, "loss": 7.0533, "step": 668 }, { "epoch": 0.11449597809344515, "grad_norm": 34.8586540222168, "learning_rate": 3.7877923559612095e-06, "loss": 7.5631, "step": 669 }, { "epoch": 0.11466712305322609, "grad_norm": 33.20988082885742, "learning_rate": 3.7934968625213916e-06, "loss": 3.7029, "step": 670 }, { "epoch": 0.11483826801300702, "grad_norm": 31.075176239013672, "learning_rate": 3.799201369081575e-06, "loss": 8.2182, "step": 671 }, { "epoch": 0.11500941297278795, "grad_norm": 30.962139129638672, "learning_rate": 3.8049058756417575e-06, "loss": 6.5288, "step": 672 }, { "epoch": 
0.11518055793256889, "grad_norm": 37.01807403564453, "learning_rate": 3.8106103822019396e-06, "loss": 7.8449, "step": 673 }, { "epoch": 0.11535170289234982, "grad_norm": 35.002742767333984, "learning_rate": 3.816314888762122e-06, "loss": 6.4509, "step": 674 }, { "epoch": 0.11552284785213075, "grad_norm": 51.06761169433594, "learning_rate": 3.822019395322305e-06, "loss": 10.6236, "step": 675 }, { "epoch": 0.11569399281191169, "grad_norm": 37.48448181152344, "learning_rate": 3.827723901882487e-06, "loss": 6.7785, "step": 676 }, { "epoch": 0.11586513777169262, "grad_norm": 35.638832092285156, "learning_rate": 3.8334284084426706e-06, "loss": 7.6172, "step": 677 }, { "epoch": 0.11603628273147355, "grad_norm": 35.00564956665039, "learning_rate": 3.839132915002852e-06, "loss": 7.1866, "step": 678 }, { "epoch": 0.11620742769125449, "grad_norm": 31.42662811279297, "learning_rate": 3.844837421563035e-06, "loss": 3.2075, "step": 679 }, { "epoch": 0.11637857265103543, "grad_norm": 16.111412048339844, "learning_rate": 3.850541928123217e-06, "loss": 4.0132, "step": 680 }, { "epoch": 0.11654971761081637, "grad_norm": 29.7305850982666, "learning_rate": 3.8562464346834e-06, "loss": 7.1253, "step": 681 }, { "epoch": 0.1167208625705973, "grad_norm": 28.033987045288086, "learning_rate": 3.861950941243582e-06, "loss": 3.1947, "step": 682 }, { "epoch": 0.11689200753037823, "grad_norm": 31.460405349731445, "learning_rate": 3.867655447803766e-06, "loss": 7.1427, "step": 683 }, { "epoch": 0.11706315249015917, "grad_norm": 269.6858825683594, "learning_rate": 3.8733599543639474e-06, "loss": 11.9947, "step": 684 }, { "epoch": 0.1172342974499401, "grad_norm": 35.384727478027344, "learning_rate": 3.87906446092413e-06, "loss": 6.8334, "step": 685 }, { "epoch": 0.11740544240972103, "grad_norm": 25.98334312438965, "learning_rate": 3.8847689674843125e-06, "loss": 7.2384, "step": 686 }, { "epoch": 0.11757658736950197, "grad_norm": 33.84842300415039, "learning_rate": 3.890473474044495e-06, "loss": 
5.9906, "step": 687 }, { "epoch": 0.1177477323292829, "grad_norm": 41.04487609863281, "learning_rate": 3.8961779806046776e-06, "loss": 6.2759, "step": 688 }, { "epoch": 0.11791887728906383, "grad_norm": 16.468915939331055, "learning_rate": 3.901882487164861e-06, "loss": 3.8545, "step": 689 }, { "epoch": 0.11809002224884477, "grad_norm": 27.10782241821289, "learning_rate": 3.907586993725043e-06, "loss": 7.596, "step": 690 }, { "epoch": 0.1182611672086257, "grad_norm": 25.684919357299805, "learning_rate": 3.913291500285225e-06, "loss": 4.2235, "step": 691 }, { "epoch": 0.11843231216840663, "grad_norm": 27.2288761138916, "learning_rate": 3.918996006845408e-06, "loss": 6.9975, "step": 692 }, { "epoch": 0.11860345712818758, "grad_norm": 33.26142120361328, "learning_rate": 3.92470051340559e-06, "loss": 6.5592, "step": 693 }, { "epoch": 0.11877460208796851, "grad_norm": 38.89694595336914, "learning_rate": 3.930405019965774e-06, "loss": 7.2757, "step": 694 }, { "epoch": 0.11894574704774945, "grad_norm": 27.99795150756836, "learning_rate": 3.936109526525956e-06, "loss": 6.7422, "step": 695 }, { "epoch": 0.11911689200753038, "grad_norm": 24.109289169311523, "learning_rate": 3.941814033086138e-06, "loss": 4.9175, "step": 696 }, { "epoch": 0.11928803696731131, "grad_norm": 15.462040901184082, "learning_rate": 3.94751853964632e-06, "loss": 2.7684, "step": 697 }, { "epoch": 0.11945918192709225, "grad_norm": 39.17838668823242, "learning_rate": 3.953223046206503e-06, "loss": 7.1518, "step": 698 }, { "epoch": 0.11963032688687318, "grad_norm": 30.83951759338379, "learning_rate": 3.958927552766685e-06, "loss": 3.8832, "step": 699 }, { "epoch": 0.11980147184665411, "grad_norm": 26.964744567871094, "learning_rate": 3.964632059326869e-06, "loss": 7.224, "step": 700 }, { "epoch": 0.11997261680643505, "grad_norm": 36.607975006103516, "learning_rate": 3.970336565887051e-06, "loss": 7.3389, "step": 701 }, { "epoch": 0.12014376176621598, "grad_norm": 37.18532180786133, "learning_rate": 
3.976041072447234e-06, "loss": 6.1083, "step": 702 }, { "epoch": 0.12031490672599691, "grad_norm": 29.550649642944336, "learning_rate": 3.9817455790074155e-06, "loss": 5.1898, "step": 703 }, { "epoch": 0.12048605168577785, "grad_norm": 24.146198272705078, "learning_rate": 3.987450085567598e-06, "loss": 4.6196, "step": 704 }, { "epoch": 0.1206571966455588, "grad_norm": 25.126737594604492, "learning_rate": 3.993154592127781e-06, "loss": 2.7422, "step": 705 }, { "epoch": 0.12082834160533973, "grad_norm": 18.79334259033203, "learning_rate": 3.998859098687964e-06, "loss": 2.6716, "step": 706 }, { "epoch": 0.12099948656512066, "grad_norm": 33.249168395996094, "learning_rate": 4.0045636052481465e-06, "loss": 5.3053, "step": 707 }, { "epoch": 0.1211706315249016, "grad_norm": 26.934682846069336, "learning_rate": 4.010268111808329e-06, "loss": 4.514, "step": 708 }, { "epoch": 0.12134177648468253, "grad_norm": 44.88846206665039, "learning_rate": 4.015972618368511e-06, "loss": 6.4733, "step": 709 }, { "epoch": 0.12151292144446346, "grad_norm": 41.93711471557617, "learning_rate": 4.021677124928693e-06, "loss": 7.4558, "step": 710 }, { "epoch": 0.1216840664042444, "grad_norm": 41.59209060668945, "learning_rate": 4.027381631488877e-06, "loss": 7.0372, "step": 711 }, { "epoch": 0.12185521136402533, "grad_norm": 41.47358703613281, "learning_rate": 4.033086138049059e-06, "loss": 7.2868, "step": 712 }, { "epoch": 0.12202635632380626, "grad_norm": 41.380741119384766, "learning_rate": 4.038790644609242e-06, "loss": 7.6225, "step": 713 }, { "epoch": 0.1221975012835872, "grad_norm": 40.343788146972656, "learning_rate": 4.044495151169424e-06, "loss": 7.2916, "step": 714 }, { "epoch": 0.12236864624336813, "grad_norm": 30.69339370727539, "learning_rate": 4.050199657729606e-06, "loss": 5.8809, "step": 715 }, { "epoch": 0.12253979120314906, "grad_norm": 25.84669303894043, "learning_rate": 4.0559041642897885e-06, "loss": 2.8596, "step": 716 }, { "epoch": 0.12271093616293, "grad_norm": 
37.5709114074707, "learning_rate": 4.061608670849972e-06, "loss": 6.5536, "step": 717 }, { "epoch": 0.12288208112271094, "grad_norm": 44.87430191040039, "learning_rate": 4.067313177410154e-06, "loss": 7.8952, "step": 718 }, { "epoch": 0.12305322608249188, "grad_norm": 29.630413055419922, "learning_rate": 4.073017683970337e-06, "loss": 7.2852, "step": 719 }, { "epoch": 0.12322437104227281, "grad_norm": 38.17768096923828, "learning_rate": 4.0787221905305194e-06, "loss": 7.2025, "step": 720 }, { "epoch": 0.12339551600205374, "grad_norm": 31.9378719329834, "learning_rate": 4.084426697090702e-06, "loss": 3.3231, "step": 721 }, { "epoch": 0.12356666096183468, "grad_norm": 15.323390007019043, "learning_rate": 4.090131203650884e-06, "loss": 3.5441, "step": 722 }, { "epoch": 0.12373780592161561, "grad_norm": 32.09744644165039, "learning_rate": 4.095835710211067e-06, "loss": 5.9737, "step": 723 }, { "epoch": 0.12390895088139654, "grad_norm": 32.49777603149414, "learning_rate": 4.1015402167712496e-06, "loss": 5.6116, "step": 724 }, { "epoch": 0.12408009584117748, "grad_norm": 32.568031311035156, "learning_rate": 4.107244723331432e-06, "loss": 6.8512, "step": 725 }, { "epoch": 0.12425124080095841, "grad_norm": 27.68449592590332, "learning_rate": 4.112949229891615e-06, "loss": 3.8807, "step": 726 }, { "epoch": 0.12442238576073934, "grad_norm": 28.595746994018555, "learning_rate": 4.118653736451797e-06, "loss": 2.7513, "step": 727 }, { "epoch": 0.12459353072052028, "grad_norm": 40.44917678833008, "learning_rate": 4.124358243011979e-06, "loss": 6.9441, "step": 728 }, { "epoch": 0.12476467568030121, "grad_norm": 34.75537872314453, "learning_rate": 4.130062749572162e-06, "loss": 6.2836, "step": 729 }, { "epoch": 0.12493582064008214, "grad_norm": 32.49576950073242, "learning_rate": 4.135767256132345e-06, "loss": 3.6985, "step": 730 }, { "epoch": 0.1251069655998631, "grad_norm": 33.09941482543945, "learning_rate": 4.141471762692527e-06, "loss": 6.414, "step": 731 }, { "epoch": 
0.12527811055964402, "grad_norm": 33.988101959228516, "learning_rate": 4.14717626925271e-06, "loss": 6.8846, "step": 732 }, { "epoch": 0.12544925551942496, "grad_norm": 34.69337844848633, "learning_rate": 4.152880775812892e-06, "loss": 5.9908, "step": 733 }, { "epoch": 0.1256204004792059, "grad_norm": 42.33815383911133, "learning_rate": 4.158585282373075e-06, "loss": 7.2186, "step": 734 }, { "epoch": 0.12579154543898682, "grad_norm": 21.35869598388672, "learning_rate": 4.164289788933257e-06, "loss": 3.2239, "step": 735 }, { "epoch": 0.12596269039876776, "grad_norm": 34.62517166137695, "learning_rate": 4.16999429549344e-06, "loss": 6.2926, "step": 736 }, { "epoch": 0.1261338353585487, "grad_norm": 32.758544921875, "learning_rate": 4.1756988020536225e-06, "loss": 6.2203, "step": 737 }, { "epoch": 0.12630498031832962, "grad_norm": 17.39285659790039, "learning_rate": 4.181403308613805e-06, "loss": 3.2563, "step": 738 }, { "epoch": 0.12647612527811056, "grad_norm": 32.22175598144531, "learning_rate": 4.1871078151739875e-06, "loss": 5.3068, "step": 739 }, { "epoch": 0.1266472702378915, "grad_norm": 38.13700485229492, "learning_rate": 4.19281232173417e-06, "loss": 7.6128, "step": 740 }, { "epoch": 0.12681841519767242, "grad_norm": 35.74038314819336, "learning_rate": 4.198516828294353e-06, "loss": 6.6528, "step": 741 }, { "epoch": 0.12698956015745336, "grad_norm": 12.027849197387695, "learning_rate": 4.204221334854535e-06, "loss": 2.255, "step": 742 }, { "epoch": 0.1271607051172343, "grad_norm": 36.75061798095703, "learning_rate": 4.209925841414718e-06, "loss": 6.2444, "step": 743 }, { "epoch": 0.12733185007701522, "grad_norm": 43.853187561035156, "learning_rate": 4.2156303479749e-06, "loss": 6.3509, "step": 744 }, { "epoch": 0.12750299503679616, "grad_norm": 31.670143127441406, "learning_rate": 4.221334854535083e-06, "loss": 7.3242, "step": 745 }, { "epoch": 0.1276741399965771, "grad_norm": 24.049455642700195, "learning_rate": 4.227039361095265e-06, "loss": 4.5557, 
"step": 746 }, { "epoch": 0.12784528495635802, "grad_norm": 22.603431701660156, "learning_rate": 4.232743867655448e-06, "loss": 4.3686, "step": 747 }, { "epoch": 0.12801642991613896, "grad_norm": 33.28196716308594, "learning_rate": 4.23844837421563e-06, "loss": 7.897, "step": 748 }, { "epoch": 0.1281875748759199, "grad_norm": 14.154582023620605, "learning_rate": 4.244152880775813e-06, "loss": 2.5184, "step": 749 }, { "epoch": 0.12835871983570085, "grad_norm": 34.31758117675781, "learning_rate": 4.249857387335995e-06, "loss": 6.4663, "step": 750 }, { "epoch": 0.12852986479548179, "grad_norm": 29.4487361907959, "learning_rate": 4.255561893896179e-06, "loss": 6.4908, "step": 751 }, { "epoch": 0.12870100975526272, "grad_norm": 26.144145965576172, "learning_rate": 4.261266400456361e-06, "loss": 2.7209, "step": 752 }, { "epoch": 0.12887215471504365, "grad_norm": 32.20002746582031, "learning_rate": 4.266970907016543e-06, "loss": 5.3474, "step": 753 }, { "epoch": 0.12904329967482459, "grad_norm": 22.889114379882812, "learning_rate": 4.2726754135767255e-06, "loss": 4.439, "step": 754 }, { "epoch": 0.12921444463460552, "grad_norm": 29.033794403076172, "learning_rate": 4.278379920136908e-06, "loss": 3.1768, "step": 755 }, { "epoch": 0.12938558959438645, "grad_norm": 36.977718353271484, "learning_rate": 4.2840844266970906e-06, "loss": 6.725, "step": 756 }, { "epoch": 0.12955673455416739, "grad_norm": 24.76682472229004, "learning_rate": 4.289788933257274e-06, "loss": 2.3437, "step": 757 }, { "epoch": 0.12972787951394832, "grad_norm": 16.016826629638672, "learning_rate": 4.2954934398174565e-06, "loss": 2.8201, "step": 758 }, { "epoch": 0.12989902447372925, "grad_norm": 14.587915420532227, "learning_rate": 4.301197946377638e-06, "loss": 3.5146, "step": 759 }, { "epoch": 0.13007016943351019, "grad_norm": 26.081321716308594, "learning_rate": 4.306902452937821e-06, "loss": 2.6582, "step": 760 }, { "epoch": 0.13024131439329112, "grad_norm": 16.497404098510742, "learning_rate": 
4.312606959498003e-06, "loss": 2.6039, "step": 761 }, { "epoch": 0.13041245935307205, "grad_norm": 30.642013549804688, "learning_rate": 4.318311466058186e-06, "loss": 5.6791, "step": 762 }, { "epoch": 0.13058360431285299, "grad_norm": 78.80982971191406, "learning_rate": 4.324015972618369e-06, "loss": 7.0474, "step": 763 }, { "epoch": 0.13075474927263392, "grad_norm": 31.678878784179688, "learning_rate": 4.329720479178552e-06, "loss": 7.2185, "step": 764 }, { "epoch": 0.13092589423241485, "grad_norm": 67.14193725585938, "learning_rate": 4.335424985738733e-06, "loss": 11.1752, "step": 765 }, { "epoch": 0.13109703919219579, "grad_norm": 30.7507381439209, "learning_rate": 4.341129492298916e-06, "loss": 5.3445, "step": 766 }, { "epoch": 0.13126818415197672, "grad_norm": 295.94195556640625, "learning_rate": 4.346833998859098e-06, "loss": 14.8463, "step": 767 }, { "epoch": 0.13143932911175765, "grad_norm": 31.96709442138672, "learning_rate": 4.352538505419281e-06, "loss": 5.9573, "step": 768 }, { "epoch": 0.13161047407153859, "grad_norm": 21.086137771606445, "learning_rate": 4.358243011979464e-06, "loss": 2.51, "step": 769 }, { "epoch": 0.13178161903131952, "grad_norm": 23.69211196899414, "learning_rate": 4.363947518539647e-06, "loss": 3.9384, "step": 770 }, { "epoch": 0.13195276399110045, "grad_norm": 29.09503173828125, "learning_rate": 4.369652025099829e-06, "loss": 2.6477, "step": 771 }, { "epoch": 0.13212390895088139, "grad_norm": 34.086483001708984, "learning_rate": 4.375356531660011e-06, "loss": 6.8362, "step": 772 }, { "epoch": 0.13229505391066232, "grad_norm": 22.358131408691406, "learning_rate": 4.381061038220194e-06, "loss": 2.5553, "step": 773 }, { "epoch": 0.13246619887044325, "grad_norm": 32.83020782470703, "learning_rate": 4.386765544780377e-06, "loss": 5.6526, "step": 774 }, { "epoch": 0.1326373438302242, "grad_norm": 32.111629486083984, "learning_rate": 4.3924700513405595e-06, "loss": 7.0077, "step": 775 }, { "epoch": 0.13280848879000515, "grad_norm": 
28.587032318115234, "learning_rate": 4.398174557900742e-06, "loss": 3.9449, "step": 776 }, { "epoch": 0.13297963374978608, "grad_norm": 28.547178268432617, "learning_rate": 4.403879064460925e-06, "loss": 5.1286, "step": 777 }, { "epoch": 0.133150778709567, "grad_norm": 31.409543991088867, "learning_rate": 4.409583571021106e-06, "loss": 5.3343, "step": 778 }, { "epoch": 0.13332192366934795, "grad_norm": 33.33236312866211, "learning_rate": 4.415288077581289e-06, "loss": 6.5061, "step": 779 }, { "epoch": 0.13349306862912888, "grad_norm": 226.51580810546875, "learning_rate": 4.420992584141472e-06, "loss": 13.5725, "step": 780 }, { "epoch": 0.1336642135889098, "grad_norm": 29.707599639892578, "learning_rate": 4.426697090701655e-06, "loss": 3.5054, "step": 781 }, { "epoch": 0.13383535854869075, "grad_norm": 29.84592628479004, "learning_rate": 4.432401597261837e-06, "loss": 5.5461, "step": 782 }, { "epoch": 0.13400650350847168, "grad_norm": 23.87710189819336, "learning_rate": 4.43810610382202e-06, "loss": 2.7581, "step": 783 }, { "epoch": 0.1341776484682526, "grad_norm": 27.90047264099121, "learning_rate": 4.4438106103822015e-06, "loss": 5.0602, "step": 784 }, { "epoch": 0.13434879342803355, "grad_norm": 229.59202575683594, "learning_rate": 4.449515116942384e-06, "loss": 8.4299, "step": 785 }, { "epoch": 0.13451993838781448, "grad_norm": 35.904483795166016, "learning_rate": 4.455219623502567e-06, "loss": 6.6261, "step": 786 }, { "epoch": 0.13469108334759541, "grad_norm": 13.451172828674316, "learning_rate": 4.46092413006275e-06, "loss": 3.6844, "step": 787 }, { "epoch": 0.13486222830737635, "grad_norm": 220.5408172607422, "learning_rate": 4.4666286366229324e-06, "loss": 12.0786, "step": 788 }, { "epoch": 0.13503337326715728, "grad_norm": 30.378768920898438, "learning_rate": 4.472333143183115e-06, "loss": 6.4301, "step": 789 }, { "epoch": 0.13520451822693821, "grad_norm": 24.894784927368164, "learning_rate": 4.478037649743297e-06, "loss": 4.0456, "step": 790 }, { "epoch": 
0.13537566318671915, "grad_norm": 63.11225509643555, "learning_rate": 4.48374215630348e-06, "loss": 10.8823, "step": 791 }, { "epoch": 0.13554680814650008, "grad_norm": 30.484046936035156, "learning_rate": 4.4894466628636626e-06, "loss": 4.5215, "step": 792 }, { "epoch": 0.13571795310628101, "grad_norm": 33.15967559814453, "learning_rate": 4.495151169423845e-06, "loss": 6.1466, "step": 793 }, { "epoch": 0.13588909806606195, "grad_norm": 31.415679931640625, "learning_rate": 4.500855675984028e-06, "loss": 5.0997, "step": 794 }, { "epoch": 0.13606024302584288, "grad_norm": 29.878276824951172, "learning_rate": 4.50656018254421e-06, "loss": 6.9375, "step": 795 }, { "epoch": 0.13623138798562381, "grad_norm": 33.10092544555664, "learning_rate": 4.512264689104393e-06, "loss": 6.2231, "step": 796 }, { "epoch": 0.13640253294540475, "grad_norm": 21.412826538085938, "learning_rate": 4.517969195664575e-06, "loss": 1.8474, "step": 797 }, { "epoch": 0.13657367790518568, "grad_norm": 31.297100067138672, "learning_rate": 4.523673702224758e-06, "loss": 5.4762, "step": 798 }, { "epoch": 0.13674482286496661, "grad_norm": 234.58111572265625, "learning_rate": 4.52937820878494e-06, "loss": 11.739, "step": 799 }, { "epoch": 0.13691596782474755, "grad_norm": 204.88748168945312, "learning_rate": 4.535082715345123e-06, "loss": 13.5482, "step": 800 }, { "epoch": 0.1370871127845285, "grad_norm": 33.66855239868164, "learning_rate": 4.540787221905305e-06, "loss": 5.9551, "step": 801 }, { "epoch": 0.13725825774430944, "grad_norm": 30.423555374145508, "learning_rate": 4.546491728465488e-06, "loss": 6.3931, "step": 802 }, { "epoch": 0.13742940270409038, "grad_norm": 30.737445831298828, "learning_rate": 4.55219623502567e-06, "loss": 4.7871, "step": 803 }, { "epoch": 0.1376005476638713, "grad_norm": 95.86985778808594, "learning_rate": 4.557900741585853e-06, "loss": 6.8129, "step": 804 }, { "epoch": 0.13777169262365224, "grad_norm": 36.5138053894043, "learning_rate": 4.5636052481460355e-06, "loss": 
6.4333, "step": 805 }, { "epoch": 0.13794283758343318, "grad_norm": 31.310596466064453, "learning_rate": 4.569309754706218e-06, "loss": 6.1982, "step": 806 }, { "epoch": 0.1381139825432141, "grad_norm": 32.40011978149414, "learning_rate": 4.5750142612664005e-06, "loss": 6.5281, "step": 807 }, { "epoch": 0.13828512750299504, "grad_norm": 33.58089828491211, "learning_rate": 4.580718767826583e-06, "loss": 5.0059, "step": 808 }, { "epoch": 0.13845627246277598, "grad_norm": 46.53955841064453, "learning_rate": 4.586423274386766e-06, "loss": 10.3345, "step": 809 }, { "epoch": 0.1386274174225569, "grad_norm": 23.006080627441406, "learning_rate": 4.592127780946948e-06, "loss": 2.1468, "step": 810 }, { "epoch": 0.13879856238233784, "grad_norm": 21.113685607910156, "learning_rate": 4.597832287507131e-06, "loss": 2.0972, "step": 811 }, { "epoch": 0.13896970734211878, "grad_norm": 29.228193283081055, "learning_rate": 4.603536794067313e-06, "loss": 2.9408, "step": 812 }, { "epoch": 0.1391408523018997, "grad_norm": 39.542686462402344, "learning_rate": 4.609241300627496e-06, "loss": 6.4624, "step": 813 }, { "epoch": 0.13931199726168064, "grad_norm": 42.17389678955078, "learning_rate": 4.614945807187679e-06, "loss": 7.4244, "step": 814 }, { "epoch": 0.13948314222146158, "grad_norm": 31.26105308532715, "learning_rate": 4.620650313747861e-06, "loss": 6.5606, "step": 815 }, { "epoch": 0.1396542871812425, "grad_norm": 40.22693634033203, "learning_rate": 4.626354820308043e-06, "loss": 6.2725, "step": 816 }, { "epoch": 0.13982543214102344, "grad_norm": 25.14350700378418, "learning_rate": 4.632059326868226e-06, "loss": 4.0754, "step": 817 }, { "epoch": 0.13999657710080438, "grad_norm": 23.578937530517578, "learning_rate": 4.637763833428408e-06, "loss": 4.1309, "step": 818 }, { "epoch": 0.1401677220605853, "grad_norm": 37.57481002807617, "learning_rate": 4.643468339988591e-06, "loss": 5.8135, "step": 819 }, { "epoch": 0.14033886702036624, "grad_norm": 35.21710205078125, "learning_rate": 
4.649172846548774e-06, "loss": 6.6982, "step": 820 }, { "epoch": 0.14051001198014718, "grad_norm": 14.915112495422363, "learning_rate": 4.654877353108957e-06, "loss": 2.1068, "step": 821 }, { "epoch": 0.1406811569399281, "grad_norm": 27.366252899169922, "learning_rate": 4.6605818596691385e-06, "loss": 3.2475, "step": 822 }, { "epoch": 0.14085230189970904, "grad_norm": 36.40489196777344, "learning_rate": 4.666286366229321e-06, "loss": 6.7448, "step": 823 }, { "epoch": 0.14102344685948998, "grad_norm": 37.40996551513672, "learning_rate": 4.6719908727895036e-06, "loss": 6.8328, "step": 824 }, { "epoch": 0.1411945918192709, "grad_norm": 255.09320068359375, "learning_rate": 4.677695379349686e-06, "loss": 12.0992, "step": 825 }, { "epoch": 0.14136573677905187, "grad_norm": 41.39365768432617, "learning_rate": 4.6833998859098695e-06, "loss": 6.1908, "step": 826 }, { "epoch": 0.1415368817388328, "grad_norm": 14.086997032165527, "learning_rate": 4.689104392470052e-06, "loss": 3.3856, "step": 827 }, { "epoch": 0.14170802669861374, "grad_norm": 33.170352935791016, "learning_rate": 4.694808899030234e-06, "loss": 6.9479, "step": 828 }, { "epoch": 0.14187917165839467, "grad_norm": 37.625064849853516, "learning_rate": 4.700513405590416e-06, "loss": 7.6713, "step": 829 }, { "epoch": 0.1420503166181756, "grad_norm": 25.476303100585938, "learning_rate": 4.706217912150599e-06, "loss": 4.2481, "step": 830 }, { "epoch": 0.14222146157795654, "grad_norm": 27.399072647094727, "learning_rate": 4.711922418710781e-06, "loss": 5.508, "step": 831 }, { "epoch": 0.14239260653773747, "grad_norm": 31.020893096923828, "learning_rate": 4.717626925270965e-06, "loss": 5.8831, "step": 832 }, { "epoch": 0.1425637514975184, "grad_norm": 26.108135223388672, "learning_rate": 4.723331431831147e-06, "loss": 3.5932, "step": 833 }, { "epoch": 0.14273489645729934, "grad_norm": 35.8662109375, "learning_rate": 4.729035938391329e-06, "loss": 5.1499, "step": 834 }, { "epoch": 0.14290604141708027, "grad_norm": 
34.714324951171875, "learning_rate": 4.734740444951511e-06, "loss": 5.9969, "step": 835 }, { "epoch": 0.1430771863768612, "grad_norm": 34.023067474365234, "learning_rate": 4.740444951511694e-06, "loss": 6.4575, "step": 836 }, { "epoch": 0.14324833133664214, "grad_norm": 17.601118087768555, "learning_rate": 4.746149458071877e-06, "loss": 2.5208, "step": 837 }, { "epoch": 0.14341947629642307, "grad_norm": 19.672815322875977, "learning_rate": 4.75185396463206e-06, "loss": 2.1216, "step": 838 }, { "epoch": 0.143590621256204, "grad_norm": 25.771137237548828, "learning_rate": 4.757558471192242e-06, "loss": 2.6146, "step": 839 }, { "epoch": 0.14376176621598494, "grad_norm": 32.17550277709961, "learning_rate": 4.763262977752424e-06, "loss": 5.8516, "step": 840 }, { "epoch": 0.14393291117576587, "grad_norm": 72.34523010253906, "learning_rate": 4.768967484312607e-06, "loss": 11.0212, "step": 841 }, { "epoch": 0.1441040561355468, "grad_norm": 22.756717681884766, "learning_rate": 4.774671990872789e-06, "loss": 2.3166, "step": 842 }, { "epoch": 0.14427520109532774, "grad_norm": 22.13291358947754, "learning_rate": 4.7803764974329725e-06, "loss": 2.2079, "step": 843 }, { "epoch": 0.14444634605510867, "grad_norm": 65.32748413085938, "learning_rate": 4.786081003993155e-06, "loss": 6.3309, "step": 844 }, { "epoch": 0.1446174910148896, "grad_norm": 242.9714813232422, "learning_rate": 4.791785510553338e-06, "loss": 11.9379, "step": 845 }, { "epoch": 0.14478863597467054, "grad_norm": 21.737802505493164, "learning_rate": 4.79749001711352e-06, "loss": 3.806, "step": 846 }, { "epoch": 0.14495978093445147, "grad_norm": 29.438758850097656, "learning_rate": 4.803194523673702e-06, "loss": 5.7729, "step": 847 }, { "epoch": 0.1451309258942324, "grad_norm": 25.701087951660156, "learning_rate": 4.808899030233884e-06, "loss": 2.8536, "step": 848 }, { "epoch": 0.14530207085401334, "grad_norm": 130.01524353027344, "learning_rate": 4.814603536794068e-06, "loss": 7.5391, "step": 849 }, { "epoch": 
0.14547321581379427, "grad_norm": 30.284828186035156, "learning_rate": 4.82030804335425e-06, "loss": 3.806, "step": 850 }, { "epoch": 0.1456443607735752, "grad_norm": 23.351642608642578, "learning_rate": 4.826012549914433e-06, "loss": 4.2263, "step": 851 }, { "epoch": 0.14581550573335617, "grad_norm": 216.2431182861328, "learning_rate": 4.831717056474615e-06, "loss": 9.159, "step": 852 }, { "epoch": 0.1459866506931371, "grad_norm": 35.071754455566406, "learning_rate": 4.837421563034797e-06, "loss": 6.503, "step": 853 }, { "epoch": 0.14615779565291803, "grad_norm": 34.0211296081543, "learning_rate": 4.84312606959498e-06, "loss": 6.4636, "step": 854 }, { "epoch": 0.14632894061269897, "grad_norm": 17.20896339416504, "learning_rate": 4.848830576155163e-06, "loss": 2.7218, "step": 855 }, { "epoch": 0.1465000855724799, "grad_norm": 136.72647094726562, "learning_rate": 4.8545350827153454e-06, "loss": 7.7082, "step": 856 }, { "epoch": 0.14667123053226083, "grad_norm": 53.50956344604492, "learning_rate": 4.860239589275528e-06, "loss": 10.0171, "step": 857 }, { "epoch": 0.14684237549204177, "grad_norm": 21.030473709106445, "learning_rate": 4.8659440958357105e-06, "loss": 4.1916, "step": 858 }, { "epoch": 0.1470135204518227, "grad_norm": 34.38727569580078, "learning_rate": 4.871648602395892e-06, "loss": 5.969, "step": 859 }, { "epoch": 0.14718466541160363, "grad_norm": 22.703882217407227, "learning_rate": 4.8773531089560756e-06, "loss": 2.4073, "step": 860 }, { "epoch": 0.14735581037138457, "grad_norm": 33.388858795166016, "learning_rate": 4.883057615516258e-06, "loss": 5.7571, "step": 861 }, { "epoch": 0.1475269553311655, "grad_norm": 35.79853820800781, "learning_rate": 4.888762122076441e-06, "loss": 5.9363, "step": 862 }, { "epoch": 0.14769810029094643, "grad_norm": 20.656721115112305, "learning_rate": 4.894466628636623e-06, "loss": 2.0406, "step": 863 }, { "epoch": 0.14786924525072737, "grad_norm": 35.20976638793945, "learning_rate": 4.900171135196806e-06, "loss": 5.8613, 
"step": 864 }, { "epoch": 0.1480403902105083, "grad_norm": 22.342880249023438, "learning_rate": 4.905875641756987e-06, "loss": 4.0119, "step": 865 }, { "epoch": 0.14821153517028923, "grad_norm": 33.253292083740234, "learning_rate": 4.911580148317171e-06, "loss": 4.62, "step": 866 }, { "epoch": 0.14838268013007017, "grad_norm": 186.65093994140625, "learning_rate": 4.917284654877353e-06, "loss": 11.2662, "step": 867 }, { "epoch": 0.1485538250898511, "grad_norm": 15.842426300048828, "learning_rate": 4.922989161437536e-06, "loss": 2.0607, "step": 868 }, { "epoch": 0.14872497004963203, "grad_norm": 26.70699119567871, "learning_rate": 4.928693667997718e-06, "loss": 3.1737, "step": 869 }, { "epoch": 0.14889611500941297, "grad_norm": 33.37158966064453, "learning_rate": 4.934398174557901e-06, "loss": 4.7352, "step": 870 }, { "epoch": 0.1490672599691939, "grad_norm": 26.4490966796875, "learning_rate": 4.940102681118083e-06, "loss": 4.2178, "step": 871 }, { "epoch": 0.14923840492897483, "grad_norm": 33.25678634643555, "learning_rate": 4.945807187678266e-06, "loss": 5.0764, "step": 872 }, { "epoch": 0.14940954988875577, "grad_norm": 38.204769134521484, "learning_rate": 4.9515116942384485e-06, "loss": 5.8078, "step": 873 }, { "epoch": 0.1495806948485367, "grad_norm": 27.79875946044922, "learning_rate": 4.957216200798631e-06, "loss": 5.6432, "step": 874 }, { "epoch": 0.14975183980831763, "grad_norm": 32.442115783691406, "learning_rate": 4.9629207073588135e-06, "loss": 5.7378, "step": 875 }, { "epoch": 0.14992298476809857, "grad_norm": 57.06877517700195, "learning_rate": 4.968625213918996e-06, "loss": 10.3136, "step": 876 }, { "epoch": 0.15009412972787953, "grad_norm": 32.131187438964844, "learning_rate": 4.9743297204791794e-06, "loss": 4.6921, "step": 877 }, { "epoch": 0.15009412972787953, "eval_nli-pairs_loss": 5.535374164581299, "eval_nli-pairs_runtime": 4.3709, "eval_nli-pairs_samples_per_second": 45.757, "eval_nli-pairs_steps_per_second": 1.601, 
"eval_sts-test_pearson_cosine": 0.6147169012893178, "eval_sts-test_pearson_dot": 0.4334302941897573, "eval_sts-test_pearson_euclidean": 0.6082490673246602, "eval_sts-test_pearson_manhattan": 0.616700428941834, "eval_sts-test_pearson_max": 0.616700428941834, "eval_sts-test_spearman_cosine": 0.5972327557562241, "eval_sts-test_spearman_dot": 0.41946207508864325, "eval_sts-test_spearman_euclidean": 0.5959187544369754, "eval_sts-test_spearman_manhattan": 0.6029031731511296, "eval_sts-test_spearman_max": 0.6029031731511296, "step": 877 }, { "epoch": 0.15009412972787953, "eval_vitaminc-pairs_loss": 3.619838237762451, "eval_vitaminc-pairs_runtime": 2.7372, "eval_vitaminc-pairs_samples_per_second": 73.068, "eval_vitaminc-pairs_steps_per_second": 2.557, "step": 877 }, { "epoch": 0.15009412972787953, "eval_qnli-contrastive_loss": 12.3779878616333, "eval_qnli-contrastive_runtime": 0.6382, "eval_qnli-contrastive_samples_per_second": 313.373, "eval_qnli-contrastive_steps_per_second": 10.968, "step": 877 }, { "epoch": 0.15009412972787953, "eval_scitail-pairs-qa_loss": 1.6706750392913818, "eval_scitail-pairs-qa_runtime": 1.6279, "eval_scitail-pairs-qa_samples_per_second": 122.855, "eval_scitail-pairs-qa_steps_per_second": 4.3, "step": 877 }, { "epoch": 0.15009412972787953, "eval_scitail-pairs-pos_loss": 3.0242857933044434, "eval_scitail-pairs-pos_runtime": 2.6188, "eval_scitail-pairs-pos_samples_per_second": 76.369, "eval_scitail-pairs-pos_steps_per_second": 2.673, "step": 877 }, { "epoch": 0.15009412972787953, "eval_xsum-pairs_loss": 3.0581634044647217, "eval_xsum-pairs_runtime": 2.6458, "eval_xsum-pairs_samples_per_second": 66.142, "eval_xsum-pairs_steps_per_second": 2.268, "step": 877 }, { "epoch": 0.15009412972787953, "eval_compression-pairs_loss": 1.9685934782028198, "eval_compression-pairs_runtime": 0.5084, "eval_compression-pairs_samples_per_second": 393.398, "eval_compression-pairs_steps_per_second": 13.769, "step": 877 }, { "epoch": 0.15009412972787953, 
"eval_sciq_pairs_loss": 6.824851989746094, "eval_sciq_pairs_runtime": 9.1685, "eval_sciq_pairs_samples_per_second": 21.814, "eval_sciq_pairs_steps_per_second": 0.763, "step": 877 }, { "epoch": 0.15009412972787953, "eval_qasc_pairs_loss": 10.253314018249512, "eval_qasc_pairs_runtime": 2.6538, "eval_qasc_pairs_samples_per_second": 75.363, "eval_qasc_pairs_steps_per_second": 2.638, "step": 877 }, { "epoch": 0.15009412972787953, "eval_openbookqa_pairs_loss": 5.933743953704834, "eval_openbookqa_pairs_runtime": 0.6418, "eval_openbookqa_pairs_samples_per_second": 107.513, "eval_openbookqa_pairs_steps_per_second": 4.674, "step": 877 }, { "epoch": 0.15009412972787953, "eval_msmarco_pairs_loss": 5.185385704040527, "eval_msmarco_pairs_runtime": 3.9947, "eval_msmarco_pairs_samples_per_second": 50.067, "eval_msmarco_pairs_steps_per_second": 1.752, "step": 877 }, { "epoch": 0.15009412972787953, "eval_nq_pairs_loss": 6.44993782043457, "eval_nq_pairs_runtime": 8.638, "eval_nq_pairs_samples_per_second": 23.153, "eval_nq_pairs_steps_per_second": 0.81, "step": 877 }, { "epoch": 0.15009412972787953, "eval_trivia_pairs_loss": 6.129721641540527, "eval_trivia_pairs_runtime": 12.8296, "eval_trivia_pairs_samples_per_second": 15.589, "eval_trivia_pairs_steps_per_second": 0.546, "step": 877 }, { "epoch": 0.15009412972787953, "eval_quora_pairs_loss": 1.7218067646026611, "eval_quora_pairs_runtime": 1.5931, "eval_quora_pairs_samples_per_second": 125.544, "eval_quora_pairs_steps_per_second": 4.394, "step": 877 }, { "epoch": 0.15009412972787953, "eval_gooaq_pairs_loss": 4.168159008026123, "eval_gooaq_pairs_runtime": 2.6679, "eval_gooaq_pairs_samples_per_second": 74.966, "eval_gooaq_pairs_steps_per_second": 2.624, "step": 877 }, { "epoch": 0.15026527468766046, "grad_norm": 29.085119247436523, "learning_rate": 4.980034227039361e-06, "loss": 5.8249, "step": 878 }, { "epoch": 0.1504364196474414, "grad_norm": 35.45232009887695, "learning_rate": 4.985738733599544e-06, "loss": 6.378, "step": 879 }, { 
"epoch": 0.15060756460722233, "grad_norm": 34.018470764160156, "learning_rate": 4.991443240159726e-06, "loss": 5.326, "step": 880 }, { "epoch": 0.15077870956700326, "grad_norm": 22.30814552307129, "learning_rate": 4.997147746719909e-06, "loss": 2.6674, "step": 881 }, { "epoch": 0.1509498545267842, "grad_norm": 36.679046630859375, "learning_rate": 5.002852253280091e-06, "loss": 6.6655, "step": 882 }, { "epoch": 0.15112099948656513, "grad_norm": 36.78900146484375, "learning_rate": 5.008556759840275e-06, "loss": 4.5851, "step": 883 }, { "epoch": 0.15129214444634606, "grad_norm": 46.770057678222656, "learning_rate": 5.014261266400456e-06, "loss": 9.9308, "step": 884 }, { "epoch": 0.151463289406127, "grad_norm": 27.262338638305664, "learning_rate": 5.019965772960639e-06, "loss": 2.2515, "step": 885 }, { "epoch": 0.15163443436590793, "grad_norm": 193.24122619628906, "learning_rate": 5.025670279520821e-06, "loss": 10.7631, "step": 886 }, { "epoch": 0.15180557932568886, "grad_norm": 30.53336524963379, "learning_rate": 5.031374786081004e-06, "loss": 3.9297, "step": 887 }, { "epoch": 0.1519767242854698, "grad_norm": 13.035544395446777, "learning_rate": 5.0370792926411864e-06, "loss": 3.16, "step": 888 }, { "epoch": 0.15214786924525073, "grad_norm": 27.65202522277832, "learning_rate": 5.04278379920137e-06, "loss": 3.1012, "step": 889 }, { "epoch": 0.15231901420503166, "grad_norm": 28.412954330444336, "learning_rate": 5.0484883057615515e-06, "loss": 2.4251, "step": 890 }, { "epoch": 0.1524901591648126, "grad_norm": 35.567386627197266, "learning_rate": 5.054192812321734e-06, "loss": 5.1793, "step": 891 }, { "epoch": 0.15266130412459353, "grad_norm": 31.945302963256836, "learning_rate": 5.0598973188819166e-06, "loss": 4.9138, "step": 892 }, { "epoch": 0.15283244908437446, "grad_norm": 30.31682014465332, "learning_rate": 5.065601825442099e-06, "loss": 4.8582, "step": 893 }, { "epoch": 0.1530035940441554, "grad_norm": 22.3225040435791, "learning_rate": 5.0713063320022825e-06, 
"loss": 2.003, "step": 894 }, { "epoch": 0.15317473900393633, "grad_norm": 23.375139236450195, "learning_rate": 5.077010838562465e-06, "loss": 2.3547, "step": 895 }, { "epoch": 0.15334588396371726, "grad_norm": 32.41263198852539, "learning_rate": 5.0827153451226475e-06, "loss": 6.2287, "step": 896 }, { "epoch": 0.1535170289234982, "grad_norm": 20.43022346496582, "learning_rate": 5.088419851682829e-06, "loss": 2.1189, "step": 897 }, { "epoch": 0.15368817388327913, "grad_norm": 37.203250885009766, "learning_rate": 5.094124358243012e-06, "loss": 6.3629, "step": 898 }, { "epoch": 0.15385931884306006, "grad_norm": 19.725624084472656, "learning_rate": 5.099828864803194e-06, "loss": 2.2277, "step": 899 }, { "epoch": 0.154030463802841, "grad_norm": 27.29782485961914, "learning_rate": 5.105533371363378e-06, "loss": 2.8851, "step": 900 }, { "epoch": 0.15420160876262193, "grad_norm": 172.8111572265625, "learning_rate": 5.11123787792356e-06, "loss": 9.9783, "step": 901 }, { "epoch": 0.1543727537224029, "grad_norm": 56.5546875, "learning_rate": 5.116942384483743e-06, "loss": 10.3301, "step": 902 }, { "epoch": 0.15454389868218382, "grad_norm": 32.12007522583008, "learning_rate": 5.122646891043924e-06, "loss": 3.3146, "step": 903 }, { "epoch": 0.15471504364196476, "grad_norm": 197.39170837402344, "learning_rate": 5.128351397604107e-06, "loss": 11.016, "step": 904 }, { "epoch": 0.1548861886017457, "grad_norm": 36.48847579956055, "learning_rate": 5.1340559041642895e-06, "loss": 4.8215, "step": 905 }, { "epoch": 0.15505733356152662, "grad_norm": 31.014644622802734, "learning_rate": 5.139760410724473e-06, "loss": 4.7237, "step": 906 }, { "epoch": 0.15522847852130756, "grad_norm": 31.436952590942383, "learning_rate": 5.145464917284655e-06, "loss": 4.6175, "step": 907 }, { "epoch": 0.1553996234810885, "grad_norm": 27.38591194152832, "learning_rate": 5.151169423844838e-06, "loss": 4.0958, "step": 908 }, { "epoch": 0.15557076844086942, "grad_norm": 31.732324600219727, "learning_rate": 
5.15687393040502e-06, "loss": 4.4682, "step": 909 }, { "epoch": 0.15574191340065036, "grad_norm": 15.360635757446289, "learning_rate": 5.162578436965202e-06, "loss": 2.4148, "step": 910 }, { "epoch": 0.1559130583604313, "grad_norm": 172.3378448486328, "learning_rate": 5.168282943525385e-06, "loss": 9.8466, "step": 911 }, { "epoch": 0.15608420332021222, "grad_norm": 31.59737777709961, "learning_rate": 5.173987450085568e-06, "loss": 6.1221, "step": 912 }, { "epoch": 0.15625534827999316, "grad_norm": 20.06523323059082, "learning_rate": 5.179691956645751e-06, "loss": 2.0035, "step": 913 }, { "epoch": 0.1564264932397741, "grad_norm": 25.82581329345703, "learning_rate": 5.185396463205933e-06, "loss": 4.7388, "step": 914 }, { "epoch": 0.15659763819955502, "grad_norm": 13.644715309143066, "learning_rate": 5.191100969766115e-06, "loss": 2.1442, "step": 915 }, { "epoch": 0.15676878315933596, "grad_norm": 36.4990119934082, "learning_rate": 5.196805476326297e-06, "loss": 6.2552, "step": 916 }, { "epoch": 0.1569399281191169, "grad_norm": 35.6190185546875, "learning_rate": 5.202509982886481e-06, "loss": 6.3529, "step": 917 }, { "epoch": 0.15711107307889782, "grad_norm": 13.495047569274902, "learning_rate": 5.208214489446663e-06, "loss": 3.5731, "step": 918 }, { "epoch": 0.15728221803867876, "grad_norm": 236.7681121826172, "learning_rate": 5.213918996006846e-06, "loss": 10.5726, "step": 919 }, { "epoch": 0.1574533629984597, "grad_norm": 34.39946746826172, "learning_rate": 5.219623502567028e-06, "loss": 6.0673, "step": 920 }, { "epoch": 0.15762450795824062, "grad_norm": 12.590995788574219, "learning_rate": 5.225328009127211e-06, "loss": 2.77, "step": 921 }, { "epoch": 0.15779565291802156, "grad_norm": 31.968891143798828, "learning_rate": 5.2310325156873925e-06, "loss": 4.1677, "step": 922 }, { "epoch": 0.1579667978778025, "grad_norm": 31.067489624023438, "learning_rate": 5.236737022247576e-06, "loss": 4.716, "step": 923 }, { "epoch": 0.15813794283758342, "grad_norm": 
36.08390808105469, "learning_rate": 5.2424415288077584e-06, "loss": 6.528, "step": 924 }, { "epoch": 0.15830908779736436, "grad_norm": 34.2723274230957, "learning_rate": 5.248146035367941e-06, "loss": 6.4655, "step": 925 }, { "epoch": 0.1584802327571453, "grad_norm": 43.43145751953125, "learning_rate": 5.2538505419281235e-06, "loss": 5.6795, "step": 926 }, { "epoch": 0.15865137771692622, "grad_norm": 32.78499221801758, "learning_rate": 5.259555048488306e-06, "loss": 5.6396, "step": 927 }, { "epoch": 0.15882252267670718, "grad_norm": 35.156925201416016, "learning_rate": 5.265259555048488e-06, "loss": 4.7143, "step": 928 }, { "epoch": 0.15899366763648812, "grad_norm": 34.6341552734375, "learning_rate": 5.270964061608671e-06, "loss": 5.6931, "step": 929 }, { "epoch": 0.15916481259626905, "grad_norm": 35.668331146240234, "learning_rate": 5.276668568168854e-06, "loss": 5.6404, "step": 930 }, { "epoch": 0.15933595755604998, "grad_norm": 34.62514877319336, "learning_rate": 5.282373074729036e-06, "loss": 5.0469, "step": 931 }, { "epoch": 0.15950710251583092, "grad_norm": 37.79499435424805, "learning_rate": 5.288077581289219e-06, "loss": 5.3761, "step": 932 }, { "epoch": 0.15967824747561185, "grad_norm": 40.4017333984375, "learning_rate": 5.293782087849401e-06, "loss": 5.6738, "step": 933 }, { "epoch": 0.15984939243539278, "grad_norm": 35.31856155395508, "learning_rate": 5.299486594409584e-06, "loss": 6.4936, "step": 934 }, { "epoch": 0.16002053739517372, "grad_norm": 126.11963653564453, "learning_rate": 5.305191100969766e-06, "loss": 9.9326, "step": 935 }, { "epoch": 0.16019168235495465, "grad_norm": 34.740753173828125, "learning_rate": 5.310895607529949e-06, "loss": 2.0987, "step": 936 }, { "epoch": 0.16036282731473558, "grad_norm": 34.9671745300293, "learning_rate": 5.316600114090131e-06, "loss": 6.2338, "step": 937 }, { "epoch": 0.16053397227451652, "grad_norm": 21.198925018310547, "learning_rate": 5.322304620650314e-06, "loss": 3.5463, "step": 938 }, { "epoch": 
0.16070511723429745, "grad_norm": 30.98229217529297, "learning_rate": 5.328009127210496e-06, "loss": 4.7342, "step": 939 }, { "epoch": 0.16087626219407838, "grad_norm": 41.88993835449219, "learning_rate": 5.333713633770679e-06, "loss": 6.5058, "step": 940 }, { "epoch": 0.16104740715385932, "grad_norm": 24.218576431274414, "learning_rate": 5.3394181403308615e-06, "loss": 2.0172, "step": 941 }, { "epoch": 0.16121855211364025, "grad_norm": 32.891719818115234, "learning_rate": 5.345122646891044e-06, "loss": 5.893, "step": 942 }, { "epoch": 0.16138969707342118, "grad_norm": 38.93867874145508, "learning_rate": 5.3508271534512265e-06, "loss": 5.8157, "step": 943 }, { "epoch": 0.16156084203320212, "grad_norm": 31.02938461303711, "learning_rate": 5.356531660011409e-06, "loss": 5.529, "step": 944 }, { "epoch": 0.16173198699298305, "grad_norm": 36.240440368652344, "learning_rate": 5.362236166571592e-06, "loss": 4.7931, "step": 945 }, { "epoch": 0.16190313195276398, "grad_norm": 23.227556228637695, "learning_rate": 5.367940673131775e-06, "loss": 2.1265, "step": 946 }, { "epoch": 0.16207427691254492, "grad_norm": 40.07374954223633, "learning_rate": 5.373645179691957e-06, "loss": 5.8823, "step": 947 }, { "epoch": 0.16224542187232585, "grad_norm": 29.960735321044922, "learning_rate": 5.379349686252139e-06, "loss": 4.6281, "step": 948 }, { "epoch": 0.16241656683210678, "grad_norm": 173.5910186767578, "learning_rate": 5.385054192812322e-06, "loss": 10.3282, "step": 949 }, { "epoch": 0.16258771179188772, "grad_norm": 37.48442840576172, "learning_rate": 5.390758699372504e-06, "loss": 6.1584, "step": 950 }, { "epoch": 0.16275885675166865, "grad_norm": 39.48939514160156, "learning_rate": 5.396463205932687e-06, "loss": 5.655, "step": 951 }, { "epoch": 0.16293000171144958, "grad_norm": 34.57015609741211, "learning_rate": 5.40216771249287e-06, "loss": 5.4251, "step": 952 }, { "epoch": 0.16310114667123055, "grad_norm": 51.02991485595703, "learning_rate": 5.407872219053052e-06, "loss": 
10.2283, "step": 953 }, { "epoch": 0.16327229163101148, "grad_norm": 31.77302360534668, "learning_rate": 5.413576725613234e-06, "loss": 4.0174, "step": 954 }, { "epoch": 0.1634434365907924, "grad_norm": 31.242929458618164, "learning_rate": 5.419281232173417e-06, "loss": 5.5883, "step": 955 }, { "epoch": 0.16361458155057335, "grad_norm": 31.789701461791992, "learning_rate": 5.4249857387335994e-06, "loss": 4.5646, "step": 956 }, { "epoch": 0.16378572651035428, "grad_norm": 34.09980392456055, "learning_rate": 5.430690245293783e-06, "loss": 4.9872, "step": 957 }, { "epoch": 0.1639568714701352, "grad_norm": 31.57735252380371, "learning_rate": 5.436394751853965e-06, "loss": 5.158, "step": 958 }, { "epoch": 0.16412801642991615, "grad_norm": 32.941917419433594, "learning_rate": 5.442099258414147e-06, "loss": 5.4497, "step": 959 }, { "epoch": 0.16429916138969708, "grad_norm": 200.919921875, "learning_rate": 5.4478037649743296e-06, "loss": 9.7888, "step": 960 }, { "epoch": 0.164470306349478, "grad_norm": 28.78856658935547, "learning_rate": 5.453508271534512e-06, "loss": 5.0757, "step": 961 }, { "epoch": 0.16464145130925895, "grad_norm": 22.877927780151367, "learning_rate": 5.459212778094695e-06, "loss": 3.6177, "step": 962 }, { "epoch": 0.16481259626903988, "grad_norm": 24.904977798461914, "learning_rate": 5.464917284654878e-06, "loss": 4.2287, "step": 963 }, { "epoch": 0.1649837412288208, "grad_norm": 35.849124908447266, "learning_rate": 5.4706217912150605e-06, "loss": 5.1121, "step": 964 }, { "epoch": 0.16515488618860175, "grad_norm": 31.580976486206055, "learning_rate": 5.476326297775242e-06, "loss": 4.4859, "step": 965 }, { "epoch": 0.16532603114838268, "grad_norm": 30.3056697845459, "learning_rate": 5.482030804335425e-06, "loss": 4.5076, "step": 966 }, { "epoch": 0.1654971761081636, "grad_norm": 34.674468994140625, "learning_rate": 5.487735310895607e-06, "loss": 5.7789, "step": 967 }, { "epoch": 0.16566832106794455, "grad_norm": 28.0445556640625, "learning_rate": 
5.49343981745579e-06, "loss": 2.7613, "step": 968 }, { "epoch": 0.16583946602772548, "grad_norm": 33.28575134277344, "learning_rate": 5.499144324015973e-06, "loss": 5.1032, "step": 969 }, { "epoch": 0.1660106109875064, "grad_norm": 35.53700637817383, "learning_rate": 5.504848830576156e-06, "loss": 5.2129, "step": 970 }, { "epoch": 0.16618175594728735, "grad_norm": 33.2183952331543, "learning_rate": 5.510553337136338e-06, "loss": 5.6908, "step": 971 }, { "epoch": 0.16635290090706828, "grad_norm": 30.640926361083984, "learning_rate": 5.51625784369652e-06, "loss": 4.4325, "step": 972 }, { "epoch": 0.1665240458668492, "grad_norm": 24.672338485717773, "learning_rate": 5.5219623502567025e-06, "loss": 3.9552, "step": 973 }, { "epoch": 0.16669519082663015, "grad_norm": 33.66337585449219, "learning_rate": 5.527666856816886e-06, "loss": 5.4014, "step": 974 }, { "epoch": 0.16686633578641108, "grad_norm": 32.082942962646484, "learning_rate": 5.533371363377068e-06, "loss": 5.9258, "step": 975 }, { "epoch": 0.167037480746192, "grad_norm": 37.91094970703125, "learning_rate": 5.539075869937251e-06, "loss": 5.717, "step": 976 }, { "epoch": 0.16720862570597295, "grad_norm": 20.26280975341797, "learning_rate": 5.5447803764974335e-06, "loss": 2.2263, "step": 977 }, { "epoch": 0.16737977066575388, "grad_norm": 48.14308547973633, "learning_rate": 5.550484883057615e-06, "loss": 9.6938, "step": 978 }, { "epoch": 0.16755091562553484, "grad_norm": 22.81192970275879, "learning_rate": 5.556189389617798e-06, "loss": 3.7015, "step": 979 }, { "epoch": 0.16772206058531577, "grad_norm": 27.474571228027344, "learning_rate": 5.561893896177981e-06, "loss": 2.9404, "step": 980 }, { "epoch": 0.1678932055450967, "grad_norm": 25.376007080078125, "learning_rate": 5.567598402738164e-06, "loss": 2.3926, "step": 981 }, { "epoch": 0.16806435050487764, "grad_norm": 31.575468063354492, "learning_rate": 5.573302909298346e-06, "loss": 4.7349, "step": 982 }, { "epoch": 0.16823549546465857, "grad_norm": 
194.93817138671875, "learning_rate": 5.579007415858529e-06, "loss": 9.7172, "step": 983 }, { "epoch": 0.1684066404244395, "grad_norm": 31.26558494567871, "learning_rate": 5.58471192241871e-06, "loss": 3.9837, "step": 984 }, { "epoch": 0.16857778538422044, "grad_norm": 32.1373405456543, "learning_rate": 5.590416428978893e-06, "loss": 5.0026, "step": 985 }, { "epoch": 0.16874893034400137, "grad_norm": 37.07416915893555, "learning_rate": 5.596120935539076e-06, "loss": 5.8572, "step": 986 }, { "epoch": 0.1689200753037823, "grad_norm": 35.09983825683594, "learning_rate": 5.601825442099259e-06, "loss": 5.6302, "step": 987 }, { "epoch": 0.16909122026356324, "grad_norm": 46.96855926513672, "learning_rate": 5.607529948659441e-06, "loss": 9.6255, "step": 988 }, { "epoch": 0.16926236522334417, "grad_norm": 36.15262985229492, "learning_rate": 5.613234455219624e-06, "loss": 5.5484, "step": 989 }, { "epoch": 0.1694335101831251, "grad_norm": 33.642967224121094, "learning_rate": 5.6189389617798055e-06, "loss": 5.5827, "step": 990 }, { "epoch": 0.16960465514290604, "grad_norm": 27.581716537475586, "learning_rate": 5.624643468339988e-06, "loss": 2.9652, "step": 991 }, { "epoch": 0.16977580010268697, "grad_norm": 19.107044219970703, "learning_rate": 5.6303479749001714e-06, "loss": 1.7442, "step": 992 }, { "epoch": 0.1699469450624679, "grad_norm": 165.6937255859375, "learning_rate": 5.636052481460354e-06, "loss": 10.2439, "step": 993 }, { "epoch": 0.17011809002224884, "grad_norm": 171.38658142089844, "learning_rate": 5.6417569880205365e-06, "loss": 10.7544, "step": 994 }, { "epoch": 0.17028923498202977, "grad_norm": 29.20503807067871, "learning_rate": 5.647461494580719e-06, "loss": 4.176, "step": 995 }, { "epoch": 0.1704603799418107, "grad_norm": 29.09612274169922, "learning_rate": 5.6531660011409016e-06, "loss": 4.1945, "step": 996 }, { "epoch": 0.17063152490159164, "grad_norm": 39.78682327270508, "learning_rate": 5.658870507701084e-06, "loss": 6.4205, "step": 997 }, { "epoch": 
0.17080266986137257, "grad_norm": 13.687639236450195, "learning_rate": 5.664575014261267e-06, "loss": 3.468, "step": 998 }, { "epoch": 0.1709738148211535, "grad_norm": 41.89799118041992, "learning_rate": 5.670279520821449e-06, "loss": 7.13, "step": 999 }, { "epoch": 0.17114495978093444, "grad_norm": 22.78835678100586, "learning_rate": 5.675984027381632e-06, "loss": 2.7249, "step": 1000 }, { "epoch": 0.17131610474071537, "grad_norm": 26.538780212402344, "learning_rate": 5.681688533941814e-06, "loss": 3.2385, "step": 1001 }, { "epoch": 0.1714872497004963, "grad_norm": 24.171205520629883, "learning_rate": 5.687393040501997e-06, "loss": 3.7183, "step": 1002 }, { "epoch": 0.17165839466027724, "grad_norm": 35.46499252319336, "learning_rate": 5.693097547062179e-06, "loss": 5.4996, "step": 1003 }, { "epoch": 0.1718295396200582, "grad_norm": 15.119646072387695, "learning_rate": 5.698802053622362e-06, "loss": 2.4476, "step": 1004 }, { "epoch": 0.17200068457983914, "grad_norm": 43.560546875, "learning_rate": 5.704506560182544e-06, "loss": 9.1856, "step": 1005 }, { "epoch": 0.17217182953962007, "grad_norm": 42.41808319091797, "learning_rate": 5.710211066742727e-06, "loss": 5.6756, "step": 1006 }, { "epoch": 0.172342974499401, "grad_norm": 34.344207763671875, "learning_rate": 5.715915573302909e-06, "loss": 5.2383, "step": 1007 }, { "epoch": 0.17251411945918194, "grad_norm": 19.511310577392578, "learning_rate": 5.721620079863092e-06, "loss": 3.3214, "step": 1008 }, { "epoch": 0.17268526441896287, "grad_norm": 33.06563949584961, "learning_rate": 5.7273245864232745e-06, "loss": 5.6944, "step": 1009 }, { "epoch": 0.1728564093787438, "grad_norm": 38.382041931152344, "learning_rate": 5.733029092983457e-06, "loss": 5.9898, "step": 1010 }, { "epoch": 0.17302755433852474, "grad_norm": 28.5861759185791, "learning_rate": 5.7387335995436395e-06, "loss": 5.2048, "step": 1011 }, { "epoch": 0.17319869929830567, "grad_norm": 31.76646614074707, "learning_rate": 5.744438106103822e-06, "loss": 
6.0811, "step": 1012 }, { "epoch": 0.1733698442580866, "grad_norm": 37.81482696533203, "learning_rate": 5.750142612664005e-06, "loss": 4.8642, "step": 1013 }, { "epoch": 0.17354098921786754, "grad_norm": 45.32394790649414, "learning_rate": 5.755847119224188e-06, "loss": 9.5803, "step": 1014 }, { "epoch": 0.17371213417764847, "grad_norm": 35.39071273803711, "learning_rate": 5.76155162578437e-06, "loss": 4.3758, "step": 1015 }, { "epoch": 0.1738832791374294, "grad_norm": 31.971323013305664, "learning_rate": 5.767256132344552e-06, "loss": 4.2616, "step": 1016 }, { "epoch": 0.17405442409721034, "grad_norm": 29.855161666870117, "learning_rate": 5.772960638904735e-06, "loss": 5.5371, "step": 1017 }, { "epoch": 0.17422556905699127, "grad_norm": 21.00974464416504, "learning_rate": 5.778665145464917e-06, "loss": 1.9809, "step": 1018 }, { "epoch": 0.1743967140167722, "grad_norm": 23.60835075378418, "learning_rate": 5.7843696520251e-06, "loss": 2.5916, "step": 1019 }, { "epoch": 0.17456785897655314, "grad_norm": 36.11520767211914, "learning_rate": 5.790074158585283e-06, "loss": 4.9198, "step": 1020 }, { "epoch": 0.17473900393633407, "grad_norm": 21.838703155517578, "learning_rate": 5.795778665145466e-06, "loss": 2.1235, "step": 1021 }, { "epoch": 0.174910148896115, "grad_norm": 28.41387367248535, "learning_rate": 5.801483171705647e-06, "loss": 5.0401, "step": 1022 }, { "epoch": 0.17508129385589594, "grad_norm": 28.482187271118164, "learning_rate": 5.80718767826583e-06, "loss": 4.7167, "step": 1023 }, { "epoch": 0.17525243881567687, "grad_norm": 33.954307556152344, "learning_rate": 5.8128921848260124e-06, "loss": 4.9666, "step": 1024 }, { "epoch": 0.1754235837754578, "grad_norm": 33.401920318603516, "learning_rate": 5.818596691386195e-06, "loss": 6.3783, "step": 1025 }, { "epoch": 0.17559472873523874, "grad_norm": 37.047691345214844, "learning_rate": 5.824301197946378e-06, "loss": 5.5925, "step": 1026 }, { "epoch": 0.17576587369501967, "grad_norm": 30.060083389282227, 
"learning_rate": 5.830005704506561e-06, "loss": 3.8415, "step": 1027 }, { "epoch": 0.1759370186548006, "grad_norm": 30.832544326782227, "learning_rate": 5.8357102110667426e-06, "loss": 4.9379, "step": 1028 }, { "epoch": 0.17610816361458156, "grad_norm": 30.651966094970703, "learning_rate": 5.841414717626925e-06, "loss": 3.9393, "step": 1029 }, { "epoch": 0.1762793085743625, "grad_norm": 12.284616470336914, "learning_rate": 5.847119224187108e-06, "loss": 2.7979, "step": 1030 }, { "epoch": 0.17645045353414343, "grad_norm": 25.138864517211914, "learning_rate": 5.85282373074729e-06, "loss": 3.6294, "step": 1031 }, { "epoch": 0.17662159849392436, "grad_norm": 19.136524200439453, "learning_rate": 5.8585282373074735e-06, "loss": 1.5926, "step": 1032 }, { "epoch": 0.1767927434537053, "grad_norm": 36.646968841552734, "learning_rate": 5.864232743867656e-06, "loss": 5.8265, "step": 1033 }, { "epoch": 0.17696388841348623, "grad_norm": 17.363170623779297, "learning_rate": 5.869937250427838e-06, "loss": 1.7465, "step": 1034 }, { "epoch": 0.17713503337326716, "grad_norm": 29.55439567565918, "learning_rate": 5.87564175698802e-06, "loss": 3.617, "step": 1035 }, { "epoch": 0.1773061783330481, "grad_norm": 203.16549682617188, "learning_rate": 5.881346263548203e-06, "loss": 7.9826, "step": 1036 }, { "epoch": 0.17747732329282903, "grad_norm": 17.790836334228516, "learning_rate": 5.887050770108386e-06, "loss": 2.1574, "step": 1037 }, { "epoch": 0.17764846825260996, "grad_norm": 40.40040969848633, "learning_rate": 5.892755276668569e-06, "loss": 5.5116, "step": 1038 }, { "epoch": 0.1778196132123909, "grad_norm": 30.316959381103516, "learning_rate": 5.898459783228751e-06, "loss": 4.4268, "step": 1039 }, { "epoch": 0.17799075817217183, "grad_norm": 34.86418151855469, "learning_rate": 5.904164289788933e-06, "loss": 4.9673, "step": 1040 }, { "epoch": 0.17816190313195276, "grad_norm": 198.34268188476562, "learning_rate": 5.9098687963491155e-06, "loss": 10.3881, "step": 1041 }, { "epoch": 
0.1783330480917337, "grad_norm": 29.608211517333984, "learning_rate": 5.915573302909298e-06, "loss": 3.9641, "step": 1042 }, { "epoch": 0.17850419305151463, "grad_norm": 28.76857566833496, "learning_rate": 5.921277809469481e-06, "loss": 4.0211, "step": 1043 }, { "epoch": 0.17867533801129556, "grad_norm": 26.37080955505371, "learning_rate": 5.926982316029664e-06, "loss": 4.6642, "step": 1044 }, { "epoch": 0.1788464829710765, "grad_norm": 32.01490020751953, "learning_rate": 5.9326868225898464e-06, "loss": 5.5217, "step": 1045 }, { "epoch": 0.17901762793085743, "grad_norm": 22.62516212463379, "learning_rate": 5.938391329150029e-06, "loss": 1.9563, "step": 1046 }, { "epoch": 0.17918877289063836, "grad_norm": 40.089229583740234, "learning_rate": 5.944095835710211e-06, "loss": 5.9567, "step": 1047 }, { "epoch": 0.1793599178504193, "grad_norm": 22.854562759399414, "learning_rate": 5.949800342270393e-06, "loss": 1.9063, "step": 1048 }, { "epoch": 0.17953106281020023, "grad_norm": 99.86076354980469, "learning_rate": 5.9555048488305766e-06, "loss": 6.6872, "step": 1049 }, { "epoch": 0.17970220776998116, "grad_norm": 42.04011154174805, "learning_rate": 5.961209355390759e-06, "loss": 6.4974, "step": 1050 }, { "epoch": 0.1798733527297621, "grad_norm": 26.85508155822754, "learning_rate": 5.966913861950942e-06, "loss": 4.3443, "step": 1051 }, { "epoch": 0.18004449768954303, "grad_norm": 29.8301944732666, "learning_rate": 5.972618368511124e-06, "loss": 5.0599, "step": 1052 }, { "epoch": 0.18021564264932396, "grad_norm": 50.89991760253906, "learning_rate": 5.978322875071306e-06, "loss": 9.764, "step": 1053 }, { "epoch": 0.1803867876091049, "grad_norm": 32.19784927368164, "learning_rate": 5.984027381631489e-06, "loss": 4.1811, "step": 1054 }, { "epoch": 0.18055793256888586, "grad_norm": 46.780487060546875, "learning_rate": 5.989731888191672e-06, "loss": 9.4505, "step": 1055 }, { "epoch": 0.1807290775286668, "grad_norm": 17.571828842163086, "learning_rate": 5.995436394751854e-06, 
"loss": 1.8957, "step": 1056 }, { "epoch": 0.18090022248844773, "grad_norm": 30.740095138549805, "learning_rate": 6.001140901312037e-06, "loss": 4.0522, "step": 1057 }, { "epoch": 0.18107136744822866, "grad_norm": 36.38762283325195, "learning_rate": 6.006845407872219e-06, "loss": 5.546, "step": 1058 }, { "epoch": 0.1812425124080096, "grad_norm": 37.66824722290039, "learning_rate": 6.012549914432401e-06, "loss": 4.7406, "step": 1059 }, { "epoch": 0.18141365736779053, "grad_norm": 33.9829216003418, "learning_rate": 6.018254420992584e-06, "loss": 4.8123, "step": 1060 }, { "epoch": 0.18158480232757146, "grad_norm": 25.99117088317871, "learning_rate": 6.023958927552767e-06, "loss": 4.6063, "step": 1061 }, { "epoch": 0.1817559472873524, "grad_norm": 29.198394775390625, "learning_rate": 6.0296634341129495e-06, "loss": 5.0514, "step": 1062 }, { "epoch": 0.18192709224713333, "grad_norm": 14.127655982971191, "learning_rate": 6.035367940673132e-06, "loss": 1.3962, "step": 1063 }, { "epoch": 0.18209823720691426, "grad_norm": 12.10257339477539, "learning_rate": 6.0410724472333145e-06, "loss": 2.0181, "step": 1064 }, { "epoch": 0.1822693821666952, "grad_norm": 19.635854721069336, "learning_rate": 6.046776953793496e-06, "loss": 1.7151, "step": 1065 }, { "epoch": 0.18244052712647613, "grad_norm": 189.35772705078125, "learning_rate": 6.05248146035368e-06, "loss": 9.8327, "step": 1066 }, { "epoch": 0.18261167208625706, "grad_norm": 34.833229064941406, "learning_rate": 6.058185966913862e-06, "loss": 5.6448, "step": 1067 }, { "epoch": 0.182782817046038, "grad_norm": 24.17336654663086, "learning_rate": 6.063890473474045e-06, "loss": 3.8977, "step": 1068 }, { "epoch": 0.18295396200581893, "grad_norm": 32.84638214111328, "learning_rate": 6.069594980034227e-06, "loss": 5.7649, "step": 1069 }, { "epoch": 0.18312510696559986, "grad_norm": 46.32835388183594, "learning_rate": 6.07529948659441e-06, "loss": 9.2569, "step": 1070 }, { "epoch": 0.1832962519253808, "grad_norm": 15.697673797607422, 
"learning_rate": 6.081003993154592e-06, "loss": 1.6445, "step": 1071 }, { "epoch": 0.18346739688516173, "grad_norm": 31.891868591308594, "learning_rate": 6.086708499714775e-06, "loss": 5.4669, "step": 1072 }, { "epoch": 0.18363854184494266, "grad_norm": 29.735248565673828, "learning_rate": 6.092413006274957e-06, "loss": 5.0552, "step": 1073 }, { "epoch": 0.1838096868047236, "grad_norm": 15.486328125, "learning_rate": 6.09811751283514e-06, "loss": 2.2292, "step": 1074 }, { "epoch": 0.18398083176450453, "grad_norm": 24.518693923950195, "learning_rate": 6.103822019395322e-06, "loss": 3.5355, "step": 1075 }, { "epoch": 0.18415197672428546, "grad_norm": 27.474645614624023, "learning_rate": 6.109526525955505e-06, "loss": 2.0704, "step": 1076 }, { "epoch": 0.1843231216840664, "grad_norm": 21.003856658935547, "learning_rate": 6.115231032515688e-06, "loss": 2.0773, "step": 1077 }, { "epoch": 0.18449426664384733, "grad_norm": 12.948555946350098, "learning_rate": 6.12093553907587e-06, "loss": 1.9105, "step": 1078 }, { "epoch": 0.18466541160362826, "grad_norm": 28.35967254638672, "learning_rate": 6.1266400456360525e-06, "loss": 5.1778, "step": 1079 }, { "epoch": 0.18483655656340922, "grad_norm": 28.59235954284668, "learning_rate": 6.132344552196235e-06, "loss": 3.9724, "step": 1080 }, { "epoch": 0.18500770152319015, "grad_norm": 32.077518463134766, "learning_rate": 6.138049058756418e-06, "loss": 4.2397, "step": 1081 }, { "epoch": 0.1851788464829711, "grad_norm": 34.8428955078125, "learning_rate": 6.1437535653166e-06, "loss": 4.3906, "step": 1082 }, { "epoch": 0.18534999144275202, "grad_norm": 36.8244743347168, "learning_rate": 6.1494580718767835e-06, "loss": 4.6433, "step": 1083 }, { "epoch": 0.18552113640253295, "grad_norm": 34.37318420410156, "learning_rate": 6.155162578436965e-06, "loss": 4.7285, "step": 1084 }, { "epoch": 0.1856922813623139, "grad_norm": 34.02301025390625, "learning_rate": 6.160867084997148e-06, "loss": 5.1995, "step": 1085 }, { "epoch": 
0.18586342632209482, "grad_norm": 15.779897689819336, "learning_rate": 6.16657159155733e-06, "loss": 1.5138, "step": 1086 }, { "epoch": 0.18603457128187575, "grad_norm": 45.183841705322266, "learning_rate": 6.172276098117513e-06, "loss": 6.6194, "step": 1087 }, { "epoch": 0.1862057162416567, "grad_norm": 15.437774658203125, "learning_rate": 6.177980604677695e-06, "loss": 1.4242, "step": 1088 }, { "epoch": 0.18637686120143762, "grad_norm": 246.0555419921875, "learning_rate": 6.183685111237879e-06, "loss": 10.7677, "step": 1089 }, { "epoch": 0.18654800616121855, "grad_norm": 8.7081937789917, "learning_rate": 6.18938961779806e-06, "loss": 2.3527, "step": 1090 }, { "epoch": 0.1867191511209995, "grad_norm": 35.0928840637207, "learning_rate": 6.195094124358243e-06, "loss": 5.4856, "step": 1091 }, { "epoch": 0.18689029608078042, "grad_norm": 36.24078369140625, "learning_rate": 6.2007986309184254e-06, "loss": 5.1105, "step": 1092 }, { "epoch": 0.18706144104056135, "grad_norm": 41.07029724121094, "learning_rate": 6.206503137478608e-06, "loss": 5.543, "step": 1093 }, { "epoch": 0.1872325860003423, "grad_norm": 36.27534484863281, "learning_rate": 6.212207644038791e-06, "loss": 4.4058, "step": 1094 }, { "epoch": 0.18740373096012322, "grad_norm": 34.61309814453125, "learning_rate": 6.217912150598974e-06, "loss": 4.9065, "step": 1095 }, { "epoch": 0.18757487591990415, "grad_norm": 36.856388092041016, "learning_rate": 6.223616657159156e-06, "loss": 4.8059, "step": 1096 }, { "epoch": 0.1877460208796851, "grad_norm": 39.40951156616211, "learning_rate": 6.229321163719338e-06, "loss": 5.8853, "step": 1097 }, { "epoch": 0.18791716583946602, "grad_norm": 30.013790130615234, "learning_rate": 6.235025670279521e-06, "loss": 4.1051, "step": 1098 }, { "epoch": 0.18808831079924695, "grad_norm": 27.43667984008789, "learning_rate": 6.240730176839703e-06, "loss": 3.661, "step": 1099 }, { "epoch": 0.1882594557590279, "grad_norm": 22.01202964782715, "learning_rate": 6.2464346833998865e-06, 
"loss": 2.0165, "step": 1100 }, { "epoch": 0.18843060071880882, "grad_norm": 23.981887817382812, "learning_rate": 6.252139189960069e-06, "loss": 1.8586, "step": 1101 }, { "epoch": 0.18860174567858976, "grad_norm": 221.93540954589844, "learning_rate": 6.257843696520252e-06, "loss": 8.0869, "step": 1102 }, { "epoch": 0.1887728906383707, "grad_norm": 32.2524299621582, "learning_rate": 6.263548203080433e-06, "loss": 4.6553, "step": 1103 }, { "epoch": 0.18894403559815162, "grad_norm": 14.555329322814941, "learning_rate": 6.269252709640616e-06, "loss": 2.0657, "step": 1104 }, { "epoch": 0.18911518055793256, "grad_norm": 27.233903884887695, "learning_rate": 6.274957216200798e-06, "loss": 3.7143, "step": 1105 }, { "epoch": 0.18928632551771352, "grad_norm": 15.294402122497559, "learning_rate": 6.280661722760982e-06, "loss": 1.4409, "step": 1106 }, { "epoch": 0.18945747047749445, "grad_norm": 223.1316375732422, "learning_rate": 6.286366229321164e-06, "loss": 9.676, "step": 1107 }, { "epoch": 0.18962861543727538, "grad_norm": 36.643463134765625, "learning_rate": 6.292070735881347e-06, "loss": 4.7202, "step": 1108 }, { "epoch": 0.18979976039705632, "grad_norm": 37.47721481323242, "learning_rate": 6.2977752424415285e-06, "loss": 4.8366, "step": 1109 }, { "epoch": 0.18997090535683725, "grad_norm": 34.74982833862305, "learning_rate": 6.303479749001711e-06, "loss": 4.6667, "step": 1110 }, { "epoch": 0.19014205031661818, "grad_norm": 38.055728912353516, "learning_rate": 6.3091842555618935e-06, "loss": 5.3396, "step": 1111 }, { "epoch": 0.19031319527639912, "grad_norm": 33.44966506958008, "learning_rate": 6.314888762122077e-06, "loss": 5.0909, "step": 1112 }, { "epoch": 0.19048434023618005, "grad_norm": 34.397132873535156, "learning_rate": 6.3205932686822594e-06, "loss": 5.3514, "step": 1113 }, { "epoch": 0.19065548519596098, "grad_norm": 39.06338119506836, "learning_rate": 6.326297775242442e-06, "loss": 6.3797, "step": 1114 }, { "epoch": 0.19082663015574192, "grad_norm": 
40.017799377441406, "learning_rate": 6.332002281802624e-06, "loss": 5.5943, "step": 1115 }, { "epoch": 0.19099777511552285, "grad_norm": 11.964347839355469, "learning_rate": 6.337706788362806e-06, "loss": 1.8095, "step": 1116 }, { "epoch": 0.19116892007530378, "grad_norm": 12.956400871276855, "learning_rate": 6.3434112949229896e-06, "loss": 1.3529, "step": 1117 }, { "epoch": 0.19134006503508472, "grad_norm": 36.93289566040039, "learning_rate": 6.349115801483172e-06, "loss": 6.0492, "step": 1118 }, { "epoch": 0.19151120999486565, "grad_norm": 33.92202377319336, "learning_rate": 6.354820308043355e-06, "loss": 5.9093, "step": 1119 }, { "epoch": 0.19168235495464658, "grad_norm": 37.51108169555664, "learning_rate": 6.360524814603537e-06, "loss": 5.5156, "step": 1120 }, { "epoch": 0.19185349991442752, "grad_norm": 23.369075775146484, "learning_rate": 6.36622932116372e-06, "loss": 3.9585, "step": 1121 }, { "epoch": 0.19202464487420845, "grad_norm": 27.76898765563965, "learning_rate": 6.371933827723901e-06, "loss": 4.0578, "step": 1122 }, { "epoch": 0.19219578983398938, "grad_norm": 21.719980239868164, "learning_rate": 6.377638334284085e-06, "loss": 1.6746, "step": 1123 }, { "epoch": 0.19236693479377032, "grad_norm": 32.65765380859375, "learning_rate": 6.383342840844267e-06, "loss": 4.4355, "step": 1124 }, { "epoch": 0.19253807975355125, "grad_norm": 31.302228927612305, "learning_rate": 6.38904734740445e-06, "loss": 4.3111, "step": 1125 }, { "epoch": 0.19270922471333218, "grad_norm": 36.785396575927734, "learning_rate": 6.394751853964632e-06, "loss": 5.3737, "step": 1126 }, { "epoch": 0.19288036967311312, "grad_norm": 32.185787200927734, "learning_rate": 6.400456360524815e-06, "loss": 4.2842, "step": 1127 }, { "epoch": 0.19305151463289405, "grad_norm": 49.154666900634766, "learning_rate": 6.4061608670849966e-06, "loss": 8.8989, "step": 1128 }, { "epoch": 0.19322265959267498, "grad_norm": 31.552207946777344, "learning_rate": 6.41186537364518e-06, "loss": 4.2685, "step": 
1129 }, { "epoch": 0.19339380455245592, "grad_norm": 21.41136932373047, "learning_rate": 6.4175698802053625e-06, "loss": 2.3051, "step": 1130 }, { "epoch": 0.19356494951223688, "grad_norm": 13.525940895080566, "learning_rate": 6.423274386765545e-06, "loss": 2.1123, "step": 1131 }, { "epoch": 0.1937360944720178, "grad_norm": 37.48530960083008, "learning_rate": 6.4289788933257275e-06, "loss": 4.8037, "step": 1132 }, { "epoch": 0.19390723943179874, "grad_norm": 38.14132308959961, "learning_rate": 6.43468339988591e-06, "loss": 6.2294, "step": 1133 }, { "epoch": 0.19407838439157968, "grad_norm": 33.01750183105469, "learning_rate": 6.440387906446093e-06, "loss": 4.9204, "step": 1134 }, { "epoch": 0.1942495293513606, "grad_norm": 36.364158630371094, "learning_rate": 6.446092413006275e-06, "loss": 4.5797, "step": 1135 }, { "epoch": 0.19442067431114154, "grad_norm": 46.81378173828125, "learning_rate": 6.451796919566458e-06, "loss": 6.538, "step": 1136 }, { "epoch": 0.19459181927092248, "grad_norm": 23.135957717895508, "learning_rate": 6.45750142612664e-06, "loss": 4.3991, "step": 1137 }, { "epoch": 0.1947629642307034, "grad_norm": 25.031917572021484, "learning_rate": 6.463205932686823e-06, "loss": 2.3886, "step": 1138 }, { "epoch": 0.19493410919048434, "grad_norm": 35.31920623779297, "learning_rate": 6.468910439247005e-06, "loss": 6.0172, "step": 1139 }, { "epoch": 0.19510525415026528, "grad_norm": 36.97047424316406, "learning_rate": 6.474614945807188e-06, "loss": 5.4822, "step": 1140 }, { "epoch": 0.1952763991100462, "grad_norm": 31.77883529663086, "learning_rate": 6.48031945236737e-06, "loss": 4.7072, "step": 1141 }, { "epoch": 0.19544754406982714, "grad_norm": 28.897930145263672, "learning_rate": 6.486023958927553e-06, "loss": 3.7105, "step": 1142 }, { "epoch": 0.19561868902960808, "grad_norm": 29.99696922302246, "learning_rate": 6.491728465487735e-06, "loss": 4.5102, "step": 1143 }, { "epoch": 0.195789833989389, "grad_norm": 25.783557891845703, "learning_rate": 
6.497432972047918e-06, "loss": 3.6023, "step": 1144 }, { "epoch": 0.19596097894916994, "grad_norm": 35.004642486572266, "learning_rate": 6.5031374786081005e-06, "loss": 4.1587, "step": 1145 }, { "epoch": 0.19613212390895088, "grad_norm": 173.46754455566406, "learning_rate": 6.508841985168284e-06, "loss": 7.5547, "step": 1146 }, { "epoch": 0.1963032688687318, "grad_norm": 18.749853134155273, "learning_rate": 6.5145464917284655e-06, "loss": 1.7298, "step": 1147 }, { "epoch": 0.19647441382851275, "grad_norm": 31.15353012084961, "learning_rate": 6.520250998288648e-06, "loss": 5.4053, "step": 1148 }, { "epoch": 0.19664555878829368, "grad_norm": 21.659912109375, "learning_rate": 6.525955504848831e-06, "loss": 1.8891, "step": 1149 }, { "epoch": 0.1968167037480746, "grad_norm": 23.412139892578125, "learning_rate": 6.531660011409013e-06, "loss": 3.8619, "step": 1150 }, { "epoch": 0.19698784870785555, "grad_norm": 22.16069221496582, "learning_rate": 6.537364517969196e-06, "loss": 2.0106, "step": 1151 }, { "epoch": 0.19715899366763648, "grad_norm": 33.494136810302734, "learning_rate": 6.543069024529379e-06, "loss": 5.4958, "step": 1152 }, { "epoch": 0.1973301386274174, "grad_norm": 32.96882629394531, "learning_rate": 6.548773531089561e-06, "loss": 4.5927, "step": 1153 }, { "epoch": 0.19750128358719835, "grad_norm": 36.14384078979492, "learning_rate": 6.554478037649743e-06, "loss": 5.6357, "step": 1154 }, { "epoch": 0.19767242854697928, "grad_norm": 23.875118255615234, "learning_rate": 6.560182544209926e-06, "loss": 3.158, "step": 1155 }, { "epoch": 0.19784357350676024, "grad_norm": 23.001026153564453, "learning_rate": 6.565887050770108e-06, "loss": 1.8949, "step": 1156 }, { "epoch": 0.19801471846654117, "grad_norm": 46.26600646972656, "learning_rate": 6.571591557330292e-06, "loss": 9.1329, "step": 1157 }, { "epoch": 0.1981858634263221, "grad_norm": 16.32296371459961, "learning_rate": 6.577296063890474e-06, "loss": 1.5302, "step": 1158 }, { "epoch": 0.19835700838610304, 
"grad_norm": 26.114614486694336, "learning_rate": 6.583000570450656e-06, "loss": 2.3763, "step": 1159 }, { "epoch": 0.19852815334588397, "grad_norm": 37.42622756958008, "learning_rate": 6.5887050770108384e-06, "loss": 5.5999, "step": 1160 }, { "epoch": 0.1986992983056649, "grad_norm": 21.48786735534668, "learning_rate": 6.594409583571021e-06, "loss": 3.4369, "step": 1161 }, { "epoch": 0.19887044326544584, "grad_norm": 24.472808837890625, "learning_rate": 6.6001140901312035e-06, "loss": 2.0175, "step": 1162 }, { "epoch": 0.19904158822522677, "grad_norm": 25.275909423828125, "learning_rate": 6.605818596691387e-06, "loss": 2.6992, "step": 1163 }, { "epoch": 0.1992127331850077, "grad_norm": 29.439197540283203, "learning_rate": 6.611523103251569e-06, "loss": 4.4373, "step": 1164 }, { "epoch": 0.19938387814478864, "grad_norm": 224.64663696289062, "learning_rate": 6.617227609811751e-06, "loss": 10.3737, "step": 1165 }, { "epoch": 0.19955502310456957, "grad_norm": 34.043575286865234, "learning_rate": 6.622932116371934e-06, "loss": 5.0921, "step": 1166 }, { "epoch": 0.1997261680643505, "grad_norm": 11.060107231140137, "learning_rate": 6.628636622932116e-06, "loss": 1.2996, "step": 1167 }, { "epoch": 0.19989731302413144, "grad_norm": 32.19368362426758, "learning_rate": 6.634341129492299e-06, "loss": 4.2537, "step": 1168 }, { "epoch": 0.20006845798391237, "grad_norm": 48.267578125, "learning_rate": 6.640045636052482e-06, "loss": 9.335, "step": 1169 }, { "epoch": 0.2002396029436933, "grad_norm": 19.327762603759766, "learning_rate": 6.645750142612665e-06, "loss": 1.8859, "step": 1170 }, { "epoch": 0.20041074790347424, "grad_norm": 28.81614875793457, "learning_rate": 6.651454649172847e-06, "loss": 3.8125, "step": 1171 }, { "epoch": 0.20058189286325517, "grad_norm": 24.971960067749023, "learning_rate": 6.657159155733029e-06, "loss": 3.0816, "step": 1172 }, { "epoch": 0.2007530378230361, "grad_norm": 154.4432373046875, "learning_rate": 6.662863662293211e-06, "loss": 8.568, "step": 
1173 }, { "epoch": 0.20092418278281704, "grad_norm": 47.04978942871094, "learning_rate": 6.668568168853395e-06, "loss": 5.1816, "step": 1174 }, { "epoch": 0.20109532774259797, "grad_norm": 24.374345779418945, "learning_rate": 6.674272675413577e-06, "loss": 2.6078, "step": 1175 }, { "epoch": 0.2012664727023789, "grad_norm": 36.597232818603516, "learning_rate": 6.67997718197376e-06, "loss": 5.5402, "step": 1176 }, { "epoch": 0.20143761766215984, "grad_norm": 36.612060546875, "learning_rate": 6.685681688533942e-06, "loss": 5.17, "step": 1177 }, { "epoch": 0.20160876262194077, "grad_norm": 39.452117919921875, "learning_rate": 6.691386195094124e-06, "loss": 6.2861, "step": 1178 }, { "epoch": 0.2017799075817217, "grad_norm": 35.985816955566406, "learning_rate": 6.6970907016543065e-06, "loss": 5.7763, "step": 1179 }, { "epoch": 0.20195105254150264, "grad_norm": 11.960805892944336, "learning_rate": 6.70279520821449e-06, "loss": 2.7312, "step": 1180 }, { "epoch": 0.20212219750128357, "grad_norm": 154.7554168701172, "learning_rate": 6.7084997147746724e-06, "loss": 9.5806, "step": 1181 }, { "epoch": 0.20229334246106453, "grad_norm": 31.713943481445312, "learning_rate": 6.714204221334855e-06, "loss": 4.9006, "step": 1182 }, { "epoch": 0.20246448742084547, "grad_norm": 11.431591987609863, "learning_rate": 6.7199087278950375e-06, "loss": 3.1028, "step": 1183 }, { "epoch": 0.2026356323806264, "grad_norm": 208.2880859375, "learning_rate": 6.725613234455219e-06, "loss": 8.5447, "step": 1184 }, { "epoch": 0.20280677734040733, "grad_norm": 32.78763198852539, "learning_rate": 6.731317741015402e-06, "loss": 5.0437, "step": 1185 }, { "epoch": 0.20297792230018827, "grad_norm": 31.15655517578125, "learning_rate": 6.737022247575585e-06, "loss": 4.1921, "step": 1186 }, { "epoch": 0.2031490672599692, "grad_norm": 12.072607040405273, "learning_rate": 6.742726754135768e-06, "loss": 1.9291, "step": 1187 }, { "epoch": 0.20332021221975013, "grad_norm": 46.76679992675781, "learning_rate": 
6.74843126069595e-06, "loss": 9.0577, "step": 1188 }, { "epoch": 0.20349135717953107, "grad_norm": 28.912738800048828, "learning_rate": 6.754135767256133e-06, "loss": 4.3274, "step": 1189 }, { "epoch": 0.203662502139312, "grad_norm": 151.7112579345703, "learning_rate": 6.759840273816315e-06, "loss": 8.1049, "step": 1190 }, { "epoch": 0.20383364709909293, "grad_norm": 19.557729721069336, "learning_rate": 6.765544780376497e-06, "loss": 1.6717, "step": 1191 }, { "epoch": 0.20400479205887387, "grad_norm": 37.28075408935547, "learning_rate": 6.77124928693668e-06, "loss": 5.6393, "step": 1192 }, { "epoch": 0.2041759370186548, "grad_norm": 33.639183044433594, "learning_rate": 6.776953793496863e-06, "loss": 4.9937, "step": 1193 }, { "epoch": 0.20434708197843574, "grad_norm": 16.514705657958984, "learning_rate": 6.782658300057045e-06, "loss": 2.2396, "step": 1194 }, { "epoch": 0.20451822693821667, "grad_norm": 29.29157066345215, "learning_rate": 6.788362806617228e-06, "loss": 4.5062, "step": 1195 }, { "epoch": 0.2046893718979976, "grad_norm": 24.25420570373535, "learning_rate": 6.79406731317741e-06, "loss": 2.5282, "step": 1196 }, { "epoch": 0.20486051685777854, "grad_norm": 21.87625503540039, "learning_rate": 6.799771819737593e-06, "loss": 2.2101, "step": 1197 }, { "epoch": 0.20503166181755947, "grad_norm": 29.727163314819336, "learning_rate": 6.8054763262977755e-06, "loss": 3.5679, "step": 1198 }, { "epoch": 0.2052028067773404, "grad_norm": 23.502267837524414, "learning_rate": 6.811180832857958e-06, "loss": 3.9821, "step": 1199 }, { "epoch": 0.20537395173712134, "grad_norm": 31.961931228637695, "learning_rate": 6.8168853394181405e-06, "loss": 4.6, "step": 1200 }, { "epoch": 0.20554509669690227, "grad_norm": 27.584300994873047, "learning_rate": 6.822589845978323e-06, "loss": 3.389, "step": 1201 }, { "epoch": 0.2057162416566832, "grad_norm": 34.41096115112305, "learning_rate": 6.828294352538506e-06, "loss": 4.722, "step": 1202 }, { "epoch": 0.20588738661646414, "grad_norm": 
41.341312408447266, "learning_rate": 6.833998859098688e-06, "loss": 6.7225, "step": 1203 }, { "epoch": 0.20605853157624507, "grad_norm": 160.5906982421875, "learning_rate": 6.839703365658871e-06, "loss": 9.8412, "step": 1204 }, { "epoch": 0.206229676536026, "grad_norm": 23.49472999572754, "learning_rate": 6.845407872219053e-06, "loss": 3.6378, "step": 1205 }, { "epoch": 0.20640082149580694, "grad_norm": 31.307947158813477, "learning_rate": 6.851112378779236e-06, "loss": 3.6813, "step": 1206 }, { "epoch": 0.2065719664555879, "grad_norm": 27.893850326538086, "learning_rate": 6.856816885339418e-06, "loss": 4.5216, "step": 1207 }, { "epoch": 0.20674311141536883, "grad_norm": 32.200157165527344, "learning_rate": 6.862521391899601e-06, "loss": 4.5525, "step": 1208 }, { "epoch": 0.20691425637514976, "grad_norm": 31.765216827392578, "learning_rate": 6.868225898459783e-06, "loss": 5.2865, "step": 1209 }, { "epoch": 0.2070854013349307, "grad_norm": 35.562294006347656, "learning_rate": 6.873930405019966e-06, "loss": 5.0758, "step": 1210 }, { "epoch": 0.20725654629471163, "grad_norm": 44.582786560058594, "learning_rate": 6.879634911580148e-06, "loss": 8.7973, "step": 1211 }, { "epoch": 0.20742769125449256, "grad_norm": 29.667964935302734, "learning_rate": 6.885339418140331e-06, "loss": 3.7483, "step": 1212 }, { "epoch": 0.2075988362142735, "grad_norm": 33.826454162597656, "learning_rate": 6.8910439247005135e-06, "loss": 5.321, "step": 1213 }, { "epoch": 0.20776998117405443, "grad_norm": 36.56757354736328, "learning_rate": 6.896748431260697e-06, "loss": 4.6366, "step": 1214 }, { "epoch": 0.20794112613383536, "grad_norm": 21.483030319213867, "learning_rate": 6.9024529378208785e-06, "loss": 1.7844, "step": 1215 }, { "epoch": 0.2081122710936163, "grad_norm": 22.398630142211914, "learning_rate": 6.908157444381061e-06, "loss": 2.9002, "step": 1216 }, { "epoch": 0.20828341605339723, "grad_norm": 16.41680145263672, "learning_rate": 6.913861950941244e-06, "loss": 1.5466, "step": 1217 
}, { "epoch": 0.20845456101317816, "grad_norm": 22.448949813842773, "learning_rate": 6.919566457501426e-06, "loss": 3.4011, "step": 1218 }, { "epoch": 0.2086257059729591, "grad_norm": 35.074989318847656, "learning_rate": 6.925270964061609e-06, "loss": 4.4769, "step": 1219 }, { "epoch": 0.20879685093274003, "grad_norm": 29.737442016601562, "learning_rate": 6.930975470621792e-06, "loss": 4.6152, "step": 1220 }, { "epoch": 0.20896799589252096, "grad_norm": 29.097299575805664, "learning_rate": 6.9366799771819746e-06, "loss": 3.8591, "step": 1221 }, { "epoch": 0.2091391408523019, "grad_norm": 22.356008529663086, "learning_rate": 6.942384483742156e-06, "loss": 3.6379, "step": 1222 }, { "epoch": 0.20931028581208283, "grad_norm": 29.412656784057617, "learning_rate": 6.948088990302339e-06, "loss": 3.5976, "step": 1223 }, { "epoch": 0.20948143077186376, "grad_norm": 19.5412654876709, "learning_rate": 6.953793496862521e-06, "loss": 2.0718, "step": 1224 }, { "epoch": 0.2096525757316447, "grad_norm": 17.43561363220215, "learning_rate": 6.959498003422704e-06, "loss": 1.5389, "step": 1225 }, { "epoch": 0.20982372069142563, "grad_norm": 34.85890579223633, "learning_rate": 6.965202509982887e-06, "loss": 4.4105, "step": 1226 }, { "epoch": 0.20999486565120656, "grad_norm": 33.83147430419922, "learning_rate": 6.97090701654307e-06, "loss": 4.108, "step": 1227 }, { "epoch": 0.2101660106109875, "grad_norm": 33.77149963378906, "learning_rate": 6.9766115231032514e-06, "loss": 4.4198, "step": 1228 }, { "epoch": 0.21033715557076843, "grad_norm": 12.30455207824707, "learning_rate": 6.982316029663434e-06, "loss": 1.7759, "step": 1229 }, { "epoch": 0.21050830053054936, "grad_norm": 34.55380630493164, "learning_rate": 6.9880205362236165e-06, "loss": 4.4813, "step": 1230 }, { "epoch": 0.2106794454903303, "grad_norm": 23.975025177001953, "learning_rate": 6.993725042783799e-06, "loss": 3.728, "step": 1231 }, { "epoch": 0.21085059045011123, "grad_norm": 190.6012725830078, "learning_rate": 
6.999429549343982e-06, "loss": 10.1602, "step": 1232 }, { "epoch": 0.2110217354098922, "grad_norm": 34.527076721191406, "learning_rate": 7.005134055904165e-06, "loss": 4.7483, "step": 1233 }, { "epoch": 0.21119288036967312, "grad_norm": 35.65943908691406, "learning_rate": 7.010838562464347e-06, "loss": 5.5499, "step": 1234 }, { "epoch": 0.21136402532945406, "grad_norm": 34.03565216064453, "learning_rate": 7.016543069024529e-06, "loss": 4.7829, "step": 1235 }, { "epoch": 0.211535170289235, "grad_norm": 20.10201072692871, "learning_rate": 7.022247575584712e-06, "loss": 2.9853, "step": 1236 }, { "epoch": 0.21170631524901593, "grad_norm": 72.77118682861328, "learning_rate": 7.027952082144895e-06, "loss": 6.8184, "step": 1237 }, { "epoch": 0.21187746020879686, "grad_norm": 32.084381103515625, "learning_rate": 7.033656588705078e-06, "loss": 5.0572, "step": 1238 }, { "epoch": 0.2120486051685778, "grad_norm": 28.180423736572266, "learning_rate": 7.03936109526526e-06, "loss": 3.8185, "step": 1239 }, { "epoch": 0.21221975012835873, "grad_norm": 20.687843322753906, "learning_rate": 7.045065601825443e-06, "loss": 2.1643, "step": 1240 }, { "epoch": 0.21239089508813966, "grad_norm": 15.380537033081055, "learning_rate": 7.050770108385624e-06, "loss": 1.6453, "step": 1241 }, { "epoch": 0.2125620400479206, "grad_norm": 38.16814422607422, "learning_rate": 7.056474614945807e-06, "loss": 5.8775, "step": 1242 }, { "epoch": 0.21273318500770153, "grad_norm": 43.55405807495117, "learning_rate": 7.06217912150599e-06, "loss": 5.1528, "step": 1243 }, { "epoch": 0.21290432996748246, "grad_norm": 30.40400505065918, "learning_rate": 7.067883628066173e-06, "loss": 4.155, "step": 1244 }, { "epoch": 0.2130754749272634, "grad_norm": 39.55487823486328, "learning_rate": 7.073588134626355e-06, "loss": 6.8649, "step": 1245 }, { "epoch": 0.21324661988704433, "grad_norm": 46.886600494384766, "learning_rate": 7.079292641186538e-06, "loss": 4.8251, "step": 1246 }, { "epoch": 0.21341776484682526, 
"grad_norm": 35.842594146728516, "learning_rate": 7.0849971477467195e-06, "loss": 5.3382, "step": 1247 }, { "epoch": 0.2135889098066062, "grad_norm": 10.459444999694824, "learning_rate": 7.090701654306902e-06, "loss": 1.1781, "step": 1248 }, { "epoch": 0.21376005476638713, "grad_norm": 31.134531021118164, "learning_rate": 7.0964061608670854e-06, "loss": 3.3419, "step": 1249 }, { "epoch": 0.21393119972616806, "grad_norm": 32.50645065307617, "learning_rate": 7.102110667427268e-06, "loss": 4.1592, "step": 1250 }, { "epoch": 0.214102344685949, "grad_norm": 38.065643310546875, "learning_rate": 7.1078151739874505e-06, "loss": 6.1903, "step": 1251 }, { "epoch": 0.21427348964572993, "grad_norm": 32.13066482543945, "learning_rate": 7.113519680547633e-06, "loss": 3.8917, "step": 1252 }, { "epoch": 0.21444463460551086, "grad_norm": 22.333932876586914, "learning_rate": 7.119224187107815e-06, "loss": 3.308, "step": 1253 }, { "epoch": 0.2146157795652918, "grad_norm": 8.437789916992188, "learning_rate": 7.124928693667997e-06, "loss": 2.2375, "step": 1254 }, { "epoch": 0.21478692452507273, "grad_norm": 32.72603225708008, "learning_rate": 7.130633200228181e-06, "loss": 4.8237, "step": 1255 }, { "epoch": 0.21495806948485366, "grad_norm": 34.640647888183594, "learning_rate": 7.136337706788363e-06, "loss": 5.2757, "step": 1256 }, { "epoch": 0.2151292144446346, "grad_norm": 20.100618362426758, "learning_rate": 7.142042213348546e-06, "loss": 2.961, "step": 1257 }, { "epoch": 0.21530035940441555, "grad_norm": 43.29427719116211, "learning_rate": 7.147746719908728e-06, "loss": 8.933, "step": 1258 }, { "epoch": 0.2154715043641965, "grad_norm": 33.56546401977539, "learning_rate": 7.15345122646891e-06, "loss": 4.6558, "step": 1259 }, { "epoch": 0.21564264932397742, "grad_norm": 33.7791633605957, "learning_rate": 7.159155733029093e-06, "loss": 4.183, "step": 1260 }, { "epoch": 0.21581379428375835, "grad_norm": 33.235233306884766, "learning_rate": 7.164860239589276e-06, "loss": 3.7487, "step": 
1261 }, { "epoch": 0.2159849392435393, "grad_norm": 140.30621337890625, "learning_rate": 7.170564746149458e-06, "loss": 9.0381, "step": 1262 }, { "epoch": 0.21615608420332022, "grad_norm": 20.70719337463379, "learning_rate": 7.176269252709641e-06, "loss": 1.7769, "step": 1263 }, { "epoch": 0.21632722916310115, "grad_norm": 36.93478012084961, "learning_rate": 7.181973759269823e-06, "loss": 4.5665, "step": 1264 }, { "epoch": 0.2164983741228821, "grad_norm": 81.26618957519531, "learning_rate": 7.187678265830006e-06, "loss": 7.0141, "step": 1265 }, { "epoch": 0.21666951908266302, "grad_norm": 33.15439224243164, "learning_rate": 7.1933827723901885e-06, "loss": 4.5814, "step": 1266 }, { "epoch": 0.21684066404244395, "grad_norm": 26.268171310424805, "learning_rate": 7.199087278950371e-06, "loss": 3.0891, "step": 1267 }, { "epoch": 0.2170118090022249, "grad_norm": 35.35780715942383, "learning_rate": 7.2047917855105535e-06, "loss": 4.8355, "step": 1268 }, { "epoch": 0.21718295396200582, "grad_norm": 21.87150764465332, "learning_rate": 7.210496292070736e-06, "loss": 1.7614, "step": 1269 }, { "epoch": 0.21735409892178675, "grad_norm": 36.49989318847656, "learning_rate": 7.216200798630919e-06, "loss": 5.8824, "step": 1270 }, { "epoch": 0.2175252438815677, "grad_norm": 11.613662719726562, "learning_rate": 7.221905305191101e-06, "loss": 1.7057, "step": 1271 }, { "epoch": 0.21769638884134862, "grad_norm": 28.447458267211914, "learning_rate": 7.227609811751284e-06, "loss": 4.3815, "step": 1272 }, { "epoch": 0.21786753380112955, "grad_norm": 34.95615005493164, "learning_rate": 7.233314318311466e-06, "loss": 4.7223, "step": 1273 }, { "epoch": 0.2180386787609105, "grad_norm": 36.12034606933594, "learning_rate": 7.239018824871649e-06, "loss": 5.4639, "step": 1274 }, { "epoch": 0.21820982372069142, "grad_norm": 29.200042724609375, "learning_rate": 7.244723331431831e-06, "loss": 3.9203, "step": 1275 }, { "epoch": 0.21838096868047235, "grad_norm": 173.54055786132812, "learning_rate": 
7.250427837992014e-06, "loss": 9.2819, "step": 1276 }, { "epoch": 0.2185521136402533, "grad_norm": 30.67865562438965, "learning_rate": 7.256132344552197e-06, "loss": 4.7412, "step": 1277 }, { "epoch": 0.21872325860003422, "grad_norm": 35.703468322753906, "learning_rate": 7.261836851112379e-06, "loss": 5.3418, "step": 1278 }, { "epoch": 0.21889440355981515, "grad_norm": 35.29546356201172, "learning_rate": 7.267541357672561e-06, "loss": 5.1735, "step": 1279 }, { "epoch": 0.2190655485195961, "grad_norm": 20.382551193237305, "learning_rate": 7.273245864232744e-06, "loss": 1.8851, "step": 1280 }, { "epoch": 0.21923669347937702, "grad_norm": 20.68045997619629, "learning_rate": 7.2789503707929265e-06, "loss": 2.681, "step": 1281 }, { "epoch": 0.21940783843915795, "grad_norm": 37.52497482299805, "learning_rate": 7.284654877353109e-06, "loss": 5.9113, "step": 1282 }, { "epoch": 0.21957898339893892, "grad_norm": 154.6285858154297, "learning_rate": 7.290359383913292e-06, "loss": 8.0077, "step": 1283 }, { "epoch": 0.21975012835871985, "grad_norm": 28.380836486816406, "learning_rate": 7.296063890473474e-06, "loss": 3.5758, "step": 1284 }, { "epoch": 0.21992127331850078, "grad_norm": 13.987469673156738, "learning_rate": 7.301768397033657e-06, "loss": 1.4051, "step": 1285 }, { "epoch": 0.22009241827828172, "grad_norm": 21.18030548095703, "learning_rate": 7.307472903593839e-06, "loss": 3.1844, "step": 1286 }, { "epoch": 0.22026356323806265, "grad_norm": 13.61611270904541, "learning_rate": 7.313177410154022e-06, "loss": 1.4008, "step": 1287 }, { "epoch": 0.22043470819784358, "grad_norm": 32.63056182861328, "learning_rate": 7.318881916714204e-06, "loss": 5.485, "step": 1288 }, { "epoch": 0.22060585315762452, "grad_norm": 12.39704704284668, "learning_rate": 7.3245864232743876e-06, "loss": 2.8362, "step": 1289 }, { "epoch": 0.22077699811740545, "grad_norm": 160.39300537109375, "learning_rate": 7.33029092983457e-06, "loss": 9.3207, "step": 1290 }, { "epoch": 0.22094814307718638, 
"grad_norm": 35.63487243652344, "learning_rate": 7.335995436394752e-06, "loss": 4.3364, "step": 1291 }, { "epoch": 0.22111928803696732, "grad_norm": 18.865745544433594, "learning_rate": 7.341699942954934e-06, "loss": 1.9152, "step": 1292 }, { "epoch": 0.22129043299674825, "grad_norm": 34.95203399658203, "learning_rate": 7.347404449515117e-06, "loss": 4.2394, "step": 1293 }, { "epoch": 0.22146157795652918, "grad_norm": 32.99889373779297, "learning_rate": 7.353108956075299e-06, "loss": 5.7603, "step": 1294 }, { "epoch": 0.22163272291631012, "grad_norm": 31.541820526123047, "learning_rate": 7.358813462635483e-06, "loss": 4.7464, "step": 1295 }, { "epoch": 0.22180386787609105, "grad_norm": 22.86473274230957, "learning_rate": 7.364517969195665e-06, "loss": 3.2885, "step": 1296 }, { "epoch": 0.22197501283587198, "grad_norm": 34.75326919555664, "learning_rate": 7.370222475755847e-06, "loss": 4.4337, "step": 1297 }, { "epoch": 0.22214615779565292, "grad_norm": 33.42300796508789, "learning_rate": 7.3759269823160295e-06, "loss": 4.8641, "step": 1298 }, { "epoch": 0.22231730275543385, "grad_norm": 40.14048385620117, "learning_rate": 7.381631488876212e-06, "loss": 5.3092, "step": 1299 }, { "epoch": 0.22248844771521478, "grad_norm": 33.59206008911133, "learning_rate": 7.387335995436395e-06, "loss": 4.6114, "step": 1300 }, { "epoch": 0.22265959267499572, "grad_norm": 32.96902084350586, "learning_rate": 7.393040501996578e-06, "loss": 4.9559, "step": 1301 }, { "epoch": 0.22283073763477665, "grad_norm": 76.84076690673828, "learning_rate": 7.3987450085567605e-06, "loss": 7.2409, "step": 1302 }, { "epoch": 0.22300188259455758, "grad_norm": 29.227497100830078, "learning_rate": 7.404449515116942e-06, "loss": 3.4494, "step": 1303 }, { "epoch": 0.22317302755433852, "grad_norm": 34.10039520263672, "learning_rate": 7.410154021677125e-06, "loss": 4.6513, "step": 1304 }, { "epoch": 0.22334417251411945, "grad_norm": 43.62645721435547, "learning_rate": 7.415858528237307e-06, "loss": 6.1141, 
"step": 1305 }, { "epoch": 0.22351531747390038, "grad_norm": 29.59916877746582, "learning_rate": 7.421563034797491e-06, "loss": 4.5189, "step": 1306 }, { "epoch": 0.22368646243368132, "grad_norm": 32.00434494018555, "learning_rate": 7.427267541357673e-06, "loss": 3.7625, "step": 1307 }, { "epoch": 0.22385760739346225, "grad_norm": 12.214600563049316, "learning_rate": 7.432972047917856e-06, "loss": 1.6093, "step": 1308 }, { "epoch": 0.2240287523532432, "grad_norm": 13.289321899414062, "learning_rate": 7.438676554478037e-06, "loss": 1.8433, "step": 1309 }, { "epoch": 0.22419989731302414, "grad_norm": 12.391509056091309, "learning_rate": 7.44438106103822e-06, "loss": 1.8211, "step": 1310 }, { "epoch": 0.22437104227280508, "grad_norm": 31.827852249145508, "learning_rate": 7.450085567598402e-06, "loss": 3.5607, "step": 1311 }, { "epoch": 0.224542187232586, "grad_norm": 172.93185424804688, "learning_rate": 7.455790074158586e-06, "loss": 9.5445, "step": 1312 }, { "epoch": 0.22471333219236694, "grad_norm": 18.688396453857422, "learning_rate": 7.461494580718768e-06, "loss": 1.9759, "step": 1313 }, { "epoch": 0.22488447715214788, "grad_norm": 26.364185333251953, "learning_rate": 7.467199087278951e-06, "loss": 3.2682, "step": 1314 }, { "epoch": 0.2250556221119288, "grad_norm": 8.573413848876953, "learning_rate": 7.472903593839133e-06, "loss": 1.3051, "step": 1315 }, { "epoch": 0.22522676707170974, "grad_norm": 24.913686752319336, "learning_rate": 7.478608100399315e-06, "loss": 2.4598, "step": 1316 }, { "epoch": 0.22539791203149068, "grad_norm": 30.283504486083984, "learning_rate": 7.4843126069594984e-06, "loss": 4.1503, "step": 1317 }, { "epoch": 0.2255690569912716, "grad_norm": 18.146724700927734, "learning_rate": 7.490017113519681e-06, "loss": 1.8957, "step": 1318 }, { "epoch": 0.22574020195105254, "grad_norm": 11.016623497009277, "learning_rate": 7.4957216200798635e-06, "loss": 2.636, "step": 1319 }, { "epoch": 0.22591134691083348, "grad_norm": 35.766883850097656, 
"learning_rate": 7.501426126640046e-06, "loss": 4.3588, "step": 1320 }, { "epoch": 0.2260824918706144, "grad_norm": 24.76753807067871, "learning_rate": 7.5071306332002286e-06, "loss": 3.2106, "step": 1321 }, { "epoch": 0.22625363683039534, "grad_norm": 35.969505310058594, "learning_rate": 7.51283513976041e-06, "loss": 4.5488, "step": 1322 }, { "epoch": 0.22642478179017628, "grad_norm": 13.215656280517578, "learning_rate": 7.518539646320593e-06, "loss": 1.7273, "step": 1323 }, { "epoch": 0.2265959267499572, "grad_norm": 32.75537872314453, "learning_rate": 7.524244152880775e-06, "loss": 4.442, "step": 1324 }, { "epoch": 0.22676707170973814, "grad_norm": 13.069498062133789, "learning_rate": 7.529948659440958e-06, "loss": 1.2864, "step": 1325 }, { "epoch": 0.22693821666951908, "grad_norm": 29.5541934967041, "learning_rate": 7.535653166001142e-06, "loss": 3.5993, "step": 1326 }, { "epoch": 0.2271093616293, "grad_norm": 36.506736755371094, "learning_rate": 7.541357672561325e-06, "loss": 4.7108, "step": 1327 }, { "epoch": 0.22728050658908094, "grad_norm": 30.510953903198242, "learning_rate": 7.547062179121506e-06, "loss": 4.168, "step": 1328 }, { "epoch": 0.22745165154886188, "grad_norm": 11.754740715026855, "learning_rate": 7.552766685681689e-06, "loss": 2.7865, "step": 1329 }, { "epoch": 0.2276227965086428, "grad_norm": 31.793643951416016, "learning_rate": 7.558471192241871e-06, "loss": 3.4931, "step": 1330 }, { "epoch": 0.22779394146842374, "grad_norm": 23.95293426513672, "learning_rate": 7.564175698802054e-06, "loss": 3.0252, "step": 1331 }, { "epoch": 0.22796508642820468, "grad_norm": 28.809511184692383, "learning_rate": 7.569880205362236e-06, "loss": 4.2144, "step": 1332 }, { "epoch": 0.2281362313879856, "grad_norm": 34.645267486572266, "learning_rate": 7.575584711922419e-06, "loss": 4.5155, "step": 1333 }, { "epoch": 0.22830737634776657, "grad_norm": 31.90658950805664, "learning_rate": 7.581289218482601e-06, "loss": 3.6445, "step": 1334 }, { "epoch": 
0.2284785213075475, "grad_norm": 26.37479591369629, "learning_rate": 7.586993725042783e-06, "loss": 2.6728, "step": 1335 }, { "epoch": 0.22864966626732844, "grad_norm": 29.64954376220703, "learning_rate": 7.592698231602966e-06, "loss": 4.0421, "step": 1336 }, { "epoch": 0.22882081122710937, "grad_norm": 28.596891403198242, "learning_rate": 7.59840273816315e-06, "loss": 3.3059, "step": 1337 }, { "epoch": 0.2289919561868903, "grad_norm": 36.07052993774414, "learning_rate": 7.6041072447233325e-06, "loss": 4.2618, "step": 1338 }, { "epoch": 0.22916310114667124, "grad_norm": 50.589454650878906, "learning_rate": 7.609811751283515e-06, "loss": 9.3326, "step": 1339 }, { "epoch": 0.22933424610645217, "grad_norm": 31.4276180267334, "learning_rate": 7.6155162578436975e-06, "loss": 4.6035, "step": 1340 }, { "epoch": 0.2295053910662331, "grad_norm": 32.5452766418457, "learning_rate": 7.621220764403879e-06, "loss": 3.9264, "step": 1341 }, { "epoch": 0.22967653602601404, "grad_norm": 32.74778747558594, "learning_rate": 7.626925270964062e-06, "loss": 4.6618, "step": 1342 }, { "epoch": 0.22984768098579497, "grad_norm": 11.447990417480469, "learning_rate": 7.632629777524244e-06, "loss": 1.2467, "step": 1343 }, { "epoch": 0.2300188259455759, "grad_norm": 19.261301040649414, "learning_rate": 7.638334284084426e-06, "loss": 1.4108, "step": 1344 }, { "epoch": 0.23018997090535684, "grad_norm": 17.838138580322266, "learning_rate": 7.64403879064461e-06, "loss": 1.4796, "step": 1345 }, { "epoch": 0.23036111586513777, "grad_norm": 36.09761047363281, "learning_rate": 7.649743297204791e-06, "loss": 4.8769, "step": 1346 }, { "epoch": 0.2305322608249187, "grad_norm": 17.18463706970215, "learning_rate": 7.655447803764974e-06, "loss": 1.6009, "step": 1347 }, { "epoch": 0.23070340578469964, "grad_norm": 20.603784561157227, "learning_rate": 7.661152310325156e-06, "loss": 3.0856, "step": 1348 }, { "epoch": 0.23087455074448057, "grad_norm": 41.716590881347656, "learning_rate": 7.666856816885341e-06, 
"loss": 5.4047, "step": 1349 }, { "epoch": 0.2310456957042615, "grad_norm": 181.26748657226562, "learning_rate": 7.672561323445523e-06, "loss": 8.5903, "step": 1350 }, { "epoch": 0.23121684066404244, "grad_norm": 41.98673629760742, "learning_rate": 7.678265830005705e-06, "loss": 5.2161, "step": 1351 }, { "epoch": 0.23138798562382337, "grad_norm": 35.29446792602539, "learning_rate": 7.683970336565888e-06, "loss": 4.2135, "step": 1352 }, { "epoch": 0.2315591305836043, "grad_norm": 164.35977172851562, "learning_rate": 7.68967484312607e-06, "loss": 7.3687, "step": 1353 }, { "epoch": 0.23173027554338524, "grad_norm": 20.39377784729004, "learning_rate": 7.695379349686253e-06, "loss": 1.6669, "step": 1354 }, { "epoch": 0.23190142050316617, "grad_norm": 33.71407699584961, "learning_rate": 7.701083856246435e-06, "loss": 4.5662, "step": 1355 }, { "epoch": 0.2320725654629471, "grad_norm": 9.964597702026367, "learning_rate": 7.706788362806616e-06, "loss": 2.2199, "step": 1356 }, { "epoch": 0.23224371042272804, "grad_norm": 41.83567810058594, "learning_rate": 7.7124928693668e-06, "loss": 5.3555, "step": 1357 }, { "epoch": 0.23241485538250897, "grad_norm": 19.700429916381836, "learning_rate": 7.718197375926981e-06, "loss": 1.6864, "step": 1358 }, { "epoch": 0.2325860003422899, "grad_norm": 32.94630432128906, "learning_rate": 7.723901882487165e-06, "loss": 3.5872, "step": 1359 }, { "epoch": 0.23275714530207087, "grad_norm": 26.41133689880371, "learning_rate": 7.729606389047348e-06, "loss": 3.5806, "step": 1360 }, { "epoch": 0.2329282902618518, "grad_norm": 17.184593200683594, "learning_rate": 7.735310895607532e-06, "loss": 1.6527, "step": 1361 }, { "epoch": 0.23309943522163273, "grad_norm": 11.024751663208008, "learning_rate": 7.741015402167713e-06, "loss": 1.2203, "step": 1362 }, { "epoch": 0.23327058018141367, "grad_norm": 35.2708625793457, "learning_rate": 7.746719908727895e-06, "loss": 4.5728, "step": 1363 }, { "epoch": 0.2334417251411946, "grad_norm": 35.836387634277344, 
"learning_rate": 7.752424415288078e-06, "loss": 4.9165, "step": 1364 }, { "epoch": 0.23361287010097553, "grad_norm": 24.741012573242188, "learning_rate": 7.75812892184826e-06, "loss": 2.2991, "step": 1365 }, { "epoch": 0.23378401506075647, "grad_norm": 41.604007720947266, "learning_rate": 7.763833428408443e-06, "loss": 4.7384, "step": 1366 }, { "epoch": 0.2339551600205374, "grad_norm": 37.068485260009766, "learning_rate": 7.769537934968625e-06, "loss": 4.1609, "step": 1367 }, { "epoch": 0.23412630498031833, "grad_norm": 31.635995864868164, "learning_rate": 7.775242441528808e-06, "loss": 3.6394, "step": 1368 }, { "epoch": 0.23429744994009927, "grad_norm": 36.181602478027344, "learning_rate": 7.78094694808899e-06, "loss": 3.9604, "step": 1369 }, { "epoch": 0.2344685948998802, "grad_norm": 34.47708511352539, "learning_rate": 7.786651454649172e-06, "loss": 4.4621, "step": 1370 }, { "epoch": 0.23463973985966113, "grad_norm": 36.583919525146484, "learning_rate": 7.792355961209355e-06, "loss": 5.4214, "step": 1371 }, { "epoch": 0.23481088481944207, "grad_norm": 139.80113220214844, "learning_rate": 7.798060467769539e-06, "loss": 7.582, "step": 1372 }, { "epoch": 0.234982029779223, "grad_norm": 10.627038955688477, "learning_rate": 7.803764974329722e-06, "loss": 1.1265, "step": 1373 }, { "epoch": 0.23515317473900393, "grad_norm": 56.01224899291992, "learning_rate": 7.809469480889904e-06, "loss": 9.2401, "step": 1374 }, { "epoch": 0.23532431969878487, "grad_norm": 13.42536449432373, "learning_rate": 7.815173987450085e-06, "loss": 1.3022, "step": 1375 }, { "epoch": 0.2354954646585658, "grad_norm": 34.816341400146484, "learning_rate": 7.820878494010269e-06, "loss": 4.6249, "step": 1376 }, { "epoch": 0.23566660961834673, "grad_norm": 13.037670135498047, "learning_rate": 7.82658300057045e-06, "loss": 1.5747, "step": 1377 }, { "epoch": 0.23583775457812767, "grad_norm": 38.446537017822266, "learning_rate": 7.832287507130634e-06, "loss": 4.9983, "step": 1378 }, { "epoch": 
0.2360088995379086, "grad_norm": 32.81908416748047, "learning_rate": 7.837992013690815e-06, "loss": 3.4363, "step": 1379 }, { "epoch": 0.23618004449768953, "grad_norm": 12.17697525024414, "learning_rate": 7.843696520250999e-06, "loss": 1.6211, "step": 1380 }, { "epoch": 0.23635118945747047, "grad_norm": 35.46131896972656, "learning_rate": 7.84940102681118e-06, "loss": 4.8981, "step": 1381 }, { "epoch": 0.2365223344172514, "grad_norm": 29.793787002563477, "learning_rate": 7.855105533371362e-06, "loss": 3.5648, "step": 1382 }, { "epoch": 0.23669347937703233, "grad_norm": 14.550475120544434, "learning_rate": 7.860810039931547e-06, "loss": 1.6714, "step": 1383 }, { "epoch": 0.23686462433681327, "grad_norm": 36.01753234863281, "learning_rate": 7.866514546491729e-06, "loss": 4.936, "step": 1384 }, { "epoch": 0.23703576929659423, "grad_norm": 21.261749267578125, "learning_rate": 7.872219053051912e-06, "loss": 2.3239, "step": 1385 }, { "epoch": 0.23720691425637516, "grad_norm": 160.96620178222656, "learning_rate": 7.877923559612094e-06, "loss": 7.9267, "step": 1386 }, { "epoch": 0.2373780592161561, "grad_norm": 34.994293212890625, "learning_rate": 7.883628066172276e-06, "loss": 4.6021, "step": 1387 }, { "epoch": 0.23754920417593703, "grad_norm": 32.08713912963867, "learning_rate": 7.889332572732459e-06, "loss": 4.0803, "step": 1388 }, { "epoch": 0.23772034913571796, "grad_norm": 36.49545669555664, "learning_rate": 7.89503707929264e-06, "loss": 4.4858, "step": 1389 }, { "epoch": 0.2378914940954989, "grad_norm": 146.2379608154297, "learning_rate": 7.900741585852824e-06, "loss": 8.1082, "step": 1390 }, { "epoch": 0.23806263905527983, "grad_norm": 31.705169677734375, "learning_rate": 7.906446092413006e-06, "loss": 4.1572, "step": 1391 }, { "epoch": 0.23823378401506076, "grad_norm": 13.439140319824219, "learning_rate": 7.91215059897319e-06, "loss": 1.1091, "step": 1392 }, { "epoch": 0.2384049289748417, "grad_norm": 51.37181854248047, "learning_rate": 7.91785510553337e-06, 
"loss": 9.8544, "step": 1393 }, { "epoch": 0.23857607393462263, "grad_norm": 16.763200759887695, "learning_rate": 7.923559612093553e-06, "loss": 1.4605, "step": 1394 }, { "epoch": 0.23874721889440356, "grad_norm": 32.19613265991211, "learning_rate": 7.929264118653738e-06, "loss": 4.0605, "step": 1395 }, { "epoch": 0.2389183638541845, "grad_norm": 36.1611442565918, "learning_rate": 7.93496862521392e-06, "loss": 4.1027, "step": 1396 }, { "epoch": 0.23908950881396543, "grad_norm": 36.234344482421875, "learning_rate": 7.940673131774103e-06, "loss": 5.0933, "step": 1397 }, { "epoch": 0.23926065377374636, "grad_norm": 39.589111328125, "learning_rate": 7.946377638334284e-06, "loss": 5.4176, "step": 1398 }, { "epoch": 0.2394317987335273, "grad_norm": 13.162062644958496, "learning_rate": 7.952082144894468e-06, "loss": 1.3262, "step": 1399 }, { "epoch": 0.23960294369330823, "grad_norm": 11.512036323547363, "learning_rate": 7.95778665145465e-06, "loss": 2.8916, "step": 1400 }, { "epoch": 0.23977408865308916, "grad_norm": 30.82523536682129, "learning_rate": 7.963491158014831e-06, "loss": 3.7983, "step": 1401 }, { "epoch": 0.2399452336128701, "grad_norm": 9.881488800048828, "learning_rate": 7.969195664575014e-06, "loss": 1.6009, "step": 1402 }, { "epoch": 0.24011637857265103, "grad_norm": 26.221534729003906, "learning_rate": 7.974900171135196e-06, "loss": 3.2459, "step": 1403 }, { "epoch": 0.24028752353243196, "grad_norm": 34.7869987487793, "learning_rate": 7.98060467769538e-06, "loss": 4.2736, "step": 1404 }, { "epoch": 0.2404586684922129, "grad_norm": 42.81889343261719, "learning_rate": 7.986309184255561e-06, "loss": 6.0254, "step": 1405 }, { "epoch": 0.24062981345199383, "grad_norm": 35.25808334350586, "learning_rate": 7.992013690815745e-06, "loss": 3.8331, "step": 1406 }, { "epoch": 0.24080095841177476, "grad_norm": 29.81654167175293, "learning_rate": 7.997718197375928e-06, "loss": 3.3841, "step": 1407 }, { "epoch": 0.2409721033715557, "grad_norm": 34.251243591308594, 
"learning_rate": 8.00342270393611e-06, "loss": 4.8157, "step": 1408 }, { "epoch": 0.24114324833133663, "grad_norm": 31.04636573791504, "learning_rate": 8.009127210496293e-06, "loss": 3.4431, "step": 1409 }, { "epoch": 0.2413143932911176, "grad_norm": 33.0612678527832, "learning_rate": 8.014831717056475e-06, "loss": 3.8054, "step": 1410 }, { "epoch": 0.24148553825089852, "grad_norm": 25.215789794921875, "learning_rate": 8.020536223616658e-06, "loss": 3.2052, "step": 1411 }, { "epoch": 0.24165668321067946, "grad_norm": 22.657257080078125, "learning_rate": 8.02624073017684e-06, "loss": 2.5621, "step": 1412 }, { "epoch": 0.2418278281704604, "grad_norm": 32.54667282104492, "learning_rate": 8.031945236737021e-06, "loss": 4.1257, "step": 1413 }, { "epoch": 0.24199897313024132, "grad_norm": 14.109042167663574, "learning_rate": 8.037649743297205e-06, "loss": 1.2616, "step": 1414 }, { "epoch": 0.24217011809002226, "grad_norm": 35.718116760253906, "learning_rate": 8.043354249857387e-06, "loss": 5.263, "step": 1415 }, { "epoch": 0.2423412630498032, "grad_norm": 10.830004692077637, "learning_rate": 8.04905875641757e-06, "loss": 1.6628, "step": 1416 }, { "epoch": 0.24251240800958412, "grad_norm": 21.519893646240234, "learning_rate": 8.054763262977753e-06, "loss": 2.2681, "step": 1417 }, { "epoch": 0.24268355296936506, "grad_norm": 16.527233123779297, "learning_rate": 8.060467769537937e-06, "loss": 1.7274, "step": 1418 }, { "epoch": 0.242854697929146, "grad_norm": 17.97334098815918, "learning_rate": 8.066172276098118e-06, "loss": 1.4341, "step": 1419 }, { "epoch": 0.24302584288892692, "grad_norm": 38.63325500488281, "learning_rate": 8.0718767826583e-06, "loss": 5.4521, "step": 1420 }, { "epoch": 0.24319698784870786, "grad_norm": 37.572818756103516, "learning_rate": 8.077581289218483e-06, "loss": 4.057, "step": 1421 }, { "epoch": 0.2433681328084888, "grad_norm": 36.495025634765625, "learning_rate": 8.083285795778665e-06, "loss": 5.3841, "step": 1422 }, { "epoch": 
0.24353927776826972, "grad_norm": 46.322486877441406, "learning_rate": 8.088990302338848e-06, "loss": 9.2447, "step": 1423 }, { "epoch": 0.24371042272805066, "grad_norm": 32.26517868041992, "learning_rate": 8.09469480889903e-06, "loss": 3.4902, "step": 1424 }, { "epoch": 0.2438815676878316, "grad_norm": 29.286020278930664, "learning_rate": 8.100399315459212e-06, "loss": 3.6562, "step": 1425 }, { "epoch": 0.24405271264761252, "grad_norm": 9.768603324890137, "learning_rate": 8.106103822019395e-06, "loss": 1.0808, "step": 1426 }, { "epoch": 0.24422385760739346, "grad_norm": 40.53557205200195, "learning_rate": 8.111808328579577e-06, "loss": 5.3038, "step": 1427 }, { "epoch": 0.2443950025671744, "grad_norm": 36.29978561401367, "learning_rate": 8.11751283513976e-06, "loss": 4.9487, "step": 1428 }, { "epoch": 0.24456614752695532, "grad_norm": 50.365440368652344, "learning_rate": 8.123217341699944e-06, "loss": 9.1753, "step": 1429 }, { "epoch": 0.24473729248673626, "grad_norm": 25.204608917236328, "learning_rate": 8.128921848260127e-06, "loss": 3.06, "step": 1430 }, { "epoch": 0.2449084374465172, "grad_norm": 36.821929931640625, "learning_rate": 8.134626354820309e-06, "loss": 4.2367, "step": 1431 }, { "epoch": 0.24507958240629812, "grad_norm": 9.532563209533691, "learning_rate": 8.14033086138049e-06, "loss": 1.0511, "step": 1432 }, { "epoch": 0.24525072736607906, "grad_norm": 31.35403060913086, "learning_rate": 8.146035367940674e-06, "loss": 4.1655, "step": 1433 }, { "epoch": 0.24542187232586, "grad_norm": 29.057531356811523, "learning_rate": 8.151739874500855e-06, "loss": 3.6622, "step": 1434 }, { "epoch": 0.24559301728564092, "grad_norm": 18.69387435913086, "learning_rate": 8.157444381061039e-06, "loss": 1.6006, "step": 1435 }, { "epoch": 0.24576416224542189, "grad_norm": 27.337491989135742, "learning_rate": 8.16314888762122e-06, "loss": 2.1133, "step": 1436 }, { "epoch": 0.24593530720520282, "grad_norm": 59.810035705566406, "learning_rate": 8.168853394181404e-06, 
"loss": 9.2893, "step": 1437 }, { "epoch": 0.24610645216498375, "grad_norm": 34.85076141357422, "learning_rate": 8.174557900741586e-06, "loss": 4.76, "step": 1438 }, { "epoch": 0.24627759712476469, "grad_norm": 16.229951858520508, "learning_rate": 8.180262407301767e-06, "loss": 1.111, "step": 1439 }, { "epoch": 0.24644874208454562, "grad_norm": 191.14859008789062, "learning_rate": 8.185966913861952e-06, "loss": 8.6606, "step": 1440 }, { "epoch": 0.24661988704432655, "grad_norm": 25.192026138305664, "learning_rate": 8.191671420422134e-06, "loss": 2.2213, "step": 1441 }, { "epoch": 0.24679103200410749, "grad_norm": 16.577152252197266, "learning_rate": 8.197375926982317e-06, "loss": 1.4564, "step": 1442 }, { "epoch": 0.24696217696388842, "grad_norm": 37.47216796875, "learning_rate": 8.203080433542499e-06, "loss": 4.9652, "step": 1443 }, { "epoch": 0.24713332192366935, "grad_norm": 33.50614547729492, "learning_rate": 8.20878494010268e-06, "loss": 3.8217, "step": 1444 }, { "epoch": 0.24730446688345029, "grad_norm": 35.54981994628906, "learning_rate": 8.214489446662864e-06, "loss": 5.0781, "step": 1445 }, { "epoch": 0.24747561184323122, "grad_norm": 29.486570358276367, "learning_rate": 8.220193953223046e-06, "loss": 3.4324, "step": 1446 }, { "epoch": 0.24764675680301215, "grad_norm": 23.952808380126953, "learning_rate": 8.22589845978323e-06, "loss": 2.9791, "step": 1447 }, { "epoch": 0.24781790176279309, "grad_norm": 22.885963439941406, "learning_rate": 8.231602966343411e-06, "loss": 2.1029, "step": 1448 }, { "epoch": 0.24798904672257402, "grad_norm": 38.23826217651367, "learning_rate": 8.237307472903594e-06, "loss": 5.1107, "step": 1449 }, { "epoch": 0.24816019168235495, "grad_norm": 21.183773040771484, "learning_rate": 8.243011979463776e-06, "loss": 2.6462, "step": 1450 }, { "epoch": 0.24833133664213589, "grad_norm": 11.436287879943848, "learning_rate": 8.248716486023958e-06, "loss": 1.139, "step": 1451 }, { "epoch": 0.24850248160191682, "grad_norm": 21.1058349609375, 
"learning_rate": 8.254420992584143e-06, "loss": 2.6237, "step": 1452 }, { "epoch": 0.24867362656169775, "grad_norm": 29.661510467529297, "learning_rate": 8.260125499144324e-06, "loss": 3.9416, "step": 1453 }, { "epoch": 0.24884477152147869, "grad_norm": 25.654918670654297, "learning_rate": 8.265830005704508e-06, "loss": 2.9109, "step": 1454 }, { "epoch": 0.24901591648125962, "grad_norm": 29.254196166992188, "learning_rate": 8.27153451226469e-06, "loss": 3.9703, "step": 1455 }, { "epoch": 0.24918706144104055, "grad_norm": 15.34985065460205, "learning_rate": 8.277239018824871e-06, "loss": 1.277, "step": 1456 }, { "epoch": 0.24935820640082149, "grad_norm": 20.940813064575195, "learning_rate": 8.282943525385055e-06, "loss": 2.8225, "step": 1457 }, { "epoch": 0.24952935136060242, "grad_norm": 156.33163452148438, "learning_rate": 8.288648031945236e-06, "loss": 6.8667, "step": 1458 }, { "epoch": 0.24970049632038335, "grad_norm": 142.04833984375, "learning_rate": 8.29435253850542e-06, "loss": 7.7845, "step": 1459 }, { "epoch": 0.24987164128016429, "grad_norm": 52.80269241333008, "learning_rate": 8.300057045065601e-06, "loss": 9.2945, "step": 1460 }, { "epoch": 0.25004278623994525, "grad_norm": 36.25229263305664, "learning_rate": 8.305761551625785e-06, "loss": 4.1385, "step": 1461 }, { "epoch": 0.2502139311997262, "grad_norm": 32.63280487060547, "learning_rate": 8.311466058185966e-06, "loss": 4.9526, "step": 1462 }, { "epoch": 0.2503850761595071, "grad_norm": 36.09181213378906, "learning_rate": 8.31717056474615e-06, "loss": 4.9655, "step": 1463 }, { "epoch": 0.25055622111928805, "grad_norm": 13.666475296020508, "learning_rate": 8.322875071306333e-06, "loss": 1.2171, "step": 1464 }, { "epoch": 0.250727366079069, "grad_norm": 21.431262969970703, "learning_rate": 8.328579577866515e-06, "loss": 2.0253, "step": 1465 }, { "epoch": 0.2508985110388499, "grad_norm": 34.866493225097656, "learning_rate": 8.334284084426698e-06, "loss": 4.7963, "step": 1466 }, { "epoch": 
0.25106965599863085, "grad_norm": 28.299697875976562, "learning_rate": 8.33998859098688e-06, "loss": 3.2393, "step": 1467 }, { "epoch": 0.2512408009584118, "grad_norm": 30.702220916748047, "learning_rate": 8.345693097547063e-06, "loss": 4.459, "step": 1468 }, { "epoch": 0.2514119459181927, "grad_norm": 35.572662353515625, "learning_rate": 8.351397604107245e-06, "loss": 4.0362, "step": 1469 }, { "epoch": 0.25158309087797365, "grad_norm": 31.228361129760742, "learning_rate": 8.357102110667427e-06, "loss": 3.7291, "step": 1470 }, { "epoch": 0.2517542358377546, "grad_norm": 158.43309020996094, "learning_rate": 8.36280661722761e-06, "loss": 7.5395, "step": 1471 }, { "epoch": 0.2519253807975355, "grad_norm": 26.111873626708984, "learning_rate": 8.368511123787792e-06, "loss": 3.2816, "step": 1472 }, { "epoch": 0.25209652575731645, "grad_norm": 152.1773681640625, "learning_rate": 8.374215630347975e-06, "loss": 9.2757, "step": 1473 }, { "epoch": 0.2522676707170974, "grad_norm": 28.91309928894043, "learning_rate": 8.379920136908157e-06, "loss": 3.8, "step": 1474 }, { "epoch": 0.2524388156768783, "grad_norm": 138.71820068359375, "learning_rate": 8.38562464346834e-06, "loss": 8.3701, "step": 1475 }, { "epoch": 0.25260996063665925, "grad_norm": 10.94738483428955, "learning_rate": 8.391329150028524e-06, "loss": 1.0987, "step": 1476 }, { "epoch": 0.2527811055964402, "grad_norm": 33.45675277709961, "learning_rate": 8.397033656588705e-06, "loss": 3.8679, "step": 1477 }, { "epoch": 0.2529522505562211, "grad_norm": 30.219728469848633, "learning_rate": 8.402738163148889e-06, "loss": 3.7668, "step": 1478 }, { "epoch": 0.25312339551600205, "grad_norm": 153.4755859375, "learning_rate": 8.40844266970907e-06, "loss": 8.493, "step": 1479 }, { "epoch": 0.253294540475783, "grad_norm": 27.030277252197266, "learning_rate": 8.414147176269254e-06, "loss": 3.6373, "step": 1480 }, { "epoch": 0.2534656854355639, "grad_norm": 26.931581497192383, "learning_rate": 8.419851682829435e-06, "loss": 2.4114, 
"step": 1481 }, { "epoch": 0.25363683039534485, "grad_norm": 33.86345672607422, "learning_rate": 8.425556189389617e-06, "loss": 4.18, "step": 1482 }, { "epoch": 0.2538079753551258, "grad_norm": 40.67789840698242, "learning_rate": 8.4312606959498e-06, "loss": 5.2501, "step": 1483 }, { "epoch": 0.2539791203149067, "grad_norm": 11.627734184265137, "learning_rate": 8.436965202509982e-06, "loss": 1.2352, "step": 1484 }, { "epoch": 0.25415026527468765, "grad_norm": 27.1390438079834, "learning_rate": 8.442669709070165e-06, "loss": 2.4447, "step": 1485 }, { "epoch": 0.2543214102344686, "grad_norm": 33.907615661621094, "learning_rate": 8.448374215630349e-06, "loss": 5.7028, "step": 1486 }, { "epoch": 0.2544925551942495, "grad_norm": 34.770687103271484, "learning_rate": 8.45407872219053e-06, "loss": 5.4022, "step": 1487 }, { "epoch": 0.25466370015403045, "grad_norm": 87.67970275878906, "learning_rate": 8.459783228750714e-06, "loss": 7.2429, "step": 1488 }, { "epoch": 0.2548348451138114, "grad_norm": 36.1263313293457, "learning_rate": 8.465487735310896e-06, "loss": 4.7788, "step": 1489 }, { "epoch": 0.2550059900735923, "grad_norm": 35.22165298461914, "learning_rate": 8.471192241871079e-06, "loss": 4.132, "step": 1490 }, { "epoch": 0.25517713503337325, "grad_norm": 28.420682907104492, "learning_rate": 8.47689674843126e-06, "loss": 3.6288, "step": 1491 }, { "epoch": 0.2553482799931542, "grad_norm": 36.37025451660156, "learning_rate": 8.482601254991444e-06, "loss": 5.1911, "step": 1492 }, { "epoch": 0.2555194249529351, "grad_norm": 40.647789001464844, "learning_rate": 8.488305761551626e-06, "loss": 5.5946, "step": 1493 }, { "epoch": 0.25569056991271605, "grad_norm": 19.504039764404297, "learning_rate": 8.494010268111807e-06, "loss": 1.7075, "step": 1494 }, { "epoch": 0.255861714872497, "grad_norm": 32.866695404052734, "learning_rate": 8.49971477467199e-06, "loss": 4.4763, "step": 1495 }, { "epoch": 0.2560328598322779, "grad_norm": 33.1104736328125, "learning_rate": 
8.505419281232172e-06, "loss": 4.4053, "step": 1496 }, { "epoch": 0.25620400479205885, "grad_norm": 22.860944747924805, "learning_rate": 8.511123787792358e-06, "loss": 2.5604, "step": 1497 }, { "epoch": 0.2563751497518398, "grad_norm": 34.79046630859375, "learning_rate": 8.51682829435254e-06, "loss": 4.993, "step": 1498 }, { "epoch": 0.25654629471162077, "grad_norm": 28.405912399291992, "learning_rate": 8.522532800912723e-06, "loss": 3.3138, "step": 1499 }, { "epoch": 0.2567174396714017, "grad_norm": 32.89986038208008, "learning_rate": 8.528237307472904e-06, "loss": 3.1908, "step": 1500 }, { "epoch": 0.25688858463118264, "grad_norm": 20.201610565185547, "learning_rate": 8.533941814033086e-06, "loss": 1.974, "step": 1501 }, { "epoch": 0.25705972959096357, "grad_norm": 32.933231353759766, "learning_rate": 8.53964632059327e-06, "loss": 4.8342, "step": 1502 }, { "epoch": 0.2572308745507445, "grad_norm": 25.67669105529785, "learning_rate": 8.545350827153451e-06, "loss": 2.8345, "step": 1503 }, { "epoch": 0.25740201951052544, "grad_norm": 50.461097717285156, "learning_rate": 8.551055333713634e-06, "loss": 6.9385, "step": 1504 }, { "epoch": 0.25757316447030637, "grad_norm": 32.42000198364258, "learning_rate": 8.556759840273816e-06, "loss": 3.4542, "step": 1505 }, { "epoch": 0.2577443094300873, "grad_norm": 29.946523666381836, "learning_rate": 8.562464346833998e-06, "loss": 3.2486, "step": 1506 }, { "epoch": 0.25791545438986824, "grad_norm": 17.451496124267578, "learning_rate": 8.568168853394181e-06, "loss": 1.4946, "step": 1507 }, { "epoch": 0.25808659934964917, "grad_norm": 30.164350509643555, "learning_rate": 8.573873359954363e-06, "loss": 3.8272, "step": 1508 }, { "epoch": 0.2582577443094301, "grad_norm": 26.747682571411133, "learning_rate": 8.579577866514548e-06, "loss": 3.0653, "step": 1509 }, { "epoch": 0.25842888926921104, "grad_norm": 20.9317626953125, "learning_rate": 8.58528237307473e-06, "loss": 1.8431, "step": 1510 }, { "epoch": 0.25860003422899197, 
"grad_norm": 36.90618896484375, "learning_rate": 8.590986879634913e-06, "loss": 3.7371, "step": 1511 }, { "epoch": 0.2587711791887729, "grad_norm": 19.612281799316406, "learning_rate": 8.596691386195095e-06, "loss": 1.4799, "step": 1512 }, { "epoch": 0.25894232414855384, "grad_norm": 35.63535690307617, "learning_rate": 8.602395892755276e-06, "loss": 4.2458, "step": 1513 }, { "epoch": 0.25911346910833477, "grad_norm": 37.25559997558594, "learning_rate": 8.60810039931546e-06, "loss": 3.7735, "step": 1514 }, { "epoch": 0.2592846140681157, "grad_norm": 26.81685447692871, "learning_rate": 8.613804905875641e-06, "loss": 2.621, "step": 1515 }, { "epoch": 0.25945575902789664, "grad_norm": 22.918485641479492, "learning_rate": 8.619509412435825e-06, "loss": 1.6105, "step": 1516 }, { "epoch": 0.25962690398767757, "grad_norm": 12.06033992767334, "learning_rate": 8.625213918996006e-06, "loss": 1.1731, "step": 1517 }, { "epoch": 0.2597980489474585, "grad_norm": 35.15945053100586, "learning_rate": 8.63091842555619e-06, "loss": 3.8198, "step": 1518 }, { "epoch": 0.25996919390723944, "grad_norm": 13.90102767944336, "learning_rate": 8.636622932116372e-06, "loss": 2.736, "step": 1519 }, { "epoch": 0.26014033886702037, "grad_norm": 35.0964469909668, "learning_rate": 8.642327438676555e-06, "loss": 4.3737, "step": 1520 }, { "epoch": 0.2603114838268013, "grad_norm": 33.16070556640625, "learning_rate": 8.648031945236738e-06, "loss": 3.8065, "step": 1521 }, { "epoch": 0.26048262878658224, "grad_norm": 16.28618621826172, "learning_rate": 8.65373645179692e-06, "loss": 1.257, "step": 1522 }, { "epoch": 0.26065377374636317, "grad_norm": 28.174516677856445, "learning_rate": 8.659440958357103e-06, "loss": 3.7114, "step": 1523 }, { "epoch": 0.2608249187061441, "grad_norm": 26.44544792175293, "learning_rate": 8.665145464917285e-06, "loss": 2.829, "step": 1524 }, { "epoch": 0.26099606366592504, "grad_norm": 38.186378479003906, "learning_rate": 8.670849971477467e-06, "loss": 4.3011, "step": 1525 }, 
{ "epoch": 0.26116720862570597, "grad_norm": 206.24801635742188, "learning_rate": 8.67655447803765e-06, "loss": 9.2851, "step": 1526 }, { "epoch": 0.2613383535854869, "grad_norm": 33.12008285522461, "learning_rate": 8.682258984597832e-06, "loss": 4.3036, "step": 1527 }, { "epoch": 0.26150949854526784, "grad_norm": 136.57029724121094, "learning_rate": 8.687963491158015e-06, "loss": 8.4189, "step": 1528 }, { "epoch": 0.26168064350504877, "grad_norm": 40.36309051513672, "learning_rate": 8.693667997718197e-06, "loss": 5.4948, "step": 1529 }, { "epoch": 0.2618517884648297, "grad_norm": 19.74286651611328, "learning_rate": 8.69937250427838e-06, "loss": 2.0893, "step": 1530 }, { "epoch": 0.26202293342461064, "grad_norm": 33.62118148803711, "learning_rate": 8.705077010838562e-06, "loss": 3.796, "step": 1531 }, { "epoch": 0.26219407838439157, "grad_norm": 36.64006805419922, "learning_rate": 8.710781517398745e-06, "loss": 3.9848, "step": 1532 }, { "epoch": 0.2623652233441725, "grad_norm": 12.980084419250488, "learning_rate": 8.716486023958929e-06, "loss": 1.1166, "step": 1533 }, { "epoch": 0.26253636830395344, "grad_norm": 35.808021545410156, "learning_rate": 8.72219053051911e-06, "loss": 4.3018, "step": 1534 }, { "epoch": 0.26270751326373437, "grad_norm": 51.2911491394043, "learning_rate": 8.727895037079294e-06, "loss": 9.237, "step": 1535 }, { "epoch": 0.2628786582235153, "grad_norm": 26.75223731994629, "learning_rate": 8.733599543639475e-06, "loss": 3.3625, "step": 1536 }, { "epoch": 0.26304980318329624, "grad_norm": 81.07520294189453, "learning_rate": 8.739304050199659e-06, "loss": 7.5686, "step": 1537 }, { "epoch": 0.26322094814307717, "grad_norm": 37.027191162109375, "learning_rate": 8.74500855675984e-06, "loss": 3.7701, "step": 1538 }, { "epoch": 0.2633920931028581, "grad_norm": 47.393333435058594, "learning_rate": 8.750713063320022e-06, "loss": 9.0139, "step": 1539 }, { "epoch": 0.26356323806263904, "grad_norm": 34.1210823059082, "learning_rate": 
8.756417569880206e-06, "loss": 4.4995, "step": 1540 }, { "epoch": 0.26373438302241997, "grad_norm": 14.312548637390137, "learning_rate": 8.762122076440387e-06, "loss": 2.1827, "step": 1541 }, { "epoch": 0.2639055279822009, "grad_norm": 30.19961166381836, "learning_rate": 8.76782658300057e-06, "loss": 3.9737, "step": 1542 }, { "epoch": 0.26407667294198184, "grad_norm": 10.720991134643555, "learning_rate": 8.773531089560754e-06, "loss": 1.1108, "step": 1543 }, { "epoch": 0.26424781790176277, "grad_norm": 26.29660987854004, "learning_rate": 8.779235596120936e-06, "loss": 2.9509, "step": 1544 }, { "epoch": 0.2644189628615437, "grad_norm": 7.651371479034424, "learning_rate": 8.784940102681119e-06, "loss": 0.8929, "step": 1545 }, { "epoch": 0.26459010782132464, "grad_norm": 32.411407470703125, "learning_rate": 8.7906446092413e-06, "loss": 3.9279, "step": 1546 }, { "epoch": 0.26476125278110557, "grad_norm": 43.62602233886719, "learning_rate": 8.796349115801484e-06, "loss": 8.7932, "step": 1547 }, { "epoch": 0.2649323977408865, "grad_norm": 28.391075134277344, "learning_rate": 8.802053622361666e-06, "loss": 3.3049, "step": 1548 }, { "epoch": 0.26510354270066744, "grad_norm": 35.11864471435547, "learning_rate": 8.80775812892185e-06, "loss": 4.0323, "step": 1549 }, { "epoch": 0.2652746876604484, "grad_norm": 10.911874771118164, "learning_rate": 8.813462635482031e-06, "loss": 1.3744, "step": 1550 }, { "epoch": 0.26544583262022936, "grad_norm": 22.232980728149414, "learning_rate": 8.819167142042213e-06, "loss": 1.972, "step": 1551 }, { "epoch": 0.2656169775800103, "grad_norm": 171.640625, "learning_rate": 8.824871648602396e-06, "loss": 8.4712, "step": 1552 }, { "epoch": 0.2657881225397912, "grad_norm": 30.831897735595703, "learning_rate": 8.830576155162578e-06, "loss": 3.5869, "step": 1553 }, { "epoch": 0.26595926749957216, "grad_norm": 36.305782318115234, "learning_rate": 8.836280661722761e-06, "loss": 4.9009, "step": 1554 }, { "epoch": 0.2661304124593531, "grad_norm": 
44.463626861572266, "learning_rate": 8.841985168282944e-06, "loss": 4.6015, "step": 1555 }, { "epoch": 0.266301557419134, "grad_norm": 22.66800308227539, "learning_rate": 8.847689674843126e-06, "loss": 2.1498, "step": 1556 }, { "epoch": 0.26647270237891496, "grad_norm": 30.886274337768555, "learning_rate": 8.85339418140331e-06, "loss": 4.3322, "step": 1557 }, { "epoch": 0.2666438473386959, "grad_norm": 34.30126190185547, "learning_rate": 8.859098687963491e-06, "loss": 4.5378, "step": 1558 }, { "epoch": 0.2668149922984768, "grad_norm": 36.92926025390625, "learning_rate": 8.864803194523674e-06, "loss": 4.2903, "step": 1559 }, { "epoch": 0.26698613725825776, "grad_norm": 34.588077545166016, "learning_rate": 8.870507701083856e-06, "loss": 4.9088, "step": 1560 }, { "epoch": 0.2671572822180387, "grad_norm": 30.621044158935547, "learning_rate": 8.87621220764404e-06, "loss": 3.6051, "step": 1561 }, { "epoch": 0.2673284271778196, "grad_norm": 30.107677459716797, "learning_rate": 8.881916714204221e-06, "loss": 3.4027, "step": 1562 }, { "epoch": 0.26749957213760056, "grad_norm": 16.614532470703125, "learning_rate": 8.887621220764403e-06, "loss": 1.5846, "step": 1563 }, { "epoch": 0.2676707170973815, "grad_norm": 35.577842712402344, "learning_rate": 8.893325727324586e-06, "loss": 4.2335, "step": 1564 }, { "epoch": 0.2678418620571624, "grad_norm": 33.13545227050781, "learning_rate": 8.899030233884768e-06, "loss": 4.6539, "step": 1565 }, { "epoch": 0.26801300701694336, "grad_norm": 170.64297485351562, "learning_rate": 8.904734740444953e-06, "loss": 9.3362, "step": 1566 }, { "epoch": 0.2681841519767243, "grad_norm": 12.3065185546875, "learning_rate": 8.910439247005135e-06, "loss": 1.573, "step": 1567 }, { "epoch": 0.2683552969365052, "grad_norm": 38.08529281616211, "learning_rate": 8.916143753565318e-06, "loss": 3.7314, "step": 1568 }, { "epoch": 0.26852644189628616, "grad_norm": 169.76089477539062, "learning_rate": 8.9218482601255e-06, "loss": 9.6942, "step": 1569 }, { "epoch": 
0.2686975868560671, "grad_norm": 38.42169952392578, "learning_rate": 8.927552766685681e-06, "loss": 5.3158, "step": 1570 }, { "epoch": 0.268868731815848, "grad_norm": 14.410723686218262, "learning_rate": 8.933257273245865e-06, "loss": 1.2377, "step": 1571 }, { "epoch": 0.26903987677562896, "grad_norm": 52.682533264160156, "learning_rate": 8.938961779806047e-06, "loss": 6.516, "step": 1572 }, { "epoch": 0.2692110217354099, "grad_norm": 34.07759094238281, "learning_rate": 8.94466628636623e-06, "loss": 4.013, "step": 1573 }, { "epoch": 0.26938216669519083, "grad_norm": 29.74109649658203, "learning_rate": 8.950370792926412e-06, "loss": 3.4177, "step": 1574 }, { "epoch": 0.26955331165497176, "grad_norm": 35.098876953125, "learning_rate": 8.956075299486593e-06, "loss": 4.1055, "step": 1575 }, { "epoch": 0.2697244566147527, "grad_norm": 50.082366943359375, "learning_rate": 8.961779806046777e-06, "loss": 8.4876, "step": 1576 }, { "epoch": 0.26989560157453363, "grad_norm": 116.58244323730469, "learning_rate": 8.96748431260696e-06, "loss": 7.8558, "step": 1577 }, { "epoch": 0.27006674653431456, "grad_norm": 32.75837326049805, "learning_rate": 8.973188819167143e-06, "loss": 3.8977, "step": 1578 }, { "epoch": 0.2702378914940955, "grad_norm": 13.686226844787598, "learning_rate": 8.978893325727325e-06, "loss": 1.5984, "step": 1579 }, { "epoch": 0.27040903645387643, "grad_norm": 31.057418823242188, "learning_rate": 8.984597832287508e-06, "loss": 4.2033, "step": 1580 }, { "epoch": 0.27058018141365736, "grad_norm": 31.405447006225586, "learning_rate": 8.99030233884769e-06, "loss": 3.2895, "step": 1581 }, { "epoch": 0.2707513263734383, "grad_norm": 29.978918075561523, "learning_rate": 8.996006845407872e-06, "loss": 4.0648, "step": 1582 }, { "epoch": 0.27092247133321923, "grad_norm": 11.317312240600586, "learning_rate": 9.001711351968055e-06, "loss": 0.9835, "step": 1583 }, { "epoch": 0.27109361629300016, "grad_norm": 17.877771377563477, "learning_rate": 9.007415858528237e-06, 
"loss": 1.4293, "step": 1584 }, { "epoch": 0.2712647612527811, "grad_norm": 26.353673934936523, "learning_rate": 9.01312036508842e-06, "loss": 2.6549, "step": 1585 }, { "epoch": 0.27143590621256203, "grad_norm": 31.735876083374023, "learning_rate": 9.018824871648602e-06, "loss": 3.9997, "step": 1586 }, { "epoch": 0.27160705117234296, "grad_norm": 35.91917037963867, "learning_rate": 9.024529378208785e-06, "loss": 4.2824, "step": 1587 }, { "epoch": 0.2717781961321239, "grad_norm": 32.27674865722656, "learning_rate": 9.030233884768967e-06, "loss": 4.0964, "step": 1588 }, { "epoch": 0.27194934109190483, "grad_norm": 37.242549896240234, "learning_rate": 9.03593839132915e-06, "loss": 4.4567, "step": 1589 }, { "epoch": 0.27212048605168576, "grad_norm": 15.34211540222168, "learning_rate": 9.041642897889334e-06, "loss": 1.1567, "step": 1590 }, { "epoch": 0.2722916310114667, "grad_norm": 35.38195037841797, "learning_rate": 9.047347404449515e-06, "loss": 4.7975, "step": 1591 }, { "epoch": 0.27246277597124763, "grad_norm": 29.104900360107422, "learning_rate": 9.053051911009699e-06, "loss": 3.1354, "step": 1592 }, { "epoch": 0.27263392093102856, "grad_norm": 15.004528999328613, "learning_rate": 9.05875641756988e-06, "loss": 1.1248, "step": 1593 }, { "epoch": 0.2728050658908095, "grad_norm": 26.269655227661133, "learning_rate": 9.064460924130062e-06, "loss": 2.0743, "step": 1594 }, { "epoch": 0.27297621085059043, "grad_norm": 19.79959487915039, "learning_rate": 9.070165430690246e-06, "loss": 1.3031, "step": 1595 }, { "epoch": 0.27314735581037136, "grad_norm": 43.51731491088867, "learning_rate": 9.075869937250427e-06, "loss": 4.4293, "step": 1596 }, { "epoch": 0.2733185007701523, "grad_norm": 7.138434410095215, "learning_rate": 9.08157444381061e-06, "loss": 0.8485, "step": 1597 }, { "epoch": 0.27348964572993323, "grad_norm": 32.309593200683594, "learning_rate": 9.087278950370792e-06, "loss": 3.4497, "step": 1598 }, { "epoch": 0.27366079068971416, "grad_norm": 24.805715560913086, 
"learning_rate": 9.092983456930976e-06, "loss": 2.9256, "step": 1599 }, { "epoch": 0.2738319356494951, "grad_norm": 61.22898483276367, "learning_rate": 9.098687963491159e-06, "loss": 5.9283, "step": 1600 }, { "epoch": 0.2740030806092761, "grad_norm": 29.417680740356445, "learning_rate": 9.10439247005134e-06, "loss": 3.8084, "step": 1601 }, { "epoch": 0.274174225569057, "grad_norm": 34.00372314453125, "learning_rate": 9.110096976611524e-06, "loss": 3.4933, "step": 1602 }, { "epoch": 0.27434537052883795, "grad_norm": 14.374422073364258, "learning_rate": 9.115801483171706e-06, "loss": 1.4626, "step": 1603 }, { "epoch": 0.2745165154886189, "grad_norm": 12.729880332946777, "learning_rate": 9.12150598973189e-06, "loss": 1.1151, "step": 1604 }, { "epoch": 0.2746876604483998, "grad_norm": 17.94257164001465, "learning_rate": 9.127210496292071e-06, "loss": 1.3846, "step": 1605 }, { "epoch": 0.27485880540818075, "grad_norm": 38.29545974731445, "learning_rate": 9.132915002852253e-06, "loss": 4.5905, "step": 1606 }, { "epoch": 0.2750299503679617, "grad_norm": 35.37318420410156, "learning_rate": 9.138619509412436e-06, "loss": 4.3784, "step": 1607 }, { "epoch": 0.2752010953277426, "grad_norm": 35.77292251586914, "learning_rate": 9.144324015972618e-06, "loss": 3.315, "step": 1608 }, { "epoch": 0.27537224028752355, "grad_norm": 38.70093536376953, "learning_rate": 9.150028522532801e-06, "loss": 5.4718, "step": 1609 }, { "epoch": 0.2755433852473045, "grad_norm": 185.0310516357422, "learning_rate": 9.155733029092983e-06, "loss": 7.5009, "step": 1610 }, { "epoch": 0.2757145302070854, "grad_norm": 28.145288467407227, "learning_rate": 9.161437535653166e-06, "loss": 2.8764, "step": 1611 }, { "epoch": 0.27588567516686635, "grad_norm": 7.594282150268555, "learning_rate": 9.16714204221335e-06, "loss": 0.8713, "step": 1612 }, { "epoch": 0.2760568201266473, "grad_norm": 32.899845123291016, "learning_rate": 9.172846548773531e-06, "loss": 4.6094, "step": 1613 }, { "epoch": 0.2762279650864282, 
"grad_norm": 39.75630569458008, "learning_rate": 9.178551055333715e-06, "loss": 4.5632, "step": 1614 }, { "epoch": 0.27639911004620915, "grad_norm": 29.607851028442383, "learning_rate": 9.184255561893896e-06, "loss": 2.9606, "step": 1615 }, { "epoch": 0.2765702550059901, "grad_norm": 76.37677001953125, "learning_rate": 9.18996006845408e-06, "loss": 7.355, "step": 1616 }, { "epoch": 0.276741399965771, "grad_norm": 22.215526580810547, "learning_rate": 9.195664575014261e-06, "loss": 2.8241, "step": 1617 }, { "epoch": 0.27691254492555195, "grad_norm": 9.465276718139648, "learning_rate": 9.201369081574445e-06, "loss": 0.9882, "step": 1618 }, { "epoch": 0.2770836898853329, "grad_norm": 27.726600646972656, "learning_rate": 9.207073588134626e-06, "loss": 3.238, "step": 1619 }, { "epoch": 0.2772548348451138, "grad_norm": 35.69710922241211, "learning_rate": 9.212778094694808e-06, "loss": 4.4113, "step": 1620 }, { "epoch": 0.27742597980489475, "grad_norm": 34.97329330444336, "learning_rate": 9.218482601254991e-06, "loss": 5.005, "step": 1621 }, { "epoch": 0.2775971247646757, "grad_norm": 18.749282836914062, "learning_rate": 9.224187107815173e-06, "loss": 1.7009, "step": 1622 }, { "epoch": 0.2777682697244566, "grad_norm": 130.61004638671875, "learning_rate": 9.229891614375358e-06, "loss": 7.8661, "step": 1623 }, { "epoch": 0.27793941468423755, "grad_norm": 12.980770111083984, "learning_rate": 9.23559612093554e-06, "loss": 1.1125, "step": 1624 }, { "epoch": 0.2781105596440185, "grad_norm": 46.32781219482422, "learning_rate": 9.241300627495722e-06, "loss": 8.7552, "step": 1625 }, { "epoch": 0.2782817046037994, "grad_norm": 101.5696029663086, "learning_rate": 9.247005134055905e-06, "loss": 7.1054, "step": 1626 }, { "epoch": 0.27845284956358035, "grad_norm": 22.125795364379883, "learning_rate": 9.252709640616087e-06, "loss": 1.7911, "step": 1627 }, { "epoch": 0.2786239945233613, "grad_norm": 34.277095794677734, "learning_rate": 9.25841414717627e-06, "loss": 4.438, "step": 1628 }, 
{ "epoch": 0.2787951394831422, "grad_norm": 22.72269058227539, "learning_rate": 9.264118653736452e-06, "loss": 1.7455, "step": 1629 }, { "epoch": 0.27896628444292315, "grad_norm": 30.11455726623535, "learning_rate": 9.269823160296635e-06, "loss": 3.3549, "step": 1630 }, { "epoch": 0.2791374294027041, "grad_norm": 34.13120651245117, "learning_rate": 9.275527666856817e-06, "loss": 3.7081, "step": 1631 }, { "epoch": 0.279308574362485, "grad_norm": 8.457001686096191, "learning_rate": 9.281232173416998e-06, "loss": 0.9564, "step": 1632 }, { "epoch": 0.27947971932226595, "grad_norm": 38.574615478515625, "learning_rate": 9.286936679977182e-06, "loss": 4.3973, "step": 1633 }, { "epoch": 0.2796508642820469, "grad_norm": 11.158347129821777, "learning_rate": 9.292641186537364e-06, "loss": 0.9696, "step": 1634 }, { "epoch": 0.2798220092418278, "grad_norm": 11.847931861877441, "learning_rate": 9.298345693097549e-06, "loss": 1.5377, "step": 1635 }, { "epoch": 0.27999315420160875, "grad_norm": 11.096319198608398, "learning_rate": 9.30405019965773e-06, "loss": 1.136, "step": 1636 }, { "epoch": 0.2801642991613897, "grad_norm": 36.63529586791992, "learning_rate": 9.309754706217914e-06, "loss": 5.2998, "step": 1637 }, { "epoch": 0.2803354441211706, "grad_norm": 30.421175003051758, "learning_rate": 9.315459212778095e-06, "loss": 3.5562, "step": 1638 }, { "epoch": 0.28050658908095155, "grad_norm": 34.89402770996094, "learning_rate": 9.321163719338277e-06, "loss": 4.9255, "step": 1639 }, { "epoch": 0.2806777340407325, "grad_norm": 28.486478805541992, "learning_rate": 9.32686822589846e-06, "loss": 3.4583, "step": 1640 }, { "epoch": 0.2808488790005134, "grad_norm": 7.498641490936279, "learning_rate": 9.332572732458642e-06, "loss": 0.8123, "step": 1641 }, { "epoch": 0.28102002396029435, "grad_norm": 47.50094223022461, "learning_rate": 9.338277239018825e-06, "loss": 7.894, "step": 1642 }, { "epoch": 0.2811911689200753, "grad_norm": 62.95503616333008, "learning_rate": 9.343981745579007e-06, 
"loss": 6.7316, "step": 1643 }, { "epoch": 0.2813623138798562, "grad_norm": 26.29498291015625, "learning_rate": 9.349686252139189e-06, "loss": 2.9299, "step": 1644 }, { "epoch": 0.28153345883963715, "grad_norm": 13.663917541503906, "learning_rate": 9.355390758699372e-06, "loss": 1.6658, "step": 1645 }, { "epoch": 0.2817046037994181, "grad_norm": 31.745132446289062, "learning_rate": 9.361095265259556e-06, "loss": 4.9097, "step": 1646 }, { "epoch": 0.281875748759199, "grad_norm": 16.757953643798828, "learning_rate": 9.366799771819739e-06, "loss": 1.4769, "step": 1647 }, { "epoch": 0.28204689371897995, "grad_norm": 21.601877212524414, "learning_rate": 9.37250427837992e-06, "loss": 1.7352, "step": 1648 }, { "epoch": 0.2822180386787609, "grad_norm": 38.61962127685547, "learning_rate": 9.378208784940104e-06, "loss": 4.4803, "step": 1649 }, { "epoch": 0.2823891836385418, "grad_norm": 31.342639923095703, "learning_rate": 9.383913291500286e-06, "loss": 3.8044, "step": 1650 }, { "epoch": 0.28256032859832275, "grad_norm": 9.416754722595215, "learning_rate": 9.389617798060467e-06, "loss": 0.804, "step": 1651 }, { "epoch": 0.28273147355810374, "grad_norm": 31.227413177490234, "learning_rate": 9.39532230462065e-06, "loss": 4.1229, "step": 1652 }, { "epoch": 0.2829026185178847, "grad_norm": 13.257563591003418, "learning_rate": 9.401026811180832e-06, "loss": 1.1089, "step": 1653 }, { "epoch": 0.2830737634776656, "grad_norm": 33.36773681640625, "learning_rate": 9.406731317741016e-06, "loss": 4.6453, "step": 1654 }, { "epoch": 0.28324490843744654, "grad_norm": 30.116289138793945, "learning_rate": 9.412435824301198e-06, "loss": 3.3475, "step": 1655 }, { "epoch": 0.2834160533972275, "grad_norm": 9.72807502746582, "learning_rate": 9.41814033086138e-06, "loss": 1.3987, "step": 1656 }, { "epoch": 0.2835871983570084, "grad_norm": 35.53730392456055, "learning_rate": 9.423844837421563e-06, "loss": 4.4274, "step": 1657 }, { "epoch": 0.28375834331678934, "grad_norm": 25.7310733795166, 
"learning_rate": 9.429549343981746e-06, "loss": 3.1681, "step": 1658 }, { "epoch": 0.2839294882765703, "grad_norm": 41.159175872802734, "learning_rate": 9.43525385054193e-06, "loss": 5.0249, "step": 1659 }, { "epoch": 0.2841006332363512, "grad_norm": 44.80512619018555, "learning_rate": 9.440958357102111e-06, "loss": 8.5485, "step": 1660 }, { "epoch": 0.28427177819613214, "grad_norm": 30.980173110961914, "learning_rate": 9.446662863662294e-06, "loss": 3.4326, "step": 1661 }, { "epoch": 0.2844429231559131, "grad_norm": 34.32295608520508, "learning_rate": 9.452367370222476e-06, "loss": 3.1846, "step": 1662 }, { "epoch": 0.284614068115694, "grad_norm": 31.66938591003418, "learning_rate": 9.458071876782658e-06, "loss": 3.5118, "step": 1663 }, { "epoch": 0.28478521307547494, "grad_norm": 32.4676513671875, "learning_rate": 9.463776383342841e-06, "loss": 4.7146, "step": 1664 }, { "epoch": 0.2849563580352559, "grad_norm": 10.913191795349121, "learning_rate": 9.469480889903023e-06, "loss": 0.9731, "step": 1665 }, { "epoch": 0.2851275029950368, "grad_norm": 35.5974006652832, "learning_rate": 9.475185396463206e-06, "loss": 4.3522, "step": 1666 }, { "epoch": 0.28529864795481774, "grad_norm": 33.59803771972656, "learning_rate": 9.480889903023388e-06, "loss": 3.3641, "step": 1667 }, { "epoch": 0.2854697929145987, "grad_norm": 35.429466247558594, "learning_rate": 9.486594409583571e-06, "loss": 3.9219, "step": 1668 }, { "epoch": 0.2856409378743796, "grad_norm": 13.85142707824707, "learning_rate": 9.492298916143755e-06, "loss": 1.3341, "step": 1669 }, { "epoch": 0.28581208283416054, "grad_norm": 9.107728004455566, "learning_rate": 9.498003422703936e-06, "loss": 0.8871, "step": 1670 }, { "epoch": 0.2859832277939415, "grad_norm": 35.564979553222656, "learning_rate": 9.50370792926412e-06, "loss": 4.1451, "step": 1671 }, { "epoch": 0.2861543727537224, "grad_norm": 27.561506271362305, "learning_rate": 9.509412435824301e-06, "loss": 3.3942, "step": 1672 }, { "epoch": 0.28632551771350334, 
"grad_norm": 35.57343292236328, "learning_rate": 9.515116942384485e-06, "loss": 3.7111, "step": 1673 }, { "epoch": 0.2864966626732843, "grad_norm": 39.25908279418945, "learning_rate": 9.520821448944666e-06, "loss": 4.3091, "step": 1674 }, { "epoch": 0.2866678076330652, "grad_norm": 41.76926803588867, "learning_rate": 9.526525955504848e-06, "loss": 4.7086, "step": 1675 }, { "epoch": 0.28683895259284614, "grad_norm": 30.626611709594727, "learning_rate": 9.532230462065032e-06, "loss": 3.1129, "step": 1676 }, { "epoch": 0.2870100975526271, "grad_norm": 15.441875457763672, "learning_rate": 9.537934968625213e-06, "loss": 1.2888, "step": 1677 }, { "epoch": 0.287181242512408, "grad_norm": 28.600982666015625, "learning_rate": 9.543639475185397e-06, "loss": 2.9279, "step": 1678 }, { "epoch": 0.28735238747218894, "grad_norm": 31.3085994720459, "learning_rate": 9.549343981745578e-06, "loss": 3.8741, "step": 1679 }, { "epoch": 0.2875235324319699, "grad_norm": 207.9216766357422, "learning_rate": 9.555048488305763e-06, "loss": 8.5201, "step": 1680 }, { "epoch": 0.2876946773917508, "grad_norm": 38.76487731933594, "learning_rate": 9.560752994865945e-06, "loss": 4.8258, "step": 1681 }, { "epoch": 0.28786582235153174, "grad_norm": 35.18633270263672, "learning_rate": 9.566457501426127e-06, "loss": 3.9555, "step": 1682 }, { "epoch": 0.2880369673113127, "grad_norm": 153.19830322265625, "learning_rate": 9.57216200798631e-06, "loss": 8.1017, "step": 1683 }, { "epoch": 0.2882081122710936, "grad_norm": 8.444355010986328, "learning_rate": 9.577866514546492e-06, "loss": 0.8761, "step": 1684 }, { "epoch": 0.28837925723087454, "grad_norm": 44.78715515136719, "learning_rate": 9.583571021106675e-06, "loss": 8.4681, "step": 1685 }, { "epoch": 0.2885504021906555, "grad_norm": 25.710901260375977, "learning_rate": 9.589275527666857e-06, "loss": 3.2682, "step": 1686 }, { "epoch": 0.2887215471504364, "grad_norm": 161.20376586914062, "learning_rate": 9.59498003422704e-06, "loss": 8.3231, "step": 1687 }, 
{ "epoch": 0.28889269211021734, "grad_norm": 36.88936996459961, "learning_rate": 9.600684540787222e-06, "loss": 4.4629, "step": 1688 }, { "epoch": 0.2890638370699983, "grad_norm": 33.05325698852539, "learning_rate": 9.606389047347404e-06, "loss": 4.2398, "step": 1689 }, { "epoch": 0.2892349820297792, "grad_norm": 31.297021865844727, "learning_rate": 9.612093553907587e-06, "loss": 3.9676, "step": 1690 }, { "epoch": 0.28940612698956014, "grad_norm": 33.626365661621094, "learning_rate": 9.617798060467769e-06, "loss": 4.2342, "step": 1691 }, { "epoch": 0.2895772719493411, "grad_norm": 32.812740325927734, "learning_rate": 9.623502567027954e-06, "loss": 3.9633, "step": 1692 }, { "epoch": 0.289748416909122, "grad_norm": 16.281417846679688, "learning_rate": 9.629207073588135e-06, "loss": 1.3504, "step": 1693 }, { "epoch": 0.28991956186890294, "grad_norm": 36.80635070800781, "learning_rate": 9.634911580148317e-06, "loss": 4.8133, "step": 1694 }, { "epoch": 0.2900907068286839, "grad_norm": 36.548397064208984, "learning_rate": 9.6406160867085e-06, "loss": 4.0484, "step": 1695 }, { "epoch": 0.2902618517884648, "grad_norm": 35.513729095458984, "learning_rate": 9.646320593268682e-06, "loss": 4.3281, "step": 1696 }, { "epoch": 0.29043299674824574, "grad_norm": 33.258995056152344, "learning_rate": 9.652025099828866e-06, "loss": 4.3749, "step": 1697 }, { "epoch": 0.2906041417080267, "grad_norm": 30.854419708251953, "learning_rate": 9.657729606389047e-06, "loss": 3.4016, "step": 1698 }, { "epoch": 0.2907752866678076, "grad_norm": 8.308602333068848, "learning_rate": 9.66343411294923e-06, "loss": 1.221, "step": 1699 }, { "epoch": 0.29094643162758854, "grad_norm": 10.515448570251465, "learning_rate": 9.669138619509412e-06, "loss": 0.9434, "step": 1700 }, { "epoch": 0.2911175765873695, "grad_norm": 20.784067153930664, "learning_rate": 9.674843126069594e-06, "loss": 2.2873, "step": 1701 }, { "epoch": 0.2912887215471504, "grad_norm": 44.417884826660156, "learning_rate": 
9.680547632629777e-06, "loss": 8.2452, "step": 1702 }, { "epoch": 0.2914598665069314, "grad_norm": 28.057279586791992, "learning_rate": 9.68625213918996e-06, "loss": 2.7355, "step": 1703 }, { "epoch": 0.29163101146671233, "grad_norm": 50.71089553833008, "learning_rate": 9.691956645750144e-06, "loss": 8.431, "step": 1704 }, { "epoch": 0.29180215642649326, "grad_norm": 44.918087005615234, "learning_rate": 9.697661152310326e-06, "loss": 7.8944, "step": 1705 }, { "epoch": 0.2919733013862742, "grad_norm": 143.09837341308594, "learning_rate": 9.703365658870507e-06, "loss": 7.8241, "step": 1706 }, { "epoch": 0.29214444634605513, "grad_norm": 22.652225494384766, "learning_rate": 9.709070165430691e-06, "loss": 2.888, "step": 1707 }, { "epoch": 0.29231559130583606, "grad_norm": 32.992774963378906, "learning_rate": 9.714774671990873e-06, "loss": 3.7048, "step": 1708 }, { "epoch": 0.292486736265617, "grad_norm": 30.531761169433594, "learning_rate": 9.720479178551056e-06, "loss": 3.8219, "step": 1709 }, { "epoch": 0.29265788122539793, "grad_norm": 39.57463073730469, "learning_rate": 9.726183685111238e-06, "loss": 4.5677, "step": 1710 }, { "epoch": 0.29282902618517886, "grad_norm": 32.177650451660156, "learning_rate": 9.731888191671421e-06, "loss": 3.9562, "step": 1711 }, { "epoch": 0.2930001711449598, "grad_norm": 11.071613311767578, "learning_rate": 9.737592698231603e-06, "loss": 1.3818, "step": 1712 }, { "epoch": 0.29317131610474073, "grad_norm": 183.07089233398438, "learning_rate": 9.743297204791784e-06, "loss": 8.202, "step": 1713 }, { "epoch": 0.29334246106452166, "grad_norm": 127.64228057861328, "learning_rate": 9.749001711351968e-06, "loss": 7.7497, "step": 1714 }, { "epoch": 0.2935136060243026, "grad_norm": 30.094449996948242, "learning_rate": 9.754706217912151e-06, "loss": 3.133, "step": 1715 }, { "epoch": 0.29368475098408353, "grad_norm": 33.56199264526367, "learning_rate": 9.760410724472334e-06, "loss": 3.9969, "step": 1716 }, { "epoch": 0.29385589594386446, 
"grad_norm": 30.969953536987305, "learning_rate": 9.766115231032516e-06, "loss": 3.2142, "step": 1717 }, { "epoch": 0.2940270409036454, "grad_norm": 26.745988845825195, "learning_rate": 9.7718197375927e-06, "loss": 2.5008, "step": 1718 }, { "epoch": 0.29419818586342633, "grad_norm": 19.184772491455078, "learning_rate": 9.777524244152881e-06, "loss": 1.7379, "step": 1719 }, { "epoch": 0.29436933082320726, "grad_norm": 30.3228759765625, "learning_rate": 9.783228750713063e-06, "loss": 3.6131, "step": 1720 }, { "epoch": 0.2945404757829882, "grad_norm": 30.700254440307617, "learning_rate": 9.788933257273246e-06, "loss": 3.4613, "step": 1721 }, { "epoch": 0.29471162074276913, "grad_norm": 35.11033248901367, "learning_rate": 9.794637763833428e-06, "loss": 4.2554, "step": 1722 }, { "epoch": 0.29488276570255006, "grad_norm": 47.508277893066406, "learning_rate": 9.800342270393611e-06, "loss": 8.2195, "step": 1723 }, { "epoch": 0.295053910662331, "grad_norm": 35.247528076171875, "learning_rate": 9.806046776953793e-06, "loss": 4.3316, "step": 1724 }, { "epoch": 0.29522505562211193, "grad_norm": 27.610280990600586, "learning_rate": 9.811751283513975e-06, "loss": 3.7587, "step": 1725 }, { "epoch": 0.29539620058189286, "grad_norm": 34.314918518066406, "learning_rate": 9.81745579007416e-06, "loss": 4.4546, "step": 1726 }, { "epoch": 0.2955673455416738, "grad_norm": 31.43994140625, "learning_rate": 9.823160296634341e-06, "loss": 3.8653, "step": 1727 }, { "epoch": 0.29573849050145473, "grad_norm": 15.655001640319824, "learning_rate": 9.828864803194525e-06, "loss": 1.2015, "step": 1728 }, { "epoch": 0.29590963546123566, "grad_norm": 13.799985885620117, "learning_rate": 9.834569309754707e-06, "loss": 1.6485, "step": 1729 }, { "epoch": 0.2960807804210166, "grad_norm": 35.408145904541016, "learning_rate": 9.84027381631489e-06, "loss": 4.6912, "step": 1730 }, { "epoch": 0.29625192538079753, "grad_norm": 33.258941650390625, "learning_rate": 9.845978322875072e-06, "loss": 3.8243, "step": 
1731 }, { "epoch": 0.29642307034057847, "grad_norm": 34.537960052490234, "learning_rate": 9.851682829435253e-06, "loss": 3.6863, "step": 1732 }, { "epoch": 0.2965942153003594, "grad_norm": 25.667997360229492, "learning_rate": 9.857387335995437e-06, "loss": 3.21, "step": 1733 }, { "epoch": 0.29676536026014033, "grad_norm": 146.46380615234375, "learning_rate": 9.863091842555618e-06, "loss": 7.1136, "step": 1734 }, { "epoch": 0.29693650521992127, "grad_norm": 20.732595443725586, "learning_rate": 9.868796349115802e-06, "loss": 2.1932, "step": 1735 }, { "epoch": 0.2971076501797022, "grad_norm": 37.78299331665039, "learning_rate": 9.874500855675983e-06, "loss": 4.4365, "step": 1736 }, { "epoch": 0.29727879513948313, "grad_norm": 30.049827575683594, "learning_rate": 9.880205362236167e-06, "loss": 3.2983, "step": 1737 }, { "epoch": 0.29744994009926407, "grad_norm": 12.33377742767334, "learning_rate": 9.88590986879635e-06, "loss": 1.4362, "step": 1738 }, { "epoch": 0.297621085059045, "grad_norm": 24.165996551513672, "learning_rate": 9.891614375356532e-06, "loss": 2.7512, "step": 1739 }, { "epoch": 0.29779223001882593, "grad_norm": 34.980438232421875, "learning_rate": 9.897318881916715e-06, "loss": 3.4089, "step": 1740 }, { "epoch": 0.29796337497860687, "grad_norm": 52.22333526611328, "learning_rate": 9.903023388476897e-06, "loss": 8.55, "step": 1741 }, { "epoch": 0.2981345199383878, "grad_norm": 30.178720474243164, "learning_rate": 9.90872789503708e-06, "loss": 3.7629, "step": 1742 }, { "epoch": 0.29830566489816873, "grad_norm": 12.83564281463623, "learning_rate": 9.914432401597262e-06, "loss": 1.5206, "step": 1743 }, { "epoch": 0.29847680985794967, "grad_norm": 116.20635223388672, "learning_rate": 9.920136908157444e-06, "loss": 7.1701, "step": 1744 }, { "epoch": 0.2986479548177306, "grad_norm": 28.332143783569336, "learning_rate": 9.925841414717627e-06, "loss": 4.0808, "step": 1745 }, { "epoch": 0.29881909977751153, "grad_norm": 17.009302139282227, "learning_rate": 
9.931545921277809e-06, "loss": 1.1237, "step": 1746 }, { "epoch": 0.29899024473729247, "grad_norm": 22.102079391479492, "learning_rate": 9.937250427837992e-06, "loss": 2.1591, "step": 1747 }, { "epoch": 0.2991613896970734, "grad_norm": 31.704936981201172, "learning_rate": 9.942954934398174e-06, "loss": 3.3555, "step": 1748 }, { "epoch": 0.29933253465685433, "grad_norm": 7.139681816101074, "learning_rate": 9.948659440958359e-06, "loss": 0.8492, "step": 1749 }, { "epoch": 0.29950367961663527, "grad_norm": 37.93485641479492, "learning_rate": 9.95436394751854e-06, "loss": 4.3445, "step": 1750 }, { "epoch": 0.2996748245764162, "grad_norm": 23.79175567626953, "learning_rate": 9.960068454078722e-06, "loss": 2.6891, "step": 1751 }, { "epoch": 0.29984596953619713, "grad_norm": 26.583223342895508, "learning_rate": 9.965772960638906e-06, "loss": 3.0936, "step": 1752 }, { "epoch": 0.3000171144959781, "grad_norm": 16.86503791809082, "learning_rate": 9.971477467199087e-06, "loss": 1.5367, "step": 1753 }, { "epoch": 0.30018825945575905, "grad_norm": 37.780025482177734, "learning_rate": 9.97718197375927e-06, "loss": 4.5862, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_nli-pairs_loss": 3.577563524246216, "eval_nli-pairs_runtime": 4.5158, "eval_nli-pairs_samples_per_second": 44.289, "eval_nli-pairs_steps_per_second": 1.55, "eval_sts-test_pearson_cosine": 0.7051574603634622, "eval_sts-test_pearson_dot": 0.5937802816639131, "eval_sts-test_pearson_euclidean": 0.7000060119936138, "eval_sts-test_pearson_manhattan": 0.7079127065958083, "eval_sts-test_pearson_max": 0.7079127065958083, "eval_sts-test_spearman_cosine": 0.6765504113809614, "eval_sts-test_spearman_dot": 0.5611218190113842, "eval_sts-test_spearman_euclidean": 0.6793571635918119, "eval_sts-test_spearman_manhattan": 0.6864576898108908, "eval_sts-test_spearman_max": 0.6864576898108908, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_vitaminc-pairs_loss": 2.382566213607788, "eval_vitaminc-pairs_runtime": 2.7572, 
"eval_vitaminc-pairs_samples_per_second": 72.538, "eval_vitaminc-pairs_steps_per_second": 2.539, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_qnli-contrastive_loss": 7.762363910675049, "eval_qnli-contrastive_runtime": 0.6686, "eval_qnli-contrastive_samples_per_second": 299.128, "eval_qnli-contrastive_steps_per_second": 10.469, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_scitail-pairs-qa_loss": 0.7197363972663879, "eval_scitail-pairs-qa_runtime": 1.7426, "eval_scitail-pairs-qa_samples_per_second": 114.768, "eval_scitail-pairs-qa_steps_per_second": 4.017, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_scitail-pairs-pos_loss": 2.2759039402008057, "eval_scitail-pairs-pos_runtime": 2.8206, "eval_scitail-pairs-pos_samples_per_second": 70.906, "eval_scitail-pairs-pos_steps_per_second": 2.482, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_xsum-pairs_loss": 2.1139955520629883, "eval_xsum-pairs_runtime": 2.6563, "eval_xsum-pairs_samples_per_second": 65.88, "eval_xsum-pairs_steps_per_second": 2.259, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_compression-pairs_loss": 1.1527378559112549, "eval_compression-pairs_runtime": 0.5278, "eval_compression-pairs_samples_per_second": 378.929, "eval_compression-pairs_steps_per_second": 13.263, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_sciq_pairs_loss": 6.166472434997559, "eval_sciq_pairs_runtime": 9.2821, "eval_sciq_pairs_samples_per_second": 21.547, "eval_sciq_pairs_steps_per_second": 0.754, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_qasc_pairs_loss": 8.247413635253906, "eval_qasc_pairs_runtime": 2.7444, "eval_qasc_pairs_samples_per_second": 72.876, "eval_qasc_pairs_steps_per_second": 2.551, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_openbookqa_pairs_loss": 4.27993631362915, "eval_openbookqa_pairs_runtime": 0.68, "eval_openbookqa_pairs_samples_per_second": 101.475, "eval_openbookqa_pairs_steps_per_second": 4.412, "step": 1754 }, { "epoch": 0.30018825945575905, 
"eval_msmarco_pairs_loss": 3.4503884315490723, "eval_msmarco_pairs_runtime": 4.1424, "eval_msmarco_pairs_samples_per_second": 48.281, "eval_msmarco_pairs_steps_per_second": 1.69, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_nq_pairs_loss": 4.303767204284668, "eval_nq_pairs_runtime": 8.7194, "eval_nq_pairs_samples_per_second": 22.937, "eval_nq_pairs_steps_per_second": 0.803, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_trivia_pairs_loss": 3.893390417098999, "eval_trivia_pairs_runtime": 13.177, "eval_trivia_pairs_samples_per_second": 15.178, "eval_trivia_pairs_steps_per_second": 0.531, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_quora_pairs_loss": 1.0257954597473145, "eval_quora_pairs_runtime": 1.5896, "eval_quora_pairs_samples_per_second": 125.821, "eval_quora_pairs_steps_per_second": 4.404, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_gooaq_pairs_loss": 2.6827940940856934, "eval_gooaq_pairs_runtime": 2.6669, "eval_gooaq_pairs_samples_per_second": 74.993, "eval_gooaq_pairs_steps_per_second": 2.625, "step": 1754 }, { "epoch": 0.30035940441554, "grad_norm": 32.57681655883789, "learning_rate": 9.982886480319452e-06, "loss": 4.3391, "step": 1755 }, { "epoch": 0.3005305493753209, "grad_norm": 26.65064811706543, "learning_rate": 9.988590986879634e-06, "loss": 2.7014, "step": 1756 }, { "epoch": 0.30070169433510185, "grad_norm": 33.25247573852539, "learning_rate": 9.994295493439817e-06, "loss": 4.1446, "step": 1757 }, { "epoch": 0.3008728392948828, "grad_norm": 25.792116165161133, "learning_rate": 9.999999999999999e-06, "loss": 2.7164, "step": 1758 }, { "epoch": 0.3010439842546637, "grad_norm": 28.707399368286133, "learning_rate": 1.0005704506560183e-05, "loss": 3.1937, "step": 1759 }, { "epoch": 0.30121512921444465, "grad_norm": 38.30696105957031, "learning_rate": 1.0011409013120366e-05, "loss": 4.2427, "step": 1760 }, { "epoch": 0.3013862741742256, "grad_norm": 26.254148483276367, "learning_rate": 1.001711351968055e-05, "loss": 2.5525, 
"step": 1761 }, { "epoch": 0.3015574191340065, "grad_norm": 7.5429487228393555, "learning_rate": 1.0022818026240731e-05, "loss": 0.8481, "step": 1762 }, { "epoch": 0.30172856409378745, "grad_norm": 45.37841796875, "learning_rate": 1.0028522532800913e-05, "loss": 6.5584, "step": 1763 }, { "epoch": 0.3018997090535684, "grad_norm": 17.617197036743164, "learning_rate": 1.0034227039361096e-05, "loss": 1.5689, "step": 1764 }, { "epoch": 0.3020708540133493, "grad_norm": 8.921030044555664, "learning_rate": 1.0039931545921278e-05, "loss": 1.9049, "step": 1765 }, { "epoch": 0.30224199897313025, "grad_norm": 11.456149101257324, "learning_rate": 1.0045636052481461e-05, "loss": 1.4351, "step": 1766 }, { "epoch": 0.3024131439329112, "grad_norm": 36.827125549316406, "learning_rate": 1.0051340559041643e-05, "loss": 3.8073, "step": 1767 }, { "epoch": 0.3025842888926921, "grad_norm": 31.50043296813965, "learning_rate": 1.0057045065601826e-05, "loss": 3.4761, "step": 1768 }, { "epoch": 0.30275543385247305, "grad_norm": 212.15618896484375, "learning_rate": 1.0062749572162008e-05, "loss": 8.804, "step": 1769 }, { "epoch": 0.302926578812254, "grad_norm": 11.170289039611816, "learning_rate": 1.006845407872219e-05, "loss": 1.5324, "step": 1770 }, { "epoch": 0.3030977237720349, "grad_norm": 11.275130271911621, "learning_rate": 1.0074158585282373e-05, "loss": 1.0326, "step": 1771 }, { "epoch": 0.30326886873181585, "grad_norm": 37.139068603515625, "learning_rate": 1.0079863091842556e-05, "loss": 4.5464, "step": 1772 }, { "epoch": 0.3034400136915968, "grad_norm": 24.030378341674805, "learning_rate": 1.008556759840274e-05, "loss": 1.9306, "step": 1773 }, { "epoch": 0.3036111586513777, "grad_norm": 23.25863265991211, "learning_rate": 1.0091272104962921e-05, "loss": 1.8897, "step": 1774 }, { "epoch": 0.30378230361115865, "grad_norm": 33.125823974609375, "learning_rate": 1.0096976611523103e-05, "loss": 3.4839, "step": 1775 }, { "epoch": 0.3039534485709396, "grad_norm": 21.4809627532959, 
"learning_rate": 1.0102681118083286e-05, "loss": 2.866, "step": 1776 }, { "epoch": 0.3041245935307205, "grad_norm": 54.2559928894043, "learning_rate": 1.0108385624643468e-05, "loss": 8.802, "step": 1777 }, { "epoch": 0.30429573849050146, "grad_norm": 39.62715148925781, "learning_rate": 1.0114090131203651e-05, "loss": 5.1068, "step": 1778 }, { "epoch": 0.3044668834502824, "grad_norm": 14.615751266479492, "learning_rate": 1.0119794637763833e-05, "loss": 1.2298, "step": 1779 }, { "epoch": 0.3046380284100633, "grad_norm": 36.6978874206543, "learning_rate": 1.0125499144324017e-05, "loss": 4.1995, "step": 1780 }, { "epoch": 0.30480917336984426, "grad_norm": 14.718832015991211, "learning_rate": 1.0131203650884198e-05, "loss": 1.1796, "step": 1781 }, { "epoch": 0.3049803183296252, "grad_norm": 36.830204010009766, "learning_rate": 1.013690815744438e-05, "loss": 4.1858, "step": 1782 }, { "epoch": 0.3051514632894061, "grad_norm": 23.391765594482422, "learning_rate": 1.0142612664004565e-05, "loss": 2.4115, "step": 1783 }, { "epoch": 0.30532260824918706, "grad_norm": 35.27947998046875, "learning_rate": 1.0148317170564747e-05, "loss": 4.8061, "step": 1784 }, { "epoch": 0.305493753208968, "grad_norm": 10.68021297454834, "learning_rate": 1.015402167712493e-05, "loss": 2.1324, "step": 1785 }, { "epoch": 0.3056648981687489, "grad_norm": 23.529436111450195, "learning_rate": 1.0159726183685112e-05, "loss": 2.7194, "step": 1786 }, { "epoch": 0.30583604312852986, "grad_norm": 32.76841354370117, "learning_rate": 1.0165430690245295e-05, "loss": 3.9735, "step": 1787 }, { "epoch": 0.3060071880883108, "grad_norm": 20.872732162475586, "learning_rate": 1.0171135196805477e-05, "loss": 2.3385, "step": 1788 }, { "epoch": 0.3061783330480917, "grad_norm": 14.08251953125, "learning_rate": 1.0176839703365658e-05, "loss": 1.8159, "step": 1789 }, { "epoch": 0.30634947800787266, "grad_norm": 39.58723831176758, "learning_rate": 1.0182544209925842e-05, "loss": 4.7749, "step": 1790 }, { "epoch": 
0.3065206229676536, "grad_norm": 65.20591735839844, "learning_rate": 1.0188248716486024e-05, "loss": 6.4724, "step": 1791 }, { "epoch": 0.3066917679274345, "grad_norm": 44.97452926635742, "learning_rate": 1.0193953223046207e-05, "loss": 4.9313, "step": 1792 }, { "epoch": 0.30686291288721546, "grad_norm": 35.091163635253906, "learning_rate": 1.0199657729606389e-05, "loss": 3.4266, "step": 1793 }, { "epoch": 0.3070340578469964, "grad_norm": 17.238380432128906, "learning_rate": 1.020536223616657e-05, "loss": 1.4114, "step": 1794 }, { "epoch": 0.3072052028067773, "grad_norm": 12.661242485046387, "learning_rate": 1.0211066742726755e-05, "loss": 2.2799, "step": 1795 }, { "epoch": 0.30737634776655826, "grad_norm": 29.67556381225586, "learning_rate": 1.0216771249286937e-05, "loss": 2.9217, "step": 1796 }, { "epoch": 0.3075474927263392, "grad_norm": 34.465126037597656, "learning_rate": 1.022247575584712e-05, "loss": 3.9674, "step": 1797 }, { "epoch": 0.3077186376861201, "grad_norm": 66.6548080444336, "learning_rate": 1.0228180262407302e-05, "loss": 6.0514, "step": 1798 }, { "epoch": 0.30788978264590106, "grad_norm": 36.210044860839844, "learning_rate": 1.0233884768967485e-05, "loss": 4.2555, "step": 1799 }, { "epoch": 0.308060927605682, "grad_norm": 24.441967010498047, "learning_rate": 1.0239589275527667e-05, "loss": 2.5473, "step": 1800 }, { "epoch": 0.3082320725654629, "grad_norm": 20.574525833129883, "learning_rate": 1.0245293782087849e-05, "loss": 1.6693, "step": 1801 }, { "epoch": 0.30840321752524386, "grad_norm": 26.07015037536621, "learning_rate": 1.0250998288648032e-05, "loss": 2.7451, "step": 1802 }, { "epoch": 0.3085743624850248, "grad_norm": 29.663963317871094, "learning_rate": 1.0256702795208214e-05, "loss": 4.0482, "step": 1803 }, { "epoch": 0.3087455074448058, "grad_norm": 27.77281379699707, "learning_rate": 1.0262407301768397e-05, "loss": 3.0752, "step": 1804 }, { "epoch": 0.3089166524045867, "grad_norm": 34.827430725097656, "learning_rate": 
1.0268111808328579e-05, "loss": 3.7669, "step": 1805 }, { "epoch": 0.30908779736436764, "grad_norm": 37.112361907958984, "learning_rate": 1.0273816314888762e-05, "loss": 4.7788, "step": 1806 }, { "epoch": 0.3092589423241486, "grad_norm": 53.2462272644043, "learning_rate": 1.0279520821448946e-05, "loss": 8.0593, "step": 1807 }, { "epoch": 0.3094300872839295, "grad_norm": 38.18441390991211, "learning_rate": 1.0285225328009127e-05, "loss": 4.2028, "step": 1808 }, { "epoch": 0.30960123224371044, "grad_norm": 13.605740547180176, "learning_rate": 1.029092983456931e-05, "loss": 2.5679, "step": 1809 }, { "epoch": 0.3097723772034914, "grad_norm": 37.292240142822266, "learning_rate": 1.0296634341129492e-05, "loss": 4.0864, "step": 1810 }, { "epoch": 0.3099435221632723, "grad_norm": 10.673694610595703, "learning_rate": 1.0302338847689676e-05, "loss": 0.953, "step": 1811 }, { "epoch": 0.31011466712305324, "grad_norm": 30.847604751586914, "learning_rate": 1.0308043354249858e-05, "loss": 4.4181, "step": 1812 }, { "epoch": 0.3102858120828342, "grad_norm": 25.303640365600586, "learning_rate": 1.031374786081004e-05, "loss": 3.0808, "step": 1813 }, { "epoch": 0.3104569570426151, "grad_norm": 31.284347534179688, "learning_rate": 1.0319452367370223e-05, "loss": 3.3148, "step": 1814 }, { "epoch": 0.31062810200239604, "grad_norm": 18.292266845703125, "learning_rate": 1.0325156873930404e-05, "loss": 1.4786, "step": 1815 }, { "epoch": 0.310799246962177, "grad_norm": 93.66471099853516, "learning_rate": 1.0330861380490588e-05, "loss": 6.8127, "step": 1816 }, { "epoch": 0.3109703919219579, "grad_norm": 38.12440872192383, "learning_rate": 1.033656588705077e-05, "loss": 5.0019, "step": 1817 }, { "epoch": 0.31114153688173884, "grad_norm": 32.61493682861328, "learning_rate": 1.0342270393610954e-05, "loss": 4.3171, "step": 1818 }, { "epoch": 0.3113126818415198, "grad_norm": 38.087646484375, "learning_rate": 1.0347974900171136e-05, "loss": 7.6945, "step": 1819 }, { "epoch": 0.3114838268013007, 
"grad_norm": 21.899497985839844, "learning_rate": 1.0353679406731318e-05, "loss": 1.7206, "step": 1820 }, { "epoch": 0.31165497176108165, "grad_norm": 113.81354522705078, "learning_rate": 1.0359383913291501e-05, "loss": 7.2513, "step": 1821 }, { "epoch": 0.3118261167208626, "grad_norm": 11.316397666931152, "learning_rate": 1.0365088419851683e-05, "loss": 2.1259, "step": 1822 }, { "epoch": 0.3119972616806435, "grad_norm": 26.67529296875, "learning_rate": 1.0370792926411866e-05, "loss": 3.1664, "step": 1823 }, { "epoch": 0.31216840664042445, "grad_norm": 25.253353118896484, "learning_rate": 1.0376497432972048e-05, "loss": 2.4222, "step": 1824 }, { "epoch": 0.3123395516002054, "grad_norm": 8.143440246582031, "learning_rate": 1.038220193953223e-05, "loss": 0.7973, "step": 1825 }, { "epoch": 0.3125106965599863, "grad_norm": 19.66392707824707, "learning_rate": 1.0387906446092413e-05, "loss": 1.5552, "step": 1826 }, { "epoch": 0.31268184151976725, "grad_norm": 23.67314910888672, "learning_rate": 1.0393610952652595e-05, "loss": 3.07, "step": 1827 }, { "epoch": 0.3128529864795482, "grad_norm": 26.236251831054688, "learning_rate": 1.0399315459212778e-05, "loss": 3.1091, "step": 1828 }, { "epoch": 0.3130241314393291, "grad_norm": 28.10502815246582, "learning_rate": 1.0405019965772961e-05, "loss": 3.0707, "step": 1829 }, { "epoch": 0.31319527639911005, "grad_norm": 34.508846282958984, "learning_rate": 1.0410724472333145e-05, "loss": 4.872, "step": 1830 }, { "epoch": 0.313366421358891, "grad_norm": 34.22414016723633, "learning_rate": 1.0416428978893326e-05, "loss": 3.3169, "step": 1831 }, { "epoch": 0.3135375663186719, "grad_norm": 46.06840515136719, "learning_rate": 1.0422133485453508e-05, "loss": 7.9438, "step": 1832 }, { "epoch": 0.31370871127845285, "grad_norm": 18.041322708129883, "learning_rate": 1.0427837992013692e-05, "loss": 1.629, "step": 1833 }, { "epoch": 0.3138798562382338, "grad_norm": 14.525741577148438, "learning_rate": 1.0433542498573873e-05, "loss": 1.1969, 
"step": 1834 }, { "epoch": 0.3140510011980147, "grad_norm": 4.135936260223389, "learning_rate": 1.0439247005134057e-05, "loss": 0.7184, "step": 1835 }, { "epoch": 0.31422214615779565, "grad_norm": 41.9599723815918, "learning_rate": 1.0444951511694238e-05, "loss": 4.2524, "step": 1836 }, { "epoch": 0.3143932911175766, "grad_norm": 7.373823642730713, "learning_rate": 1.0450656018254422e-05, "loss": 1.8983, "step": 1837 }, { "epoch": 0.3145644360773575, "grad_norm": 31.084392547607422, "learning_rate": 1.0456360524814603e-05, "loss": 4.0436, "step": 1838 }, { "epoch": 0.31473558103713845, "grad_norm": 11.967267036437988, "learning_rate": 1.0462065031374785e-05, "loss": 1.0282, "step": 1839 }, { "epoch": 0.3149067259969194, "grad_norm": 33.466121673583984, "learning_rate": 1.046776953793497e-05, "loss": 3.9262, "step": 1840 }, { "epoch": 0.3150778709567003, "grad_norm": 39.21562576293945, "learning_rate": 1.0473474044495152e-05, "loss": 4.844, "step": 1841 }, { "epoch": 0.31524901591648125, "grad_norm": 33.843055725097656, "learning_rate": 1.0479178551055335e-05, "loss": 3.5103, "step": 1842 }, { "epoch": 0.3154201608762622, "grad_norm": 35.37272644042969, "learning_rate": 1.0484883057615517e-05, "loss": 3.584, "step": 1843 }, { "epoch": 0.3155913058360431, "grad_norm": 17.376483917236328, "learning_rate": 1.0490587564175699e-05, "loss": 1.4993, "step": 1844 }, { "epoch": 0.31576245079582405, "grad_norm": 45.614688873291016, "learning_rate": 1.0496292070735882e-05, "loss": 8.1587, "step": 1845 }, { "epoch": 0.315933595755605, "grad_norm": 31.185443878173828, "learning_rate": 1.0501996577296064e-05, "loss": 4.1762, "step": 1846 }, { "epoch": 0.3161047407153859, "grad_norm": 33.703514099121094, "learning_rate": 1.0507701083856247e-05, "loss": 4.1885, "step": 1847 }, { "epoch": 0.31627588567516685, "grad_norm": 24.48247718811035, "learning_rate": 1.0513405590416429e-05, "loss": 2.7277, "step": 1848 }, { "epoch": 0.3164470306349478, "grad_norm": 25.966876983642578, 
"learning_rate": 1.0519110096976612e-05, "loss": 2.8921, "step": 1849 }, { "epoch": 0.3166181755947287, "grad_norm": 35.0124626159668, "learning_rate": 1.0524814603536794e-05, "loss": 4.3145, "step": 1850 }, { "epoch": 0.31678932055450965, "grad_norm": 33.62586975097656, "learning_rate": 1.0530519110096975e-05, "loss": 3.8524, "step": 1851 }, { "epoch": 0.3169604655142906, "grad_norm": 30.16233253479004, "learning_rate": 1.053622361665716e-05, "loss": 3.3166, "step": 1852 }, { "epoch": 0.3171316104740715, "grad_norm": 31.811193466186523, "learning_rate": 1.0541928123217342e-05, "loss": 3.5965, "step": 1853 }, { "epoch": 0.31730275543385245, "grad_norm": 35.756778717041016, "learning_rate": 1.0547632629777526e-05, "loss": 4.4027, "step": 1854 }, { "epoch": 0.31747390039363343, "grad_norm": 17.929304122924805, "learning_rate": 1.0553337136337707e-05, "loss": 2.2128, "step": 1855 }, { "epoch": 0.31764504535341437, "grad_norm": 29.329362869262695, "learning_rate": 1.0559041642897889e-05, "loss": 2.4503, "step": 1856 }, { "epoch": 0.3178161903131953, "grad_norm": 38.31791305541992, "learning_rate": 1.0564746149458072e-05, "loss": 4.1596, "step": 1857 }, { "epoch": 0.31798733527297623, "grad_norm": 26.978776931762695, "learning_rate": 1.0570450656018254e-05, "loss": 2.5148, "step": 1858 }, { "epoch": 0.31815848023275717, "grad_norm": 183.96864318847656, "learning_rate": 1.0576155162578437e-05, "loss": 7.8451, "step": 1859 }, { "epoch": 0.3183296251925381, "grad_norm": 34.898677825927734, "learning_rate": 1.0581859669138619e-05, "loss": 3.631, "step": 1860 }, { "epoch": 0.31850077015231903, "grad_norm": 18.749799728393555, "learning_rate": 1.0587564175698802e-05, "loss": 1.5066, "step": 1861 }, { "epoch": 0.31867191511209997, "grad_norm": 32.26422882080078, "learning_rate": 1.0593268682258984e-05, "loss": 4.0466, "step": 1862 }, { "epoch": 0.3188430600718809, "grad_norm": 9.538769721984863, "learning_rate": 1.0598973188819167e-05, "loss": 1.2133, "step": 1863 }, { 
"epoch": 0.31901420503166183, "grad_norm": 9.156614303588867, "learning_rate": 1.0604677695379351e-05, "loss": 0.9202, "step": 1864 }, { "epoch": 0.31918534999144277, "grad_norm": 137.56471252441406, "learning_rate": 1.0610382201939533e-05, "loss": 7.6205, "step": 1865 }, { "epoch": 0.3193564949512237, "grad_norm": 24.30291748046875, "learning_rate": 1.0616086708499716e-05, "loss": 2.5704, "step": 1866 }, { "epoch": 0.31952763991100464, "grad_norm": 32.78607940673828, "learning_rate": 1.0621791215059898e-05, "loss": 3.4866, "step": 1867 }, { "epoch": 0.31969878487078557, "grad_norm": 25.44717025756836, "learning_rate": 1.0627495721620081e-05, "loss": 2.8747, "step": 1868 }, { "epoch": 0.3198699298305665, "grad_norm": 71.5486831665039, "learning_rate": 1.0633200228180263e-05, "loss": 6.3834, "step": 1869 }, { "epoch": 0.32004107479034744, "grad_norm": 36.36513900756836, "learning_rate": 1.0638904734740444e-05, "loss": 3.8896, "step": 1870 }, { "epoch": 0.32021221975012837, "grad_norm": 14.369461059570312, "learning_rate": 1.0644609241300628e-05, "loss": 1.2576, "step": 1871 }, { "epoch": 0.3203833647099093, "grad_norm": 34.6867561340332, "learning_rate": 1.065031374786081e-05, "loss": 3.4093, "step": 1872 }, { "epoch": 0.32055450966969024, "grad_norm": 21.84122657775879, "learning_rate": 1.0656018254420993e-05, "loss": 2.2791, "step": 1873 }, { "epoch": 0.32072565462947117, "grad_norm": 21.254135131835938, "learning_rate": 1.0661722760981174e-05, "loss": 2.2054, "step": 1874 }, { "epoch": 0.3208967995892521, "grad_norm": 33.362220764160156, "learning_rate": 1.0667427267541358e-05, "loss": 4.1888, "step": 1875 }, { "epoch": 0.32106794454903304, "grad_norm": 63.412601470947266, "learning_rate": 1.0673131774101541e-05, "loss": 8.5606, "step": 1876 }, { "epoch": 0.32123908950881397, "grad_norm": 14.283455848693848, "learning_rate": 1.0678836280661723e-05, "loss": 0.9998, "step": 1877 }, { "epoch": 0.3214102344685949, "grad_norm": 35.16504669189453, "learning_rate": 
1.0684540787221906e-05, "loss": 4.2321, "step": 1878 }, { "epoch": 0.32158137942837584, "grad_norm": 12.61963939666748, "learning_rate": 1.0690245293782088e-05, "loss": 1.5004, "step": 1879 }, { "epoch": 0.32175252438815677, "grad_norm": 32.174076080322266, "learning_rate": 1.0695949800342271e-05, "loss": 3.5576, "step": 1880 }, { "epoch": 0.3219236693479377, "grad_norm": 30.472043991088867, "learning_rate": 1.0701654306902453e-05, "loss": 3.4048, "step": 1881 }, { "epoch": 0.32209481430771864, "grad_norm": 84.8609848022461, "learning_rate": 1.0707358813462635e-05, "loss": 6.2658, "step": 1882 }, { "epoch": 0.32226595926749957, "grad_norm": 25.621240615844727, "learning_rate": 1.0713063320022818e-05, "loss": 2.6459, "step": 1883 }, { "epoch": 0.3224371042272805, "grad_norm": 79.82257080078125, "learning_rate": 1.0718767826583e-05, "loss": 6.3192, "step": 1884 }, { "epoch": 0.32260824918706144, "grad_norm": 7.729169845581055, "learning_rate": 1.0724472333143183e-05, "loss": 0.825, "step": 1885 }, { "epoch": 0.32277939414684237, "grad_norm": 29.313451766967773, "learning_rate": 1.0730176839703367e-05, "loss": 2.9915, "step": 1886 }, { "epoch": 0.3229505391066233, "grad_norm": 6.555768013000488, "learning_rate": 1.073588134626355e-05, "loss": 0.7525, "step": 1887 }, { "epoch": 0.32312168406640424, "grad_norm": 35.07060241699219, "learning_rate": 1.0741585852823732e-05, "loss": 4.2147, "step": 1888 }, { "epoch": 0.32329282902618517, "grad_norm": 10.583313941955566, "learning_rate": 1.0747290359383913e-05, "loss": 0.8557, "step": 1889 }, { "epoch": 0.3234639739859661, "grad_norm": 26.075578689575195, "learning_rate": 1.0752994865944097e-05, "loss": 2.9433, "step": 1890 }, { "epoch": 0.32363511894574704, "grad_norm": 17.7381591796875, "learning_rate": 1.0758699372504278e-05, "loss": 1.4998, "step": 1891 }, { "epoch": 0.32380626390552797, "grad_norm": 16.11162567138672, "learning_rate": 1.0764403879064462e-05, "loss": 1.2949, "step": 1892 }, { "epoch": 0.3239774088653089, 
"grad_norm": 28.165752410888672, "learning_rate": 1.0770108385624643e-05, "loss": 3.4363, "step": 1893 }, { "epoch": 0.32414855382508984, "grad_norm": 37.37394714355469, "learning_rate": 1.0775812892184825e-05, "loss": 4.7016, "step": 1894 }, { "epoch": 0.32431969878487077, "grad_norm": 35.620826721191406, "learning_rate": 1.0781517398745008e-05, "loss": 4.4153, "step": 1895 }, { "epoch": 0.3244908437446517, "grad_norm": 35.83405303955078, "learning_rate": 1.078722190530519e-05, "loss": 4.4295, "step": 1896 }, { "epoch": 0.32466198870443264, "grad_norm": 12.846619606018066, "learning_rate": 1.0792926411865374e-05, "loss": 1.4411, "step": 1897 }, { "epoch": 0.32483313366421357, "grad_norm": 11.455179214477539, "learning_rate": 1.0798630918425557e-05, "loss": 1.1335, "step": 1898 }, { "epoch": 0.3250042786239945, "grad_norm": 36.278289794921875, "learning_rate": 1.080433542498574e-05, "loss": 3.6505, "step": 1899 }, { "epoch": 0.32517542358377544, "grad_norm": 37.59969711303711, "learning_rate": 1.0810039931545922e-05, "loss": 5.1473, "step": 1900 }, { "epoch": 0.32534656854355637, "grad_norm": 27.851537704467773, "learning_rate": 1.0815744438106104e-05, "loss": 2.792, "step": 1901 }, { "epoch": 0.3255177135033373, "grad_norm": 20.874591827392578, "learning_rate": 1.0821448944666287e-05, "loss": 2.5421, "step": 1902 }, { "epoch": 0.32568885846311824, "grad_norm": 12.82272720336914, "learning_rate": 1.0827153451226469e-05, "loss": 0.9663, "step": 1903 }, { "epoch": 0.32586000342289917, "grad_norm": 27.367874145507812, "learning_rate": 1.0832857957786652e-05, "loss": 2.6934, "step": 1904 }, { "epoch": 0.3260311483826801, "grad_norm": 31.575483322143555, "learning_rate": 1.0838562464346834e-05, "loss": 3.3276, "step": 1905 }, { "epoch": 0.3262022933424611, "grad_norm": 36.26526641845703, "learning_rate": 1.0844266970907017e-05, "loss": 4.196, "step": 1906 }, { "epoch": 0.326373438302242, "grad_norm": 20.60125160217285, "learning_rate": 1.0849971477467199e-05, "loss": 
1.5247, "step": 1907 }, { "epoch": 0.32654458326202296, "grad_norm": 19.104351043701172, "learning_rate": 1.085567598402738e-05, "loss": 1.9953, "step": 1908 }, { "epoch": 0.3267157282218039, "grad_norm": 31.618993759155273, "learning_rate": 1.0861380490587566e-05, "loss": 3.2496, "step": 1909 }, { "epoch": 0.3268868731815848, "grad_norm": 20.25756072998047, "learning_rate": 1.0867084997147747e-05, "loss": 1.4173, "step": 1910 }, { "epoch": 0.32705801814136576, "grad_norm": 19.579376220703125, "learning_rate": 1.087278950370793e-05, "loss": 1.4559, "step": 1911 }, { "epoch": 0.3272291631011467, "grad_norm": 33.51919174194336, "learning_rate": 1.0878494010268112e-05, "loss": 4.3546, "step": 1912 }, { "epoch": 0.3274003080609276, "grad_norm": 34.54380416870117, "learning_rate": 1.0884198516828294e-05, "loss": 3.8532, "step": 1913 }, { "epoch": 0.32757145302070856, "grad_norm": 43.39759063720703, "learning_rate": 1.0889903023388477e-05, "loss": 5.7, "step": 1914 }, { "epoch": 0.3277425979804895, "grad_norm": 31.343278884887695, "learning_rate": 1.0895607529948659e-05, "loss": 3.6086, "step": 1915 }, { "epoch": 0.3279137429402704, "grad_norm": 37.40540313720703, "learning_rate": 1.0901312036508843e-05, "loss": 3.6012, "step": 1916 }, { "epoch": 0.32808488790005136, "grad_norm": 10.474573135375977, "learning_rate": 1.0907016543069024e-05, "loss": 0.9649, "step": 1917 }, { "epoch": 0.3282560328598323, "grad_norm": 26.88408088684082, "learning_rate": 1.0912721049629208e-05, "loss": 2.6185, "step": 1918 }, { "epoch": 0.3284271778196132, "grad_norm": 24.986539840698242, "learning_rate": 1.091842555618939e-05, "loss": 2.0861, "step": 1919 }, { "epoch": 0.32859832277939416, "grad_norm": 36.754337310791016, "learning_rate": 1.0924130062749573e-05, "loss": 4.4734, "step": 1920 }, { "epoch": 0.3287694677391751, "grad_norm": 36.0711555480957, "learning_rate": 1.0929834569309756e-05, "loss": 3.7612, "step": 1921 }, { "epoch": 0.328940612698956, "grad_norm": 33.72808074951172, 
"learning_rate": 1.0935539075869938e-05, "loss": 3.6817, "step": 1922 }, { "epoch": 0.32911175765873696, "grad_norm": 31.21643829345703, "learning_rate": 1.0941243582430121e-05, "loss": 3.1247, "step": 1923 }, { "epoch": 0.3292829026185179, "grad_norm": 26.2045955657959, "learning_rate": 1.0946948088990303e-05, "loss": 3.1474, "step": 1924 }, { "epoch": 0.3294540475782988, "grad_norm": 30.681350708007812, "learning_rate": 1.0952652595550484e-05, "loss": 3.1958, "step": 1925 }, { "epoch": 0.32962519253807976, "grad_norm": 57.95525360107422, "learning_rate": 1.0958357102110668e-05, "loss": 8.8044, "step": 1926 }, { "epoch": 0.3297963374978607, "grad_norm": 178.06443786621094, "learning_rate": 1.096406160867085e-05, "loss": 8.7701, "step": 1927 }, { "epoch": 0.3299674824576416, "grad_norm": 35.5237922668457, "learning_rate": 1.0969766115231033e-05, "loss": 3.8513, "step": 1928 }, { "epoch": 0.33013862741742256, "grad_norm": 39.186771392822266, "learning_rate": 1.0975470621791215e-05, "loss": 4.4358, "step": 1929 }, { "epoch": 0.3303097723772035, "grad_norm": 25.387964248657227, "learning_rate": 1.0981175128351398e-05, "loss": 2.6496, "step": 1930 }, { "epoch": 0.3304809173369844, "grad_norm": 41.67265319824219, "learning_rate": 1.098687963491158e-05, "loss": 4.5891, "step": 1931 }, { "epoch": 0.33065206229676536, "grad_norm": 36.71438217163086, "learning_rate": 1.0992584141471763e-05, "loss": 4.1564, "step": 1932 }, { "epoch": 0.3308232072565463, "grad_norm": 12.194602012634277, "learning_rate": 1.0998288648031946e-05, "loss": 1.3654, "step": 1933 }, { "epoch": 0.3309943522163272, "grad_norm": 30.5019474029541, "learning_rate": 1.1003993154592128e-05, "loss": 2.9248, "step": 1934 }, { "epoch": 0.33116549717610816, "grad_norm": 30.596206665039062, "learning_rate": 1.1009697661152311e-05, "loss": 3.6483, "step": 1935 }, { "epoch": 0.3313366421358891, "grad_norm": 190.34573364257812, "learning_rate": 1.1015402167712493e-05, "loss": 9.976, "step": 1936 }, { "epoch": 
0.33150778709567, "grad_norm": 23.65143585205078, "learning_rate": 1.1021106674272677e-05, "loss": 2.6501, "step": 1937 }, { "epoch": 0.33167893205545096, "grad_norm": 32.524288177490234, "learning_rate": 1.1026811180832858e-05, "loss": 3.6287, "step": 1938 }, { "epoch": 0.3318500770152319, "grad_norm": 24.90087890625, "learning_rate": 1.103251568739304e-05, "loss": 2.8126, "step": 1939 }, { "epoch": 0.3320212219750128, "grad_norm": 11.670059204101562, "learning_rate": 1.1038220193953223e-05, "loss": 0.9268, "step": 1940 }, { "epoch": 0.33219236693479376, "grad_norm": 20.560199737548828, "learning_rate": 1.1043924700513405e-05, "loss": 2.0298, "step": 1941 }, { "epoch": 0.3323635118945747, "grad_norm": 32.11676788330078, "learning_rate": 1.1049629207073588e-05, "loss": 3.379, "step": 1942 }, { "epoch": 0.3325346568543556, "grad_norm": 31.273881912231445, "learning_rate": 1.1055333713633772e-05, "loss": 3.6115, "step": 1943 }, { "epoch": 0.33270580181413656, "grad_norm": 76.62176513671875, "learning_rate": 1.1061038220193953e-05, "loss": 6.2689, "step": 1944 }, { "epoch": 0.3328769467739175, "grad_norm": 29.79790496826172, "learning_rate": 1.1066742726754137e-05, "loss": 2.9922, "step": 1945 }, { "epoch": 0.3330480917336984, "grad_norm": 28.528804779052734, "learning_rate": 1.1072447233314318e-05, "loss": 3.192, "step": 1946 }, { "epoch": 0.33321923669347936, "grad_norm": 101.99966430664062, "learning_rate": 1.1078151739874502e-05, "loss": 6.9582, "step": 1947 }, { "epoch": 0.3333903816532603, "grad_norm": 33.45838165283203, "learning_rate": 1.1083856246434684e-05, "loss": 4.3572, "step": 1948 }, { "epoch": 0.3335615266130412, "grad_norm": 31.591665267944336, "learning_rate": 1.1089560752994867e-05, "loss": 3.7906, "step": 1949 }, { "epoch": 0.33373267157282216, "grad_norm": 42.0833740234375, "learning_rate": 1.1095265259555049e-05, "loss": 4.95, "step": 1950 }, { "epoch": 0.3339038165326031, "grad_norm": 94.96964263916016, "learning_rate": 1.110096976611523e-05, 
"loss": 6.5888, "step": 1951 }, { "epoch": 0.334074961492384, "grad_norm": 35.450111389160156, "learning_rate": 1.1106674272675414e-05, "loss": 4.8891, "step": 1952 }, { "epoch": 0.33424610645216496, "grad_norm": 32.57542037963867, "learning_rate": 1.1112378779235595e-05, "loss": 4.2762, "step": 1953 }, { "epoch": 0.3344172514119459, "grad_norm": 24.635988235473633, "learning_rate": 1.1118083285795779e-05, "loss": 2.6646, "step": 1954 }, { "epoch": 0.3345883963717268, "grad_norm": 22.50608253479004, "learning_rate": 1.1123787792355962e-05, "loss": 2.1994, "step": 1955 }, { "epoch": 0.33475954133150776, "grad_norm": 35.915611267089844, "learning_rate": 1.1129492298916144e-05, "loss": 4.3539, "step": 1956 }, { "epoch": 0.33493068629128875, "grad_norm": 39.85637283325195, "learning_rate": 1.1135196805476327e-05, "loss": 8.0766, "step": 1957 }, { "epoch": 0.3351018312510697, "grad_norm": 31.60897445678711, "learning_rate": 1.1140901312036509e-05, "loss": 3.5052, "step": 1958 }, { "epoch": 0.3352729762108506, "grad_norm": 10.988346099853516, "learning_rate": 1.1146605818596692e-05, "loss": 2.0192, "step": 1959 }, { "epoch": 0.33544412117063155, "grad_norm": 77.31686401367188, "learning_rate": 1.1152310325156874e-05, "loss": 6.7873, "step": 1960 }, { "epoch": 0.3356152661304125, "grad_norm": 37.3287239074707, "learning_rate": 1.1158014831717057e-05, "loss": 4.5134, "step": 1961 }, { "epoch": 0.3357864110901934, "grad_norm": 28.940874099731445, "learning_rate": 1.1163719338277239e-05, "loss": 3.5488, "step": 1962 }, { "epoch": 0.33595755604997435, "grad_norm": 27.005020141601562, "learning_rate": 1.116942384483742e-05, "loss": 3.4131, "step": 1963 }, { "epoch": 0.3361287010097553, "grad_norm": 23.171354293823242, "learning_rate": 1.1175128351397604e-05, "loss": 3.0202, "step": 1964 }, { "epoch": 0.3362998459695362, "grad_norm": 33.08194351196289, "learning_rate": 1.1180832857957786e-05, "loss": 3.5406, "step": 1965 }, { "epoch": 0.33647099092931715, "grad_norm": 
42.914058685302734, "learning_rate": 1.118653736451797e-05, "loss": 7.7143, "step": 1966 }, { "epoch": 0.3366421358890981, "grad_norm": 6.044030666351318, "learning_rate": 1.1192241871078152e-05, "loss": 1.0934, "step": 1967 }, { "epoch": 0.336813280848879, "grad_norm": 13.652383804321289, "learning_rate": 1.1197946377638336e-05, "loss": 1.2611, "step": 1968 }, { "epoch": 0.33698442580865995, "grad_norm": 120.25743103027344, "learning_rate": 1.1203650884198518e-05, "loss": 6.9692, "step": 1969 }, { "epoch": 0.3371555707684409, "grad_norm": 138.58935546875, "learning_rate": 1.12093553907587e-05, "loss": 6.7316, "step": 1970 }, { "epoch": 0.3373267157282218, "grad_norm": 30.030006408691406, "learning_rate": 1.1215059897318883e-05, "loss": 4.1817, "step": 1971 }, { "epoch": 0.33749786068800275, "grad_norm": 9.535407066345215, "learning_rate": 1.1220764403879064e-05, "loss": 0.9512, "step": 1972 }, { "epoch": 0.3376690056477837, "grad_norm": 25.748254776000977, "learning_rate": 1.1226468910439248e-05, "loss": 3.1973, "step": 1973 }, { "epoch": 0.3378401506075646, "grad_norm": 29.184724807739258, "learning_rate": 1.123217341699943e-05, "loss": 3.5403, "step": 1974 }, { "epoch": 0.33801129556734555, "grad_norm": 36.09633255004883, "learning_rate": 1.1237877923559611e-05, "loss": 4.1013, "step": 1975 }, { "epoch": 0.3381824405271265, "grad_norm": 31.967252731323242, "learning_rate": 1.1243582430119794e-05, "loss": 3.2354, "step": 1976 }, { "epoch": 0.3383535854869074, "grad_norm": 38.74686813354492, "learning_rate": 1.1249286936679976e-05, "loss": 4.5663, "step": 1977 }, { "epoch": 0.33852473044668835, "grad_norm": 30.3746395111084, "learning_rate": 1.1254991443240161e-05, "loss": 3.3973, "step": 1978 }, { "epoch": 0.3386958754064693, "grad_norm": 11.366987228393555, "learning_rate": 1.1260695949800343e-05, "loss": 0.8323, "step": 1979 }, { "epoch": 0.3388670203662502, "grad_norm": 20.15157699584961, "learning_rate": 1.1266400456360526e-05, "loss": 1.5111, "step": 1980 }, 
{ "epoch": 0.33903816532603115, "grad_norm": 25.638330459594727, "learning_rate": 1.1272104962920708e-05, "loss": 2.7039, "step": 1981 }, { "epoch": 0.3392093102858121, "grad_norm": 30.38153839111328, "learning_rate": 1.127780946948089e-05, "loss": 3.6275, "step": 1982 }, { "epoch": 0.339380455245593, "grad_norm": 31.235469818115234, "learning_rate": 1.1283513976041073e-05, "loss": 4.032, "step": 1983 }, { "epoch": 0.33955160020537395, "grad_norm": 36.95757293701172, "learning_rate": 1.1289218482601255e-05, "loss": 4.052, "step": 1984 }, { "epoch": 0.3397227451651549, "grad_norm": 5.83810567855835, "learning_rate": 1.1294922989161438e-05, "loss": 0.7531, "step": 1985 }, { "epoch": 0.3398938901249358, "grad_norm": 187.32872009277344, "learning_rate": 1.130062749572162e-05, "loss": 8.1223, "step": 1986 }, { "epoch": 0.34006503508471675, "grad_norm": 10.221015930175781, "learning_rate": 1.1306332002281803e-05, "loss": 1.3128, "step": 1987 }, { "epoch": 0.3402361800444977, "grad_norm": 23.46990203857422, "learning_rate": 1.1312036508841985e-05, "loss": 2.2877, "step": 1988 }, { "epoch": 0.3404073250042786, "grad_norm": 204.71218872070312, "learning_rate": 1.1317741015402168e-05, "loss": 9.0911, "step": 1989 }, { "epoch": 0.34057846996405955, "grad_norm": 11.691418647766113, "learning_rate": 1.1323445521962352e-05, "loss": 2.0669, "step": 1990 }, { "epoch": 0.3407496149238405, "grad_norm": 34.32474899291992, "learning_rate": 1.1329150028522533e-05, "loss": 3.8131, "step": 1991 }, { "epoch": 0.3409207598836214, "grad_norm": 15.316189765930176, "learning_rate": 1.1334854535082717e-05, "loss": 1.4449, "step": 1992 }, { "epoch": 0.34109190484340235, "grad_norm": 33.847110748291016, "learning_rate": 1.1340559041642898e-05, "loss": 3.6209, "step": 1993 }, { "epoch": 0.3412630498031833, "grad_norm": 30.83047103881836, "learning_rate": 1.134626354820308e-05, "loss": 3.3044, "step": 1994 }, { "epoch": 0.3414341947629642, "grad_norm": 23.169050216674805, "learning_rate": 
1.1351968054763263e-05, "loss": 2.7778, "step": 1995 }, { "epoch": 0.34160533972274515, "grad_norm": 28.009946823120117, "learning_rate": 1.1357672561323445e-05, "loss": 2.5658, "step": 1996 }, { "epoch": 0.3417764846825261, "grad_norm": 24.620206832885742, "learning_rate": 1.1363377067883628e-05, "loss": 2.8611, "step": 1997 }, { "epoch": 0.341947629642307, "grad_norm": 35.302894592285156, "learning_rate": 1.136908157444381e-05, "loss": 3.8368, "step": 1998 }, { "epoch": 0.34211877460208795, "grad_norm": 48.49169921875, "learning_rate": 1.1374786081003993e-05, "loss": 8.3039, "step": 1999 }, { "epoch": 0.3422899195618689, "grad_norm": 26.473003387451172, "learning_rate": 1.1380490587564177e-05, "loss": 2.6571, "step": 2000 }, { "epoch": 0.3424610645216498, "grad_norm": 8.975080490112305, "learning_rate": 1.1386195094124359e-05, "loss": 0.8311, "step": 2001 }, { "epoch": 0.34263220948143075, "grad_norm": 29.154399871826172, "learning_rate": 1.1391899600684542e-05, "loss": 3.3092, "step": 2002 }, { "epoch": 0.3428033544412117, "grad_norm": 9.116958618164062, "learning_rate": 1.1397604107244724e-05, "loss": 1.109, "step": 2003 }, { "epoch": 0.3429744994009926, "grad_norm": 150.9268341064453, "learning_rate": 1.1403308613804907e-05, "loss": 6.7063, "step": 2004 }, { "epoch": 0.34314564436077355, "grad_norm": 28.97213363647461, "learning_rate": 1.1409013120365089e-05, "loss": 3.4316, "step": 2005 }, { "epoch": 0.3433167893205545, "grad_norm": 35.343074798583984, "learning_rate": 1.1414717626925272e-05, "loss": 4.1921, "step": 2006 }, { "epoch": 0.34348793428033547, "grad_norm": 26.21539306640625, "learning_rate": 1.1420422133485454e-05, "loss": 2.8775, "step": 2007 }, { "epoch": 0.3436590792401164, "grad_norm": 24.8580322265625, "learning_rate": 1.1426126640045635e-05, "loss": 2.7428, "step": 2008 }, { "epoch": 0.34383022419989734, "grad_norm": 18.229679107666016, "learning_rate": 1.1431831146605819e-05, "loss": 2.1508, "step": 2009 }, { "epoch": 0.34400136915967827, 
"grad_norm": 12.01388168334961, "learning_rate": 1.1437535653166e-05, "loss": 1.002, "step": 2010 }, { "epoch": 0.3441725141194592, "grad_norm": 101.5674819946289, "learning_rate": 1.1443240159726184e-05, "loss": 6.9708, "step": 2011 }, { "epoch": 0.34434365907924014, "grad_norm": 135.65138244628906, "learning_rate": 1.1448944666286367e-05, "loss": 6.0953, "step": 2012 }, { "epoch": 0.34451480403902107, "grad_norm": 28.10844612121582, "learning_rate": 1.1454649172846549e-05, "loss": 3.5016, "step": 2013 }, { "epoch": 0.344685948998802, "grad_norm": 31.837894439697266, "learning_rate": 1.1460353679406732e-05, "loss": 3.2448, "step": 2014 }, { "epoch": 0.34485709395858294, "grad_norm": 28.26076889038086, "learning_rate": 1.1466058185966914e-05, "loss": 3.1378, "step": 2015 }, { "epoch": 0.34502823891836387, "grad_norm": 32.99501419067383, "learning_rate": 1.1471762692527097e-05, "loss": 3.4328, "step": 2016 }, { "epoch": 0.3451993838781448, "grad_norm": 31.268230438232422, "learning_rate": 1.1477467199087279e-05, "loss": 4.0378, "step": 2017 }, { "epoch": 0.34537052883792574, "grad_norm": 32.19254684448242, "learning_rate": 1.1483171705647462e-05, "loss": 4.356, "step": 2018 }, { "epoch": 0.34554167379770667, "grad_norm": 28.953779220581055, "learning_rate": 1.1488876212207644e-05, "loss": 3.8967, "step": 2019 }, { "epoch": 0.3457128187574876, "grad_norm": 26.264999389648438, "learning_rate": 1.1494580718767826e-05, "loss": 2.7881, "step": 2020 }, { "epoch": 0.34588396371726854, "grad_norm": 21.80779457092285, "learning_rate": 1.150028522532801e-05, "loss": 2.0569, "step": 2021 }, { "epoch": 0.34605510867704947, "grad_norm": 5.897726535797119, "learning_rate": 1.1505989731888191e-05, "loss": 0.6854, "step": 2022 }, { "epoch": 0.3462262536368304, "grad_norm": 18.685945510864258, "learning_rate": 1.1511694238448376e-05, "loss": 1.7189, "step": 2023 }, { "epoch": 0.34639739859661134, "grad_norm": 16.55164909362793, "learning_rate": 1.1517398745008558e-05, "loss": 
1.6266, "step": 2024 }, { "epoch": 0.3465685435563923, "grad_norm": 26.497346878051758, "learning_rate": 1.152310325156874e-05, "loss": 3.1355, "step": 2025 }, { "epoch": 0.3467396885161732, "grad_norm": 36.22391128540039, "learning_rate": 1.1528807758128923e-05, "loss": 4.2871, "step": 2026 }, { "epoch": 0.34691083347595414, "grad_norm": 25.69757080078125, "learning_rate": 1.1534512264689104e-05, "loss": 2.4604, "step": 2027 }, { "epoch": 0.3470819784357351, "grad_norm": 34.47371292114258, "learning_rate": 1.1540216771249288e-05, "loss": 4.5727, "step": 2028 }, { "epoch": 0.347253123395516, "grad_norm": 25.829330444335938, "learning_rate": 1.154592127780947e-05, "loss": 2.3708, "step": 2029 }, { "epoch": 0.34742426835529694, "grad_norm": 23.152074813842773, "learning_rate": 1.1551625784369653e-05, "loss": 2.5885, "step": 2030 }, { "epoch": 0.3475954133150779, "grad_norm": 33.27009582519531, "learning_rate": 1.1557330290929834e-05, "loss": 4.0326, "step": 2031 }, { "epoch": 0.3477665582748588, "grad_norm": 11.642922401428223, "learning_rate": 1.1563034797490016e-05, "loss": 1.3036, "step": 2032 }, { "epoch": 0.34793770323463974, "grad_norm": 16.035924911499023, "learning_rate": 1.15687393040502e-05, "loss": 1.3584, "step": 2033 }, { "epoch": 0.3481088481944207, "grad_norm": 38.5884895324707, "learning_rate": 1.1574443810610381e-05, "loss": 5.2381, "step": 2034 }, { "epoch": 0.3482799931542016, "grad_norm": 34.79248046875, "learning_rate": 1.1580148317170566e-05, "loss": 3.4977, "step": 2035 }, { "epoch": 0.34845113811398254, "grad_norm": 24.086618423461914, "learning_rate": 1.1585852823730748e-05, "loss": 2.489, "step": 2036 }, { "epoch": 0.3486222830737635, "grad_norm": 17.970691680908203, "learning_rate": 1.1591557330290931e-05, "loss": 1.2174, "step": 2037 }, { "epoch": 0.3487934280335444, "grad_norm": 27.199962615966797, "learning_rate": 1.1597261836851113e-05, "loss": 2.4304, "step": 2038 }, { "epoch": 0.34896457299332534, "grad_norm": 36.157230377197266, 
"learning_rate": 1.1602966343411295e-05, "loss": 4.5914, "step": 2039 }, { "epoch": 0.3491357179531063, "grad_norm": 30.98073387145996, "learning_rate": 1.1608670849971478e-05, "loss": 3.1108, "step": 2040 }, { "epoch": 0.3493068629128872, "grad_norm": 4.110781192779541, "learning_rate": 1.161437535653166e-05, "loss": 0.6784, "step": 2041 }, { "epoch": 0.34947800787266814, "grad_norm": 7.259744644165039, "learning_rate": 1.1620079863091843e-05, "loss": 0.7546, "step": 2042 }, { "epoch": 0.3496491528324491, "grad_norm": 9.056280136108398, "learning_rate": 1.1625784369652025e-05, "loss": 0.8102, "step": 2043 }, { "epoch": 0.34982029779223, "grad_norm": 17.079927444458008, "learning_rate": 1.1631488876212207e-05, "loss": 1.8825, "step": 2044 }, { "epoch": 0.34999144275201094, "grad_norm": 5.583414077758789, "learning_rate": 1.163719338277239e-05, "loss": 0.6958, "step": 2045 }, { "epoch": 0.3501625877117919, "grad_norm": 32.52211380004883, "learning_rate": 1.1642897889332573e-05, "loss": 3.8308, "step": 2046 }, { "epoch": 0.3503337326715728, "grad_norm": 8.453152656555176, "learning_rate": 1.1648602395892757e-05, "loss": 0.9997, "step": 2047 }, { "epoch": 0.35050487763135374, "grad_norm": 17.828163146972656, "learning_rate": 1.1654306902452938e-05, "loss": 2.0197, "step": 2048 }, { "epoch": 0.3506760225911347, "grad_norm": 33.86958312988281, "learning_rate": 1.1660011409013122e-05, "loss": 3.5889, "step": 2049 }, { "epoch": 0.3508471675509156, "grad_norm": 39.53785705566406, "learning_rate": 1.1665715915573303e-05, "loss": 4.3322, "step": 2050 }, { "epoch": 0.35101831251069654, "grad_norm": 119.68132019042969, "learning_rate": 1.1671420422133485e-05, "loss": 8.5534, "step": 2051 }, { "epoch": 0.3511894574704775, "grad_norm": 20.703731536865234, "learning_rate": 1.1677124928693669e-05, "loss": 1.9145, "step": 2052 }, { "epoch": 0.3513606024302584, "grad_norm": 32.62479019165039, "learning_rate": 1.168282943525385e-05, "loss": 3.4411, "step": 2053 }, { "epoch": 
0.35153174739003934, "grad_norm": 28.38721466064453, "learning_rate": 1.1688533941814034e-05, "loss": 2.913, "step": 2054 }, { "epoch": 0.3517028923498203, "grad_norm": 11.139078140258789, "learning_rate": 1.1694238448374215e-05, "loss": 1.2331, "step": 2055 }, { "epoch": 0.3518740373096012, "grad_norm": 36.095458984375, "learning_rate": 1.1699942954934399e-05, "loss": 4.4497, "step": 2056 }, { "epoch": 0.35204518226938214, "grad_norm": 17.7105655670166, "learning_rate": 1.170564746149458e-05, "loss": 1.341, "step": 2057 }, { "epoch": 0.35221632722916313, "grad_norm": 34.70029067993164, "learning_rate": 1.1711351968054764e-05, "loss": 3.8577, "step": 2058 }, { "epoch": 0.35238747218894406, "grad_norm": 30.967939376831055, "learning_rate": 1.1717056474614947e-05, "loss": 3.5998, "step": 2059 }, { "epoch": 0.352558617148725, "grad_norm": 175.67909240722656, "learning_rate": 1.1722760981175129e-05, "loss": 7.5725, "step": 2060 }, { "epoch": 0.35272976210850593, "grad_norm": 14.09093189239502, "learning_rate": 1.1728465487735312e-05, "loss": 1.1863, "step": 2061 }, { "epoch": 0.35290090706828686, "grad_norm": 16.4505672454834, "learning_rate": 1.1734169994295494e-05, "loss": 1.3923, "step": 2062 }, { "epoch": 0.3530720520280678, "grad_norm": 30.69254493713379, "learning_rate": 1.1739874500855676e-05, "loss": 4.0609, "step": 2063 }, { "epoch": 0.35324319698784873, "grad_norm": 35.82154846191406, "learning_rate": 1.1745579007415859e-05, "loss": 4.1915, "step": 2064 }, { "epoch": 0.35341434194762966, "grad_norm": 34.619754791259766, "learning_rate": 1.175128351397604e-05, "loss": 4.8903, "step": 2065 }, { "epoch": 0.3535854869074106, "grad_norm": 13.456661224365234, "learning_rate": 1.1756988020536224e-05, "loss": 1.4971, "step": 2066 }, { "epoch": 0.35375663186719153, "grad_norm": 34.76420974731445, "learning_rate": 1.1762692527096406e-05, "loss": 3.9249, "step": 2067 }, { "epoch": 0.35392777682697246, "grad_norm": 11.180761337280273, "learning_rate": 
1.1768397033656589e-05, "loss": 0.7, "step": 2068 }, { "epoch": 0.3540989217867534, "grad_norm": 195.03485107421875, "learning_rate": 1.1774101540216772e-05, "loss": 6.9708, "step": 2069 }, { "epoch": 0.35427006674653433, "grad_norm": 34.15081787109375, "learning_rate": 1.1779806046776954e-05, "loss": 4.0197, "step": 2070 }, { "epoch": 0.35444121170631526, "grad_norm": 44.15553283691406, "learning_rate": 1.1785510553337137e-05, "loss": 8.034, "step": 2071 }, { "epoch": 0.3546123566660962, "grad_norm": 36.1580924987793, "learning_rate": 1.1791215059897319e-05, "loss": 4.3774, "step": 2072 }, { "epoch": 0.35478350162587713, "grad_norm": 37.583351135253906, "learning_rate": 1.1796919566457503e-05, "loss": 5.0443, "step": 2073 }, { "epoch": 0.35495464658565806, "grad_norm": 7.443456172943115, "learning_rate": 1.1802624073017684e-05, "loss": 0.7081, "step": 2074 }, { "epoch": 0.355125791545439, "grad_norm": 27.195236206054688, "learning_rate": 1.1808328579577866e-05, "loss": 2.7896, "step": 2075 }, { "epoch": 0.35529693650521993, "grad_norm": 10.81725788116455, "learning_rate": 1.181403308613805e-05, "loss": 2.1049, "step": 2076 }, { "epoch": 0.35546808146500086, "grad_norm": 32.889869689941406, "learning_rate": 1.1819737592698231e-05, "loss": 3.9205, "step": 2077 }, { "epoch": 0.3556392264247818, "grad_norm": 119.37525939941406, "learning_rate": 1.1825442099258414e-05, "loss": 7.0729, "step": 2078 }, { "epoch": 0.35581037138456273, "grad_norm": 13.211540222167969, "learning_rate": 1.1831146605818596e-05, "loss": 1.5046, "step": 2079 }, { "epoch": 0.35598151634434366, "grad_norm": 29.677011489868164, "learning_rate": 1.183685111237878e-05, "loss": 3.4441, "step": 2080 }, { "epoch": 0.3561526613041246, "grad_norm": 116.09097290039062, "learning_rate": 1.1842555618938963e-05, "loss": 6.9657, "step": 2081 }, { "epoch": 0.35632380626390553, "grad_norm": 36.9529914855957, "learning_rate": 1.1848260125499144e-05, "loss": 5.1966, "step": 2082 }, { "epoch": 0.35649495122368646, 
"grad_norm": 32.45378112792969, "learning_rate": 1.1853964632059328e-05, "loss": 3.8259, "step": 2083 }, { "epoch": 0.3566660961834674, "grad_norm": 28.279193878173828, "learning_rate": 1.185966913861951e-05, "loss": 3.0802, "step": 2084 }, { "epoch": 0.35683724114324833, "grad_norm": 16.36111831665039, "learning_rate": 1.1865373645179693e-05, "loss": 1.6254, "step": 2085 }, { "epoch": 0.35700838610302926, "grad_norm": 33.62881851196289, "learning_rate": 1.1871078151739875e-05, "loss": 3.482, "step": 2086 }, { "epoch": 0.3571795310628102, "grad_norm": 22.785282135009766, "learning_rate": 1.1876782658300058e-05, "loss": 2.5492, "step": 2087 }, { "epoch": 0.35735067602259113, "grad_norm": 18.783733367919922, "learning_rate": 1.188248716486024e-05, "loss": 2.1471, "step": 2088 }, { "epoch": 0.35752182098237206, "grad_norm": 25.175399780273438, "learning_rate": 1.1888191671420421e-05, "loss": 2.901, "step": 2089 }, { "epoch": 0.357692965942153, "grad_norm": 32.070228576660156, "learning_rate": 1.1893896177980605e-05, "loss": 4.0126, "step": 2090 }, { "epoch": 0.35786411090193393, "grad_norm": 30.165206909179688, "learning_rate": 1.1899600684540786e-05, "loss": 3.1196, "step": 2091 }, { "epoch": 0.35803525586171486, "grad_norm": 25.695375442504883, "learning_rate": 1.1905305191100971e-05, "loss": 2.5124, "step": 2092 }, { "epoch": 0.3582064008214958, "grad_norm": 7.505849838256836, "learning_rate": 1.1911009697661153e-05, "loss": 1.0043, "step": 2093 }, { "epoch": 0.35837754578127673, "grad_norm": 28.15729522705078, "learning_rate": 1.1916714204221335e-05, "loss": 3.8256, "step": 2094 }, { "epoch": 0.35854869074105766, "grad_norm": 15.077316284179688, "learning_rate": 1.1922418710781518e-05, "loss": 0.9039, "step": 2095 }, { "epoch": 0.3587198357008386, "grad_norm": 11.068819999694824, "learning_rate": 1.19281232173417e-05, "loss": 0.9256, "step": 2096 }, { "epoch": 0.35889098066061953, "grad_norm": 30.34836769104004, "learning_rate": 1.1933827723901883e-05, "loss": 
3.3198, "step": 2097 }, { "epoch": 0.35906212562040046, "grad_norm": 92.60661315917969, "learning_rate": 1.1939532230462065e-05, "loss": 5.7395, "step": 2098 }, { "epoch": 0.3592332705801814, "grad_norm": 26.518394470214844, "learning_rate": 1.1945236737022248e-05, "loss": 2.7506, "step": 2099 }, { "epoch": 0.35940441553996233, "grad_norm": 4.0069780349731445, "learning_rate": 1.195094124358243e-05, "loss": 0.622, "step": 2100 }, { "epoch": 0.35957556049974326, "grad_norm": 25.66058349609375, "learning_rate": 1.1956645750142612e-05, "loss": 2.4436, "step": 2101 }, { "epoch": 0.3597467054595242, "grad_norm": 16.090246200561523, "learning_rate": 1.1962350256702795e-05, "loss": 1.4181, "step": 2102 }, { "epoch": 0.35991785041930513, "grad_norm": 9.653539657592773, "learning_rate": 1.1968054763262978e-05, "loss": 1.1303, "step": 2103 }, { "epoch": 0.36008899537908606, "grad_norm": 26.997007369995117, "learning_rate": 1.1973759269823162e-05, "loss": 2.8454, "step": 2104 }, { "epoch": 0.360260140338867, "grad_norm": 35.292945861816406, "learning_rate": 1.1979463776383344e-05, "loss": 4.4265, "step": 2105 }, { "epoch": 0.36043128529864793, "grad_norm": 9.962848663330078, "learning_rate": 1.1985168282943527e-05, "loss": 1.1083, "step": 2106 }, { "epoch": 0.36060243025842886, "grad_norm": 21.34442138671875, "learning_rate": 1.1990872789503709e-05, "loss": 1.9815, "step": 2107 }, { "epoch": 0.3607735752182098, "grad_norm": 63.102256774902344, "learning_rate": 1.199657729606389e-05, "loss": 8.2906, "step": 2108 }, { "epoch": 0.3609447201779908, "grad_norm": 31.640159606933594, "learning_rate": 1.2002281802624074e-05, "loss": 3.9734, "step": 2109 }, { "epoch": 0.3611158651377717, "grad_norm": 29.008909225463867, "learning_rate": 1.2007986309184255e-05, "loss": 2.8619, "step": 2110 }, { "epoch": 0.36128701009755265, "grad_norm": 158.99563598632812, "learning_rate": 1.2013690815744439e-05, "loss": 8.8876, "step": 2111 }, { "epoch": 0.3614581550573336, "grad_norm": 
12.028635025024414, "learning_rate": 1.201939532230462e-05, "loss": 1.1747, "step": 2112 }, { "epoch": 0.3616293000171145, "grad_norm": 49.29413986206055, "learning_rate": 1.2025099828864802e-05, "loss": 8.4677, "step": 2113 }, { "epoch": 0.36180044497689545, "grad_norm": 35.586788177490234, "learning_rate": 1.2030804335424985e-05, "loss": 4.3141, "step": 2114 }, { "epoch": 0.3619715899366764, "grad_norm": 15.967235565185547, "learning_rate": 1.2036508841985169e-05, "loss": 1.4648, "step": 2115 }, { "epoch": 0.3621427348964573, "grad_norm": 116.31715393066406, "learning_rate": 1.2042213348545352e-05, "loss": 5.9115, "step": 2116 }, { "epoch": 0.36231387985623825, "grad_norm": 39.9970703125, "learning_rate": 1.2047917855105534e-05, "loss": 5.2751, "step": 2117 }, { "epoch": 0.3624850248160192, "grad_norm": 15.636171340942383, "learning_rate": 1.2053622361665717e-05, "loss": 1.1331, "step": 2118 }, { "epoch": 0.3626561697758001, "grad_norm": 29.51291847229004, "learning_rate": 1.2059326868225899e-05, "loss": 3.0782, "step": 2119 }, { "epoch": 0.36282731473558105, "grad_norm": 33.99169921875, "learning_rate": 1.206503137478608e-05, "loss": 3.4875, "step": 2120 }, { "epoch": 0.362998459695362, "grad_norm": 8.469818115234375, "learning_rate": 1.2070735881346264e-05, "loss": 0.9351, "step": 2121 }, { "epoch": 0.3631696046551429, "grad_norm": 87.96151733398438, "learning_rate": 1.2076440387906446e-05, "loss": 5.0553, "step": 2122 }, { "epoch": 0.36334074961492385, "grad_norm": 11.59670352935791, "learning_rate": 1.2082144894466629e-05, "loss": 1.314, "step": 2123 }, { "epoch": 0.3635118945747048, "grad_norm": 7.859058856964111, "learning_rate": 1.208784940102681e-05, "loss": 0.9692, "step": 2124 }, { "epoch": 0.3636830395344857, "grad_norm": 48.24964904785156, "learning_rate": 1.2093553907586992e-05, "loss": 5.6168, "step": 2125 }, { "epoch": 0.36385418449426665, "grad_norm": 35.264366149902344, "learning_rate": 1.2099258414147178e-05, "loss": 3.7475, "step": 2126 }, { 
"epoch": 0.3640253294540476, "grad_norm": 30.4807071685791, "learning_rate": 1.210496292070736e-05, "loss": 3.6681, "step": 2127 }, { "epoch": 0.3641964744138285, "grad_norm": 37.583274841308594, "learning_rate": 1.2110667427267543e-05, "loss": 7.6763, "step": 2128 }, { "epoch": 0.36436761937360945, "grad_norm": 10.553574562072754, "learning_rate": 1.2116371933827724e-05, "loss": 0.7119, "step": 2129 }, { "epoch": 0.3645387643333904, "grad_norm": 25.893739700317383, "learning_rate": 1.2122076440387908e-05, "loss": 2.7102, "step": 2130 }, { "epoch": 0.3647099092931713, "grad_norm": 37.81182861328125, "learning_rate": 1.212778094694809e-05, "loss": 7.7056, "step": 2131 }, { "epoch": 0.36488105425295225, "grad_norm": 24.436336517333984, "learning_rate": 1.2133485453508271e-05, "loss": 3.0385, "step": 2132 }, { "epoch": 0.3650521992127332, "grad_norm": 33.72613525390625, "learning_rate": 1.2139189960068454e-05, "loss": 3.625, "step": 2133 }, { "epoch": 0.3652233441725141, "grad_norm": 29.429370880126953, "learning_rate": 1.2144894466628636e-05, "loss": 3.2735, "step": 2134 }, { "epoch": 0.36539448913229505, "grad_norm": 29.37833595275879, "learning_rate": 1.215059897318882e-05, "loss": 3.3102, "step": 2135 }, { "epoch": 0.365565634092076, "grad_norm": 4.678672790527344, "learning_rate": 1.2156303479749001e-05, "loss": 0.6167, "step": 2136 }, { "epoch": 0.3657367790518569, "grad_norm": 13.350298881530762, "learning_rate": 1.2162007986309185e-05, "loss": 0.9838, "step": 2137 }, { "epoch": 0.36590792401163785, "grad_norm": 197.19981384277344, "learning_rate": 1.2167712492869368e-05, "loss": 10.1095, "step": 2138 }, { "epoch": 0.3660790689714188, "grad_norm": 33.24477767944336, "learning_rate": 1.217341699942955e-05, "loss": 3.6562, "step": 2139 }, { "epoch": 0.3662502139311997, "grad_norm": 31.698823928833008, "learning_rate": 1.2179121505989733e-05, "loss": 3.1984, "step": 2140 }, { "epoch": 0.36642135889098065, "grad_norm": 28.302553176879883, "learning_rate": 
1.2184826012549915e-05, "loss": 2.9794, "step": 2141 }, { "epoch": 0.3665925038507616, "grad_norm": 26.840988159179688, "learning_rate": 1.2190530519110098e-05, "loss": 3.1451, "step": 2142 }, { "epoch": 0.3667636488105425, "grad_norm": 10.02106761932373, "learning_rate": 1.219623502567028e-05, "loss": 1.7728, "step": 2143 }, { "epoch": 0.36693479377032345, "grad_norm": 19.4163761138916, "learning_rate": 1.2201939532230461e-05, "loss": 1.4892, "step": 2144 }, { "epoch": 0.3671059387301044, "grad_norm": 117.40380096435547, "learning_rate": 1.2207644038790645e-05, "loss": 6.037, "step": 2145 }, { "epoch": 0.3672770836898853, "grad_norm": 36.802330017089844, "learning_rate": 1.2213348545350826e-05, "loss": 3.9472, "step": 2146 }, { "epoch": 0.36744822864966625, "grad_norm": 26.534914016723633, "learning_rate": 1.221905305191101e-05, "loss": 2.9076, "step": 2147 }, { "epoch": 0.3676193736094472, "grad_norm": 8.252175331115723, "learning_rate": 1.2224757558471192e-05, "loss": 1.7274, "step": 2148 }, { "epoch": 0.3677905185692281, "grad_norm": 36.72080993652344, "learning_rate": 1.2230462065031377e-05, "loss": 3.9691, "step": 2149 }, { "epoch": 0.36796166352900905, "grad_norm": 31.389694213867188, "learning_rate": 1.2236166571591558e-05, "loss": 3.662, "step": 2150 }, { "epoch": 0.36813280848879, "grad_norm": 17.889827728271484, "learning_rate": 1.224187107815174e-05, "loss": 1.3585, "step": 2151 }, { "epoch": 0.3683039534485709, "grad_norm": 37.55808639526367, "learning_rate": 1.2247575584711923e-05, "loss": 3.733, "step": 2152 }, { "epoch": 0.36847509840835185, "grad_norm": 28.830768585205078, "learning_rate": 1.2253280091272105e-05, "loss": 2.596, "step": 2153 }, { "epoch": 0.3686462433681328, "grad_norm": 11.456624031066895, "learning_rate": 1.2258984597832288e-05, "loss": 0.6827, "step": 2154 }, { "epoch": 0.3688173883279137, "grad_norm": 29.137744903564453, "learning_rate": 1.226468910439247e-05, "loss": 3.4631, "step": 2155 }, { "epoch": 0.36898853328769465, 
"grad_norm": 27.315082550048828, "learning_rate": 1.2270393610952653e-05, "loss": 2.4743, "step": 2156 }, { "epoch": 0.3691596782474756, "grad_norm": 15.013253211975098, "learning_rate": 1.2276098117512835e-05, "loss": 1.2812, "step": 2157 }, { "epoch": 0.3693308232072565, "grad_norm": 33.02097702026367, "learning_rate": 1.2281802624073017e-05, "loss": 3.1825, "step": 2158 }, { "epoch": 0.36950196816703745, "grad_norm": 37.75695037841797, "learning_rate": 1.22875071306332e-05, "loss": 3.9585, "step": 2159 }, { "epoch": 0.36967311312681844, "grad_norm": 33.56565475463867, "learning_rate": 1.2293211637193382e-05, "loss": 3.9576, "step": 2160 }, { "epoch": 0.3698442580865994, "grad_norm": 8.82251262664795, "learning_rate": 1.2298916143753567e-05, "loss": 1.045, "step": 2161 }, { "epoch": 0.3700154030463803, "grad_norm": 26.975778579711914, "learning_rate": 1.2304620650313749e-05, "loss": 2.5674, "step": 2162 }, { "epoch": 0.37018654800616124, "grad_norm": 136.73780822753906, "learning_rate": 1.231032515687393e-05, "loss": 5.8467, "step": 2163 }, { "epoch": 0.3703576929659422, "grad_norm": 29.269546508789062, "learning_rate": 1.2316029663434114e-05, "loss": 2.9019, "step": 2164 }, { "epoch": 0.3705288379257231, "grad_norm": 31.114402770996094, "learning_rate": 1.2321734169994295e-05, "loss": 3.803, "step": 2165 }, { "epoch": 0.37069998288550404, "grad_norm": 28.02252769470215, "learning_rate": 1.2327438676554479e-05, "loss": 3.334, "step": 2166 }, { "epoch": 0.370871127845285, "grad_norm": 36.24296951293945, "learning_rate": 1.233314318311466e-05, "loss": 4.0221, "step": 2167 }, { "epoch": 0.3710422728050659, "grad_norm": 42.49361801147461, "learning_rate": 1.2338847689674844e-05, "loss": 4.4893, "step": 2168 }, { "epoch": 0.37121341776484684, "grad_norm": 31.110870361328125, "learning_rate": 1.2344552196235026e-05, "loss": 3.2998, "step": 2169 }, { "epoch": 0.3713845627246278, "grad_norm": 38.54166030883789, "learning_rate": 1.2350256702795207e-05, "loss": 3.9307, 
"step": 2170 }, { "epoch": 0.3715557076844087, "grad_norm": 31.027143478393555, "learning_rate": 1.235596120935539e-05, "loss": 3.3663, "step": 2171 }, { "epoch": 0.37172685264418964, "grad_norm": 19.778564453125, "learning_rate": 1.2361665715915574e-05, "loss": 1.4132, "step": 2172 }, { "epoch": 0.3718979976039706, "grad_norm": 6.935482025146484, "learning_rate": 1.2367370222475757e-05, "loss": 0.7138, "step": 2173 }, { "epoch": 0.3720691425637515, "grad_norm": 17.002243041992188, "learning_rate": 1.2373074729035939e-05, "loss": 1.363, "step": 2174 }, { "epoch": 0.37224028752353244, "grad_norm": 36.1330451965332, "learning_rate": 1.237877923559612e-05, "loss": 4.304, "step": 2175 }, { "epoch": 0.3724114324833134, "grad_norm": 37.96760940551758, "learning_rate": 1.2384483742156304e-05, "loss": 4.1877, "step": 2176 }, { "epoch": 0.3725825774430943, "grad_norm": 37.3785400390625, "learning_rate": 1.2390188248716486e-05, "loss": 4.2806, "step": 2177 }, { "epoch": 0.37275372240287524, "grad_norm": 124.93565368652344, "learning_rate": 1.239589275527667e-05, "loss": 5.4911, "step": 2178 }, { "epoch": 0.3729248673626562, "grad_norm": 28.42656707763672, "learning_rate": 1.2401597261836851e-05, "loss": 2.685, "step": 2179 }, { "epoch": 0.3730960123224371, "grad_norm": 44.78040313720703, "learning_rate": 1.2407301768397034e-05, "loss": 8.0281, "step": 2180 }, { "epoch": 0.37326715728221804, "grad_norm": 106.5615005493164, "learning_rate": 1.2413006274957216e-05, "loss": 9.7692, "step": 2181 }, { "epoch": 0.373438302241999, "grad_norm": 32.70700454711914, "learning_rate": 1.2418710781517398e-05, "loss": 3.7167, "step": 2182 }, { "epoch": 0.3736094472017799, "grad_norm": 27.95832633972168, "learning_rate": 1.2424415288077583e-05, "loss": 3.4558, "step": 2183 }, { "epoch": 0.37378059216156084, "grad_norm": 51.62168502807617, "learning_rate": 1.2430119794637764e-05, "loss": 7.8843, "step": 2184 }, { "epoch": 0.3739517371213418, "grad_norm": 22.549152374267578, "learning_rate": 
1.2435824301197948e-05, "loss": 2.2902, "step": 2185 }, { "epoch": 0.3741228820811227, "grad_norm": 49.26498031616211, "learning_rate": 1.244152880775813e-05, "loss": 7.9836, "step": 2186 }, { "epoch": 0.37429402704090364, "grad_norm": 32.918434143066406, "learning_rate": 1.2447233314318313e-05, "loss": 3.7321, "step": 2187 }, { "epoch": 0.3744651720006846, "grad_norm": 115.87164306640625, "learning_rate": 1.2452937820878495e-05, "loss": 6.1209, "step": 2188 }, { "epoch": 0.3746363169604655, "grad_norm": 32.60509490966797, "learning_rate": 1.2458642327438676e-05, "loss": 4.3652, "step": 2189 }, { "epoch": 0.37480746192024644, "grad_norm": 40.821720123291016, "learning_rate": 1.246434683399886e-05, "loss": 7.5982, "step": 2190 }, { "epoch": 0.3749786068800274, "grad_norm": 30.804649353027344, "learning_rate": 1.2470051340559041e-05, "loss": 3.6332, "step": 2191 }, { "epoch": 0.3751497518398083, "grad_norm": 28.10482406616211, "learning_rate": 1.2475755847119225e-05, "loss": 3.1805, "step": 2192 }, { "epoch": 0.37532089679958924, "grad_norm": 5.394840240478516, "learning_rate": 1.2481460353679406e-05, "loss": 0.6131, "step": 2193 }, { "epoch": 0.3754920417593702, "grad_norm": 22.42398452758789, "learning_rate": 1.2487164860239588e-05, "loss": 2.9095, "step": 2194 }, { "epoch": 0.3756631867191511, "grad_norm": 31.861984252929688, "learning_rate": 1.2492869366799773e-05, "loss": 3.7872, "step": 2195 }, { "epoch": 0.37583433167893204, "grad_norm": 30.0163631439209, "learning_rate": 1.2498573873359955e-05, "loss": 3.2556, "step": 2196 }, { "epoch": 0.376005476638713, "grad_norm": 43.01797103881836, "learning_rate": 1.2504278379920138e-05, "loss": 7.4534, "step": 2197 }, { "epoch": 0.3761766215984939, "grad_norm": 26.029483795166016, "learning_rate": 1.250998288648032e-05, "loss": 3.4138, "step": 2198 }, { "epoch": 0.37634776655827484, "grad_norm": 31.733152389526367, "learning_rate": 1.2515687393040503e-05, "loss": 3.7965, "step": 2199 }, { "epoch": 0.3765189115180558, 
"grad_norm": 29.86209487915039, "learning_rate": 1.2521391899600685e-05, "loss": 3.1073, "step": 2200 }, { "epoch": 0.3766900564778367, "grad_norm": 73.94261932373047, "learning_rate": 1.2527096406160867e-05, "loss": 6.7022, "step": 2201 }, { "epoch": 0.37686120143761764, "grad_norm": 33.266666412353516, "learning_rate": 1.253280091272105e-05, "loss": 3.467, "step": 2202 }, { "epoch": 0.3770323463973986, "grad_norm": 9.25309944152832, "learning_rate": 1.2538505419281232e-05, "loss": 0.9735, "step": 2203 }, { "epoch": 0.3772034913571795, "grad_norm": 32.7879753112793, "learning_rate": 1.2544209925841415e-05, "loss": 4.3873, "step": 2204 }, { "epoch": 0.37737463631696044, "grad_norm": 38.24089813232422, "learning_rate": 1.2549914432401597e-05, "loss": 4.1272, "step": 2205 }, { "epoch": 0.3775457812767414, "grad_norm": 11.10142707824707, "learning_rate": 1.2555618938961782e-05, "loss": 0.8028, "step": 2206 }, { "epoch": 0.3777169262365223, "grad_norm": 37.619815826416016, "learning_rate": 1.2561323445521963e-05, "loss": 3.8663, "step": 2207 }, { "epoch": 0.37788807119630324, "grad_norm": 43.338417053222656, "learning_rate": 1.2567027952082145e-05, "loss": 4.6084, "step": 2208 }, { "epoch": 0.3780592161560842, "grad_norm": 29.597476959228516, "learning_rate": 1.2572732458642329e-05, "loss": 4.3275, "step": 2209 }, { "epoch": 0.3782303611158651, "grad_norm": 111.00467681884766, "learning_rate": 1.257843696520251e-05, "loss": 6.2678, "step": 2210 }, { "epoch": 0.3784015060756461, "grad_norm": 28.328218460083008, "learning_rate": 1.2584141471762694e-05, "loss": 3.7021, "step": 2211 }, { "epoch": 0.37857265103542703, "grad_norm": 7.334059238433838, "learning_rate": 1.2589845978322875e-05, "loss": 1.0266, "step": 2212 }, { "epoch": 0.37874379599520797, "grad_norm": 12.333498001098633, "learning_rate": 1.2595550484883057e-05, "loss": 1.2937, "step": 2213 }, { "epoch": 0.3789149409549889, "grad_norm": 33.395259857177734, "learning_rate": 1.260125499144324e-05, "loss": 3.7564, 
"step": 2214 }, { "epoch": 0.37908608591476983, "grad_norm": 12.443466186523438, "learning_rate": 1.2606959498003422e-05, "loss": 1.1804, "step": 2215 }, { "epoch": 0.37925723087455077, "grad_norm": 29.2781982421875, "learning_rate": 1.2612664004563605e-05, "loss": 2.8089, "step": 2216 }, { "epoch": 0.3794283758343317, "grad_norm": 30.066843032836914, "learning_rate": 1.2618368511123787e-05, "loss": 3.678, "step": 2217 }, { "epoch": 0.37959952079411263, "grad_norm": 198.62889099121094, "learning_rate": 1.2624073017683972e-05, "loss": 10.5079, "step": 2218 }, { "epoch": 0.37977066575389357, "grad_norm": 36.29426574707031, "learning_rate": 1.2629777524244154e-05, "loss": 3.7594, "step": 2219 }, { "epoch": 0.3799418107136745, "grad_norm": 4.288938522338867, "learning_rate": 1.2635482030804336e-05, "loss": 0.6002, "step": 2220 }, { "epoch": 0.38011295567345543, "grad_norm": 16.282394409179688, "learning_rate": 1.2641186537364519e-05, "loss": 1.2398, "step": 2221 }, { "epoch": 0.38028410063323637, "grad_norm": 15.423003196716309, "learning_rate": 1.26468910439247e-05, "loss": 1.0447, "step": 2222 }, { "epoch": 0.3804552455930173, "grad_norm": 8.580951690673828, "learning_rate": 1.2652595550484884e-05, "loss": 1.0967, "step": 2223 }, { "epoch": 0.38062639055279823, "grad_norm": 23.481037139892578, "learning_rate": 1.2658300057045066e-05, "loss": 3.0215, "step": 2224 }, { "epoch": 0.38079753551257917, "grad_norm": 31.463350296020508, "learning_rate": 1.2664004563605247e-05, "loss": 3.9185, "step": 2225 }, { "epoch": 0.3809686804723601, "grad_norm": 33.95023727416992, "learning_rate": 1.266970907016543e-05, "loss": 4.4252, "step": 2226 }, { "epoch": 0.38113982543214103, "grad_norm": 32.201377868652344, "learning_rate": 1.2675413576725612e-05, "loss": 3.1638, "step": 2227 }, { "epoch": 0.38131097039192197, "grad_norm": 33.09391784667969, "learning_rate": 1.2681118083285796e-05, "loss": 4.5716, "step": 2228 }, { "epoch": 0.3814821153517029, "grad_norm": 89.28120422363281, 
"learning_rate": 1.2686822589845979e-05, "loss": 5.4798, "step": 2229 }, { "epoch": 0.38165326031148383, "grad_norm": 18.636362075805664, "learning_rate": 1.2692527096406163e-05, "loss": 1.3417, "step": 2230 }, { "epoch": 0.38182440527126477, "grad_norm": 108.82768249511719, "learning_rate": 1.2698231602966344e-05, "loss": 5.2101, "step": 2231 }, { "epoch": 0.3819955502310457, "grad_norm": 32.57135009765625, "learning_rate": 1.2703936109526526e-05, "loss": 4.3203, "step": 2232 }, { "epoch": 0.38216669519082663, "grad_norm": 33.27009963989258, "learning_rate": 1.270964061608671e-05, "loss": 3.9393, "step": 2233 }, { "epoch": 0.38233784015060757, "grad_norm": 16.50580406188965, "learning_rate": 1.2715345122646891e-05, "loss": 1.5263, "step": 2234 }, { "epoch": 0.3825089851103885, "grad_norm": 18.65876579284668, "learning_rate": 1.2721049629207074e-05, "loss": 1.948, "step": 2235 }, { "epoch": 0.38268013007016943, "grad_norm": 28.283248901367188, "learning_rate": 1.2726754135767256e-05, "loss": 2.8414, "step": 2236 }, { "epoch": 0.38285127502995037, "grad_norm": 118.61890411376953, "learning_rate": 1.273245864232744e-05, "loss": 7.9885, "step": 2237 }, { "epoch": 0.3830224199897313, "grad_norm": 16.00472640991211, "learning_rate": 1.2738163148887621e-05, "loss": 1.4454, "step": 2238 }, { "epoch": 0.38319356494951223, "grad_norm": 18.229719161987305, "learning_rate": 1.2743867655447803e-05, "loss": 1.5843, "step": 2239 }, { "epoch": 0.38336470990929317, "grad_norm": 26.571413040161133, "learning_rate": 1.2749572162007986e-05, "loss": 2.6752, "step": 2240 }, { "epoch": 0.3835358548690741, "grad_norm": 66.64990234375, "learning_rate": 1.275527666856817e-05, "loss": 5.1676, "step": 2241 }, { "epoch": 0.38370699982885503, "grad_norm": 19.84005355834961, "learning_rate": 1.2760981175128353e-05, "loss": 1.4974, "step": 2242 }, { "epoch": 0.38387814478863597, "grad_norm": 18.671689987182617, "learning_rate": 1.2766685681688535e-05, "loss": 1.9852, "step": 2243 }, { "epoch": 
0.3840492897484169, "grad_norm": 98.68587493896484, "learning_rate": 1.2772390188248716e-05, "loss": 4.9745, "step": 2244 }, { "epoch": 0.38422043470819783, "grad_norm": 6.933028221130371, "learning_rate": 1.27780946948089e-05, "loss": 0.6802, "step": 2245 }, { "epoch": 0.38439157966797877, "grad_norm": 18.11700439453125, "learning_rate": 1.2783799201369081e-05, "loss": 1.5278, "step": 2246 }, { "epoch": 0.3845627246277597, "grad_norm": 18.046253204345703, "learning_rate": 1.2789503707929265e-05, "loss": 2.1576, "step": 2247 }, { "epoch": 0.38473386958754063, "grad_norm": 44.326602935791016, "learning_rate": 1.2795208214489446e-05, "loss": 7.2696, "step": 2248 }, { "epoch": 0.38490501454732157, "grad_norm": 95.7645034790039, "learning_rate": 1.280091272104963e-05, "loss": 6.0354, "step": 2249 }, { "epoch": 0.3850761595071025, "grad_norm": 7.336085796356201, "learning_rate": 1.2806617227609811e-05, "loss": 1.6021, "step": 2250 }, { "epoch": 0.38524730446688343, "grad_norm": 21.488544464111328, "learning_rate": 1.2812321734169993e-05, "loss": 1.9826, "step": 2251 }, { "epoch": 0.38541844942666437, "grad_norm": 34.97186279296875, "learning_rate": 1.2818026240730178e-05, "loss": 4.3046, "step": 2252 }, { "epoch": 0.3855895943864453, "grad_norm": 4.4676008224487305, "learning_rate": 1.282373074729036e-05, "loss": 0.6622, "step": 2253 }, { "epoch": 0.38576073934622623, "grad_norm": 6.151776313781738, "learning_rate": 1.2829435253850543e-05, "loss": 0.6381, "step": 2254 }, { "epoch": 0.38593188430600717, "grad_norm": 6.36190938949585, "learning_rate": 1.2835139760410725e-05, "loss": 0.6743, "step": 2255 }, { "epoch": 0.3861030292657881, "grad_norm": 24.97540283203125, "learning_rate": 1.2840844266970908e-05, "loss": 2.6396, "step": 2256 }, { "epoch": 0.38627417422556903, "grad_norm": 141.4521026611328, "learning_rate": 1.284654877353109e-05, "loss": 4.7593, "step": 2257 }, { "epoch": 0.38644531918534997, "grad_norm": 16.442359924316406, "learning_rate": 
1.2852253280091272e-05, "loss": 1.3891, "step": 2258 }, { "epoch": 0.3866164641451309, "grad_norm": 26.524892807006836, "learning_rate": 1.2857957786651455e-05, "loss": 2.8351, "step": 2259 }, { "epoch": 0.38678760910491183, "grad_norm": 20.284482955932617, "learning_rate": 1.2863662293211637e-05, "loss": 2.2276, "step": 2260 }, { "epoch": 0.3869587540646928, "grad_norm": 13.217884063720703, "learning_rate": 1.286936679977182e-05, "loss": 0.9694, "step": 2261 }, { "epoch": 0.38712989902447376, "grad_norm": 33.80121612548828, "learning_rate": 1.2875071306332002e-05, "loss": 4.1736, "step": 2262 }, { "epoch": 0.3873010439842547, "grad_norm": 33.30670928955078, "learning_rate": 1.2880775812892185e-05, "loss": 3.5895, "step": 2263 }, { "epoch": 0.3874721889440356, "grad_norm": 24.27392578125, "learning_rate": 1.2886480319452369e-05, "loss": 2.6142, "step": 2264 }, { "epoch": 0.38764333390381656, "grad_norm": 4.387927055358887, "learning_rate": 1.289218482601255e-05, "loss": 0.5611, "step": 2265 }, { "epoch": 0.3878144788635975, "grad_norm": 11.723445892333984, "learning_rate": 1.2897889332572734e-05, "loss": 1.9691, "step": 2266 }, { "epoch": 0.3879856238233784, "grad_norm": 31.290142059326172, "learning_rate": 1.2903593839132915e-05, "loss": 4.4918, "step": 2267 }, { "epoch": 0.38815676878315936, "grad_norm": 29.301557540893555, "learning_rate": 1.2909298345693099e-05, "loss": 3.0265, "step": 2268 }, { "epoch": 0.3883279137429403, "grad_norm": 102.96603393554688, "learning_rate": 1.291500285225328e-05, "loss": 5.4491, "step": 2269 }, { "epoch": 0.3884990587027212, "grad_norm": 6.566992282867432, "learning_rate": 1.2920707358813462e-05, "loss": 0.6619, "step": 2270 }, { "epoch": 0.38867020366250216, "grad_norm": 20.63521385192871, "learning_rate": 1.2926411865373645e-05, "loss": 1.4227, "step": 2271 }, { "epoch": 0.3888413486222831, "grad_norm": 35.605445861816406, "learning_rate": 1.2932116371933827e-05, "loss": 3.0135, "step": 2272 }, { "epoch": 0.389012493582064, 
"grad_norm": 26.535554885864258, "learning_rate": 1.293782087849401e-05, "loss": 2.8951, "step": 2273 }, { "epoch": 0.38918363854184496, "grad_norm": 216.86865234375, "learning_rate": 1.2943525385054192e-05, "loss": 9.7157, "step": 2274 }, { "epoch": 0.3893547835016259, "grad_norm": 30.289108276367188, "learning_rate": 1.2949229891614376e-05, "loss": 3.7199, "step": 2275 }, { "epoch": 0.3895259284614068, "grad_norm": 70.54218292236328, "learning_rate": 1.2954934398174559e-05, "loss": 8.7961, "step": 2276 }, { "epoch": 0.38969707342118776, "grad_norm": 37.42404556274414, "learning_rate": 1.296063890473474e-05, "loss": 4.6408, "step": 2277 }, { "epoch": 0.3898682183809687, "grad_norm": 20.272388458251953, "learning_rate": 1.2966343411294924e-05, "loss": 2.2787, "step": 2278 }, { "epoch": 0.3900393633407496, "grad_norm": 21.717552185058594, "learning_rate": 1.2972047917855106e-05, "loss": 2.6312, "step": 2279 }, { "epoch": 0.39021050830053056, "grad_norm": 27.405563354492188, "learning_rate": 1.2977752424415289e-05, "loss": 2.922, "step": 2280 }, { "epoch": 0.3903816532603115, "grad_norm": 9.014309883117676, "learning_rate": 1.298345693097547e-05, "loss": 0.7394, "step": 2281 }, { "epoch": 0.3905527982200924, "grad_norm": 34.70540237426758, "learning_rate": 1.2989161437535652e-05, "loss": 4.5773, "step": 2282 }, { "epoch": 0.39072394317987336, "grad_norm": 17.615568161010742, "learning_rate": 1.2994865944095836e-05, "loss": 1.3793, "step": 2283 }, { "epoch": 0.3908950881396543, "grad_norm": 9.69536018371582, "learning_rate": 1.3000570450656018e-05, "loss": 1.4154, "step": 2284 }, { "epoch": 0.3910662330994352, "grad_norm": 42.174076080322266, "learning_rate": 1.3006274957216201e-05, "loss": 3.7989, "step": 2285 }, { "epoch": 0.39123737805921616, "grad_norm": 23.85903549194336, "learning_rate": 1.3011979463776384e-05, "loss": 2.7904, "step": 2286 }, { "epoch": 0.3914085230189971, "grad_norm": 12.722695350646973, "learning_rate": 1.3017683970336568e-05, "loss": 0.9675, 
"step": 2287 }, { "epoch": 0.391579667978778, "grad_norm": 29.10125160217285, "learning_rate": 1.302338847689675e-05, "loss": 3.3556, "step": 2288 }, { "epoch": 0.39175081293855896, "grad_norm": 28.335847854614258, "learning_rate": 1.3029092983456931e-05, "loss": 3.6913, "step": 2289 }, { "epoch": 0.3919219578983399, "grad_norm": 27.098371505737305, "learning_rate": 1.3034797490017114e-05, "loss": 3.4171, "step": 2290 }, { "epoch": 0.3920931028581208, "grad_norm": 24.61624526977539, "learning_rate": 1.3040501996577296e-05, "loss": 2.5601, "step": 2291 }, { "epoch": 0.39226424781790176, "grad_norm": 36.29865264892578, "learning_rate": 1.304620650313748e-05, "loss": 4.1205, "step": 2292 }, { "epoch": 0.3924353927776827, "grad_norm": null, "learning_rate": 1.304620650313748e-05, "loss": 10.2854, "step": 2293 }, { "epoch": 0.3926065377374636, "grad_norm": 43.4719352722168, "learning_rate": 1.3051911009697661e-05, "loss": 5.7968, "step": 2294 }, { "epoch": 0.39277768269724456, "grad_norm": 34.03304672241211, "learning_rate": 1.3057615516257843e-05, "loss": 3.2482, "step": 2295 }, { "epoch": 0.3929488276570255, "grad_norm": 28.92998695373535, "learning_rate": 1.3063320022818026e-05, "loss": 3.1048, "step": 2296 }, { "epoch": 0.3931199726168064, "grad_norm": 30.764570236206055, "learning_rate": 1.3069024529378208e-05, "loss": 3.4204, "step": 2297 }, { "epoch": 0.39329111757658736, "grad_norm": 30.185405731201172, "learning_rate": 1.3074729035938391e-05, "loss": 3.6159, "step": 2298 }, { "epoch": 0.3934622625363683, "grad_norm": 15.160475730895996, "learning_rate": 1.3080433542498575e-05, "loss": 1.0351, "step": 2299 }, { "epoch": 0.3936334074961492, "grad_norm": 30.460662841796875, "learning_rate": 1.3086138049058758e-05, "loss": 3.0676, "step": 2300 }, { "epoch": 0.39380455245593016, "grad_norm": 31.176111221313477, "learning_rate": 1.309184255561894e-05, "loss": 3.07, "step": 2301 }, { "epoch": 0.3939756974157111, "grad_norm": 40.287208557128906, "learning_rate":
1.3097547062179121e-05, "loss": 4.0389, "step": 2302 }, { "epoch": 0.394146842375492, "grad_norm": 31.603471755981445, "learning_rate": 1.3103251568739305e-05, "loss": 3.1424, "step": 2303 }, { "epoch": 0.39431798733527296, "grad_norm": 27.959386825561523, "learning_rate": 1.3108956075299486e-05, "loss": 2.7499, "step": 2304 }, { "epoch": 0.3944891322950539, "grad_norm": 9.44000244140625, "learning_rate": 1.311466058185967e-05, "loss": 0.6843, "step": 2305 }, { "epoch": 0.3946602772548348, "grad_norm": 31.026531219482422, "learning_rate": 1.3120365088419852e-05, "loss": 3.0951, "step": 2306 }, { "epoch": 0.39483142221461576, "grad_norm": 28.429651260375977, "learning_rate": 1.3126069594980035e-05, "loss": 3.1954, "step": 2307 }, { "epoch": 0.3950025671743967, "grad_norm": 36.807884216308594, "learning_rate": 1.3131774101540217e-05, "loss": 3.5692, "step": 2308 }, { "epoch": 0.3951737121341776, "grad_norm": 27.523998260498047, "learning_rate": 1.3137478608100398e-05, "loss": 3.04, "step": 2309 }, { "epoch": 0.39534485709395856, "grad_norm": 22.569734573364258, "learning_rate": 1.3143183114660583e-05, "loss": 2.6063, "step": 2310 }, { "epoch": 0.3955160020537395, "grad_norm": 30.23894691467285, "learning_rate": 1.3148887621220765e-05, "loss": 2.6877, "step": 2311 }, { "epoch": 0.3956871470135205, "grad_norm": 32.485286712646484, "learning_rate": 1.3154592127780948e-05, "loss": 4.2341, "step": 2312 }, { "epoch": 0.3958582919733014, "grad_norm": 9.512272834777832, "learning_rate": 1.316029663434113e-05, "loss": 0.7438, "step": 2313 }, { "epoch": 0.39602943693308235, "grad_norm": 30.39967918395996, "learning_rate": 1.3166001140901312e-05, "loss": 2.678, "step": 2314 }, { "epoch": 0.3962005818928633, "grad_norm": 26.347349166870117, "learning_rate": 1.3171705647461495e-05, "loss": 2.6173, "step": 2315 }, { "epoch": 0.3963717268526442, "grad_norm": 11.27676010131836, "learning_rate": 1.3177410154021677e-05, "loss": 0.9184, "step": 2316 }, { "epoch": 0.39654287181242515, 
"grad_norm": 28.942106246948242, "learning_rate": 1.318311466058186e-05, "loss": 2.7751, "step": 2317 }, { "epoch": 0.3967140167722061, "grad_norm": 155.31259155273438, "learning_rate": 1.3188819167142042e-05, "loss": 7.0884, "step": 2318 }, { "epoch": 0.396885161731987, "grad_norm": 15.048434257507324, "learning_rate": 1.3194523673702225e-05, "loss": 1.0894, "step": 2319 }, { "epoch": 0.39705630669176795, "grad_norm": 29.555904388427734, "learning_rate": 1.3200228180262407e-05, "loss": 3.0407, "step": 2320 }, { "epoch": 0.3972274516515489, "grad_norm": 22.9705810546875, "learning_rate": 1.3205932686822589e-05, "loss": 2.1862, "step": 2321 }, { "epoch": 0.3973985966113298, "grad_norm": 31.04474449157715, "learning_rate": 1.3211637193382774e-05, "loss": 3.5704, "step": 2322 }, { "epoch": 0.39756974157111075, "grad_norm": 38.25536346435547, "learning_rate": 1.3217341699942955e-05, "loss": 3.7726, "step": 2323 }, { "epoch": 0.3977408865308917, "grad_norm": 24.22712516784668, "learning_rate": 1.3223046206503139e-05, "loss": 2.9952, "step": 2324 }, { "epoch": 0.3979120314906726, "grad_norm": 32.82272720336914, "learning_rate": 1.322875071306332e-05, "loss": 3.313, "step": 2325 }, { "epoch": 0.39808317645045355, "grad_norm": 29.25124168395996, "learning_rate": 1.3234455219623502e-05, "loss": 2.9707, "step": 2326 }, { "epoch": 0.3982543214102345, "grad_norm": 42.494041442871094, "learning_rate": 1.3240159726183686e-05, "loss": 4.4698, "step": 2327 }, { "epoch": 0.3984254663700154, "grad_norm": 32.5220947265625, "learning_rate": 1.3245864232743867e-05, "loss": 3.7016, "step": 2328 }, { "epoch": 0.39859661132979635, "grad_norm": 4.652500629425049, "learning_rate": 1.325156873930405e-05, "loss": 0.5571, "step": 2329 }, { "epoch": 0.3987677562895773, "grad_norm": 136.5018768310547, "learning_rate": 1.3257273245864232e-05, "loss": 5.8926, "step": 2330 }, { "epoch": 0.3989389012493582, "grad_norm": 51.504051208496094, "learning_rate": 1.3262977752424416e-05, "loss": 5.6096, 
"step": 2331 }, { "epoch": 0.39911004620913915, "grad_norm": 18.578340530395508, "learning_rate": 1.3268682258984597e-05, "loss": 2.2877, "step": 2332 }, { "epoch": 0.3992811911689201, "grad_norm": 26.573881149291992, "learning_rate": 1.327438676554478e-05, "loss": 3.3675, "step": 2333 }, { "epoch": 0.399452336128701, "grad_norm": 28.39176368713379, "learning_rate": 1.3280091272104964e-05, "loss": 3.4595, "step": 2334 }, { "epoch": 0.39962348108848195, "grad_norm": 27.315298080444336, "learning_rate": 1.3285795778665146e-05, "loss": 3.7229, "step": 2335 }, { "epoch": 0.3997946260482629, "grad_norm": 34.018280029296875, "learning_rate": 1.329150028522533e-05, "loss": 3.7833, "step": 2336 }, { "epoch": 0.3999657710080438, "grad_norm": 35.161949157714844, "learning_rate": 1.3297204791785511e-05, "loss": 3.6256, "step": 2337 }, { "epoch": 0.40013691596782475, "grad_norm": 54.34180450439453, "learning_rate": 1.3302909298345694e-05, "loss": 8.2972, "step": 2338 }, { "epoch": 0.4003080609276057, "grad_norm": 37.41242980957031, "learning_rate": 1.3308613804905876e-05, "loss": 4.1554, "step": 2339 }, { "epoch": 0.4004792058873866, "grad_norm": 24.117671966552734, "learning_rate": 1.3314318311466058e-05, "loss": 2.6779, "step": 2340 }, { "epoch": 0.40065035084716755, "grad_norm": 75.865234375, "learning_rate": 1.3320022818026241e-05, "loss": 5.2769, "step": 2341 }, { "epoch": 0.4008214958069485, "grad_norm": 10.710969924926758, "learning_rate": 1.3325727324586423e-05, "loss": 0.8464, "step": 2342 }, { "epoch": 0.4009926407667294, "grad_norm": 32.42598342895508, "learning_rate": 1.3331431831146606e-05, "loss": 4.2176, "step": 2343 }, { "epoch": 0.40116378572651035, "grad_norm": 31.665010452270508, "learning_rate": 1.333713633770679e-05, "loss": 3.6966, "step": 2344 }, { "epoch": 0.4013349306862913, "grad_norm": 19.899494171142578, "learning_rate": 1.3342840844266971e-05, "loss": 1.5844, "step": 2345 }, { "epoch": 0.4015060756460722, "grad_norm": 26.72218132019043, 
"learning_rate": 1.3348545350827155e-05, "loss": 2.6391, "step": 2346 }, { "epoch": 0.40167722060585315, "grad_norm": 67.58808135986328, "learning_rate": 1.3354249857387336e-05, "loss": 4.8317, "step": 2347 }, { "epoch": 0.4018483655656341, "grad_norm": 35.947166442871094, "learning_rate": 1.335995436394752e-05, "loss": 4.4359, "step": 2348 }, { "epoch": 0.402019510525415, "grad_norm": 33.10310745239258, "learning_rate": 1.3365658870507701e-05, "loss": 4.2287, "step": 2349 }, { "epoch": 0.40219065548519595, "grad_norm": 26.962339401245117, "learning_rate": 1.3371363377067885e-05, "loss": 2.8973, "step": 2350 }, { "epoch": 0.4023618004449769, "grad_norm": 10.832432746887207, "learning_rate": 1.3377067883628066e-05, "loss": 1.1334, "step": 2351 }, { "epoch": 0.4025329454047578, "grad_norm": 3.8930623531341553, "learning_rate": 1.3382772390188248e-05, "loss": 0.5691, "step": 2352 }, { "epoch": 0.40270409036453875, "grad_norm": 30.617422103881836, "learning_rate": 1.3388476896748431e-05, "loss": 4.1985, "step": 2353 }, { "epoch": 0.4028752353243197, "grad_norm": 29.522432327270508, "learning_rate": 1.3394181403308613e-05, "loss": 3.3929, "step": 2354 }, { "epoch": 0.4030463802841006, "grad_norm": 29.46415901184082, "learning_rate": 1.3399885909868796e-05, "loss": 3.0491, "step": 2355 }, { "epoch": 0.40321752524388155, "grad_norm": 28.462308883666992, "learning_rate": 1.340559041642898e-05, "loss": 3.4012, "step": 2356 }, { "epoch": 0.4033886702036625, "grad_norm": 26.383548736572266, "learning_rate": 1.3411294922989163e-05, "loss": 2.8104, "step": 2357 }, { "epoch": 0.4035598151634434, "grad_norm": 5.228971004486084, "learning_rate": 1.3416999429549345e-05, "loss": 0.6984, "step": 2358 }, { "epoch": 0.40373096012322435, "grad_norm": 65.2656021118164, "learning_rate": 1.3422703936109527e-05, "loss": 4.842, "step": 2359 }, { "epoch": 0.4039021050830053, "grad_norm": 9.332406044006348, "learning_rate": 1.342840844266971e-05, "loss": 1.1261, "step": 2360 }, { "epoch": 
0.4040732500427862, "grad_norm": 19.021203994750977, "learning_rate": 1.3434112949229892e-05, "loss": 2.1595, "step": 2361 }, { "epoch": 0.40424439500256715, "grad_norm": 26.664148330688477, "learning_rate": 1.3439817455790075e-05, "loss": 3.1312, "step": 2362 }, { "epoch": 0.40441553996234814, "grad_norm": 14.464133262634277, "learning_rate": 1.3445521962350257e-05, "loss": 1.3528, "step": 2363 }, { "epoch": 0.40458668492212907, "grad_norm": 25.62092399597168, "learning_rate": 1.3451226468910438e-05, "loss": 2.6933, "step": 2364 }, { "epoch": 0.40475782988191, "grad_norm": 30.344446182250977, "learning_rate": 1.3456930975470622e-05, "loss": 3.5816, "step": 2365 }, { "epoch": 0.40492897484169094, "grad_norm": 34.0131950378418, "learning_rate": 1.3462635482030803e-05, "loss": 3.8169, "step": 2366 }, { "epoch": 0.40510011980147187, "grad_norm": 25.75012969970703, "learning_rate": 1.3468339988590989e-05, "loss": 2.9735, "step": 2367 }, { "epoch": 0.4052712647612528, "grad_norm": 31.590328216552734, "learning_rate": 1.347404449515117e-05, "loss": 3.4589, "step": 2368 }, { "epoch": 0.40544240972103374, "grad_norm": 24.881752014160156, "learning_rate": 1.3479749001711354e-05, "loss": 2.5041, "step": 2369 }, { "epoch": 0.40561355468081467, "grad_norm": 20.611392974853516, "learning_rate": 1.3485453508271535e-05, "loss": 2.0903, "step": 2370 }, { "epoch": 0.4057846996405956, "grad_norm": 35.99172592163086, "learning_rate": 1.3491158014831717e-05, "loss": 4.3005, "step": 2371 }, { "epoch": 0.40595584460037654, "grad_norm": 44.53636932373047, "learning_rate": 1.34968625213919e-05, "loss": 7.8911, "step": 2372 }, { "epoch": 0.40612698956015747, "grad_norm": 41.1456298828125, "learning_rate": 1.3502567027952082e-05, "loss": 7.4041, "step": 2373 }, { "epoch": 0.4062981345199384, "grad_norm": 24.72629737854004, "learning_rate": 1.3508271534512265e-05, "loss": 2.0511, "step": 2374 }, { "epoch": 0.40646927947971934, "grad_norm": 9.164275169372559, "learning_rate": 
1.3513976041072447e-05, "loss": 1.0127, "step": 2375 }, { "epoch": 0.40664042443950027, "grad_norm": 55.97251892089844, "learning_rate": 1.351968054763263e-05, "loss": 7.7883, "step": 2376 }, { "epoch": 0.4068115693992812, "grad_norm": 15.729819297790527, "learning_rate": 1.3525385054192812e-05, "loss": 1.3747, "step": 2377 }, { "epoch": 0.40698271435906214, "grad_norm": 31.85474967956543, "learning_rate": 1.3531089560752994e-05, "loss": 3.7341, "step": 2378 }, { "epoch": 0.40715385931884307, "grad_norm": 32.369163513183594, "learning_rate": 1.3536794067313179e-05, "loss": 3.4044, "step": 2379 }, { "epoch": 0.407325004278624, "grad_norm": 26.481473922729492, "learning_rate": 1.354249857387336e-05, "loss": 2.7264, "step": 2380 }, { "epoch": 0.40749614923840494, "grad_norm": 36.87574005126953, "learning_rate": 1.3548203080433544e-05, "loss": 7.1091, "step": 2381 }, { "epoch": 0.40766729419818587, "grad_norm": 34.68164825439453, "learning_rate": 1.3553907586993726e-05, "loss": 3.5182, "step": 2382 }, { "epoch": 0.4078384391579668, "grad_norm": 4.539041042327881, "learning_rate": 1.3559612093553907e-05, "loss": 0.5712, "step": 2383 }, { "epoch": 0.40800958411774774, "grad_norm": 18.264692306518555, "learning_rate": 1.356531660011409e-05, "loss": 1.6827, "step": 2384 }, { "epoch": 0.40818072907752867, "grad_norm": 58.49655532836914, "learning_rate": 1.3571021106674272e-05, "loss": 4.7984, "step": 2385 }, { "epoch": 0.4083518740373096, "grad_norm": 38.31999969482422, "learning_rate": 1.3576725613234456e-05, "loss": 4.9844, "step": 2386 }, { "epoch": 0.40852301899709054, "grad_norm": 31.779747009277344, "learning_rate": 1.3582430119794637e-05, "loss": 4.109, "step": 2387 }, { "epoch": 0.40869416395687147, "grad_norm": 28.318117141723633, "learning_rate": 1.358813462635482e-05, "loss": 3.1768, "step": 2388 }, { "epoch": 0.4088653089166524, "grad_norm": 109.76797485351562, "learning_rate": 1.3593839132915003e-05, "loss": 8.1629, "step": 2389 }, { "epoch": 
0.40903645387643334, "grad_norm": 29.490888595581055, "learning_rate": 1.3599543639475186e-05, "loss": 2.8556, "step": 2390 }, { "epoch": 0.40920759883621427, "grad_norm": 59.6926383972168, "learning_rate": 1.360524814603537e-05, "loss": 7.2838, "step": 2391 }, { "epoch": 0.4093787437959952, "grad_norm": 26.968727111816406, "learning_rate": 1.3610952652595551e-05, "loss": 3.1073, "step": 2392 }, { "epoch": 0.40954988875577614, "grad_norm": 14.444951057434082, "learning_rate": 1.3616657159155734e-05, "loss": 1.1151, "step": 2393 }, { "epoch": 0.40972103371555707, "grad_norm": 27.179691314697266, "learning_rate": 1.3622361665715916e-05, "loss": 3.4778, "step": 2394 }, { "epoch": 0.409892178675338, "grad_norm": 28.209474563598633, "learning_rate": 1.3628066172276098e-05, "loss": 3.1061, "step": 2395 }, { "epoch": 0.41006332363511894, "grad_norm": 28.115158081054688, "learning_rate": 1.3633770678836281e-05, "loss": 3.3303, "step": 2396 }, { "epoch": 0.41023446859489987, "grad_norm": 33.9571418762207, "learning_rate": 1.3639475185396463e-05, "loss": 3.476, "step": 2397 }, { "epoch": 0.4104056135546808, "grad_norm": 99.95455932617188, "learning_rate": 1.3645179691956646e-05, "loss": 4.891, "step": 2398 }, { "epoch": 0.41057675851446174, "grad_norm": 32.09910583496094, "learning_rate": 1.3650884198516828e-05, "loss": 4.5344, "step": 2399 }, { "epoch": 0.41074790347424267, "grad_norm": 22.752981185913086, "learning_rate": 1.3656588705077011e-05, "loss": 2.5455, "step": 2400 }, { "epoch": 0.4109190484340236, "grad_norm": 31.16071128845215, "learning_rate": 1.3662293211637193e-05, "loss": 3.5245, "step": 2401 }, { "epoch": 0.41109019339380454, "grad_norm": 16.054365158081055, "learning_rate": 1.3667997718197376e-05, "loss": 1.1714, "step": 2402 }, { "epoch": 0.41126133835358547, "grad_norm": 61.82563018798828, "learning_rate": 1.367370222475756e-05, "loss": 4.4332, "step": 2403 }, { "epoch": 0.4114324833133664, "grad_norm": 25.521482467651367, "learning_rate": 
1.3679406731317741e-05, "loss": 3.1523, "step": 2404 }, { "epoch": 0.41160362827314734, "grad_norm": 28.02633285522461, "learning_rate": 1.3685111237877925e-05, "loss": 3.3077, "step": 2405 }, { "epoch": 0.41177477323292827, "grad_norm": 31.012575149536133, "learning_rate": 1.3690815744438106e-05, "loss": 3.2338, "step": 2406 }, { "epoch": 0.4119459181927092, "grad_norm": 24.693798065185547, "learning_rate": 1.369652025099829e-05, "loss": 3.1159, "step": 2407 }, { "epoch": 0.41211706315249014, "grad_norm": 28.928600311279297, "learning_rate": 1.3702224757558471e-05, "loss": 3.231, "step": 2408 }, { "epoch": 0.41228820811227107, "grad_norm": 30.929235458374023, "learning_rate": 1.3707929264118653e-05, "loss": 3.7399, "step": 2409 }, { "epoch": 0.412459353072052, "grad_norm": 29.809967041015625, "learning_rate": 1.3713633770678837e-05, "loss": 2.8299, "step": 2410 }, { "epoch": 0.41263049803183294, "grad_norm": 34.67237091064453, "learning_rate": 1.3719338277239018e-05, "loss": 3.7692, "step": 2411 }, { "epoch": 0.41280164299161387, "grad_norm": 29.03022575378418, "learning_rate": 1.3725042783799202e-05, "loss": 3.1456, "step": 2412 }, { "epoch": 0.4129727879513948, "grad_norm": 27.838979721069336, "learning_rate": 1.3730747290359385e-05, "loss": 2.7944, "step": 2413 }, { "epoch": 0.4131439329111758, "grad_norm": 27.87117576599121, "learning_rate": 1.3736451796919567e-05, "loss": 2.9217, "step": 2414 }, { "epoch": 0.4133150778709567, "grad_norm": 34.504295349121094, "learning_rate": 1.374215630347975e-05, "loss": 3.9797, "step": 2415 }, { "epoch": 0.41348622283073766, "grad_norm": 21.570331573486328, "learning_rate": 1.3747860810039932e-05, "loss": 2.3607, "step": 2416 }, { "epoch": 0.4136573677905186, "grad_norm": 132.19834899902344, "learning_rate": 1.3753565316600115e-05, "loss": 7.9895, "step": 2417 }, { "epoch": 0.4138285127502995, "grad_norm": 29.5281925201416, "learning_rate": 1.3759269823160297e-05, "loss": 3.3183, "step": 2418 }, { "epoch": 
0.41399965771008046, "grad_norm": 7.409353256225586, "learning_rate": 1.376497432972048e-05, "loss": 0.6978, "step": 2419 }, { "epoch": 0.4141708026698614, "grad_norm": 23.6326847076416, "learning_rate": 1.3770678836280662e-05, "loss": 2.4085, "step": 2420 }, { "epoch": 0.4143419476296423, "grad_norm": 6.584427356719971, "learning_rate": 1.3776383342840844e-05, "loss": 0.7725, "step": 2421 }, { "epoch": 0.41451309258942326, "grad_norm": 5.124080181121826, "learning_rate": 1.3782087849401027e-05, "loss": 0.5998, "step": 2422 }, { "epoch": 0.4146842375492042, "grad_norm": 129.28781127929688, "learning_rate": 1.3787792355961209e-05, "loss": 5.2033, "step": 2423 }, { "epoch": 0.4148553825089851, "grad_norm": 30.348461151123047, "learning_rate": 1.3793496862521394e-05, "loss": 3.4905, "step": 2424 }, { "epoch": 0.41502652746876606, "grad_norm": 25.107507705688477, "learning_rate": 1.3799201369081575e-05, "loss": 2.4536, "step": 2425 }, { "epoch": 0.415197672428547, "grad_norm": 20.649410247802734, "learning_rate": 1.3804905875641757e-05, "loss": 2.11, "step": 2426 }, { "epoch": 0.4153688173883279, "grad_norm": 31.82566261291504, "learning_rate": 1.381061038220194e-05, "loss": 3.4624, "step": 2427 }, { "epoch": 0.41553996234810886, "grad_norm": 25.216468811035156, "learning_rate": 1.3816314888762122e-05, "loss": 2.7103, "step": 2428 }, { "epoch": 0.4157111073078898, "grad_norm": 62.44169616699219, "learning_rate": 1.3822019395322305e-05, "loss": 7.9435, "step": 2429 }, { "epoch": 0.4158822522676707, "grad_norm": 14.46311092376709, "learning_rate": 1.3827723901882487e-05, "loss": 1.3546, "step": 2430 }, { "epoch": 0.41605339722745166, "grad_norm": 21.584251403808594, "learning_rate": 1.383342840844267e-05, "loss": 2.3481, "step": 2431 }, { "epoch": 0.4162245421872326, "grad_norm": 28.41043472290039, "learning_rate": 1.3839132915002852e-05, "loss": 3.242, "step": 2432 }, { "epoch": 0.4163956871470135, "grad_norm": 57.48540496826172, "learning_rate": 1.3844837421563034e-05, 
"loss": 8.205, "step": 2433 }, { "epoch": 0.41656683210679446, "grad_norm": 20.560029983520508, "learning_rate": 1.3850541928123217e-05, "loss": 2.4152, "step": 2434 }, { "epoch": 0.4167379770665754, "grad_norm": 29.860027313232422, "learning_rate": 1.3856246434683399e-05, "loss": 2.9946, "step": 2435 }, { "epoch": 0.4169091220263563, "grad_norm": 34.29914855957031, "learning_rate": 1.3861950941243584e-05, "loss": 4.0154, "step": 2436 }, { "epoch": 0.41708026698613726, "grad_norm": 31.778980255126953, "learning_rate": 1.3867655447803766e-05, "loss": 3.279, "step": 2437 }, { "epoch": 0.4172514119459182, "grad_norm": 30.92992401123047, "learning_rate": 1.3873359954363949e-05, "loss": 3.7128, "step": 2438 }, { "epoch": 0.4174225569056991, "grad_norm": 30.067113876342773, "learning_rate": 1.387906446092413e-05, "loss": 3.3294, "step": 2439 }, { "epoch": 0.41759370186548006, "grad_norm": 78.94349670410156, "learning_rate": 1.3884768967484312e-05, "loss": 4.5151, "step": 2440 }, { "epoch": 0.417764846825261, "grad_norm": 35.60622787475586, "learning_rate": 1.3890473474044496e-05, "loss": 3.3575, "step": 2441 }, { "epoch": 0.4179359917850419, "grad_norm": 29.288429260253906, "learning_rate": 1.3896177980604678e-05, "loss": 3.0086, "step": 2442 }, { "epoch": 0.41810713674482286, "grad_norm": 29.106294631958008, "learning_rate": 1.3901882487164861e-05, "loss": 3.228, "step": 2443 }, { "epoch": 0.4182782817046038, "grad_norm": 20.533992767333984, "learning_rate": 1.3907586993725043e-05, "loss": 2.2425, "step": 2444 }, { "epoch": 0.4184494266643847, "grad_norm": 26.775163650512695, "learning_rate": 1.3913291500285224e-05, "loss": 2.6486, "step": 2445 }, { "epoch": 0.41862057162416566, "grad_norm": 23.887187957763672, "learning_rate": 1.3918996006845408e-05, "loss": 2.0254, "step": 2446 }, { "epoch": 0.4187917165839466, "grad_norm": 32.54766082763672, "learning_rate": 1.3924700513405591e-05, "loss": 4.0827, "step": 2447 }, { "epoch": 0.4189628615437275, "grad_norm": 
24.691999435424805, "learning_rate": 1.3930405019965774e-05, "loss": 2.7637, "step": 2448 }, { "epoch": 0.41913400650350846, "grad_norm": 36.446842193603516, "learning_rate": 1.3936109526525956e-05, "loss": 7.2209, "step": 2449 }, { "epoch": 0.4193051514632894, "grad_norm": 24.245582580566406, "learning_rate": 1.394181403308614e-05, "loss": 2.5936, "step": 2450 }, { "epoch": 0.4194762964230703, "grad_norm": 34.520198822021484, "learning_rate": 1.3947518539646321e-05, "loss": 3.7529, "step": 2451 }, { "epoch": 0.41964744138285126, "grad_norm": 34.79539489746094, "learning_rate": 1.3953223046206503e-05, "loss": 4.0982, "step": 2452 }, { "epoch": 0.4198185863426322, "grad_norm": 6.38947057723999, "learning_rate": 1.3958927552766686e-05, "loss": 0.5817, "step": 2453 }, { "epoch": 0.4199897313024131, "grad_norm": 35.33879852294922, "learning_rate": 1.3964632059326868e-05, "loss": 3.3258, "step": 2454 }, { "epoch": 0.42016087626219406, "grad_norm": 8.833622932434082, "learning_rate": 1.3970336565887051e-05, "loss": 1.1178, "step": 2455 }, { "epoch": 0.420332021221975, "grad_norm": 25.07313346862793, "learning_rate": 1.3976041072447233e-05, "loss": 2.7209, "step": 2456 }, { "epoch": 0.4205031661817559, "grad_norm": 30.224679946899414, "learning_rate": 1.3981745579007416e-05, "loss": 3.4578, "step": 2457 }, { "epoch": 0.42067431114153686, "grad_norm": 26.421674728393555, "learning_rate": 1.3987450085567598e-05, "loss": 2.6023, "step": 2458 }, { "epoch": 0.4208454561013178, "grad_norm": 33.97099685668945, "learning_rate": 1.3993154592127781e-05, "loss": 4.066, "step": 2459 }, { "epoch": 0.4210166010610987, "grad_norm": 28.07750701904297, "learning_rate": 1.3998859098687965e-05, "loss": 3.1308, "step": 2460 }, { "epoch": 0.42118774602087966, "grad_norm": 33.50989532470703, "learning_rate": 1.4004563605248146e-05, "loss": 3.461, "step": 2461 }, { "epoch": 0.4213588909806606, "grad_norm": 16.63654136657715, "learning_rate": 1.401026811180833e-05, "loss": 1.3419, "step": 2462 
}, { "epoch": 0.4215300359404415, "grad_norm": 16.42656707763672, "learning_rate": 1.4015972618368512e-05, "loss": 1.8682, "step": 2463 }, { "epoch": 0.42170118090022246, "grad_norm": 30.457616806030273, "learning_rate": 1.4021677124928693e-05, "loss": 3.1266, "step": 2464 }, { "epoch": 0.42187232586000345, "grad_norm": 20.4791202545166, "learning_rate": 1.4027381631488877e-05, "loss": 2.2995, "step": 2465 }, { "epoch": 0.4220434708197844, "grad_norm": 8.99075698852539, "learning_rate": 1.4033086138049058e-05, "loss": 1.6982, "step": 2466 }, { "epoch": 0.4222146157795653, "grad_norm": 57.64451599121094, "learning_rate": 1.4038790644609242e-05, "loss": 8.0313, "step": 2467 }, { "epoch": 0.42238576073934625, "grad_norm": 13.558103561401367, "learning_rate": 1.4044495151169423e-05, "loss": 1.2578, "step": 2468 }, { "epoch": 0.4225569056991272, "grad_norm": 21.366905212402344, "learning_rate": 1.4050199657729607e-05, "loss": 2.2006, "step": 2469 }, { "epoch": 0.4227280506589081, "grad_norm": 14.984084129333496, "learning_rate": 1.405590416428979e-05, "loss": 1.1421, "step": 2470 }, { "epoch": 0.42289919561868905, "grad_norm": 37.106781005859375, "learning_rate": 1.4061608670849972e-05, "loss": 3.448, "step": 2471 }, { "epoch": 0.42307034057847, "grad_norm": 103.56417083740234, "learning_rate": 1.4067313177410155e-05, "loss": 8.869, "step": 2472 }, { "epoch": 0.4232414855382509, "grad_norm": 34.15910339355469, "learning_rate": 1.4073017683970337e-05, "loss": 3.99, "step": 2473 }, { "epoch": 0.42341263049803185, "grad_norm": 9.371402740478516, "learning_rate": 1.407872219053052e-05, "loss": 1.7126, "step": 2474 }, { "epoch": 0.4235837754578128, "grad_norm": 5.610677719116211, "learning_rate": 1.4084426697090702e-05, "loss": 0.6139, "step": 2475 }, { "epoch": 0.4237549204175937, "grad_norm": 25.18387222290039, "learning_rate": 1.4090131203650885e-05, "loss": 3.0951, "step": 2476 }, { "epoch": 0.42392606537737465, "grad_norm": 21.81611442565918, "learning_rate": 
1.4095835710211067e-05, "loss": 2.1739, "step": 2477 }, { "epoch": 0.4240972103371556, "grad_norm": 37.39387893676758, "learning_rate": 1.4101540216771249e-05, "loss": 4.5252, "step": 2478 }, { "epoch": 0.4242683552969365, "grad_norm": 5.823449611663818, "learning_rate": 1.4107244723331432e-05, "loss": 0.6128, "step": 2479 }, { "epoch": 0.42443950025671745, "grad_norm": 31.69689178466797, "learning_rate": 1.4112949229891614e-05, "loss": 2.9986, "step": 2480 }, { "epoch": 0.4246106452164984, "grad_norm": 35.987152099609375, "learning_rate": 1.4118653736451797e-05, "loss": 3.4619, "step": 2481 }, { "epoch": 0.4247817901762793, "grad_norm": 16.255069732666016, "learning_rate": 1.412435824301198e-05, "loss": 1.2687, "step": 2482 }, { "epoch": 0.42495293513606025, "grad_norm": 38.878501892089844, "learning_rate": 1.4130062749572162e-05, "loss": 4.4326, "step": 2483 }, { "epoch": 0.4251240800958412, "grad_norm": 33.7603759765625, "learning_rate": 1.4135767256132346e-05, "loss": 3.6103, "step": 2484 }, { "epoch": 0.4252952250556221, "grad_norm": 30.415058135986328, "learning_rate": 1.4141471762692527e-05, "loss": 2.9553, "step": 2485 }, { "epoch": 0.42546637001540305, "grad_norm": 18.42668914794922, "learning_rate": 1.414717626925271e-05, "loss": 1.6697, "step": 2486 }, { "epoch": 0.425637514975184, "grad_norm": 28.910137176513672, "learning_rate": 1.4152880775812892e-05, "loss": 2.7775, "step": 2487 }, { "epoch": 0.4258086599349649, "grad_norm": 24.642555236816406, "learning_rate": 1.4158585282373076e-05, "loss": 2.6428, "step": 2488 }, { "epoch": 0.42597980489474585, "grad_norm": 6.298614501953125, "learning_rate": 1.4164289788933257e-05, "loss": 0.5965, "step": 2489 }, { "epoch": 0.4261509498545268, "grad_norm": 134.2201690673828, "learning_rate": 1.4169994295493439e-05, "loss": 5.7153, "step": 2490 }, { "epoch": 0.4263220948143077, "grad_norm": 29.22636604309082, "learning_rate": 1.4175698802053622e-05, "loss": 3.1392, "step": 2491 }, { "epoch": 0.42649323977408865, 
"grad_norm": 29.945589065551758, "learning_rate": 1.4181403308613804e-05, "loss": 2.9141, "step": 2492 }, { "epoch": 0.4266643847338696, "grad_norm": 122.40117645263672, "learning_rate": 1.418710781517399e-05, "loss": 9.1376, "step": 2493 }, { "epoch": 0.4268355296936505, "grad_norm": 12.278093338012695, "learning_rate": 1.4192812321734171e-05, "loss": 0.8576, "step": 2494 }, { "epoch": 0.42700667465343145, "grad_norm": 29.33226776123047, "learning_rate": 1.4198516828294353e-05, "loss": 2.9443, "step": 2495 }, { "epoch": 0.4271778196132124, "grad_norm": 31.89412498474121, "learning_rate": 1.4204221334854536e-05, "loss": 3.7789, "step": 2496 }, { "epoch": 0.4273489645729933, "grad_norm": 30.404138565063477, "learning_rate": 1.4209925841414718e-05, "loss": 4.0024, "step": 2497 }, { "epoch": 0.42752010953277425, "grad_norm": 7.538527488708496, "learning_rate": 1.4215630347974901e-05, "loss": 0.6812, "step": 2498 }, { "epoch": 0.4276912544925552, "grad_norm": 194.7794952392578, "learning_rate": 1.4221334854535083e-05, "loss": 10.7557, "step": 2499 }, { "epoch": 0.4278623994523361, "grad_norm": 27.38447380065918, "learning_rate": 1.4227039361095266e-05, "loss": 3.0669, "step": 2500 }, { "epoch": 0.42803354441211705, "grad_norm": 36.52588653564453, "learning_rate": 1.4232743867655448e-05, "loss": 3.7922, "step": 2501 }, { "epoch": 0.428204689371898, "grad_norm": 14.776211738586426, "learning_rate": 1.423844837421563e-05, "loss": 1.0695, "step": 2502 }, { "epoch": 0.4283758343316789, "grad_norm": 22.516334533691406, "learning_rate": 1.4244152880775813e-05, "loss": 2.175, "step": 2503 }, { "epoch": 0.42854697929145985, "grad_norm": 31.414302825927734, "learning_rate": 1.4249857387335995e-05, "loss": 3.5488, "step": 2504 }, { "epoch": 0.4287181242512408, "grad_norm": 20.823116302490234, "learning_rate": 1.425556189389618e-05, "loss": 2.0128, "step": 2505 }, { "epoch": 0.4288892692110217, "grad_norm": 33.47979736328125, "learning_rate": 1.4261266400456361e-05, "loss": 
3.6721, "step": 2506 }, { "epoch": 0.42906041417080265, "grad_norm": 33.771358489990234, "learning_rate": 1.4266970907016545e-05, "loss": 4.2083, "step": 2507 }, { "epoch": 0.4292315591305836, "grad_norm": 21.674623489379883, "learning_rate": 1.4272675413576726e-05, "loss": 1.8789, "step": 2508 }, { "epoch": 0.4294027040903645, "grad_norm": 31.44987678527832, "learning_rate": 1.4278379920136908e-05, "loss": 3.6812, "step": 2509 }, { "epoch": 0.42957384905014545, "grad_norm": 9.912192344665527, "learning_rate": 1.4284084426697091e-05, "loss": 1.2073, "step": 2510 }, { "epoch": 0.4297449940099264, "grad_norm": 26.342119216918945, "learning_rate": 1.4289788933257273e-05, "loss": 2.4193, "step": 2511 }, { "epoch": 0.4299161389697073, "grad_norm": 57.646331787109375, "learning_rate": 1.4295493439817456e-05, "loss": 8.0588, "step": 2512 }, { "epoch": 0.43008728392948825, "grad_norm": 25.247426986694336, "learning_rate": 1.4301197946377638e-05, "loss": 2.5184, "step": 2513 }, { "epoch": 0.4302584288892692, "grad_norm": 21.471519470214844, "learning_rate": 1.430690245293782e-05, "loss": 2.2334, "step": 2514 }, { "epoch": 0.4304295738490501, "grad_norm": 25.605525970458984, "learning_rate": 1.4312606959498003e-05, "loss": 2.584, "step": 2515 }, { "epoch": 0.4306007188088311, "grad_norm": 34.87372589111328, "learning_rate": 1.4318311466058187e-05, "loss": 3.7447, "step": 2516 }, { "epoch": 0.43077186376861204, "grad_norm": 28.899642944335938, "learning_rate": 1.432401597261837e-05, "loss": 2.9852, "step": 2517 }, { "epoch": 0.430943008728393, "grad_norm": 24.084014892578125, "learning_rate": 1.4329720479178552e-05, "loss": 2.5703, "step": 2518 }, { "epoch": 0.4311141536881739, "grad_norm": 13.15533447265625, "learning_rate": 1.4335424985738735e-05, "loss": 0.8629, "step": 2519 }, { "epoch": 0.43128529864795484, "grad_norm": 100.28350067138672, "learning_rate": 1.4341129492298917e-05, "loss": 5.5365, "step": 2520 }, { "epoch": 0.4314564436077358, "grad_norm": 
25.63288116455078, "learning_rate": 1.4346833998859098e-05, "loss": 2.2759, "step": 2521 }, { "epoch": 0.4316275885675167, "grad_norm": 4.588881969451904, "learning_rate": 1.4352538505419282e-05, "loss": 0.5143, "step": 2522 }, { "epoch": 0.43179873352729764, "grad_norm": 31.304664611816406, "learning_rate": 1.4358243011979463e-05, "loss": 4.1463, "step": 2523 }, { "epoch": 0.4319698784870786, "grad_norm": 18.030874252319336, "learning_rate": 1.4363947518539647e-05, "loss": 1.8934, "step": 2524 }, { "epoch": 0.4321410234468595, "grad_norm": 29.338178634643555, "learning_rate": 1.4369652025099829e-05, "loss": 3.3346, "step": 2525 }, { "epoch": 0.43231216840664044, "grad_norm": 29.54951286315918, "learning_rate": 1.4375356531660012e-05, "loss": 2.695, "step": 2526 }, { "epoch": 0.4324833133664214, "grad_norm": 17.5317325592041, "learning_rate": 1.4381061038220195e-05, "loss": 1.4575, "step": 2527 }, { "epoch": 0.4326544583262023, "grad_norm": 32.96657943725586, "learning_rate": 1.4386765544780377e-05, "loss": 3.592, "step": 2528 }, { "epoch": 0.43282560328598324, "grad_norm": 19.32137107849121, "learning_rate": 1.439247005134056e-05, "loss": 1.2429, "step": 2529 }, { "epoch": 0.4329967482457642, "grad_norm": 8.846491813659668, "learning_rate": 1.4398174557900742e-05, "loss": 1.021, "step": 2530 }, { "epoch": 0.4331678932055451, "grad_norm": 4.180466651916504, "learning_rate": 1.4403879064460925e-05, "loss": 0.5108, "step": 2531 }, { "epoch": 0.43333903816532604, "grad_norm": 28.7572078704834, "learning_rate": 1.4409583571021107e-05, "loss": 2.694, "step": 2532 }, { "epoch": 0.433510183125107, "grad_norm": 34.48351287841797, "learning_rate": 1.4415288077581289e-05, "loss": 3.7674, "step": 2533 }, { "epoch": 0.4336813280848879, "grad_norm": 27.524559020996094, "learning_rate": 1.4420992584141472e-05, "loss": 2.768, "step": 2534 }, { "epoch": 0.43385247304466884, "grad_norm": 33.70855712890625, "learning_rate": 1.4426697090701654e-05, "loss": 3.1902, "step": 2535 }, { 
"epoch": 0.4340236180044498, "grad_norm": 30.53034210205078, "learning_rate": 1.4432401597261837e-05, "loss": 3.4593, "step": 2536 }, { "epoch": 0.4341947629642307, "grad_norm": 30.834991455078125, "learning_rate": 1.4438106103822019e-05, "loss": 3.439, "step": 2537 }, { "epoch": 0.43436590792401164, "grad_norm": 31.815725326538086, "learning_rate": 1.4443810610382202e-05, "loss": 3.683, "step": 2538 }, { "epoch": 0.4345370528837926, "grad_norm": 29.159996032714844, "learning_rate": 1.4449515116942386e-05, "loss": 2.9478, "step": 2539 }, { "epoch": 0.4347081978435735, "grad_norm": 81.45700073242188, "learning_rate": 1.4455219623502567e-05, "loss": 9.0416, "step": 2540 }, { "epoch": 0.43487934280335444, "grad_norm": 87.70926666259766, "learning_rate": 1.446092413006275e-05, "loss": 4.7629, "step": 2541 }, { "epoch": 0.4350504877631354, "grad_norm": 9.934538841247559, "learning_rate": 1.4466628636622932e-05, "loss": 1.8243, "step": 2542 }, { "epoch": 0.4352216327229163, "grad_norm": 9.613969802856445, "learning_rate": 1.4472333143183116e-05, "loss": 1.031, "step": 2543 }, { "epoch": 0.43539277768269724, "grad_norm": 45.231689453125, "learning_rate": 1.4478037649743297e-05, "loss": 7.5705, "step": 2544 }, { "epoch": 0.4355639226424782, "grad_norm": 9.317858695983887, "learning_rate": 1.4483742156303479e-05, "loss": 0.6934, "step": 2545 }, { "epoch": 0.4357350676022591, "grad_norm": 35.789794921875, "learning_rate": 1.4489446662863663e-05, "loss": 4.1345, "step": 2546 }, { "epoch": 0.43590621256204004, "grad_norm": 11.596151351928711, "learning_rate": 1.4495151169423844e-05, "loss": 1.3557, "step": 2547 }, { "epoch": 0.436077357521821, "grad_norm": 4.43747091293335, "learning_rate": 1.4500855675984028e-05, "loss": 0.4991, "step": 2548 }, { "epoch": 0.4362485024816019, "grad_norm": 29.71784019470215, "learning_rate": 1.450656018254421e-05, "loss": 3.192, "step": 2549 }, { "epoch": 0.43641964744138284, "grad_norm": 44.21783447265625, "learning_rate": 
1.4512264689104394e-05, "loss": 3.6461, "step": 2550 }, { "epoch": 0.4365907924011638, "grad_norm": 27.61203384399414, "learning_rate": 1.4517969195664576e-05, "loss": 3.3135, "step": 2551 }, { "epoch": 0.4367619373609447, "grad_norm": 23.84665298461914, "learning_rate": 1.4523673702224758e-05, "loss": 2.5268, "step": 2552 }, { "epoch": 0.43693308232072564, "grad_norm": 29.368938446044922, "learning_rate": 1.4529378208784941e-05, "loss": 2.7211, "step": 2553 }, { "epoch": 0.4371042272805066, "grad_norm": 36.08073806762695, "learning_rate": 1.4535082715345123e-05, "loss": 3.7964, "step": 2554 }, { "epoch": 0.4372753722402875, "grad_norm": 32.68186950683594, "learning_rate": 1.4540787221905306e-05, "loss": 3.6173, "step": 2555 }, { "epoch": 0.43744651720006844, "grad_norm": 34.985904693603516, "learning_rate": 1.4546491728465488e-05, "loss": 4.2656, "step": 2556 }, { "epoch": 0.4376176621598494, "grad_norm": 129.27252197265625, "learning_rate": 1.4552196235025671e-05, "loss": 8.7164, "step": 2557 }, { "epoch": 0.4377888071196303, "grad_norm": 29.99295997619629, "learning_rate": 1.4557900741585853e-05, "loss": 3.6185, "step": 2558 }, { "epoch": 0.43795995207941124, "grad_norm": 28.371896743774414, "learning_rate": 1.4563605248146035e-05, "loss": 2.9256, "step": 2559 }, { "epoch": 0.4381310970391922, "grad_norm": 8.728231430053711, "learning_rate": 1.4569309754706218e-05, "loss": 0.9915, "step": 2560 }, { "epoch": 0.4383022419989731, "grad_norm": 31.164567947387695, "learning_rate": 1.45750142612664e-05, "loss": 3.8704, "step": 2561 }, { "epoch": 0.43847338695875404, "grad_norm": 32.18178176879883, "learning_rate": 1.4580718767826585e-05, "loss": 4.2259, "step": 2562 }, { "epoch": 0.438644531918535, "grad_norm": 25.499011993408203, "learning_rate": 1.4586423274386766e-05, "loss": 2.6854, "step": 2563 }, { "epoch": 0.4388156768783159, "grad_norm": 34.26057815551758, "learning_rate": 1.4592127780946948e-05, "loss": 3.4689, "step": 2564 }, { "epoch": 0.43898682183809684, 
"grad_norm": 33.73667526245117, "learning_rate": 1.4597832287507131e-05, "loss": 4.3474, "step": 2565 }, { "epoch": 0.43915796679787783, "grad_norm": 32.83565902709961, "learning_rate": 1.4603536794067313e-05, "loss": 3.475, "step": 2566 }, { "epoch": 0.43932911175765876, "grad_norm": 3.187453269958496, "learning_rate": 1.4609241300627497e-05, "loss": 0.4736, "step": 2567 }, { "epoch": 0.4395002567174397, "grad_norm": 19.98860740661621, "learning_rate": 1.4614945807187678e-05, "loss": 2.0086, "step": 2568 }, { "epoch": 0.43967140167722063, "grad_norm": 27.594697952270508, "learning_rate": 1.4620650313747862e-05, "loss": 3.2803, "step": 2569 }, { "epoch": 0.43984254663700156, "grad_norm": 3.9966156482696533, "learning_rate": 1.4626354820308043e-05, "loss": 0.568, "step": 2570 }, { "epoch": 0.4400136915967825, "grad_norm": 5.779835224151611, "learning_rate": 1.4632059326868225e-05, "loss": 0.648, "step": 2571 }, { "epoch": 0.44018483655656343, "grad_norm": 39.23750305175781, "learning_rate": 1.4637763833428408e-05, "loss": 7.0941, "step": 2572 }, { "epoch": 0.44035598151634436, "grad_norm": 28.068208694458008, "learning_rate": 1.4643468339988592e-05, "loss": 3.0381, "step": 2573 }, { "epoch": 0.4405271264761253, "grad_norm": 25.783096313476562, "learning_rate": 1.4649172846548775e-05, "loss": 3.0511, "step": 2574 }, { "epoch": 0.44069827143590623, "grad_norm": 29.101238250732422, "learning_rate": 1.4654877353108957e-05, "loss": 2.9123, "step": 2575 }, { "epoch": 0.44086941639568716, "grad_norm": 14.171677589416504, "learning_rate": 1.466058185966914e-05, "loss": 1.0138, "step": 2576 }, { "epoch": 0.4410405613554681, "grad_norm": 27.117347717285156, "learning_rate": 1.4666286366229322e-05, "loss": 3.1994, "step": 2577 }, { "epoch": 0.44121170631524903, "grad_norm": 29.480358123779297, "learning_rate": 1.4671990872789504e-05, "loss": 3.4766, "step": 2578 }, { "epoch": 0.44138285127502996, "grad_norm": 4.977560997009277, "learning_rate": 1.4677695379349687e-05, "loss": 
0.7032, "step": 2579 }, { "epoch": 0.4415539962348109, "grad_norm": 31.941097259521484, "learning_rate": 1.4683399885909869e-05, "loss": 2.931, "step": 2580 }, { "epoch": 0.44172514119459183, "grad_norm": 136.83563232421875, "learning_rate": 1.4689104392470052e-05, "loss": 5.0846, "step": 2581 }, { "epoch": 0.44189628615437276, "grad_norm": 9.305535316467285, "learning_rate": 1.4694808899030234e-05, "loss": 0.911, "step": 2582 }, { "epoch": 0.4420674311141537, "grad_norm": 18.890281677246094, "learning_rate": 1.4700513405590415e-05, "loss": 1.4747, "step": 2583 }, { "epoch": 0.44223857607393463, "grad_norm": 46.04558563232422, "learning_rate": 1.4706217912150599e-05, "loss": 7.673, "step": 2584 }, { "epoch": 0.44240972103371556, "grad_norm": 24.37186050415039, "learning_rate": 1.4711922418710782e-05, "loss": 2.3299, "step": 2585 }, { "epoch": 0.4425808659934965, "grad_norm": 38.21072006225586, "learning_rate": 1.4717626925270965e-05, "loss": 7.1275, "step": 2586 }, { "epoch": 0.44275201095327743, "grad_norm": 77.47330474853516, "learning_rate": 1.4723331431831147e-05, "loss": 4.7878, "step": 2587 }, { "epoch": 0.44292315591305836, "grad_norm": 37.18149185180664, "learning_rate": 1.472903593839133e-05, "loss": 3.7571, "step": 2588 }, { "epoch": 0.4430943008728393, "grad_norm": 3.5262255668640137, "learning_rate": 1.4734740444951512e-05, "loss": 0.5224, "step": 2589 }, { "epoch": 0.44326544583262023, "grad_norm": 11.645423889160156, "learning_rate": 1.4740444951511694e-05, "loss": 1.0846, "step": 2590 }, { "epoch": 0.44343659079240116, "grad_norm": 6.892613410949707, "learning_rate": 1.4746149458071877e-05, "loss": 0.54, "step": 2591 }, { "epoch": 0.4436077357521821, "grad_norm": 8.752089500427246, "learning_rate": 1.4751853964632059e-05, "loss": 0.6267, "step": 2592 }, { "epoch": 0.44377888071196303, "grad_norm": 23.974550247192383, "learning_rate": 1.4757558471192242e-05, "loss": 2.6998, "step": 2593 }, { "epoch": 0.44395002567174396, "grad_norm": 
7.374299049377441, "learning_rate": 1.4763262977752424e-05, "loss": 0.8349, "step": 2594 }, { "epoch": 0.4441211706315249, "grad_norm": 23.21881103515625, "learning_rate": 1.4768967484312606e-05, "loss": 2.7586, "step": 2595 }, { "epoch": 0.44429231559130583, "grad_norm": 170.6956024169922, "learning_rate": 1.477467199087279e-05, "loss": 9.1929, "step": 2596 }, { "epoch": 0.44446346055108676, "grad_norm": 20.11836814880371, "learning_rate": 1.4780376497432972e-05, "loss": 2.3475, "step": 2597 }, { "epoch": 0.4446346055108677, "grad_norm": 24.03493881225586, "learning_rate": 1.4786081003993156e-05, "loss": 2.9464, "step": 2598 }, { "epoch": 0.44480575047064863, "grad_norm": 27.76041603088379, "learning_rate": 1.4791785510553338e-05, "loss": 2.7217, "step": 2599 }, { "epoch": 0.44497689543042956, "grad_norm": 17.792516708374023, "learning_rate": 1.4797490017113521e-05, "loss": 1.6209, "step": 2600 }, { "epoch": 0.4451480403902105, "grad_norm": 34.788169860839844, "learning_rate": 1.4803194523673703e-05, "loss": 2.8761, "step": 2601 }, { "epoch": 0.44531918534999143, "grad_norm": 18.824007034301758, "learning_rate": 1.4808899030233884e-05, "loss": 2.5789, "step": 2602 }, { "epoch": 0.44549033030977236, "grad_norm": 19.51264190673828, "learning_rate": 1.4814603536794068e-05, "loss": 2.2163, "step": 2603 }, { "epoch": 0.4456614752695533, "grad_norm": 31.428625106811523, "learning_rate": 1.482030804335425e-05, "loss": 2.9496, "step": 2604 }, { "epoch": 0.44583262022933423, "grad_norm": 13.012333869934082, "learning_rate": 1.4826012549914433e-05, "loss": 0.8159, "step": 2605 }, { "epoch": 0.44600376518911516, "grad_norm": 23.477638244628906, "learning_rate": 1.4831717056474614e-05, "loss": 2.4937, "step": 2606 }, { "epoch": 0.4461749101488961, "grad_norm": 35.8111572265625, "learning_rate": 1.48374215630348e-05, "loss": 3.2833, "step": 2607 }, { "epoch": 0.44634605510867703, "grad_norm": 32.99673080444336, "learning_rate": 1.4843126069594981e-05, "loss": 3.5874, "step": 
2608 }, { "epoch": 0.44651720006845796, "grad_norm": 3.853698253631592, "learning_rate": 1.4848830576155163e-05, "loss": 0.4709, "step": 2609 }, { "epoch": 0.4466883450282389, "grad_norm": 27.9306583404541, "learning_rate": 1.4854535082715346e-05, "loss": 2.6247, "step": 2610 }, { "epoch": 0.44685948998801983, "grad_norm": 11.854992866516113, "learning_rate": 1.4860239589275528e-05, "loss": 0.9967, "step": 2611 }, { "epoch": 0.44703063494780076, "grad_norm": 49.759117126464844, "learning_rate": 1.4865944095835711e-05, "loss": 7.1995, "step": 2612 }, { "epoch": 0.4472017799075817, "grad_norm": 31.380281448364258, "learning_rate": 1.4871648602395893e-05, "loss": 2.7301, "step": 2613 }, { "epoch": 0.44737292486736263, "grad_norm": 29.84979820251465, "learning_rate": 1.4877353108956075e-05, "loss": 3.1099, "step": 2614 }, { "epoch": 0.44754406982714356, "grad_norm": 13.841278076171875, "learning_rate": 1.4883057615516258e-05, "loss": 1.0569, "step": 2615 }, { "epoch": 0.4477152147869245, "grad_norm": 28.414051055908203, "learning_rate": 1.488876212207644e-05, "loss": 2.6359, "step": 2616 }, { "epoch": 0.4478863597467055, "grad_norm": 29.42824363708496, "learning_rate": 1.4894466628636623e-05, "loss": 3.3203, "step": 2617 }, { "epoch": 0.4480575047064864, "grad_norm": 33.065799713134766, "learning_rate": 1.4900171135196805e-05, "loss": 4.0124, "step": 2618 }, { "epoch": 0.44822864966626735, "grad_norm": 9.898391723632812, "learning_rate": 1.490587564175699e-05, "loss": 0.9107, "step": 2619 }, { "epoch": 0.4483997946260483, "grad_norm": 23.923398971557617, "learning_rate": 1.4911580148317172e-05, "loss": 2.5028, "step": 2620 }, { "epoch": 0.4485709395858292, "grad_norm": 25.825178146362305, "learning_rate": 1.4917284654877353e-05, "loss": 2.3647, "step": 2621 }, { "epoch": 0.44874208454561015, "grad_norm": 24.46117401123047, "learning_rate": 1.4922989161437537e-05, "loss": 2.66, "step": 2622 }, { "epoch": 0.4489132295053911, "grad_norm": 19.926624298095703, 
"learning_rate": 1.4928693667997718e-05, "loss": 1.771, "step": 2623 }, { "epoch": 0.449084374465172, "grad_norm": 107.68805694580078, "learning_rate": 1.4934398174557902e-05, "loss": 9.219, "step": 2624 }, { "epoch": 0.44925551942495295, "grad_norm": 18.121204376220703, "learning_rate": 1.4940102681118083e-05, "loss": 1.3726, "step": 2625 }, { "epoch": 0.4494266643847339, "grad_norm": 27.648178100585938, "learning_rate": 1.4945807187678267e-05, "loss": 2.6469, "step": 2626 }, { "epoch": 0.4495978093445148, "grad_norm": 28.146556854248047, "learning_rate": 1.4951511694238448e-05, "loss": 2.8926, "step": 2627 }, { "epoch": 0.44976895430429575, "grad_norm": 52.536190032958984, "learning_rate": 1.495721620079863e-05, "loss": 7.5002, "step": 2628 }, { "epoch": 0.4499400992640767, "grad_norm": 24.027881622314453, "learning_rate": 1.4962920707358814e-05, "loss": 2.3452, "step": 2629 }, { "epoch": 0.4501112442238576, "grad_norm": 34.977684020996094, "learning_rate": 1.4968625213918997e-05, "loss": 3.9508, "step": 2630 }, { "epoch": 0.45028238918363855, "grad_norm": 30.991193771362305, "learning_rate": 1.497432972047918e-05, "loss": 3.6064, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_nli-pairs_loss": 2.871744394302368, "eval_nli-pairs_runtime": 4.2947, "eval_nli-pairs_samples_per_second": 46.569, "eval_nli-pairs_steps_per_second": 1.63, "eval_sts-test_pearson_cosine": 0.7195428557259504, "eval_sts-test_pearson_dot": 0.6098064793689061, "eval_sts-test_pearson_euclidean": 0.7205423612792191, "eval_sts-test_pearson_manhattan": 0.7293110123887395, "eval_sts-test_pearson_max": 0.7293110123887395, "eval_sts-test_spearman_cosine": 0.6966954300008318, "eval_sts-test_spearman_dot": 0.5822364450229315, "eval_sts-test_spearman_euclidean": 0.7004689124572796, "eval_sts-test_spearman_manhattan": 0.7099498051685355, "eval_sts-test_spearman_max": 0.7099498051685355, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_vitaminc-pairs_loss": 1.8629615306854248, 
"eval_vitaminc-pairs_runtime": 2.7342, "eval_vitaminc-pairs_samples_per_second": 73.148, "eval_vitaminc-pairs_steps_per_second": 2.56, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_qnli-contrastive_loss": 5.418925762176514, "eval_qnli-contrastive_runtime": 0.6359, "eval_qnli-contrastive_samples_per_second": 314.496, "eval_qnli-contrastive_steps_per_second": 11.007, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_scitail-pairs-qa_loss": 0.4216327965259552, "eval_scitail-pairs-qa_runtime": 1.6135, "eval_scitail-pairs-qa_samples_per_second": 123.956, "eval_scitail-pairs-qa_steps_per_second": 4.338, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_scitail-pairs-pos_loss": 1.3018670082092285, "eval_scitail-pairs-pos_runtime": 2.6103, "eval_scitail-pairs-pos_samples_per_second": 76.619, "eval_scitail-pairs-pos_steps_per_second": 2.682, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_xsum-pairs_loss": 1.584064245223999, "eval_xsum-pairs_runtime": 2.6388, "eval_xsum-pairs_samples_per_second": 66.317, "eval_xsum-pairs_steps_per_second": 2.274, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_compression-pairs_loss": 0.7760603427886963, "eval_compression-pairs_runtime": 0.5146, "eval_compression-pairs_samples_per_second": 388.623, "eval_compression-pairs_steps_per_second": 13.602, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_sciq_pairs_loss": 5.851566314697266, "eval_sciq_pairs_runtime": 9.2089, "eval_sciq_pairs_samples_per_second": 21.718, "eval_sciq_pairs_steps_per_second": 0.76, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_qasc_pairs_loss": 7.442629814147949, "eval_qasc_pairs_runtime": 2.6477, "eval_qasc_pairs_samples_per_second": 75.537, "eval_qasc_pairs_steps_per_second": 2.644, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_openbookqa_pairs_loss": 4.049252033233643, "eval_openbookqa_pairs_runtime": 0.6399, "eval_openbookqa_pairs_samples_per_second": 107.834, "eval_openbookqa_pairs_steps_per_second": 4.688, "step": 
2631 }, { "epoch": 0.45028238918363855, "eval_msmarco_pairs_loss": 2.6957242488861084, "eval_msmarco_pairs_runtime": 3.9586, "eval_msmarco_pairs_samples_per_second": 50.523, "eval_msmarco_pairs_steps_per_second": 1.768, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_nq_pairs_loss": 3.332510471343994, "eval_nq_pairs_runtime": 8.6125, "eval_nq_pairs_samples_per_second": 23.222, "eval_nq_pairs_steps_per_second": 0.813, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_trivia_pairs_loss": 3.298595905303955, "eval_trivia_pairs_runtime": 12.8335, "eval_trivia_pairs_samples_per_second": 15.584, "eval_trivia_pairs_steps_per_second": 0.545, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_quora_pairs_loss": 0.6931056380271912, "eval_quora_pairs_runtime": 1.5975, "eval_quora_pairs_samples_per_second": 125.194, "eval_quora_pairs_steps_per_second": 4.382, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_gooaq_pairs_loss": 2.1408634185791016, "eval_gooaq_pairs_runtime": 2.6505, "eval_gooaq_pairs_samples_per_second": 75.457, "eval_gooaq_pairs_steps_per_second": 2.641, "step": 2631 }, { "epoch": 0.4504535341434195, "grad_norm": 25.071407318115234, "learning_rate": 1.4980034227039362e-05, "loss": 2.3902, "step": 2632 }, { "epoch": 0.4506246791032004, "grad_norm": 14.988327026367188, "learning_rate": 1.4985738733599544e-05, "loss": 1.1409, "step": 2633 }, { "epoch": 0.45079582406298135, "grad_norm": 26.867197036743164, "learning_rate": 1.4991443240159727e-05, "loss": 2.932, "step": 2634 }, { "epoch": 0.4509669690227623, "grad_norm": 36.01612091064453, "learning_rate": 1.4997147746719909e-05, "loss": 4.7158, "step": 2635 }, { "epoch": 0.4511381139825432, "grad_norm": 16.741594314575195, "learning_rate": 1.5002852253280092e-05, "loss": 1.3786, "step": 2636 }, { "epoch": 0.45130925894232415, "grad_norm": 27.737268447875977, "learning_rate": 1.5008556759840274e-05, "loss": 3.3069, "step": 2637 }, { "epoch": 0.4514804039021051, "grad_norm": 12.152483940124512, 
"learning_rate": 1.5014261266400457e-05, "loss": 1.0332, "step": 2638 }, { "epoch": 0.451651548861886, "grad_norm": 12.2247314453125, "learning_rate": 1.5019965772960639e-05, "loss": 0.825, "step": 2639 }, { "epoch": 0.45182269382166695, "grad_norm": 10.578752517700195, "learning_rate": 1.502567027952082e-05, "loss": 1.097, "step": 2640 }, { "epoch": 0.4519938387814479, "grad_norm": 27.834949493408203, "learning_rate": 1.5031374786081004e-05, "loss": 3.3057, "step": 2641 }, { "epoch": 0.4521649837412288, "grad_norm": 31.40846824645996, "learning_rate": 1.5037079292641186e-05, "loss": 3.5691, "step": 2642 }, { "epoch": 0.45233612870100975, "grad_norm": 37.22605514526367, "learning_rate": 1.5042783799201369e-05, "loss": 3.8636, "step": 2643 }, { "epoch": 0.4525072736607907, "grad_norm": 10.362072944641113, "learning_rate": 1.504848830576155e-05, "loss": 0.9574, "step": 2644 }, { "epoch": 0.4526784186205716, "grad_norm": 27.246967315673828, "learning_rate": 1.5054192812321734e-05, "loss": 2.6717, "step": 2645 }, { "epoch": 0.45284956358035255, "grad_norm": 17.54155921936035, "learning_rate": 1.5059897318881916e-05, "loss": 1.1453, "step": 2646 }, { "epoch": 0.4530207085401335, "grad_norm": 27.662446975708008, "learning_rate": 1.50656018254421e-05, "loss": 2.3804, "step": 2647 }, { "epoch": 0.4531918534999144, "grad_norm": 94.5572738647461, "learning_rate": 1.5071306332002284e-05, "loss": 8.0449, "step": 2648 }, { "epoch": 0.45336299845969535, "grad_norm": 30.91036605834961, "learning_rate": 1.5077010838562466e-05, "loss": 3.4507, "step": 2649 }, { "epoch": 0.4535341434194763, "grad_norm": 24.88844108581543, "learning_rate": 1.508271534512265e-05, "loss": 2.4537, "step": 2650 }, { "epoch": 0.4537052883792572, "grad_norm": 36.03679656982422, "learning_rate": 1.5088419851682831e-05, "loss": 7.241, "step": 2651 }, { "epoch": 0.45387643333903815, "grad_norm": 25.070192337036133, "learning_rate": 1.5094124358243013e-05, "loss": 2.5557, "step": 2652 }, { "epoch": 
0.4540475782988191, "grad_norm": 7.677706718444824, "learning_rate": 1.5099828864803196e-05, "loss": 0.5579, "step": 2653 }, { "epoch": 0.4542187232586, "grad_norm": 30.037338256835938, "learning_rate": 1.5105533371363378e-05, "loss": 3.2643, "step": 2654 }, { "epoch": 0.45438986821838095, "grad_norm": 25.368310928344727, "learning_rate": 1.5111237877923561e-05, "loss": 2.722, "step": 2655 }, { "epoch": 0.4545610131781619, "grad_norm": 36.92127990722656, "learning_rate": 1.5116942384483743e-05, "loss": 4.7207, "step": 2656 }, { "epoch": 0.4547321581379428, "grad_norm": 22.686552047729492, "learning_rate": 1.5122646891043926e-05, "loss": 2.1942, "step": 2657 }, { "epoch": 0.45490330309772375, "grad_norm": 53.640262603759766, "learning_rate": 1.5128351397604108e-05, "loss": 6.8632, "step": 2658 }, { "epoch": 0.4550744480575047, "grad_norm": 24.542247772216797, "learning_rate": 1.513405590416429e-05, "loss": 2.4562, "step": 2659 }, { "epoch": 0.4552455930172856, "grad_norm": 5.353951930999756, "learning_rate": 1.5139760410724473e-05, "loss": 0.5523, "step": 2660 }, { "epoch": 0.45541673797706655, "grad_norm": 32.79592514038086, "learning_rate": 1.5145464917284655e-05, "loss": 3.4424, "step": 2661 }, { "epoch": 0.4555878829368475, "grad_norm": 35.7240104675293, "learning_rate": 1.5151169423844838e-05, "loss": 3.5062, "step": 2662 }, { "epoch": 0.4557590278966284, "grad_norm": 30.997047424316406, "learning_rate": 1.515687393040502e-05, "loss": 3.9807, "step": 2663 }, { "epoch": 0.45593017285640935, "grad_norm": 41.52260208129883, "learning_rate": 1.5162578436965201e-05, "loss": 4.4682, "step": 2664 }, { "epoch": 0.4561013178161903, "grad_norm": 33.410797119140625, "learning_rate": 1.5168282943525385e-05, "loss": 3.3602, "step": 2665 }, { "epoch": 0.4562724627759712, "grad_norm": 22.308074951171875, "learning_rate": 1.5173987450085566e-05, "loss": 2.5111, "step": 2666 }, { "epoch": 0.45644360773575215, "grad_norm": 21.073827743530273, "learning_rate": 
1.517969195664575e-05, "loss": 2.1332, "step": 2667 }, { "epoch": 0.45661475269553314, "grad_norm": 36.2976188659668, "learning_rate": 1.5185396463205931e-05, "loss": 4.8541, "step": 2668 }, { "epoch": 0.4567858976553141, "grad_norm": 37.76522445678711, "learning_rate": 1.5191100969766115e-05, "loss": 6.96, "step": 2669 }, { "epoch": 0.456957042615095, "grad_norm": 29.864612579345703, "learning_rate": 1.51968054763263e-05, "loss": 2.896, "step": 2670 }, { "epoch": 0.45712818757487594, "grad_norm": 22.04704475402832, "learning_rate": 1.5202509982886482e-05, "loss": 2.6772, "step": 2671 }, { "epoch": 0.4572993325346569, "grad_norm": 19.153793334960938, "learning_rate": 1.5208214489446665e-05, "loss": 1.7357, "step": 2672 }, { "epoch": 0.4574704774944378, "grad_norm": 30.495540618896484, "learning_rate": 1.5213918996006847e-05, "loss": 3.1067, "step": 2673 }, { "epoch": 0.45764162245421874, "grad_norm": 12.724396705627441, "learning_rate": 1.521962350256703e-05, "loss": 0.9931, "step": 2674 }, { "epoch": 0.4578127674139997, "grad_norm": 6.2942399978637695, "learning_rate": 1.5225328009127212e-05, "loss": 0.5454, "step": 2675 }, { "epoch": 0.4579839123737806, "grad_norm": 10.231136322021484, "learning_rate": 1.5231032515687395e-05, "loss": 1.696, "step": 2676 }, { "epoch": 0.45815505733356154, "grad_norm": 152.32469177246094, "learning_rate": 1.5236737022247577e-05, "loss": 8.8958, "step": 2677 }, { "epoch": 0.4583262022933425, "grad_norm": 38.06270980834961, "learning_rate": 1.5242441528807758e-05, "loss": 3.9409, "step": 2678 }, { "epoch": 0.4584973472531234, "grad_norm": 25.77074432373047, "learning_rate": 1.5248146035367942e-05, "loss": 2.6594, "step": 2679 }, { "epoch": 0.45866849221290434, "grad_norm": 29.309284210205078, "learning_rate": 1.5253850541928123e-05, "loss": 3.3099, "step": 2680 }, { "epoch": 0.4588396371726853, "grad_norm": 25.866558074951172, "learning_rate": 1.5259555048488305e-05, "loss": 3.4843, "step": 2681 }, { "epoch": 0.4590107821324662, 
"grad_norm": 23.736045837402344, "learning_rate": 1.526525955504849e-05, "loss": 2.5762, "step": 2682 }, { "epoch": 0.45918192709224714, "grad_norm": 24.48654556274414, "learning_rate": 1.5270964061608672e-05, "loss": 2.4442, "step": 2683 }, { "epoch": 0.4593530720520281, "grad_norm": 9.48880386352539, "learning_rate": 1.5276668568168852e-05, "loss": 1.0744, "step": 2684 }, { "epoch": 0.459524217011809, "grad_norm": 5.050061225891113, "learning_rate": 1.5282373074729035e-05, "loss": 0.4942, "step": 2685 }, { "epoch": 0.45969536197158994, "grad_norm": 22.153188705444336, "learning_rate": 1.528807758128922e-05, "loss": 2.3914, "step": 2686 }, { "epoch": 0.4598665069313709, "grad_norm": 3.6951212882995605, "learning_rate": 1.5293782087849402e-05, "loss": 0.4808, "step": 2687 }, { "epoch": 0.4600376518911518, "grad_norm": 11.074764251708984, "learning_rate": 1.5299486594409582e-05, "loss": 1.7231, "step": 2688 }, { "epoch": 0.46020879685093274, "grad_norm": 19.63498306274414, "learning_rate": 1.5305191100969765e-05, "loss": 2.1931, "step": 2689 }, { "epoch": 0.4603799418107137, "grad_norm": 30.376020431518555, "learning_rate": 1.531089560752995e-05, "loss": 3.2142, "step": 2690 }, { "epoch": 0.4605510867704946, "grad_norm": 39.05623245239258, "learning_rate": 1.531660011409013e-05, "loss": 4.3445, "step": 2691 }, { "epoch": 0.46072223173027554, "grad_norm": 34.95427703857422, "learning_rate": 1.5322304620650312e-05, "loss": 3.5087, "step": 2692 }, { "epoch": 0.4608933766900565, "grad_norm": 24.339468002319336, "learning_rate": 1.53280091272105e-05, "loss": 2.139, "step": 2693 }, { "epoch": 0.4610645216498374, "grad_norm": 36.85024642944336, "learning_rate": 1.5333713633770682e-05, "loss": 4.5667, "step": 2694 }, { "epoch": 0.46123566660961834, "grad_norm": 121.48307037353516, "learning_rate": 1.5339418140330862e-05, "loss": 5.9012, "step": 2695 }, { "epoch": 0.4614068115693993, "grad_norm": 7.473188877105713, "learning_rate": 1.5345122646891046e-05, "loss": 0.9301, 
"step": 2696 }, { "epoch": 0.4615779565291802, "grad_norm": 27.48497200012207, "learning_rate": 1.535082715345123e-05, "loss": 3.0351, "step": 2697 }, { "epoch": 0.46174910148896114, "grad_norm": 22.619394302368164, "learning_rate": 1.535653166001141e-05, "loss": 2.4788, "step": 2698 }, { "epoch": 0.4619202464487421, "grad_norm": 25.0198974609375, "learning_rate": 1.5362236166571592e-05, "loss": 2.3989, "step": 2699 }, { "epoch": 0.462091391408523, "grad_norm": 23.36564064025879, "learning_rate": 1.5367940673131776e-05, "loss": 2.4179, "step": 2700 }, { "epoch": 0.46226253636830394, "grad_norm": 29.04068946838379, "learning_rate": 1.537364517969196e-05, "loss": 3.1229, "step": 2701 }, { "epoch": 0.4624336813280849, "grad_norm": 27.629722595214844, "learning_rate": 1.537934968625214e-05, "loss": 2.7618, "step": 2702 }, { "epoch": 0.4626048262878658, "grad_norm": 23.081079483032227, "learning_rate": 1.5385054192812323e-05, "loss": 2.8201, "step": 2703 }, { "epoch": 0.46277597124764674, "grad_norm": 26.009172439575195, "learning_rate": 1.5390758699372506e-05, "loss": 3.1322, "step": 2704 }, { "epoch": 0.4629471162074277, "grad_norm": 18.447147369384766, "learning_rate": 1.5396463205932686e-05, "loss": 1.2356, "step": 2705 }, { "epoch": 0.4631182611672086, "grad_norm": 22.773012161254883, "learning_rate": 1.540216771249287e-05, "loss": 2.6551, "step": 2706 }, { "epoch": 0.46328940612698954, "grad_norm": 32.899314880371094, "learning_rate": 1.5407872219053053e-05, "loss": 3.7763, "step": 2707 }, { "epoch": 0.4634605510867705, "grad_norm": 97.4777603149414, "learning_rate": 1.5413576725613233e-05, "loss": 4.5767, "step": 2708 }, { "epoch": 0.4636316960465514, "grad_norm": 41.41079330444336, "learning_rate": 1.5419281232173416e-05, "loss": 7.352, "step": 2709 }, { "epoch": 0.46380284100633234, "grad_norm": 24.83094024658203, "learning_rate": 1.54249857387336e-05, "loss": 2.836, "step": 2710 }, { "epoch": 0.4639739859661133, "grad_norm": 12.101001739501953, 
"learning_rate": 1.5430690245293783e-05, "loss": 0.9624, "step": 2711 }, { "epoch": 0.4641451309258942, "grad_norm": 24.289182662963867, "learning_rate": 1.5436394751853963e-05, "loss": 2.3101, "step": 2712 }, { "epoch": 0.46431627588567514, "grad_norm": 23.911334991455078, "learning_rate": 1.5442099258414146e-05, "loss": 2.4969, "step": 2713 }, { "epoch": 0.4644874208454561, "grad_norm": 35.51081085205078, "learning_rate": 1.544780376497433e-05, "loss": 3.353, "step": 2714 }, { "epoch": 0.464658565805237, "grad_norm": 21.24627113342285, "learning_rate": 1.545350827153451e-05, "loss": 2.5466, "step": 2715 }, { "epoch": 0.46482971076501794, "grad_norm": 30.70880126953125, "learning_rate": 1.5459212778094696e-05, "loss": 3.8228, "step": 2716 }, { "epoch": 0.4650008557247989, "grad_norm": 25.956119537353516, "learning_rate": 1.546491728465488e-05, "loss": 2.6475, "step": 2717 }, { "epoch": 0.4651720006845798, "grad_norm": 37.32086944580078, "learning_rate": 1.5470621791215063e-05, "loss": 3.6192, "step": 2718 }, { "epoch": 0.4653431456443608, "grad_norm": 25.61843490600586, "learning_rate": 1.5476326297775243e-05, "loss": 2.336, "step": 2719 }, { "epoch": 0.46551429060414173, "grad_norm": 31.5511531829834, "learning_rate": 1.5482030804335426e-05, "loss": 3.1832, "step": 2720 }, { "epoch": 0.46568543556392267, "grad_norm": 35.96617889404297, "learning_rate": 1.548773531089561e-05, "loss": 4.2684, "step": 2721 }, { "epoch": 0.4658565805237036, "grad_norm": 12.214024543762207, "learning_rate": 1.549343981745579e-05, "loss": 0.8686, "step": 2722 }, { "epoch": 0.46602772548348453, "grad_norm": 3.517146110534668, "learning_rate": 1.5499144324015973e-05, "loss": 0.4999, "step": 2723 }, { "epoch": 0.46619887044326547, "grad_norm": 27.56136703491211, "learning_rate": 1.5504848830576157e-05, "loss": 2.7019, "step": 2724 }, { "epoch": 0.4663700154030464, "grad_norm": 4.812444686889648, "learning_rate": 1.551055333713634e-05, "loss": 0.5107, "step": 2725 }, { "epoch": 
0.46654116036282733, "grad_norm": 30.523237228393555, "learning_rate": 1.551625784369652e-05, "loss": 3.2109, "step": 2726 }, { "epoch": 0.46671230532260827, "grad_norm": 28.326934814453125, "learning_rate": 1.5521962350256703e-05, "loss": 3.2289, "step": 2727 }, { "epoch": 0.4668834502823892, "grad_norm": 34.37868118286133, "learning_rate": 1.5527666856816887e-05, "loss": 3.6814, "step": 2728 }, { "epoch": 0.46705459524217013, "grad_norm": 30.16160774230957, "learning_rate": 1.5533371363377067e-05, "loss": 3.4049, "step": 2729 }, { "epoch": 0.46722574020195107, "grad_norm": 4.218698024749756, "learning_rate": 1.553907586993725e-05, "loss": 0.4987, "step": 2730 }, { "epoch": 0.467396885161732, "grad_norm": 23.180875778198242, "learning_rate": 1.5544780376497433e-05, "loss": 2.2238, "step": 2731 }, { "epoch": 0.46756803012151293, "grad_norm": 25.21503257751465, "learning_rate": 1.5550484883057617e-05, "loss": 2.4819, "step": 2732 }, { "epoch": 0.46773917508129387, "grad_norm": 30.37474822998047, "learning_rate": 1.5556189389617797e-05, "loss": 3.2935, "step": 2733 }, { "epoch": 0.4679103200410748, "grad_norm": 16.8712100982666, "learning_rate": 1.556189389617798e-05, "loss": 1.0892, "step": 2734 }, { "epoch": 0.46808146500085573, "grad_norm": 23.52683448791504, "learning_rate": 1.5567598402738164e-05, "loss": 2.3256, "step": 2735 }, { "epoch": 0.46825260996063667, "grad_norm": 37.76002502441406, "learning_rate": 1.5573302909298344e-05, "loss": 3.8535, "step": 2736 }, { "epoch": 0.4684237549204176, "grad_norm": 31.672475814819336, "learning_rate": 1.5579007415858527e-05, "loss": 2.5348, "step": 2737 }, { "epoch": 0.46859489988019853, "grad_norm": 59.173072814941406, "learning_rate": 1.558471192241871e-05, "loss": 7.7627, "step": 2738 }, { "epoch": 0.46876604483997947, "grad_norm": 23.428421020507812, "learning_rate": 1.5590416428978894e-05, "loss": 2.3317, "step": 2739 }, { "epoch": 0.4689371897997604, "grad_norm": 38.1778564453125, "learning_rate": 
1.5596120935539077e-05, "loss": 7.6561, "step": 2740 }, { "epoch": 0.46910833475954133, "grad_norm": 10.163063049316406, "learning_rate": 1.560182544209926e-05, "loss": 0.7524, "step": 2741 }, { "epoch": 0.46927947971932227, "grad_norm": 3.395460367202759, "learning_rate": 1.5607529948659444e-05, "loss": 0.4881, "step": 2742 }, { "epoch": 0.4694506246791032, "grad_norm": 28.233747482299805, "learning_rate": 1.5613234455219624e-05, "loss": 2.606, "step": 2743 }, { "epoch": 0.46962176963888413, "grad_norm": 33.14704513549805, "learning_rate": 1.5618938961779807e-05, "loss": 3.4617, "step": 2744 }, { "epoch": 0.46979291459866507, "grad_norm": 4.885557651519775, "learning_rate": 1.562464346833999e-05, "loss": 0.5159, "step": 2745 }, { "epoch": 0.469964059558446, "grad_norm": 32.37671661376953, "learning_rate": 1.563034797490017e-05, "loss": 3.1744, "step": 2746 }, { "epoch": 0.47013520451822693, "grad_norm": 26.680980682373047, "learning_rate": 1.5636052481460354e-05, "loss": 2.8685, "step": 2747 }, { "epoch": 0.47030634947800787, "grad_norm": 27.004371643066406, "learning_rate": 1.5641756988020537e-05, "loss": 3.0092, "step": 2748 }, { "epoch": 0.4704774944377888, "grad_norm": 21.964834213256836, "learning_rate": 1.564746149458072e-05, "loss": 2.2193, "step": 2749 }, { "epoch": 0.47064863939756973, "grad_norm": 66.10285186767578, "learning_rate": 1.56531660011409e-05, "loss": 7.346, "step": 2750 }, { "epoch": 0.47081978435735067, "grad_norm": 6.018518924713135, "learning_rate": 1.5658870507701084e-05, "loss": 0.5488, "step": 2751 }, { "epoch": 0.4709909293171316, "grad_norm": 30.385318756103516, "learning_rate": 1.5664575014261267e-05, "loss": 3.0093, "step": 2752 }, { "epoch": 0.47116207427691253, "grad_norm": 13.027030944824219, "learning_rate": 1.5670279520821447e-05, "loss": 1.9682, "step": 2753 }, { "epoch": 0.47133321923669347, "grad_norm": 35.71416473388672, "learning_rate": 1.567598402738163e-05, "loss": 6.69, "step": 2754 }, { "epoch": 0.4715043641964744, 
"grad_norm": 29.253435134887695, "learning_rate": 1.5681688533941814e-05, "loss": 2.932, "step": 2755 }, { "epoch": 0.47167550915625533, "grad_norm": 30.07666778564453, "learning_rate": 1.5687393040501998e-05, "loss": 3.8444, "step": 2756 }, { "epoch": 0.47184665411603627, "grad_norm": 35.976871490478516, "learning_rate": 1.5693097547062178e-05, "loss": 4.9907, "step": 2757 }, { "epoch": 0.4720177990758172, "grad_norm": 87.46236419677734, "learning_rate": 1.569880205362236e-05, "loss": 4.691, "step": 2758 }, { "epoch": 0.47218894403559813, "grad_norm": 22.59965705871582, "learning_rate": 1.5704506560182544e-05, "loss": 2.1086, "step": 2759 }, { "epoch": 0.47236008899537907, "grad_norm": 8.256918907165527, "learning_rate": 1.5710211066742724e-05, "loss": 0.9678, "step": 2760 }, { "epoch": 0.47253123395516, "grad_norm": 38.63548278808594, "learning_rate": 1.5715915573302908e-05, "loss": 3.4149, "step": 2761 }, { "epoch": 0.47270237891494093, "grad_norm": 7.380117893218994, "learning_rate": 1.5721620079863094e-05, "loss": 0.6134, "step": 2762 }, { "epoch": 0.47287352387472187, "grad_norm": 27.26441764831543, "learning_rate": 1.5727324586423278e-05, "loss": 2.8164, "step": 2763 }, { "epoch": 0.4730446688345028, "grad_norm": 18.643917083740234, "learning_rate": 1.5733029092983458e-05, "loss": 1.9656, "step": 2764 }, { "epoch": 0.47321581379428374, "grad_norm": 27.289445877075195, "learning_rate": 1.573873359954364e-05, "loss": 2.8402, "step": 2765 }, { "epoch": 0.47338695875406467, "grad_norm": 6.67548942565918, "learning_rate": 1.5744438106103825e-05, "loss": 0.7842, "step": 2766 }, { "epoch": 0.4735581037138456, "grad_norm": 33.30831527709961, "learning_rate": 1.5750142612664005e-05, "loss": 3.6011, "step": 2767 }, { "epoch": 0.47372924867362654, "grad_norm": 104.88871765136719, "learning_rate": 1.5755847119224188e-05, "loss": 5.0141, "step": 2768 }, { "epoch": 0.47390039363340747, "grad_norm": 5.102696418762207, "learning_rate": 1.576155162578437e-05, "loss": 0.5258, 
"step": 2769 }, { "epoch": 0.47407153859318846, "grad_norm": 3.6947317123413086, "learning_rate": 1.576725613234455e-05, "loss": 0.4833, "step": 2770 }, { "epoch": 0.4742426835529694, "grad_norm": 32.43489074707031, "learning_rate": 1.5772960638904735e-05, "loss": 3.7272, "step": 2771 }, { "epoch": 0.4744138285127503, "grad_norm": 32.32176971435547, "learning_rate": 1.5778665145464918e-05, "loss": 3.4356, "step": 2772 }, { "epoch": 0.47458497347253126, "grad_norm": 2.583381175994873, "learning_rate": 1.57843696520251e-05, "loss": 0.4212, "step": 2773 }, { "epoch": 0.4747561184323122, "grad_norm": 38.029869079589844, "learning_rate": 1.579007415858528e-05, "loss": 7.2033, "step": 2774 }, { "epoch": 0.4749272633920931, "grad_norm": 10.577736854553223, "learning_rate": 1.5795778665145465e-05, "loss": 1.2395, "step": 2775 }, { "epoch": 0.47509840835187406, "grad_norm": 9.981147766113281, "learning_rate": 1.5801483171705648e-05, "loss": 1.4924, "step": 2776 }, { "epoch": 0.475269553311655, "grad_norm": 28.36383819580078, "learning_rate": 1.5807187678265828e-05, "loss": 3.8155, "step": 2777 }, { "epoch": 0.4754406982714359, "grad_norm": 6.329680442810059, "learning_rate": 1.581289218482601e-05, "loss": 0.4932, "step": 2778 }, { "epoch": 0.47561184323121686, "grad_norm": 17.587629318237305, "learning_rate": 1.5818596691386195e-05, "loss": 1.8358, "step": 2779 }, { "epoch": 0.4757829881909978, "grad_norm": 32.48772048950195, "learning_rate": 1.582430119794638e-05, "loss": 4.1859, "step": 2780 }, { "epoch": 0.4759541331507787, "grad_norm": 41.349056243896484, "learning_rate": 1.583000570450656e-05, "loss": 7.0338, "step": 2781 }, { "epoch": 0.47612527811055966, "grad_norm": 32.28718185424805, "learning_rate": 1.583571021106674e-05, "loss": 3.439, "step": 2782 }, { "epoch": 0.4762964230703406, "grad_norm": 52.53911209106445, "learning_rate": 1.5841414717626925e-05, "loss": 6.9516, "step": 2783 }, { "epoch": 0.4764675680301215, "grad_norm": 6.450766086578369, "learning_rate": 
1.5847119224187105e-05, "loss": 0.4587, "step": 2784 }, { "epoch": 0.47663871298990246, "grad_norm": 31.295753479003906, "learning_rate": 1.5852823730747292e-05, "loss": 3.6037, "step": 2785 }, { "epoch": 0.4768098579496834, "grad_norm": 10.392585754394531, "learning_rate": 1.5858528237307475e-05, "loss": 0.7695, "step": 2786 }, { "epoch": 0.4769810029094643, "grad_norm": 31.578166961669922, "learning_rate": 1.586423274386766e-05, "loss": 3.4914, "step": 2787 }, { "epoch": 0.47715214786924526, "grad_norm": 35.540199279785156, "learning_rate": 1.586993725042784e-05, "loss": 4.0507, "step": 2788 }, { "epoch": 0.4773232928290262, "grad_norm": 30.065216064453125, "learning_rate": 1.5875641756988022e-05, "loss": 3.4183, "step": 2789 }, { "epoch": 0.4774944377888071, "grad_norm": 3.6649258136749268, "learning_rate": 1.5881346263548205e-05, "loss": 0.4127, "step": 2790 }, { "epoch": 0.47766558274858806, "grad_norm": 88.72532653808594, "learning_rate": 1.5887050770108385e-05, "loss": 4.5608, "step": 2791 }, { "epoch": 0.477836727708369, "grad_norm": 23.770221710205078, "learning_rate": 1.589275527666857e-05, "loss": 2.2223, "step": 2792 }, { "epoch": 0.4780078726681499, "grad_norm": 5.284163951873779, "learning_rate": 1.5898459783228752e-05, "loss": 0.5186, "step": 2793 }, { "epoch": 0.47817901762793086, "grad_norm": 29.41139793395996, "learning_rate": 1.5904164289788935e-05, "loss": 3.0647, "step": 2794 }, { "epoch": 0.4783501625877118, "grad_norm": 26.757612228393555, "learning_rate": 1.5909868796349115e-05, "loss": 2.3827, "step": 2795 }, { "epoch": 0.4785213075474927, "grad_norm": 12.758798599243164, "learning_rate": 1.59155733029093e-05, "loss": 0.818, "step": 2796 }, { "epoch": 0.47869245250727366, "grad_norm": 29.093143463134766, "learning_rate": 1.5921277809469482e-05, "loss": 2.9151, "step": 2797 }, { "epoch": 0.4788635974670546, "grad_norm": 126.96649932861328, "learning_rate": 1.5926982316029662e-05, "loss": 8.4343, "step": 2798 }, { "epoch": 0.4790347424268355, 
"grad_norm": 31.195518493652344, "learning_rate": 1.5932686822589846e-05, "loss": 3.7256, "step": 2799 }, { "epoch": 0.47920588738661646, "grad_norm": 28.148395538330078, "learning_rate": 1.593839132915003e-05, "loss": 2.8813, "step": 2800 }, { "epoch": 0.4793770323463974, "grad_norm": 36.015403747558594, "learning_rate": 1.5944095835710212e-05, "loss": 4.6005, "step": 2801 }, { "epoch": 0.4795481773061783, "grad_norm": 31.1592960357666, "learning_rate": 1.5949800342270392e-05, "loss": 3.1305, "step": 2802 }, { "epoch": 0.47971932226595926, "grad_norm": 128.36007690429688, "learning_rate": 1.5955504848830576e-05, "loss": 4.3169, "step": 2803 }, { "epoch": 0.4798904672257402, "grad_norm": 13.735505104064941, "learning_rate": 1.596120935539076e-05, "loss": 0.9245, "step": 2804 }, { "epoch": 0.4800616121855211, "grad_norm": 42.414024353027344, "learning_rate": 1.596691386195094e-05, "loss": 6.975, "step": 2805 }, { "epoch": 0.48023275714530206, "grad_norm": 31.763032913208008, "learning_rate": 1.5972618368511122e-05, "loss": 3.1284, "step": 2806 }, { "epoch": 0.480403902105083, "grad_norm": 27.716442108154297, "learning_rate": 1.597832287507131e-05, "loss": 3.0007, "step": 2807 }, { "epoch": 0.4805750470648639, "grad_norm": 32.059425354003906, "learning_rate": 1.598402738163149e-05, "loss": 4.0829, "step": 2808 }, { "epoch": 0.48074619202464486, "grad_norm": 36.31050491333008, "learning_rate": 1.5989731888191673e-05, "loss": 4.8666, "step": 2809 }, { "epoch": 0.4809173369844258, "grad_norm": 23.16267204284668, "learning_rate": 1.5995436394751856e-05, "loss": 2.0993, "step": 2810 }, { "epoch": 0.4810884819442067, "grad_norm": 12.366683006286621, "learning_rate": 1.600114090131204e-05, "loss": 0.913, "step": 2811 }, { "epoch": 0.48125962690398766, "grad_norm": 11.630936622619629, "learning_rate": 1.600684540787222e-05, "loss": 1.233, "step": 2812 }, { "epoch": 0.4814307718637686, "grad_norm": 5.433574676513672, "learning_rate": 1.6012549914432403e-05, "loss": 0.5408, 
"step": 2813 }, { "epoch": 0.4816019168235495, "grad_norm": 25.152584075927734, "learning_rate": 1.6018254420992586e-05, "loss": 2.7558, "step": 2814 }, { "epoch": 0.48177306178333046, "grad_norm": 32.2104377746582, "learning_rate": 1.6023958927552766e-05, "loss": 3.56, "step": 2815 }, { "epoch": 0.4819442067431114, "grad_norm": 21.62321662902832, "learning_rate": 1.602966343411295e-05, "loss": 2.626, "step": 2816 }, { "epoch": 0.4821153517028923, "grad_norm": 27.26594352722168, "learning_rate": 1.6035367940673133e-05, "loss": 3.057, "step": 2817 }, { "epoch": 0.48228649666267326, "grad_norm": 29.751848220825195, "learning_rate": 1.6041072447233316e-05, "loss": 3.0557, "step": 2818 }, { "epoch": 0.4824576416224542, "grad_norm": 28.00129508972168, "learning_rate": 1.6046776953793496e-05, "loss": 2.9606, "step": 2819 }, { "epoch": 0.4826287865822352, "grad_norm": 10.21130084991455, "learning_rate": 1.605248146035368e-05, "loss": 1.0526, "step": 2820 }, { "epoch": 0.4827999315420161, "grad_norm": 194.53099060058594, "learning_rate": 1.6058185966913863e-05, "loss": 9.7692, "step": 2821 }, { "epoch": 0.48297107650179705, "grad_norm": 20.116971969604492, "learning_rate": 1.6063890473474043e-05, "loss": 1.702, "step": 2822 }, { "epoch": 0.483142221461578, "grad_norm": 25.585695266723633, "learning_rate": 1.6069594980034226e-05, "loss": 3.1031, "step": 2823 }, { "epoch": 0.4833133664213589, "grad_norm": 10.690316200256348, "learning_rate": 1.607529948659441e-05, "loss": 1.112, "step": 2824 }, { "epoch": 0.48348451138113985, "grad_norm": 46.31101989746094, "learning_rate": 1.6081003993154593e-05, "loss": 7.0695, "step": 2825 }, { "epoch": 0.4836556563409208, "grad_norm": 2.7449769973754883, "learning_rate": 1.6086708499714773e-05, "loss": 0.412, "step": 2826 }, { "epoch": 0.4838268013007017, "grad_norm": 31.60761260986328, "learning_rate": 1.6092413006274956e-05, "loss": 3.5248, "step": 2827 }, { "epoch": 0.48399794626048265, "grad_norm": 30.04467010498047, "learning_rate": 
1.609811751283514e-05, "loss": 3.5359, "step": 2828 }, { "epoch": 0.4841690912202636, "grad_norm": 10.859264373779297, "learning_rate": 1.610382201939532e-05, "loss": 0.9806, "step": 2829 }, { "epoch": 0.4843402361800445, "grad_norm": 24.42304229736328, "learning_rate": 1.6109526525955507e-05, "loss": 2.5163, "step": 2830 }, { "epoch": 0.48451138113982545, "grad_norm": 32.02371597290039, "learning_rate": 1.611523103251569e-05, "loss": 4.1818, "step": 2831 }, { "epoch": 0.4846825260996064, "grad_norm": 35.690147399902344, "learning_rate": 1.6120935539075873e-05, "loss": 3.3438, "step": 2832 }, { "epoch": 0.4848536710593873, "grad_norm": 25.543243408203125, "learning_rate": 1.6126640045636053e-05, "loss": 2.5981, "step": 2833 }, { "epoch": 0.48502481601916825, "grad_norm": 25.119115829467773, "learning_rate": 1.6132344552196237e-05, "loss": 2.1322, "step": 2834 }, { "epoch": 0.4851959609789492, "grad_norm": 23.112409591674805, "learning_rate": 1.613804905875642e-05, "loss": 2.5395, "step": 2835 }, { "epoch": 0.4853671059387301, "grad_norm": 91.41179656982422, "learning_rate": 1.61437535653166e-05, "loss": 4.2215, "step": 2836 }, { "epoch": 0.48553825089851105, "grad_norm": 34.66135787963867, "learning_rate": 1.6149458071876783e-05, "loss": 3.1988, "step": 2837 }, { "epoch": 0.485709395858292, "grad_norm": 28.888839721679688, "learning_rate": 1.6155162578436967e-05, "loss": 3.1345, "step": 2838 }, { "epoch": 0.4858805408180729, "grad_norm": 63.08065414428711, "learning_rate": 1.6160867084997147e-05, "loss": 8.1288, "step": 2839 }, { "epoch": 0.48605168577785385, "grad_norm": 148.98455810546875, "learning_rate": 1.616657159155733e-05, "loss": 4.9747, "step": 2840 }, { "epoch": 0.4862228307376348, "grad_norm": 29.048202514648438, "learning_rate": 1.6172276098117514e-05, "loss": 2.9531, "step": 2841 }, { "epoch": 0.4863939756974157, "grad_norm": 6.495917320251465, "learning_rate": 1.6177980604677697e-05, "loss": 0.5056, "step": 2842 }, { "epoch": 0.48656512065719665, 
"grad_norm": 8.356714248657227, "learning_rate": 1.6183685111237877e-05, "loss": 0.9125, "step": 2843 }, { "epoch": 0.4867362656169776, "grad_norm": 26.18461036682129, "learning_rate": 1.618938961779806e-05, "loss": 3.175, "step": 2844 }, { "epoch": 0.4869074105767585, "grad_norm": 9.202829360961914, "learning_rate": 1.6195094124358244e-05, "loss": 1.0864, "step": 2845 }, { "epoch": 0.48707855553653945, "grad_norm": 34.182373046875, "learning_rate": 1.6200798630918424e-05, "loss": 2.7523, "step": 2846 }, { "epoch": 0.4872497004963204, "grad_norm": 68.9462890625, "learning_rate": 1.6206503137478607e-05, "loss": 3.7044, "step": 2847 }, { "epoch": 0.4874208454561013, "grad_norm": 24.633121490478516, "learning_rate": 1.621220764403879e-05, "loss": 2.6342, "step": 2848 }, { "epoch": 0.48759199041588225, "grad_norm": 32.68869400024414, "learning_rate": 1.6217912150598974e-05, "loss": 4.6795, "step": 2849 }, { "epoch": 0.4877631353756632, "grad_norm": 28.001712799072266, "learning_rate": 1.6223616657159154e-05, "loss": 3.3885, "step": 2850 }, { "epoch": 0.4879342803354441, "grad_norm": 4.1197099685668945, "learning_rate": 1.6229321163719337e-05, "loss": 0.4097, "step": 2851 }, { "epoch": 0.48810542529522505, "grad_norm": 29.35110092163086, "learning_rate": 1.623502567027952e-05, "loss": 3.5865, "step": 2852 }, { "epoch": 0.488276570255006, "grad_norm": 26.92041778564453, "learning_rate": 1.6240730176839704e-05, "loss": 2.7247, "step": 2853 }, { "epoch": 0.4884477152147869, "grad_norm": 34.873775482177734, "learning_rate": 1.6246434683399887e-05, "loss": 7.1172, "step": 2854 }, { "epoch": 0.48861886017456785, "grad_norm": 24.180212020874023, "learning_rate": 1.625213918996007e-05, "loss": 2.4944, "step": 2855 }, { "epoch": 0.4887900051343488, "grad_norm": 28.294334411621094, "learning_rate": 1.6257843696520254e-05, "loss": 3.4049, "step": 2856 }, { "epoch": 0.4889611500941297, "grad_norm": 20.231170654296875, "learning_rate": 1.6263548203080434e-05, "loss": 2.2117, "step": 
2857 }, { "epoch": 0.48913229505391065, "grad_norm": 21.00507164001465, "learning_rate": 1.6269252709640617e-05, "loss": 1.8153, "step": 2858 }, { "epoch": 0.4893034400136916, "grad_norm": 26.58632469177246, "learning_rate": 1.62749572162008e-05, "loss": 2.7509, "step": 2859 }, { "epoch": 0.4894745849734725, "grad_norm": 25.922264099121094, "learning_rate": 1.628066172276098e-05, "loss": 3.0767, "step": 2860 }, { "epoch": 0.48964572993325345, "grad_norm": 36.93525695800781, "learning_rate": 1.6286366229321164e-05, "loss": 6.587, "step": 2861 }, { "epoch": 0.4898168748930344, "grad_norm": 30.786312103271484, "learning_rate": 1.6292070735881348e-05, "loss": 4.5453, "step": 2862 }, { "epoch": 0.4899880198528153, "grad_norm": 10.850686073303223, "learning_rate": 1.629777524244153e-05, "loss": 0.8675, "step": 2863 }, { "epoch": 0.49015916481259625, "grad_norm": 22.04916763305664, "learning_rate": 1.630347974900171e-05, "loss": 2.1868, "step": 2864 }, { "epoch": 0.4903303097723772, "grad_norm": 27.125104904174805, "learning_rate": 1.6309184255561894e-05, "loss": 2.7107, "step": 2865 }, { "epoch": 0.4905014547321581, "grad_norm": 26.232017517089844, "learning_rate": 1.6314888762122078e-05, "loss": 3.0023, "step": 2866 }, { "epoch": 0.49067259969193905, "grad_norm": 2.6513617038726807, "learning_rate": 1.6320593268682258e-05, "loss": 0.4064, "step": 2867 }, { "epoch": 0.49084374465172, "grad_norm": 29.269208908081055, "learning_rate": 1.632629777524244e-05, "loss": 2.9074, "step": 2868 }, { "epoch": 0.4910148896115009, "grad_norm": 28.653419494628906, "learning_rate": 1.6332002281802624e-05, "loss": 2.749, "step": 2869 }, { "epoch": 0.49118603457128185, "grad_norm": 24.419513702392578, "learning_rate": 1.6337706788362808e-05, "loss": 2.4733, "step": 2870 }, { "epoch": 0.49135717953106284, "grad_norm": 31.81149673461914, "learning_rate": 1.6343411294922988e-05, "loss": 3.0798, "step": 2871 }, { "epoch": 0.49152832449084377, "grad_norm": 7.05307149887085, "learning_rate": 
1.634911580148317e-05, "loss": 0.6362, "step": 2872 }, { "epoch": 0.4916994694506247, "grad_norm": 22.482975006103516, "learning_rate": 1.6354820308043355e-05, "loss": 2.0351, "step": 2873 }, { "epoch": 0.49187061441040564, "grad_norm": 9.290128707885742, "learning_rate": 1.6360524814603535e-05, "loss": 0.6272, "step": 2874 }, { "epoch": 0.49204175937018657, "grad_norm": 27.201467514038086, "learning_rate": 1.6366229321163718e-05, "loss": 2.6431, "step": 2875 }, { "epoch": 0.4922129043299675, "grad_norm": 44.08928298950195, "learning_rate": 1.6371933827723905e-05, "loss": 6.6881, "step": 2876 }, { "epoch": 0.49238404928974844, "grad_norm": 14.08613109588623, "learning_rate": 1.6377638334284085e-05, "loss": 1.0184, "step": 2877 }, { "epoch": 0.49255519424952937, "grad_norm": 19.89874839782715, "learning_rate": 1.6383342840844268e-05, "loss": 2.0983, "step": 2878 }, { "epoch": 0.4927263392093103, "grad_norm": 31.281314849853516, "learning_rate": 1.638904734740445e-05, "loss": 4.3604, "step": 2879 }, { "epoch": 0.49289748416909124, "grad_norm": 4.3934245109558105, "learning_rate": 1.6394751853964635e-05, "loss": 0.4535, "step": 2880 }, { "epoch": 0.49306862912887217, "grad_norm": 16.13640785217285, "learning_rate": 1.6400456360524815e-05, "loss": 1.4628, "step": 2881 }, { "epoch": 0.4932397740886531, "grad_norm": 2.4228832721710205, "learning_rate": 1.6406160867084998e-05, "loss": 0.3669, "step": 2882 }, { "epoch": 0.49341091904843404, "grad_norm": 39.298160552978516, "learning_rate": 1.641186537364518e-05, "loss": 5.1978, "step": 2883 }, { "epoch": 0.49358206400821497, "grad_norm": 7.103499889373779, "learning_rate": 1.641756988020536e-05, "loss": 0.7534, "step": 2884 }, { "epoch": 0.4937532089679959, "grad_norm": 36.24224090576172, "learning_rate": 1.6423274386765545e-05, "loss": 5.1747, "step": 2885 }, { "epoch": 0.49392435392777684, "grad_norm": 88.6714859008789, "learning_rate": 1.642897889332573e-05, "loss": 4.1515, "step": 2886 }, { "epoch": 
0.49409549888755777, "grad_norm": 102.38868713378906, "learning_rate": 1.6434683399885912e-05, "loss": 4.1397, "step": 2887 }, { "epoch": 0.4942666438473387, "grad_norm": 32.09382247924805, "learning_rate": 1.6440387906446092e-05, "loss": 3.822, "step": 2888 }, { "epoch": 0.49443778880711964, "grad_norm": 27.632850646972656, "learning_rate": 1.6446092413006275e-05, "loss": 3.0071, "step": 2889 }, { "epoch": 0.49460893376690057, "grad_norm": 29.850147247314453, "learning_rate": 1.645179691956646e-05, "loss": 4.5876, "step": 2890 }, { "epoch": 0.4947800787266815, "grad_norm": 20.323644638061523, "learning_rate": 1.645750142612664e-05, "loss": 2.0093, "step": 2891 }, { "epoch": 0.49495122368646244, "grad_norm": 28.592273712158203, "learning_rate": 1.6463205932686822e-05, "loss": 2.6316, "step": 2892 }, { "epoch": 0.49512236864624337, "grad_norm": 29.890256881713867, "learning_rate": 1.6468910439247005e-05, "loss": 2.7351, "step": 2893 }, { "epoch": 0.4952935136060243, "grad_norm": 25.856136322021484, "learning_rate": 1.647461494580719e-05, "loss": 2.7318, "step": 2894 }, { "epoch": 0.49546465856580524, "grad_norm": 28.1647891998291, "learning_rate": 1.648031945236737e-05, "loss": 2.7787, "step": 2895 }, { "epoch": 0.49563580352558617, "grad_norm": 24.757694244384766, "learning_rate": 1.6486023958927552e-05, "loss": 2.7135, "step": 2896 }, { "epoch": 0.4958069484853671, "grad_norm": 42.44664764404297, "learning_rate": 1.6491728465487735e-05, "loss": 3.6649, "step": 2897 }, { "epoch": 0.49597809344514804, "grad_norm": 30.2053279876709, "learning_rate": 1.6497432972047915e-05, "loss": 4.0259, "step": 2898 }, { "epoch": 0.49614923840492897, "grad_norm": 12.054943084716797, "learning_rate": 1.6503137478608102e-05, "loss": 1.0105, "step": 2899 }, { "epoch": 0.4963203833647099, "grad_norm": 17.974079132080078, "learning_rate": 1.6508841985168286e-05, "loss": 2.0786, "step": 2900 }, { "epoch": 0.49649152832449084, "grad_norm": 12.725552558898926, "learning_rate": 
1.651454649172847e-05, "loss": 1.0647, "step": 2901 }, { "epoch": 0.49666267328427177, "grad_norm": 22.831754684448242, "learning_rate": 1.652025099828865e-05, "loss": 2.2329, "step": 2902 }, { "epoch": 0.4968338182440527, "grad_norm": 21.267478942871094, "learning_rate": 1.6525955504848832e-05, "loss": 2.5314, "step": 2903 }, { "epoch": 0.49700496320383364, "grad_norm": 27.087793350219727, "learning_rate": 1.6531660011409016e-05, "loss": 2.8437, "step": 2904 }, { "epoch": 0.49717610816361457, "grad_norm": 19.73915672302246, "learning_rate": 1.6537364517969196e-05, "loss": 1.8543, "step": 2905 }, { "epoch": 0.4973472531233955, "grad_norm": 2.955650806427002, "learning_rate": 1.654306902452938e-05, "loss": 0.4054, "step": 2906 }, { "epoch": 0.49751839808317644, "grad_norm": 25.305593490600586, "learning_rate": 1.6548773531089562e-05, "loss": 2.5412, "step": 2907 }, { "epoch": 0.49768954304295737, "grad_norm": 29.378746032714844, "learning_rate": 1.6554478037649742e-05, "loss": 2.7018, "step": 2908 }, { "epoch": 0.4978606880027383, "grad_norm": 14.516071319580078, "learning_rate": 1.6560182544209926e-05, "loss": 1.9194, "step": 2909 }, { "epoch": 0.49803183296251924, "grad_norm": 25.602577209472656, "learning_rate": 1.656588705077011e-05, "loss": 2.1128, "step": 2910 }, { "epoch": 0.49820297792230017, "grad_norm": 109.72111511230469, "learning_rate": 1.6571591557330293e-05, "loss": 4.1774, "step": 2911 }, { "epoch": 0.4983741228820811, "grad_norm": 19.274553298950195, "learning_rate": 1.6577296063890472e-05, "loss": 1.5632, "step": 2912 }, { "epoch": 0.49854526784186204, "grad_norm": 29.17140007019043, "learning_rate": 1.6583000570450656e-05, "loss": 3.7158, "step": 2913 }, { "epoch": 0.49871641280164297, "grad_norm": 31.559934616088867, "learning_rate": 1.658870507701084e-05, "loss": 4.5437, "step": 2914 }, { "epoch": 0.4988875577614239, "grad_norm": 18.08380699157715, "learning_rate": 1.659440958357102e-05, "loss": 1.1722, "step": 2915 }, { "epoch": 
0.49905870272120484, "grad_norm": 29.155492782592773, "learning_rate": 1.6600114090131203e-05, "loss": 3.2768, "step": 2916 }, { "epoch": 0.49922984768098577, "grad_norm": 36.51355743408203, "learning_rate": 1.6605818596691386e-05, "loss": 4.8346, "step": 2917 }, { "epoch": 0.4994009926407667, "grad_norm": 18.29048728942871, "learning_rate": 1.661152310325157e-05, "loss": 1.1614, "step": 2918 }, { "epoch": 0.49957213760054764, "grad_norm": 29.851797103881836, "learning_rate": 1.661722760981175e-05, "loss": 2.9554, "step": 2919 }, { "epoch": 0.49974328256032857, "grad_norm": 27.82573699951172, "learning_rate": 1.6622932116371933e-05, "loss": 3.4135, "step": 2920 }, { "epoch": 0.4999144275201095, "grad_norm": 26.42146110534668, "learning_rate": 1.6628636622932116e-05, "loss": 2.5056, "step": 2921 }, { "epoch": 0.5000855724798905, "grad_norm": 11.394399642944336, "learning_rate": 1.66343411294923e-05, "loss": 1.5378, "step": 2922 }, { "epoch": 0.5002567174396714, "grad_norm": 76.39617156982422, "learning_rate": 1.6640045636052483e-05, "loss": 7.2706, "step": 2923 }, { "epoch": 0.5004278623994524, "grad_norm": 30.514179229736328, "learning_rate": 1.6645750142612666e-05, "loss": 3.1234, "step": 2924 }, { "epoch": 0.5005990073592332, "grad_norm": 25.776514053344727, "learning_rate": 1.665145464917285e-05, "loss": 2.852, "step": 2925 }, { "epoch": 0.5007701523190142, "grad_norm": 33.94929122924805, "learning_rate": 1.665715915573303e-05, "loss": 4.5202, "step": 2926 }, { "epoch": 0.5009412972787951, "grad_norm": 42.92927551269531, "learning_rate": 1.6662863662293213e-05, "loss": 7.0151, "step": 2927 }, { "epoch": 0.5011124422385761, "grad_norm": 8.699772834777832, "learning_rate": 1.6668568168853396e-05, "loss": 0.8725, "step": 2928 }, { "epoch": 0.501283587198357, "grad_norm": 27.853302001953125, "learning_rate": 1.6674272675413576e-05, "loss": 2.2825, "step": 2929 }, { "epoch": 0.501454732158138, "grad_norm": 26.110185623168945, "learning_rate": 1.667997718197376e-05, 
"loss": 2.5107, "step": 2930 }, { "epoch": 0.5016258771179188, "grad_norm": 4.521554946899414, "learning_rate": 1.6685681688533943e-05, "loss": 0.4957, "step": 2931 }, { "epoch": 0.5017970220776998, "grad_norm": 42.245086669921875, "learning_rate": 1.6691386195094127e-05, "loss": 6.5318, "step": 2932 }, { "epoch": 0.5019681670374807, "grad_norm": 25.86848258972168, "learning_rate": 1.6697090701654307e-05, "loss": 2.2382, "step": 2933 }, { "epoch": 0.5021393119972617, "grad_norm": 48.50715637207031, "learning_rate": 1.670279520821449e-05, "loss": 7.148, "step": 2934 }, { "epoch": 0.5023104569570426, "grad_norm": 32.559574127197266, "learning_rate": 1.6708499714774673e-05, "loss": 3.6438, "step": 2935 }, { "epoch": 0.5024816019168236, "grad_norm": 24.84282112121582, "learning_rate": 1.6714204221334853e-05, "loss": 2.7729, "step": 2936 }, { "epoch": 0.5026527468766044, "grad_norm": 14.403919219970703, "learning_rate": 1.6719908727895037e-05, "loss": 1.217, "step": 2937 }, { "epoch": 0.5028238918363854, "grad_norm": 27.424219131469727, "learning_rate": 1.672561323445522e-05, "loss": 2.5197, "step": 2938 }, { "epoch": 0.5029950367961663, "grad_norm": 9.789163589477539, "learning_rate": 1.67313177410154e-05, "loss": 1.7931, "step": 2939 }, { "epoch": 0.5031661817559473, "grad_norm": 27.327239990234375, "learning_rate": 1.6737022247575583e-05, "loss": 3.3266, "step": 2940 }, { "epoch": 0.5033373267157282, "grad_norm": 19.182161331176758, "learning_rate": 1.6742726754135767e-05, "loss": 1.9967, "step": 2941 }, { "epoch": 0.5035084716755092, "grad_norm": 56.43001174926758, "learning_rate": 1.674843126069595e-05, "loss": 3.7189, "step": 2942 }, { "epoch": 0.50367961663529, "grad_norm": 19.654386520385742, "learning_rate": 1.675413576725613e-05, "loss": 1.9999, "step": 2943 }, { "epoch": 0.503850761595071, "grad_norm": 22.203187942504883, "learning_rate": 1.6759840273816314e-05, "loss": 2.1959, "step": 2944 }, { "epoch": 0.5040219065548519, "grad_norm": 6.563319683074951, 
"learning_rate": 1.67655447803765e-05, "loss": 0.6367, "step": 2945 }, { "epoch": 0.5041930515146329, "grad_norm": 10.192085266113281, "learning_rate": 1.677124928693668e-05, "loss": 0.7288, "step": 2946 }, { "epoch": 0.5043641964744139, "grad_norm": 32.45716094970703, "learning_rate": 1.6776953793496864e-05, "loss": 3.9021, "step": 2947 }, { "epoch": 0.5045353414341948, "grad_norm": 4.9417595863342285, "learning_rate": 1.6782658300057047e-05, "loss": 0.4681, "step": 2948 }, { "epoch": 0.5047064863939758, "grad_norm": 27.206302642822266, "learning_rate": 1.678836280661723e-05, "loss": 3.3352, "step": 2949 }, { "epoch": 0.5048776313537566, "grad_norm": 28.154144287109375, "learning_rate": 1.679406731317741e-05, "loss": 2.9377, "step": 2950 }, { "epoch": 0.5050487763135376, "grad_norm": 21.303789138793945, "learning_rate": 1.6799771819737594e-05, "loss": 2.785, "step": 2951 }, { "epoch": 0.5052199212733185, "grad_norm": 31.954051971435547, "learning_rate": 1.6805476326297777e-05, "loss": 3.1305, "step": 2952 }, { "epoch": 0.5053910662330995, "grad_norm": 10.69640827178955, "learning_rate": 1.6811180832857957e-05, "loss": 1.6799, "step": 2953 }, { "epoch": 0.5055622111928804, "grad_norm": 30.222347259521484, "learning_rate": 1.681688533941814e-05, "loss": 2.8247, "step": 2954 }, { "epoch": 0.5057333561526614, "grad_norm": 96.27491760253906, "learning_rate": 1.6822589845978324e-05, "loss": 4.6357, "step": 2955 }, { "epoch": 0.5059045011124422, "grad_norm": 28.582870483398438, "learning_rate": 1.6828294352538507e-05, "loss": 3.2733, "step": 2956 }, { "epoch": 0.5060756460722232, "grad_norm": 41.087825775146484, "learning_rate": 1.6833998859098687e-05, "loss": 7.1278, "step": 2957 }, { "epoch": 0.5062467910320041, "grad_norm": 7.500061511993408, "learning_rate": 1.683970336565887e-05, "loss": 0.8286, "step": 2958 }, { "epoch": 0.5064179359917851, "grad_norm": 26.969345092773438, "learning_rate": 1.6845407872219054e-05, "loss": 2.235, "step": 2959 }, { "epoch": 
0.506589080951566, "grad_norm": 26.311525344848633, "learning_rate": 1.6851112378779234e-05, "loss": 3.0085, "step": 2960 }, { "epoch": 0.506760225911347, "grad_norm": 31.306970596313477, "learning_rate": 1.6856816885339417e-05, "loss": 2.5939, "step": 2961 }, { "epoch": 0.5069313708711278, "grad_norm": 24.608043670654297, "learning_rate": 1.68625213918996e-05, "loss": 2.3096, "step": 2962 }, { "epoch": 0.5071025158309088, "grad_norm": 27.197254180908203, "learning_rate": 1.6868225898459784e-05, "loss": 2.9187, "step": 2963 }, { "epoch": 0.5072736607906897, "grad_norm": 28.446548461914062, "learning_rate": 1.6873930405019964e-05, "loss": 3.2735, "step": 2964 }, { "epoch": 0.5074448057504707, "grad_norm": 32.15707778930664, "learning_rate": 1.6879634911580148e-05, "loss": 6.7019, "step": 2965 }, { "epoch": 0.5076159507102516, "grad_norm": 23.724163055419922, "learning_rate": 1.688533941814033e-05, "loss": 2.1627, "step": 2966 }, { "epoch": 0.5077870956700326, "grad_norm": 28.04530143737793, "learning_rate": 1.689104392470051e-05, "loss": 2.6273, "step": 2967 }, { "epoch": 0.5079582406298134, "grad_norm": 30.895709991455078, "learning_rate": 1.6896748431260698e-05, "loss": 3.8368, "step": 2968 }, { "epoch": 0.5081293855895944, "grad_norm": 14.024374961853027, "learning_rate": 1.690245293782088e-05, "loss": 1.0194, "step": 2969 }, { "epoch": 0.5083005305493753, "grad_norm": 29.09341049194336, "learning_rate": 1.690815744438106e-05, "loss": 3.5337, "step": 2970 }, { "epoch": 0.5084716755091563, "grad_norm": 28.34062385559082, "learning_rate": 1.6913861950941244e-05, "loss": 3.1743, "step": 2971 }, { "epoch": 0.5086428204689372, "grad_norm": 25.496129989624023, "learning_rate": 1.6919566457501428e-05, "loss": 2.898, "step": 2972 }, { "epoch": 0.5088139654287182, "grad_norm": 38.798343658447266, "learning_rate": 1.692527096406161e-05, "loss": 3.5201, "step": 2973 }, { "epoch": 0.508985110388499, "grad_norm": 10.149602890014648, "learning_rate": 1.693097547062179e-05, 
"loss": 0.7939, "step": 2974 }, { "epoch": 0.50915625534828, "grad_norm": 6.670815944671631, "learning_rate": 1.6936679977181975e-05, "loss": 0.854, "step": 2975 }, { "epoch": 0.5093274003080609, "grad_norm": 75.72901153564453, "learning_rate": 1.6942384483742158e-05, "loss": 2.9379, "step": 2976 }, { "epoch": 0.5094985452678419, "grad_norm": 26.788955688476562, "learning_rate": 1.6948088990302338e-05, "loss": 2.4457, "step": 2977 }, { "epoch": 0.5096696902276228, "grad_norm": 14.796418190002441, "learning_rate": 1.695379349686252e-05, "loss": 1.0122, "step": 2978 }, { "epoch": 0.5098408351874038, "grad_norm": 4.948236465454102, "learning_rate": 1.6959498003422705e-05, "loss": 0.5853, "step": 2979 }, { "epoch": 0.5100119801471846, "grad_norm": 182.9610595703125, "learning_rate": 1.6965202509982888e-05, "loss": 8.9776, "step": 2980 }, { "epoch": 0.5101831251069656, "grad_norm": 29.51963996887207, "learning_rate": 1.6970907016543068e-05, "loss": 2.9543, "step": 2981 }, { "epoch": 0.5103542700667465, "grad_norm": 28.639034271240234, "learning_rate": 1.697661152310325e-05, "loss": 3.2262, "step": 2982 }, { "epoch": 0.5105254150265275, "grad_norm": 29.50834846496582, "learning_rate": 1.6982316029663435e-05, "loss": 3.0001, "step": 2983 }, { "epoch": 0.5106965599863084, "grad_norm": 15.582537651062012, "learning_rate": 1.6988020536223615e-05, "loss": 1.1638, "step": 2984 }, { "epoch": 0.5108677049460894, "grad_norm": 27.667177200317383, "learning_rate": 1.6993725042783798e-05, "loss": 2.9351, "step": 2985 }, { "epoch": 0.5110388499058702, "grad_norm": 28.853923797607422, "learning_rate": 1.699942954934398e-05, "loss": 3.6286, "step": 2986 }, { "epoch": 0.5112099948656512, "grad_norm": 26.117013931274414, "learning_rate": 1.7005134055904165e-05, "loss": 2.8584, "step": 2987 }, { "epoch": 0.5113811398254321, "grad_norm": 34.81660842895508, "learning_rate": 1.7010838562464345e-05, "loss": 4.3968, "step": 2988 }, { "epoch": 0.5115522847852131, "grad_norm": 35.10283279418945, 
"learning_rate": 1.7016543069024528e-05, "loss": 6.599, "step": 2989 }, { "epoch": 0.511723429744994, "grad_norm": 19.16140365600586, "learning_rate": 1.7022247575584715e-05, "loss": 2.1204, "step": 2990 }, { "epoch": 0.511894574704775, "grad_norm": 22.029394149780273, "learning_rate": 1.7027952082144895e-05, "loss": 1.8696, "step": 2991 }, { "epoch": 0.5120657196645558, "grad_norm": 3.448702335357666, "learning_rate": 1.703365658870508e-05, "loss": 0.4607, "step": 2992 }, { "epoch": 0.5122368646243368, "grad_norm": 22.506763458251953, "learning_rate": 1.7039361095265262e-05, "loss": 2.1106, "step": 2993 }, { "epoch": 0.5124080095841177, "grad_norm": 31.842361450195312, "learning_rate": 1.7045065601825445e-05, "loss": 3.8676, "step": 2994 }, { "epoch": 0.5125791545438987, "grad_norm": 141.6663818359375, "learning_rate": 1.7050770108385625e-05, "loss": 8.5208, "step": 2995 }, { "epoch": 0.5127502995036796, "grad_norm": 62.276729583740234, "learning_rate": 1.705647461494581e-05, "loss": 3.2482, "step": 2996 }, { "epoch": 0.5129214444634606, "grad_norm": 22.119609832763672, "learning_rate": 1.7062179121505992e-05, "loss": 1.9903, "step": 2997 }, { "epoch": 0.5130925894232415, "grad_norm": 52.37403106689453, "learning_rate": 1.7067883628066172e-05, "loss": 6.8319, "step": 2998 }, { "epoch": 0.5132637343830224, "grad_norm": 12.259587287902832, "learning_rate": 1.7073588134626355e-05, "loss": 1.162, "step": 2999 }, { "epoch": 0.5134348793428034, "grad_norm": 8.290674209594727, "learning_rate": 1.707929264118654e-05, "loss": 0.9012, "step": 3000 }, { "epoch": 0.5136060243025843, "grad_norm": 32.74642562866211, "learning_rate": 1.7084997147746722e-05, "loss": 3.4785, "step": 3001 }, { "epoch": 0.5137771692623653, "grad_norm": 31.82801055908203, "learning_rate": 1.7090701654306902e-05, "loss": 4.2721, "step": 3002 }, { "epoch": 0.5139483142221462, "grad_norm": 32.273136138916016, "learning_rate": 1.7096406160867085e-05, "loss": 3.2625, "step": 3003 }, { "epoch": 
0.5141194591819271, "grad_norm": 78.98668670654297, "learning_rate": 1.710211066742727e-05, "loss": 3.2698, "step": 3004 }, { "epoch": 0.514290604141708, "grad_norm": 30.16362762451172, "learning_rate": 1.710781517398745e-05, "loss": 3.9137, "step": 3005 }, { "epoch": 0.514461749101489, "grad_norm": 18.465227127075195, "learning_rate": 1.7113519680547632e-05, "loss": 1.8387, "step": 3006 }, { "epoch": 0.5146328940612699, "grad_norm": 3.536219358444214, "learning_rate": 1.7119224187107816e-05, "loss": 0.446, "step": 3007 }, { "epoch": 0.5148040390210509, "grad_norm": 17.390464782714844, "learning_rate": 1.7124928693667996e-05, "loss": 1.7668, "step": 3008 }, { "epoch": 0.5149751839808318, "grad_norm": 18.47218894958496, "learning_rate": 1.713063320022818e-05, "loss": 2.1817, "step": 3009 }, { "epoch": 0.5151463289406127, "grad_norm": 28.22992515563965, "learning_rate": 1.7136337706788362e-05, "loss": 2.9769, "step": 3010 }, { "epoch": 0.5153174739003936, "grad_norm": 62.36894989013672, "learning_rate": 1.7142042213348546e-05, "loss": 7.6922, "step": 3011 }, { "epoch": 0.5154886188601746, "grad_norm": 33.23900604248047, "learning_rate": 1.7147746719908726e-05, "loss": 3.3971, "step": 3012 }, { "epoch": 0.5156597638199555, "grad_norm": 2.5457472801208496, "learning_rate": 1.7153451226468912e-05, "loss": 0.4122, "step": 3013 }, { "epoch": 0.5158309087797365, "grad_norm": 26.533376693725586, "learning_rate": 1.7159155733029096e-05, "loss": 2.9528, "step": 3014 }, { "epoch": 0.5160020537395174, "grad_norm": 33.18933868408203, "learning_rate": 1.7164860239589276e-05, "loss": 3.7197, "step": 3015 }, { "epoch": 0.5161731986992983, "grad_norm": 25.48127555847168, "learning_rate": 1.717056474614946e-05, "loss": 2.8834, "step": 3016 }, { "epoch": 0.5163443436590792, "grad_norm": 32.51988983154297, "learning_rate": 1.7176269252709643e-05, "loss": 3.6681, "step": 3017 }, { "epoch": 0.5165154886188602, "grad_norm": 21.83390998840332, "learning_rate": 1.7181973759269826e-05, 
"loss": 2.3579, "step": 3018 }, { "epoch": 0.5166866335786411, "grad_norm": 21.106168746948242, "learning_rate": 1.7187678265830006e-05, "loss": 2.1503, "step": 3019 }, { "epoch": 0.5168577785384221, "grad_norm": 23.668697357177734, "learning_rate": 1.719338277239019e-05, "loss": 2.65, "step": 3020 }, { "epoch": 0.517028923498203, "grad_norm": 56.29466247558594, "learning_rate": 1.7199087278950373e-05, "loss": 7.269, "step": 3021 }, { "epoch": 0.5172000684579839, "grad_norm": 14.612650871276855, "learning_rate": 1.7204791785510553e-05, "loss": 1.5426, "step": 3022 }, { "epoch": 0.5173712134177648, "grad_norm": 28.365121841430664, "learning_rate": 1.7210496292070736e-05, "loss": 3.636, "step": 3023 }, { "epoch": 0.5175423583775458, "grad_norm": 25.329317092895508, "learning_rate": 1.721620079863092e-05, "loss": 2.3847, "step": 3024 }, { "epoch": 0.5177135033373267, "grad_norm": 32.05517578125, "learning_rate": 1.7221905305191103e-05, "loss": 3.7742, "step": 3025 }, { "epoch": 0.5178846482971077, "grad_norm": 11.009437561035156, "learning_rate": 1.7227609811751283e-05, "loss": 1.5541, "step": 3026 }, { "epoch": 0.5180557932568886, "grad_norm": 4.6759490966796875, "learning_rate": 1.7233314318311466e-05, "loss": 0.448, "step": 3027 }, { "epoch": 0.5182269382166695, "grad_norm": 23.18576431274414, "learning_rate": 1.723901882487165e-05, "loss": 2.3099, "step": 3028 }, { "epoch": 0.5183980831764504, "grad_norm": 21.823318481445312, "learning_rate": 1.724472333143183e-05, "loss": 2.0502, "step": 3029 }, { "epoch": 0.5185692281362314, "grad_norm": 33.11149215698242, "learning_rate": 1.7250427837992013e-05, "loss": 3.7448, "step": 3030 }, { "epoch": 0.5187403730960123, "grad_norm": 32.03651809692383, "learning_rate": 1.7256132344552196e-05, "loss": 3.5141, "step": 3031 }, { "epoch": 0.5189115180557933, "grad_norm": 29.257003784179688, "learning_rate": 1.726183685111238e-05, "loss": 3.5149, "step": 3032 }, { "epoch": 0.5190826630155742, "grad_norm": 6.367782115936279, 
"learning_rate": 1.726754135767256e-05, "loss": 1.1882, "step": 3033 }, { "epoch": 0.5192538079753551, "grad_norm": 21.6986083984375, "learning_rate": 1.7273245864232743e-05, "loss": 1.7948, "step": 3034 }, { "epoch": 0.519424952935136, "grad_norm": 14.612825393676758, "learning_rate": 1.7278950370792926e-05, "loss": 1.1659, "step": 3035 }, { "epoch": 0.519596097894917, "grad_norm": 28.725549697875977, "learning_rate": 1.728465487735311e-05, "loss": 2.7611, "step": 3036 }, { "epoch": 0.5197672428546979, "grad_norm": 30.985149383544922, "learning_rate": 1.7290359383913293e-05, "loss": 2.9706, "step": 3037 }, { "epoch": 0.5199383878144789, "grad_norm": 17.664464950561523, "learning_rate": 1.7296063890473477e-05, "loss": 1.6902, "step": 3038 }, { "epoch": 0.5201095327742598, "grad_norm": 32.17440414428711, "learning_rate": 1.7301768397033657e-05, "loss": 3.8811, "step": 3039 }, { "epoch": 0.5202806777340407, "grad_norm": 5.3300580978393555, "learning_rate": 1.730747290359384e-05, "loss": 0.4921, "step": 3040 }, { "epoch": 0.5204518226938216, "grad_norm": 38.537044525146484, "learning_rate": 1.7313177410154023e-05, "loss": 3.2832, "step": 3041 }, { "epoch": 0.5206229676536026, "grad_norm": 39.10978698730469, "learning_rate": 1.7318881916714207e-05, "loss": 3.8919, "step": 3042 }, { "epoch": 0.5207941126133835, "grad_norm": 29.357208251953125, "learning_rate": 1.7324586423274387e-05, "loss": 3.115, "step": 3043 }, { "epoch": 0.5209652575731645, "grad_norm": 15.655451774597168, "learning_rate": 1.733029092983457e-05, "loss": 1.4122, "step": 3044 }, { "epoch": 0.5211364025329454, "grad_norm": 28.293025970458984, "learning_rate": 1.7335995436394753e-05, "loss": 2.9349, "step": 3045 }, { "epoch": 0.5213075474927263, "grad_norm": 32.65211868286133, "learning_rate": 1.7341699942954933e-05, "loss": 3.2992, "step": 3046 }, { "epoch": 0.5214786924525072, "grad_norm": 23.2037296295166, "learning_rate": 1.7347404449515117e-05, "loss": 2.3879, "step": 3047 }, { "epoch": 
0.5216498374122882, "grad_norm": 26.37859535217285, "learning_rate": 1.73531089560753e-05, "loss": 2.1987, "step": 3048 }, { "epoch": 0.5218209823720692, "grad_norm": 18.490966796875, "learning_rate": 1.7358813462635484e-05, "loss": 1.461, "step": 3049 }, { "epoch": 0.5219921273318501, "grad_norm": 31.97382354736328, "learning_rate": 1.7364517969195664e-05, "loss": 2.8849, "step": 3050 }, { "epoch": 0.5221632722916311, "grad_norm": 49.7996711730957, "learning_rate": 1.7370222475755847e-05, "loss": 6.9056, "step": 3051 }, { "epoch": 0.5223344172514119, "grad_norm": 28.981660842895508, "learning_rate": 1.737592698231603e-05, "loss": 3.0778, "step": 3052 }, { "epoch": 0.5225055622111929, "grad_norm": 120.67489624023438, "learning_rate": 1.738163148887621e-05, "loss": 4.2964, "step": 3053 }, { "epoch": 0.5226767071709738, "grad_norm": 158.1115264892578, "learning_rate": 1.7387335995436394e-05, "loss": 8.5312, "step": 3054 }, { "epoch": 0.5228478521307548, "grad_norm": 28.185558319091797, "learning_rate": 1.7393040501996577e-05, "loss": 3.537, "step": 3055 }, { "epoch": 0.5230189970905357, "grad_norm": 9.107454299926758, "learning_rate": 1.739874500855676e-05, "loss": 1.0684, "step": 3056 }, { "epoch": 0.5231901420503167, "grad_norm": 36.81668472290039, "learning_rate": 1.740444951511694e-05, "loss": 3.8926, "step": 3057 }, { "epoch": 0.5233612870100975, "grad_norm": 26.352327346801758, "learning_rate": 1.7410154021677124e-05, "loss": 2.664, "step": 3058 }, { "epoch": 0.5235324319698785, "grad_norm": 21.38902473449707, "learning_rate": 1.741585852823731e-05, "loss": 2.2566, "step": 3059 }, { "epoch": 0.5237035769296594, "grad_norm": 10.19254207611084, "learning_rate": 1.742156303479749e-05, "loss": 0.8717, "step": 3060 }, { "epoch": 0.5238747218894404, "grad_norm": 19.25916862487793, "learning_rate": 1.7427267541357674e-05, "loss": 1.3792, "step": 3061 }, { "epoch": 0.5240458668492213, "grad_norm": 21.88836669921875, "learning_rate": 1.7432972047917857e-05, "loss": 
2.2543, "step": 3062 }, { "epoch": 0.5242170118090023, "grad_norm": 30.14661979675293, "learning_rate": 1.743867655447804e-05, "loss": 3.4672, "step": 3063 }, { "epoch": 0.5243881567687831, "grad_norm": 25.134571075439453, "learning_rate": 1.744438106103822e-05, "loss": 2.5836, "step": 3064 }, { "epoch": 0.5245593017285641, "grad_norm": 21.906818389892578, "learning_rate": 1.7450085567598404e-05, "loss": 2.4618, "step": 3065 }, { "epoch": 0.524730446688345, "grad_norm": 31.330976486206055, "learning_rate": 1.7455790074158587e-05, "loss": 3.1397, "step": 3066 }, { "epoch": 0.524901591648126, "grad_norm": 6.355524063110352, "learning_rate": 1.7461494580718767e-05, "loss": 0.5091, "step": 3067 }, { "epoch": 0.5250727366079069, "grad_norm": 110.98942565917969, "learning_rate": 1.746719908727895e-05, "loss": 7.9114, "step": 3068 }, { "epoch": 0.5252438815676879, "grad_norm": 31.17119789123535, "learning_rate": 1.7472903593839134e-05, "loss": 3.2594, "step": 3069 }, { "epoch": 0.5254150265274687, "grad_norm": 24.364032745361328, "learning_rate": 1.7478608100399318e-05, "loss": 1.9217, "step": 3070 }, { "epoch": 0.5255861714872497, "grad_norm": 34.264041900634766, "learning_rate": 1.7484312606959498e-05, "loss": 3.9812, "step": 3071 }, { "epoch": 0.5257573164470306, "grad_norm": 27.54375648498535, "learning_rate": 1.749001711351968e-05, "loss": 2.8373, "step": 3072 }, { "epoch": 0.5259284614068116, "grad_norm": 58.27510452270508, "learning_rate": 1.7495721620079864e-05, "loss": 7.2686, "step": 3073 }, { "epoch": 0.5260996063665925, "grad_norm": 27.861116409301758, "learning_rate": 1.7501426126640044e-05, "loss": 3.2877, "step": 3074 }, { "epoch": 0.5262707513263735, "grad_norm": 28.097177505493164, "learning_rate": 1.7507130633200228e-05, "loss": 2.3413, "step": 3075 }, { "epoch": 0.5264418962861543, "grad_norm": 30.74901008605957, "learning_rate": 1.751283513976041e-05, "loss": 3.2284, "step": 3076 }, { "epoch": 0.5266130412459353, "grad_norm": 5.434010982513428, 
"learning_rate": 1.751853964632059e-05, "loss": 0.5515, "step": 3077 }, { "epoch": 0.5267841862057162, "grad_norm": 19.591594696044922, "learning_rate": 1.7524244152880774e-05, "loss": 1.8104, "step": 3078 }, { "epoch": 0.5269553311654972, "grad_norm": 27.989707946777344, "learning_rate": 1.7529948659440958e-05, "loss": 2.4876, "step": 3079 }, { "epoch": 0.5271264761252781, "grad_norm": 45.50398635864258, "learning_rate": 1.753565316600114e-05, "loss": 6.9276, "step": 3080 }, { "epoch": 0.5272976210850591, "grad_norm": 29.907915115356445, "learning_rate": 1.754135767256132e-05, "loss": 3.8381, "step": 3081 }, { "epoch": 0.5274687660448399, "grad_norm": 22.03485679626465, "learning_rate": 1.7547062179121508e-05, "loss": 1.8432, "step": 3082 }, { "epoch": 0.5276399110046209, "grad_norm": 41.72187042236328, "learning_rate": 1.755276668568169e-05, "loss": 6.81, "step": 3083 }, { "epoch": 0.5278110559644018, "grad_norm": 15.85753345489502, "learning_rate": 1.755847119224187e-05, "loss": 1.1867, "step": 3084 }, { "epoch": 0.5279822009241828, "grad_norm": 14.52872085571289, "learning_rate": 1.7564175698802055e-05, "loss": 1.02, "step": 3085 }, { "epoch": 0.5281533458839637, "grad_norm": 47.226070404052734, "learning_rate": 1.7569880205362238e-05, "loss": 6.5701, "step": 3086 }, { "epoch": 0.5283244908437447, "grad_norm": 26.31117820739746, "learning_rate": 1.757558471192242e-05, "loss": 2.8588, "step": 3087 }, { "epoch": 0.5284956358035255, "grad_norm": 24.817096710205078, "learning_rate": 1.75812892184826e-05, "loss": 2.5557, "step": 3088 }, { "epoch": 0.5286667807633065, "grad_norm": 3.8697149753570557, "learning_rate": 1.7586993725042785e-05, "loss": 0.447, "step": 3089 }, { "epoch": 0.5288379257230874, "grad_norm": 27.01019287109375, "learning_rate": 1.7592698231602968e-05, "loss": 2.6394, "step": 3090 }, { "epoch": 0.5290090706828684, "grad_norm": 3.1552348136901855, "learning_rate": 1.7598402738163148e-05, "loss": 0.4523, "step": 3091 }, { "epoch": 
0.5291802156426493, "grad_norm": 30.454021453857422, "learning_rate": 1.760410724472333e-05, "loss": 3.3332, "step": 3092 }, { "epoch": 0.5293513606024303, "grad_norm": 2.6408188343048096, "learning_rate": 1.7609811751283515e-05, "loss": 0.4075, "step": 3093 }, { "epoch": 0.5295225055622111, "grad_norm": 27.623132705688477, "learning_rate": 1.76155162578437e-05, "loss": 2.7079, "step": 3094 }, { "epoch": 0.5296936505219921, "grad_norm": 22.717605590820312, "learning_rate": 1.762122076440388e-05, "loss": 2.2378, "step": 3095 }, { "epoch": 0.529864795481773, "grad_norm": 50.63970184326172, "learning_rate": 1.7626925270964062e-05, "loss": 3.3046, "step": 3096 }, { "epoch": 0.530035940441554, "grad_norm": 47.14366912841797, "learning_rate": 1.7632629777524245e-05, "loss": 6.9276, "step": 3097 }, { "epoch": 0.5302070854013349, "grad_norm": 26.201753616333008, "learning_rate": 1.7638334284084425e-05, "loss": 2.3728, "step": 3098 }, { "epoch": 0.5303782303611159, "grad_norm": 33.462398529052734, "learning_rate": 1.764403879064461e-05, "loss": 6.8023, "step": 3099 }, { "epoch": 0.5305493753208969, "grad_norm": 32.51939010620117, "learning_rate": 1.7649743297204792e-05, "loss": 3.992, "step": 3100 }, { "epoch": 0.5307205202806777, "grad_norm": 14.161356925964355, "learning_rate": 1.7655447803764975e-05, "loss": 1.0058, "step": 3101 }, { "epoch": 0.5308916652404587, "grad_norm": 91.61168670654297, "learning_rate": 1.7661152310325155e-05, "loss": 6.6935, "step": 3102 }, { "epoch": 0.5310628102002396, "grad_norm": 26.40794563293457, "learning_rate": 1.766685681688534e-05, "loss": 2.5008, "step": 3103 }, { "epoch": 0.5312339551600206, "grad_norm": 21.793699264526367, "learning_rate": 1.7672561323445522e-05, "loss": 2.0443, "step": 3104 }, { "epoch": 0.5314051001198015, "grad_norm": 27.75925636291504, "learning_rate": 1.7678265830005705e-05, "loss": 2.893, "step": 3105 }, { "epoch": 0.5315762450795825, "grad_norm": 22.48872947692871, "learning_rate": 1.768397033656589e-05, 
"loss": 1.8274, "step": 3106 }, { "epoch": 0.5317473900393633, "grad_norm": 21.972978591918945, "learning_rate": 1.7689674843126072e-05, "loss": 2.0134, "step": 3107 }, { "epoch": 0.5319185349991443, "grad_norm": 4.393357753753662, "learning_rate": 1.7695379349686252e-05, "loss": 0.4006, "step": 3108 }, { "epoch": 0.5320896799589252, "grad_norm": 15.986166000366211, "learning_rate": 1.7701083856246435e-05, "loss": 1.0921, "step": 3109 }, { "epoch": 0.5322608249187062, "grad_norm": 23.317607879638672, "learning_rate": 1.770678836280662e-05, "loss": 2.5234, "step": 3110 }, { "epoch": 0.5324319698784871, "grad_norm": 81.8624267578125, "learning_rate": 1.7712492869366802e-05, "loss": 3.4206, "step": 3111 }, { "epoch": 0.532603114838268, "grad_norm": 46.01921844482422, "learning_rate": 1.7718197375926982e-05, "loss": 3.2694, "step": 3112 }, { "epoch": 0.5327742597980489, "grad_norm": 14.079997062683105, "learning_rate": 1.7723901882487166e-05, "loss": 1.1213, "step": 3113 }, { "epoch": 0.5329454047578299, "grad_norm": 27.70348358154297, "learning_rate": 1.772960638904735e-05, "loss": 2.9553, "step": 3114 }, { "epoch": 0.5331165497176108, "grad_norm": 13.08663558959961, "learning_rate": 1.773531089560753e-05, "loss": 0.9058, "step": 3115 }, { "epoch": 0.5332876946773918, "grad_norm": 5.895364761352539, "learning_rate": 1.7741015402167712e-05, "loss": 0.5572, "step": 3116 }, { "epoch": 0.5334588396371727, "grad_norm": 14.521390914916992, "learning_rate": 1.7746719908727896e-05, "loss": 1.2956, "step": 3117 }, { "epoch": 0.5336299845969537, "grad_norm": 5.561517238616943, "learning_rate": 1.775242441528808e-05, "loss": 0.7001, "step": 3118 }, { "epoch": 0.5338011295567345, "grad_norm": 12.158028602600098, "learning_rate": 1.775812892184826e-05, "loss": 0.8123, "step": 3119 }, { "epoch": 0.5339722745165155, "grad_norm": 32.72988510131836, "learning_rate": 1.7763833428408442e-05, "loss": 2.9845, "step": 3120 }, { "epoch": 0.5341434194762964, "grad_norm": 31.350831985473633, 
"learning_rate": 1.7769537934968626e-05, "loss": 3.6956, "step": 3121 }, { "epoch": 0.5343145644360774, "grad_norm": 19.63844871520996, "learning_rate": 1.7775242441528806e-05, "loss": 2.0279, "step": 3122 }, { "epoch": 0.5344857093958583, "grad_norm": 10.440444946289062, "learning_rate": 1.778094694808899e-05, "loss": 0.8947, "step": 3123 }, { "epoch": 0.5346568543556393, "grad_norm": 28.158235549926758, "learning_rate": 1.7786651454649173e-05, "loss": 2.9307, "step": 3124 }, { "epoch": 0.5348279993154201, "grad_norm": 25.009632110595703, "learning_rate": 1.7792355961209356e-05, "loss": 2.3439, "step": 3125 }, { "epoch": 0.5349991442752011, "grad_norm": 25.99068832397461, "learning_rate": 1.7798060467769536e-05, "loss": 2.681, "step": 3126 }, { "epoch": 0.535170289234982, "grad_norm": 16.541526794433594, "learning_rate": 1.780376497432972e-05, "loss": 1.7645, "step": 3127 }, { "epoch": 0.535341434194763, "grad_norm": 32.52701950073242, "learning_rate": 1.7809469480889906e-05, "loss": 3.4399, "step": 3128 }, { "epoch": 0.5355125791545439, "grad_norm": 1.9595259428024292, "learning_rate": 1.7815173987450086e-05, "loss": 0.3411, "step": 3129 }, { "epoch": 0.5356837241143249, "grad_norm": 22.871707916259766, "learning_rate": 1.782087849401027e-05, "loss": 2.7152, "step": 3130 }, { "epoch": 0.5358548690741057, "grad_norm": 30.88572120666504, "learning_rate": 1.7826583000570453e-05, "loss": 3.0383, "step": 3131 }, { "epoch": 0.5360260140338867, "grad_norm": 24.158727645874023, "learning_rate": 1.7832287507130636e-05, "loss": 2.7594, "step": 3132 }, { "epoch": 0.5361971589936676, "grad_norm": 19.16653823852539, "learning_rate": 1.7837992013690816e-05, "loss": 1.7749, "step": 3133 }, { "epoch": 0.5363683039534486, "grad_norm": 11.925354957580566, "learning_rate": 1.7843696520251e-05, "loss": 0.8569, "step": 3134 }, { "epoch": 0.5365394489132295, "grad_norm": 20.42278289794922, "learning_rate": 1.7849401026811183e-05, "loss": 1.9146, "step": 3135 }, { "epoch": 
0.5367105938730105, "grad_norm": 36.13545227050781, "learning_rate": 1.7855105533371363e-05, "loss": 4.4798, "step": 3136 }, { "epoch": 0.5368817388327913, "grad_norm": 4.70065975189209, "learning_rate": 1.7860810039931546e-05, "loss": 0.4304, "step": 3137 }, { "epoch": 0.5370528837925723, "grad_norm": 24.28241539001465, "learning_rate": 1.786651454649173e-05, "loss": 2.4005, "step": 3138 }, { "epoch": 0.5372240287523532, "grad_norm": 14.650952339172363, "learning_rate": 1.787221905305191e-05, "loss": 0.968, "step": 3139 }, { "epoch": 0.5373951737121342, "grad_norm": 16.861696243286133, "learning_rate": 1.7877923559612093e-05, "loss": 1.7277, "step": 3140 }, { "epoch": 0.5375663186719151, "grad_norm": 5.233786106109619, "learning_rate": 1.7883628066172276e-05, "loss": 0.7488, "step": 3141 }, { "epoch": 0.537737463631696, "grad_norm": 32.38574981689453, "learning_rate": 1.788933257273246e-05, "loss": 3.4367, "step": 3142 }, { "epoch": 0.5379086085914769, "grad_norm": 75.13265991210938, "learning_rate": 1.789503707929264e-05, "loss": 3.4272, "step": 3143 }, { "epoch": 0.5380797535512579, "grad_norm": 23.56121063232422, "learning_rate": 1.7900741585852823e-05, "loss": 2.2267, "step": 3144 }, { "epoch": 0.5382508985110388, "grad_norm": 6.575436592102051, "learning_rate": 1.7906446092413007e-05, "loss": 0.7715, "step": 3145 }, { "epoch": 0.5384220434708198, "grad_norm": 30.233795166015625, "learning_rate": 1.7912150598973187e-05, "loss": 2.2247, "step": 3146 }, { "epoch": 0.5385931884306007, "grad_norm": 18.158550262451172, "learning_rate": 1.791785510553337e-05, "loss": 1.4116, "step": 3147 }, { "epoch": 0.5387643333903817, "grad_norm": 25.578800201416016, "learning_rate": 1.7923559612093553e-05, "loss": 2.237, "step": 3148 }, { "epoch": 0.5389354783501625, "grad_norm": 4.3977460861206055, "learning_rate": 1.7929264118653737e-05, "loss": 0.4421, "step": 3149 }, { "epoch": 0.5391066233099435, "grad_norm": 23.86539649963379, "learning_rate": 1.793496862521392e-05, 
"loss": 2.2867, "step": 3150 }, { "epoch": 0.5392777682697245, "grad_norm": 2.900665283203125, "learning_rate": 1.7940673131774103e-05, "loss": 0.3778, "step": 3151 }, { "epoch": 0.5394489132295054, "grad_norm": 28.02079200744629, "learning_rate": 1.7946377638334287e-05, "loss": 2.756, "step": 3152 }, { "epoch": 0.5396200581892864, "grad_norm": 27.565895080566406, "learning_rate": 1.7952082144894467e-05, "loss": 2.3044, "step": 3153 }, { "epoch": 0.5397912031490673, "grad_norm": 35.14018630981445, "learning_rate": 1.795778665145465e-05, "loss": 4.3437, "step": 3154 }, { "epoch": 0.5399623481088482, "grad_norm": 24.932573318481445, "learning_rate": 1.7963491158014834e-05, "loss": 2.2505, "step": 3155 }, { "epoch": 0.5401334930686291, "grad_norm": 26.866313934326172, "learning_rate": 1.7969195664575017e-05, "loss": 2.7324, "step": 3156 }, { "epoch": 0.5403046380284101, "grad_norm": 22.461328506469727, "learning_rate": 1.7974900171135197e-05, "loss": 2.1863, "step": 3157 }, { "epoch": 0.540475782988191, "grad_norm": 16.967121124267578, "learning_rate": 1.798060467769538e-05, "loss": 1.0429, "step": 3158 }, { "epoch": 0.540646927947972, "grad_norm": 116.18841552734375, "learning_rate": 1.7986309184255564e-05, "loss": 3.4443, "step": 3159 }, { "epoch": 0.5408180729077529, "grad_norm": 28.559480667114258, "learning_rate": 1.7992013690815744e-05, "loss": 2.4973, "step": 3160 }, { "epoch": 0.5409892178675338, "grad_norm": 3.590916395187378, "learning_rate": 1.7997718197375927e-05, "loss": 0.4966, "step": 3161 }, { "epoch": 0.5411603628273147, "grad_norm": 78.85108947753906, "learning_rate": 1.800342270393611e-05, "loss": 4.2312, "step": 3162 }, { "epoch": 0.5413315077870957, "grad_norm": 25.83539390563965, "learning_rate": 1.8009127210496294e-05, "loss": 2.4909, "step": 3163 }, { "epoch": 0.5415026527468766, "grad_norm": 4.292176246643066, "learning_rate": 1.8014831717056474e-05, "loss": 0.4314, "step": 3164 }, { "epoch": 0.5416737977066576, "grad_norm": 6.629253387451172, 
"learning_rate": 1.8020536223616657e-05, "loss": 0.7743, "step": 3165 }, { "epoch": 0.5418449426664385, "grad_norm": 22.770082473754883, "learning_rate": 1.802624073017684e-05, "loss": 2.2334, "step": 3166 }, { "epoch": 0.5420160876262194, "grad_norm": 26.48427963256836, "learning_rate": 1.803194523673702e-05, "loss": 3.4341, "step": 3167 }, { "epoch": 0.5421872325860003, "grad_norm": 9.429801940917969, "learning_rate": 1.8037649743297204e-05, "loss": 0.8092, "step": 3168 }, { "epoch": 0.5423583775457813, "grad_norm": 56.79134750366211, "learning_rate": 1.8043354249857387e-05, "loss": 7.2408, "step": 3169 }, { "epoch": 0.5425295225055622, "grad_norm": 26.484098434448242, "learning_rate": 1.804905875641757e-05, "loss": 2.4684, "step": 3170 }, { "epoch": 0.5427006674653432, "grad_norm": 21.694990158081055, "learning_rate": 1.805476326297775e-05, "loss": 2.1088, "step": 3171 }, { "epoch": 0.5428718124251241, "grad_norm": 23.824108123779297, "learning_rate": 1.8060467769537934e-05, "loss": 2.7028, "step": 3172 }, { "epoch": 0.543042957384905, "grad_norm": 23.9963321685791, "learning_rate": 1.806617227609812e-05, "loss": 2.1508, "step": 3173 }, { "epoch": 0.5432141023446859, "grad_norm": 23.810443878173828, "learning_rate": 1.80718767826583e-05, "loss": 3.0015, "step": 3174 }, { "epoch": 0.5433852473044669, "grad_norm": 38.47050857543945, "learning_rate": 1.8077581289218484e-05, "loss": 7.0851, "step": 3175 }, { "epoch": 0.5435563922642478, "grad_norm": 26.14175033569336, "learning_rate": 1.8083285795778668e-05, "loss": 2.8927, "step": 3176 }, { "epoch": 0.5437275372240288, "grad_norm": 34.895294189453125, "learning_rate": 1.8088990302338848e-05, "loss": 6.4431, "step": 3177 }, { "epoch": 0.5438986821838097, "grad_norm": 30.46366310119629, "learning_rate": 1.809469480889903e-05, "loss": 3.8774, "step": 3178 }, { "epoch": 0.5440698271435906, "grad_norm": 2.045729637145996, "learning_rate": 1.8100399315459214e-05, "loss": 0.3887, "step": 3179 }, { "epoch": 
0.5442409721033715, "grad_norm": 23.526275634765625, "learning_rate": 1.8106103822019398e-05, "loss": 2.2568, "step": 3180 }, { "epoch": 0.5444121170631525, "grad_norm": 37.37553024291992, "learning_rate": 1.8111808328579578e-05, "loss": 5.5636, "step": 3181 }, { "epoch": 0.5445832620229334, "grad_norm": 2.853957176208496, "learning_rate": 1.811751283513976e-05, "loss": 0.3745, "step": 3182 }, { "epoch": 0.5447544069827144, "grad_norm": 3.5641119480133057, "learning_rate": 1.8123217341699944e-05, "loss": 0.424, "step": 3183 }, { "epoch": 0.5449255519424953, "grad_norm": 8.759005546569824, "learning_rate": 1.8128921848260124e-05, "loss": 0.7256, "step": 3184 }, { "epoch": 0.5450966969022762, "grad_norm": 54.43397521972656, "learning_rate": 1.8134626354820308e-05, "loss": 7.7319, "step": 3185 }, { "epoch": 0.5452678418620571, "grad_norm": 26.35443878173828, "learning_rate": 1.814033086138049e-05, "loss": 3.1477, "step": 3186 }, { "epoch": 0.5454389868218381, "grad_norm": 18.872291564941406, "learning_rate": 1.8146035367940675e-05, "loss": 1.8724, "step": 3187 }, { "epoch": 0.545610131781619, "grad_norm": 22.673784255981445, "learning_rate": 1.8151739874500855e-05, "loss": 1.9899, "step": 3188 }, { "epoch": 0.5457812767414, "grad_norm": 9.217958450317383, "learning_rate": 1.8157444381061038e-05, "loss": 0.726, "step": 3189 }, { "epoch": 0.5459524217011809, "grad_norm": 2.148630380630493, "learning_rate": 1.816314888762122e-05, "loss": 0.323, "step": 3190 }, { "epoch": 0.5461235666609618, "grad_norm": 26.988340377807617, "learning_rate": 1.81688533941814e-05, "loss": 2.586, "step": 3191 }, { "epoch": 0.5462947116207427, "grad_norm": 37.6932373046875, "learning_rate": 1.8174557900741585e-05, "loss": 2.9146, "step": 3192 }, { "epoch": 0.5464658565805237, "grad_norm": 74.42720794677734, "learning_rate": 1.8180262407301768e-05, "loss": 3.2535, "step": 3193 }, { "epoch": 0.5466370015403046, "grad_norm": 29.757360458374023, "learning_rate": 1.818596691386195e-05, "loss": 
2.8882, "step": 3194 }, { "epoch": 0.5468081465000856, "grad_norm": 15.420557975769043, "learning_rate": 1.819167142042213e-05, "loss": 1.6278, "step": 3195 }, { "epoch": 0.5469792914598665, "grad_norm": 31.367387771606445, "learning_rate": 1.8197375926982318e-05, "loss": 2.9209, "step": 3196 }, { "epoch": 0.5471504364196474, "grad_norm": 28.30303382873535, "learning_rate": 1.82030804335425e-05, "loss": 3.066, "step": 3197 }, { "epoch": 0.5473215813794283, "grad_norm": 27.540369033813477, "learning_rate": 1.820878494010268e-05, "loss": 3.3267, "step": 3198 }, { "epoch": 0.5474927263392093, "grad_norm": 4.438743591308594, "learning_rate": 1.8214489446662865e-05, "loss": 0.3722, "step": 3199 }, { "epoch": 0.5476638712989902, "grad_norm": 29.85404396057129, "learning_rate": 1.822019395322305e-05, "loss": 4.0139, "step": 3200 }, { "epoch": 0.5478350162587712, "grad_norm": 28.56346893310547, "learning_rate": 1.8225898459783232e-05, "loss": 3.6119, "step": 3201 }, { "epoch": 0.5480061612185522, "grad_norm": 29.419742584228516, "learning_rate": 1.8231602966343412e-05, "loss": 2.9087, "step": 3202 }, { "epoch": 0.548177306178333, "grad_norm": 44.72222900390625, "learning_rate": 1.8237307472903595e-05, "loss": 6.7043, "step": 3203 }, { "epoch": 0.548348451138114, "grad_norm": 21.762168884277344, "learning_rate": 1.824301197946378e-05, "loss": 1.9849, "step": 3204 }, { "epoch": 0.5485195960978949, "grad_norm": 26.09598731994629, "learning_rate": 1.824871648602396e-05, "loss": 2.6737, "step": 3205 }, { "epoch": 0.5486907410576759, "grad_norm": 32.6449089050293, "learning_rate": 1.8254420992584142e-05, "loss": 2.7534, "step": 3206 }, { "epoch": 0.5488618860174568, "grad_norm": 20.140134811401367, "learning_rate": 1.8260125499144325e-05, "loss": 2.0024, "step": 3207 }, { "epoch": 0.5490330309772378, "grad_norm": 8.021845817565918, "learning_rate": 1.8265830005704505e-05, "loss": 0.8484, "step": 3208 }, { "epoch": 0.5492041759370186, "grad_norm": 23.706680297851562, 
"learning_rate": 1.827153451226469e-05, "loss": 2.5228, "step": 3209 }, { "epoch": 0.5493753208967996, "grad_norm": 25.105031967163086, "learning_rate": 1.8277239018824872e-05, "loss": 2.4342, "step": 3210 }, { "epoch": 0.5495464658565805, "grad_norm": 16.53352165222168, "learning_rate": 1.8282943525385055e-05, "loss": 1.4315, "step": 3211 }, { "epoch": 0.5497176108163615, "grad_norm": 24.55224609375, "learning_rate": 1.8288648031945235e-05, "loss": 2.469, "step": 3212 }, { "epoch": 0.5498887557761424, "grad_norm": 3.4264721870422363, "learning_rate": 1.829435253850542e-05, "loss": 0.3677, "step": 3213 }, { "epoch": 0.5500599007359234, "grad_norm": 20.305509567260742, "learning_rate": 1.8300057045065602e-05, "loss": 2.3214, "step": 3214 }, { "epoch": 0.5502310456957042, "grad_norm": 27.69756507873535, "learning_rate": 1.8305761551625782e-05, "loss": 2.8746, "step": 3215 }, { "epoch": 0.5504021906554852, "grad_norm": 100.86264038085938, "learning_rate": 1.8311466058185965e-05, "loss": 7.5686, "step": 3216 }, { "epoch": 0.5505733356152661, "grad_norm": 26.603628158569336, "learning_rate": 1.831717056474615e-05, "loss": 2.5639, "step": 3217 }, { "epoch": 0.5507444805750471, "grad_norm": 31.449655532836914, "learning_rate": 1.8322875071306332e-05, "loss": 3.9795, "step": 3218 }, { "epoch": 0.550915625534828, "grad_norm": 25.562639236450195, "learning_rate": 1.8328579577866516e-05, "loss": 3.0795, "step": 3219 }, { "epoch": 0.551086770494609, "grad_norm": 4.988560199737549, "learning_rate": 1.83342840844267e-05, "loss": 0.4445, "step": 3220 }, { "epoch": 0.5512579154543898, "grad_norm": 31.045183181762695, "learning_rate": 1.8339988590986882e-05, "loss": 6.4413, "step": 3221 }, { "epoch": 0.5514290604141708, "grad_norm": 32.938106536865234, "learning_rate": 1.8345693097547062e-05, "loss": 4.0821, "step": 3222 }, { "epoch": 0.5516002053739517, "grad_norm": 23.498254776000977, "learning_rate": 1.8351397604107246e-05, "loss": 2.8752, "step": 3223 }, { "epoch": 
0.5517713503337327, "grad_norm": 27.559247970581055, "learning_rate": 1.835710211066743e-05, "loss": 3.3216, "step": 3224 }, { "epoch": 0.5519424952935136, "grad_norm": 46.420135498046875, "learning_rate": 1.8362806617227613e-05, "loss": 6.9903, "step": 3225 }, { "epoch": 0.5521136402532946, "grad_norm": 23.508155822753906, "learning_rate": 1.8368511123787793e-05, "loss": 2.1877, "step": 3226 }, { "epoch": 0.5522847852130754, "grad_norm": 20.4776611328125, "learning_rate": 1.8374215630347976e-05, "loss": 1.8942, "step": 3227 }, { "epoch": 0.5524559301728564, "grad_norm": 15.294054985046387, "learning_rate": 1.837992013690816e-05, "loss": 1.2082, "step": 3228 }, { "epoch": 0.5526270751326373, "grad_norm": 22.51180076599121, "learning_rate": 1.838562464346834e-05, "loss": 2.2929, "step": 3229 }, { "epoch": 0.5527982200924183, "grad_norm": 21.741634368896484, "learning_rate": 1.8391329150028523e-05, "loss": 1.884, "step": 3230 }, { "epoch": 0.5529693650521992, "grad_norm": 4.330467224121094, "learning_rate": 1.8397033656588706e-05, "loss": 0.4163, "step": 3231 }, { "epoch": 0.5531405100119802, "grad_norm": 26.344017028808594, "learning_rate": 1.840273816314889e-05, "loss": 2.5767, "step": 3232 }, { "epoch": 0.553311654971761, "grad_norm": 53.116172790527344, "learning_rate": 1.840844266970907e-05, "loss": 2.7426, "step": 3233 }, { "epoch": 0.553482799931542, "grad_norm": 15.442861557006836, "learning_rate": 1.8414147176269253e-05, "loss": 0.9586, "step": 3234 }, { "epoch": 0.5536539448913229, "grad_norm": 28.15229606628418, "learning_rate": 1.8419851682829436e-05, "loss": 2.9423, "step": 3235 }, { "epoch": 0.5538250898511039, "grad_norm": 21.91160011291504, "learning_rate": 1.8425556189389616e-05, "loss": 2.3152, "step": 3236 }, { "epoch": 0.5539962348108848, "grad_norm": 21.20878028869629, "learning_rate": 1.84312606959498e-05, "loss": 2.117, "step": 3237 }, { "epoch": 0.5541673797706658, "grad_norm": 1.7572641372680664, "learning_rate": 1.8436965202509983e-05, 
"loss": 0.3516, "step": 3238 }, { "epoch": 0.5543385247304466, "grad_norm": 25.218217849731445, "learning_rate": 1.8442669709070166e-05, "loss": 2.5554, "step": 3239 }, { "epoch": 0.5545096696902276, "grad_norm": 30.133291244506836, "learning_rate": 1.8448374215630346e-05, "loss": 3.078, "step": 3240 }, { "epoch": 0.5546808146500085, "grad_norm": 30.298227310180664, "learning_rate": 1.845407872219053e-05, "loss": 3.2276, "step": 3241 }, { "epoch": 0.5548519596097895, "grad_norm": 31.560077667236328, "learning_rate": 1.8459783228750716e-05, "loss": 3.713, "step": 3242 }, { "epoch": 0.5550231045695704, "grad_norm": 7.287442207336426, "learning_rate": 1.8465487735310896e-05, "loss": 0.4505, "step": 3243 }, { "epoch": 0.5551942495293514, "grad_norm": 12.331917762756348, "learning_rate": 1.847119224187108e-05, "loss": 0.8588, "step": 3244 }, { "epoch": 0.5553653944891322, "grad_norm": 44.76494216918945, "learning_rate": 1.8476896748431263e-05, "loss": 7.2343, "step": 3245 }, { "epoch": 0.5555365394489132, "grad_norm": 32.351600646972656, "learning_rate": 1.8482601254991443e-05, "loss": 4.0445, "step": 3246 }, { "epoch": 0.5557076844086941, "grad_norm": 28.279098510742188, "learning_rate": 1.8488305761551627e-05, "loss": 3.0796, "step": 3247 }, { "epoch": 0.5558788293684751, "grad_norm": 28.361543655395508, "learning_rate": 1.849401026811181e-05, "loss": 3.3123, "step": 3248 }, { "epoch": 0.556049974328256, "grad_norm": 41.510597229003906, "learning_rate": 1.8499714774671993e-05, "loss": 6.904, "step": 3249 }, { "epoch": 0.556221119288037, "grad_norm": 28.658105850219727, "learning_rate": 1.8505419281232173e-05, "loss": 3.0404, "step": 3250 }, { "epoch": 0.5563922642478178, "grad_norm": 28.752214431762695, "learning_rate": 1.8511123787792357e-05, "loss": 3.7672, "step": 3251 }, { "epoch": 0.5565634092075988, "grad_norm": 34.223060607910156, "learning_rate": 1.851682829435254e-05, "loss": 5.3153, "step": 3252 }, { "epoch": 0.5567345541673798, "grad_norm": 
12.981977462768555, "learning_rate": 1.852253280091272e-05, "loss": 0.9685, "step": 3253 }, { "epoch": 0.5569056991271607, "grad_norm": 20.176815032958984, "learning_rate": 1.8528237307472903e-05, "loss": 1.9294, "step": 3254 }, { "epoch": 0.5570768440869417, "grad_norm": 24.55870246887207, "learning_rate": 1.8533941814033087e-05, "loss": 2.8725, "step": 3255 }, { "epoch": 0.5572479890467226, "grad_norm": 21.821077346801758, "learning_rate": 1.853964632059327e-05, "loss": 2.0436, "step": 3256 }, { "epoch": 0.5574191340065036, "grad_norm": 23.027362823486328, "learning_rate": 1.854535082715345e-05, "loss": 2.1357, "step": 3257 }, { "epoch": 0.5575902789662844, "grad_norm": 10.035751342773438, "learning_rate": 1.8551055333713634e-05, "loss": 0.6818, "step": 3258 }, { "epoch": 0.5577614239260654, "grad_norm": 4.451612949371338, "learning_rate": 1.8556759840273817e-05, "loss": 0.4069, "step": 3259 }, { "epoch": 0.5579325688858463, "grad_norm": 25.953588485717773, "learning_rate": 1.8562464346833997e-05, "loss": 2.8325, "step": 3260 }, { "epoch": 0.5581037138456273, "grad_norm": 63.914024353027344, "learning_rate": 1.856816885339418e-05, "loss": 8.0674, "step": 3261 }, { "epoch": 0.5582748588054082, "grad_norm": 11.405961990356445, "learning_rate": 1.8573873359954364e-05, "loss": 0.8008, "step": 3262 }, { "epoch": 0.5584460037651892, "grad_norm": 21.461894989013672, "learning_rate": 1.8579577866514547e-05, "loss": 2.1028, "step": 3263 }, { "epoch": 0.55861714872497, "grad_norm": 22.72207260131836, "learning_rate": 1.8585282373074727e-05, "loss": 2.0172, "step": 3264 }, { "epoch": 0.558788293684751, "grad_norm": 27.586618423461914, "learning_rate": 1.8590986879634914e-05, "loss": 2.0431, "step": 3265 }, { "epoch": 0.5589594386445319, "grad_norm": 26.892311096191406, "learning_rate": 1.8596691386195097e-05, "loss": 2.5198, "step": 3266 }, { "epoch": 0.5591305836043129, "grad_norm": 22.420379638671875, "learning_rate": 1.8602395892755277e-05, "loss": 2.155, "step": 3267 }, 
{ "epoch": 0.5593017285640938, "grad_norm": 8.758207321166992, "learning_rate": 1.860810039931546e-05, "loss": 1.3864, "step": 3268 }, { "epoch": 0.5594728735238748, "grad_norm": 8.072163581848145, "learning_rate": 1.8613804905875644e-05, "loss": 0.5867, "step": 3269 }, { "epoch": 0.5596440184836556, "grad_norm": 6.457092761993408, "learning_rate": 1.8619509412435827e-05, "loss": 0.6464, "step": 3270 }, { "epoch": 0.5598151634434366, "grad_norm": 7.5770392417907715, "learning_rate": 1.8625213918996007e-05, "loss": 0.8377, "step": 3271 }, { "epoch": 0.5599863084032175, "grad_norm": 28.0118350982666, "learning_rate": 1.863091842555619e-05, "loss": 3.5048, "step": 3272 }, { "epoch": 0.5601574533629985, "grad_norm": 75.3667984008789, "learning_rate": 1.8636622932116374e-05, "loss": 7.3634, "step": 3273 }, { "epoch": 0.5603285983227794, "grad_norm": 6.486256122589111, "learning_rate": 1.8642327438676554e-05, "loss": 0.772, "step": 3274 }, { "epoch": 0.5604997432825604, "grad_norm": 18.678125381469727, "learning_rate": 1.8648031945236737e-05, "loss": 1.9006, "step": 3275 }, { "epoch": 0.5606708882423412, "grad_norm": 7.29653263092041, "learning_rate": 1.865373645179692e-05, "loss": 1.075, "step": 3276 }, { "epoch": 0.5608420332021222, "grad_norm": 2.164841890335083, "learning_rate": 1.86594409583571e-05, "loss": 0.3693, "step": 3277 }, { "epoch": 0.5610131781619031, "grad_norm": 21.217857360839844, "learning_rate": 1.8665145464917284e-05, "loss": 2.2186, "step": 3278 }, { "epoch": 0.5611843231216841, "grad_norm": 8.882852554321289, "learning_rate": 1.8670849971477468e-05, "loss": 0.9134, "step": 3279 }, { "epoch": 0.561355468081465, "grad_norm": 17.709449768066406, "learning_rate": 1.867655447803765e-05, "loss": 1.5424, "step": 3280 }, { "epoch": 0.561526613041246, "grad_norm": 5.205548286437988, "learning_rate": 1.868225898459783e-05, "loss": 0.6676, "step": 3281 }, { "epoch": 0.5616977580010268, "grad_norm": 52.959259033203125, "learning_rate": 1.8687963491158014e-05, 
"loss": 7.2033, "step": 3282 }, { "epoch": 0.5618689029608078, "grad_norm": 29.030555725097656, "learning_rate": 1.8693667997718198e-05, "loss": 2.9477, "step": 3283 }, { "epoch": 0.5620400479205887, "grad_norm": 27.745031356811523, "learning_rate": 1.8699372504278378e-05, "loss": 2.719, "step": 3284 }, { "epoch": 0.5622111928803697, "grad_norm": 25.971677780151367, "learning_rate": 1.870507701083856e-05, "loss": 2.706, "step": 3285 }, { "epoch": 0.5623823378401506, "grad_norm": 27.33722686767578, "learning_rate": 1.8710781517398744e-05, "loss": 2.568, "step": 3286 }, { "epoch": 0.5625534827999316, "grad_norm": 22.52666473388672, "learning_rate": 1.8716486023958928e-05, "loss": 2.3127, "step": 3287 }, { "epoch": 0.5627246277597124, "grad_norm": 10.016031265258789, "learning_rate": 1.872219053051911e-05, "loss": 1.4001, "step": 3288 }, { "epoch": 0.5628957727194934, "grad_norm": 24.30003547668457, "learning_rate": 1.8727895037079295e-05, "loss": 2.4201, "step": 3289 }, { "epoch": 0.5630669176792743, "grad_norm": 6.622725009918213, "learning_rate": 1.8733599543639478e-05, "loss": 0.7098, "step": 3290 }, { "epoch": 0.5632380626390553, "grad_norm": 24.1121883392334, "learning_rate": 1.8739304050199658e-05, "loss": 1.887, "step": 3291 }, { "epoch": 0.5634092075988362, "grad_norm": 31.559614181518555, "learning_rate": 1.874500855675984e-05, "loss": 3.5751, "step": 3292 }, { "epoch": 0.5635803525586172, "grad_norm": 33.04099655151367, "learning_rate": 1.8750713063320025e-05, "loss": 3.2233, "step": 3293 }, { "epoch": 0.563751497518398, "grad_norm": 81.57552337646484, "learning_rate": 1.8756417569880208e-05, "loss": 3.8443, "step": 3294 }, { "epoch": 0.563922642478179, "grad_norm": 30.438037872314453, "learning_rate": 1.8762122076440388e-05, "loss": 3.7816, "step": 3295 }, { "epoch": 0.5640937874379599, "grad_norm": 7.756313323974609, "learning_rate": 1.876782658300057e-05, "loss": 0.7225, "step": 3296 }, { "epoch": 0.5642649323977409, "grad_norm": 28.59238624572754, 
"learning_rate": 1.8773531089560755e-05, "loss": 3.1516, "step": 3297 }, { "epoch": 0.5644360773575218, "grad_norm": 25.167417526245117, "learning_rate": 1.8779235596120935e-05, "loss": 3.174, "step": 3298 }, { "epoch": 0.5646072223173028, "grad_norm": 86.82372283935547, "learning_rate": 1.8784940102681118e-05, "loss": 4.5193, "step": 3299 }, { "epoch": 0.5647783672770836, "grad_norm": 30.278440475463867, "learning_rate": 1.87906446092413e-05, "loss": 3.2045, "step": 3300 }, { "epoch": 0.5649495122368646, "grad_norm": 34.26241683959961, "learning_rate": 1.8796349115801485e-05, "loss": 3.7586, "step": 3301 }, { "epoch": 0.5651206571966455, "grad_norm": 20.874797821044922, "learning_rate": 1.8802053622361665e-05, "loss": 1.9123, "step": 3302 }, { "epoch": 0.5652918021564265, "grad_norm": 26.034624099731445, "learning_rate": 1.8807758128921848e-05, "loss": 2.522, "step": 3303 }, { "epoch": 0.5654629471162075, "grad_norm": 11.349614143371582, "learning_rate": 1.881346263548203e-05, "loss": 0.9236, "step": 3304 }, { "epoch": 0.5656340920759884, "grad_norm": 10.266570091247559, "learning_rate": 1.881916714204221e-05, "loss": 0.6643, "step": 3305 }, { "epoch": 0.5658052370357693, "grad_norm": 32.189842224121094, "learning_rate": 1.8824871648602395e-05, "loss": 4.5101, "step": 3306 }, { "epoch": 0.5659763819955502, "grad_norm": 24.921152114868164, "learning_rate": 1.883057615516258e-05, "loss": 2.9263, "step": 3307 }, { "epoch": 0.5661475269553312, "grad_norm": 35.14552307128906, "learning_rate": 1.883628066172276e-05, "loss": 4.0464, "step": 3308 }, { "epoch": 0.5663186719151121, "grad_norm": 37.087039947509766, "learning_rate": 1.8841985168282942e-05, "loss": 4.0199, "step": 3309 }, { "epoch": 0.5664898168748931, "grad_norm": 26.691438674926758, "learning_rate": 1.8847689674843125e-05, "loss": 3.2809, "step": 3310 }, { "epoch": 0.566660961834674, "grad_norm": 31.133575439453125, "learning_rate": 1.8853394181403312e-05, "loss": 3.4234, "step": 3311 }, { "epoch": 
0.566832106794455, "grad_norm": 98.82320404052734, "learning_rate": 1.8859098687963492e-05, "loss": 7.426, "step": 3312 }, { "epoch": 0.5670032517542358, "grad_norm": 26.13225746154785, "learning_rate": 1.8864803194523675e-05, "loss": 2.5794, "step": 3313 }, { "epoch": 0.5671743967140168, "grad_norm": 28.947038650512695, "learning_rate": 1.887050770108386e-05, "loss": 2.7049, "step": 3314 }, { "epoch": 0.5673455416737977, "grad_norm": 23.491085052490234, "learning_rate": 1.887621220764404e-05, "loss": 2.1517, "step": 3315 }, { "epoch": 0.5675166866335787, "grad_norm": 17.27471351623535, "learning_rate": 1.8881916714204222e-05, "loss": 1.4919, "step": 3316 }, { "epoch": 0.5676878315933596, "grad_norm": 34.11532974243164, "learning_rate": 1.8887621220764405e-05, "loss": 4.5321, "step": 3317 }, { "epoch": 0.5678589765531405, "grad_norm": 19.040075302124023, "learning_rate": 1.889332572732459e-05, "loss": 1.6601, "step": 3318 }, { "epoch": 0.5680301215129214, "grad_norm": 18.085039138793945, "learning_rate": 1.889903023388477e-05, "loss": 1.5302, "step": 3319 }, { "epoch": 0.5682012664727024, "grad_norm": 27.968341827392578, "learning_rate": 1.8904734740444952e-05, "loss": 3.3977, "step": 3320 }, { "epoch": 0.5683724114324833, "grad_norm": 2.1676626205444336, "learning_rate": 1.8910439247005136e-05, "loss": 0.3746, "step": 3321 }, { "epoch": 0.5685435563922643, "grad_norm": 3.0772573947906494, "learning_rate": 1.8916143753565316e-05, "loss": 0.4364, "step": 3322 }, { "epoch": 0.5687147013520452, "grad_norm": 25.465309143066406, "learning_rate": 1.89218482601255e-05, "loss": 2.5141, "step": 3323 }, { "epoch": 0.5688858463118261, "grad_norm": 28.62706184387207, "learning_rate": 1.8927552766685682e-05, "loss": 2.9376, "step": 3324 }, { "epoch": 0.569056991271607, "grad_norm": 10.36950969696045, "learning_rate": 1.8933257273245866e-05, "loss": 0.9048, "step": 3325 }, { "epoch": 0.569228136231388, "grad_norm": 22.50096893310547, "learning_rate": 1.8938961779806046e-05, 
"loss": 2.3555, "step": 3326 }, { "epoch": 0.5693992811911689, "grad_norm": 25.440292358398438, "learning_rate": 1.894466628636623e-05, "loss": 2.5734, "step": 3327 }, { "epoch": 0.5695704261509499, "grad_norm": 5.363638401031494, "learning_rate": 1.8950370792926412e-05, "loss": 0.4315, "step": 3328 }, { "epoch": 0.5697415711107308, "grad_norm": 29.033611297607422, "learning_rate": 1.8956075299486592e-05, "loss": 2.7605, "step": 3329 }, { "epoch": 0.5699127160705117, "grad_norm": 6.961116790771484, "learning_rate": 1.8961779806046776e-05, "loss": 1.2524, "step": 3330 }, { "epoch": 0.5700838610302926, "grad_norm": 29.2668399810791, "learning_rate": 1.896748431260696e-05, "loss": 3.3542, "step": 3331 }, { "epoch": 0.5702550059900736, "grad_norm": 38.82827377319336, "learning_rate": 1.8973188819167143e-05, "loss": 4.5204, "step": 3332 }, { "epoch": 0.5704261509498545, "grad_norm": 32.07524871826172, "learning_rate": 1.8978893325727326e-05, "loss": 4.3224, "step": 3333 }, { "epoch": 0.5705972959096355, "grad_norm": 3.1426124572753906, "learning_rate": 1.898459783228751e-05, "loss": 0.4036, "step": 3334 }, { "epoch": 0.5707684408694164, "grad_norm": 19.389469146728516, "learning_rate": 1.8990302338847693e-05, "loss": 2.1048, "step": 3335 }, { "epoch": 0.5709395858291973, "grad_norm": 17.071313858032227, "learning_rate": 1.8996006845407873e-05, "loss": 1.6332, "step": 3336 }, { "epoch": 0.5711107307889782, "grad_norm": 7.998443603515625, "learning_rate": 1.9001711351968056e-05, "loss": 0.6125, "step": 3337 }, { "epoch": 0.5712818757487592, "grad_norm": 27.566017150878906, "learning_rate": 1.900741585852824e-05, "loss": 3.0635, "step": 3338 }, { "epoch": 0.5714530207085401, "grad_norm": 6.867462158203125, "learning_rate": 1.901312036508842e-05, "loss": 0.5083, "step": 3339 }, { "epoch": 0.5716241656683211, "grad_norm": 24.942699432373047, "learning_rate": 1.9018824871648603e-05, "loss": 2.4329, "step": 3340 }, { "epoch": 0.571795310628102, "grad_norm": 17.44595718383789, 
"learning_rate": 1.9024529378208786e-05, "loss": 1.2566, "step": 3341 }, { "epoch": 0.571966455587883, "grad_norm": 30.833187103271484, "learning_rate": 1.903023388476897e-05, "loss": 2.7353, "step": 3342 }, { "epoch": 0.5721376005476638, "grad_norm": 31.722270965576172, "learning_rate": 1.903593839132915e-05, "loss": 3.8463, "step": 3343 }, { "epoch": 0.5723087455074448, "grad_norm": 12.909158706665039, "learning_rate": 1.9041642897889333e-05, "loss": 1.0365, "step": 3344 }, { "epoch": 0.5724798904672257, "grad_norm": 32.17844772338867, "learning_rate": 1.9047347404449516e-05, "loss": 3.2414, "step": 3345 }, { "epoch": 0.5726510354270067, "grad_norm": 25.432022094726562, "learning_rate": 1.9053051911009696e-05, "loss": 2.4539, "step": 3346 }, { "epoch": 0.5728221803867876, "grad_norm": 2.373732805252075, "learning_rate": 1.905875641756988e-05, "loss": 0.3677, "step": 3347 }, { "epoch": 0.5729933253465685, "grad_norm": 40.49632263183594, "learning_rate": 1.9064460924130063e-05, "loss": 3.9831, "step": 3348 }, { "epoch": 0.5731644703063494, "grad_norm": 1.9657033681869507, "learning_rate": 1.9070165430690246e-05, "loss": 0.3775, "step": 3349 }, { "epoch": 0.5733356152661304, "grad_norm": 61.38923645019531, "learning_rate": 1.9075869937250426e-05, "loss": 7.6187, "step": 3350 }, { "epoch": 0.5735067602259113, "grad_norm": 24.892297744750977, "learning_rate": 1.908157444381061e-05, "loss": 2.5702, "step": 3351 }, { "epoch": 0.5736779051856923, "grad_norm": 27.634868621826172, "learning_rate": 1.9087278950370793e-05, "loss": 2.6693, "step": 3352 }, { "epoch": 0.5738490501454732, "grad_norm": 30.543689727783203, "learning_rate": 1.9092983456930973e-05, "loss": 2.881, "step": 3353 }, { "epoch": 0.5740201951052541, "grad_norm": 20.875457763671875, "learning_rate": 1.9098687963491157e-05, "loss": 2.0431, "step": 3354 }, { "epoch": 0.5741913400650351, "grad_norm": 10.260396003723145, "learning_rate": 1.910439247005134e-05, "loss": 1.0317, "step": 3355 }, { "epoch": 
0.574362485024816, "grad_norm": 28.790538787841797, "learning_rate": 1.9110096976611527e-05, "loss": 2.8238, "step": 3356 }, { "epoch": 0.574533629984597, "grad_norm": 25.868772506713867, "learning_rate": 1.9115801483171707e-05, "loss": 2.5919, "step": 3357 }, { "epoch": 0.5747047749443779, "grad_norm": 25.83347511291504, "learning_rate": 1.912150598973189e-05, "loss": 3.1996, "step": 3358 }, { "epoch": 0.5748759199041589, "grad_norm": 29.329633712768555, "learning_rate": 1.9127210496292073e-05, "loss": 3.1316, "step": 3359 }, { "epoch": 0.5750470648639397, "grad_norm": 9.001529693603516, "learning_rate": 1.9132915002852253e-05, "loss": 1.2363, "step": 3360 }, { "epoch": 0.5752182098237207, "grad_norm": 5.358071804046631, "learning_rate": 1.9138619509412437e-05, "loss": 0.3949, "step": 3361 }, { "epoch": 0.5753893547835016, "grad_norm": 13.40963363647461, "learning_rate": 1.914432401597262e-05, "loss": 0.7941, "step": 3362 }, { "epoch": 0.5755604997432826, "grad_norm": 37.820556640625, "learning_rate": 1.9150028522532804e-05, "loss": 6.717, "step": 3363 }, { "epoch": 0.5757316447030635, "grad_norm": 181.16746520996094, "learning_rate": 1.9155733029092984e-05, "loss": 9.5897, "step": 3364 }, { "epoch": 0.5759027896628445, "grad_norm": 29.568683624267578, "learning_rate": 1.9161437535653167e-05, "loss": 2.4104, "step": 3365 }, { "epoch": 0.5760739346226253, "grad_norm": 10.582496643066406, "learning_rate": 1.916714204221335e-05, "loss": 0.5204, "step": 3366 }, { "epoch": 0.5762450795824063, "grad_norm": 37.75896072387695, "learning_rate": 1.917284654877353e-05, "loss": 6.4064, "step": 3367 }, { "epoch": 0.5764162245421872, "grad_norm": 23.44141960144043, "learning_rate": 1.9178551055333714e-05, "loss": 2.2333, "step": 3368 }, { "epoch": 0.5765873695019682, "grad_norm": 23.17081642150879, "learning_rate": 1.9184255561893897e-05, "loss": 2.3916, "step": 3369 }, { "epoch": 0.5767585144617491, "grad_norm": 22.356122970581055, "learning_rate": 1.918996006845408e-05, 
"loss": 2.259, "step": 3370 }, { "epoch": 0.5769296594215301, "grad_norm": 25.988954544067383, "learning_rate": 1.919566457501426e-05, "loss": 2.6056, "step": 3371 }, { "epoch": 0.577100804381311, "grad_norm": 17.81022071838379, "learning_rate": 1.9201369081574444e-05, "loss": 1.3797, "step": 3372 }, { "epoch": 0.5772719493410919, "grad_norm": 28.269866943359375, "learning_rate": 1.9207073588134627e-05, "loss": 2.3896, "step": 3373 }, { "epoch": 0.5774430943008728, "grad_norm": 24.576251983642578, "learning_rate": 1.9212778094694807e-05, "loss": 2.2518, "step": 3374 }, { "epoch": 0.5776142392606538, "grad_norm": 5.2097649574279785, "learning_rate": 1.921848260125499e-05, "loss": 0.3953, "step": 3375 }, { "epoch": 0.5777853842204347, "grad_norm": 3.1124250888824463, "learning_rate": 1.9224187107815174e-05, "loss": 0.3687, "step": 3376 }, { "epoch": 0.5779565291802157, "grad_norm": 20.81354331970215, "learning_rate": 1.9229891614375354e-05, "loss": 2.0595, "step": 3377 }, { "epoch": 0.5781276741399966, "grad_norm": 29.21316909790039, "learning_rate": 1.9235596120935537e-05, "loss": 3.7875, "step": 3378 }, { "epoch": 0.5782988190997775, "grad_norm": 84.69393157958984, "learning_rate": 1.9241300627495724e-05, "loss": 3.0112, "step": 3379 }, { "epoch": 0.5784699640595584, "grad_norm": 1.8985782861709595, "learning_rate": 1.9247005134055907e-05, "loss": 0.3148, "step": 3380 }, { "epoch": 0.5786411090193394, "grad_norm": 10.058646202087402, "learning_rate": 1.9252709640616087e-05, "loss": 0.8459, "step": 3381 }, { "epoch": 0.5788122539791203, "grad_norm": 27.1168270111084, "learning_rate": 1.925841414717627e-05, "loss": 2.5347, "step": 3382 }, { "epoch": 0.5789833989389013, "grad_norm": 89.62450408935547, "learning_rate": 1.9264118653736454e-05, "loss": 3.7248, "step": 3383 }, { "epoch": 0.5791545438986822, "grad_norm": 4.5566558837890625, "learning_rate": 1.9269823160296634e-05, "loss": 0.6092, "step": 3384 }, { "epoch": 0.5793256888584631, "grad_norm": 
30.642803192138672, "learning_rate": 1.9275527666856818e-05, "loss": 3.5006, "step": 3385 }, { "epoch": 0.579496833818244, "grad_norm": 27.308584213256836, "learning_rate": 1.9281232173417e-05, "loss": 3.4485, "step": 3386 }, { "epoch": 0.579667978778025, "grad_norm": 29.646587371826172, "learning_rate": 1.9286936679977184e-05, "loss": 3.0531, "step": 3387 }, { "epoch": 0.5798391237378059, "grad_norm": 14.223383903503418, "learning_rate": 1.9292641186537364e-05, "loss": 1.5835, "step": 3388 }, { "epoch": 0.5800102686975869, "grad_norm": 24.695066452026367, "learning_rate": 1.9298345693097548e-05, "loss": 2.368, "step": 3389 }, { "epoch": 0.5801814136573678, "grad_norm": 26.341815948486328, "learning_rate": 1.930405019965773e-05, "loss": 2.3547, "step": 3390 }, { "epoch": 0.5803525586171487, "grad_norm": 18.38678741455078, "learning_rate": 1.930975470621791e-05, "loss": 1.7797, "step": 3391 }, { "epoch": 0.5805237035769296, "grad_norm": 25.61127471923828, "learning_rate": 1.9315459212778094e-05, "loss": 2.638, "step": 3392 }, { "epoch": 0.5806948485367106, "grad_norm": 2.290560007095337, "learning_rate": 1.9321163719338278e-05, "loss": 0.3367, "step": 3393 }, { "epoch": 0.5808659934964915, "grad_norm": 11.412469863891602, "learning_rate": 1.932686822589846e-05, "loss": 1.1406, "step": 3394 }, { "epoch": 0.5810371384562725, "grad_norm": 8.998905181884766, "learning_rate": 1.933257273245864e-05, "loss": 0.6117, "step": 3395 }, { "epoch": 0.5812082834160534, "grad_norm": 7.52636194229126, "learning_rate": 1.9338277239018825e-05, "loss": 0.653, "step": 3396 }, { "epoch": 0.5813794283758343, "grad_norm": 27.57058334350586, "learning_rate": 1.9343981745579008e-05, "loss": 3.0084, "step": 3397 }, { "epoch": 0.5815505733356152, "grad_norm": 26.775415420532227, "learning_rate": 1.9349686252139188e-05, "loss": 2.9132, "step": 3398 }, { "epoch": 0.5817217182953962, "grad_norm": 3.1826353073120117, "learning_rate": 1.935539075869937e-05, "loss": 0.3669, "step": 3399 }, { 
"epoch": 0.5818928632551771, "grad_norm": 6.152859210968018, "learning_rate": 1.9361095265259555e-05, "loss": 0.6359, "step": 3400 }, { "epoch": 0.5820640082149581, "grad_norm": 1.7208553552627563, "learning_rate": 1.9366799771819738e-05, "loss": 0.3746, "step": 3401 }, { "epoch": 0.582235153174739, "grad_norm": 7.883406162261963, "learning_rate": 1.937250427837992e-05, "loss": 1.0997, "step": 3402 }, { "epoch": 0.5824062981345199, "grad_norm": 26.301164627075195, "learning_rate": 1.9378208784940105e-05, "loss": 2.4291, "step": 3403 }, { "epoch": 0.5825774430943008, "grad_norm": 23.660444259643555, "learning_rate": 1.9383913291500288e-05, "loss": 2.6292, "step": 3404 }, { "epoch": 0.5827485880540818, "grad_norm": 17.410369873046875, "learning_rate": 1.9389617798060468e-05, "loss": 1.4877, "step": 3405 }, { "epoch": 0.5829197330138628, "grad_norm": 31.716928482055664, "learning_rate": 1.939532230462065e-05, "loss": 3.778, "step": 3406 }, { "epoch": 0.5830908779736437, "grad_norm": 39.23788833618164, "learning_rate": 1.9401026811180835e-05, "loss": 3.4125, "step": 3407 }, { "epoch": 0.5832620229334247, "grad_norm": 21.296669006347656, "learning_rate": 1.9406731317741015e-05, "loss": 1.9439, "step": 3408 }, { "epoch": 0.5834331678932055, "grad_norm": 4.249104022979736, "learning_rate": 1.94124358243012e-05, "loss": 0.5641, "step": 3409 }, { "epoch": 0.5836043128529865, "grad_norm": 25.32843780517578, "learning_rate": 1.9418140330861382e-05, "loss": 2.1923, "step": 3410 }, { "epoch": 0.5837754578127674, "grad_norm": 31.81114387512207, "learning_rate": 1.9423844837421565e-05, "loss": 6.2289, "step": 3411 }, { "epoch": 0.5839466027725484, "grad_norm": 34.15937423706055, "learning_rate": 1.9429549343981745e-05, "loss": 7.0035, "step": 3412 }, { "epoch": 0.5841177477323293, "grad_norm": 27.947298049926758, "learning_rate": 1.943525385054193e-05, "loss": 3.2937, "step": 3413 }, { "epoch": 0.5842888926921103, "grad_norm": 13.201940536499023, "learning_rate": 
1.9440958357102112e-05, "loss": 0.9029, "step": 3414 }, { "epoch": 0.5844600376518911, "grad_norm": 21.287315368652344, "learning_rate": 1.9446662863662292e-05, "loss": 2.1476, "step": 3415 }, { "epoch": 0.5846311826116721, "grad_norm": 27.151569366455078, "learning_rate": 1.9452367370222475e-05, "loss": 2.8889, "step": 3416 }, { "epoch": 0.584802327571453, "grad_norm": 25.92886734008789, "learning_rate": 1.945807187678266e-05, "loss": 2.43, "step": 3417 }, { "epoch": 0.584973472531234, "grad_norm": 16.41077423095703, "learning_rate": 1.9463776383342842e-05, "loss": 1.1515, "step": 3418 }, { "epoch": 0.5851446174910149, "grad_norm": 7.387080669403076, "learning_rate": 1.9469480889903022e-05, "loss": 1.0438, "step": 3419 }, { "epoch": 0.5853157624507959, "grad_norm": 28.30823516845703, "learning_rate": 1.9475185396463205e-05, "loss": 2.4817, "step": 3420 }, { "epoch": 0.5854869074105767, "grad_norm": 19.957653045654297, "learning_rate": 1.948088990302339e-05, "loss": 2.492, "step": 3421 }, { "epoch": 0.5856580523703577, "grad_norm": 26.708097457885742, "learning_rate": 1.948659440958357e-05, "loss": 2.9457, "step": 3422 }, { "epoch": 0.5858291973301386, "grad_norm": 6.408317565917969, "learning_rate": 1.9492298916143752e-05, "loss": 0.7295, "step": 3423 }, { "epoch": 0.5860003422899196, "grad_norm": 30.148130416870117, "learning_rate": 1.9498003422703935e-05, "loss": 3.3614, "step": 3424 }, { "epoch": 0.5861714872497005, "grad_norm": 22.77581787109375, "learning_rate": 1.9503707929264122e-05, "loss": 1.9865, "step": 3425 }, { "epoch": 0.5863426322094815, "grad_norm": 27.753477096557617, "learning_rate": 1.9509412435824302e-05, "loss": 2.899, "step": 3426 }, { "epoch": 0.5865137771692623, "grad_norm": 6.288846015930176, "learning_rate": 1.9515116942384486e-05, "loss": 0.646, "step": 3427 }, { "epoch": 0.5866849221290433, "grad_norm": 24.92253303527832, "learning_rate": 1.952082144894467e-05, "loss": 2.5784, "step": 3428 }, { "epoch": 0.5868560670888242, "grad_norm": 
24.49477767944336, "learning_rate": 1.952652595550485e-05, "loss": 2.1704, "step": 3429 }, { "epoch": 0.5870272120486052, "grad_norm": 24.100597381591797, "learning_rate": 1.9532230462065032e-05, "loss": 2.5277, "step": 3430 }, { "epoch": 0.5871983570083861, "grad_norm": 21.0911922454834, "learning_rate": 1.9537934968625216e-05, "loss": 2.2242, "step": 3431 }, { "epoch": 0.5873695019681671, "grad_norm": 22.534944534301758, "learning_rate": 1.95436394751854e-05, "loss": 2.3912, "step": 3432 }, { "epoch": 0.5875406469279479, "grad_norm": 22.132417678833008, "learning_rate": 1.954934398174558e-05, "loss": 2.3557, "step": 3433 }, { "epoch": 0.5877117918877289, "grad_norm": 21.22612953186035, "learning_rate": 1.9555048488305762e-05, "loss": 1.8585, "step": 3434 }, { "epoch": 0.5878829368475098, "grad_norm": 87.90875244140625, "learning_rate": 1.9560752994865946e-05, "loss": 4.0792, "step": 3435 }, { "epoch": 0.5880540818072908, "grad_norm": 89.19034576416016, "learning_rate": 1.9566457501426126e-05, "loss": 4.43, "step": 3436 }, { "epoch": 0.5882252267670717, "grad_norm": 19.258451461791992, "learning_rate": 1.957216200798631e-05, "loss": 1.7874, "step": 3437 }, { "epoch": 0.5883963717268527, "grad_norm": 1.7522574663162231, "learning_rate": 1.9577866514546493e-05, "loss": 0.3112, "step": 3438 }, { "epoch": 0.5885675166866335, "grad_norm": 18.229957580566406, "learning_rate": 1.9583571021106676e-05, "loss": 2.766, "step": 3439 }, { "epoch": 0.5887386616464145, "grad_norm": 36.58788299560547, "learning_rate": 1.9589275527666856e-05, "loss": 5.2847, "step": 3440 }, { "epoch": 0.5889098066061954, "grad_norm": 23.946247100830078, "learning_rate": 1.959498003422704e-05, "loss": 2.3658, "step": 3441 }, { "epoch": 0.5890809515659764, "grad_norm": 29.713180541992188, "learning_rate": 1.9600684540787223e-05, "loss": 3.6696, "step": 3442 }, { "epoch": 0.5892520965257573, "grad_norm": 22.247447967529297, "learning_rate": 1.9606389047347403e-05, "loss": 2.6781, "step": 3443 }, { 
"epoch": 0.5894232414855383, "grad_norm": 23.726993560791016, "learning_rate": 1.9612093553907586e-05, "loss": 2.765, "step": 3444 }, { "epoch": 0.5895943864453191, "grad_norm": 39.94513702392578, "learning_rate": 1.961779806046777e-05, "loss": 7.0047, "step": 3445 }, { "epoch": 0.5897655314051001, "grad_norm": 24.248090744018555, "learning_rate": 1.962350256702795e-05, "loss": 2.7753, "step": 3446 }, { "epoch": 0.589936676364881, "grad_norm": 3.6631691455841064, "learning_rate": 1.9629207073588133e-05, "loss": 0.3529, "step": 3447 }, { "epoch": 0.590107821324662, "grad_norm": 25.42365264892578, "learning_rate": 1.963491158014832e-05, "loss": 2.1961, "step": 3448 }, { "epoch": 0.5902789662844429, "grad_norm": 25.308515548706055, "learning_rate": 1.9640616086708503e-05, "loss": 2.3387, "step": 3449 }, { "epoch": 0.5904501112442239, "grad_norm": 19.806636810302734, "learning_rate": 1.9646320593268683e-05, "loss": 2.0071, "step": 3450 }, { "epoch": 0.5906212562040047, "grad_norm": 17.552900314331055, "learning_rate": 1.9652025099828866e-05, "loss": 1.4304, "step": 3451 }, { "epoch": 0.5907924011637857, "grad_norm": 23.210519790649414, "learning_rate": 1.965772960638905e-05, "loss": 2.1445, "step": 3452 }, { "epoch": 0.5909635461235666, "grad_norm": 25.595361709594727, "learning_rate": 1.966343411294923e-05, "loss": 3.0135, "step": 3453 }, { "epoch": 0.5911346910833476, "grad_norm": 3.9893271923065186, "learning_rate": 1.9669138619509413e-05, "loss": 0.4081, "step": 3454 }, { "epoch": 0.5913058360431286, "grad_norm": 2.2912561893463135, "learning_rate": 1.9674843126069596e-05, "loss": 0.3066, "step": 3455 }, { "epoch": 0.5914769810029095, "grad_norm": 23.45972442626953, "learning_rate": 1.968054763262978e-05, "loss": 2.4938, "step": 3456 }, { "epoch": 0.5916481259626905, "grad_norm": 24.78557777404785, "learning_rate": 1.968625213918996e-05, "loss": 2.2888, "step": 3457 }, { "epoch": 0.5918192709224713, "grad_norm": 56.51396560668945, "learning_rate": 
1.9691956645750143e-05, "loss": 3.3839, "step": 3458 }, { "epoch": 0.5919904158822523, "grad_norm": 15.350875854492188, "learning_rate": 1.9697661152310327e-05, "loss": 1.0531, "step": 3459 }, { "epoch": 0.5921615608420332, "grad_norm": 73.21929931640625, "learning_rate": 1.9703365658870507e-05, "loss": 3.5199, "step": 3460 }, { "epoch": 0.5923327058018142, "grad_norm": 29.828990936279297, "learning_rate": 1.970907016543069e-05, "loss": 3.8054, "step": 3461 }, { "epoch": 0.5925038507615951, "grad_norm": 18.3194637298584, "learning_rate": 1.9714774671990873e-05, "loss": 1.7246, "step": 3462 }, { "epoch": 0.592674995721376, "grad_norm": 29.311429977416992, "learning_rate": 1.9720479178551057e-05, "loss": 3.2428, "step": 3463 }, { "epoch": 0.5928461406811569, "grad_norm": 1.9222893714904785, "learning_rate": 1.9726183685111237e-05, "loss": 0.3078, "step": 3464 }, { "epoch": 0.5930172856409379, "grad_norm": 6.286295413970947, "learning_rate": 1.973188819167142e-05, "loss": 0.6446, "step": 3465 }, { "epoch": 0.5931884306007188, "grad_norm": 29.647480010986328, "learning_rate": 1.9737592698231603e-05, "loss": 2.9621, "step": 3466 }, { "epoch": 0.5933595755604998, "grad_norm": 26.92269515991211, "learning_rate": 1.9743297204791783e-05, "loss": 2.5933, "step": 3467 }, { "epoch": 0.5935307205202807, "grad_norm": 50.6396484375, "learning_rate": 1.9749001711351967e-05, "loss": 6.8098, "step": 3468 }, { "epoch": 0.5937018654800617, "grad_norm": 25.224733352661133, "learning_rate": 1.975470621791215e-05, "loss": 2.4431, "step": 3469 }, { "epoch": 0.5938730104398425, "grad_norm": 17.845563888549805, "learning_rate": 1.9760410724472334e-05, "loss": 1.7165, "step": 3470 }, { "epoch": 0.5940441553996235, "grad_norm": 5.634066104888916, "learning_rate": 1.9766115231032517e-05, "loss": 0.5882, "step": 3471 }, { "epoch": 0.5942153003594044, "grad_norm": 34.622920989990234, "learning_rate": 1.97718197375927e-05, "loss": 3.5714, "step": 3472 }, { "epoch": 0.5943864453191854, 
"grad_norm": 63.40961837768555, "learning_rate": 1.9777524244152884e-05, "loss": 2.734, "step": 3473 }, { "epoch": 0.5945575902789663, "grad_norm": 29.88731575012207, "learning_rate": 1.9783228750713064e-05, "loss": 3.8436, "step": 3474 }, { "epoch": 0.5947287352387473, "grad_norm": 27.8708553314209, "learning_rate": 1.9788933257273247e-05, "loss": 2.3388, "step": 3475 }, { "epoch": 0.5948998801985281, "grad_norm": 25.777362823486328, "learning_rate": 1.979463776383343e-05, "loss": 2.3517, "step": 3476 }, { "epoch": 0.5950710251583091, "grad_norm": 14.805953979492188, "learning_rate": 1.980034227039361e-05, "loss": 1.5038, "step": 3477 }, { "epoch": 0.59524217011809, "grad_norm": 19.073440551757812, "learning_rate": 1.9806046776953794e-05, "loss": 1.8955, "step": 3478 }, { "epoch": 0.595413315077871, "grad_norm": 21.738014221191406, "learning_rate": 1.9811751283513977e-05, "loss": 2.3869, "step": 3479 }, { "epoch": 0.5955844600376519, "grad_norm": 2.9714324474334717, "learning_rate": 1.981745579007416e-05, "loss": 0.3378, "step": 3480 }, { "epoch": 0.5957556049974329, "grad_norm": 8.826178550720215, "learning_rate": 1.982316029663434e-05, "loss": 0.9817, "step": 3481 }, { "epoch": 0.5959267499572137, "grad_norm": 16.54644012451172, "learning_rate": 1.9828864803194524e-05, "loss": 1.2827, "step": 3482 }, { "epoch": 0.5960978949169947, "grad_norm": 9.384221076965332, "learning_rate": 1.9834569309754707e-05, "loss": 1.3316, "step": 3483 }, { "epoch": 0.5962690398767756, "grad_norm": 25.255199432373047, "learning_rate": 1.9840273816314887e-05, "loss": 2.1236, "step": 3484 }, { "epoch": 0.5964401848365566, "grad_norm": 27.23832893371582, "learning_rate": 1.984597832287507e-05, "loss": 2.8921, "step": 3485 }, { "epoch": 0.5966113297963375, "grad_norm": 31.743816375732422, "learning_rate": 1.9851682829435254e-05, "loss": 4.1041, "step": 3486 }, { "epoch": 0.5967824747561185, "grad_norm": 23.10817527770996, "learning_rate": 1.9857387335995437e-05, "loss": 1.973, "step": 
3487 }, { "epoch": 0.5969536197158993, "grad_norm": 40.163639068603516, "learning_rate": 1.9863091842555617e-05, "loss": 6.3457, "step": 3488 }, { "epoch": 0.5971247646756803, "grad_norm": 29.302976608276367, "learning_rate": 1.98687963491158e-05, "loss": 2.8273, "step": 3489 }, { "epoch": 0.5972959096354612, "grad_norm": 29.635021209716797, "learning_rate": 1.9874500855675984e-05, "loss": 3.671, "step": 3490 }, { "epoch": 0.5974670545952422, "grad_norm": 21.227108001708984, "learning_rate": 1.9880205362236164e-05, "loss": 2.1672, "step": 3491 }, { "epoch": 0.5976381995550231, "grad_norm": 30.448522567749023, "learning_rate": 1.9885909868796348e-05, "loss": 3.0936, "step": 3492 }, { "epoch": 0.597809344514804, "grad_norm": 27.133663177490234, "learning_rate": 1.9891614375356534e-05, "loss": 2.4887, "step": 3493 }, { "epoch": 0.5979804894745849, "grad_norm": 39.466121673583984, "learning_rate": 1.9897318881916718e-05, "loss": 4.8888, "step": 3494 }, { "epoch": 0.5981516344343659, "grad_norm": 39.85908889770508, "learning_rate": 1.9903023388476898e-05, "loss": 6.6469, "step": 3495 }, { "epoch": 0.5983227793941468, "grad_norm": 19.293907165527344, "learning_rate": 1.990872789503708e-05, "loss": 2.0085, "step": 3496 }, { "epoch": 0.5984939243539278, "grad_norm": 30.540531158447266, "learning_rate": 1.9914432401597265e-05, "loss": 2.8524, "step": 3497 }, { "epoch": 0.5986650693137087, "grad_norm": 2.173297882080078, "learning_rate": 1.9920136908157444e-05, "loss": 0.4671, "step": 3498 }, { "epoch": 0.5988362142734897, "grad_norm": 23.616220474243164, "learning_rate": 1.9925841414717628e-05, "loss": 2.1941, "step": 3499 }, { "epoch": 0.5990073592332705, "grad_norm": 10.88476276397705, "learning_rate": 1.993154592127781e-05, "loss": 0.9849, "step": 3500 }, { "epoch": 0.5991785041930515, "grad_norm": 35.73077392578125, "learning_rate": 1.9937250427837995e-05, "loss": 3.3526, "step": 3501 }, { "epoch": 0.5993496491528324, "grad_norm": 16.617977142333984, "learning_rate": 
1.9942954934398175e-05, "loss": 1.305, "step": 3502 }, { "epoch": 0.5995207941126134, "grad_norm": 18.637554168701172, "learning_rate": 1.9948659440958358e-05, "loss": 1.7833, "step": 3503 }, { "epoch": 0.5996919390723943, "grad_norm": 22.126482009887695, "learning_rate": 1.995436394751854e-05, "loss": 1.8701, "step": 3504 }, { "epoch": 0.5998630840321753, "grad_norm": 19.62862777709961, "learning_rate": 1.996006845407872e-05, "loss": 1.9236, "step": 3505 }, { "epoch": 0.6000342289919562, "grad_norm": 27.936777114868164, "learning_rate": 1.9965772960638905e-05, "loss": 2.4178, "step": 3506 }, { "epoch": 0.6002053739517371, "grad_norm": 19.932191848754883, "learning_rate": 1.9971477467199088e-05, "loss": 2.1511, "step": 3507 }, { "epoch": 0.6003765189115181, "grad_norm": 25.053146362304688, "learning_rate": 1.9977181973759268e-05, "loss": 2.4803, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_nli-pairs_loss": 2.4920291900634766, "eval_nli-pairs_runtime": 4.6698, "eval_nli-pairs_samples_per_second": 42.828, "eval_nli-pairs_steps_per_second": 1.499, "eval_sts-test_pearson_cosine": 0.7445126100709293, "eval_sts-test_pearson_dot": 0.6267026529286148, "eval_sts-test_pearson_euclidean": 0.7432252885023554, "eval_sts-test_pearson_manhattan": 0.7498148030136934, "eval_sts-test_pearson_max": 0.7498148030136934, "eval_sts-test_spearman_cosine": 0.7257459075346154, "eval_sts-test_spearman_dot": 0.6080996929747863, "eval_sts-test_spearman_euclidean": 0.7251182727779897, "eval_sts-test_spearman_manhattan": 0.7328124096687271, "eval_sts-test_spearman_max": 0.7328124096687271, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_vitaminc-pairs_loss": 1.5536390542984009, "eval_vitaminc-pairs_runtime": 2.8901, "eval_vitaminc-pairs_samples_per_second": 69.202, "eval_vitaminc-pairs_steps_per_second": 2.422, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_qnli-contrastive_loss": 3.72904109954834, "eval_qnli-contrastive_runtime": 0.7044, 
"eval_qnli-contrastive_samples_per_second": 283.946, "eval_qnli-contrastive_steps_per_second": 9.938, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_scitail-pairs-qa_loss": 0.28478389978408813, "eval_scitail-pairs-qa_runtime": 1.9184, "eval_scitail-pairs-qa_samples_per_second": 104.251, "eval_scitail-pairs-qa_steps_per_second": 3.649, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_scitail-pairs-pos_loss": 1.0560411214828491, "eval_scitail-pairs-pos_runtime": 2.9426, "eval_scitail-pairs-pos_samples_per_second": 67.966, "eval_scitail-pairs-pos_steps_per_second": 2.379, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_xsum-pairs_loss": 1.246793508529663, "eval_xsum-pairs_runtime": 2.6747, "eval_xsum-pairs_samples_per_second": 65.429, "eval_xsum-pairs_steps_per_second": 2.243, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_compression-pairs_loss": 0.5663356184959412, "eval_compression-pairs_runtime": 0.5441, "eval_compression-pairs_samples_per_second": 367.559, "eval_compression-pairs_steps_per_second": 12.865, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_sciq_pairs_loss": 5.566298484802246, "eval_sciq_pairs_runtime": 9.5047, "eval_sciq_pairs_samples_per_second": 21.042, "eval_sciq_pairs_steps_per_second": 0.736, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_qasc_pairs_loss": 6.534984588623047, "eval_qasc_pairs_runtime": 2.8892, "eval_qasc_pairs_samples_per_second": 69.224, "eval_qasc_pairs_steps_per_second": 2.423, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_openbookqa_pairs_loss": 3.5413291454315186, "eval_openbookqa_pairs_runtime": 0.7338, "eval_openbookqa_pairs_samples_per_second": 94.027, "eval_openbookqa_pairs_steps_per_second": 4.088, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_msmarco_pairs_loss": 2.2276792526245117, "eval_msmarco_pairs_runtime": 4.1013, "eval_msmarco_pairs_samples_per_second": 48.765, "eval_msmarco_pairs_steps_per_second": 1.707, "step": 3508 }, { "epoch": 0.6003765189115181, 
"eval_nq_pairs_loss": 2.868544340133667, "eval_nq_pairs_runtime": 8.7773, "eval_nq_pairs_samples_per_second": 22.786, "eval_nq_pairs_steps_per_second": 0.798, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_trivia_pairs_loss": 2.8433399200439453, "eval_trivia_pairs_runtime": 12.7884, "eval_trivia_pairs_samples_per_second": 15.639, "eval_trivia_pairs_steps_per_second": 0.547, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_quora_pairs_loss": 0.5191998481750488, "eval_quora_pairs_runtime": 1.6069, "eval_quora_pairs_samples_per_second": 124.459, "eval_quora_pairs_steps_per_second": 4.356, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_gooaq_pairs_loss": 1.7708619832992554, "eval_gooaq_pairs_runtime": 2.6531, "eval_gooaq_pairs_samples_per_second": 75.384, "eval_gooaq_pairs_steps_per_second": 2.638, "step": 3508 }, { "epoch": 0.600547663871299, "grad_norm": 25.026371002197266, "learning_rate": 1.998288648031945e-05, "loss": 2.5669, "step": 3509 }, { "epoch": 0.60071880883108, "grad_norm": 27.141857147216797, "learning_rate": 1.9988590986879635e-05, "loss": 3.5018, "step": 3510 }, { "epoch": 0.6008899537908609, "grad_norm": 14.711791038513184, "learning_rate": 1.9994295493439818e-05, "loss": 1.5597, "step": 3511 }, { "epoch": 0.6010610987506418, "grad_norm": 21.489046096801758, "learning_rate": 1.9999999999999998e-05, "loss": 1.9611, "step": 3512 }, { "epoch": 0.6012322437104227, "grad_norm": 25.058837890625, "learning_rate": 2.000570450656018e-05, "loss": 2.6108, "step": 3513 }, { "epoch": 0.6014033886702037, "grad_norm": 35.174922943115234, "learning_rate": 2.0011409013120365e-05, "loss": 4.3163, "step": 3514 }, { "epoch": 0.6015745336299846, "grad_norm": 24.40542984008789, "learning_rate": 2.0017113519680545e-05, "loss": 2.2731, "step": 3515 }, { "epoch": 0.6017456785897656, "grad_norm": 25.144128799438477, "learning_rate": 2.0022818026240732e-05, "loss": 2.9364, "step": 3516 }, { "epoch": 0.6019168235495465, "grad_norm": 24.48015594482422, 
"learning_rate": 2.0028522532800915e-05, "loss": 2.9065, "step": 3517 }, { "epoch": 0.6020879685093274, "grad_norm": 3.758195161819458, "learning_rate": 2.00342270393611e-05, "loss": 0.3902, "step": 3518 }, { "epoch": 0.6022591134691083, "grad_norm": 24.701047897338867, "learning_rate": 2.003993154592128e-05, "loss": 2.7503, "step": 3519 }, { "epoch": 0.6024302584288893, "grad_norm": 35.492271423339844, "learning_rate": 2.0045636052481462e-05, "loss": 3.48, "step": 3520 }, { "epoch": 0.6026014033886702, "grad_norm": 23.064668655395508, "learning_rate": 2.0051340559041645e-05, "loss": 2.0561, "step": 3521 }, { "epoch": 0.6027725483484512, "grad_norm": 20.811628341674805, "learning_rate": 2.0057045065601825e-05, "loss": 2.1345, "step": 3522 }, { "epoch": 0.602943693308232, "grad_norm": 29.092313766479492, "learning_rate": 2.006274957216201e-05, "loss": 3.3692, "step": 3523 }, { "epoch": 0.603114838268013, "grad_norm": 4.536581993103027, "learning_rate": 2.0068454078722192e-05, "loss": 0.3497, "step": 3524 }, { "epoch": 0.6032859832277939, "grad_norm": 30.311725616455078, "learning_rate": 2.0074158585282375e-05, "loss": 3.1903, "step": 3525 }, { "epoch": 0.6034571281875749, "grad_norm": 22.41107749938965, "learning_rate": 2.0079863091842555e-05, "loss": 1.8769, "step": 3526 }, { "epoch": 0.6036282731473558, "grad_norm": 110.15360260009766, "learning_rate": 2.008556759840274e-05, "loss": 8.2043, "step": 3527 }, { "epoch": 0.6037994181071368, "grad_norm": 22.728544235229492, "learning_rate": 2.0091272104962922e-05, "loss": 2.2539, "step": 3528 }, { "epoch": 0.6039705630669177, "grad_norm": 20.25188446044922, "learning_rate": 2.0096976611523102e-05, "loss": 2.1306, "step": 3529 }, { "epoch": 0.6041417080266986, "grad_norm": 25.181900024414062, "learning_rate": 2.0102681118083286e-05, "loss": 2.7009, "step": 3530 }, { "epoch": 0.6043128529864795, "grad_norm": 20.236555099487305, "learning_rate": 2.010838562464347e-05, "loss": 2.0943, "step": 3531 }, { "epoch": 
0.6044839979462605, "grad_norm": 28.511568069458008, "learning_rate": 2.0114090131203652e-05, "loss": 3.4193, "step": 3532 }, { "epoch": 0.6046551429060414, "grad_norm": 20.58401870727539, "learning_rate": 2.0119794637763832e-05, "loss": 1.8277, "step": 3533 }, { "epoch": 0.6048262878658224, "grad_norm": 28.2340087890625, "learning_rate": 2.0125499144324016e-05, "loss": 3.125, "step": 3534 }, { "epoch": 0.6049974328256033, "grad_norm": 29.512651443481445, "learning_rate": 2.01312036508842e-05, "loss": 2.8504, "step": 3535 }, { "epoch": 0.6051685777853842, "grad_norm": 20.814037322998047, "learning_rate": 2.013690815744438e-05, "loss": 1.988, "step": 3536 }, { "epoch": 0.6053397227451651, "grad_norm": 31.458534240722656, "learning_rate": 2.0142612664004562e-05, "loss": 3.4279, "step": 3537 }, { "epoch": 0.6055108677049461, "grad_norm": 6.904580116271973, "learning_rate": 2.0148317170564746e-05, "loss": 0.596, "step": 3538 }, { "epoch": 0.605682012664727, "grad_norm": 4.964241981506348, "learning_rate": 2.015402167712493e-05, "loss": 0.5453, "step": 3539 }, { "epoch": 0.605853157624508, "grad_norm": 13.879029273986816, "learning_rate": 2.0159726183685113e-05, "loss": 0.8886, "step": 3540 }, { "epoch": 0.6060243025842889, "grad_norm": 5.926167964935303, "learning_rate": 2.0165430690245296e-05, "loss": 0.4682, "step": 3541 }, { "epoch": 0.6061954475440698, "grad_norm": 37.99885177612305, "learning_rate": 2.017113519680548e-05, "loss": 2.6702, "step": 3542 }, { "epoch": 0.6063665925038507, "grad_norm": 1.8707212209701538, "learning_rate": 2.017683970336566e-05, "loss": 0.3064, "step": 3543 }, { "epoch": 0.6065377374636317, "grad_norm": 19.801597595214844, "learning_rate": 2.0182544209925843e-05, "loss": 1.9881, "step": 3544 }, { "epoch": 0.6067088824234126, "grad_norm": 6.91411828994751, "learning_rate": 2.0188248716486026e-05, "loss": 0.6814, "step": 3545 }, { "epoch": 0.6068800273831936, "grad_norm": 24.066730499267578, "learning_rate": 2.0193953223046206e-05, "loss": 
2.9773, "step": 3546 }, { "epoch": 0.6070511723429745, "grad_norm": 26.214096069335938, "learning_rate": 2.019965772960639e-05, "loss": 2.5413, "step": 3547 }, { "epoch": 0.6072223173027554, "grad_norm": 26.099639892578125, "learning_rate": 2.0205362236166573e-05, "loss": 3.1048, "step": 3548 }, { "epoch": 0.6073934622625363, "grad_norm": 5.310629844665527, "learning_rate": 2.0211066742726756e-05, "loss": 0.4841, "step": 3549 }, { "epoch": 0.6075646072223173, "grad_norm": 31.81784439086914, "learning_rate": 2.0216771249286936e-05, "loss": 2.6404, "step": 3550 }, { "epoch": 0.6077357521820982, "grad_norm": 20.07958221435547, "learning_rate": 2.022247575584712e-05, "loss": 2.0417, "step": 3551 }, { "epoch": 0.6079068971418792, "grad_norm": 23.589279174804688, "learning_rate": 2.0228180262407303e-05, "loss": 1.8377, "step": 3552 }, { "epoch": 0.60807804210166, "grad_norm": 7.810224533081055, "learning_rate": 2.0233884768967483e-05, "loss": 0.6155, "step": 3553 }, { "epoch": 0.608249187061441, "grad_norm": 25.48611068725586, "learning_rate": 2.0239589275527666e-05, "loss": 2.6452, "step": 3554 }, { "epoch": 0.6084203320212219, "grad_norm": 28.041179656982422, "learning_rate": 2.024529378208785e-05, "loss": 2.7049, "step": 3555 }, { "epoch": 0.6085914769810029, "grad_norm": 29.171598434448242, "learning_rate": 2.0250998288648033e-05, "loss": 2.9279, "step": 3556 }, { "epoch": 0.6087626219407839, "grad_norm": 31.75198745727539, "learning_rate": 2.0256702795208213e-05, "loss": 3.5333, "step": 3557 }, { "epoch": 0.6089337669005648, "grad_norm": 27.840137481689453, "learning_rate": 2.0262407301768396e-05, "loss": 2.582, "step": 3558 }, { "epoch": 0.6091049118603458, "grad_norm": 30.64188575744629, "learning_rate": 2.026811180832858e-05, "loss": 3.4939, "step": 3559 }, { "epoch": 0.6092760568201266, "grad_norm": 5.60610294342041, "learning_rate": 2.027381631488876e-05, "loss": 0.5944, "step": 3560 }, { "epoch": 0.6094472017799076, "grad_norm": 6.2669148445129395, 
"learning_rate": 2.0279520821448943e-05, "loss": 0.6124, "step": 3561 }, { "epoch": 0.6096183467396885, "grad_norm": 24.084915161132812, "learning_rate": 2.028522532800913e-05, "loss": 2.3808, "step": 3562 }, { "epoch": 0.6097894916994695, "grad_norm": 25.230403900146484, "learning_rate": 2.0290929834569313e-05, "loss": 2.2763, "step": 3563 }, { "epoch": 0.6099606366592504, "grad_norm": 3.4166452884674072, "learning_rate": 2.0296634341129493e-05, "loss": 0.3625, "step": 3564 }, { "epoch": 0.6101317816190314, "grad_norm": 43.064022064208984, "learning_rate": 2.0302338847689677e-05, "loss": 7.0079, "step": 3565 }, { "epoch": 0.6103029265788122, "grad_norm": 126.53868103027344, "learning_rate": 2.030804335424986e-05, "loss": 8.3917, "step": 3566 }, { "epoch": 0.6104740715385932, "grad_norm": 131.35928344726562, "learning_rate": 2.031374786081004e-05, "loss": 3.4001, "step": 3567 }, { "epoch": 0.6106452164983741, "grad_norm": 25.176708221435547, "learning_rate": 2.0319452367370223e-05, "loss": 2.6828, "step": 3568 }, { "epoch": 0.6108163614581551, "grad_norm": 10.270312309265137, "learning_rate": 2.0325156873930407e-05, "loss": 0.6769, "step": 3569 }, { "epoch": 0.610987506417936, "grad_norm": 134.7881317138672, "learning_rate": 2.033086138049059e-05, "loss": 9.4478, "step": 3570 }, { "epoch": 0.611158651377717, "grad_norm": 29.06338119506836, "learning_rate": 2.033656588705077e-05, "loss": 3.239, "step": 3571 }, { "epoch": 0.6113297963374978, "grad_norm": 28.089710235595703, "learning_rate": 2.0342270393610954e-05, "loss": 3.3457, "step": 3572 }, { "epoch": 0.6115009412972788, "grad_norm": 30.67607879638672, "learning_rate": 2.0347974900171137e-05, "loss": 3.2694, "step": 3573 }, { "epoch": 0.6116720862570597, "grad_norm": 22.92820167541504, "learning_rate": 2.0353679406731317e-05, "loss": 2.2235, "step": 3574 }, { "epoch": 0.6118432312168407, "grad_norm": 34.413116455078125, "learning_rate": 2.03593839132915e-05, "loss": 4.2986, "step": 3575 }, { "epoch": 
0.6120143761766216, "grad_norm": 31.22587013244629, "learning_rate": 2.0365088419851684e-05, "loss": 3.0481, "step": 3576 }, { "epoch": 0.6121855211364026, "grad_norm": 25.429521560668945, "learning_rate": 2.0370792926411864e-05, "loss": 2.229, "step": 3577 }, { "epoch": 0.6123566660961834, "grad_norm": 11.0814208984375, "learning_rate": 2.0376497432972047e-05, "loss": 0.8888, "step": 3578 }, { "epoch": 0.6125278110559644, "grad_norm": 7.115586757659912, "learning_rate": 2.038220193953223e-05, "loss": 0.7247, "step": 3579 }, { "epoch": 0.6126989560157453, "grad_norm": 23.84605598449707, "learning_rate": 2.0387906446092414e-05, "loss": 2.5407, "step": 3580 }, { "epoch": 0.6128701009755263, "grad_norm": 29.189983367919922, "learning_rate": 2.0393610952652594e-05, "loss": 2.9275, "step": 3581 }, { "epoch": 0.6130412459353072, "grad_norm": 13.353384971618652, "learning_rate": 2.0399315459212777e-05, "loss": 1.2951, "step": 3582 }, { "epoch": 0.6132123908950882, "grad_norm": 19.755294799804688, "learning_rate": 2.040501996577296e-05, "loss": 1.69, "step": 3583 }, { "epoch": 0.613383535854869, "grad_norm": 42.32930374145508, "learning_rate": 2.041072447233314e-05, "loss": 6.7412, "step": 3584 }, { "epoch": 0.61355468081465, "grad_norm": 28.144733428955078, "learning_rate": 2.0416428978893327e-05, "loss": 3.0857, "step": 3585 }, { "epoch": 0.6137258257744309, "grad_norm": 9.064329147338867, "learning_rate": 2.042213348545351e-05, "loss": 0.6073, "step": 3586 }, { "epoch": 0.6138969707342119, "grad_norm": 13.769346237182617, "learning_rate": 2.0427837992013694e-05, "loss": 0.9597, "step": 3587 }, { "epoch": 0.6140681156939928, "grad_norm": 26.750154495239258, "learning_rate": 2.0433542498573874e-05, "loss": 2.5717, "step": 3588 }, { "epoch": 0.6142392606537738, "grad_norm": 21.833545684814453, "learning_rate": 2.0439247005134057e-05, "loss": 2.2722, "step": 3589 }, { "epoch": 0.6144104056135546, "grad_norm": 20.934206008911133, "learning_rate": 2.044495151169424e-05, 
"loss": 1.8977, "step": 3590 }, { "epoch": 0.6145815505733356, "grad_norm": 20.740619659423828, "learning_rate": 2.045065601825442e-05, "loss": 2.1618, "step": 3591 }, { "epoch": 0.6147526955331165, "grad_norm": 18.453035354614258, "learning_rate": 2.0456360524814604e-05, "loss": 1.5326, "step": 3592 }, { "epoch": 0.6149238404928975, "grad_norm": 21.29932403564453, "learning_rate": 2.0462065031374788e-05, "loss": 1.8663, "step": 3593 }, { "epoch": 0.6150949854526784, "grad_norm": 43.378116607666016, "learning_rate": 2.046776953793497e-05, "loss": 2.6436, "step": 3594 }, { "epoch": 0.6152661304124594, "grad_norm": 30.3048038482666, "learning_rate": 2.047347404449515e-05, "loss": 2.4363, "step": 3595 }, { "epoch": 0.6154372753722402, "grad_norm": 7.601318836212158, "learning_rate": 2.0479178551055334e-05, "loss": 0.5412, "step": 3596 }, { "epoch": 0.6156084203320212, "grad_norm": 22.385950088500977, "learning_rate": 2.0484883057615518e-05, "loss": 2.1345, "step": 3597 }, { "epoch": 0.6157795652918021, "grad_norm": 21.425384521484375, "learning_rate": 2.0490587564175698e-05, "loss": 2.3012, "step": 3598 }, { "epoch": 0.6159507102515831, "grad_norm": 35.983375549316406, "learning_rate": 2.049629207073588e-05, "loss": 3.4264, "step": 3599 }, { "epoch": 0.616121855211364, "grad_norm": 21.63048553466797, "learning_rate": 2.0501996577296064e-05, "loss": 1.8898, "step": 3600 }, { "epoch": 0.616293000171145, "grad_norm": 22.900203704833984, "learning_rate": 2.0507701083856248e-05, "loss": 2.0752, "step": 3601 }, { "epoch": 0.6164641451309258, "grad_norm": 36.20056915283203, "learning_rate": 2.0513405590416428e-05, "loss": 3.6793, "step": 3602 }, { "epoch": 0.6166352900907068, "grad_norm": 65.48631286621094, "learning_rate": 2.051911009697661e-05, "loss": 3.0171, "step": 3603 }, { "epoch": 0.6168064350504877, "grad_norm": 28.85053062438965, "learning_rate": 2.0524814603536795e-05, "loss": 2.9372, "step": 3604 }, { "epoch": 0.6169775800102687, "grad_norm": 22.775283813476562, 
"learning_rate": 2.0530519110096975e-05, "loss": 2.0222, "step": 3605 }, { "epoch": 0.6171487249700496, "grad_norm": 20.72616195678711, "learning_rate": 2.0536223616657158e-05, "loss": 1.9483, "step": 3606 }, { "epoch": 0.6173198699298306, "grad_norm": 82.96855926513672, "learning_rate": 2.054192812321734e-05, "loss": 3.4477, "step": 3607 }, { "epoch": 0.6174910148896116, "grad_norm": 31.961795806884766, "learning_rate": 2.0547632629777525e-05, "loss": 3.855, "step": 3608 }, { "epoch": 0.6176621598493924, "grad_norm": 8.887676239013672, "learning_rate": 2.0553337136337708e-05, "loss": 0.9918, "step": 3609 }, { "epoch": 0.6178333048091734, "grad_norm": 28.133888244628906, "learning_rate": 2.055904164289789e-05, "loss": 3.5373, "step": 3610 }, { "epoch": 0.6180044497689543, "grad_norm": 15.302783966064453, "learning_rate": 2.0564746149458075e-05, "loss": 1.4358, "step": 3611 }, { "epoch": 0.6181755947287353, "grad_norm": 1.7312853336334229, "learning_rate": 2.0570450656018255e-05, "loss": 0.3083, "step": 3612 }, { "epoch": 0.6183467396885162, "grad_norm": 57.10562515258789, "learning_rate": 2.0576155162578438e-05, "loss": 2.6412, "step": 3613 }, { "epoch": 0.6185178846482972, "grad_norm": 23.632875442504883, "learning_rate": 2.058185966913862e-05, "loss": 2.165, "step": 3614 }, { "epoch": 0.618689029608078, "grad_norm": 12.391186714172363, "learning_rate": 2.05875641756988e-05, "loss": 0.8518, "step": 3615 }, { "epoch": 0.618860174567859, "grad_norm": 33.55079650878906, "learning_rate": 2.0593268682258985e-05, "loss": 6.3741, "step": 3616 }, { "epoch": 0.6190313195276399, "grad_norm": 30.267724990844727, "learning_rate": 2.0598973188819168e-05, "loss": 3.5029, "step": 3617 }, { "epoch": 0.6192024644874209, "grad_norm": 23.680438995361328, "learning_rate": 2.060467769537935e-05, "loss": 2.4026, "step": 3618 }, { "epoch": 0.6193736094472018, "grad_norm": 24.904333114624023, "learning_rate": 2.061038220193953e-05, "loss": 2.6435, "step": 3619 }, { "epoch": 
0.6195447544069828, "grad_norm": 16.03217124938965, "learning_rate": 2.0616086708499715e-05, "loss": 1.515, "step": 3620 }, { "epoch": 0.6197158993667636, "grad_norm": 20.824888229370117, "learning_rate": 2.06217912150599e-05, "loss": 1.8808, "step": 3621 }, { "epoch": 0.6198870443265446, "grad_norm": 18.110668182373047, "learning_rate": 2.062749572162008e-05, "loss": 1.6689, "step": 3622 }, { "epoch": 0.6200581892863255, "grad_norm": 21.38336753845215, "learning_rate": 2.0633200228180262e-05, "loss": 2.0968, "step": 3623 }, { "epoch": 0.6202293342461065, "grad_norm": 19.99571418762207, "learning_rate": 2.0638904734740445e-05, "loss": 1.7209, "step": 3624 }, { "epoch": 0.6204004792058874, "grad_norm": 26.232772827148438, "learning_rate": 2.064460924130063e-05, "loss": 2.5226, "step": 3625 }, { "epoch": 0.6205716241656684, "grad_norm": 2.9006407260894775, "learning_rate": 2.065031374786081e-05, "loss": 0.3453, "step": 3626 }, { "epoch": 0.6207427691254492, "grad_norm": 1.3548880815505981, "learning_rate": 2.0656018254420992e-05, "loss": 0.318, "step": 3627 }, { "epoch": 0.6209139140852302, "grad_norm": 28.67076301574707, "learning_rate": 2.0661722760981175e-05, "loss": 2.7404, "step": 3628 }, { "epoch": 0.6210850590450111, "grad_norm": 19.108945846557617, "learning_rate": 2.0667427267541355e-05, "loss": 1.9497, "step": 3629 }, { "epoch": 0.6212562040047921, "grad_norm": 19.495290756225586, "learning_rate": 2.067313177410154e-05, "loss": 1.7563, "step": 3630 }, { "epoch": 0.621427348964573, "grad_norm": 9.806105613708496, "learning_rate": 2.0678836280661725e-05, "loss": 0.7359, "step": 3631 }, { "epoch": 0.621598493924354, "grad_norm": 21.068401336669922, "learning_rate": 2.068454078722191e-05, "loss": 1.8909, "step": 3632 }, { "epoch": 0.6217696388841348, "grad_norm": 24.633346557617188, "learning_rate": 2.069024529378209e-05, "loss": 2.1693, "step": 3633 }, { "epoch": 0.6219407838439158, "grad_norm": 20.427921295166016, "learning_rate": 2.0695949800342272e-05, 
"loss": 1.9249, "step": 3634 }, { "epoch": 0.6221119288036967, "grad_norm": 31.32949447631836, "learning_rate": 2.0701654306902456e-05, "loss": 3.3678, "step": 3635 }, { "epoch": 0.6222830737634777, "grad_norm": 1.1559346914291382, "learning_rate": 2.0707358813462636e-05, "loss": 0.2955, "step": 3636 }, { "epoch": 0.6224542187232586, "grad_norm": 31.57821273803711, "learning_rate": 2.071306332002282e-05, "loss": 3.6773, "step": 3637 }, { "epoch": 0.6226253636830396, "grad_norm": 54.79661560058594, "learning_rate": 2.0718767826583002e-05, "loss": 2.7538, "step": 3638 }, { "epoch": 0.6227965086428204, "grad_norm": 22.394084930419922, "learning_rate": 2.0724472333143186e-05, "loss": 1.9239, "step": 3639 }, { "epoch": 0.6229676536026014, "grad_norm": 27.102100372314453, "learning_rate": 2.0730176839703366e-05, "loss": 2.4047, "step": 3640 }, { "epoch": 0.6231387985623823, "grad_norm": 10.830344200134277, "learning_rate": 2.073588134626355e-05, "loss": 0.9573, "step": 3641 }, { "epoch": 0.6233099435221633, "grad_norm": 36.204898834228516, "learning_rate": 2.0741585852823732e-05, "loss": 4.9509, "step": 3642 }, { "epoch": 0.6234810884819442, "grad_norm": 27.16095542907715, "learning_rate": 2.0747290359383912e-05, "loss": 3.0999, "step": 3643 }, { "epoch": 0.6236522334417252, "grad_norm": 25.359394073486328, "learning_rate": 2.0752994865944096e-05, "loss": 1.8888, "step": 3644 }, { "epoch": 0.623823378401506, "grad_norm": 27.978900909423828, "learning_rate": 2.075869937250428e-05, "loss": 2.641, "step": 3645 }, { "epoch": 0.623994523361287, "grad_norm": 22.19740867614746, "learning_rate": 2.076440387906446e-05, "loss": 2.2979, "step": 3646 }, { "epoch": 0.6241656683210679, "grad_norm": 21.350025177001953, "learning_rate": 2.0770108385624643e-05, "loss": 2.7609, "step": 3647 }, { "epoch": 0.6243368132808489, "grad_norm": 24.283403396606445, "learning_rate": 2.0775812892184826e-05, "loss": 2.2185, "step": 3648 }, { "epoch": 0.6245079582406298, "grad_norm": 
24.77626609802246, "learning_rate": 2.078151739874501e-05, "loss": 3.0207, "step": 3649 }, { "epoch": 0.6246791032004108, "grad_norm": 25.94424819946289, "learning_rate": 2.078722190530519e-05, "loss": 3.1535, "step": 3650 }, { "epoch": 0.6248502481601916, "grad_norm": 23.725664138793945, "learning_rate": 2.0792926411865373e-05, "loss": 2.1339, "step": 3651 }, { "epoch": 0.6250213931199726, "grad_norm": 20.791276931762695, "learning_rate": 2.0798630918425556e-05, "loss": 2.2234, "step": 3652 }, { "epoch": 0.6251925380797535, "grad_norm": 18.761796951293945, "learning_rate": 2.0804335424985736e-05, "loss": 1.7418, "step": 3653 }, { "epoch": 0.6253636830395345, "grad_norm": 19.412919998168945, "learning_rate": 2.0810039931545923e-05, "loss": 1.6532, "step": 3654 }, { "epoch": 0.6255348279993154, "grad_norm": 34.13801193237305, "learning_rate": 2.0815744438106106e-05, "loss": 6.6811, "step": 3655 }, { "epoch": 0.6257059729590964, "grad_norm": 28.877214431762695, "learning_rate": 2.082144894466629e-05, "loss": 3.9092, "step": 3656 }, { "epoch": 0.6258771179188772, "grad_norm": 25.978158950805664, "learning_rate": 2.082715345122647e-05, "loss": 2.3646, "step": 3657 }, { "epoch": 0.6260482628786582, "grad_norm": 45.43318176269531, "learning_rate": 2.0832857957786653e-05, "loss": 2.9491, "step": 3658 }, { "epoch": 0.6262194078384392, "grad_norm": 25.172359466552734, "learning_rate": 2.0838562464346836e-05, "loss": 3.1014, "step": 3659 }, { "epoch": 0.6263905527982201, "grad_norm": 31.28904914855957, "learning_rate": 2.0844266970907016e-05, "loss": 3.1078, "step": 3660 }, { "epoch": 0.6265616977580011, "grad_norm": 28.047880172729492, "learning_rate": 2.08499714774672e-05, "loss": 3.2356, "step": 3661 }, { "epoch": 0.626732842717782, "grad_norm": 14.974970817565918, "learning_rate": 2.0855675984027383e-05, "loss": 1.3538, "step": 3662 }, { "epoch": 0.626903987677563, "grad_norm": 1.7350009679794312, "learning_rate": 2.0861380490587566e-05, "loss": 0.3083, "step": 3663 }, { 
"epoch": 0.6270751326373438, "grad_norm": 24.56871795654297, "learning_rate": 2.0867084997147746e-05, "loss": 2.5915, "step": 3664 }, { "epoch": 0.6272462775971248, "grad_norm": 24.486120223999023, "learning_rate": 2.087278950370793e-05, "loss": 2.461, "step": 3665 }, { "epoch": 0.6274174225569057, "grad_norm": 24.524600982666016, "learning_rate": 2.0878494010268113e-05, "loss": 2.5748, "step": 3666 }, { "epoch": 0.6275885675166867, "grad_norm": 5.067863464355469, "learning_rate": 2.0884198516828293e-05, "loss": 0.5522, "step": 3667 }, { "epoch": 0.6277597124764676, "grad_norm": 15.868297576904297, "learning_rate": 2.0889903023388477e-05, "loss": 1.6833, "step": 3668 }, { "epoch": 0.6279308574362485, "grad_norm": 25.489429473876953, "learning_rate": 2.089560752994866e-05, "loss": 2.5381, "step": 3669 }, { "epoch": 0.6281020023960294, "grad_norm": 26.983837127685547, "learning_rate": 2.0901312036508843e-05, "loss": 3.3307, "step": 3670 }, { "epoch": 0.6282731473558104, "grad_norm": 17.707273483276367, "learning_rate": 2.0907016543069023e-05, "loss": 1.8142, "step": 3671 }, { "epoch": 0.6284442923155913, "grad_norm": 23.989248275756836, "learning_rate": 2.0912721049629207e-05, "loss": 2.5138, "step": 3672 }, { "epoch": 0.6286154372753723, "grad_norm": 24.12046241760254, "learning_rate": 2.091842555618939e-05, "loss": 2.7494, "step": 3673 }, { "epoch": 0.6287865822351532, "grad_norm": 1.9827460050582886, "learning_rate": 2.092413006274957e-05, "loss": 0.2992, "step": 3674 }, { "epoch": 0.6289577271949341, "grad_norm": 15.272665977478027, "learning_rate": 2.0929834569309753e-05, "loss": 1.5809, "step": 3675 }, { "epoch": 0.629128872154715, "grad_norm": 7.758640289306641, "learning_rate": 2.093553907586994e-05, "loss": 0.5765, "step": 3676 }, { "epoch": 0.629300017114496, "grad_norm": 29.360862731933594, "learning_rate": 2.094124358243012e-05, "loss": 3.1532, "step": 3677 }, { "epoch": 0.6294711620742769, "grad_norm": 2.6255311965942383, "learning_rate": 
2.0946948088990304e-05, "loss": 0.3356, "step": 3678 }, { "epoch": 0.6296423070340579, "grad_norm": 31.4219970703125, "learning_rate": 2.0952652595550487e-05, "loss": 6.6895, "step": 3679 }, { "epoch": 0.6298134519938388, "grad_norm": 26.191577911376953, "learning_rate": 2.095835710211067e-05, "loss": 2.5616, "step": 3680 }, { "epoch": 0.6299845969536197, "grad_norm": 22.00040054321289, "learning_rate": 2.096406160867085e-05, "loss": 2.0715, "step": 3681 }, { "epoch": 0.6301557419134006, "grad_norm": 18.956966400146484, "learning_rate": 2.0969766115231034e-05, "loss": 1.9574, "step": 3682 }, { "epoch": 0.6303268868731816, "grad_norm": 27.760032653808594, "learning_rate": 2.0975470621791217e-05, "loss": 3.7785, "step": 3683 }, { "epoch": 0.6304980318329625, "grad_norm": 10.644538879394531, "learning_rate": 2.0981175128351397e-05, "loss": 0.8287, "step": 3684 }, { "epoch": 0.6306691767927435, "grad_norm": 51.96141815185547, "learning_rate": 2.098687963491158e-05, "loss": 2.4722, "step": 3685 }, { "epoch": 0.6308403217525244, "grad_norm": 7.464876174926758, "learning_rate": 2.0992584141471764e-05, "loss": 0.6759, "step": 3686 }, { "epoch": 0.6310114667123053, "grad_norm": 18.195411682128906, "learning_rate": 2.0998288648031947e-05, "loss": 1.814, "step": 3687 }, { "epoch": 0.6311826116720862, "grad_norm": 35.01757049560547, "learning_rate": 2.1003993154592127e-05, "loss": 4.6493, "step": 3688 }, { "epoch": 0.6313537566318672, "grad_norm": 27.28526496887207, "learning_rate": 2.100969766115231e-05, "loss": 2.6136, "step": 3689 }, { "epoch": 0.6315249015916481, "grad_norm": 10.132340431213379, "learning_rate": 2.1015402167712494e-05, "loss": 1.4228, "step": 3690 }, { "epoch": 0.6316960465514291, "grad_norm": 29.740331649780273, "learning_rate": 2.1021106674272674e-05, "loss": 3.4955, "step": 3691 }, { "epoch": 0.63186719151121, "grad_norm": 6.783731937408447, "learning_rate": 2.1026811180832857e-05, "loss": 0.589, "step": 3692 }, { "epoch": 0.632038336470991, 
"grad_norm": 26.901226043701172, "learning_rate": 2.103251568739304e-05, "loss": 2.8409, "step": 3693 }, { "epoch": 0.6322094814307718, "grad_norm": 67.48046112060547, "learning_rate": 2.1038220193953224e-05, "loss": 2.4784, "step": 3694 }, { "epoch": 0.6323806263905528, "grad_norm": 16.813676834106445, "learning_rate": 2.1043924700513404e-05, "loss": 1.3682, "step": 3695 }, { "epoch": 0.6325517713503337, "grad_norm": 27.411855697631836, "learning_rate": 2.1049629207073587e-05, "loss": 2.8456, "step": 3696 }, { "epoch": 0.6327229163101147, "grad_norm": 73.62898254394531, "learning_rate": 2.105533371363377e-05, "loss": 3.2879, "step": 3697 }, { "epoch": 0.6328940612698956, "grad_norm": 22.297090530395508, "learning_rate": 2.106103822019395e-05, "loss": 2.3233, "step": 3698 }, { "epoch": 0.6330652062296765, "grad_norm": 24.923654556274414, "learning_rate": 2.1066742726754138e-05, "loss": 2.1826, "step": 3699 }, { "epoch": 0.6332363511894574, "grad_norm": 20.588891983032227, "learning_rate": 2.107244723331432e-05, "loss": 2.0226, "step": 3700 }, { "epoch": 0.6334074961492384, "grad_norm": 5.975876331329346, "learning_rate": 2.1078151739874504e-05, "loss": 0.6341, "step": 3701 }, { "epoch": 0.6335786411090193, "grad_norm": 21.88986587524414, "learning_rate": 2.1083856246434684e-05, "loss": 2.1575, "step": 3702 }, { "epoch": 0.6337497860688003, "grad_norm": 19.65184211730957, "learning_rate": 2.1089560752994868e-05, "loss": 1.663, "step": 3703 }, { "epoch": 0.6339209310285812, "grad_norm": 30.190269470214844, "learning_rate": 2.109526525955505e-05, "loss": 3.6871, "step": 3704 }, { "epoch": 0.6340920759883621, "grad_norm": 8.569622993469238, "learning_rate": 2.110096976611523e-05, "loss": 0.9122, "step": 3705 }, { "epoch": 0.634263220948143, "grad_norm": 59.63459777832031, "learning_rate": 2.1106674272675414e-05, "loss": 2.3494, "step": 3706 }, { "epoch": 0.634434365907924, "grad_norm": 26.309602737426758, "learning_rate": 2.1112378779235598e-05, "loss": 3.1797, "step": 
3707 }, { "epoch": 0.6346055108677049, "grad_norm": 28.47893714904785, "learning_rate": 2.1118083285795778e-05, "loss": 3.2675, "step": 3708 }, { "epoch": 0.6347766558274859, "grad_norm": 24.453359603881836, "learning_rate": 2.112378779235596e-05, "loss": 2.5852, "step": 3709 }, { "epoch": 0.6349478007872669, "grad_norm": 24.291488647460938, "learning_rate": 2.1129492298916145e-05, "loss": 2.3409, "step": 3710 }, { "epoch": 0.6351189457470477, "grad_norm": 26.9472599029541, "learning_rate": 2.1135196805476328e-05, "loss": 3.1146, "step": 3711 }, { "epoch": 0.6352900907068287, "grad_norm": 20.753297805786133, "learning_rate": 2.1140901312036508e-05, "loss": 1.8245, "step": 3712 }, { "epoch": 0.6354612356666096, "grad_norm": 11.65485668182373, "learning_rate": 2.114660581859669e-05, "loss": 0.8492, "step": 3713 }, { "epoch": 0.6356323806263906, "grad_norm": 18.369417190551758, "learning_rate": 2.1152310325156875e-05, "loss": 1.8056, "step": 3714 }, { "epoch": 0.6358035255861715, "grad_norm": 3.041557788848877, "learning_rate": 2.1158014831717055e-05, "loss": 0.3459, "step": 3715 }, { "epoch": 0.6359746705459525, "grad_norm": 31.116910934448242, "learning_rate": 2.1163719338277238e-05, "loss": 2.4418, "step": 3716 }, { "epoch": 0.6361458155057333, "grad_norm": 26.295557022094727, "learning_rate": 2.116942384483742e-05, "loss": 1.8444, "step": 3717 }, { "epoch": 0.6363169604655143, "grad_norm": 25.38450813293457, "learning_rate": 2.1175128351397605e-05, "loss": 2.2447, "step": 3718 }, { "epoch": 0.6364881054252952, "grad_norm": 25.307218551635742, "learning_rate": 2.1180832857957785e-05, "loss": 2.5005, "step": 3719 }, { "epoch": 0.6366592503850762, "grad_norm": 2.224104642868042, "learning_rate": 2.1186537364517968e-05, "loss": 0.3241, "step": 3720 }, { "epoch": 0.6368303953448571, "grad_norm": 17.863842010498047, "learning_rate": 2.119224187107815e-05, "loss": 1.8059, "step": 3721 }, { "epoch": 0.6370015403046381, "grad_norm": 143.09255981445312, "learning_rate": 
2.1197946377638335e-05, "loss": 7.8615, "step": 3722 }, { "epoch": 0.637172685264419, "grad_norm": 20.51776695251465, "learning_rate": 2.120365088419852e-05, "loss": 1.9465, "step": 3723 }, { "epoch": 0.6373438302241999, "grad_norm": 19.772676467895508, "learning_rate": 2.1209355390758702e-05, "loss": 1.874, "step": 3724 }, { "epoch": 0.6375149751839808, "grad_norm": 29.25998306274414, "learning_rate": 2.1215059897318885e-05, "loss": 3.1729, "step": 3725 }, { "epoch": 0.6376861201437618, "grad_norm": 68.94001770019531, "learning_rate": 2.1220764403879065e-05, "loss": 7.3574, "step": 3726 }, { "epoch": 0.6378572651035427, "grad_norm": 26.3350887298584, "learning_rate": 2.122646891043925e-05, "loss": 3.0566, "step": 3727 }, { "epoch": 0.6380284100633237, "grad_norm": 1.6111328601837158, "learning_rate": 2.1232173416999432e-05, "loss": 0.2934, "step": 3728 }, { "epoch": 0.6381995550231045, "grad_norm": 20.667644500732422, "learning_rate": 2.1237877923559612e-05, "loss": 2.0751, "step": 3729 }, { "epoch": 0.6383706999828855, "grad_norm": 14.264472961425781, "learning_rate": 2.1243582430119795e-05, "loss": 1.0027, "step": 3730 }, { "epoch": 0.6385418449426664, "grad_norm": 22.407548904418945, "learning_rate": 2.124928693667998e-05, "loss": 1.9791, "step": 3731 }, { "epoch": 0.6387129899024474, "grad_norm": 31.578723907470703, "learning_rate": 2.1254991443240162e-05, "loss": 2.7384, "step": 3732 }, { "epoch": 0.6388841348622283, "grad_norm": 2.103879690170288, "learning_rate": 2.1260695949800342e-05, "loss": 0.312, "step": 3733 }, { "epoch": 0.6390552798220093, "grad_norm": 68.59303283691406, "learning_rate": 2.1266400456360525e-05, "loss": 7.0889, "step": 3734 }, { "epoch": 0.6392264247817901, "grad_norm": 7.920656681060791, "learning_rate": 2.127210496292071e-05, "loss": 1.09, "step": 3735 }, { "epoch": 0.6393975697415711, "grad_norm": 29.83785057067871, "learning_rate": 2.127780946948089e-05, "loss": 2.6416, "step": 3736 }, { "epoch": 0.639568714701352, "grad_norm": 
22.06658172607422, "learning_rate": 2.1283513976041072e-05, "loss": 2.3255, "step": 3737 }, { "epoch": 0.639739859661133, "grad_norm": 18.649507522583008, "learning_rate": 2.1289218482601255e-05, "loss": 1.5747, "step": 3738 }, { "epoch": 0.6399110046209139, "grad_norm": 25.857921600341797, "learning_rate": 2.129492298916144e-05, "loss": 2.3616, "step": 3739 }, { "epoch": 0.6400821495806949, "grad_norm": 27.74761390686035, "learning_rate": 2.130062749572162e-05, "loss": 3.559, "step": 3740 }, { "epoch": 0.6402532945404757, "grad_norm": 42.965763092041016, "learning_rate": 2.1306332002281802e-05, "loss": 6.2665, "step": 3741 }, { "epoch": 0.6404244395002567, "grad_norm": 13.754633903503418, "learning_rate": 2.1312036508841986e-05, "loss": 0.9964, "step": 3742 }, { "epoch": 0.6405955844600376, "grad_norm": 30.93828010559082, "learning_rate": 2.1317741015402166e-05, "loss": 2.1571, "step": 3743 }, { "epoch": 0.6407667294198186, "grad_norm": 13.315516471862793, "learning_rate": 2.132344552196235e-05, "loss": 0.9772, "step": 3744 }, { "epoch": 0.6409378743795995, "grad_norm": 25.19696617126465, "learning_rate": 2.1329150028522536e-05, "loss": 2.2795, "step": 3745 }, { "epoch": 0.6411090193393805, "grad_norm": 19.73836326599121, "learning_rate": 2.1334854535082716e-05, "loss": 2.1502, "step": 3746 }, { "epoch": 0.6412801642991613, "grad_norm": 13.088768005371094, "learning_rate": 2.13405590416429e-05, "loss": 0.6967, "step": 3747 }, { "epoch": 0.6414513092589423, "grad_norm": 8.49268627166748, "learning_rate": 2.1346263548203082e-05, "loss": 0.8886, "step": 3748 }, { "epoch": 0.6416224542187232, "grad_norm": 14.05295181274414, "learning_rate": 2.1351968054763266e-05, "loss": 1.1393, "step": 3749 }, { "epoch": 0.6417935991785042, "grad_norm": 35.74152755737305, "learning_rate": 2.1357672561323446e-05, "loss": 3.8807, "step": 3750 }, { "epoch": 0.6419647441382851, "grad_norm": 12.692075729370117, "learning_rate": 2.136337706788363e-05, "loss": 0.9175, "step": 3751 }, { 
"epoch": 0.6421358890980661, "grad_norm": 3.9487550258636475, "learning_rate": 2.1369081574443813e-05, "loss": 0.3691, "step": 3752 }, { "epoch": 0.642307034057847, "grad_norm": 25.308374404907227, "learning_rate": 2.1374786081003993e-05, "loss": 3.566, "step": 3753 }, { "epoch": 0.6424781790176279, "grad_norm": 27.15434455871582, "learning_rate": 2.1380490587564176e-05, "loss": 2.8604, "step": 3754 }, { "epoch": 0.6426493239774088, "grad_norm": 28.912559509277344, "learning_rate": 2.138619509412436e-05, "loss": 2.7714, "step": 3755 }, { "epoch": 0.6428204689371898, "grad_norm": 185.9123992919922, "learning_rate": 2.1391899600684543e-05, "loss": 11.7502, "step": 3756 }, { "epoch": 0.6429916138969707, "grad_norm": 33.01267623901367, "learning_rate": 2.1397604107244723e-05, "loss": 1.9282, "step": 3757 }, { "epoch": 0.6431627588567517, "grad_norm": 32.01244354248047, "learning_rate": 2.1403308613804906e-05, "loss": 3.0304, "step": 3758 }, { "epoch": 0.6433339038165325, "grad_norm": 26.142648696899414, "learning_rate": 2.140901312036509e-05, "loss": 2.5164, "step": 3759 }, { "epoch": 0.6435050487763135, "grad_norm": 24.568946838378906, "learning_rate": 2.141471762692527e-05, "loss": 2.3979, "step": 3760 }, { "epoch": 0.6436761937360945, "grad_norm": 26.051137924194336, "learning_rate": 2.1420422133485453e-05, "loss": 3.0876, "step": 3761 }, { "epoch": 0.6438473386958754, "grad_norm": 17.334243774414062, "learning_rate": 2.1426126640045636e-05, "loss": 1.4332, "step": 3762 }, { "epoch": 0.6440184836556564, "grad_norm": 29.278783798217773, "learning_rate": 2.143183114660582e-05, "loss": 2.8496, "step": 3763 }, { "epoch": 0.6441896286154373, "grad_norm": 24.168411254882812, "learning_rate": 2.1437535653166e-05, "loss": 2.4451, "step": 3764 }, { "epoch": 0.6443607735752183, "grad_norm": 31.55498504638672, "learning_rate": 2.1443240159726183e-05, "loss": 2.0817, "step": 3765 }, { "epoch": 0.6445319185349991, "grad_norm": 32.670005798339844, "learning_rate": 
2.1448944666286366e-05, "loss": 6.7024, "step": 3766 }, { "epoch": 0.6447030634947801, "grad_norm": 2.9258251190185547, "learning_rate": 2.1454649172846546e-05, "loss": 0.3334, "step": 3767 }, { "epoch": 0.644874208454561, "grad_norm": 12.174300193786621, "learning_rate": 2.1460353679406733e-05, "loss": 0.8169, "step": 3768 }, { "epoch": 0.645045353414342, "grad_norm": 28.00621223449707, "learning_rate": 2.1466058185966917e-05, "loss": 3.2524, "step": 3769 }, { "epoch": 0.6452164983741229, "grad_norm": 26.712377548217773, "learning_rate": 2.14717626925271e-05, "loss": 2.7371, "step": 3770 }, { "epoch": 0.6453876433339039, "grad_norm": 21.200624465942383, "learning_rate": 2.147746719908728e-05, "loss": 2.6966, "step": 3771 }, { "epoch": 0.6455587882936847, "grad_norm": 11.284048080444336, "learning_rate": 2.1483171705647463e-05, "loss": 0.7751, "step": 3772 }, { "epoch": 0.6457299332534657, "grad_norm": 11.401342391967773, "learning_rate": 2.1488876212207647e-05, "loss": 0.7674, "step": 3773 }, { "epoch": 0.6459010782132466, "grad_norm": 6.0696587562561035, "learning_rate": 2.1494580718767827e-05, "loss": 0.6054, "step": 3774 }, { "epoch": 0.6460722231730276, "grad_norm": 27.059720993041992, "learning_rate": 2.150028522532801e-05, "loss": 3.0029, "step": 3775 }, { "epoch": 0.6462433681328085, "grad_norm": 20.365650177001953, "learning_rate": 2.1505989731888193e-05, "loss": 2.0427, "step": 3776 }, { "epoch": 0.6464145130925895, "grad_norm": 6.034448146820068, "learning_rate": 2.1511694238448373e-05, "loss": 0.6441, "step": 3777 }, { "epoch": 0.6465856580523703, "grad_norm": 23.394229888916016, "learning_rate": 2.1517398745008557e-05, "loss": 2.4589, "step": 3778 }, { "epoch": 0.6467568030121513, "grad_norm": 46.16388702392578, "learning_rate": 2.152310325156874e-05, "loss": 7.0883, "step": 3779 }, { "epoch": 0.6469279479719322, "grad_norm": 23.118371963500977, "learning_rate": 2.1528807758128924e-05, "loss": 2.5132, "step": 3780 }, { "epoch": 0.6470990929317132, 
"grad_norm": 29.06417465209961, "learning_rate": 2.1534512264689103e-05, "loss": 2.967, "step": 3781 }, { "epoch": 0.6472702378914941, "grad_norm": 4.700016498565674, "learning_rate": 2.1540216771249287e-05, "loss": 0.3476, "step": 3782 }, { "epoch": 0.6474413828512751, "grad_norm": 9.841592788696289, "learning_rate": 2.154592127780947e-05, "loss": 0.6132, "step": 3783 }, { "epoch": 0.6476125278110559, "grad_norm": 6.57474946975708, "learning_rate": 2.155162578436965e-05, "loss": 0.5898, "step": 3784 }, { "epoch": 0.6477836727708369, "grad_norm": 10.39101791381836, "learning_rate": 2.1557330290929834e-05, "loss": 0.9835, "step": 3785 }, { "epoch": 0.6479548177306178, "grad_norm": 23.276077270507812, "learning_rate": 2.1563034797490017e-05, "loss": 2.6405, "step": 3786 }, { "epoch": 0.6481259626903988, "grad_norm": 25.941986083984375, "learning_rate": 2.15687393040502e-05, "loss": 2.89, "step": 3787 }, { "epoch": 0.6482971076501797, "grad_norm": 32.4000129699707, "learning_rate": 2.157444381061038e-05, "loss": 3.63, "step": 3788 }, { "epoch": 0.6484682526099607, "grad_norm": 36.605247497558594, "learning_rate": 2.1580148317170564e-05, "loss": 6.3912, "step": 3789 }, { "epoch": 0.6486393975697415, "grad_norm": 31.485267639160156, "learning_rate": 2.1585852823730747e-05, "loss": 3.3366, "step": 3790 }, { "epoch": 0.6488105425295225, "grad_norm": 23.93595314025879, "learning_rate": 2.159155733029093e-05, "loss": 2.4522, "step": 3791 }, { "epoch": 0.6489816874893034, "grad_norm": 25.628398895263672, "learning_rate": 2.1597261836851114e-05, "loss": 3.0389, "step": 3792 }, { "epoch": 0.6491528324490844, "grad_norm": 25.61122703552246, "learning_rate": 2.1602966343411297e-05, "loss": 2.1422, "step": 3793 }, { "epoch": 0.6493239774088653, "grad_norm": 26.866369247436523, "learning_rate": 2.160867084997148e-05, "loss": 2.8132, "step": 3794 }, { "epoch": 0.6494951223686463, "grad_norm": 60.774818420410156, "learning_rate": 2.161437535653166e-05, "loss": 6.1755, "step": 3795 
}, { "epoch": 0.6496662673284271, "grad_norm": 12.326183319091797, "learning_rate": 2.1620079863091844e-05, "loss": 0.839, "step": 3796 }, { "epoch": 0.6498374122882081, "grad_norm": 22.0472354888916, "learning_rate": 2.1625784369652027e-05, "loss": 2.4891, "step": 3797 }, { "epoch": 0.650008557247989, "grad_norm": 8.49109935760498, "learning_rate": 2.1631488876212207e-05, "loss": 0.7858, "step": 3798 }, { "epoch": 0.65017970220777, "grad_norm": 46.285579681396484, "learning_rate": 2.163719338277239e-05, "loss": 2.7199, "step": 3799 }, { "epoch": 0.6503508471675509, "grad_norm": 16.816619873046875, "learning_rate": 2.1642897889332574e-05, "loss": 1.5917, "step": 3800 }, { "epoch": 0.6505219921273319, "grad_norm": 28.71768569946289, "learning_rate": 2.1648602395892758e-05, "loss": 3.4323, "step": 3801 }, { "epoch": 0.6506931370871127, "grad_norm": 23.60746192932129, "learning_rate": 2.1654306902452938e-05, "loss": 2.0654, "step": 3802 }, { "epoch": 0.6508642820468937, "grad_norm": 26.327360153198242, "learning_rate": 2.166001140901312e-05, "loss": 1.8876, "step": 3803 }, { "epoch": 0.6510354270066746, "grad_norm": 44.482337951660156, "learning_rate": 2.1665715915573304e-05, "loss": 6.5739, "step": 3804 }, { "epoch": 0.6512065719664556, "grad_norm": 27.297197341918945, "learning_rate": 2.1671420422133484e-05, "loss": 2.1462, "step": 3805 }, { "epoch": 0.6513777169262365, "grad_norm": 11.890837669372559, "learning_rate": 2.1677124928693668e-05, "loss": 0.7348, "step": 3806 }, { "epoch": 0.6515488618860175, "grad_norm": 27.35932731628418, "learning_rate": 2.168282943525385e-05, "loss": 2.9561, "step": 3807 }, { "epoch": 0.6517200068457983, "grad_norm": 25.932842254638672, "learning_rate": 2.1688533941814034e-05, "loss": 2.5524, "step": 3808 }, { "epoch": 0.6518911518055793, "grad_norm": 7.344489574432373, "learning_rate": 2.1694238448374214e-05, "loss": 0.8045, "step": 3809 }, { "epoch": 0.6520622967653602, "grad_norm": 24.049985885620117, "learning_rate": 
2.1699942954934398e-05, "loss": 2.3474, "step": 3810 }, { "epoch": 0.6522334417251412, "grad_norm": 25.154258728027344, "learning_rate": 2.170564746149458e-05, "loss": 2.8768, "step": 3811 }, { "epoch": 0.6524045866849222, "grad_norm": 35.475502014160156, "learning_rate": 2.171135196805476e-05, "loss": 5.6263, "step": 3812 }, { "epoch": 0.6525757316447031, "grad_norm": 18.898576736450195, "learning_rate": 2.1717056474614945e-05, "loss": 2.0987, "step": 3813 }, { "epoch": 0.652746876604484, "grad_norm": 64.42694091796875, "learning_rate": 2.172276098117513e-05, "loss": 1.9397, "step": 3814 }, { "epoch": 0.6529180215642649, "grad_norm": 51.23388671875, "learning_rate": 2.172846548773531e-05, "loss": 2.187, "step": 3815 }, { "epoch": 0.6530891665240459, "grad_norm": 24.042943954467773, "learning_rate": 2.1734169994295495e-05, "loss": 2.9904, "step": 3816 }, { "epoch": 0.6532603114838268, "grad_norm": 4.368581295013428, "learning_rate": 2.1739874500855678e-05, "loss": 0.442, "step": 3817 }, { "epoch": 0.6534314564436078, "grad_norm": 22.971675872802734, "learning_rate": 2.174557900741586e-05, "loss": 2.3311, "step": 3818 }, { "epoch": 0.6536026014033887, "grad_norm": 23.986604690551758, "learning_rate": 2.175128351397604e-05, "loss": 2.6135, "step": 3819 }, { "epoch": 0.6537737463631696, "grad_norm": 28.69915771484375, "learning_rate": 2.1756988020536225e-05, "loss": 3.5022, "step": 3820 }, { "epoch": 0.6539448913229505, "grad_norm": 8.601239204406738, "learning_rate": 2.1762692527096408e-05, "loss": 0.6592, "step": 3821 }, { "epoch": 0.6541160362827315, "grad_norm": 22.482227325439453, "learning_rate": 2.1768397033656588e-05, "loss": 2.4048, "step": 3822 }, { "epoch": 0.6542871812425124, "grad_norm": 25.31351089477539, "learning_rate": 2.177410154021677e-05, "loss": 3.4277, "step": 3823 }, { "epoch": 0.6544583262022934, "grad_norm": 20.58570671081543, "learning_rate": 2.1779806046776955e-05, "loss": 2.1318, "step": 3824 }, { "epoch": 0.6546294711620743, "grad_norm": 
15.284663200378418, "learning_rate": 2.1785510553337138e-05, "loss": 1.6332, "step": 3825 }, { "epoch": 0.6548006161218553, "grad_norm": 22.946290969848633, "learning_rate": 2.1791215059897318e-05, "loss": 2.5015, "step": 3826 }, { "epoch": 0.6549717610816361, "grad_norm": 49.23842239379883, "learning_rate": 2.17969195664575e-05, "loss": 7.6205, "step": 3827 }, { "epoch": 0.6551429060414171, "grad_norm": 16.11168670654297, "learning_rate": 2.1802624073017685e-05, "loss": 1.5222, "step": 3828 }, { "epoch": 0.655314051001198, "grad_norm": 25.72747039794922, "learning_rate": 2.1808328579577865e-05, "loss": 2.7138, "step": 3829 }, { "epoch": 0.655485195960979, "grad_norm": 14.393827438354492, "learning_rate": 2.181403308613805e-05, "loss": 1.1036, "step": 3830 }, { "epoch": 0.6556563409207599, "grad_norm": 27.66619300842285, "learning_rate": 2.1819737592698232e-05, "loss": 2.5863, "step": 3831 }, { "epoch": 0.6558274858805409, "grad_norm": 34.8533935546875, "learning_rate": 2.1825442099258415e-05, "loss": 6.3297, "step": 3832 }, { "epoch": 0.6559986308403217, "grad_norm": 32.486045837402344, "learning_rate": 2.1831146605818595e-05, "loss": 3.5127, "step": 3833 }, { "epoch": 0.6561697758001027, "grad_norm": 22.248271942138672, "learning_rate": 2.183685111237878e-05, "loss": 2.0005, "step": 3834 }, { "epoch": 0.6563409207598836, "grad_norm": 10.389534950256348, "learning_rate": 2.1842555618938962e-05, "loss": 0.7072, "step": 3835 }, { "epoch": 0.6565120657196646, "grad_norm": 11.556964874267578, "learning_rate": 2.1848260125499145e-05, "loss": 0.8002, "step": 3836 }, { "epoch": 0.6566832106794455, "grad_norm": 41.708778381347656, "learning_rate": 2.185396463205933e-05, "loss": 6.6155, "step": 3837 }, { "epoch": 0.6568543556392265, "grad_norm": 26.74636459350586, "learning_rate": 2.1859669138619512e-05, "loss": 3.0992, "step": 3838 }, { "epoch": 0.6570255005990073, "grad_norm": 24.822227478027344, "learning_rate": 2.1865373645179695e-05, "loss": 1.993, "step": 3839 }, { 
"epoch": 0.6571966455587883, "grad_norm": 28.681447982788086, "learning_rate": 2.1871078151739875e-05, "loss": 2.4919, "step": 3840 }, { "epoch": 0.6573677905185692, "grad_norm": 24.745319366455078, "learning_rate": 2.187678265830006e-05, "loss": 2.4664, "step": 3841 }, { "epoch": 0.6575389354783502, "grad_norm": 21.362070083618164, "learning_rate": 2.1882487164860242e-05, "loss": 1.9891, "step": 3842 }, { "epoch": 0.6577100804381311, "grad_norm": 5.6285481452941895, "learning_rate": 2.1888191671420422e-05, "loss": 0.5344, "step": 3843 }, { "epoch": 0.657881225397912, "grad_norm": 22.420122146606445, "learning_rate": 2.1893896177980606e-05, "loss": 2.1797, "step": 3844 }, { "epoch": 0.6580523703576929, "grad_norm": 1.2516911029815674, "learning_rate": 2.189960068454079e-05, "loss": 0.284, "step": 3845 }, { "epoch": 0.6582235153174739, "grad_norm": 16.150665283203125, "learning_rate": 2.190530519110097e-05, "loss": 1.2482, "step": 3846 }, { "epoch": 0.6583946602772548, "grad_norm": 29.241182327270508, "learning_rate": 2.1911009697661152e-05, "loss": 3.0362, "step": 3847 }, { "epoch": 0.6585658052370358, "grad_norm": 12.874473571777344, "learning_rate": 2.1916714204221336e-05, "loss": 1.3586, "step": 3848 }, { "epoch": 0.6587369501968167, "grad_norm": 25.05373764038086, "learning_rate": 2.192241871078152e-05, "loss": 2.9891, "step": 3849 }, { "epoch": 0.6589080951565977, "grad_norm": 5.338008880615234, "learning_rate": 2.19281232173417e-05, "loss": 0.6053, "step": 3850 }, { "epoch": 0.6590792401163785, "grad_norm": 24.934619903564453, "learning_rate": 2.1933827723901882e-05, "loss": 3.4717, "step": 3851 }, { "epoch": 0.6592503850761595, "grad_norm": 18.92520523071289, "learning_rate": 2.1939532230462066e-05, "loss": 1.792, "step": 3852 }, { "epoch": 0.6594215300359404, "grad_norm": 22.526769638061523, "learning_rate": 2.1945236737022246e-05, "loss": 2.2758, "step": 3853 }, { "epoch": 0.6595926749957214, "grad_norm": 13.877473831176758, "learning_rate": 
2.195094124358243e-05, "loss": 0.9918, "step": 3854 }, { "epoch": 0.6597638199555023, "grad_norm": 26.623685836791992, "learning_rate": 2.1956645750142613e-05, "loss": 3.1529, "step": 3855 }, { "epoch": 0.6599349649152833, "grad_norm": 9.52644157409668, "learning_rate": 2.1962350256702796e-05, "loss": 1.4529, "step": 3856 }, { "epoch": 0.6601061098750641, "grad_norm": 27.445514678955078, "learning_rate": 2.1968054763262976e-05, "loss": 3.2014, "step": 3857 }, { "epoch": 0.6602772548348451, "grad_norm": 26.250980377197266, "learning_rate": 2.197375926982316e-05, "loss": 2.4078, "step": 3858 }, { "epoch": 0.660448399794626, "grad_norm": 13.75600814819336, "learning_rate": 2.1979463776383346e-05, "loss": 1.2816, "step": 3859 }, { "epoch": 0.660619544754407, "grad_norm": 39.159141540527344, "learning_rate": 2.1985168282943526e-05, "loss": 7.109, "step": 3860 }, { "epoch": 0.6607906897141879, "grad_norm": 29.373641967773438, "learning_rate": 2.199087278950371e-05, "loss": 3.3187, "step": 3861 }, { "epoch": 0.6609618346739689, "grad_norm": 21.516931533813477, "learning_rate": 2.1996577296063893e-05, "loss": 2.3298, "step": 3862 }, { "epoch": 0.6611329796337498, "grad_norm": 95.70633697509766, "learning_rate": 2.2002281802624076e-05, "loss": 7.5026, "step": 3863 }, { "epoch": 0.6613041245935307, "grad_norm": 34.55537796020508, "learning_rate": 2.2007986309184256e-05, "loss": 4.4655, "step": 3864 }, { "epoch": 0.6614752695533117, "grad_norm": 6.342493534088135, "learning_rate": 2.201369081574444e-05, "loss": 0.7267, "step": 3865 }, { "epoch": 0.6616464145130926, "grad_norm": 21.994108200073242, "learning_rate": 2.2019395322304623e-05, "loss": 1.894, "step": 3866 }, { "epoch": 0.6618175594728736, "grad_norm": 62.81085205078125, "learning_rate": 2.2025099828864803e-05, "loss": 2.5491, "step": 3867 }, { "epoch": 0.6619887044326545, "grad_norm": 13.31482219696045, "learning_rate": 2.2030804335424986e-05, "loss": 1.2874, "step": 3868 }, { "epoch": 0.6621598493924354, 
"grad_norm": 37.990882873535156, "learning_rate": 2.203650884198517e-05, "loss": 6.5223, "step": 3869 }, { "epoch": 0.6623309943522163, "grad_norm": 7.525701522827148, "learning_rate": 2.2042213348545353e-05, "loss": 0.7646, "step": 3870 }, { "epoch": 0.6625021393119973, "grad_norm": 28.383960723876953, "learning_rate": 2.2047917855105533e-05, "loss": 3.8601, "step": 3871 }, { "epoch": 0.6626732842717782, "grad_norm": 25.914175033569336, "learning_rate": 2.2053622361665716e-05, "loss": 2.8097, "step": 3872 }, { "epoch": 0.6628444292315592, "grad_norm": 25.789932250976562, "learning_rate": 2.20593268682259e-05, "loss": 2.4306, "step": 3873 }, { "epoch": 0.66301557419134, "grad_norm": 25.461977005004883, "learning_rate": 2.206503137478608e-05, "loss": 3.0559, "step": 3874 }, { "epoch": 0.663186719151121, "grad_norm": 21.75908088684082, "learning_rate": 2.2070735881346263e-05, "loss": 2.186, "step": 3875 }, { "epoch": 0.6633578641109019, "grad_norm": 26.937217712402344, "learning_rate": 2.2076440387906447e-05, "loss": 3.0092, "step": 3876 }, { "epoch": 0.6635290090706829, "grad_norm": 23.69822883605957, "learning_rate": 2.2082144894466627e-05, "loss": 2.4399, "step": 3877 }, { "epoch": 0.6637001540304638, "grad_norm": 2.651796579360962, "learning_rate": 2.208784940102681e-05, "loss": 0.3089, "step": 3878 }, { "epoch": 0.6638712989902448, "grad_norm": 13.779840469360352, "learning_rate": 2.2093553907586993e-05, "loss": 1.4293, "step": 3879 }, { "epoch": 0.6640424439500257, "grad_norm": 9.433732032775879, "learning_rate": 2.2099258414147177e-05, "loss": 0.738, "step": 3880 }, { "epoch": 0.6642135889098066, "grad_norm": 8.053833961486816, "learning_rate": 2.2104962920707357e-05, "loss": 0.8599, "step": 3881 }, { "epoch": 0.6643847338695875, "grad_norm": 8.789999008178711, "learning_rate": 2.2110667427267543e-05, "loss": 1.2145, "step": 3882 }, { "epoch": 0.6645558788293685, "grad_norm": 35.39924240112305, "learning_rate": 2.2116371933827727e-05, "loss": 3.1608, "step": 
3883 }, { "epoch": 0.6647270237891494, "grad_norm": 123.84489440917969, "learning_rate": 2.2122076440387907e-05, "loss": 6.8157, "step": 3884 }, { "epoch": 0.6648981687489304, "grad_norm": 24.360124588012695, "learning_rate": 2.212778094694809e-05, "loss": 2.1383, "step": 3885 }, { "epoch": 0.6650693137087113, "grad_norm": 26.01473045349121, "learning_rate": 2.2133485453508274e-05, "loss": 3.114, "step": 3886 }, { "epoch": 0.6652404586684922, "grad_norm": 27.838552474975586, "learning_rate": 2.2139189960068457e-05, "loss": 2.8416, "step": 3887 }, { "epoch": 0.6654116036282731, "grad_norm": 27.128395080566406, "learning_rate": 2.2144894466628637e-05, "loss": 2.7295, "step": 3888 }, { "epoch": 0.6655827485880541, "grad_norm": 20.852027893066406, "learning_rate": 2.215059897318882e-05, "loss": 2.2252, "step": 3889 }, { "epoch": 0.665753893547835, "grad_norm": 51.29511260986328, "learning_rate": 2.2156303479749004e-05, "loss": 2.4494, "step": 3890 }, { "epoch": 0.665925038507616, "grad_norm": 27.06675910949707, "learning_rate": 2.2162007986309184e-05, "loss": 2.8832, "step": 3891 }, { "epoch": 0.6660961834673969, "grad_norm": 8.248744010925293, "learning_rate": 2.2167712492869367e-05, "loss": 0.7378, "step": 3892 }, { "epoch": 0.6662673284271778, "grad_norm": 23.798690795898438, "learning_rate": 2.217341699942955e-05, "loss": 3.3288, "step": 3893 }, { "epoch": 0.6664384733869587, "grad_norm": 25.778766632080078, "learning_rate": 2.2179121505989734e-05, "loss": 3.0603, "step": 3894 }, { "epoch": 0.6666096183467397, "grad_norm": 2.8828747272491455, "learning_rate": 2.2184826012549914e-05, "loss": 0.3141, "step": 3895 }, { "epoch": 0.6667807633065206, "grad_norm": 26.101049423217773, "learning_rate": 2.2190530519110097e-05, "loss": 3.1024, "step": 3896 }, { "epoch": 0.6669519082663016, "grad_norm": 20.340776443481445, "learning_rate": 2.219623502567028e-05, "loss": 1.7807, "step": 3897 }, { "epoch": 0.6671230532260825, "grad_norm": 22.285655975341797, "learning_rate": 
2.220193953223046e-05, "loss": 1.9623, "step": 3898 }, { "epoch": 0.6672941981858634, "grad_norm": 10.816023826599121, "learning_rate": 2.2207644038790644e-05, "loss": 0.7881, "step": 3899 }, { "epoch": 0.6674653431456443, "grad_norm": 25.382898330688477, "learning_rate": 2.2213348545350827e-05, "loss": 2.2422, "step": 3900 }, { "epoch": 0.6676364881054253, "grad_norm": 18.11640167236328, "learning_rate": 2.221905305191101e-05, "loss": 1.323, "step": 3901 }, { "epoch": 0.6678076330652062, "grad_norm": 30.607837677001953, "learning_rate": 2.222475755847119e-05, "loss": 2.3077, "step": 3902 }, { "epoch": 0.6679787780249872, "grad_norm": 22.524381637573242, "learning_rate": 2.2230462065031374e-05, "loss": 2.7118, "step": 3903 }, { "epoch": 0.668149922984768, "grad_norm": 22.379953384399414, "learning_rate": 2.2236166571591557e-05, "loss": 2.0846, "step": 3904 }, { "epoch": 0.668321067944549, "grad_norm": 4.688474655151367, "learning_rate": 2.224187107815174e-05, "loss": 0.5239, "step": 3905 }, { "epoch": 0.6684922129043299, "grad_norm": 26.99576759338379, "learning_rate": 2.2247575584711924e-05, "loss": 3.4886, "step": 3906 }, { "epoch": 0.6686633578641109, "grad_norm": 29.186248779296875, "learning_rate": 2.2253280091272108e-05, "loss": 3.6048, "step": 3907 }, { "epoch": 0.6688345028238918, "grad_norm": 37.26026916503906, "learning_rate": 2.2258984597832288e-05, "loss": 6.4346, "step": 3908 }, { "epoch": 0.6690056477836728, "grad_norm": 15.596887588500977, "learning_rate": 2.226468910439247e-05, "loss": 1.5087, "step": 3909 }, { "epoch": 0.6691767927434537, "grad_norm": 22.914793014526367, "learning_rate": 2.2270393610952654e-05, "loss": 2.4031, "step": 3910 }, { "epoch": 0.6693479377032346, "grad_norm": 34.148956298828125, "learning_rate": 2.2276098117512838e-05, "loss": 3.4389, "step": 3911 }, { "epoch": 0.6695190826630155, "grad_norm": 21.66793441772461, "learning_rate": 2.2281802624073018e-05, "loss": 2.514, "step": 3912 }, { "epoch": 0.6696902276227965, 
"grad_norm": 10.826380729675293, "learning_rate": 2.22875071306332e-05, "loss": 0.8586, "step": 3913 }, { "epoch": 0.6698613725825775, "grad_norm": 25.435211181640625, "learning_rate": 2.2293211637193384e-05, "loss": 2.6889, "step": 3914 }, { "epoch": 0.6700325175423584, "grad_norm": 35.62110900878906, "learning_rate": 2.2298916143753564e-05, "loss": 6.5931, "step": 3915 }, { "epoch": 0.6702036625021394, "grad_norm": 30.739681243896484, "learning_rate": 2.2304620650313748e-05, "loss": 2.809, "step": 3916 }, { "epoch": 0.6703748074619202, "grad_norm": 3.0653045177459717, "learning_rate": 2.231032515687393e-05, "loss": 0.3283, "step": 3917 }, { "epoch": 0.6705459524217012, "grad_norm": 29.558330535888672, "learning_rate": 2.2316029663434115e-05, "loss": 2.0042, "step": 3918 }, { "epoch": 0.6707170973814821, "grad_norm": 23.827219009399414, "learning_rate": 2.2321734169994295e-05, "loss": 2.737, "step": 3919 }, { "epoch": 0.6708882423412631, "grad_norm": 33.99700927734375, "learning_rate": 2.2327438676554478e-05, "loss": 6.5747, "step": 3920 }, { "epoch": 0.671059387301044, "grad_norm": 27.55402374267578, "learning_rate": 2.233314318311466e-05, "loss": 2.6091, "step": 3921 }, { "epoch": 0.671230532260825, "grad_norm": 31.95720672607422, "learning_rate": 2.233884768967484e-05, "loss": 3.6537, "step": 3922 }, { "epoch": 0.6714016772206058, "grad_norm": 25.14667510986328, "learning_rate": 2.2344552196235025e-05, "loss": 2.6334, "step": 3923 }, { "epoch": 0.6715728221803868, "grad_norm": 23.47039794921875, "learning_rate": 2.2350256702795208e-05, "loss": 2.1698, "step": 3924 }, { "epoch": 0.6717439671401677, "grad_norm": 18.31406021118164, "learning_rate": 2.235596120935539e-05, "loss": 1.5905, "step": 3925 }, { "epoch": 0.6719151120999487, "grad_norm": 23.610937118530273, "learning_rate": 2.236166571591557e-05, "loss": 2.4129, "step": 3926 }, { "epoch": 0.6720862570597296, "grad_norm": 26.94730567932129, "learning_rate": 2.2367370222475755e-05, "loss": 2.8646, "step": 
3927 }, { "epoch": 0.6722574020195106, "grad_norm": 19.2611026763916, "learning_rate": 2.237307472903594e-05, "loss": 1.685, "step": 3928 }, { "epoch": 0.6724285469792914, "grad_norm": 7.879249095916748, "learning_rate": 2.237877923559612e-05, "loss": 0.6331, "step": 3929 }, { "epoch": 0.6725996919390724, "grad_norm": 24.44508171081543, "learning_rate": 2.2384483742156305e-05, "loss": 2.3878, "step": 3930 }, { "epoch": 0.6727708368988533, "grad_norm": 20.18474769592285, "learning_rate": 2.239018824871649e-05, "loss": 1.8989, "step": 3931 }, { "epoch": 0.6729419818586343, "grad_norm": 5.985182762145996, "learning_rate": 2.2395892755276672e-05, "loss": 0.6958, "step": 3932 }, { "epoch": 0.6731131268184152, "grad_norm": 17.770193099975586, "learning_rate": 2.240159726183685e-05, "loss": 1.5905, "step": 3933 }, { "epoch": 0.6732842717781962, "grad_norm": 28.44164276123047, "learning_rate": 2.2407301768397035e-05, "loss": 2.9128, "step": 3934 }, { "epoch": 0.673455416737977, "grad_norm": 27.433252334594727, "learning_rate": 2.241300627495722e-05, "loss": 2.608, "step": 3935 }, { "epoch": 0.673626561697758, "grad_norm": 27.8862247467041, "learning_rate": 2.24187107815174e-05, "loss": 3.3689, "step": 3936 }, { "epoch": 0.6737977066575389, "grad_norm": 1.7033040523529053, "learning_rate": 2.2424415288077582e-05, "loss": 0.3299, "step": 3937 }, { "epoch": 0.6739688516173199, "grad_norm": 24.00095558166504, "learning_rate": 2.2430119794637765e-05, "loss": 2.1404, "step": 3938 }, { "epoch": 0.6741399965771008, "grad_norm": 28.09699821472168, "learning_rate": 2.243582430119795e-05, "loss": 2.9765, "step": 3939 }, { "epoch": 0.6743111415368818, "grad_norm": 33.0010871887207, "learning_rate": 2.244152880775813e-05, "loss": 6.8094, "step": 3940 }, { "epoch": 0.6744822864966626, "grad_norm": 25.918590545654297, "learning_rate": 2.2447233314318312e-05, "loss": 2.3787, "step": 3941 }, { "epoch": 0.6746534314564436, "grad_norm": 6.518866062164307, "learning_rate": 
2.2452937820878495e-05, "loss": 0.501, "step": 3942 }, { "epoch": 0.6748245764162245, "grad_norm": 22.836181640625, "learning_rate": 2.2458642327438675e-05, "loss": 1.9666, "step": 3943 }, { "epoch": 0.6749957213760055, "grad_norm": 21.074495315551758, "learning_rate": 2.246434683399886e-05, "loss": 1.9682, "step": 3944 }, { "epoch": 0.6751668663357864, "grad_norm": 24.013696670532227, "learning_rate": 2.2470051340559042e-05, "loss": 2.5082, "step": 3945 }, { "epoch": 0.6753380112955674, "grad_norm": 19.56346893310547, "learning_rate": 2.2475755847119222e-05, "loss": 2.1205, "step": 3946 }, { "epoch": 0.6755091562553482, "grad_norm": 22.354597091674805, "learning_rate": 2.2481460353679405e-05, "loss": 2.1432, "step": 3947 }, { "epoch": 0.6756803012151292, "grad_norm": 20.16799545288086, "learning_rate": 2.248716486023959e-05, "loss": 1.7877, "step": 3948 }, { "epoch": 0.6758514461749101, "grad_norm": 31.60150146484375, "learning_rate": 2.2492869366799772e-05, "loss": 3.0757, "step": 3949 }, { "epoch": 0.6760225911346911, "grad_norm": 27.673959732055664, "learning_rate": 2.2498573873359952e-05, "loss": 3.013, "step": 3950 }, { "epoch": 0.676193736094472, "grad_norm": 20.703968048095703, "learning_rate": 2.250427837992014e-05, "loss": 2.0428, "step": 3951 }, { "epoch": 0.676364881054253, "grad_norm": 1.652134656906128, "learning_rate": 2.2509982886480322e-05, "loss": 0.2892, "step": 3952 }, { "epoch": 0.6765360260140338, "grad_norm": 4.8036017417907715, "learning_rate": 2.2515687393040502e-05, "loss": 0.3591, "step": 3953 }, { "epoch": 0.6767071709738148, "grad_norm": 1.2371207475662231, "learning_rate": 2.2521391899600686e-05, "loss": 0.2731, "step": 3954 }, { "epoch": 0.6768783159335957, "grad_norm": 34.3635368347168, "learning_rate": 2.252709640616087e-05, "loss": 6.5111, "step": 3955 }, { "epoch": 0.6770494608933767, "grad_norm": 26.6789493560791, "learning_rate": 2.2532800912721052e-05, "loss": 2.1933, "step": 3956 }, { "epoch": 0.6772206058531576, "grad_norm": 
2.4849586486816406, "learning_rate": 2.2538505419281232e-05, "loss": 0.3021, "step": 3957 }, { "epoch": 0.6773917508129386, "grad_norm": 17.35017204284668, "learning_rate": 2.2544209925841416e-05, "loss": 1.7005, "step": 3958 }, { "epoch": 0.6775628957727194, "grad_norm": 36.90235137939453, "learning_rate": 2.25499144324016e-05, "loss": 3.5547, "step": 3959 }, { "epoch": 0.6777340407325004, "grad_norm": 14.82332706451416, "learning_rate": 2.255561893896178e-05, "loss": 1.2344, "step": 3960 }, { "epoch": 0.6779051856922813, "grad_norm": 16.59126853942871, "learning_rate": 2.2561323445521963e-05, "loss": 1.6819, "step": 3961 }, { "epoch": 0.6780763306520623, "grad_norm": 35.49818420410156, "learning_rate": 2.2567027952082146e-05, "loss": 5.1844, "step": 3962 }, { "epoch": 0.6782474756118432, "grad_norm": 18.354028701782227, "learning_rate": 2.257273245864233e-05, "loss": 1.5253, "step": 3963 }, { "epoch": 0.6784186205716242, "grad_norm": 33.30420684814453, "learning_rate": 2.257843696520251e-05, "loss": 3.3261, "step": 3964 }, { "epoch": 0.6785897655314052, "grad_norm": 1.1292017698287964, "learning_rate": 2.2584141471762693e-05, "loss": 0.2504, "step": 3965 }, { "epoch": 0.678760910491186, "grad_norm": 11.09749984741211, "learning_rate": 2.2589845978322876e-05, "loss": 0.7005, "step": 3966 }, { "epoch": 0.678932055450967, "grad_norm": 26.572961807250977, "learning_rate": 2.2595550484883056e-05, "loss": 2.5559, "step": 3967 }, { "epoch": 0.6791032004107479, "grad_norm": 28.02602767944336, "learning_rate": 2.260125499144324e-05, "loss": 2.5011, "step": 3968 }, { "epoch": 0.6792743453705289, "grad_norm": 6.116679668426514, "learning_rate": 2.2606959498003423e-05, "loss": 0.6422, "step": 3969 }, { "epoch": 0.6794454903303098, "grad_norm": 49.190433502197266, "learning_rate": 2.2612664004563606e-05, "loss": 2.2484, "step": 3970 }, { "epoch": 0.6796166352900908, "grad_norm": 27.249277114868164, "learning_rate": 2.2618368511123786e-05, "loss": 2.8075, "step": 3971 }, { 
"epoch": 0.6797877802498716, "grad_norm": 9.899073600769043, "learning_rate": 2.262407301768397e-05, "loss": 0.7593, "step": 3972 }, { "epoch": 0.6799589252096526, "grad_norm": 32.11344528198242, "learning_rate": 2.2629777524244153e-05, "loss": 3.4356, "step": 3973 }, { "epoch": 0.6801300701694335, "grad_norm": 33.132877349853516, "learning_rate": 2.2635482030804336e-05, "loss": 6.1209, "step": 3974 }, { "epoch": 0.6803012151292145, "grad_norm": 23.018150329589844, "learning_rate": 2.264118653736452e-05, "loss": 2.6845, "step": 3975 }, { "epoch": 0.6804723600889954, "grad_norm": 19.093454360961914, "learning_rate": 2.2646891043924703e-05, "loss": 1.9479, "step": 3976 }, { "epoch": 0.6806435050487764, "grad_norm": 16.804319381713867, "learning_rate": 2.2652595550484883e-05, "loss": 1.7933, "step": 3977 }, { "epoch": 0.6808146500085572, "grad_norm": 14.481977462768555, "learning_rate": 2.2658300057045066e-05, "loss": 1.2585, "step": 3978 }, { "epoch": 0.6809857949683382, "grad_norm": 30.040294647216797, "learning_rate": 2.266400456360525e-05, "loss": 3.3274, "step": 3979 }, { "epoch": 0.6811569399281191, "grad_norm": 13.815556526184082, "learning_rate": 2.2669709070165433e-05, "loss": 1.3265, "step": 3980 }, { "epoch": 0.6813280848879001, "grad_norm": 6.664211273193359, "learning_rate": 2.2675413576725613e-05, "loss": 0.673, "step": 3981 }, { "epoch": 0.681499229847681, "grad_norm": 28.066905975341797, "learning_rate": 2.2681118083285797e-05, "loss": 3.1732, "step": 3982 }, { "epoch": 0.681670374807462, "grad_norm": 3.707343101501465, "learning_rate": 2.268682258984598e-05, "loss": 0.6108, "step": 3983 }, { "epoch": 0.6818415197672428, "grad_norm": 3.5951898097991943, "learning_rate": 2.269252709640616e-05, "loss": 0.3119, "step": 3984 }, { "epoch": 0.6820126647270238, "grad_norm": 34.33369064331055, "learning_rate": 2.2698231602966343e-05, "loss": 3.5062, "step": 3985 }, { "epoch": 0.6821838096868047, "grad_norm": 10.984424591064453, "learning_rate": 
2.2703936109526527e-05, "loss": 1.2835, "step": 3986 }, { "epoch": 0.6823549546465857, "grad_norm": 44.93880844116211, "learning_rate": 2.270964061608671e-05, "loss": 2.0757, "step": 3987 }, { "epoch": 0.6825260996063666, "grad_norm": 25.658374786376953, "learning_rate": 2.271534512264689e-05, "loss": 2.4198, "step": 3988 }, { "epoch": 0.6826972445661476, "grad_norm": 23.74066162109375, "learning_rate": 2.2721049629207073e-05, "loss": 2.3887, "step": 3989 }, { "epoch": 0.6828683895259284, "grad_norm": 22.6767578125, "learning_rate": 2.2726754135767257e-05, "loss": 2.0854, "step": 3990 }, { "epoch": 0.6830395344857094, "grad_norm": 13.210683822631836, "learning_rate": 2.2732458642327437e-05, "loss": 0.7998, "step": 3991 }, { "epoch": 0.6832106794454903, "grad_norm": 17.68242073059082, "learning_rate": 2.273816314888762e-05, "loss": 1.6588, "step": 3992 }, { "epoch": 0.6833818244052713, "grad_norm": 29.65629005432129, "learning_rate": 2.2743867655447804e-05, "loss": 3.4329, "step": 3993 }, { "epoch": 0.6835529693650522, "grad_norm": 23.901870727539062, "learning_rate": 2.2749572162007987e-05, "loss": 2.0216, "step": 3994 }, { "epoch": 0.6837241143248332, "grad_norm": 26.25312614440918, "learning_rate": 2.2755276668568167e-05, "loss": 2.3758, "step": 3995 }, { "epoch": 0.683895259284614, "grad_norm": 21.86573028564453, "learning_rate": 2.2760981175128354e-05, "loss": 1.8268, "step": 3996 }, { "epoch": 0.684066404244395, "grad_norm": 32.05353546142578, "learning_rate": 2.2766685681688537e-05, "loss": 2.6875, "step": 3997 }, { "epoch": 0.6842375492041759, "grad_norm": 24.166894912719727, "learning_rate": 2.2772390188248717e-05, "loss": 2.9649, "step": 3998 }, { "epoch": 0.6844086941639569, "grad_norm": 2.158492088317871, "learning_rate": 2.27780946948089e-05, "loss": 0.3136, "step": 3999 }, { "epoch": 0.6845798391237378, "grad_norm": 3.6095380783081055, "learning_rate": 2.2783799201369084e-05, "loss": 0.4003, "step": 4000 }, { "epoch": 0.6847509840835188, "grad_norm": 
15.331098556518555, "learning_rate": 2.2789503707929267e-05, "loss": 1.3303, "step": 4001 }, { "epoch": 0.6849221290432996, "grad_norm": 72.33516693115234, "learning_rate": 2.2795208214489447e-05, "loss": 2.7759, "step": 4002 }, { "epoch": 0.6850932740030806, "grad_norm": 28.182132720947266, "learning_rate": 2.280091272104963e-05, "loss": 2.6851, "step": 4003 }, { "epoch": 0.6852644189628615, "grad_norm": 28.051651000976562, "learning_rate": 2.2806617227609814e-05, "loss": 2.7711, "step": 4004 }, { "epoch": 0.6854355639226425, "grad_norm": 30.069196701049805, "learning_rate": 2.2812321734169994e-05, "loss": 3.0094, "step": 4005 }, { "epoch": 0.6856067088824234, "grad_norm": 22.24188232421875, "learning_rate": 2.2818026240730177e-05, "loss": 2.049, "step": 4006 }, { "epoch": 0.6857778538422044, "grad_norm": 26.202171325683594, "learning_rate": 2.282373074729036e-05, "loss": 2.9979, "step": 4007 }, { "epoch": 0.6859489988019852, "grad_norm": 30.185165405273438, "learning_rate": 2.2829435253850544e-05, "loss": 2.4561, "step": 4008 }, { "epoch": 0.6861201437617662, "grad_norm": 38.99590301513672, "learning_rate": 2.2835139760410724e-05, "loss": 6.2926, "step": 4009 }, { "epoch": 0.6862912887215471, "grad_norm": 22.699562072753906, "learning_rate": 2.2840844266970907e-05, "loss": 2.1231, "step": 4010 }, { "epoch": 0.6864624336813281, "grad_norm": 16.00046730041504, "learning_rate": 2.284654877353109e-05, "loss": 1.4517, "step": 4011 }, { "epoch": 0.686633578641109, "grad_norm": 2.1720409393310547, "learning_rate": 2.285225328009127e-05, "loss": 0.2857, "step": 4012 }, { "epoch": 0.68680472360089, "grad_norm": 24.013540267944336, "learning_rate": 2.2857957786651454e-05, "loss": 2.0506, "step": 4013 }, { "epoch": 0.6869758685606709, "grad_norm": 31.45075798034668, "learning_rate": 2.2863662293211638e-05, "loss": 3.625, "step": 4014 }, { "epoch": 0.6871470135204518, "grad_norm": 13.581439018249512, "learning_rate": 2.2869366799771818e-05, "loss": 1.0461, "step": 4015 }, { 
"epoch": 0.6873181584802328, "grad_norm": 4.323340892791748, "learning_rate": 2.2875071306332e-05, "loss": 0.4245, "step": 4016 }, { "epoch": 0.6874893034400137, "grad_norm": 22.576906204223633, "learning_rate": 2.2880775812892184e-05, "loss": 1.9977, "step": 4017 }, { "epoch": 0.6876604483997947, "grad_norm": 24.18263053894043, "learning_rate": 2.2886480319452368e-05, "loss": 2.257, "step": 4018 }, { "epoch": 0.6878315933595756, "grad_norm": 31.072450637817383, "learning_rate": 2.289218482601255e-05, "loss": 4.4122, "step": 4019 }, { "epoch": 0.6880027383193565, "grad_norm": 6.332727432250977, "learning_rate": 2.2897889332572734e-05, "loss": 0.5804, "step": 4020 }, { "epoch": 0.6881738832791374, "grad_norm": 28.49344825744629, "learning_rate": 2.2903593839132918e-05, "loss": 3.0976, "step": 4021 }, { "epoch": 0.6883450282389184, "grad_norm": 25.831926345825195, "learning_rate": 2.2909298345693098e-05, "loss": 2.437, "step": 4022 }, { "epoch": 0.6885161731986993, "grad_norm": 1.773380994796753, "learning_rate": 2.291500285225328e-05, "loss": 0.316, "step": 4023 }, { "epoch": 0.6886873181584803, "grad_norm": 28.22812843322754, "learning_rate": 2.2920707358813465e-05, "loss": 3.162, "step": 4024 }, { "epoch": 0.6888584631182612, "grad_norm": 25.598621368408203, "learning_rate": 2.2926411865373648e-05, "loss": 2.4509, "step": 4025 }, { "epoch": 0.6890296080780421, "grad_norm": 23.125686645507812, "learning_rate": 2.2932116371933828e-05, "loss": 2.4663, "step": 4026 }, { "epoch": 0.689200753037823, "grad_norm": 16.213899612426758, "learning_rate": 2.293782087849401e-05, "loss": 1.3174, "step": 4027 }, { "epoch": 0.689371897997604, "grad_norm": 21.237468719482422, "learning_rate": 2.2943525385054195e-05, "loss": 1.7555, "step": 4028 }, { "epoch": 0.6895430429573849, "grad_norm": 18.583372116088867, "learning_rate": 2.2949229891614375e-05, "loss": 2.1211, "step": 4029 }, { "epoch": 0.6897141879171659, "grad_norm": 23.49361228942871, "learning_rate": 
2.2954934398174558e-05, "loss": 2.3383, "step": 4030 }, { "epoch": 0.6898853328769468, "grad_norm": 18.21615219116211, "learning_rate": 2.296063890473474e-05, "loss": 1.7035, "step": 4031 }, { "epoch": 0.6900564778367277, "grad_norm": 21.006032943725586, "learning_rate": 2.2966343411294925e-05, "loss": 1.9274, "step": 4032 }, { "epoch": 0.6902276227965086, "grad_norm": 33.84695816040039, "learning_rate": 2.2972047917855105e-05, "loss": 4.7633, "step": 4033 }, { "epoch": 0.6903987677562896, "grad_norm": 23.903696060180664, "learning_rate": 2.2977752424415288e-05, "loss": 2.2789, "step": 4034 }, { "epoch": 0.6905699127160705, "grad_norm": 32.980133056640625, "learning_rate": 2.298345693097547e-05, "loss": 4.0104, "step": 4035 }, { "epoch": 0.6907410576758515, "grad_norm": 26.50374984741211, "learning_rate": 2.298916143753565e-05, "loss": 3.0929, "step": 4036 }, { "epoch": 0.6909122026356324, "grad_norm": 18.61760902404785, "learning_rate": 2.2994865944095835e-05, "loss": 1.5559, "step": 4037 }, { "epoch": 0.6910833475954133, "grad_norm": 21.44686508178711, "learning_rate": 2.300057045065602e-05, "loss": 1.9526, "step": 4038 }, { "epoch": 0.6912544925551942, "grad_norm": 21.688053131103516, "learning_rate": 2.3006274957216202e-05, "loss": 2.2512, "step": 4039 }, { "epoch": 0.6914256375149752, "grad_norm": 16.904464721679688, "learning_rate": 2.3011979463776382e-05, "loss": 1.5846, "step": 4040 }, { "epoch": 0.6915967824747561, "grad_norm": 15.30504035949707, "learning_rate": 2.3017683970336565e-05, "loss": 1.4085, "step": 4041 }, { "epoch": 0.6917679274345371, "grad_norm": 38.0904655456543, "learning_rate": 2.3023388476896752e-05, "loss": 6.407, "step": 4042 }, { "epoch": 0.691939072394318, "grad_norm": 1.5367884635925293, "learning_rate": 2.3029092983456932e-05, "loss": 0.2607, "step": 4043 }, { "epoch": 0.6921102173540989, "grad_norm": 6.76278018951416, "learning_rate": 2.3034797490017115e-05, "loss": 0.5019, "step": 4044 }, { "epoch": 0.6922813623138798, 
"grad_norm": 28.894147872924805, "learning_rate": 2.30405019965773e-05, "loss": 3.1341, "step": 4045 }, { "epoch": 0.6924525072736608, "grad_norm": 21.855031967163086, "learning_rate": 2.304620650313748e-05, "loss": 2.6851, "step": 4046 }, { "epoch": 0.6926236522334417, "grad_norm": 21.59860610961914, "learning_rate": 2.3051911009697662e-05, "loss": 2.0748, "step": 4047 }, { "epoch": 0.6927947971932227, "grad_norm": 82.68946838378906, "learning_rate": 2.3057615516257845e-05, "loss": 2.8348, "step": 4048 }, { "epoch": 0.6929659421530036, "grad_norm": 47.771636962890625, "learning_rate": 2.306332002281803e-05, "loss": 6.3127, "step": 4049 }, { "epoch": 0.6931370871127845, "grad_norm": 19.974872589111328, "learning_rate": 2.306902452937821e-05, "loss": 2.0138, "step": 4050 }, { "epoch": 0.6933082320725654, "grad_norm": 28.05762481689453, "learning_rate": 2.3074729035938392e-05, "loss": 2.804, "step": 4051 }, { "epoch": 0.6934793770323464, "grad_norm": 26.722768783569336, "learning_rate": 2.3080433542498575e-05, "loss": 3.3621, "step": 4052 }, { "epoch": 0.6936505219921273, "grad_norm": 10.087456703186035, "learning_rate": 2.3086138049058755e-05, "loss": 0.6185, "step": 4053 }, { "epoch": 0.6938216669519083, "grad_norm": 19.535655975341797, "learning_rate": 2.309184255561894e-05, "loss": 2.3951, "step": 4054 }, { "epoch": 0.6939928119116892, "grad_norm": 24.08242416381836, "learning_rate": 2.3097547062179122e-05, "loss": 2.5265, "step": 4055 }, { "epoch": 0.6941639568714701, "grad_norm": 10.019787788391113, "learning_rate": 2.3103251568739306e-05, "loss": 1.679, "step": 4056 }, { "epoch": 0.694335101831251, "grad_norm": 23.744293212890625, "learning_rate": 2.3108956075299486e-05, "loss": 2.3542, "step": 4057 }, { "epoch": 0.694506246791032, "grad_norm": 4.5092244148254395, "learning_rate": 2.311466058185967e-05, "loss": 0.4777, "step": 4058 }, { "epoch": 0.6946773917508129, "grad_norm": 17.50345802307129, "learning_rate": 2.3120365088419852e-05, "loss": 1.8737, "step": 
4059 }, { "epoch": 0.6948485367105939, "grad_norm": 23.234378814697266, "learning_rate": 2.3126069594980032e-05, "loss": 2.307, "step": 4060 }, { "epoch": 0.6950196816703748, "grad_norm": 2.192140579223633, "learning_rate": 2.3131774101540216e-05, "loss": 0.2844, "step": 4061 }, { "epoch": 0.6951908266301557, "grad_norm": 22.02082633972168, "learning_rate": 2.31374786081004e-05, "loss": 2.0206, "step": 4062 }, { "epoch": 0.6953619715899366, "grad_norm": 18.239028930664062, "learning_rate": 2.3143183114660582e-05, "loss": 1.5416, "step": 4063 }, { "epoch": 0.6955331165497176, "grad_norm": 8.209535598754883, "learning_rate": 2.3148887621220762e-05, "loss": 0.8478, "step": 4064 }, { "epoch": 0.6957042615094986, "grad_norm": 38.67818832397461, "learning_rate": 2.315459212778095e-05, "loss": 2.1182, "step": 4065 }, { "epoch": 0.6958754064692795, "grad_norm": 27.814809799194336, "learning_rate": 2.3160296634341133e-05, "loss": 3.3903, "step": 4066 }, { "epoch": 0.6960465514290605, "grad_norm": 1.4533177614212036, "learning_rate": 2.3166001140901313e-05, "loss": 0.2869, "step": 4067 }, { "epoch": 0.6962176963888413, "grad_norm": 10.602791786193848, "learning_rate": 2.3171705647461496e-05, "loss": 0.6294, "step": 4068 }, { "epoch": 0.6963888413486223, "grad_norm": 20.941123962402344, "learning_rate": 2.317741015402168e-05, "loss": 2.1678, "step": 4069 }, { "epoch": 0.6965599863084032, "grad_norm": 26.170923233032227, "learning_rate": 2.3183114660581863e-05, "loss": 2.4743, "step": 4070 }, { "epoch": 0.6967311312681842, "grad_norm": 24.81916618347168, "learning_rate": 2.3188819167142043e-05, "loss": 2.6961, "step": 4071 }, { "epoch": 0.6969022762279651, "grad_norm": 21.444581985473633, "learning_rate": 2.3194523673702226e-05, "loss": 2.297, "step": 4072 }, { "epoch": 0.6970734211877461, "grad_norm": 24.241352081298828, "learning_rate": 2.320022818026241e-05, "loss": 2.47, "step": 4073 }, { "epoch": 0.697244566147527, "grad_norm": 19.334854125976562, "learning_rate": 
2.320593268682259e-05, "loss": 1.8912, "step": 4074 }, { "epoch": 0.6974157111073079, "grad_norm": 1.0382440090179443, "learning_rate": 2.3211637193382773e-05, "loss": 0.2496, "step": 4075 }, { "epoch": 0.6975868560670888, "grad_norm": 9.449311256408691, "learning_rate": 2.3217341699942956e-05, "loss": 0.7254, "step": 4076 }, { "epoch": 0.6977580010268698, "grad_norm": 73.3146743774414, "learning_rate": 2.3223046206503136e-05, "loss": 7.7671, "step": 4077 }, { "epoch": 0.6979291459866507, "grad_norm": 1.6645268201828003, "learning_rate": 2.322875071306332e-05, "loss": 0.2892, "step": 4078 }, { "epoch": 0.6981002909464317, "grad_norm": 4.770233631134033, "learning_rate": 2.3234455219623503e-05, "loss": 0.4124, "step": 4079 }, { "epoch": 0.6982714359062125, "grad_norm": 28.565988540649414, "learning_rate": 2.3240159726183686e-05, "loss": 1.4992, "step": 4080 }, { "epoch": 0.6984425808659935, "grad_norm": 12.26234245300293, "learning_rate": 2.3245864232743866e-05, "loss": 0.8612, "step": 4081 }, { "epoch": 0.6986137258257744, "grad_norm": 25.978046417236328, "learning_rate": 2.325156873930405e-05, "loss": 2.1483, "step": 4082 }, { "epoch": 0.6987848707855554, "grad_norm": 19.333969116210938, "learning_rate": 2.3257273245864233e-05, "loss": 2.0506, "step": 4083 }, { "epoch": 0.6989560157453363, "grad_norm": 28.06734275817871, "learning_rate": 2.3262977752424413e-05, "loss": 2.6222, "step": 4084 }, { "epoch": 0.6991271607051173, "grad_norm": 9.045817375183105, "learning_rate": 2.3268682258984596e-05, "loss": 0.8083, "step": 4085 }, { "epoch": 0.6992983056648981, "grad_norm": 6.522378444671631, "learning_rate": 2.327438676554478e-05, "loss": 0.5991, "step": 4086 }, { "epoch": 0.6994694506246791, "grad_norm": 24.520263671875, "learning_rate": 2.3280091272104963e-05, "loss": 1.9785, "step": 4087 }, { "epoch": 0.69964059558446, "grad_norm": 25.192462921142578, "learning_rate": 2.3285795778665147e-05, "loss": 1.9242, "step": 4088 }, { "epoch": 0.699811740544241, "grad_norm": 
21.36008071899414, "learning_rate": 2.329150028522533e-05, "loss": 2.2738, "step": 4089 }, { "epoch": 0.6999828855040219, "grad_norm": 15.994437217712402, "learning_rate": 2.3297204791785513e-05, "loss": 1.5939, "step": 4090 }, { "epoch": 0.7001540304638029, "grad_norm": 13.80662727355957, "learning_rate": 2.3302909298345693e-05, "loss": 0.8196, "step": 4091 }, { "epoch": 0.7003251754235837, "grad_norm": 1.906544804573059, "learning_rate": 2.3308613804905877e-05, "loss": 0.2608, "step": 4092 }, { "epoch": 0.7004963203833647, "grad_norm": 6.288933753967285, "learning_rate": 2.331431831146606e-05, "loss": 0.8815, "step": 4093 }, { "epoch": 0.7006674653431456, "grad_norm": 25.848539352416992, "learning_rate": 2.3320022818026244e-05, "loss": 3.1319, "step": 4094 }, { "epoch": 0.7008386103029266, "grad_norm": 2.6723341941833496, "learning_rate": 2.3325727324586424e-05, "loss": 0.3466, "step": 4095 }, { "epoch": 0.7010097552627075, "grad_norm": 34.171104431152344, "learning_rate": 2.3331431831146607e-05, "loss": 6.0408, "step": 4096 }, { "epoch": 0.7011809002224885, "grad_norm": 6.2798662185668945, "learning_rate": 2.333713633770679e-05, "loss": 0.5715, "step": 4097 }, { "epoch": 0.7013520451822693, "grad_norm": 21.24723243713379, "learning_rate": 2.334284084426697e-05, "loss": 1.801, "step": 4098 }, { "epoch": 0.7015231901420503, "grad_norm": 2.3047332763671875, "learning_rate": 2.3348545350827154e-05, "loss": 0.2806, "step": 4099 }, { "epoch": 0.7016943351018312, "grad_norm": 35.639129638671875, "learning_rate": 2.3354249857387337e-05, "loss": 6.2857, "step": 4100 }, { "epoch": 0.7018654800616122, "grad_norm": 22.578310012817383, "learning_rate": 2.335995436394752e-05, "loss": 2.3757, "step": 4101 }, { "epoch": 0.7020366250213931, "grad_norm": 27.650184631347656, "learning_rate": 2.33656588705077e-05, "loss": 2.7948, "step": 4102 }, { "epoch": 0.7022077699811741, "grad_norm": 22.47934913635254, "learning_rate": 2.3371363377067884e-05, "loss": 2.9155, "step": 4103 }, { 
"epoch": 0.702378914940955, "grad_norm": 117.08856201171875, "learning_rate": 2.3377067883628067e-05, "loss": 8.9408, "step": 4104 }, { "epoch": 0.7025500599007359, "grad_norm": 6.346577167510986, "learning_rate": 2.3382772390188247e-05, "loss": 0.4762, "step": 4105 }, { "epoch": 0.7027212048605168, "grad_norm": 24.838397979736328, "learning_rate": 2.338847689674843e-05, "loss": 2.0305, "step": 4106 }, { "epoch": 0.7028923498202978, "grad_norm": 5.3628716468811035, "learning_rate": 2.3394181403308614e-05, "loss": 0.4928, "step": 4107 }, { "epoch": 0.7030634947800787, "grad_norm": 27.933374404907227, "learning_rate": 2.3399885909868797e-05, "loss": 2.7148, "step": 4108 }, { "epoch": 0.7032346397398597, "grad_norm": 61.49900436401367, "learning_rate": 2.3405590416428977e-05, "loss": 2.22, "step": 4109 }, { "epoch": 0.7034057846996405, "grad_norm": 35.6771354675293, "learning_rate": 2.341129492298916e-05, "loss": 2.7082, "step": 4110 }, { "epoch": 0.7035769296594215, "grad_norm": 6.308041095733643, "learning_rate": 2.3416999429549347e-05, "loss": 0.5861, "step": 4111 }, { "epoch": 0.7037480746192024, "grad_norm": 18.36146354675293, "learning_rate": 2.3422703936109527e-05, "loss": 1.5818, "step": 4112 }, { "epoch": 0.7039192195789834, "grad_norm": 26.6254940032959, "learning_rate": 2.342840844266971e-05, "loss": 2.5972, "step": 4113 }, { "epoch": 0.7040903645387643, "grad_norm": 41.90875244140625, "learning_rate": 2.3434112949229894e-05, "loss": 7.2004, "step": 4114 }, { "epoch": 0.7042615094985453, "grad_norm": 26.454225540161133, "learning_rate": 2.3439817455790074e-05, "loss": 3.0425, "step": 4115 }, { "epoch": 0.7044326544583263, "grad_norm": 67.04540252685547, "learning_rate": 2.3445521962350258e-05, "loss": 2.5417, "step": 4116 }, { "epoch": 0.7046037994181071, "grad_norm": 29.956275939941406, "learning_rate": 2.345122646891044e-05, "loss": 3.9422, "step": 4117 }, { "epoch": 0.7047749443778881, "grad_norm": 28.678985595703125, "learning_rate": 
2.3456930975470624e-05, "loss": 2.8701, "step": 4118 }, { "epoch": 0.704946089337669, "grad_norm": 21.728593826293945, "learning_rate": 2.3462635482030804e-05, "loss": 1.9539, "step": 4119 }, { "epoch": 0.70511723429745, "grad_norm": 21.133193969726562, "learning_rate": 2.3468339988590988e-05, "loss": 2.0111, "step": 4120 }, { "epoch": 0.7052883792572309, "grad_norm": 57.13780975341797, "learning_rate": 2.347404449515117e-05, "loss": 2.4365, "step": 4121 }, { "epoch": 0.7054595242170119, "grad_norm": 1.5548804998397827, "learning_rate": 2.347974900171135e-05, "loss": 0.2656, "step": 4122 }, { "epoch": 0.7056306691767927, "grad_norm": 16.94969940185547, "learning_rate": 2.3485453508271534e-05, "loss": 1.5448, "step": 4123 }, { "epoch": 0.7058018141365737, "grad_norm": 29.882259368896484, "learning_rate": 2.3491158014831718e-05, "loss": 1.7722, "step": 4124 }, { "epoch": 0.7059729590963546, "grad_norm": 13.851731300354004, "learning_rate": 2.34968625213919e-05, "loss": 1.2664, "step": 4125 }, { "epoch": 0.7061441040561356, "grad_norm": 19.379756927490234, "learning_rate": 2.350256702795208e-05, "loss": 1.5466, "step": 4126 }, { "epoch": 0.7063152490159165, "grad_norm": 20.181297302246094, "learning_rate": 2.3508271534512265e-05, "loss": 2.0799, "step": 4127 }, { "epoch": 0.7064863939756975, "grad_norm": 18.206491470336914, "learning_rate": 2.3513976041072448e-05, "loss": 1.7423, "step": 4128 }, { "epoch": 0.7066575389354783, "grad_norm": 39.42982482910156, "learning_rate": 2.3519680547632628e-05, "loss": 6.003, "step": 4129 }, { "epoch": 0.7068286838952593, "grad_norm": 10.509867668151855, "learning_rate": 2.352538505419281e-05, "loss": 0.7636, "step": 4130 }, { "epoch": 0.7069998288550402, "grad_norm": 2.2939412593841553, "learning_rate": 2.3531089560752995e-05, "loss": 0.2719, "step": 4131 }, { "epoch": 0.7071709738148212, "grad_norm": 25.9577579498291, "learning_rate": 2.3536794067313178e-05, "loss": 2.4994, "step": 4132 }, { "epoch": 0.7073421187746021, 
"grad_norm": 72.57787322998047, "learning_rate": 2.3542498573873358e-05, "loss": 7.4552, "step": 4133 }, { "epoch": 0.7075132637343831, "grad_norm": 138.49476623535156, "learning_rate": 2.3548203080433545e-05, "loss": 8.7384, "step": 4134 }, { "epoch": 0.7076844086941639, "grad_norm": 18.31711769104004, "learning_rate": 2.3553907586993728e-05, "loss": 1.8353, "step": 4135 }, { "epoch": 0.7078555536539449, "grad_norm": 23.117900848388672, "learning_rate": 2.3559612093553908e-05, "loss": 2.2019, "step": 4136 }, { "epoch": 0.7080266986137258, "grad_norm": 29.79839515686035, "learning_rate": 2.356531660011409e-05, "loss": 3.6913, "step": 4137 }, { "epoch": 0.7081978435735068, "grad_norm": 6.226611614227295, "learning_rate": 2.3571021106674275e-05, "loss": 0.3817, "step": 4138 }, { "epoch": 0.7083689885332877, "grad_norm": 7.866857051849365, "learning_rate": 2.3576725613234458e-05, "loss": 0.7468, "step": 4139 }, { "epoch": 0.7085401334930687, "grad_norm": 13.393908500671387, "learning_rate": 2.3582430119794638e-05, "loss": 0.9038, "step": 4140 }, { "epoch": 0.7087112784528495, "grad_norm": 4.077215194702148, "learning_rate": 2.358813462635482e-05, "loss": 0.3156, "step": 4141 }, { "epoch": 0.7088824234126305, "grad_norm": 35.0775146484375, "learning_rate": 2.3593839132915005e-05, "loss": 6.4035, "step": 4142 }, { "epoch": 0.7090535683724114, "grad_norm": 27.827789306640625, "learning_rate": 2.3599543639475185e-05, "loss": 2.6875, "step": 4143 }, { "epoch": 0.7092247133321924, "grad_norm": 7.222084045410156, "learning_rate": 2.360524814603537e-05, "loss": 0.6337, "step": 4144 }, { "epoch": 0.7093958582919733, "grad_norm": 16.112009048461914, "learning_rate": 2.3610952652595552e-05, "loss": 1.5604, "step": 4145 }, { "epoch": 0.7095670032517543, "grad_norm": 33.34373092651367, "learning_rate": 2.3616657159155732e-05, "loss": 6.1852, "step": 4146 }, { "epoch": 0.7097381482115351, "grad_norm": 23.43242835998535, "learning_rate": 2.3622361665715915e-05, "loss": 2.2633, 
"step": 4147 }, { "epoch": 0.7099092931713161, "grad_norm": 23.43446159362793, "learning_rate": 2.36280661722761e-05, "loss": 2.2967, "step": 4148 }, { "epoch": 0.710080438131097, "grad_norm": 9.296281814575195, "learning_rate": 2.3633770678836282e-05, "loss": 0.9115, "step": 4149 }, { "epoch": 0.710251583090878, "grad_norm": 23.257322311401367, "learning_rate": 2.3639475185396462e-05, "loss": 2.2038, "step": 4150 }, { "epoch": 0.7104227280506589, "grad_norm": 30.016109466552734, "learning_rate": 2.3645179691956645e-05, "loss": 3.0839, "step": 4151 }, { "epoch": 0.7105938730104399, "grad_norm": 25.682640075683594, "learning_rate": 2.365088419851683e-05, "loss": 2.8843, "step": 4152 }, { "epoch": 0.7107650179702207, "grad_norm": 23.85866355895996, "learning_rate": 2.365658870507701e-05, "loss": 1.9993, "step": 4153 }, { "epoch": 0.7109361629300017, "grad_norm": 4.204997539520264, "learning_rate": 2.3662293211637192e-05, "loss": 0.5192, "step": 4154 }, { "epoch": 0.7111073078897826, "grad_norm": 24.741037368774414, "learning_rate": 2.3667997718197375e-05, "loss": 2.4707, "step": 4155 }, { "epoch": 0.7112784528495636, "grad_norm": 5.143214225769043, "learning_rate": 2.367370222475756e-05, "loss": 0.3212, "step": 4156 }, { "epoch": 0.7114495978093445, "grad_norm": 30.81825828552246, "learning_rate": 2.3679406731317742e-05, "loss": 2.3157, "step": 4157 }, { "epoch": 0.7116207427691255, "grad_norm": 25.597097396850586, "learning_rate": 2.3685111237877926e-05, "loss": 2.8225, "step": 4158 }, { "epoch": 0.7117918877289063, "grad_norm": 12.126123428344727, "learning_rate": 2.369081574443811e-05, "loss": 0.6904, "step": 4159 }, { "epoch": 0.7119630326886873, "grad_norm": 17.513898849487305, "learning_rate": 2.369652025099829e-05, "loss": 1.9711, "step": 4160 }, { "epoch": 0.7121341776484682, "grad_norm": 24.142879486083984, "learning_rate": 2.3702224757558472e-05, "loss": 2.5014, "step": 4161 }, { "epoch": 0.7123053226082492, "grad_norm": 26.637598037719727, "learning_rate": 
2.3707929264118656e-05, "loss": 2.5949, "step": 4162 }, { "epoch": 0.7124764675680301, "grad_norm": 22.141407012939453, "learning_rate": 2.371363377067884e-05, "loss": 2.4929, "step": 4163 }, { "epoch": 0.7126476125278111, "grad_norm": 26.212926864624023, "learning_rate": 2.371933827723902e-05, "loss": 2.6947, "step": 4164 }, { "epoch": 0.7128187574875919, "grad_norm": 18.995336532592773, "learning_rate": 2.3725042783799202e-05, "loss": 1.6993, "step": 4165 }, { "epoch": 0.7129899024473729, "grad_norm": 27.700637817382812, "learning_rate": 2.3730747290359386e-05, "loss": 3.3321, "step": 4166 }, { "epoch": 0.7131610474071539, "grad_norm": 5.5868754386901855, "learning_rate": 2.3736451796919566e-05, "loss": 0.5355, "step": 4167 }, { "epoch": 0.7133321923669348, "grad_norm": 15.529837608337402, "learning_rate": 2.374215630347975e-05, "loss": 1.5037, "step": 4168 }, { "epoch": 0.7135033373267158, "grad_norm": 21.299524307250977, "learning_rate": 2.3747860810039933e-05, "loss": 1.9929, "step": 4169 }, { "epoch": 0.7136744822864967, "grad_norm": 45.21254348754883, "learning_rate": 2.3753565316600116e-05, "loss": 6.7553, "step": 4170 }, { "epoch": 0.7138456272462776, "grad_norm": 26.380695343017578, "learning_rate": 2.3759269823160296e-05, "loss": 2.6056, "step": 4171 }, { "epoch": 0.7140167722060585, "grad_norm": 3.5742998123168945, "learning_rate": 2.376497432972048e-05, "loss": 0.4821, "step": 4172 }, { "epoch": 0.7141879171658395, "grad_norm": 31.626493453979492, "learning_rate": 2.3770678836280663e-05, "loss": 3.368, "step": 4173 }, { "epoch": 0.7143590621256204, "grad_norm": 21.61958885192871, "learning_rate": 2.3776383342840843e-05, "loss": 1.709, "step": 4174 }, { "epoch": 0.7145302070854014, "grad_norm": 6.5677809715271, "learning_rate": 2.3782087849401026e-05, "loss": 0.5463, "step": 4175 }, { "epoch": 0.7147013520451823, "grad_norm": 151.24124145507812, "learning_rate": 2.378779235596121e-05, "loss": 8.6429, "step": 4176 }, { "epoch": 0.7148724970049632, 
"grad_norm": 34.68931579589844, "learning_rate": 2.3793496862521393e-05, "loss": 6.1887, "step": 4177 }, { "epoch": 0.7150436419647441, "grad_norm": 18.861997604370117, "learning_rate": 2.3799201369081573e-05, "loss": 1.8546, "step": 4178 }, { "epoch": 0.7152147869245251, "grad_norm": 19.337419509887695, "learning_rate": 2.380490587564176e-05, "loss": 1.8454, "step": 4179 }, { "epoch": 0.715385931884306, "grad_norm": 1.8883466720581055, "learning_rate": 2.3810610382201943e-05, "loss": 0.289, "step": 4180 }, { "epoch": 0.715557076844087, "grad_norm": 110.51686096191406, "learning_rate": 2.3816314888762123e-05, "loss": 7.8896, "step": 4181 }, { "epoch": 0.7157282218038679, "grad_norm": 69.93323516845703, "learning_rate": 2.3822019395322306e-05, "loss": 2.6675, "step": 4182 }, { "epoch": 0.7158993667636488, "grad_norm": 23.35276222229004, "learning_rate": 2.382772390188249e-05, "loss": 2.0773, "step": 4183 }, { "epoch": 0.7160705117234297, "grad_norm": 20.778461456298828, "learning_rate": 2.383342840844267e-05, "loss": 2.2745, "step": 4184 }, { "epoch": 0.7162416566832107, "grad_norm": 13.58486557006836, "learning_rate": 2.3839132915002853e-05, "loss": 1.2723, "step": 4185 }, { "epoch": 0.7164128016429916, "grad_norm": 6.069742202758789, "learning_rate": 2.3844837421563036e-05, "loss": 0.865, "step": 4186 }, { "epoch": 0.7165839466027726, "grad_norm": 21.17997169494629, "learning_rate": 2.385054192812322e-05, "loss": 1.8901, "step": 4187 }, { "epoch": 0.7167550915625535, "grad_norm": 24.12006187438965, "learning_rate": 2.38562464346834e-05, "loss": 2.7853, "step": 4188 }, { "epoch": 0.7169262365223344, "grad_norm": 13.66297721862793, "learning_rate": 2.3861950941243583e-05, "loss": 0.9545, "step": 4189 }, { "epoch": 0.7170973814821153, "grad_norm": 51.27836990356445, "learning_rate": 2.3867655447803767e-05, "loss": 1.9518, "step": 4190 }, { "epoch": 0.7172685264418963, "grad_norm": 7.221101760864258, "learning_rate": 2.3873359954363947e-05, "loss": 0.5895, "step": 
4191 }, { "epoch": 0.7174396714016772, "grad_norm": 91.32466125488281, "learning_rate": 2.387906446092413e-05, "loss": 7.6073, "step": 4192 }, { "epoch": 0.7176108163614582, "grad_norm": 19.606168746948242, "learning_rate": 2.3884768967484313e-05, "loss": 1.8311, "step": 4193 }, { "epoch": 0.7177819613212391, "grad_norm": 8.86071491241455, "learning_rate": 2.3890473474044497e-05, "loss": 0.8949, "step": 4194 }, { "epoch": 0.71795310628102, "grad_norm": 21.827577590942383, "learning_rate": 2.3896177980604677e-05, "loss": 2.0555, "step": 4195 }, { "epoch": 0.7181242512408009, "grad_norm": 24.964656829833984, "learning_rate": 2.390188248716486e-05, "loss": 3.0599, "step": 4196 }, { "epoch": 0.7182953962005819, "grad_norm": 1.9594327211380005, "learning_rate": 2.3907586993725043e-05, "loss": 0.2823, "step": 4197 }, { "epoch": 0.7184665411603628, "grad_norm": 18.58384132385254, "learning_rate": 2.3913291500285223e-05, "loss": 1.8147, "step": 4198 }, { "epoch": 0.7186376861201438, "grad_norm": 22.12485122680664, "learning_rate": 2.3918996006845407e-05, "loss": 2.4456, "step": 4199 }, { "epoch": 0.7188088310799247, "grad_norm": 20.110654830932617, "learning_rate": 2.392470051340559e-05, "loss": 1.9338, "step": 4200 }, { "epoch": 0.7189799760397056, "grad_norm": 25.46828842163086, "learning_rate": 2.3930405019965774e-05, "loss": 1.6106, "step": 4201 }, { "epoch": 0.7191511209994865, "grad_norm": 10.579689979553223, "learning_rate": 2.3936109526525957e-05, "loss": 0.6947, "step": 4202 }, { "epoch": 0.7193222659592675, "grad_norm": 50.630008697509766, "learning_rate": 2.394181403308614e-05, "loss": 1.6672, "step": 4203 }, { "epoch": 0.7194934109190484, "grad_norm": 22.716222763061523, "learning_rate": 2.3947518539646324e-05, "loss": 2.0138, "step": 4204 }, { "epoch": 0.7196645558788294, "grad_norm": 16.84745216369629, "learning_rate": 2.3953223046206504e-05, "loss": 1.0839, "step": 4205 }, { "epoch": 0.7198357008386103, "grad_norm": 1.9779396057128906, "learning_rate": 
2.3958927552766687e-05, "loss": 0.2798, "step": 4206 }, { "epoch": 0.7200068457983912, "grad_norm": 30.206113815307617, "learning_rate": 2.396463205932687e-05, "loss": 2.7617, "step": 4207 }, { "epoch": 0.7201779907581721, "grad_norm": 14.300946235656738, "learning_rate": 2.3970336565887054e-05, "loss": 1.4846, "step": 4208 }, { "epoch": 0.7203491357179531, "grad_norm": 14.153777122497559, "learning_rate": 2.3976041072447234e-05, "loss": 0.9053, "step": 4209 }, { "epoch": 0.720520280677734, "grad_norm": 30.488285064697266, "learning_rate": 2.3981745579007417e-05, "loss": 3.6317, "step": 4210 }, { "epoch": 0.720691425637515, "grad_norm": 40.644447326660156, "learning_rate": 2.39874500855676e-05, "loss": 6.598, "step": 4211 }, { "epoch": 0.7208625705972959, "grad_norm": 18.58864402770996, "learning_rate": 2.399315459212778e-05, "loss": 1.7715, "step": 4212 }, { "epoch": 0.7210337155570768, "grad_norm": 25.57322883605957, "learning_rate": 2.3998859098687964e-05, "loss": 2.5285, "step": 4213 }, { "epoch": 0.7212048605168577, "grad_norm": 21.300174713134766, "learning_rate": 2.4004563605248147e-05, "loss": 2.5907, "step": 4214 }, { "epoch": 0.7213760054766387, "grad_norm": 15.046341896057129, "learning_rate": 2.4010268111808327e-05, "loss": 1.2937, "step": 4215 }, { "epoch": 0.7215471504364196, "grad_norm": 23.890518188476562, "learning_rate": 2.401597261836851e-05, "loss": 2.1427, "step": 4216 }, { "epoch": 0.7217182953962006, "grad_norm": 12.539481163024902, "learning_rate": 2.4021677124928694e-05, "loss": 0.8432, "step": 4217 }, { "epoch": 0.7218894403559816, "grad_norm": 21.108722686767578, "learning_rate": 2.4027381631488877e-05, "loss": 2.3811, "step": 4218 }, { "epoch": 0.7220605853157624, "grad_norm": 18.81709861755371, "learning_rate": 2.4033086138049057e-05, "loss": 1.6914, "step": 4219 }, { "epoch": 0.7222317302755434, "grad_norm": 20.49457359313965, "learning_rate": 2.403879064460924e-05, "loss": 1.9627, "step": 4220 }, { "epoch": 0.7224028752353243, 
"grad_norm": 9.025213241577148, "learning_rate": 2.4044495151169424e-05, "loss": 0.6996, "step": 4221 }, { "epoch": 0.7225740201951053, "grad_norm": 2.5068697929382324, "learning_rate": 2.4050199657729604e-05, "loss": 0.3039, "step": 4222 }, { "epoch": 0.7227451651548862, "grad_norm": 9.711918830871582, "learning_rate": 2.4055904164289788e-05, "loss": 1.3666, "step": 4223 }, { "epoch": 0.7229163101146672, "grad_norm": 17.298583984375, "learning_rate": 2.406160867084997e-05, "loss": 1.5982, "step": 4224 }, { "epoch": 0.723087455074448, "grad_norm": 8.982203483581543, "learning_rate": 2.4067313177410158e-05, "loss": 0.6486, "step": 4225 }, { "epoch": 0.723258600034229, "grad_norm": 24.49521827697754, "learning_rate": 2.4073017683970338e-05, "loss": 3.2304, "step": 4226 }, { "epoch": 0.7234297449940099, "grad_norm": 21.123031616210938, "learning_rate": 2.407872219053052e-05, "loss": 1.8011, "step": 4227 }, { "epoch": 0.7236008899537909, "grad_norm": 9.760648727416992, "learning_rate": 2.4084426697090704e-05, "loss": 0.6659, "step": 4228 }, { "epoch": 0.7237720349135718, "grad_norm": 8.795894622802734, "learning_rate": 2.4090131203650884e-05, "loss": 0.8128, "step": 4229 }, { "epoch": 0.7239431798733528, "grad_norm": 5.41191291809082, "learning_rate": 2.4095835710211068e-05, "loss": 0.4972, "step": 4230 }, { "epoch": 0.7241143248331336, "grad_norm": 6.078951358795166, "learning_rate": 2.410154021677125e-05, "loss": 0.6283, "step": 4231 }, { "epoch": 0.7242854697929146, "grad_norm": 9.235028266906738, "learning_rate": 2.4107244723331435e-05, "loss": 1.1542, "step": 4232 }, { "epoch": 0.7244566147526955, "grad_norm": 18.9094295501709, "learning_rate": 2.4112949229891615e-05, "loss": 1.3619, "step": 4233 }, { "epoch": 0.7246277597124765, "grad_norm": 8.397602081298828, "learning_rate": 2.4118653736451798e-05, "loss": 0.5137, "step": 4234 }, { "epoch": 0.7247989046722574, "grad_norm": 35.325618743896484, "learning_rate": 2.412435824301198e-05, "loss": 5.9278, "step": 4235 
}, { "epoch": 0.7249700496320384, "grad_norm": 7.471834182739258, "learning_rate": 2.413006274957216e-05, "loss": 0.5769, "step": 4236 }, { "epoch": 0.7251411945918192, "grad_norm": 10.854155540466309, "learning_rate": 2.4135767256132345e-05, "loss": 0.8743, "step": 4237 }, { "epoch": 0.7253123395516002, "grad_norm": 26.70799446105957, "learning_rate": 2.4141471762692528e-05, "loss": 2.6374, "step": 4238 }, { "epoch": 0.7254834845113811, "grad_norm": 23.99932861328125, "learning_rate": 2.414717626925271e-05, "loss": 1.9431, "step": 4239 }, { "epoch": 0.7256546294711621, "grad_norm": 7.719635963439941, "learning_rate": 2.415288077581289e-05, "loss": 0.5437, "step": 4240 }, { "epoch": 0.725825774430943, "grad_norm": 11.576183319091797, "learning_rate": 2.4158585282373075e-05, "loss": 0.8931, "step": 4241 }, { "epoch": 0.725996919390724, "grad_norm": 1.2357739210128784, "learning_rate": 2.4164289788933258e-05, "loss": 0.2441, "step": 4242 }, { "epoch": 0.7261680643505048, "grad_norm": 32.51354217529297, "learning_rate": 2.4169994295493438e-05, "loss": 5.9717, "step": 4243 }, { "epoch": 0.7263392093102858, "grad_norm": 20.202180862426758, "learning_rate": 2.417569880205362e-05, "loss": 1.909, "step": 4244 }, { "epoch": 0.7265103542700667, "grad_norm": 84.00714111328125, "learning_rate": 2.4181403308613805e-05, "loss": 1.9052, "step": 4245 }, { "epoch": 0.7266814992298477, "grad_norm": 28.061588287353516, "learning_rate": 2.4187107815173985e-05, "loss": 1.4887, "step": 4246 }, { "epoch": 0.7268526441896286, "grad_norm": 2.4384922981262207, "learning_rate": 2.419281232173417e-05, "loss": 0.2672, "step": 4247 }, { "epoch": 0.7270237891494096, "grad_norm": 2.515739679336548, "learning_rate": 2.4198516828294355e-05, "loss": 0.2884, "step": 4248 }, { "epoch": 0.7271949341091904, "grad_norm": 29.182708740234375, "learning_rate": 2.420422133485454e-05, "loss": 3.5217, "step": 4249 }, { "epoch": 0.7273660790689714, "grad_norm": 16.3360538482666, "learning_rate": 
2.420992584141472e-05, "loss": 1.7574, "step": 4250 }, { "epoch": 0.7275372240287523, "grad_norm": 37.92715072631836, "learning_rate": 2.4215630347974902e-05, "loss": 3.4675, "step": 4251 }, { "epoch": 0.7277083689885333, "grad_norm": 26.36214256286621, "learning_rate": 2.4221334854535085e-05, "loss": 3.8431, "step": 4252 }, { "epoch": 0.7278795139483142, "grad_norm": 26.715503692626953, "learning_rate": 2.4227039361095265e-05, "loss": 2.7822, "step": 4253 }, { "epoch": 0.7280506589080952, "grad_norm": 27.2398624420166, "learning_rate": 2.423274386765545e-05, "loss": 3.1297, "step": 4254 }, { "epoch": 0.728221803867876, "grad_norm": 30.56747055053711, "learning_rate": 2.4238448374215632e-05, "loss": 5.8095, "step": 4255 }, { "epoch": 0.728392948827657, "grad_norm": 7.464992523193359, "learning_rate": 2.4244152880775815e-05, "loss": 0.6275, "step": 4256 }, { "epoch": 0.7285640937874379, "grad_norm": 12.612624168395996, "learning_rate": 2.4249857387335995e-05, "loss": 0.8099, "step": 4257 }, { "epoch": 0.7287352387472189, "grad_norm": 24.279560089111328, "learning_rate": 2.425556189389618e-05, "loss": 2.72, "step": 4258 }, { "epoch": 0.7289063837069998, "grad_norm": 21.02090835571289, "learning_rate": 2.4261266400456362e-05, "loss": 2.1918, "step": 4259 }, { "epoch": 0.7290775286667808, "grad_norm": 4.548274040222168, "learning_rate": 2.4266970907016542e-05, "loss": 0.3331, "step": 4260 }, { "epoch": 0.7292486736265616, "grad_norm": 20.990015029907227, "learning_rate": 2.4272675413576725e-05, "loss": 2.199, "step": 4261 }, { "epoch": 0.7294198185863426, "grad_norm": 3.5553269386291504, "learning_rate": 2.427837992013691e-05, "loss": 0.3964, "step": 4262 }, { "epoch": 0.7295909635461235, "grad_norm": 17.434099197387695, "learning_rate": 2.4284084426697092e-05, "loss": 1.7278, "step": 4263 }, { "epoch": 0.7297621085059045, "grad_norm": 26.248044967651367, "learning_rate": 2.4289788933257272e-05, "loss": 3.4657, "step": 4264 }, { "epoch": 0.7299332534656854, 
"grad_norm": 26.377473831176758, "learning_rate": 2.4295493439817456e-05, "loss": 2.8837, "step": 4265 }, { "epoch": 0.7301043984254664, "grad_norm": 29.46402931213379, "learning_rate": 2.430119794637764e-05, "loss": 2.9555, "step": 4266 }, { "epoch": 0.7302755433852472, "grad_norm": 27.894542694091797, "learning_rate": 2.430690245293782e-05, "loss": 2.4692, "step": 4267 }, { "epoch": 0.7304466883450282, "grad_norm": 23.581409454345703, "learning_rate": 2.4312606959498002e-05, "loss": 2.2207, "step": 4268 }, { "epoch": 0.7306178333048092, "grad_norm": 26.48499870300293, "learning_rate": 2.4318311466058186e-05, "loss": 3.1471, "step": 4269 }, { "epoch": 0.7307889782645901, "grad_norm": 1.8997199535369873, "learning_rate": 2.432401597261837e-05, "loss": 0.2555, "step": 4270 }, { "epoch": 0.7309601232243711, "grad_norm": 27.24410057067871, "learning_rate": 2.4329720479178552e-05, "loss": 5.7181, "step": 4271 }, { "epoch": 0.731131268184152, "grad_norm": 20.901037216186523, "learning_rate": 2.4335424985738736e-05, "loss": 1.8875, "step": 4272 }, { "epoch": 0.731302413143933, "grad_norm": 14.34110164642334, "learning_rate": 2.434112949229892e-05, "loss": 1.1739, "step": 4273 }, { "epoch": 0.7314735581037138, "grad_norm": 1.0765384435653687, "learning_rate": 2.43468339988591e-05, "loss": 0.2476, "step": 4274 }, { "epoch": 0.7316447030634948, "grad_norm": 32.98508834838867, "learning_rate": 2.4352538505419283e-05, "loss": 1.3042, "step": 4275 }, { "epoch": 0.7318158480232757, "grad_norm": 16.529098510742188, "learning_rate": 2.4358243011979466e-05, "loss": 1.3914, "step": 4276 }, { "epoch": 0.7319869929830567, "grad_norm": 25.088991165161133, "learning_rate": 2.4363947518539646e-05, "loss": 1.8855, "step": 4277 }, { "epoch": 0.7321581379428376, "grad_norm": 16.27592658996582, "learning_rate": 2.436965202509983e-05, "loss": 1.2685, "step": 4278 }, { "epoch": 0.7323292829026186, "grad_norm": 24.759716033935547, "learning_rate": 2.4375356531660013e-05, "loss": 2.4846, 
"step": 4279 }, { "epoch": 0.7325004278623994, "grad_norm": 30.153457641601562, "learning_rate": 2.4381061038220196e-05, "loss": 3.6713, "step": 4280 }, { "epoch": 0.7326715728221804, "grad_norm": 25.423839569091797, "learning_rate": 2.4386765544780376e-05, "loss": 2.4835, "step": 4281 }, { "epoch": 0.7328427177819613, "grad_norm": 17.405637741088867, "learning_rate": 2.439247005134056e-05, "loss": 1.6224, "step": 4282 }, { "epoch": 0.7330138627417423, "grad_norm": 21.038610458374023, "learning_rate": 2.4398174557900743e-05, "loss": 1.7716, "step": 4283 }, { "epoch": 0.7331850077015232, "grad_norm": 30.59052848815918, "learning_rate": 2.4403879064460923e-05, "loss": 2.7644, "step": 4284 }, { "epoch": 0.7333561526613042, "grad_norm": 22.464582443237305, "learning_rate": 2.4409583571021106e-05, "loss": 1.891, "step": 4285 }, { "epoch": 0.733527297621085, "grad_norm": 7.699871063232422, "learning_rate": 2.441528807758129e-05, "loss": 0.7411, "step": 4286 }, { "epoch": 0.733698442580866, "grad_norm": 4.126267910003662, "learning_rate": 2.4420992584141473e-05, "loss": 0.4547, "step": 4287 }, { "epoch": 0.7338695875406469, "grad_norm": 35.644535064697266, "learning_rate": 2.4426697090701653e-05, "loss": 6.2534, "step": 4288 }, { "epoch": 0.7340407325004279, "grad_norm": 28.631427764892578, "learning_rate": 2.4432401597261836e-05, "loss": 2.9862, "step": 4289 }, { "epoch": 0.7342118774602088, "grad_norm": 2.322627067565918, "learning_rate": 2.443810610382202e-05, "loss": 0.2372, "step": 4290 }, { "epoch": 0.7343830224199898, "grad_norm": 24.10704803466797, "learning_rate": 2.44438106103822e-05, "loss": 2.0144, "step": 4291 }, { "epoch": 0.7345541673797706, "grad_norm": 30.596025466918945, "learning_rate": 2.4449515116942383e-05, "loss": 4.3575, "step": 4292 }, { "epoch": 0.7347253123395516, "grad_norm": 16.235584259033203, "learning_rate": 2.4455219623502566e-05, "loss": 1.3492, "step": 4293 }, { "epoch": 0.7348964572993325, "grad_norm": 8.513513565063477, 
"learning_rate": 2.4460924130062753e-05, "loss": 0.6521, "step": 4294 }, { "epoch": 0.7350676022591135, "grad_norm": 20.195348739624023, "learning_rate": 2.4466628636622933e-05, "loss": 2.3305, "step": 4295 }, { "epoch": 0.7352387472188944, "grad_norm": 18.629962921142578, "learning_rate": 2.4472333143183117e-05, "loss": 1.8291, "step": 4296 }, { "epoch": 0.7354098921786754, "grad_norm": 26.3973388671875, "learning_rate": 2.44780376497433e-05, "loss": 3.334, "step": 4297 }, { "epoch": 0.7355810371384562, "grad_norm": 0.9989029765129089, "learning_rate": 2.448374215630348e-05, "loss": 0.2261, "step": 4298 }, { "epoch": 0.7357521820982372, "grad_norm": 22.981550216674805, "learning_rate": 2.4489446662863663e-05, "loss": 2.0884, "step": 4299 }, { "epoch": 0.7359233270580181, "grad_norm": 22.626544952392578, "learning_rate": 2.4495151169423847e-05, "loss": 2.1202, "step": 4300 }, { "epoch": 0.7360944720177991, "grad_norm": 16.54318618774414, "learning_rate": 2.450085567598403e-05, "loss": 1.5155, "step": 4301 }, { "epoch": 0.73626561697758, "grad_norm": 116.46958923339844, "learning_rate": 2.450656018254421e-05, "loss": 8.2721, "step": 4302 }, { "epoch": 0.736436761937361, "grad_norm": 19.956342697143555, "learning_rate": 2.4512264689104393e-05, "loss": 1.8512, "step": 4303 }, { "epoch": 0.7366079068971418, "grad_norm": 27.972797393798828, "learning_rate": 2.4517969195664577e-05, "loss": 2.7114, "step": 4304 }, { "epoch": 0.7367790518569228, "grad_norm": 11.407204627990723, "learning_rate": 2.4523673702224757e-05, "loss": 0.9488, "step": 4305 }, { "epoch": 0.7369501968167037, "grad_norm": 33.092105865478516, "learning_rate": 2.452937820878494e-05, "loss": 2.9253, "step": 4306 }, { "epoch": 0.7371213417764847, "grad_norm": 10.819000244140625, "learning_rate": 2.4535082715345124e-05, "loss": 0.7126, "step": 4307 }, { "epoch": 0.7372924867362656, "grad_norm": 7.100123882293701, "learning_rate": 2.4540787221905307e-05, "loss": 0.5677, "step": 4308 }, { "epoch": 
0.7374636316960466, "grad_norm": 3.432849407196045, "learning_rate": 2.4546491728465487e-05, "loss": 0.2723, "step": 4309 }, { "epoch": 0.7376347766558274, "grad_norm": 22.43340492248535, "learning_rate": 2.455219623502567e-05, "loss": 2.1355, "step": 4310 }, { "epoch": 0.7378059216156084, "grad_norm": 22.277841567993164, "learning_rate": 2.4557900741585854e-05, "loss": 3.1528, "step": 4311 }, { "epoch": 0.7379770665753893, "grad_norm": 26.116939544677734, "learning_rate": 2.4563605248146034e-05, "loss": 2.7783, "step": 4312 }, { "epoch": 0.7381482115351703, "grad_norm": 21.692012786865234, "learning_rate": 2.4569309754706217e-05, "loss": 2.08, "step": 4313 }, { "epoch": 0.7383193564949512, "grad_norm": 2.461122751235962, "learning_rate": 2.45750142612664e-05, "loss": 0.2903, "step": 4314 }, { "epoch": 0.7384905014547322, "grad_norm": 26.099124908447266, "learning_rate": 2.458071876782658e-05, "loss": 2.0427, "step": 4315 }, { "epoch": 0.738661646414513, "grad_norm": 18.19993782043457, "learning_rate": 2.4586423274386764e-05, "loss": 1.5727, "step": 4316 }, { "epoch": 0.738832791374294, "grad_norm": 28.66299057006836, "learning_rate": 2.459212778094695e-05, "loss": 2.5081, "step": 4317 }, { "epoch": 0.7390039363340749, "grad_norm": 27.575727462768555, "learning_rate": 2.4597832287507134e-05, "loss": 2.7801, "step": 4318 }, { "epoch": 0.7391750812938559, "grad_norm": 46.224586486816406, "learning_rate": 2.4603536794067314e-05, "loss": 6.5383, "step": 4319 }, { "epoch": 0.7393462262536369, "grad_norm": 9.273065567016602, "learning_rate": 2.4609241300627497e-05, "loss": 1.2797, "step": 4320 }, { "epoch": 0.7395173712134178, "grad_norm": 12.391923904418945, "learning_rate": 2.461494580718768e-05, "loss": 1.0646, "step": 4321 }, { "epoch": 0.7396885161731988, "grad_norm": 33.82769012451172, "learning_rate": 2.462065031374786e-05, "loss": 4.4309, "step": 4322 }, { "epoch": 0.7398596611329796, "grad_norm": 9.532022476196289, "learning_rate": 2.4626354820308044e-05, 
"loss": 1.0837, "step": 4323 }, { "epoch": 0.7400308060927606, "grad_norm": 16.09440803527832, "learning_rate": 2.4632059326868227e-05, "loss": 1.2765, "step": 4324 }, { "epoch": 0.7402019510525415, "grad_norm": 1.2351820468902588, "learning_rate": 2.463776383342841e-05, "loss": 0.2564, "step": 4325 }, { "epoch": 0.7403730960123225, "grad_norm": 31.442874908447266, "learning_rate": 2.464346833998859e-05, "loss": 3.7296, "step": 4326 }, { "epoch": 0.7405442409721034, "grad_norm": 21.058252334594727, "learning_rate": 2.4649172846548774e-05, "loss": 1.7717, "step": 4327 }, { "epoch": 0.7407153859318844, "grad_norm": 25.62652015686035, "learning_rate": 2.4654877353108958e-05, "loss": 2.1194, "step": 4328 }, { "epoch": 0.7408865308916652, "grad_norm": 16.54743003845215, "learning_rate": 2.4660581859669138e-05, "loss": 1.6091, "step": 4329 }, { "epoch": 0.7410576758514462, "grad_norm": 9.183897018432617, "learning_rate": 2.466628636622932e-05, "loss": 0.7841, "step": 4330 }, { "epoch": 0.7412288208112271, "grad_norm": 12.366703987121582, "learning_rate": 2.4671990872789504e-05, "loss": 1.2934, "step": 4331 }, { "epoch": 0.7413999657710081, "grad_norm": 9.97996997833252, "learning_rate": 2.4677695379349688e-05, "loss": 0.648, "step": 4332 }, { "epoch": 0.741571110730789, "grad_norm": 4.609592437744141, "learning_rate": 2.4683399885909868e-05, "loss": 0.4228, "step": 4333 }, { "epoch": 0.74174225569057, "grad_norm": 40.74041748046875, "learning_rate": 2.468910439247005e-05, "loss": 6.2855, "step": 4334 }, { "epoch": 0.7419134006503508, "grad_norm": 34.59137725830078, "learning_rate": 2.4694808899030234e-05, "loss": 5.9909, "step": 4335 }, { "epoch": 0.7420845456101318, "grad_norm": 8.708815574645996, "learning_rate": 2.4700513405590414e-05, "loss": 0.7173, "step": 4336 }, { "epoch": 0.7422556905699127, "grad_norm": 16.282533645629883, "learning_rate": 2.4706217912150598e-05, "loss": 1.7374, "step": 4337 }, { "epoch": 0.7424268355296937, "grad_norm": 11.787439346313477, 
"learning_rate": 2.471192241871078e-05, "loss": 0.7362, "step": 4338 }, { "epoch": 0.7425979804894746, "grad_norm": 6.672769546508789, "learning_rate": 2.4717626925270968e-05, "loss": 0.8264, "step": 4339 }, { "epoch": 0.7427691254492556, "grad_norm": 11.766725540161133, "learning_rate": 2.4723331431831148e-05, "loss": 1.1676, "step": 4340 }, { "epoch": 0.7429402704090364, "grad_norm": 40.45454788208008, "learning_rate": 2.472903593839133e-05, "loss": 3.9712, "step": 4341 }, { "epoch": 0.7431114153688174, "grad_norm": 26.344924926757812, "learning_rate": 2.4734740444951515e-05, "loss": 5.5252, "step": 4342 }, { "epoch": 0.7432825603285983, "grad_norm": 23.83209228515625, "learning_rate": 2.4740444951511695e-05, "loss": 2.138, "step": 4343 }, { "epoch": 0.7434537052883793, "grad_norm": 31.80006980895996, "learning_rate": 2.4746149458071878e-05, "loss": 3.0915, "step": 4344 }, { "epoch": 0.7436248502481602, "grad_norm": 6.755828380584717, "learning_rate": 2.475185396463206e-05, "loss": 0.5986, "step": 4345 }, { "epoch": 0.7437959952079412, "grad_norm": 20.988128662109375, "learning_rate": 2.475755847119224e-05, "loss": 2.3074, "step": 4346 }, { "epoch": 0.743967140167722, "grad_norm": 15.407384872436523, "learning_rate": 2.4763262977752425e-05, "loss": 1.2107, "step": 4347 }, { "epoch": 0.744138285127503, "grad_norm": 189.76187133789062, "learning_rate": 2.4768967484312608e-05, "loss": 7.4887, "step": 4348 }, { "epoch": 0.7443094300872839, "grad_norm": 15.863279342651367, "learning_rate": 2.477467199087279e-05, "loss": 1.4751, "step": 4349 }, { "epoch": 0.7444805750470649, "grad_norm": 15.989771842956543, "learning_rate": 2.478037649743297e-05, "loss": 1.5714, "step": 4350 }, { "epoch": 0.7446517200068458, "grad_norm": 22.554706573486328, "learning_rate": 2.4786081003993155e-05, "loss": 2.0379, "step": 4351 }, { "epoch": 0.7448228649666268, "grad_norm": 19.29569435119629, "learning_rate": 2.479178551055334e-05, "loss": 1.6695, "step": 4352 }, { "epoch": 
0.7449940099264076, "grad_norm": 32.85340881347656, "learning_rate": 2.479749001711352e-05, "loss": 6.1252, "step": 4353 }, { "epoch": 0.7451651548861886, "grad_norm": 7.867766380310059, "learning_rate": 2.4803194523673702e-05, "loss": 0.5292, "step": 4354 }, { "epoch": 0.7453362998459695, "grad_norm": 31.13640022277832, "learning_rate": 2.4808899030233885e-05, "loss": 3.6394, "step": 4355 }, { "epoch": 0.7455074448057505, "grad_norm": 29.437509536743164, "learning_rate": 2.481460353679407e-05, "loss": 5.7376, "step": 4356 }, { "epoch": 0.7456785897655314, "grad_norm": 22.40192222595215, "learning_rate": 2.482030804335425e-05, "loss": 2.1447, "step": 4357 }, { "epoch": 0.7458497347253124, "grad_norm": 16.596187591552734, "learning_rate": 2.4826012549914432e-05, "loss": 1.5623, "step": 4358 }, { "epoch": 0.7460208796850932, "grad_norm": 18.423601150512695, "learning_rate": 2.4831717056474615e-05, "loss": 1.8746, "step": 4359 }, { "epoch": 0.7461920246448742, "grad_norm": 27.663909912109375, "learning_rate": 2.4837421563034795e-05, "loss": 3.0094, "step": 4360 }, { "epoch": 0.7463631696046551, "grad_norm": 22.51800537109375, "learning_rate": 2.484312606959498e-05, "loss": 2.4867, "step": 4361 }, { "epoch": 0.7465343145644361, "grad_norm": 14.807696342468262, "learning_rate": 2.4848830576155165e-05, "loss": 1.3953, "step": 4362 }, { "epoch": 0.746705459524217, "grad_norm": 27.524484634399414, "learning_rate": 2.485453508271535e-05, "loss": 2.9434, "step": 4363 }, { "epoch": 0.746876604483998, "grad_norm": 20.50353240966797, "learning_rate": 2.486023958927553e-05, "loss": 1.9921, "step": 4364 }, { "epoch": 0.7470477494437788, "grad_norm": 8.223814010620117, "learning_rate": 2.4865944095835712e-05, "loss": 1.2869, "step": 4365 }, { "epoch": 0.7472188944035598, "grad_norm": 9.882466316223145, "learning_rate": 2.4871648602395896e-05, "loss": 0.8515, "step": 4366 }, { "epoch": 0.7473900393633407, "grad_norm": 9.574341773986816, "learning_rate": 2.4877353108956075e-05, 
"loss": 0.617, "step": 4367 }, { "epoch": 0.7475611843231217, "grad_norm": 65.22699737548828, "learning_rate": 2.488305761551626e-05, "loss": 3.1356, "step": 4368 }, { "epoch": 0.7477323292829026, "grad_norm": 3.961726427078247, "learning_rate": 2.4888762122076442e-05, "loss": 0.4726, "step": 4369 }, { "epoch": 0.7479034742426836, "grad_norm": 24.925546646118164, "learning_rate": 2.4894466628636626e-05, "loss": 2.5579, "step": 4370 }, { "epoch": 0.7480746192024645, "grad_norm": 18.773216247558594, "learning_rate": 2.4900171135196806e-05, "loss": 1.7584, "step": 4371 }, { "epoch": 0.7482457641622454, "grad_norm": 29.555532455444336, "learning_rate": 2.490587564175699e-05, "loss": 6.1397, "step": 4372 }, { "epoch": 0.7484169091220264, "grad_norm": 24.256031036376953, "learning_rate": 2.4911580148317172e-05, "loss": 1.9726, "step": 4373 }, { "epoch": 0.7485880540818073, "grad_norm": 16.05461883544922, "learning_rate": 2.4917284654877352e-05, "loss": 1.264, "step": 4374 }, { "epoch": 0.7487591990415883, "grad_norm": 24.213459014892578, "learning_rate": 2.4922989161437536e-05, "loss": 2.3189, "step": 4375 }, { "epoch": 0.7489303440013692, "grad_norm": 29.8961181640625, "learning_rate": 2.492869366799772e-05, "loss": 3.6839, "step": 4376 }, { "epoch": 0.7491014889611501, "grad_norm": 22.435625076293945, "learning_rate": 2.4934398174557903e-05, "loss": 2.2289, "step": 4377 }, { "epoch": 0.749272633920931, "grad_norm": 30.220745086669922, "learning_rate": 2.4940102681118082e-05, "loss": 4.4896, "step": 4378 }, { "epoch": 0.749443778880712, "grad_norm": 18.16118049621582, "learning_rate": 2.4945807187678266e-05, "loss": 1.8377, "step": 4379 }, { "epoch": 0.7496149238404929, "grad_norm": 27.19329833984375, "learning_rate": 2.495151169423845e-05, "loss": 2.8044, "step": 4380 }, { "epoch": 0.7497860688002739, "grad_norm": 22.332542419433594, "learning_rate": 2.495721620079863e-05, "loss": 1.8498, "step": 4381 }, { "epoch": 0.7499572137600548, "grad_norm": 4.663109302520752, 
"learning_rate": 2.4962920707358813e-05, "loss": 0.5393, "step": 4382 }, { "epoch": 0.7501283587198357, "grad_norm": 18.246469497680664, "learning_rate": 2.4968625213918996e-05, "loss": 1.5186, "step": 4383 }, { "epoch": 0.7502995036796166, "grad_norm": 28.977493286132812, "learning_rate": 2.4974329720479176e-05, "loss": 2.8719, "step": 4384 }, { "epoch": 0.7504706486393976, "grad_norm": 6.890769958496094, "learning_rate": 2.4980034227039363e-05, "loss": 0.601, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_nli-pairs_loss": 2.050740957260132, "eval_nli-pairs_runtime": 4.2795, "eval_nli-pairs_samples_per_second": 46.734, "eval_nli-pairs_steps_per_second": 1.636, "eval_sts-test_pearson_cosine": 0.756734064986887, "eval_sts-test_pearson_dot": 0.6528865740820513, "eval_sts-test_pearson_euclidean": 0.7545477323381371, "eval_sts-test_pearson_manhattan": 0.7602184258166524, "eval_sts-test_pearson_max": 0.7602184258166524, "eval_sts-test_spearman_cosine": 0.7444733315413253, "eval_sts-test_spearman_dot": 0.6319213377688324, "eval_sts-test_spearman_euclidean": 0.7398981584440489, "eval_sts-test_spearman_manhattan": 0.7468720146418238, "eval_sts-test_spearman_max": 0.7468720146418238, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_vitaminc-pairs_loss": 1.3987665176391602, "eval_vitaminc-pairs_runtime": 2.7296, "eval_vitaminc-pairs_samples_per_second": 73.272, "eval_vitaminc-pairs_steps_per_second": 2.565, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_qnli-contrastive_loss": 2.7516510486602783, "eval_qnli-contrastive_runtime": 0.6347, "eval_qnli-contrastive_samples_per_second": 315.112, "eval_qnli-contrastive_steps_per_second": 11.029, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_scitail-pairs-qa_loss": 0.22120414674282074, "eval_scitail-pairs-qa_runtime": 1.6102, "eval_scitail-pairs-qa_samples_per_second": 124.21, "eval_scitail-pairs-qa_steps_per_second": 4.347, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_scitail-pairs-pos_loss": 
0.9076427817344666, "eval_scitail-pairs-pos_runtime": 2.6161, "eval_scitail-pairs-pos_samples_per_second": 76.449, "eval_scitail-pairs-pos_steps_per_second": 2.676, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_xsum-pairs_loss": 1.0805269479751587, "eval_xsum-pairs_runtime": 2.6446, "eval_xsum-pairs_samples_per_second": 66.172, "eval_xsum-pairs_steps_per_second": 2.269, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_compression-pairs_loss": 0.44501441717147827, "eval_compression-pairs_runtime": 0.5283, "eval_compression-pairs_samples_per_second": 378.589, "eval_compression-pairs_steps_per_second": 13.251, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_sciq_pairs_loss": 5.368130207061768, "eval_sciq_pairs_runtime": 9.1813, "eval_sciq_pairs_samples_per_second": 21.783, "eval_sciq_pairs_steps_per_second": 0.762, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_qasc_pairs_loss": 5.916055679321289, "eval_qasc_pairs_runtime": 2.6536, "eval_qasc_pairs_samples_per_second": 75.369, "eval_qasc_pairs_steps_per_second": 2.638, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_openbookqa_pairs_loss": 3.2691121101379395, "eval_openbookqa_pairs_runtime": 0.6379, "eval_openbookqa_pairs_samples_per_second": 108.16, "eval_openbookqa_pairs_steps_per_second": 4.703, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_msmarco_pairs_loss": 1.845609426498413, "eval_msmarco_pairs_runtime": 3.9718, "eval_msmarco_pairs_samples_per_second": 50.355, "eval_msmarco_pairs_steps_per_second": 1.762, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_nq_pairs_loss": 2.279620409011841, "eval_nq_pairs_runtime": 8.6017, "eval_nq_pairs_samples_per_second": 23.251, "eval_nq_pairs_steps_per_second": 0.814, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_trivia_pairs_loss": 2.464531421661377, "eval_trivia_pairs_runtime": 12.8394, "eval_trivia_pairs_samples_per_second": 15.577, "eval_trivia_pairs_steps_per_second": 0.545, "step": 4385 }, { "epoch": 0.7504706486393976, 
"eval_quora_pairs_loss": 0.40776023268699646, "eval_quora_pairs_runtime": 1.5837, "eval_quora_pairs_samples_per_second": 126.29, "eval_quora_pairs_steps_per_second": 4.42, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_gooaq_pairs_loss": 1.4506279230117798, "eval_gooaq_pairs_runtime": 2.6527, "eval_gooaq_pairs_samples_per_second": 75.396, "eval_gooaq_pairs_steps_per_second": 2.639, "step": 4385 }, { "epoch": 0.7506417935991785, "grad_norm": 29.356266021728516, "learning_rate": 2.4985738733599546e-05, "loss": 2.6466, "step": 4386 }, { "epoch": 0.7508129385589595, "grad_norm": 20.16213607788086, "learning_rate": 2.499144324015973e-05, "loss": 2.096, "step": 4387 }, { "epoch": 0.7509840835187404, "grad_norm": 6.681097984313965, "learning_rate": 2.499714774671991e-05, "loss": 0.4151, "step": 4388 }, { "epoch": 0.7511552284785213, "grad_norm": 28.097131729125977, "learning_rate": 2.5002852253280093e-05, "loss": 2.6453, "step": 4389 }, { "epoch": 0.7513263734383022, "grad_norm": 23.283533096313477, "learning_rate": 2.5008556759840276e-05, "loss": 1.8409, "step": 4390 }, { "epoch": 0.7514975183980832, "grad_norm": 17.86503791809082, "learning_rate": 2.5014261266400456e-05, "loss": 1.9202, "step": 4391 }, { "epoch": 0.7516686633578641, "grad_norm": 24.048988342285156, "learning_rate": 2.501996577296064e-05, "loss": 2.8925, "step": 4392 }, { "epoch": 0.7518398083176451, "grad_norm": 14.309977531433105, "learning_rate": 2.5025670279520823e-05, "loss": 1.2498, "step": 4393 }, { "epoch": 0.752010953277426, "grad_norm": 23.62467384338379, "learning_rate": 2.5031374786081006e-05, "loss": 2.1623, "step": 4394 }, { "epoch": 0.7521820982372069, "grad_norm": 13.426614761352539, "learning_rate": 2.5037079292641186e-05, "loss": 1.299, "step": 4395 }, { "epoch": 0.7523532431969878, "grad_norm": 8.141080856323242, "learning_rate": 2.504278379920137e-05, "loss": 1.2283, "step": 4396 }, { "epoch": 0.7525243881567688, "grad_norm": 21.36097526550293, "learning_rate": 
2.5048488305761553e-05, "loss": 1.5297, "step": 4397 }, { "epoch": 0.7526955331165497, "grad_norm": 50.34371566772461, "learning_rate": 2.5054192812321733e-05, "loss": 2.2539, "step": 4398 }, { "epoch": 0.7528666780763307, "grad_norm": 1.270694613456726, "learning_rate": 2.5059897318881917e-05, "loss": 0.2376, "step": 4399 }, { "epoch": 0.7530378230361116, "grad_norm": 25.70792579650879, "learning_rate": 2.50656018254421e-05, "loss": 2.8688, "step": 4400 }, { "epoch": 0.7532089679958925, "grad_norm": 21.675689697265625, "learning_rate": 2.5071306332002283e-05, "loss": 2.2722, "step": 4401 }, { "epoch": 0.7533801129556734, "grad_norm": 0.9436509013175964, "learning_rate": 2.5077010838562463e-05, "loss": 0.2405, "step": 4402 }, { "epoch": 0.7535512579154544, "grad_norm": 17.242216110229492, "learning_rate": 2.5082715345122647e-05, "loss": 1.5298, "step": 4403 }, { "epoch": 0.7537224028752353, "grad_norm": 23.45809555053711, "learning_rate": 2.508841985168283e-05, "loss": 1.9691, "step": 4404 }, { "epoch": 0.7538935478350163, "grad_norm": 3.92891788482666, "learning_rate": 2.509412435824301e-05, "loss": 0.555, "step": 4405 }, { "epoch": 0.7540646927947972, "grad_norm": 22.707956314086914, "learning_rate": 2.5099828864803193e-05, "loss": 2.3504, "step": 4406 }, { "epoch": 0.7542358377545781, "grad_norm": 20.21977996826172, "learning_rate": 2.5105533371363377e-05, "loss": 2.0231, "step": 4407 }, { "epoch": 0.754406982714359, "grad_norm": 1.7529772520065308, "learning_rate": 2.5111237877923564e-05, "loss": 0.2485, "step": 4408 }, { "epoch": 0.75457812767414, "grad_norm": 1.6466134786605835, "learning_rate": 2.5116942384483744e-05, "loss": 0.2669, "step": 4409 }, { "epoch": 0.7547492726339209, "grad_norm": 17.274892807006836, "learning_rate": 2.5122646891043927e-05, "loss": 1.6309, "step": 4410 }, { "epoch": 0.7549204175937019, "grad_norm": 21.545635223388672, "learning_rate": 2.512835139760411e-05, "loss": 2.004, "step": 4411 }, { "epoch": 0.7550915625534828, 
"grad_norm": 12.705281257629395, "learning_rate": 2.513405590416429e-05, "loss": 0.8108, "step": 4412 }, { "epoch": 0.7552627075132637, "grad_norm": 21.702417373657227, "learning_rate": 2.5139760410724474e-05, "loss": 1.9521, "step": 4413 }, { "epoch": 0.7554338524730446, "grad_norm": 5.507816314697266, "learning_rate": 2.5145464917284657e-05, "loss": 0.4295, "step": 4414 }, { "epoch": 0.7556049974328256, "grad_norm": 44.05494689941406, "learning_rate": 2.5151169423844837e-05, "loss": 2.7, "step": 4415 }, { "epoch": 0.7557761423926065, "grad_norm": 23.11194610595703, "learning_rate": 2.515687393040502e-05, "loss": 2.84, "step": 4416 }, { "epoch": 0.7559472873523875, "grad_norm": 26.601444244384766, "learning_rate": 2.5162578436965204e-05, "loss": 2.6999, "step": 4417 }, { "epoch": 0.7561184323121684, "grad_norm": 21.454803466796875, "learning_rate": 2.5168282943525387e-05, "loss": 2.186, "step": 4418 }, { "epoch": 0.7562895772719493, "grad_norm": 19.278148651123047, "learning_rate": 2.5173987450085567e-05, "loss": 1.9123, "step": 4419 }, { "epoch": 0.7564607222317302, "grad_norm": 25.827354431152344, "learning_rate": 2.517969195664575e-05, "loss": 2.32, "step": 4420 }, { "epoch": 0.7566318671915112, "grad_norm": 24.202350616455078, "learning_rate": 2.5185396463205934e-05, "loss": 3.0039, "step": 4421 }, { "epoch": 0.7568030121512922, "grad_norm": 16.3718318939209, "learning_rate": 2.5191100969766114e-05, "loss": 1.56, "step": 4422 }, { "epoch": 0.7569741571110731, "grad_norm": 27.989944458007812, "learning_rate": 2.5196805476326297e-05, "loss": 3.6797, "step": 4423 }, { "epoch": 0.7571453020708541, "grad_norm": 29.627376556396484, "learning_rate": 2.520250998288648e-05, "loss": 3.0339, "step": 4424 }, { "epoch": 0.7573164470306349, "grad_norm": 23.632122039794922, "learning_rate": 2.5208214489446664e-05, "loss": 2.2779, "step": 4425 }, { "epoch": 0.7574875919904159, "grad_norm": 9.593125343322754, "learning_rate": 2.5213918996006844e-05, "loss": 0.7422, "step": 
4426 }, { "epoch": 0.7576587369501968, "grad_norm": 17.23970603942871, "learning_rate": 2.5219623502567027e-05, "loss": 1.2251, "step": 4427 }, { "epoch": 0.7578298819099778, "grad_norm": 26.436206817626953, "learning_rate": 2.522532800912721e-05, "loss": 2.0061, "step": 4428 }, { "epoch": 0.7580010268697587, "grad_norm": 5.741855144500732, "learning_rate": 2.523103251568739e-05, "loss": 0.4587, "step": 4429 }, { "epoch": 0.7581721718295397, "grad_norm": 28.37969398498535, "learning_rate": 2.5236737022247574e-05, "loss": 2.671, "step": 4430 }, { "epoch": 0.7583433167893205, "grad_norm": 18.421100616455078, "learning_rate": 2.524244152880776e-05, "loss": 1.7614, "step": 4431 }, { "epoch": 0.7585144617491015, "grad_norm": 4.070728302001953, "learning_rate": 2.5248146035367944e-05, "loss": 0.2731, "step": 4432 }, { "epoch": 0.7586856067088824, "grad_norm": 1.1695162057876587, "learning_rate": 2.5253850541928124e-05, "loss": 0.2524, "step": 4433 }, { "epoch": 0.7588567516686634, "grad_norm": 22.150390625, "learning_rate": 2.5259555048488308e-05, "loss": 2.1763, "step": 4434 }, { "epoch": 0.7590278966284443, "grad_norm": 24.137784957885742, "learning_rate": 2.526525955504849e-05, "loss": 2.7464, "step": 4435 }, { "epoch": 0.7591990415882253, "grad_norm": 97.06147003173828, "learning_rate": 2.527096406160867e-05, "loss": 8.2917, "step": 4436 }, { "epoch": 0.7593701865480061, "grad_norm": 18.736169815063477, "learning_rate": 2.5276668568168854e-05, "loss": 1.8898, "step": 4437 }, { "epoch": 0.7595413315077871, "grad_norm": 6.186467170715332, "learning_rate": 2.5282373074729038e-05, "loss": 0.6734, "step": 4438 }, { "epoch": 0.759712476467568, "grad_norm": 2.713296413421631, "learning_rate": 2.528807758128922e-05, "loss": 0.2576, "step": 4439 }, { "epoch": 0.759883621427349, "grad_norm": 5.079885482788086, "learning_rate": 2.52937820878494e-05, "loss": 0.928, "step": 4440 }, { "epoch": 0.7600547663871299, "grad_norm": 7.678714275360107, "learning_rate": 
2.5299486594409585e-05, "loss": 0.9906, "step": 4441 }, { "epoch": 0.7602259113469109, "grad_norm": 28.940954208374023, "learning_rate": 2.5305191100969768e-05, "loss": 2.2819, "step": 4442 }, { "epoch": 0.7603970563066917, "grad_norm": 19.63545799255371, "learning_rate": 2.5310895607529948e-05, "loss": 1.5862, "step": 4443 }, { "epoch": 0.7605682012664727, "grad_norm": 24.078157424926758, "learning_rate": 2.531660011409013e-05, "loss": 1.9171, "step": 4444 }, { "epoch": 0.7607393462262536, "grad_norm": 25.036775588989258, "learning_rate": 2.5322304620650315e-05, "loss": 2.7408, "step": 4445 }, { "epoch": 0.7609104911860346, "grad_norm": 25.063066482543945, "learning_rate": 2.5328009127210495e-05, "loss": 2.0789, "step": 4446 }, { "epoch": 0.7610816361458155, "grad_norm": 26.909889221191406, "learning_rate": 2.5333713633770678e-05, "loss": 3.0789, "step": 4447 }, { "epoch": 0.7612527811055965, "grad_norm": 55.91215896606445, "learning_rate": 2.533941814033086e-05, "loss": 2.2273, "step": 4448 }, { "epoch": 0.7614239260653773, "grad_norm": 16.553728103637695, "learning_rate": 2.5345122646891045e-05, "loss": 1.4893, "step": 4449 }, { "epoch": 0.7615950710251583, "grad_norm": 35.240394592285156, "learning_rate": 2.5350827153451225e-05, "loss": 6.035, "step": 4450 }, { "epoch": 0.7617662159849392, "grad_norm": 9.102727890014648, "learning_rate": 2.5356531660011408e-05, "loss": 0.5959, "step": 4451 }, { "epoch": 0.7619373609447202, "grad_norm": 19.193933486938477, "learning_rate": 2.536223616657159e-05, "loss": 1.4628, "step": 4452 }, { "epoch": 0.7621085059045011, "grad_norm": 31.62565803527832, "learning_rate": 2.536794067313177e-05, "loss": 5.7421, "step": 4453 }, { "epoch": 0.7622796508642821, "grad_norm": 19.388065338134766, "learning_rate": 2.5373645179691958e-05, "loss": 1.8378, "step": 4454 }, { "epoch": 0.7624507958240629, "grad_norm": 4.330984115600586, "learning_rate": 2.537934968625214e-05, "loss": 0.4705, "step": 4455 }, { "epoch": 0.7626219407838439, 
"grad_norm": 5.933356761932373, "learning_rate": 2.5385054192812325e-05, "loss": 0.5481, "step": 4456 }, { "epoch": 0.7627930857436248, "grad_norm": 30.84785270690918, "learning_rate": 2.5390758699372505e-05, "loss": 5.6798, "step": 4457 }, { "epoch": 0.7629642307034058, "grad_norm": 22.500408172607422, "learning_rate": 2.539646320593269e-05, "loss": 2.1908, "step": 4458 }, { "epoch": 0.7631353756631867, "grad_norm": 1.4484208822250366, "learning_rate": 2.5402167712492872e-05, "loss": 0.2429, "step": 4459 }, { "epoch": 0.7633065206229677, "grad_norm": 18.820663452148438, "learning_rate": 2.5407872219053052e-05, "loss": 1.5979, "step": 4460 }, { "epoch": 0.7634776655827485, "grad_norm": 29.224742889404297, "learning_rate": 2.5413576725613235e-05, "loss": 3.4263, "step": 4461 }, { "epoch": 0.7636488105425295, "grad_norm": 27.641237258911133, "learning_rate": 2.541928123217342e-05, "loss": 2.9448, "step": 4462 }, { "epoch": 0.7638199555023104, "grad_norm": 28.82822608947754, "learning_rate": 2.5424985738733602e-05, "loss": 3.3548, "step": 4463 }, { "epoch": 0.7639911004620914, "grad_norm": 31.449243545532227, "learning_rate": 2.5430690245293782e-05, "loss": 3.569, "step": 4464 }, { "epoch": 0.7641622454218723, "grad_norm": 22.96309471130371, "learning_rate": 2.5436394751853965e-05, "loss": 2.7067, "step": 4465 }, { "epoch": 0.7643333903816533, "grad_norm": 54.38570022583008, "learning_rate": 2.544209925841415e-05, "loss": 1.9878, "step": 4466 }, { "epoch": 0.7645045353414341, "grad_norm": 13.225268363952637, "learning_rate": 2.544780376497433e-05, "loss": 0.9805, "step": 4467 }, { "epoch": 0.7646756803012151, "grad_norm": 11.44771957397461, "learning_rate": 2.5453508271534512e-05, "loss": 0.8328, "step": 4468 }, { "epoch": 0.764846825260996, "grad_norm": 1.3261395692825317, "learning_rate": 2.5459212778094695e-05, "loss": 0.2307, "step": 4469 }, { "epoch": 0.765017970220777, "grad_norm": 2.594686269760132, "learning_rate": 2.546491728465488e-05, "loss": 0.2594, 
"step": 4470 }, { "epoch": 0.7651891151805579, "grad_norm": 1.5864319801330566, "learning_rate": 2.547062179121506e-05, "loss": 0.2351, "step": 4471 }, { "epoch": 0.7653602601403389, "grad_norm": 23.80762481689453, "learning_rate": 2.5476326297775242e-05, "loss": 2.3596, "step": 4472 }, { "epoch": 0.7655314051001199, "grad_norm": 0.9860055446624756, "learning_rate": 2.5482030804335426e-05, "loss": 0.2113, "step": 4473 }, { "epoch": 0.7657025500599007, "grad_norm": 24.054868698120117, "learning_rate": 2.5487735310895606e-05, "loss": 2.4686, "step": 4474 }, { "epoch": 0.7658736950196817, "grad_norm": 20.621498107910156, "learning_rate": 2.549343981745579e-05, "loss": 1.9467, "step": 4475 }, { "epoch": 0.7660448399794626, "grad_norm": 23.417434692382812, "learning_rate": 2.5499144324015972e-05, "loss": 2.6939, "step": 4476 }, { "epoch": 0.7662159849392436, "grad_norm": 35.24362564086914, "learning_rate": 2.5504848830576156e-05, "loss": 2.1405, "step": 4477 }, { "epoch": 0.7663871298990245, "grad_norm": 24.250762939453125, "learning_rate": 2.551055333713634e-05, "loss": 2.0974, "step": 4478 }, { "epoch": 0.7665582748588055, "grad_norm": 34.85161590576172, "learning_rate": 2.5516257843696522e-05, "loss": 2.6934, "step": 4479 }, { "epoch": 0.7667294198185863, "grad_norm": 23.28230857849121, "learning_rate": 2.5521962350256706e-05, "loss": 2.4125, "step": 4480 }, { "epoch": 0.7669005647783673, "grad_norm": 24.412673950195312, "learning_rate": 2.5527666856816886e-05, "loss": 2.0085, "step": 4481 }, { "epoch": 0.7670717097381482, "grad_norm": 5.911852836608887, "learning_rate": 2.553337136337707e-05, "loss": 0.4889, "step": 4482 }, { "epoch": 0.7672428546979292, "grad_norm": 15.7787504196167, "learning_rate": 2.5539075869937253e-05, "loss": 1.6727, "step": 4483 }, { "epoch": 0.7674139996577101, "grad_norm": 30.21489715576172, "learning_rate": 2.5544780376497433e-05, "loss": 3.3485, "step": 4484 }, { "epoch": 0.767585144617491, "grad_norm": 1.4315123558044434, 
"learning_rate": 2.5550484883057616e-05, "loss": 0.2253, "step": 4485 }, { "epoch": 0.7677562895772719, "grad_norm": 43.58045959472656, "learning_rate": 2.55561893896178e-05, "loss": 2.2129, "step": 4486 }, { "epoch": 0.7679274345370529, "grad_norm": 15.73321533203125, "learning_rate": 2.5561893896177983e-05, "loss": 1.8963, "step": 4487 }, { "epoch": 0.7680985794968338, "grad_norm": 15.624593734741211, "learning_rate": 2.5567598402738163e-05, "loss": 1.4826, "step": 4488 }, { "epoch": 0.7682697244566148, "grad_norm": 19.84630012512207, "learning_rate": 2.5573302909298346e-05, "loss": 1.9842, "step": 4489 }, { "epoch": 0.7684408694163957, "grad_norm": 23.67464828491211, "learning_rate": 2.557900741585853e-05, "loss": 1.9172, "step": 4490 }, { "epoch": 0.7686120143761767, "grad_norm": 26.324172973632812, "learning_rate": 2.558471192241871e-05, "loss": 2.4427, "step": 4491 }, { "epoch": 0.7687831593359575, "grad_norm": 29.327041625976562, "learning_rate": 2.5590416428978893e-05, "loss": 2.9564, "step": 4492 }, { "epoch": 0.7689543042957385, "grad_norm": 18.07971954345703, "learning_rate": 2.5596120935539076e-05, "loss": 1.5832, "step": 4493 }, { "epoch": 0.7691254492555194, "grad_norm": 25.531024932861328, "learning_rate": 2.560182544209926e-05, "loss": 2.5085, "step": 4494 }, { "epoch": 0.7692965942153004, "grad_norm": 26.901735305786133, "learning_rate": 2.560752994865944e-05, "loss": 2.7555, "step": 4495 }, { "epoch": 0.7694677391750813, "grad_norm": 1.0921388864517212, "learning_rate": 2.5613234455219623e-05, "loss": 0.2269, "step": 4496 }, { "epoch": 0.7696388841348623, "grad_norm": 24.066415786743164, "learning_rate": 2.5618938961779806e-05, "loss": 2.2307, "step": 4497 }, { "epoch": 0.7698100290946431, "grad_norm": 27.785593032836914, "learning_rate": 2.5624643468339986e-05, "loss": 3.4921, "step": 4498 }, { "epoch": 0.7699811740544241, "grad_norm": 29.524158477783203, "learning_rate": 2.563034797490017e-05, "loss": 3.7322, "step": 4499 }, { "epoch": 
0.770152319014205, "grad_norm": 8.157841682434082, "learning_rate": 2.5636052481460356e-05, "loss": 0.6122, "step": 4500 }, { "epoch": 0.770323463973986, "grad_norm": 0.7895174026489258, "learning_rate": 2.564175698802054e-05, "loss": 0.2004, "step": 4501 }, { "epoch": 0.7704946089337669, "grad_norm": 19.612058639526367, "learning_rate": 2.564746149458072e-05, "loss": 1.4928, "step": 4502 }, { "epoch": 0.7706657538935479, "grad_norm": 24.910694122314453, "learning_rate": 2.5653166001140903e-05, "loss": 3.1004, "step": 4503 }, { "epoch": 0.7708368988533287, "grad_norm": 16.94511604309082, "learning_rate": 2.5658870507701087e-05, "loss": 1.5157, "step": 4504 }, { "epoch": 0.7710080438131097, "grad_norm": 18.35201072692871, "learning_rate": 2.5664575014261267e-05, "loss": 1.6228, "step": 4505 }, { "epoch": 0.7711791887728906, "grad_norm": 1.8921977281570435, "learning_rate": 2.567027952082145e-05, "loss": 0.2404, "step": 4506 }, { "epoch": 0.7713503337326716, "grad_norm": 15.37820053100586, "learning_rate": 2.5675984027381633e-05, "loss": 1.4636, "step": 4507 }, { "epoch": 0.7715214786924525, "grad_norm": 45.28339767456055, "learning_rate": 2.5681688533941817e-05, "loss": 5.8686, "step": 4508 }, { "epoch": 0.7716926236522335, "grad_norm": 10.306188583374023, "learning_rate": 2.5687393040501997e-05, "loss": 0.8692, "step": 4509 }, { "epoch": 0.7718637686120143, "grad_norm": 7.009439468383789, "learning_rate": 2.569309754706218e-05, "loss": 0.4821, "step": 4510 }, { "epoch": 0.7720349135717953, "grad_norm": 16.78397560119629, "learning_rate": 2.5698802053622363e-05, "loss": 1.4687, "step": 4511 }, { "epoch": 0.7722060585315762, "grad_norm": 27.457345962524414, "learning_rate": 2.5704506560182543e-05, "loss": 5.8033, "step": 4512 }, { "epoch": 0.7723772034913572, "grad_norm": 18.70033836364746, "learning_rate": 2.5710211066742727e-05, "loss": 1.7159, "step": 4513 }, { "epoch": 0.7725483484511381, "grad_norm": 6.5526204109191895, "learning_rate": 2.571591557330291e-05, 
"loss": 0.6285, "step": 4514 }, { "epoch": 0.7727194934109191, "grad_norm": 27.923290252685547, "learning_rate": 2.572162007986309e-05, "loss": 2.8709, "step": 4515 }, { "epoch": 0.7728906383706999, "grad_norm": 21.99191665649414, "learning_rate": 2.5727324586423274e-05, "loss": 1.7881, "step": 4516 }, { "epoch": 0.7730617833304809, "grad_norm": 23.24179458618164, "learning_rate": 2.5733029092983457e-05, "loss": 2.0511, "step": 4517 }, { "epoch": 0.7732329282902618, "grad_norm": 17.54611587524414, "learning_rate": 2.573873359954364e-05, "loss": 1.326, "step": 4518 }, { "epoch": 0.7734040732500428, "grad_norm": 30.525188446044922, "learning_rate": 2.574443810610382e-05, "loss": 5.8319, "step": 4519 }, { "epoch": 0.7735752182098237, "grad_norm": 22.358930587768555, "learning_rate": 2.5750142612664004e-05, "loss": 2.7106, "step": 4520 }, { "epoch": 0.7737463631696047, "grad_norm": 28.14476776123047, "learning_rate": 2.5755847119224187e-05, "loss": 2.6532, "step": 4521 }, { "epoch": 0.7739175081293856, "grad_norm": 14.640401840209961, "learning_rate": 2.576155162578437e-05, "loss": 1.2756, "step": 4522 }, { "epoch": 0.7740886530891665, "grad_norm": 22.830739974975586, "learning_rate": 2.5767256132344554e-05, "loss": 1.9665, "step": 4523 }, { "epoch": 0.7742597980489475, "grad_norm": 2.0804736614227295, "learning_rate": 2.5772960638904737e-05, "loss": 0.2506, "step": 4524 }, { "epoch": 0.7744309430087284, "grad_norm": 80.72746276855469, "learning_rate": 2.577866514546492e-05, "loss": 2.0284, "step": 4525 }, { "epoch": 0.7746020879685094, "grad_norm": 1.6171777248382568, "learning_rate": 2.57843696520251e-05, "loss": 0.2526, "step": 4526 }, { "epoch": 0.7747732329282903, "grad_norm": 1.1948031187057495, "learning_rate": 2.5790074158585284e-05, "loss": 0.2329, "step": 4527 }, { "epoch": 0.7749443778880712, "grad_norm": 16.24471664428711, "learning_rate": 2.5795778665145467e-05, "loss": 1.2654, "step": 4528 }, { "epoch": 0.7751155228478521, "grad_norm": 11.590794563293457, 
"learning_rate": 2.5801483171705647e-05, "loss": 0.7527, "step": 4529 }, { "epoch": 0.7752866678076331, "grad_norm": 21.045690536499023, "learning_rate": 2.580718767826583e-05, "loss": 2.1076, "step": 4530 }, { "epoch": 0.775457812767414, "grad_norm": 29.146739959716797, "learning_rate": 2.5812892184826014e-05, "loss": 2.3028, "step": 4531 }, { "epoch": 0.775628957727195, "grad_norm": 27.92205810546875, "learning_rate": 2.5818596691386197e-05, "loss": 2.6337, "step": 4532 }, { "epoch": 0.7758001026869759, "grad_norm": 4.507087707519531, "learning_rate": 2.5824301197946377e-05, "loss": 0.3644, "step": 4533 }, { "epoch": 0.7759712476467568, "grad_norm": 31.79628562927246, "learning_rate": 2.583000570450656e-05, "loss": 2.0363, "step": 4534 }, { "epoch": 0.7761423926065377, "grad_norm": 20.116243362426758, "learning_rate": 2.5835710211066744e-05, "loss": 2.037, "step": 4535 }, { "epoch": 0.7763135375663187, "grad_norm": 20.690771102905273, "learning_rate": 2.5841414717626924e-05, "loss": 2.5927, "step": 4536 }, { "epoch": 0.7764846825260996, "grad_norm": 26.378053665161133, "learning_rate": 2.5847119224187108e-05, "loss": 2.4488, "step": 4537 }, { "epoch": 0.7766558274858806, "grad_norm": 1.2249003648757935, "learning_rate": 2.585282373074729e-05, "loss": 0.2434, "step": 4538 }, { "epoch": 0.7768269724456615, "grad_norm": 6.3218793869018555, "learning_rate": 2.5858528237307474e-05, "loss": 0.5515, "step": 4539 }, { "epoch": 0.7769981174054424, "grad_norm": 2.316464424133301, "learning_rate": 2.5864232743867654e-05, "loss": 0.2183, "step": 4540 }, { "epoch": 0.7771692623652233, "grad_norm": 19.063066482543945, "learning_rate": 2.5869937250427838e-05, "loss": 1.6202, "step": 4541 }, { "epoch": 0.7773404073250043, "grad_norm": 13.33887767791748, "learning_rate": 2.587564175698802e-05, "loss": 1.0805, "step": 4542 }, { "epoch": 0.7775115522847852, "grad_norm": 60.49800109863281, "learning_rate": 2.58813462635482e-05, "loss": 2.2874, "step": 4543 }, { "epoch": 
0.7776826972445662, "grad_norm": 1.045248031616211, "learning_rate": 2.5887050770108384e-05, "loss": 0.2115, "step": 4544 }, { "epoch": 0.7778538422043471, "grad_norm": 0.962874710559845, "learning_rate": 2.589275527666857e-05, "loss": 0.2111, "step": 4545 }, { "epoch": 0.778024987164128, "grad_norm": 9.272252082824707, "learning_rate": 2.589845978322875e-05, "loss": 0.6141, "step": 4546 }, { "epoch": 0.7781961321239089, "grad_norm": 1.7271431684494019, "learning_rate": 2.5904164289788935e-05, "loss": 0.2153, "step": 4547 }, { "epoch": 0.7783672770836899, "grad_norm": 10.894009590148926, "learning_rate": 2.5909868796349118e-05, "loss": 0.7097, "step": 4548 }, { "epoch": 0.7785384220434708, "grad_norm": 26.39044761657715, "learning_rate": 2.59155733029093e-05, "loss": 2.3284, "step": 4549 }, { "epoch": 0.7787095670032518, "grad_norm": 18.677852630615234, "learning_rate": 2.592127780946948e-05, "loss": 1.6088, "step": 4550 }, { "epoch": 0.7788807119630327, "grad_norm": 24.6732177734375, "learning_rate": 2.5926982316029665e-05, "loss": 3.2763, "step": 4551 }, { "epoch": 0.7790518569228136, "grad_norm": 19.83483123779297, "learning_rate": 2.5932686822589848e-05, "loss": 1.9389, "step": 4552 }, { "epoch": 0.7792230018825945, "grad_norm": 20.399280548095703, "learning_rate": 2.5938391329150028e-05, "loss": 1.7964, "step": 4553 }, { "epoch": 0.7793941468423755, "grad_norm": 17.091896057128906, "learning_rate": 2.594409583571021e-05, "loss": 1.3734, "step": 4554 }, { "epoch": 0.7795652918021564, "grad_norm": 31.487939834594727, "learning_rate": 2.5949800342270395e-05, "loss": 2.4755, "step": 4555 }, { "epoch": 0.7797364367619374, "grad_norm": 31.707317352294922, "learning_rate": 2.5955504848830578e-05, "loss": 4.5809, "step": 4556 }, { "epoch": 0.7799075817217183, "grad_norm": 14.886727333068848, "learning_rate": 2.5961209355390758e-05, "loss": 0.8313, "step": 4557 }, { "epoch": 0.7800787266814992, "grad_norm": 21.29567527770996, "learning_rate": 2.596691386195094e-05, 
"loss": 1.757, "step": 4558 }, { "epoch": 0.7802498716412801, "grad_norm": 68.5009765625, "learning_rate": 2.5972618368511125e-05, "loss": 2.4949, "step": 4559 }, { "epoch": 0.7804210166010611, "grad_norm": 34.35353088378906, "learning_rate": 2.5978322875071305e-05, "loss": 2.2289, "step": 4560 }, { "epoch": 0.780592161560842, "grad_norm": 23.028303146362305, "learning_rate": 2.598402738163149e-05, "loss": 2.4301, "step": 4561 }, { "epoch": 0.780763306520623, "grad_norm": 19.792776107788086, "learning_rate": 2.5989731888191672e-05, "loss": 1.9774, "step": 4562 }, { "epoch": 0.7809344514804039, "grad_norm": 21.630163192749023, "learning_rate": 2.5995436394751855e-05, "loss": 1.9028, "step": 4563 }, { "epoch": 0.7811055964401848, "grad_norm": 28.1812744140625, "learning_rate": 2.6001140901312035e-05, "loss": 3.1802, "step": 4564 }, { "epoch": 0.7812767413999657, "grad_norm": 24.079713821411133, "learning_rate": 2.600684540787222e-05, "loss": 2.3985, "step": 4565 }, { "epoch": 0.7814478863597467, "grad_norm": 20.737655639648438, "learning_rate": 2.6012549914432402e-05, "loss": 2.114, "step": 4566 }, { "epoch": 0.7816190313195276, "grad_norm": 24.673608779907227, "learning_rate": 2.6018254420992582e-05, "loss": 2.7534, "step": 4567 }, { "epoch": 0.7817901762793086, "grad_norm": 26.247447967529297, "learning_rate": 2.602395892755277e-05, "loss": 2.557, "step": 4568 }, { "epoch": 0.7819613212390895, "grad_norm": 5.863075256347656, "learning_rate": 2.6029663434112952e-05, "loss": 0.5004, "step": 4569 }, { "epoch": 0.7821324661988704, "grad_norm": 17.882400512695312, "learning_rate": 2.6035367940673135e-05, "loss": 1.7885, "step": 4570 }, { "epoch": 0.7823036111586513, "grad_norm": 38.73212814331055, "learning_rate": 2.6041072447233315e-05, "loss": 6.2463, "step": 4571 }, { "epoch": 0.7824747561184323, "grad_norm": 22.570146560668945, "learning_rate": 2.60467769537935e-05, "loss": 2.3429, "step": 4572 }, { "epoch": 0.7826459010782133, "grad_norm": 33.48434066772461, 
"learning_rate": 2.6052481460353682e-05, "loss": 4.7106, "step": 4573 }, { "epoch": 0.7828170460379942, "grad_norm": 8.184353828430176, "learning_rate": 2.6058185966913862e-05, "loss": 0.5866, "step": 4574 }, { "epoch": 0.7829881909977752, "grad_norm": 24.859272003173828, "learning_rate": 2.6063890473474045e-05, "loss": 2.7946, "step": 4575 }, { "epoch": 0.783159335957556, "grad_norm": 24.745906829833984, "learning_rate": 2.606959498003423e-05, "loss": 2.6296, "step": 4576 }, { "epoch": 0.783330480917337, "grad_norm": 21.468034744262695, "learning_rate": 2.6075299486594412e-05, "loss": 2.0762, "step": 4577 }, { "epoch": 0.7835016258771179, "grad_norm": 19.518083572387695, "learning_rate": 2.6081003993154592e-05, "loss": 1.7499, "step": 4578 }, { "epoch": 0.7836727708368989, "grad_norm": 25.370912551879883, "learning_rate": 2.6086708499714776e-05, "loss": 2.0335, "step": 4579 }, { "epoch": 0.7838439157966798, "grad_norm": 9.235052108764648, "learning_rate": 2.609241300627496e-05, "loss": 0.7306, "step": 4580 }, { "epoch": 0.7840150607564608, "grad_norm": 26.33196449279785, "learning_rate": 2.609811751283514e-05, "loss": 2.8747, "step": 4581 }, { "epoch": 0.7841862057162416, "grad_norm": 22.177465438842773, "learning_rate": 2.6103822019395322e-05, "loss": 1.9306, "step": 4582 }, { "epoch": 0.7843573506760226, "grad_norm": 15.260997772216797, "learning_rate": 2.6109526525955506e-05, "loss": 1.5436, "step": 4583 }, { "epoch": 0.7845284956358035, "grad_norm": 17.693405151367188, "learning_rate": 2.6115231032515686e-05, "loss": 1.7023, "step": 4584 }, { "epoch": 0.7846996405955845, "grad_norm": 19.666189193725586, "learning_rate": 2.612093553907587e-05, "loss": 1.7938, "step": 4585 }, { "epoch": 0.7848707855553654, "grad_norm": 1.3055251836776733, "learning_rate": 2.6126640045636052e-05, "loss": 0.2317, "step": 4586 }, { "epoch": 0.7850419305151464, "grad_norm": 23.14202880859375, "learning_rate": 2.6132344552196236e-05, "loss": 2.4039, "step": 4587 }, { "epoch": 
0.7852130754749272, "grad_norm": 24.73307991027832, "learning_rate": 2.6138049058756416e-05, "loss": 2.5498, "step": 4588 }, { "epoch": 0.7853842204347082, "grad_norm": 14.071855545043945, "learning_rate": 2.61437535653166e-05, "loss": 1.1738, "step": 4589 }, { "epoch": 0.7855553653944891, "grad_norm": 31.454723358154297, "learning_rate": 2.6149458071876783e-05, "loss": 6.066, "step": 4590 }, { "epoch": 0.7857265103542701, "grad_norm": 10.21793270111084, "learning_rate": 2.6155162578436966e-05, "loss": 0.7021, "step": 4591 }, { "epoch": 0.785897655314051, "grad_norm": 0.9553564190864563, "learning_rate": 2.616086708499715e-05, "loss": 0.2104, "step": 4592 }, { "epoch": 0.786068800273832, "grad_norm": 10.402186393737793, "learning_rate": 2.6166571591557333e-05, "loss": 0.6858, "step": 4593 }, { "epoch": 0.7862399452336128, "grad_norm": 83.13465118408203, "learning_rate": 2.6172276098117516e-05, "loss": 8.0191, "step": 4594 }, { "epoch": 0.7864110901933938, "grad_norm": 79.72745513916016, "learning_rate": 2.6177980604677696e-05, "loss": 7.8977, "step": 4595 }, { "epoch": 0.7865822351531747, "grad_norm": 19.631969451904297, "learning_rate": 2.618368511123788e-05, "loss": 1.6448, "step": 4596 }, { "epoch": 0.7867533801129557, "grad_norm": 21.243122100830078, "learning_rate": 2.6189389617798063e-05, "loss": 2.725, "step": 4597 }, { "epoch": 0.7869245250727366, "grad_norm": 27.33664321899414, "learning_rate": 2.6195094124358243e-05, "loss": 6.0632, "step": 4598 }, { "epoch": 0.7870956700325176, "grad_norm": 27.157442092895508, "learning_rate": 2.6200798630918426e-05, "loss": 2.6574, "step": 4599 }, { "epoch": 0.7872668149922984, "grad_norm": 16.595340728759766, "learning_rate": 2.620650313747861e-05, "loss": 1.8749, "step": 4600 }, { "epoch": 0.7874379599520794, "grad_norm": 5.997491836547852, "learning_rate": 2.6212207644038793e-05, "loss": 0.8366, "step": 4601 }, { "epoch": 0.7876091049118603, "grad_norm": 8.601006507873535, "learning_rate": 2.6217912150598973e-05, 
"loss": 0.6105, "step": 4602 }, { "epoch": 0.7877802498716413, "grad_norm": 22.949264526367188, "learning_rate": 2.6223616657159156e-05, "loss": 2.0644, "step": 4603 }, { "epoch": 0.7879513948314222, "grad_norm": 20.955198287963867, "learning_rate": 2.622932116371934e-05, "loss": 1.9205, "step": 4604 }, { "epoch": 0.7881225397912032, "grad_norm": 4.057135105133057, "learning_rate": 2.623502567027952e-05, "loss": 0.531, "step": 4605 }, { "epoch": 0.788293684750984, "grad_norm": 38.576942443847656, "learning_rate": 2.6240730176839703e-05, "loss": 1.9383, "step": 4606 }, { "epoch": 0.788464829710765, "grad_norm": 28.404165267944336, "learning_rate": 2.6246434683399886e-05, "loss": 2.4168, "step": 4607 }, { "epoch": 0.7886359746705459, "grad_norm": 30.351381301879883, "learning_rate": 2.625213918996007e-05, "loss": 5.746, "step": 4608 }, { "epoch": 0.7888071196303269, "grad_norm": 6.266965389251709, "learning_rate": 2.625784369652025e-05, "loss": 0.5733, "step": 4609 }, { "epoch": 0.7889782645901078, "grad_norm": 20.09222984313965, "learning_rate": 2.6263548203080433e-05, "loss": 1.7396, "step": 4610 }, { "epoch": 0.7891494095498888, "grad_norm": 29.68480110168457, "learning_rate": 2.6269252709640617e-05, "loss": 3.2917, "step": 4611 }, { "epoch": 0.7893205545096696, "grad_norm": 28.49522590637207, "learning_rate": 2.6274957216200797e-05, "loss": 3.904, "step": 4612 }, { "epoch": 0.7894916994694506, "grad_norm": 24.60774803161621, "learning_rate": 2.628066172276098e-05, "loss": 2.4849, "step": 4613 }, { "epoch": 0.7896628444292315, "grad_norm": 19.17363929748535, "learning_rate": 2.6286366229321167e-05, "loss": 1.6276, "step": 4614 }, { "epoch": 0.7898339893890125, "grad_norm": 27.375503540039062, "learning_rate": 2.6292070735881347e-05, "loss": 3.1904, "step": 4615 }, { "epoch": 0.7900051343487934, "grad_norm": 7.622123718261719, "learning_rate": 2.629777524244153e-05, "loss": 0.6504, "step": 4616 }, { "epoch": 0.7901762793085744, "grad_norm": 18.307863235473633, 
"learning_rate": 2.6303479749001713e-05, "loss": 1.6608, "step": 4617 }, { "epoch": 0.7903474242683552, "grad_norm": 23.98590850830078, "learning_rate": 2.6309184255561897e-05, "loss": 2.2049, "step": 4618 }, { "epoch": 0.7905185692281362, "grad_norm": 5.612167835235596, "learning_rate": 2.6314888762122077e-05, "loss": 0.552, "step": 4619 }, { "epoch": 0.7906897141879171, "grad_norm": 26.797847747802734, "learning_rate": 2.632059326868226e-05, "loss": 2.891, "step": 4620 }, { "epoch": 0.7908608591476981, "grad_norm": 26.22957420349121, "learning_rate": 2.6326297775242444e-05, "loss": 2.4378, "step": 4621 }, { "epoch": 0.791032004107479, "grad_norm": 1.1691210269927979, "learning_rate": 2.6332002281802624e-05, "loss": 0.2558, "step": 4622 }, { "epoch": 0.79120314906726, "grad_norm": 3.431807041168213, "learning_rate": 2.6337706788362807e-05, "loss": 0.2987, "step": 4623 }, { "epoch": 0.791374294027041, "grad_norm": 9.09062385559082, "learning_rate": 2.634341129492299e-05, "loss": 1.2888, "step": 4624 }, { "epoch": 0.7915454389868218, "grad_norm": 15.95527172088623, "learning_rate": 2.6349115801483174e-05, "loss": 1.237, "step": 4625 }, { "epoch": 0.7917165839466028, "grad_norm": 15.407099723815918, "learning_rate": 2.6354820308043354e-05, "loss": 1.4981, "step": 4626 }, { "epoch": 0.7918877289063837, "grad_norm": 21.4428653717041, "learning_rate": 2.6360524814603537e-05, "loss": 1.9965, "step": 4627 }, { "epoch": 0.7920588738661647, "grad_norm": 32.93854522705078, "learning_rate": 2.636622932116372e-05, "loss": 1.6353, "step": 4628 }, { "epoch": 0.7922300188259456, "grad_norm": 17.02910804748535, "learning_rate": 2.63719338277239e-05, "loss": 1.6158, "step": 4629 }, { "epoch": 0.7924011637857266, "grad_norm": 18.701148986816406, "learning_rate": 2.6377638334284084e-05, "loss": 1.4652, "step": 4630 }, { "epoch": 0.7925723087455074, "grad_norm": 20.513490676879883, "learning_rate": 2.6383342840844267e-05, "loss": 2.1445, "step": 4631 }, { "epoch": 0.7927434537052884, 
"grad_norm": 33.653228759765625, "learning_rate": 2.638904734740445e-05, "loss": 3.4384, "step": 4632 }, { "epoch": 0.7929145986650693, "grad_norm": 22.135591506958008, "learning_rate": 2.639475185396463e-05, "loss": 1.9552, "step": 4633 }, { "epoch": 0.7930857436248503, "grad_norm": 7.528356552124023, "learning_rate": 2.6400456360524814e-05, "loss": 0.6067, "step": 4634 }, { "epoch": 0.7932568885846312, "grad_norm": 17.62458038330078, "learning_rate": 2.6406160867084997e-05, "loss": 1.4482, "step": 4635 }, { "epoch": 0.7934280335444122, "grad_norm": 1.2217499017715454, "learning_rate": 2.6411865373645177e-05, "loss": 0.2197, "step": 4636 }, { "epoch": 0.793599178504193, "grad_norm": 26.31184196472168, "learning_rate": 2.6417569880205364e-05, "loss": 2.5345, "step": 4637 }, { "epoch": 0.793770323463974, "grad_norm": 1.5256792306900024, "learning_rate": 2.6423274386765547e-05, "loss": 0.2251, "step": 4638 }, { "epoch": 0.7939414684237549, "grad_norm": 14.517436981201172, "learning_rate": 2.642897889332573e-05, "loss": 0.9695, "step": 4639 }, { "epoch": 0.7941126133835359, "grad_norm": 23.73163414001465, "learning_rate": 2.643468339988591e-05, "loss": 2.5979, "step": 4640 }, { "epoch": 0.7942837583433168, "grad_norm": 4.0032172203063965, "learning_rate": 2.6440387906446094e-05, "loss": 0.3854, "step": 4641 }, { "epoch": 0.7944549033030978, "grad_norm": 17.482643127441406, "learning_rate": 2.6446092413006278e-05, "loss": 1.5197, "step": 4642 }, { "epoch": 0.7946260482628786, "grad_norm": 14.50070571899414, "learning_rate": 2.6451796919566458e-05, "loss": 1.1852, "step": 4643 }, { "epoch": 0.7947971932226596, "grad_norm": 25.02318572998047, "learning_rate": 2.645750142612664e-05, "loss": 2.3457, "step": 4644 }, { "epoch": 0.7949683381824405, "grad_norm": 31.2506103515625, "learning_rate": 2.6463205932686824e-05, "loss": 2.6284, "step": 4645 }, { "epoch": 0.7951394831422215, "grad_norm": 7.928152084350586, "learning_rate": 2.6468910439247004e-05, "loss": 0.6422, "step": 
4646 }, { "epoch": 0.7953106281020024, "grad_norm": 0.9403290152549744, "learning_rate": 2.6474614945807188e-05, "loss": 0.1969, "step": 4647 }, { "epoch": 0.7954817730617834, "grad_norm": 56.036251068115234, "learning_rate": 2.648031945236737e-05, "loss": 2.2416, "step": 4648 }, { "epoch": 0.7956529180215642, "grad_norm": 35.657833099365234, "learning_rate": 2.6486023958927554e-05, "loss": 6.1541, "step": 4649 }, { "epoch": 0.7958240629813452, "grad_norm": 14.264200210571289, "learning_rate": 2.6491728465487734e-05, "loss": 1.4948, "step": 4650 }, { "epoch": 0.7959952079411261, "grad_norm": 0.9517439603805542, "learning_rate": 2.6497432972047918e-05, "loss": 0.1973, "step": 4651 }, { "epoch": 0.7961663529009071, "grad_norm": 3.9652233123779297, "learning_rate": 2.65031374786081e-05, "loss": 0.3578, "step": 4652 }, { "epoch": 0.796337497860688, "grad_norm": 20.68568229675293, "learning_rate": 2.650884198516828e-05, "loss": 1.6687, "step": 4653 }, { "epoch": 0.796508642820469, "grad_norm": 36.3359375, "learning_rate": 2.6514546491728465e-05, "loss": 6.1452, "step": 4654 }, { "epoch": 0.7966797877802498, "grad_norm": 13.956791877746582, "learning_rate": 2.6520250998288648e-05, "loss": 0.8717, "step": 4655 }, { "epoch": 0.7968509327400308, "grad_norm": 30.4771671295166, "learning_rate": 2.652595550484883e-05, "loss": 3.0092, "step": 4656 }, { "epoch": 0.7970220776998117, "grad_norm": 6.4612226486206055, "learning_rate": 2.653166001140901e-05, "loss": 0.6398, "step": 4657 }, { "epoch": 0.7971932226595927, "grad_norm": 16.6127986907959, "learning_rate": 2.6537364517969195e-05, "loss": 1.8739, "step": 4658 }, { "epoch": 0.7973643676193736, "grad_norm": 21.27251625061035, "learning_rate": 2.6543069024529378e-05, "loss": 1.9702, "step": 4659 }, { "epoch": 0.7975355125791546, "grad_norm": 18.95979118347168, "learning_rate": 2.654877353108956e-05, "loss": 1.6295, "step": 4660 }, { "epoch": 0.7977066575389354, "grad_norm": 10.863266944885254, "learning_rate": 
2.6554478037649745e-05, "loss": 0.7761, "step": 4661 }, { "epoch": 0.7978778024987164, "grad_norm": 10.78805160522461, "learning_rate": 2.6560182544209928e-05, "loss": 0.9169, "step": 4662 }, { "epoch": 0.7980489474584973, "grad_norm": 21.447656631469727, "learning_rate": 2.656588705077011e-05, "loss": 2.3001, "step": 4663 }, { "epoch": 0.7982200924182783, "grad_norm": 7.096908092498779, "learning_rate": 2.657159155733029e-05, "loss": 0.6332, "step": 4664 }, { "epoch": 0.7983912373780592, "grad_norm": 11.815482139587402, "learning_rate": 2.6577296063890475e-05, "loss": 0.9556, "step": 4665 }, { "epoch": 0.7985623823378402, "grad_norm": 25.208463668823242, "learning_rate": 2.658300057045066e-05, "loss": 2.7888, "step": 4666 }, { "epoch": 0.798733527297621, "grad_norm": 28.86720848083496, "learning_rate": 2.658870507701084e-05, "loss": 1.907, "step": 4667 }, { "epoch": 0.798904672257402, "grad_norm": 6.2774858474731445, "learning_rate": 2.6594409583571022e-05, "loss": 0.4982, "step": 4668 }, { "epoch": 0.7990758172171829, "grad_norm": 31.023252487182617, "learning_rate": 2.6600114090131205e-05, "loss": 5.9759, "step": 4669 }, { "epoch": 0.7992469621769639, "grad_norm": 22.48405647277832, "learning_rate": 2.660581859669139e-05, "loss": 2.4245, "step": 4670 }, { "epoch": 0.7994181071367448, "grad_norm": 21.80152130126953, "learning_rate": 2.661152310325157e-05, "loss": 2.3967, "step": 4671 }, { "epoch": 0.7995892520965258, "grad_norm": 24.37519645690918, "learning_rate": 2.6617227609811752e-05, "loss": 2.6876, "step": 4672 }, { "epoch": 0.7997603970563066, "grad_norm": 24.82401466369629, "learning_rate": 2.6622932116371935e-05, "loss": 3.2294, "step": 4673 }, { "epoch": 0.7999315420160876, "grad_norm": 0.9125476479530334, "learning_rate": 2.6628636622932115e-05, "loss": 0.2049, "step": 4674 }, { "epoch": 0.8001026869758686, "grad_norm": 20.301301956176758, "learning_rate": 2.66343411294923e-05, "loss": 2.0656, "step": 4675 }, { "epoch": 0.8002738319356495, "grad_norm": 
17.966495513916016, "learning_rate": 2.6640045636052482e-05, "loss": 1.658, "step": 4676 }, { "epoch": 0.8004449768954305, "grad_norm": 0.8491156697273254, "learning_rate": 2.6645750142612665e-05, "loss": 0.2129, "step": 4677 }, { "epoch": 0.8006161218552114, "grad_norm": 21.60484504699707, "learning_rate": 2.6651454649172845e-05, "loss": 1.6655, "step": 4678 }, { "epoch": 0.8007872668149923, "grad_norm": 19.46196174621582, "learning_rate": 2.665715915573303e-05, "loss": 1.7872, "step": 4679 }, { "epoch": 0.8009584117747732, "grad_norm": 21.08289909362793, "learning_rate": 2.6662863662293212e-05, "loss": 2.101, "step": 4680 }, { "epoch": 0.8011295567345542, "grad_norm": 19.137561798095703, "learning_rate": 2.6668568168853392e-05, "loss": 1.9419, "step": 4681 }, { "epoch": 0.8013007016943351, "grad_norm": 22.642850875854492, "learning_rate": 2.667427267541358e-05, "loss": 2.2932, "step": 4682 }, { "epoch": 0.8014718466541161, "grad_norm": 31.17798614501953, "learning_rate": 2.6679977181973762e-05, "loss": 2.9005, "step": 4683 }, { "epoch": 0.801642991613897, "grad_norm": 21.248584747314453, "learning_rate": 2.6685681688533942e-05, "loss": 2.0125, "step": 4684 }, { "epoch": 0.801814136573678, "grad_norm": 2.3411998748779297, "learning_rate": 2.6691386195094126e-05, "loss": 0.2708, "step": 4685 }, { "epoch": 0.8019852815334588, "grad_norm": 1.263325810432434, "learning_rate": 2.669709070165431e-05, "loss": 0.2294, "step": 4686 }, { "epoch": 0.8021564264932398, "grad_norm": 24.95157814025879, "learning_rate": 2.6702795208214492e-05, "loss": 3.1625, "step": 4687 }, { "epoch": 0.8023275714530207, "grad_norm": 26.514177322387695, "learning_rate": 2.6708499714774672e-05, "loss": 3.3918, "step": 4688 }, { "epoch": 0.8024987164128017, "grad_norm": 12.605335235595703, "learning_rate": 2.6714204221334856e-05, "loss": 0.9508, "step": 4689 }, { "epoch": 0.8026698613725826, "grad_norm": 19.918142318725586, "learning_rate": 2.671990872789504e-05, "loss": 1.5214, "step": 4690 }, { 
"epoch": 0.8028410063323635, "grad_norm": 1.9992042779922485, "learning_rate": 2.672561323445522e-05, "loss": 0.2171, "step": 4691 }, { "epoch": 0.8030121512921444, "grad_norm": 9.957181930541992, "learning_rate": 2.6731317741015403e-05, "loss": 0.7101, "step": 4692 }, { "epoch": 0.8031832962519254, "grad_norm": 25.742799758911133, "learning_rate": 2.6737022247575586e-05, "loss": 2.705, "step": 4693 }, { "epoch": 0.8033544412117063, "grad_norm": 25.987510681152344, "learning_rate": 2.674272675413577e-05, "loss": 2.4936, "step": 4694 }, { "epoch": 0.8035255861714873, "grad_norm": 19.842357635498047, "learning_rate": 2.674843126069595e-05, "loss": 1.7686, "step": 4695 }, { "epoch": 0.8036967311312682, "grad_norm": 26.451980590820312, "learning_rate": 2.6754135767256133e-05, "loss": 2.3944, "step": 4696 }, { "epoch": 0.8038678760910491, "grad_norm": 15.866630554199219, "learning_rate": 2.6759840273816316e-05, "loss": 1.239, "step": 4697 }, { "epoch": 0.80403902105083, "grad_norm": 58.552215576171875, "learning_rate": 2.6765544780376496e-05, "loss": 1.9071, "step": 4698 }, { "epoch": 0.804210166010611, "grad_norm": 19.024993896484375, "learning_rate": 2.677124928693668e-05, "loss": 1.7101, "step": 4699 }, { "epoch": 0.8043813109703919, "grad_norm": 25.27145767211914, "learning_rate": 2.6776953793496863e-05, "loss": 2.543, "step": 4700 }, { "epoch": 0.8045524559301729, "grad_norm": 7.051549911499023, "learning_rate": 2.6782658300057046e-05, "loss": 0.5373, "step": 4701 }, { "epoch": 0.8047236008899538, "grad_norm": 24.07325553894043, "learning_rate": 2.6788362806617226e-05, "loss": 2.1983, "step": 4702 }, { "epoch": 0.8048947458497347, "grad_norm": 16.690013885498047, "learning_rate": 2.679406731317741e-05, "loss": 1.6365, "step": 4703 }, { "epoch": 0.8050658908095156, "grad_norm": 29.604305267333984, "learning_rate": 2.6799771819737593e-05, "loss": 1.4153, "step": 4704 }, { "epoch": 0.8052370357692966, "grad_norm": 10.602456092834473, "learning_rate": 
2.6805476326297776e-05, "loss": 0.8433, "step": 4705 }, { "epoch": 0.8054081807290775, "grad_norm": 18.099092483520508, "learning_rate": 2.681118083285796e-05, "loss": 1.6778, "step": 4706 }, { "epoch": 0.8055793256888585, "grad_norm": 26.6840763092041, "learning_rate": 2.6816885339418143e-05, "loss": 2.8567, "step": 4707 }, { "epoch": 0.8057504706486394, "grad_norm": 25.60426139831543, "learning_rate": 2.6822589845978326e-05, "loss": 2.7501, "step": 4708 }, { "epoch": 0.8059216156084203, "grad_norm": 23.098224639892578, "learning_rate": 2.6828294352538506e-05, "loss": 2.5179, "step": 4709 }, { "epoch": 0.8060927605682012, "grad_norm": 5.202851295471191, "learning_rate": 2.683399885909869e-05, "loss": 0.4799, "step": 4710 }, { "epoch": 0.8062639055279822, "grad_norm": 1.1651976108551025, "learning_rate": 2.6839703365658873e-05, "loss": 0.2147, "step": 4711 }, { "epoch": 0.8064350504877631, "grad_norm": 24.902393341064453, "learning_rate": 2.6845407872219053e-05, "loss": 2.4519, "step": 4712 }, { "epoch": 0.8066061954475441, "grad_norm": 12.363120079040527, "learning_rate": 2.6851112378779237e-05, "loss": 0.9881, "step": 4713 }, { "epoch": 0.806777340407325, "grad_norm": 5.912435054779053, "learning_rate": 2.685681688533942e-05, "loss": 0.5938, "step": 4714 }, { "epoch": 0.806948485367106, "grad_norm": 106.52516174316406, "learning_rate": 2.68625213918996e-05, "loss": 7.5182, "step": 4715 }, { "epoch": 0.8071196303268868, "grad_norm": 1.668204665184021, "learning_rate": 2.6868225898459783e-05, "loss": 0.2216, "step": 4716 }, { "epoch": 0.8072907752866678, "grad_norm": 16.875843048095703, "learning_rate": 2.6873930405019967e-05, "loss": 1.6175, "step": 4717 }, { "epoch": 0.8074619202464487, "grad_norm": 25.818157196044922, "learning_rate": 2.687963491158015e-05, "loss": 3.323, "step": 4718 }, { "epoch": 0.8076330652062297, "grad_norm": 15.579858779907227, "learning_rate": 2.688533941814033e-05, "loss": 1.5144, "step": 4719 }, { "epoch": 0.8078042101660106, 
"grad_norm": 16.2536563873291, "learning_rate": 2.6891043924700513e-05, "loss": 1.7982, "step": 4720 }, { "epoch": 0.8079753551257916, "grad_norm": 24.011157989501953, "learning_rate": 2.6896748431260697e-05, "loss": 1.8936, "step": 4721 }, { "epoch": 0.8081465000855724, "grad_norm": 2.2636773586273193, "learning_rate": 2.6902452937820877e-05, "loss": 0.2311, "step": 4722 }, { "epoch": 0.8083176450453534, "grad_norm": 19.2104434967041, "learning_rate": 2.690815744438106e-05, "loss": 2.1646, "step": 4723 }, { "epoch": 0.8084887900051343, "grad_norm": 18.140581130981445, "learning_rate": 2.6913861950941244e-05, "loss": 1.5111, "step": 4724 }, { "epoch": 0.8086599349649153, "grad_norm": 17.46432113647461, "learning_rate": 2.6919566457501427e-05, "loss": 1.6828, "step": 4725 }, { "epoch": 0.8088310799246963, "grad_norm": 9.412572860717773, "learning_rate": 2.6925270964061607e-05, "loss": 0.6319, "step": 4726 }, { "epoch": 0.8090022248844772, "grad_norm": 27.534698486328125, "learning_rate": 2.693097547062179e-05, "loss": 3.4404, "step": 4727 }, { "epoch": 0.8091733698442581, "grad_norm": 20.456804275512695, "learning_rate": 2.6936679977181977e-05, "loss": 1.8323, "step": 4728 }, { "epoch": 0.809344514804039, "grad_norm": 29.691913604736328, "learning_rate": 2.6942384483742157e-05, "loss": 2.6556, "step": 4729 }, { "epoch": 0.80951565976382, "grad_norm": 88.99212646484375, "learning_rate": 2.694808899030234e-05, "loss": 8.2099, "step": 4730 }, { "epoch": 0.8096868047236009, "grad_norm": 24.28471565246582, "learning_rate": 2.6953793496862524e-05, "loss": 2.0042, "step": 4731 }, { "epoch": 0.8098579496833819, "grad_norm": 7.255256175994873, "learning_rate": 2.6959498003422707e-05, "loss": 0.5848, "step": 4732 }, { "epoch": 0.8100290946431628, "grad_norm": 57.900230407714844, "learning_rate": 2.6965202509982887e-05, "loss": 1.7669, "step": 4733 }, { "epoch": 0.8102002396029437, "grad_norm": 23.751659393310547, "learning_rate": 2.697090701654307e-05, "loss": 2.1911, "step": 
4734 }, { "epoch": 0.8103713845627246, "grad_norm": 29.752071380615234, "learning_rate": 2.6976611523103254e-05, "loss": 2.9697, "step": 4735 }, { "epoch": 0.8105425295225056, "grad_norm": 24.04039192199707, "learning_rate": 2.6982316029663434e-05, "loss": 2.1467, "step": 4736 }, { "epoch": 0.8107136744822865, "grad_norm": 7.993510723114014, "learning_rate": 2.6988020536223617e-05, "loss": 0.5852, "step": 4737 }, { "epoch": 0.8108848194420675, "grad_norm": 20.55529022216797, "learning_rate": 2.69937250427838e-05, "loss": 1.6321, "step": 4738 }, { "epoch": 0.8110559644018484, "grad_norm": 10.042283058166504, "learning_rate": 2.6999429549343984e-05, "loss": 0.7064, "step": 4739 }, { "epoch": 0.8112271093616293, "grad_norm": 13.695612907409668, "learning_rate": 2.7005134055904164e-05, "loss": 1.1387, "step": 4740 }, { "epoch": 0.8113982543214102, "grad_norm": 21.712265014648438, "learning_rate": 2.7010838562464347e-05, "loss": 1.6631, "step": 4741 }, { "epoch": 0.8115693992811912, "grad_norm": 23.428848266601562, "learning_rate": 2.701654306902453e-05, "loss": 2.6433, "step": 4742 }, { "epoch": 0.8117405442409721, "grad_norm": 25.32332420349121, "learning_rate": 2.702224757558471e-05, "loss": 2.2108, "step": 4743 }, { "epoch": 0.8119116892007531, "grad_norm": 21.363313674926758, "learning_rate": 2.7027952082144894e-05, "loss": 2.5228, "step": 4744 }, { "epoch": 0.812082834160534, "grad_norm": 8.273282051086426, "learning_rate": 2.7033656588705078e-05, "loss": 0.5786, "step": 4745 }, { "epoch": 0.8122539791203149, "grad_norm": 14.859856605529785, "learning_rate": 2.703936109526526e-05, "loss": 1.1601, "step": 4746 }, { "epoch": 0.8124251240800958, "grad_norm": 22.67235565185547, "learning_rate": 2.704506560182544e-05, "loss": 2.1171, "step": 4747 }, { "epoch": 0.8125962690398768, "grad_norm": 20.38551139831543, "learning_rate": 2.7050770108385624e-05, "loss": 2.1415, "step": 4748 }, { "epoch": 0.8127674139996577, "grad_norm": 15.979461669921875, "learning_rate": 
2.7056474614945808e-05, "loss": 1.4184, "step": 4749 }, { "epoch": 0.8129385589594387, "grad_norm": 15.520298957824707, "learning_rate": 2.7062179121505988e-05, "loss": 1.4953, "step": 4750 }, { "epoch": 0.8131097039192196, "grad_norm": 25.01488494873047, "learning_rate": 2.7067883628066174e-05, "loss": 2.7059, "step": 4751 }, { "epoch": 0.8132808488790005, "grad_norm": 5.247277736663818, "learning_rate": 2.7073588134626358e-05, "loss": 0.514, "step": 4752 }, { "epoch": 0.8134519938387814, "grad_norm": 9.065279006958008, "learning_rate": 2.7079292641186538e-05, "loss": 0.8338, "step": 4753 }, { "epoch": 0.8136231387985624, "grad_norm": 20.89956283569336, "learning_rate": 2.708499714774672e-05, "loss": 1.9556, "step": 4754 }, { "epoch": 0.8137942837583433, "grad_norm": 22.0202579498291, "learning_rate": 2.7090701654306905e-05, "loss": 1.9706, "step": 4755 }, { "epoch": 0.8139654287181243, "grad_norm": 23.732559204101562, "learning_rate": 2.7096406160867088e-05, "loss": 2.6891, "step": 4756 }, { "epoch": 0.8141365736779052, "grad_norm": 33.41729736328125, "learning_rate": 2.7102110667427268e-05, "loss": 6.6742, "step": 4757 }, { "epoch": 0.8143077186376861, "grad_norm": 24.37078285217285, "learning_rate": 2.710781517398745e-05, "loss": 2.2191, "step": 4758 }, { "epoch": 0.814478863597467, "grad_norm": 1.2146947383880615, "learning_rate": 2.7113519680547635e-05, "loss": 0.2281, "step": 4759 }, { "epoch": 0.814650008557248, "grad_norm": 18.136754989624023, "learning_rate": 2.7119224187107815e-05, "loss": 1.4566, "step": 4760 }, { "epoch": 0.8148211535170289, "grad_norm": 51.885501861572266, "learning_rate": 2.7124928693667998e-05, "loss": 6.6106, "step": 4761 }, { "epoch": 0.8149922984768099, "grad_norm": 23.535844802856445, "learning_rate": 2.713063320022818e-05, "loss": 2.0082, "step": 4762 }, { "epoch": 0.8151634434365908, "grad_norm": 21.241649627685547, "learning_rate": 2.7136337706788365e-05, "loss": 2.3369, "step": 4763 }, { "epoch": 0.8153345883963717, 
"grad_norm": 18.623498916625977, "learning_rate": 2.7142042213348545e-05, "loss": 1.9244, "step": 4764 }, { "epoch": 0.8155057333561526, "grad_norm": 5.655921936035156, "learning_rate": 2.7147746719908728e-05, "loss": 0.5875, "step": 4765 }, { "epoch": 0.8156768783159336, "grad_norm": 21.945968627929688, "learning_rate": 2.715345122646891e-05, "loss": 2.1567, "step": 4766 }, { "epoch": 0.8158480232757145, "grad_norm": 51.72159957885742, "learning_rate": 2.715915573302909e-05, "loss": 6.8426, "step": 4767 }, { "epoch": 0.8160191682354955, "grad_norm": 21.90216636657715, "learning_rate": 2.7164860239589275e-05, "loss": 2.084, "step": 4768 }, { "epoch": 0.8161903131952764, "grad_norm": 11.635622024536133, "learning_rate": 2.7170564746149458e-05, "loss": 0.8471, "step": 4769 }, { "epoch": 0.8163614581550573, "grad_norm": 4.031811714172363, "learning_rate": 2.717626925270964e-05, "loss": 0.4889, "step": 4770 }, { "epoch": 0.8165326031148382, "grad_norm": 30.011260986328125, "learning_rate": 2.718197375926982e-05, "loss": 2.7194, "step": 4771 }, { "epoch": 0.8167037480746192, "grad_norm": 18.62017250061035, "learning_rate": 2.7187678265830005e-05, "loss": 1.7315, "step": 4772 }, { "epoch": 0.8168748930344001, "grad_norm": 27.6317138671875, "learning_rate": 2.719338277239019e-05, "loss": 3.5465, "step": 4773 }, { "epoch": 0.8170460379941811, "grad_norm": 25.174705505371094, "learning_rate": 2.7199087278950372e-05, "loss": 2.5848, "step": 4774 }, { "epoch": 0.817217182953962, "grad_norm": 25.61824607849121, "learning_rate": 2.7204791785510555e-05, "loss": 2.3883, "step": 4775 }, { "epoch": 0.8173883279137429, "grad_norm": 23.317171096801758, "learning_rate": 2.721049629207074e-05, "loss": 1.9388, "step": 4776 }, { "epoch": 0.8175594728735239, "grad_norm": 19.05599021911621, "learning_rate": 2.7216200798630922e-05, "loss": 1.9781, "step": 4777 }, { "epoch": 0.8177306178333048, "grad_norm": 3.6496589183807373, "learning_rate": 2.7221905305191102e-05, "loss": 0.3759, "step": 
4778 }, { "epoch": 0.8179017627930858, "grad_norm": 1.0984550714492798, "learning_rate": 2.7227609811751285e-05, "loss": 0.212, "step": 4779 }, { "epoch": 0.8180729077528667, "grad_norm": 6.329287052154541, "learning_rate": 2.723331431831147e-05, "loss": 0.6039, "step": 4780 }, { "epoch": 0.8182440527126477, "grad_norm": 2.0273239612579346, "learning_rate": 2.723901882487165e-05, "loss": 0.2497, "step": 4781 }, { "epoch": 0.8184151976724285, "grad_norm": 25.492948532104492, "learning_rate": 2.7244723331431832e-05, "loss": 2.387, "step": 4782 }, { "epoch": 0.8185863426322095, "grad_norm": 26.385509490966797, "learning_rate": 2.7250427837992015e-05, "loss": 2.7364, "step": 4783 }, { "epoch": 0.8187574875919904, "grad_norm": 3.4072940349578857, "learning_rate": 2.7256132344552195e-05, "loss": 0.3718, "step": 4784 }, { "epoch": 0.8189286325517714, "grad_norm": 14.639547348022461, "learning_rate": 2.726183685111238e-05, "loss": 1.3097, "step": 4785 }, { "epoch": 0.8190997775115523, "grad_norm": 22.575746536254883, "learning_rate": 2.7267541357672562e-05, "loss": 1.9179, "step": 4786 }, { "epoch": 0.8192709224713333, "grad_norm": 25.742076873779297, "learning_rate": 2.7273245864232746e-05, "loss": 3.313, "step": 4787 }, { "epoch": 0.8194420674311141, "grad_norm": 37.464515686035156, "learning_rate": 2.7278950370792926e-05, "loss": 6.2871, "step": 4788 }, { "epoch": 0.8196132123908951, "grad_norm": 18.994226455688477, "learning_rate": 2.728465487735311e-05, "loss": 1.8293, "step": 4789 }, { "epoch": 0.819784357350676, "grad_norm": 21.22791290283203, "learning_rate": 2.7290359383913292e-05, "loss": 2.0439, "step": 4790 }, { "epoch": 0.819955502310457, "grad_norm": 23.675783157348633, "learning_rate": 2.7296063890473472e-05, "loss": 2.7619, "step": 4791 }, { "epoch": 0.8201266472702379, "grad_norm": 20.29876708984375, "learning_rate": 2.7301768397033656e-05, "loss": 1.6941, "step": 4792 }, { "epoch": 0.8202977922300189, "grad_norm": 18.569841384887695, "learning_rate": 
2.730747290359384e-05, "loss": 1.9272, "step": 4793 }, { "epoch": 0.8204689371897997, "grad_norm": 45.40720748901367, "learning_rate": 2.7313177410154022e-05, "loss": 2.4204, "step": 4794 }, { "epoch": 0.8206400821495807, "grad_norm": 20.549243927001953, "learning_rate": 2.7318881916714202e-05, "loss": 2.1771, "step": 4795 }, { "epoch": 0.8208112271093616, "grad_norm": 17.86515235900879, "learning_rate": 2.7324586423274386e-05, "loss": 1.666, "step": 4796 }, { "epoch": 0.8209823720691426, "grad_norm": 20.349185943603516, "learning_rate": 2.7330290929834573e-05, "loss": 1.7841, "step": 4797 }, { "epoch": 0.8211535170289235, "grad_norm": 20.81956672668457, "learning_rate": 2.7335995436394753e-05, "loss": 1.9564, "step": 4798 }, { "epoch": 0.8213246619887045, "grad_norm": 23.731029510498047, "learning_rate": 2.7341699942954936e-05, "loss": 1.8663, "step": 4799 }, { "epoch": 0.8214958069484853, "grad_norm": 20.08209991455078, "learning_rate": 2.734740444951512e-05, "loss": 1.9046, "step": 4800 }, { "epoch": 0.8216669519082663, "grad_norm": 153.8986053466797, "learning_rate": 2.7353108956075303e-05, "loss": 8.5195, "step": 4801 }, { "epoch": 0.8218380968680472, "grad_norm": 21.99418067932129, "learning_rate": 2.7358813462635483e-05, "loss": 2.2924, "step": 4802 }, { "epoch": 0.8220092418278282, "grad_norm": 20.278175354003906, "learning_rate": 2.7364517969195666e-05, "loss": 2.2382, "step": 4803 }, { "epoch": 0.8221803867876091, "grad_norm": 0.9226766228675842, "learning_rate": 2.737022247575585e-05, "loss": 0.2091, "step": 4804 }, { "epoch": 0.8223515317473901, "grad_norm": 25.265033721923828, "learning_rate": 2.737592698231603e-05, "loss": 2.7456, "step": 4805 }, { "epoch": 0.8225226767071709, "grad_norm": 17.61090660095215, "learning_rate": 2.7381631488876213e-05, "loss": 1.7355, "step": 4806 }, { "epoch": 0.8226938216669519, "grad_norm": 27.21466827392578, "learning_rate": 2.7387335995436396e-05, "loss": 2.3528, "step": 4807 }, { "epoch": 0.8228649666267328, 
"grad_norm": 1.4176955223083496, "learning_rate": 2.739304050199658e-05, "loss": 0.2325, "step": 4808 }, { "epoch": 0.8230361115865138, "grad_norm": 26.031166076660156, "learning_rate": 2.739874500855676e-05, "loss": 2.5746, "step": 4809 }, { "epoch": 0.8232072565462947, "grad_norm": 23.706703186035156, "learning_rate": 2.7404449515116943e-05, "loss": 2.6316, "step": 4810 }, { "epoch": 0.8233784015060757, "grad_norm": 2.113330841064453, "learning_rate": 2.7410154021677126e-05, "loss": 0.3267, "step": 4811 }, { "epoch": 0.8235495464658565, "grad_norm": 13.31404972076416, "learning_rate": 2.7415858528237306e-05, "loss": 1.3433, "step": 4812 }, { "epoch": 0.8237206914256375, "grad_norm": 22.22062110900879, "learning_rate": 2.742156303479749e-05, "loss": 2.5822, "step": 4813 }, { "epoch": 0.8238918363854184, "grad_norm": 17.15830421447754, "learning_rate": 2.7427267541357673e-05, "loss": 1.4505, "step": 4814 }, { "epoch": 0.8240629813451994, "grad_norm": 25.40969467163086, "learning_rate": 2.7432972047917853e-05, "loss": 1.8021, "step": 4815 }, { "epoch": 0.8242341263049803, "grad_norm": 9.526328086853027, "learning_rate": 2.7438676554478036e-05, "loss": 1.0048, "step": 4816 }, { "epoch": 0.8244052712647613, "grad_norm": 16.948484420776367, "learning_rate": 2.744438106103822e-05, "loss": 1.4061, "step": 4817 }, { "epoch": 0.8245764162245421, "grad_norm": 1.4328776597976685, "learning_rate": 2.7450085567598403e-05, "loss": 0.2117, "step": 4818 }, { "epoch": 0.8247475611843231, "grad_norm": 18.04222869873047, "learning_rate": 2.7455790074158583e-05, "loss": 1.676, "step": 4819 }, { "epoch": 0.824918706144104, "grad_norm": 21.746219635009766, "learning_rate": 2.746149458071877e-05, "loss": 2.1541, "step": 4820 }, { "epoch": 0.825089851103885, "grad_norm": 28.491777420043945, "learning_rate": 2.7467199087278953e-05, "loss": 4.0988, "step": 4821 }, { "epoch": 0.8252609960636659, "grad_norm": 4.467132091522217, "learning_rate": 2.7472903593839133e-05, "loss": 0.4074, "step": 
4822 }, { "epoch": 0.8254321410234469, "grad_norm": 2.841317892074585, "learning_rate": 2.7478608100399317e-05, "loss": 0.2482, "step": 4823 }, { "epoch": 0.8256032859832277, "grad_norm": 25.000173568725586, "learning_rate": 2.74843126069595e-05, "loss": 2.6425, "step": 4824 }, { "epoch": 0.8257744309430087, "grad_norm": 22.400287628173828, "learning_rate": 2.7490017113519683e-05, "loss": 2.0586, "step": 4825 }, { "epoch": 0.8259455759027896, "grad_norm": 0.8643401265144348, "learning_rate": 2.7495721620079863e-05, "loss": 0.1913, "step": 4826 }, { "epoch": 0.8261167208625706, "grad_norm": 52.71999740600586, "learning_rate": 2.7501426126640047e-05, "loss": 1.9317, "step": 4827 }, { "epoch": 0.8262878658223516, "grad_norm": 16.348684310913086, "learning_rate": 2.750713063320023e-05, "loss": 1.3432, "step": 4828 }, { "epoch": 0.8264590107821325, "grad_norm": 15.317984580993652, "learning_rate": 2.751283513976041e-05, "loss": 1.2614, "step": 4829 }, { "epoch": 0.8266301557419135, "grad_norm": 5.9997406005859375, "learning_rate": 2.7518539646320594e-05, "loss": 0.5047, "step": 4830 }, { "epoch": 0.8268013007016943, "grad_norm": 21.828754425048828, "learning_rate": 2.7524244152880777e-05, "loss": 2.1042, "step": 4831 }, { "epoch": 0.8269724456614753, "grad_norm": 27.086246490478516, "learning_rate": 2.752994865944096e-05, "loss": 2.9267, "step": 4832 }, { "epoch": 0.8271435906212562, "grad_norm": 22.872150421142578, "learning_rate": 2.753565316600114e-05, "loss": 2.6579, "step": 4833 }, { "epoch": 0.8273147355810372, "grad_norm": 13.048178672790527, "learning_rate": 2.7541357672561324e-05, "loss": 1.2089, "step": 4834 }, { "epoch": 0.8274858805408181, "grad_norm": 8.70570182800293, "learning_rate": 2.7547062179121507e-05, "loss": 0.6508, "step": 4835 }, { "epoch": 0.827657025500599, "grad_norm": 6.766833782196045, "learning_rate": 2.7552766685681687e-05, "loss": 0.6133, "step": 4836 }, { "epoch": 0.8278281704603799, "grad_norm": 5.038801193237305, "learning_rate": 
2.755847119224187e-05, "loss": 0.4408, "step": 4837 }, { "epoch": 0.8279993154201609, "grad_norm": 17.220415115356445, "learning_rate": 2.7564175698802054e-05, "loss": 1.5166, "step": 4838 }, { "epoch": 0.8281704603799418, "grad_norm": 4.953532695770264, "learning_rate": 2.7569880205362237e-05, "loss": 0.467, "step": 4839 }, { "epoch": 0.8283416053397228, "grad_norm": 1.3376152515411377, "learning_rate": 2.7575584711922417e-05, "loss": 0.2116, "step": 4840 }, { "epoch": 0.8285127502995037, "grad_norm": 12.627934455871582, "learning_rate": 2.75812892184826e-05, "loss": 0.7787, "step": 4841 }, { "epoch": 0.8286838952592847, "grad_norm": 24.3588809967041, "learning_rate": 2.7586993725042787e-05, "loss": 2.7193, "step": 4842 }, { "epoch": 0.8288550402190655, "grad_norm": 11.016646385192871, "learning_rate": 2.7592698231602967e-05, "loss": 0.5866, "step": 4843 }, { "epoch": 0.8290261851788465, "grad_norm": 29.477998733520508, "learning_rate": 2.759840273816315e-05, "loss": 3.4923, "step": 4844 }, { "epoch": 0.8291973301386274, "grad_norm": 44.269596099853516, "learning_rate": 2.7604107244723334e-05, "loss": 1.683, "step": 4845 }, { "epoch": 0.8293684750984084, "grad_norm": 1.1761341094970703, "learning_rate": 2.7609811751283514e-05, "loss": 0.2143, "step": 4846 }, { "epoch": 0.8295396200581893, "grad_norm": 0.9938428401947021, "learning_rate": 2.7615516257843697e-05, "loss": 0.1943, "step": 4847 }, { "epoch": 0.8297107650179703, "grad_norm": 24.60161590576172, "learning_rate": 2.762122076440388e-05, "loss": 1.7267, "step": 4848 }, { "epoch": 0.8298819099777511, "grad_norm": 24.709163665771484, "learning_rate": 2.7626925270964064e-05, "loss": 2.423, "step": 4849 }, { "epoch": 0.8300530549375321, "grad_norm": 7.876855850219727, "learning_rate": 2.7632629777524244e-05, "loss": 0.9071, "step": 4850 }, { "epoch": 0.830224199897313, "grad_norm": 6.107041358947754, "learning_rate": 2.7638334284084428e-05, "loss": 0.4236, "step": 4851 }, { "epoch": 0.830395344857094, 
"grad_norm": 2.59680438041687, "learning_rate": 2.764403879064461e-05, "loss": 0.2292, "step": 4852 }, { "epoch": 0.8305664898168749, "grad_norm": 17.364612579345703, "learning_rate": 2.764974329720479e-05, "loss": 1.3699, "step": 4853 }, { "epoch": 0.8307376347766559, "grad_norm": 19.087657928466797, "learning_rate": 2.7655447803764974e-05, "loss": 1.1447, "step": 4854 }, { "epoch": 0.8309087797364367, "grad_norm": 0.8207781910896301, "learning_rate": 2.7661152310325158e-05, "loss": 0.1838, "step": 4855 }, { "epoch": 0.8310799246962177, "grad_norm": 5.4272894859313965, "learning_rate": 2.766685681688534e-05, "loss": 0.5259, "step": 4856 }, { "epoch": 0.8312510696559986, "grad_norm": 21.006179809570312, "learning_rate": 2.767256132344552e-05, "loss": 1.4462, "step": 4857 }, { "epoch": 0.8314222146157796, "grad_norm": 27.480995178222656, "learning_rate": 2.7678265830005704e-05, "loss": 2.9023, "step": 4858 }, { "epoch": 0.8315933595755605, "grad_norm": 25.569726943969727, "learning_rate": 2.7683970336565888e-05, "loss": 2.3783, "step": 4859 }, { "epoch": 0.8317645045353415, "grad_norm": 21.275972366333008, "learning_rate": 2.7689674843126068e-05, "loss": 2.207, "step": 4860 }, { "epoch": 0.8319356494951223, "grad_norm": 18.127140045166016, "learning_rate": 2.769537934968625e-05, "loss": 1.7016, "step": 4861 }, { "epoch": 0.8321067944549033, "grad_norm": 8.969490051269531, "learning_rate": 2.7701083856246435e-05, "loss": 0.7251, "step": 4862 }, { "epoch": 0.8322779394146842, "grad_norm": 21.6286678314209, "learning_rate": 2.7706788362806618e-05, "loss": 2.1559, "step": 4863 }, { "epoch": 0.8324490843744652, "grad_norm": 22.46690559387207, "learning_rate": 2.7712492869366798e-05, "loss": 2.5798, "step": 4864 }, { "epoch": 0.8326202293342461, "grad_norm": 24.45953941345215, "learning_rate": 2.7718197375926985e-05, "loss": 2.7626, "step": 4865 }, { "epoch": 0.832791374294027, "grad_norm": 23.767484664916992, "learning_rate": 2.7723901882487168e-05, "loss": 2.5827, 
"step": 4866 }, { "epoch": 0.8329625192538079, "grad_norm": 12.12441349029541, "learning_rate": 2.7729606389047348e-05, "loss": 1.1174, "step": 4867 }, { "epoch": 0.8331336642135889, "grad_norm": 3.884326934814453, "learning_rate": 2.773531089560753e-05, "loss": 0.4211, "step": 4868 }, { "epoch": 0.8333048091733698, "grad_norm": 17.074594497680664, "learning_rate": 2.7741015402167715e-05, "loss": 1.4077, "step": 4869 }, { "epoch": 0.8334759541331508, "grad_norm": 13.826687812805176, "learning_rate": 2.7746719908727898e-05, "loss": 1.2421, "step": 4870 }, { "epoch": 0.8336470990929317, "grad_norm": 19.932655334472656, "learning_rate": 2.7752424415288078e-05, "loss": 1.6989, "step": 4871 }, { "epoch": 0.8338182440527127, "grad_norm": 10.384773254394531, "learning_rate": 2.775812892184826e-05, "loss": 0.9357, "step": 4872 }, { "epoch": 0.8339893890124935, "grad_norm": 21.14198112487793, "learning_rate": 2.7763833428408445e-05, "loss": 1.8342, "step": 4873 }, { "epoch": 0.8341605339722745, "grad_norm": 1.3305509090423584, "learning_rate": 2.7769537934968625e-05, "loss": 0.2297, "step": 4874 }, { "epoch": 0.8343316789320554, "grad_norm": 19.950328826904297, "learning_rate": 2.777524244152881e-05, "loss": 1.6679, "step": 4875 }, { "epoch": 0.8345028238918364, "grad_norm": 0.8316130042076111, "learning_rate": 2.7780946948088992e-05, "loss": 0.1935, "step": 4876 }, { "epoch": 0.8346739688516173, "grad_norm": 19.933990478515625, "learning_rate": 2.7786651454649175e-05, "loss": 1.7269, "step": 4877 }, { "epoch": 0.8348451138113983, "grad_norm": 26.975513458251953, "learning_rate": 2.7792355961209355e-05, "loss": 1.4489, "step": 4878 }, { "epoch": 0.8350162587711792, "grad_norm": 31.234407424926758, "learning_rate": 2.779806046776954e-05, "loss": 5.6514, "step": 4879 }, { "epoch": 0.8351874037309601, "grad_norm": 26.34868049621582, "learning_rate": 2.7803764974329722e-05, "loss": 2.7207, "step": 4880 }, { "epoch": 0.8353585486907411, "grad_norm": 22.602392196655273, 
"learning_rate": 2.7809469480889902e-05, "loss": 2.0605, "step": 4881 }, { "epoch": 0.835529693650522, "grad_norm": 2.0254154205322266, "learning_rate": 2.7815173987450085e-05, "loss": 0.2238, "step": 4882 }, { "epoch": 0.835700838610303, "grad_norm": 19.05372428894043, "learning_rate": 2.782087849401027e-05, "loss": 1.8702, "step": 4883 }, { "epoch": 0.8358719835700839, "grad_norm": 17.850038528442383, "learning_rate": 2.782658300057045e-05, "loss": 1.7083, "step": 4884 }, { "epoch": 0.8360431285298648, "grad_norm": 1.1266815662384033, "learning_rate": 2.7832287507130632e-05, "loss": 0.1847, "step": 4885 }, { "epoch": 0.8362142734896457, "grad_norm": 145.55615234375, "learning_rate": 2.7837992013690815e-05, "loss": 8.2946, "step": 4886 }, { "epoch": 0.8363854184494267, "grad_norm": 19.88404083251953, "learning_rate": 2.7843696520251e-05, "loss": 2.1403, "step": 4887 }, { "epoch": 0.8365565634092076, "grad_norm": 18.304166793823242, "learning_rate": 2.7849401026811182e-05, "loss": 1.3405, "step": 4888 }, { "epoch": 0.8367277083689886, "grad_norm": 31.058061599731445, "learning_rate": 2.7855105533371365e-05, "loss": 3.5569, "step": 4889 }, { "epoch": 0.8368988533287695, "grad_norm": 3.8306217193603516, "learning_rate": 2.786081003993155e-05, "loss": 0.3999, "step": 4890 }, { "epoch": 0.8370699982885504, "grad_norm": 21.254697799682617, "learning_rate": 2.786651454649173e-05, "loss": 2.2565, "step": 4891 }, { "epoch": 0.8372411432483313, "grad_norm": 24.048858642578125, "learning_rate": 2.7872219053051912e-05, "loss": 2.2869, "step": 4892 }, { "epoch": 0.8374122882081123, "grad_norm": 32.828433990478516, "learning_rate": 2.7877923559612096e-05, "loss": 6.1064, "step": 4893 }, { "epoch": 0.8375834331678932, "grad_norm": 12.22719669342041, "learning_rate": 2.788362806617228e-05, "loss": 1.727, "step": 4894 }, { "epoch": 0.8377545781276742, "grad_norm": 78.69731903076172, "learning_rate": 2.788933257273246e-05, "loss": 7.6272, "step": 4895 }, { "epoch": 
0.837925723087455, "grad_norm": 27.703277587890625, "learning_rate": 2.7895037079292642e-05, "loss": 5.7777, "step": 4896 }, { "epoch": 0.838096868047236, "grad_norm": 31.29784393310547, "learning_rate": 2.7900741585852826e-05, "loss": 4.6196, "step": 4897 }, { "epoch": 0.8382680130070169, "grad_norm": 18.404560089111328, "learning_rate": 2.7906446092413006e-05, "loss": 1.4676, "step": 4898 }, { "epoch": 0.8384391579667979, "grad_norm": 14.947914123535156, "learning_rate": 2.791215059897319e-05, "loss": 1.4166, "step": 4899 }, { "epoch": 0.8386103029265788, "grad_norm": 13.326464653015137, "learning_rate": 2.7917855105533372e-05, "loss": 0.923, "step": 4900 }, { "epoch": 0.8387814478863598, "grad_norm": 19.72146987915039, "learning_rate": 2.7923559612093556e-05, "loss": 1.8728, "step": 4901 }, { "epoch": 0.8389525928461407, "grad_norm": 23.139135360717773, "learning_rate": 2.7929264118653736e-05, "loss": 2.6928, "step": 4902 }, { "epoch": 0.8391237378059216, "grad_norm": 24.41969871520996, "learning_rate": 2.793496862521392e-05, "loss": 2.5487, "step": 4903 }, { "epoch": 0.8392948827657025, "grad_norm": 18.01527214050293, "learning_rate": 2.7940673131774103e-05, "loss": 1.7845, "step": 4904 }, { "epoch": 0.8394660277254835, "grad_norm": 33.79884719848633, "learning_rate": 2.7946377638334283e-05, "loss": 2.9576, "step": 4905 }, { "epoch": 0.8396371726852644, "grad_norm": 20.432056427001953, "learning_rate": 2.7952082144894466e-05, "loss": 2.1053, "step": 4906 }, { "epoch": 0.8398083176450454, "grad_norm": 53.79264831542969, "learning_rate": 2.795778665145465e-05, "loss": 2.1853, "step": 4907 }, { "epoch": 0.8399794626048263, "grad_norm": 30.717485427856445, "learning_rate": 2.7963491158014833e-05, "loss": 5.9112, "step": 4908 }, { "epoch": 0.8401506075646072, "grad_norm": 24.705486297607422, "learning_rate": 2.7969195664575013e-05, "loss": 3.0099, "step": 4909 }, { "epoch": 0.8403217525243881, "grad_norm": 14.869185447692871, "learning_rate": 2.7974900171135196e-05, 
"loss": 1.0484, "step": 4910 }, { "epoch": 0.8404928974841691, "grad_norm": 5.514411449432373, "learning_rate": 2.7980604677695383e-05, "loss": 0.3749, "step": 4911 }, { "epoch": 0.84066404244395, "grad_norm": 25.00140380859375, "learning_rate": 2.7986309184255563e-05, "loss": 2.7309, "step": 4912 }, { "epoch": 0.840835187403731, "grad_norm": 16.862314224243164, "learning_rate": 2.7992013690815746e-05, "loss": 1.5645, "step": 4913 }, { "epoch": 0.8410063323635119, "grad_norm": 16.41586685180664, "learning_rate": 2.799771819737593e-05, "loss": 1.3737, "step": 4914 }, { "epoch": 0.8411774773232928, "grad_norm": 4.305161476135254, "learning_rate": 2.800342270393611e-05, "loss": 0.4024, "step": 4915 }, { "epoch": 0.8413486222830737, "grad_norm": 9.081815719604492, "learning_rate": 2.8009127210496293e-05, "loss": 0.5857, "step": 4916 }, { "epoch": 0.8415197672428547, "grad_norm": 23.440717697143555, "learning_rate": 2.8014831717056476e-05, "loss": 2.1649, "step": 4917 }, { "epoch": 0.8416909122026356, "grad_norm": 9.641824722290039, "learning_rate": 2.802053622361666e-05, "loss": 0.5756, "step": 4918 }, { "epoch": 0.8418620571624166, "grad_norm": 22.86307716369629, "learning_rate": 2.802624073017684e-05, "loss": 2.4724, "step": 4919 }, { "epoch": 0.8420332021221975, "grad_norm": 20.317447662353516, "learning_rate": 2.8031945236737023e-05, "loss": 2.1701, "step": 4920 }, { "epoch": 0.8422043470819784, "grad_norm": 5.848948955535889, "learning_rate": 2.8037649743297206e-05, "loss": 0.5332, "step": 4921 }, { "epoch": 0.8423754920417593, "grad_norm": 29.350927352905273, "learning_rate": 2.8043354249857386e-05, "loss": 1.6343, "step": 4922 }, { "epoch": 0.8425466370015403, "grad_norm": 23.29707145690918, "learning_rate": 2.804905875641757e-05, "loss": 2.0175, "step": 4923 }, { "epoch": 0.8427177819613212, "grad_norm": 21.697025299072266, "learning_rate": 2.8054763262977753e-05, "loss": 2.3221, "step": 4924 }, { "epoch": 0.8428889269211022, "grad_norm": 24.808443069458008, 
"learning_rate": 2.8060467769537937e-05, "loss": 2.1885, "step": 4925 }, { "epoch": 0.843060071880883, "grad_norm": 18.006011962890625, "learning_rate": 2.8066172276098117e-05, "loss": 1.2711, "step": 4926 }, { "epoch": 0.843231216840664, "grad_norm": 18.567319869995117, "learning_rate": 2.80718767826583e-05, "loss": 1.6861, "step": 4927 }, { "epoch": 0.8434023618004449, "grad_norm": 19.155935287475586, "learning_rate": 2.8077581289218483e-05, "loss": 1.5559, "step": 4928 }, { "epoch": 0.8435735067602259, "grad_norm": 14.273578643798828, "learning_rate": 2.8083285795778663e-05, "loss": 1.5563, "step": 4929 }, { "epoch": 0.8437446517200069, "grad_norm": 23.349454879760742, "learning_rate": 2.8088990302338847e-05, "loss": 2.4812, "step": 4930 }, { "epoch": 0.8439157966797878, "grad_norm": 1.6741917133331299, "learning_rate": 2.809469480889903e-05, "loss": 0.2197, "step": 4931 }, { "epoch": 0.8440869416395688, "grad_norm": 24.860034942626953, "learning_rate": 2.8100399315459213e-05, "loss": 2.4578, "step": 4932 }, { "epoch": 0.8442580865993496, "grad_norm": 14.901060104370117, "learning_rate": 2.8106103822019393e-05, "loss": 1.5235, "step": 4933 }, { "epoch": 0.8444292315591306, "grad_norm": 9.101666450500488, "learning_rate": 2.811180832857958e-05, "loss": 0.5367, "step": 4934 }, { "epoch": 0.8446003765189115, "grad_norm": 25.940553665161133, "learning_rate": 2.8117512835139764e-05, "loss": 2.8182, "step": 4935 }, { "epoch": 0.8447715214786925, "grad_norm": 21.500452041625977, "learning_rate": 2.8123217341699944e-05, "loss": 2.2284, "step": 4936 }, { "epoch": 0.8449426664384734, "grad_norm": 9.465425491333008, "learning_rate": 2.8128921848260127e-05, "loss": 0.8752, "step": 4937 }, { "epoch": 0.8451138113982544, "grad_norm": 12.658205032348633, "learning_rate": 2.813462635482031e-05, "loss": 0.7855, "step": 4938 }, { "epoch": 0.8452849563580352, "grad_norm": 3.6830475330352783, "learning_rate": 2.8140330861380494e-05, "loss": 0.3844, "step": 4939 }, { "epoch": 
0.8454561013178162, "grad_norm": 79.1318130493164, "learning_rate": 2.8146035367940674e-05, "loss": 7.1857, "step": 4940 }, { "epoch": 0.8456272462775971, "grad_norm": 18.231271743774414, "learning_rate": 2.8151739874500857e-05, "loss": 1.4963, "step": 4941 }, { "epoch": 0.8457983912373781, "grad_norm": 30.293922424316406, "learning_rate": 2.815744438106104e-05, "loss": 3.6746, "step": 4942 }, { "epoch": 0.845969536197159, "grad_norm": 23.53990364074707, "learning_rate": 2.816314888762122e-05, "loss": 2.1727, "step": 4943 }, { "epoch": 0.84614068115694, "grad_norm": 45.553245544433594, "learning_rate": 2.8168853394181404e-05, "loss": 6.6332, "step": 4944 }, { "epoch": 0.8463118261167208, "grad_norm": 23.199127197265625, "learning_rate": 2.8174557900741587e-05, "loss": 1.8783, "step": 4945 }, { "epoch": 0.8464829710765018, "grad_norm": 29.977962493896484, "learning_rate": 2.818026240730177e-05, "loss": 3.5376, "step": 4946 }, { "epoch": 0.8466541160362827, "grad_norm": 13.712217330932617, "learning_rate": 2.818596691386195e-05, "loss": 1.2064, "step": 4947 }, { "epoch": 0.8468252609960637, "grad_norm": 21.567646026611328, "learning_rate": 2.8191671420422134e-05, "loss": 1.7774, "step": 4948 }, { "epoch": 0.8469964059558446, "grad_norm": 20.53908348083496, "learning_rate": 2.8197375926982317e-05, "loss": 1.5944, "step": 4949 }, { "epoch": 0.8471675509156256, "grad_norm": 57.32319641113281, "learning_rate": 2.8203080433542497e-05, "loss": 1.6839, "step": 4950 }, { "epoch": 0.8473386958754064, "grad_norm": 19.78857421875, "learning_rate": 2.820878494010268e-05, "loss": 1.645, "step": 4951 }, { "epoch": 0.8475098408351874, "grad_norm": 18.463855743408203, "learning_rate": 2.8214489446662864e-05, "loss": 1.8035, "step": 4952 }, { "epoch": 0.8476809857949683, "grad_norm": 3.614971160888672, "learning_rate": 2.8220193953223044e-05, "loss": 0.3524, "step": 4953 }, { "epoch": 0.8478521307547493, "grad_norm": 44.223514556884766, "learning_rate": 2.8225898459783227e-05, 
"loss": 1.5668, "step": 4954 }, { "epoch": 0.8480232757145302, "grad_norm": 28.211584091186523, "learning_rate": 2.823160296634341e-05, "loss": 2.7987, "step": 4955 }, { "epoch": 0.8481944206743112, "grad_norm": 6.280284881591797, "learning_rate": 2.8237307472903594e-05, "loss": 0.5097, "step": 4956 }, { "epoch": 0.848365565634092, "grad_norm": 27.26231575012207, "learning_rate": 2.8243011979463778e-05, "loss": 3.0276, "step": 4957 }, { "epoch": 0.848536710593873, "grad_norm": 23.75172996520996, "learning_rate": 2.824871648602396e-05, "loss": 2.684, "step": 4958 }, { "epoch": 0.8487078555536539, "grad_norm": 25.655179977416992, "learning_rate": 2.8254420992584144e-05, "loss": 2.2778, "step": 4959 }, { "epoch": 0.8488790005134349, "grad_norm": 1.2349088191986084, "learning_rate": 2.8260125499144324e-05, "loss": 0.2043, "step": 4960 }, { "epoch": 0.8490501454732158, "grad_norm": 29.569950103759766, "learning_rate": 2.8265830005704508e-05, "loss": 5.8835, "step": 4961 }, { "epoch": 0.8492212904329968, "grad_norm": 21.26607322692871, "learning_rate": 2.827153451226469e-05, "loss": 1.8996, "step": 4962 }, { "epoch": 0.8493924353927776, "grad_norm": 18.51241111755371, "learning_rate": 2.8277239018824875e-05, "loss": 1.6964, "step": 4963 }, { "epoch": 0.8495635803525586, "grad_norm": 129.38690185546875, "learning_rate": 2.8282943525385055e-05, "loss": 8.2196, "step": 4964 }, { "epoch": 0.8497347253123395, "grad_norm": 14.641566276550293, "learning_rate": 2.8288648031945238e-05, "loss": 1.8633, "step": 4965 }, { "epoch": 0.8499058702721205, "grad_norm": 31.69512176513672, "learning_rate": 2.829435253850542e-05, "loss": 3.3676, "step": 4966 }, { "epoch": 0.8500770152319014, "grad_norm": 1.0442372560501099, "learning_rate": 2.83000570450656e-05, "loss": 0.201, "step": 4967 }, { "epoch": 0.8502481601916824, "grad_norm": 20.171241760253906, "learning_rate": 2.8305761551625785e-05, "loss": 1.8819, "step": 4968 }, { "epoch": 0.8504193051514632, "grad_norm": 7.881687164306641, 
"learning_rate": 2.8311466058185968e-05, "loss": 0.5707, "step": 4969 }, { "epoch": 0.8505904501112442, "grad_norm": 5.979015350341797, "learning_rate": 2.831717056474615e-05, "loss": 0.5286, "step": 4970 }, { "epoch": 0.8507615950710251, "grad_norm": 0.8935146331787109, "learning_rate": 2.832287507130633e-05, "loss": 0.1914, "step": 4971 }, { "epoch": 0.8509327400308061, "grad_norm": 28.41226577758789, "learning_rate": 2.8328579577866515e-05, "loss": 3.873, "step": 4972 }, { "epoch": 0.851103884990587, "grad_norm": 23.3878116607666, "learning_rate": 2.8334284084426698e-05, "loss": 2.738, "step": 4973 }, { "epoch": 0.851275029950368, "grad_norm": 1.1107969284057617, "learning_rate": 2.8339988590986878e-05, "loss": 0.2044, "step": 4974 }, { "epoch": 0.8514461749101488, "grad_norm": 12.226590156555176, "learning_rate": 2.834569309754706e-05, "loss": 0.729, "step": 4975 }, { "epoch": 0.8516173198699298, "grad_norm": 24.799179077148438, "learning_rate": 2.8351397604107245e-05, "loss": 1.7961, "step": 4976 }, { "epoch": 0.8517884648297107, "grad_norm": 22.254865646362305, "learning_rate": 2.8357102110667428e-05, "loss": 2.345, "step": 4977 }, { "epoch": 0.8519596097894917, "grad_norm": 22.752105712890625, "learning_rate": 2.8362806617227608e-05, "loss": 2.2281, "step": 4978 }, { "epoch": 0.8521307547492726, "grad_norm": 2.3499200344085693, "learning_rate": 2.836851112378779e-05, "loss": 0.2585, "step": 4979 }, { "epoch": 0.8523018997090536, "grad_norm": 24.360803604125977, "learning_rate": 2.837421563034798e-05, "loss": 2.2443, "step": 4980 }, { "epoch": 0.8524730446688346, "grad_norm": 22.948808670043945, "learning_rate": 2.837992013690816e-05, "loss": 2.2422, "step": 4981 }, { "epoch": 0.8526441896286154, "grad_norm": 7.84674072265625, "learning_rate": 2.8385624643468342e-05, "loss": 0.6825, "step": 4982 }, { "epoch": 0.8528153345883964, "grad_norm": 11.925847053527832, "learning_rate": 2.8391329150028525e-05, "loss": 0.9807, "step": 4983 }, { "epoch": 
0.8529864795481773, "grad_norm": 12.705161094665527, "learning_rate": 2.8397033656588705e-05, "loss": 1.0903, "step": 4984 }, { "epoch": 0.8531576245079583, "grad_norm": 22.5230770111084, "learning_rate": 2.840273816314889e-05, "loss": 2.2401, "step": 4985 }, { "epoch": 0.8533287694677392, "grad_norm": 20.577150344848633, "learning_rate": 2.8408442669709072e-05, "loss": 1.4695, "step": 4986 }, { "epoch": 0.8534999144275202, "grad_norm": 8.088134765625, "learning_rate": 2.8414147176269255e-05, "loss": 0.9019, "step": 4987 }, { "epoch": 0.853671059387301, "grad_norm": 2.71311354637146, "learning_rate": 2.8419851682829435e-05, "loss": 0.3453, "step": 4988 }, { "epoch": 0.853842204347082, "grad_norm": 22.019060134887695, "learning_rate": 2.842555618938962e-05, "loss": 1.6718, "step": 4989 }, { "epoch": 0.8540133493068629, "grad_norm": 30.339303970336914, "learning_rate": 2.8431260695949802e-05, "loss": 2.9397, "step": 4990 }, { "epoch": 0.8541844942666439, "grad_norm": 1.055464744567871, "learning_rate": 2.8436965202509982e-05, "loss": 0.1917, "step": 4991 }, { "epoch": 0.8543556392264248, "grad_norm": 19.96885871887207, "learning_rate": 2.8442669709070165e-05, "loss": 1.551, "step": 4992 }, { "epoch": 0.8545267841862058, "grad_norm": 31.464706420898438, "learning_rate": 2.844837421563035e-05, "loss": 1.7026, "step": 4993 }, { "epoch": 0.8546979291459866, "grad_norm": 6.268205165863037, "learning_rate": 2.8454078722190532e-05, "loss": 0.6232, "step": 4994 }, { "epoch": 0.8548690741057676, "grad_norm": 1.3084250688552856, "learning_rate": 2.8459783228750712e-05, "loss": 0.194, "step": 4995 }, { "epoch": 0.8550402190655485, "grad_norm": 24.96925163269043, "learning_rate": 2.8465487735310896e-05, "loss": 2.4543, "step": 4996 }, { "epoch": 0.8552113640253295, "grad_norm": 20.334264755249023, "learning_rate": 2.847119224187108e-05, "loss": 1.9524, "step": 4997 }, { "epoch": 0.8553825089851104, "grad_norm": 32.75785446166992, "learning_rate": 2.847689674843126e-05, "loss": 
1.7313, "step": 4998 }, { "epoch": 0.8555536539448914, "grad_norm": 20.249191284179688, "learning_rate": 2.8482601254991442e-05, "loss": 1.7413, "step": 4999 }, { "epoch": 0.8557247989046722, "grad_norm": 9.610689163208008, "learning_rate": 2.8488305761551626e-05, "loss": 0.5728, "step": 5000 }, { "epoch": 0.8558959438644532, "grad_norm": 20.597206115722656, "learning_rate": 2.849401026811181e-05, "loss": 1.9744, "step": 5001 }, { "epoch": 0.8560670888242341, "grad_norm": 22.978784561157227, "learning_rate": 2.849971477467199e-05, "loss": 1.547, "step": 5002 }, { "epoch": 0.8562382337840151, "grad_norm": 15.783121109008789, "learning_rate": 2.8505419281232176e-05, "loss": 1.2838, "step": 5003 }, { "epoch": 0.856409378743796, "grad_norm": 30.393911361694336, "learning_rate": 2.851112378779236e-05, "loss": 5.1174, "step": 5004 }, { "epoch": 0.856580523703577, "grad_norm": 30.374027252197266, "learning_rate": 2.851682829435254e-05, "loss": 6.2091, "step": 5005 }, { "epoch": 0.8567516686633578, "grad_norm": 20.213665008544922, "learning_rate": 2.8522532800912723e-05, "loss": 1.4388, "step": 5006 }, { "epoch": 0.8569228136231388, "grad_norm": 22.47203254699707, "learning_rate": 2.8528237307472906e-05, "loss": 2.1662, "step": 5007 }, { "epoch": 0.8570939585829197, "grad_norm": 67.63754272460938, "learning_rate": 2.853394181403309e-05, "loss": 8.0588, "step": 5008 }, { "epoch": 0.8572651035427007, "grad_norm": 18.833168029785156, "learning_rate": 2.853964632059327e-05, "loss": 1.5184, "step": 5009 }, { "epoch": 0.8574362485024816, "grad_norm": 20.953800201416016, "learning_rate": 2.8545350827153453e-05, "loss": 1.9016, "step": 5010 }, { "epoch": 0.8576073934622626, "grad_norm": 11.872946739196777, "learning_rate": 2.8551055333713636e-05, "loss": 0.689, "step": 5011 }, { "epoch": 0.8577785384220434, "grad_norm": 20.248685836791992, "learning_rate": 2.8556759840273816e-05, "loss": 2.0556, "step": 5012 }, { "epoch": 0.8579496833818244, "grad_norm": 12.471953392028809, 
"learning_rate": 2.8562464346834e-05, "loss": 0.8929, "step": 5013 }, { "epoch": 0.8581208283416053, "grad_norm": 9.479846954345703, "learning_rate": 2.8568168853394183e-05, "loss": 0.8552, "step": 5014 }, { "epoch": 0.8582919733013863, "grad_norm": 25.91791343688965, "learning_rate": 2.8573873359954363e-05, "loss": 2.4487, "step": 5015 }, { "epoch": 0.8584631182611672, "grad_norm": 30.794086456298828, "learning_rate": 2.8579577866514546e-05, "loss": 1.6757, "step": 5016 }, { "epoch": 0.8586342632209482, "grad_norm": 30.115619659423828, "learning_rate": 2.858528237307473e-05, "loss": 2.2301, "step": 5017 }, { "epoch": 0.858805408180729, "grad_norm": 15.450135231018066, "learning_rate": 2.8590986879634913e-05, "loss": 1.3106, "step": 5018 }, { "epoch": 0.85897655314051, "grad_norm": 23.82478904724121, "learning_rate": 2.8596691386195093e-05, "loss": 2.1391, "step": 5019 }, { "epoch": 0.8591476981002909, "grad_norm": 21.615318298339844, "learning_rate": 2.8602395892755276e-05, "loss": 1.7556, "step": 5020 }, { "epoch": 0.8593188430600719, "grad_norm": 20.00986671447754, "learning_rate": 2.860810039931546e-05, "loss": 1.8048, "step": 5021 }, { "epoch": 0.8594899880198528, "grad_norm": 1.3434984683990479, "learning_rate": 2.861380490587564e-05, "loss": 0.2228, "step": 5022 }, { "epoch": 0.8596611329796338, "grad_norm": 12.534896850585938, "learning_rate": 2.8619509412435823e-05, "loss": 1.6978, "step": 5023 }, { "epoch": 0.8598322779394146, "grad_norm": 30.588682174682617, "learning_rate": 2.8625213918996006e-05, "loss": 2.4971, "step": 5024 }, { "epoch": 0.8600034228991956, "grad_norm": 16.27142333984375, "learning_rate": 2.8630918425556193e-05, "loss": 1.2722, "step": 5025 }, { "epoch": 0.8601745678589765, "grad_norm": 8.828721046447754, "learning_rate": 2.8636622932116373e-05, "loss": 0.4231, "step": 5026 }, { "epoch": 0.8603457128187575, "grad_norm": 11.027689933776855, "learning_rate": 2.8642327438676557e-05, "loss": 0.7595, "step": 5027 }, { "epoch": 
0.8605168577785384, "grad_norm": 23.096948623657227, "learning_rate": 2.864803194523674e-05, "loss": 2.6978, "step": 5028 }, { "epoch": 0.8606880027383194, "grad_norm": 20.453035354614258, "learning_rate": 2.865373645179692e-05, "loss": 2.2458, "step": 5029 }, { "epoch": 0.8608591476981002, "grad_norm": 16.359405517578125, "learning_rate": 2.8659440958357103e-05, "loss": 1.6137, "step": 5030 }, { "epoch": 0.8610302926578812, "grad_norm": 21.571128845214844, "learning_rate": 2.8665145464917287e-05, "loss": 1.8346, "step": 5031 }, { "epoch": 0.8612014376176622, "grad_norm": 30.24646759033203, "learning_rate": 2.867084997147747e-05, "loss": 3.9606, "step": 5032 }, { "epoch": 0.8613725825774431, "grad_norm": 20.61684799194336, "learning_rate": 2.867655447803765e-05, "loss": 2.4751, "step": 5033 }, { "epoch": 0.8615437275372241, "grad_norm": 22.913393020629883, "learning_rate": 2.8682258984597833e-05, "loss": 2.5773, "step": 5034 }, { "epoch": 0.861714872497005, "grad_norm": 16.3863582611084, "learning_rate": 2.8687963491158017e-05, "loss": 1.2117, "step": 5035 }, { "epoch": 0.861886017456786, "grad_norm": 1.4799400568008423, "learning_rate": 2.8693667997718197e-05, "loss": 0.2188, "step": 5036 }, { "epoch": 0.8620571624165668, "grad_norm": 2.393829822540283, "learning_rate": 2.869937250427838e-05, "loss": 0.3249, "step": 5037 }, { "epoch": 0.8622283073763478, "grad_norm": 6.015737533569336, "learning_rate": 2.8705077010838564e-05, "loss": 0.475, "step": 5038 }, { "epoch": 0.8623994523361287, "grad_norm": 10.611120223999023, "learning_rate": 2.8710781517398747e-05, "loss": 0.6693, "step": 5039 }, { "epoch": 0.8625705972959097, "grad_norm": 40.11723709106445, "learning_rate": 2.8716486023958927e-05, "loss": 6.5789, "step": 5040 }, { "epoch": 0.8627417422556906, "grad_norm": 18.530126571655273, "learning_rate": 2.872219053051911e-05, "loss": 1.5516, "step": 5041 }, { "epoch": 0.8629128872154715, "grad_norm": 18.136308670043945, "learning_rate": 2.8727895037079294e-05, 
"loss": 1.4063, "step": 5042 }, { "epoch": 0.8630840321752524, "grad_norm": 25.583057403564453, "learning_rate": 2.8733599543639474e-05, "loss": 1.7697, "step": 5043 }, { "epoch": 0.8632551771350334, "grad_norm": 19.754838943481445, "learning_rate": 2.8739304050199657e-05, "loss": 1.9445, "step": 5044 }, { "epoch": 0.8634263220948143, "grad_norm": 24.454795837402344, "learning_rate": 2.874500855675984e-05, "loss": 2.0861, "step": 5045 }, { "epoch": 0.8635974670545953, "grad_norm": 21.876201629638672, "learning_rate": 2.8750713063320024e-05, "loss": 2.0072, "step": 5046 }, { "epoch": 0.8637686120143762, "grad_norm": 6.661749839782715, "learning_rate": 2.8756417569880204e-05, "loss": 0.5025, "step": 5047 }, { "epoch": 0.8639397569741571, "grad_norm": 190.44247436523438, "learning_rate": 2.876212207644039e-05, "loss": 8.5548, "step": 5048 }, { "epoch": 0.864110901933938, "grad_norm": 22.162609100341797, "learning_rate": 2.8767826583000574e-05, "loss": 2.0999, "step": 5049 }, { "epoch": 0.864282046893719, "grad_norm": 1.4531558752059937, "learning_rate": 2.8773531089560754e-05, "loss": 0.2048, "step": 5050 }, { "epoch": 0.8644531918534999, "grad_norm": 24.412981033325195, "learning_rate": 2.8779235596120937e-05, "loss": 2.3353, "step": 5051 }, { "epoch": 0.8646243368132809, "grad_norm": 31.54360580444336, "learning_rate": 2.878494010268112e-05, "loss": 1.6217, "step": 5052 }, { "epoch": 0.8647954817730618, "grad_norm": 23.30241584777832, "learning_rate": 2.87906446092413e-05, "loss": 2.1526, "step": 5053 }, { "epoch": 0.8649666267328427, "grad_norm": 18.61789321899414, "learning_rate": 2.8796349115801484e-05, "loss": 1.5916, "step": 5054 }, { "epoch": 0.8651377716926236, "grad_norm": 8.036980628967285, "learning_rate": 2.8802053622361667e-05, "loss": 0.6728, "step": 5055 }, { "epoch": 0.8653089166524046, "grad_norm": 1.1689972877502441, "learning_rate": 2.880775812892185e-05, "loss": 0.1996, "step": 5056 }, { "epoch": 0.8654800616121855, "grad_norm": 
20.204479217529297, "learning_rate": 2.881346263548203e-05, "loss": 1.8613, "step": 5057 }, { "epoch": 0.8656512065719665, "grad_norm": 20.377853393554688, "learning_rate": 2.8819167142042214e-05, "loss": 1.9245, "step": 5058 }, { "epoch": 0.8658223515317474, "grad_norm": 32.21659851074219, "learning_rate": 2.8824871648602398e-05, "loss": 4.4384, "step": 5059 }, { "epoch": 0.8659934964915283, "grad_norm": 17.437664031982422, "learning_rate": 2.8830576155162578e-05, "loss": 1.8003, "step": 5060 }, { "epoch": 0.8661646414513092, "grad_norm": 8.696148872375488, "learning_rate": 2.883628066172276e-05, "loss": 0.8588, "step": 5061 }, { "epoch": 0.8663357864110902, "grad_norm": 38.84085464477539, "learning_rate": 2.8841985168282944e-05, "loss": 6.4101, "step": 5062 }, { "epoch": 0.8665069313708711, "grad_norm": 22.779674530029297, "learning_rate": 2.8847689674843128e-05, "loss": 2.1035, "step": 5063 }, { "epoch": 0.8666780763306521, "grad_norm": 20.018033981323242, "learning_rate": 2.8853394181403308e-05, "loss": 1.7867, "step": 5064 }, { "epoch": 0.866849221290433, "grad_norm": 20.817026138305664, "learning_rate": 2.885909868796349e-05, "loss": 1.8507, "step": 5065 }, { "epoch": 0.867020366250214, "grad_norm": 22.181928634643555, "learning_rate": 2.8864803194523674e-05, "loss": 2.3944, "step": 5066 }, { "epoch": 0.8671915112099948, "grad_norm": 21.743671417236328, "learning_rate": 2.8870507701083854e-05, "loss": 2.0094, "step": 5067 }, { "epoch": 0.8673626561697758, "grad_norm": 18.235118865966797, "learning_rate": 2.8876212207644038e-05, "loss": 1.7605, "step": 5068 }, { "epoch": 0.8675338011295567, "grad_norm": 16.165708541870117, "learning_rate": 2.888191671420422e-05, "loss": 1.5405, "step": 5069 }, { "epoch": 0.8677049460893377, "grad_norm": 33.633140563964844, "learning_rate": 2.8887621220764405e-05, "loss": 6.2029, "step": 5070 }, { "epoch": 0.8678760910491186, "grad_norm": 9.106796264648438, "learning_rate": 2.8893325727324588e-05, "loss": 0.5696, "step": 5071 
}, { "epoch": 0.8680472360088995, "grad_norm": 10.702054023742676, "learning_rate": 2.889903023388477e-05, "loss": 0.8735, "step": 5072 }, { "epoch": 0.8682183809686804, "grad_norm": 34.0421142578125, "learning_rate": 2.8904734740444955e-05, "loss": 2.06, "step": 5073 }, { "epoch": 0.8683895259284614, "grad_norm": 26.271299362182617, "learning_rate": 2.8910439247005135e-05, "loss": 2.3461, "step": 5074 }, { "epoch": 0.8685606708882423, "grad_norm": 24.45592498779297, "learning_rate": 2.8916143753565318e-05, "loss": 2.7885, "step": 5075 }, { "epoch": 0.8687318158480233, "grad_norm": 29.799867630004883, "learning_rate": 2.89218482601255e-05, "loss": 3.7322, "step": 5076 }, { "epoch": 0.8689029608078042, "grad_norm": 50.683589935302734, "learning_rate": 2.8927552766685685e-05, "loss": 1.8316, "step": 5077 }, { "epoch": 0.8690741057675851, "grad_norm": 8.911881446838379, "learning_rate": 2.8933257273245865e-05, "loss": 0.9942, "step": 5078 }, { "epoch": 0.869245250727366, "grad_norm": 14.339098930358887, "learning_rate": 2.8938961779806048e-05, "loss": 1.3359, "step": 5079 }, { "epoch": 0.869416395687147, "grad_norm": 60.05573272705078, "learning_rate": 2.894466628636623e-05, "loss": 6.966, "step": 5080 }, { "epoch": 0.869587540646928, "grad_norm": 30.60601806640625, "learning_rate": 2.895037079292641e-05, "loss": 5.8897, "step": 5081 }, { "epoch": 0.8697586856067089, "grad_norm": 19.455875396728516, "learning_rate": 2.8956075299486595e-05, "loss": 1.5791, "step": 5082 }, { "epoch": 0.8699298305664899, "grad_norm": 11.915603637695312, "learning_rate": 2.896177980604678e-05, "loss": 0.9361, "step": 5083 }, { "epoch": 0.8701009755262707, "grad_norm": 20.202234268188477, "learning_rate": 2.8967484312606958e-05, "loss": 5.2419, "step": 5084 }, { "epoch": 0.8702721204860517, "grad_norm": 6.621884346008301, "learning_rate": 2.897318881916714e-05, "loss": 0.6413, "step": 5085 }, { "epoch": 0.8704432654458326, "grad_norm": 28.005216598510742, "learning_rate": 
2.8978893325727325e-05, "loss": 2.8525, "step": 5086 }, { "epoch": 0.8706144104056136, "grad_norm": 20.600162506103516, "learning_rate": 2.898459783228751e-05, "loss": 1.7191, "step": 5087 }, { "epoch": 0.8707855553653945, "grad_norm": 6.3435821533203125, "learning_rate": 2.899030233884769e-05, "loss": 0.5639, "step": 5088 }, { "epoch": 0.8709567003251755, "grad_norm": 25.650978088378906, "learning_rate": 2.8996006845407872e-05, "loss": 5.7733, "step": 5089 }, { "epoch": 0.8711278452849563, "grad_norm": 22.818950653076172, "learning_rate": 2.9001711351968055e-05, "loss": 2.128, "step": 5090 }, { "epoch": 0.8712989902447373, "grad_norm": 5.952839374542236, "learning_rate": 2.9007415858528235e-05, "loss": 0.4348, "step": 5091 }, { "epoch": 0.8714701352045182, "grad_norm": 1.6554477214813232, "learning_rate": 2.901312036508842e-05, "loss": 0.2175, "step": 5092 }, { "epoch": 0.8716412801642992, "grad_norm": 22.035751342773438, "learning_rate": 2.9018824871648602e-05, "loss": 2.054, "step": 5093 }, { "epoch": 0.8718124251240801, "grad_norm": 1.5505925416946411, "learning_rate": 2.902452937820879e-05, "loss": 0.2221, "step": 5094 }, { "epoch": 0.8719835700838611, "grad_norm": 27.077693939208984, "learning_rate": 2.903023388476897e-05, "loss": 3.1948, "step": 5095 }, { "epoch": 0.872154715043642, "grad_norm": 22.83882713317871, "learning_rate": 2.9035938391329152e-05, "loss": 2.3321, "step": 5096 }, { "epoch": 0.8723258600034229, "grad_norm": 4.607493877410889, "learning_rate": 2.9041642897889335e-05, "loss": 0.4254, "step": 5097 }, { "epoch": 0.8724970049632038, "grad_norm": 17.07670783996582, "learning_rate": 2.9047347404449515e-05, "loss": 1.4832, "step": 5098 }, { "epoch": 0.8726681499229848, "grad_norm": 15.903471946716309, "learning_rate": 2.90530519110097e-05, "loss": 1.5948, "step": 5099 }, { "epoch": 0.8728392948827657, "grad_norm": 1.760149598121643, "learning_rate": 2.9058756417569882e-05, "loss": 0.2047, "step": 5100 }, { "epoch": 0.8730104398425467, 
"grad_norm": 6.830605983734131, "learning_rate": 2.9064460924130066e-05, "loss": 0.6232, "step": 5101 }, { "epoch": 0.8731815848023275, "grad_norm": 24.54201316833496, "learning_rate": 2.9070165430690246e-05, "loss": 2.2265, "step": 5102 }, { "epoch": 0.8733527297621085, "grad_norm": 20.139869689941406, "learning_rate": 2.907586993725043e-05, "loss": 2.3484, "step": 5103 }, { "epoch": 0.8735238747218894, "grad_norm": 17.505416870117188, "learning_rate": 2.9081574443810612e-05, "loss": 1.434, "step": 5104 }, { "epoch": 0.8736950196816704, "grad_norm": 33.22725296020508, "learning_rate": 2.9087278950370792e-05, "loss": 6.1542, "step": 5105 }, { "epoch": 0.8738661646414513, "grad_norm": 146.86790466308594, "learning_rate": 2.9092983456930976e-05, "loss": 9.0732, "step": 5106 }, { "epoch": 0.8740373096012323, "grad_norm": 16.295713424682617, "learning_rate": 2.909868796349116e-05, "loss": 1.4125, "step": 5107 }, { "epoch": 0.8742084545610131, "grad_norm": 14.710768699645996, "learning_rate": 2.9104392470051342e-05, "loss": 1.1648, "step": 5108 }, { "epoch": 0.8743795995207941, "grad_norm": 12.489028930664062, "learning_rate": 2.9110096976611522e-05, "loss": 1.268, "step": 5109 }, { "epoch": 0.874550744480575, "grad_norm": 13.691608428955078, "learning_rate": 2.9115801483171706e-05, "loss": 1.085, "step": 5110 }, { "epoch": 0.874721889440356, "grad_norm": 23.710769653320312, "learning_rate": 2.912150598973189e-05, "loss": 2.4187, "step": 5111 }, { "epoch": 0.8748930344001369, "grad_norm": 25.6949405670166, "learning_rate": 2.912721049629207e-05, "loss": 2.5228, "step": 5112 }, { "epoch": 0.8750641793599179, "grad_norm": 21.368337631225586, "learning_rate": 2.9132915002852253e-05, "loss": 1.8782, "step": 5113 }, { "epoch": 0.8752353243196987, "grad_norm": 82.89339447021484, "learning_rate": 2.9138619509412436e-05, "loss": 8.207, "step": 5114 }, { "epoch": 0.8754064692794797, "grad_norm": 16.71074676513672, "learning_rate": 2.914432401597262e-05, "loss": 1.4629, "step": 
5115 }, { "epoch": 0.8755776142392606, "grad_norm": 2.993272542953491, "learning_rate": 2.91500285225328e-05, "loss": 0.394, "step": 5116 }, { "epoch": 0.8757487591990416, "grad_norm": 14.644696235656738, "learning_rate": 2.9155733029092986e-05, "loss": 1.3512, "step": 5117 }, { "epoch": 0.8759199041588225, "grad_norm": 19.816692352294922, "learning_rate": 2.916143753565317e-05, "loss": 2.3304, "step": 5118 }, { "epoch": 0.8760910491186035, "grad_norm": 33.53482437133789, "learning_rate": 2.916714204221335e-05, "loss": 6.1537, "step": 5119 }, { "epoch": 0.8762621940783843, "grad_norm": 18.77507972717285, "learning_rate": 2.9172846548773533e-05, "loss": 1.5366, "step": 5120 }, { "epoch": 0.8764333390381653, "grad_norm": 19.784685134887695, "learning_rate": 2.9178551055333716e-05, "loss": 2.1277, "step": 5121 }, { "epoch": 0.8766044839979462, "grad_norm": 9.049237251281738, "learning_rate": 2.9184255561893896e-05, "loss": 0.6134, "step": 5122 }, { "epoch": 0.8767756289577272, "grad_norm": 8.538041114807129, "learning_rate": 2.918996006845408e-05, "loss": 0.5802, "step": 5123 }, { "epoch": 0.8769467739175081, "grad_norm": 26.41038703918457, "learning_rate": 2.9195664575014263e-05, "loss": 2.5314, "step": 5124 }, { "epoch": 0.8771179188772891, "grad_norm": 9.25900936126709, "learning_rate": 2.9201369081574446e-05, "loss": 0.7571, "step": 5125 }, { "epoch": 0.87728906383707, "grad_norm": 28.40367317199707, "learning_rate": 2.9207073588134626e-05, "loss": 2.4607, "step": 5126 }, { "epoch": 0.8774602087968509, "grad_norm": 28.677005767822266, "learning_rate": 2.921277809469481e-05, "loss": 3.544, "step": 5127 }, { "epoch": 0.8776313537566318, "grad_norm": 1.4200770854949951, "learning_rate": 2.9218482601254993e-05, "loss": 0.2095, "step": 5128 }, { "epoch": 0.8778024987164128, "grad_norm": 18.276594161987305, "learning_rate": 2.9224187107815173e-05, "loss": 1.494, "step": 5129 }, { "epoch": 0.8779736436761937, "grad_norm": 24.763029098510742, "learning_rate": 
2.9229891614375356e-05, "loss": 2.4242, "step": 5130 }, { "epoch": 0.8781447886359747, "grad_norm": 10.431611061096191, "learning_rate": 2.923559612093554e-05, "loss": 0.6153, "step": 5131 }, { "epoch": 0.8783159335957557, "grad_norm": 58.08489990234375, "learning_rate": 2.9241300627495723e-05, "loss": 1.706, "step": 5132 }, { "epoch": 0.8784870785555365, "grad_norm": 22.97351837158203, "learning_rate": 2.9247005134055903e-05, "loss": 2.2527, "step": 5133 }, { "epoch": 0.8786582235153175, "grad_norm": 27.15216827392578, "learning_rate": 2.9252709640616087e-05, "loss": 2.759, "step": 5134 }, { "epoch": 0.8788293684750984, "grad_norm": 22.588558197021484, "learning_rate": 2.925841414717627e-05, "loss": 2.6227, "step": 5135 }, { "epoch": 0.8790005134348794, "grad_norm": 33.35072326660156, "learning_rate": 2.926411865373645e-05, "loss": 1.2665, "step": 5136 }, { "epoch": 0.8791716583946603, "grad_norm": 41.13359451293945, "learning_rate": 2.9269823160296633e-05, "loss": 5.9406, "step": 5137 }, { "epoch": 0.8793428033544413, "grad_norm": 23.982492446899414, "learning_rate": 2.9275527666856817e-05, "loss": 2.9734, "step": 5138 }, { "epoch": 0.8795139483142221, "grad_norm": 16.065969467163086, "learning_rate": 2.9281232173417e-05, "loss": 1.4901, "step": 5139 }, { "epoch": 0.8796850932740031, "grad_norm": 29.961244583129883, "learning_rate": 2.9286936679977183e-05, "loss": 1.5246, "step": 5140 }, { "epoch": 0.879856238233784, "grad_norm": 33.30720138549805, "learning_rate": 2.9292641186537367e-05, "loss": 5.9995, "step": 5141 }, { "epoch": 0.880027383193565, "grad_norm": 24.118555068969727, "learning_rate": 2.929834569309755e-05, "loss": 2.6075, "step": 5142 }, { "epoch": 0.8801985281533459, "grad_norm": 19.221567153930664, "learning_rate": 2.930405019965773e-05, "loss": 1.7106, "step": 5143 }, { "epoch": 0.8803696731131269, "grad_norm": 22.26023292541504, "learning_rate": 2.9309754706217914e-05, "loss": 2.5151, "step": 5144 }, { "epoch": 0.8805408180729077, "grad_norm": 
13.728766441345215, "learning_rate": 2.9315459212778097e-05, "loss": 1.1806, "step": 5145 }, { "epoch": 0.8807119630326887, "grad_norm": 20.878938674926758, "learning_rate": 2.932116371933828e-05, "loss": 2.107, "step": 5146 }, { "epoch": 0.8808831079924696, "grad_norm": 21.325651168823242, "learning_rate": 2.932686822589846e-05, "loss": 2.1829, "step": 5147 }, { "epoch": 0.8810542529522506, "grad_norm": 1.0309278964996338, "learning_rate": 2.9332572732458644e-05, "loss": 0.1782, "step": 5148 }, { "epoch": 0.8812253979120315, "grad_norm": 4.816126823425293, "learning_rate": 2.9338277239018827e-05, "loss": 0.3899, "step": 5149 }, { "epoch": 0.8813965428718125, "grad_norm": 1.190704584121704, "learning_rate": 2.9343981745579007e-05, "loss": 0.2021, "step": 5150 }, { "epoch": 0.8815676878315933, "grad_norm": 21.205900192260742, "learning_rate": 2.934968625213919e-05, "loss": 2.7627, "step": 5151 }, { "epoch": 0.8817388327913743, "grad_norm": 1.5020864009857178, "learning_rate": 2.9355390758699374e-05, "loss": 0.2301, "step": 5152 }, { "epoch": 0.8819099777511552, "grad_norm": 0.8392736315727234, "learning_rate": 2.9361095265259554e-05, "loss": 0.1906, "step": 5153 }, { "epoch": 0.8820811227109362, "grad_norm": 0.8165884017944336, "learning_rate": 2.9366799771819737e-05, "loss": 0.1924, "step": 5154 }, { "epoch": 0.8822522676707171, "grad_norm": 16.63069725036621, "learning_rate": 2.937250427837992e-05, "loss": 1.2813, "step": 5155 }, { "epoch": 0.8824234126304981, "grad_norm": 8.837629318237305, "learning_rate": 2.9378208784940104e-05, "loss": 0.5551, "step": 5156 }, { "epoch": 0.8825945575902789, "grad_norm": 30.035585403442383, "learning_rate": 2.9383913291500284e-05, "loss": 5.6298, "step": 5157 }, { "epoch": 0.8827657025500599, "grad_norm": 18.92460060119629, "learning_rate": 2.9389617798060467e-05, "loss": 2.1136, "step": 5158 }, { "epoch": 0.8829368475098408, "grad_norm": 18.019941329956055, "learning_rate": 2.939532230462065e-05, "loss": 1.701, "step": 5159 }, 
{ "epoch": 0.8831079924696218, "grad_norm": 25.73262596130371, "learning_rate": 2.940102681118083e-05, "loss": 2.4158, "step": 5160 }, { "epoch": 0.8832791374294027, "grad_norm": 30.859712600708008, "learning_rate": 2.9406731317741014e-05, "loss": 5.5833, "step": 5161 }, { "epoch": 0.8834502823891837, "grad_norm": 18.802223205566406, "learning_rate": 2.9412435824301197e-05, "loss": 1.9511, "step": 5162 }, { "epoch": 0.8836214273489645, "grad_norm": 5.314499855041504, "learning_rate": 2.9418140330861384e-05, "loss": 0.4971, "step": 5163 }, { "epoch": 0.8837925723087455, "grad_norm": 27.17005157470703, "learning_rate": 2.9423844837421564e-05, "loss": 3.7148, "step": 5164 }, { "epoch": 0.8839637172685264, "grad_norm": 25.687992095947266, "learning_rate": 2.9429549343981748e-05, "loss": 3.388, "step": 5165 }, { "epoch": 0.8841348622283074, "grad_norm": 29.165775299072266, "learning_rate": 2.943525385054193e-05, "loss": 3.7435, "step": 5166 }, { "epoch": 0.8843060071880883, "grad_norm": 21.13896942138672, "learning_rate": 2.944095835710211e-05, "loss": 1.7033, "step": 5167 }, { "epoch": 0.8844771521478693, "grad_norm": 14.15404224395752, "learning_rate": 2.9446662863662294e-05, "loss": 1.156, "step": 5168 }, { "epoch": 0.8846482971076501, "grad_norm": 27.660737991333008, "learning_rate": 2.9452367370222478e-05, "loss": 1.8129, "step": 5169 }, { "epoch": 0.8848194420674311, "grad_norm": 24.719099044799805, "learning_rate": 2.945807187678266e-05, "loss": 2.121, "step": 5170 }, { "epoch": 0.884990587027212, "grad_norm": 21.35886001586914, "learning_rate": 2.946377638334284e-05, "loss": 2.0431, "step": 5171 }, { "epoch": 0.885161731986993, "grad_norm": 22.36219596862793, "learning_rate": 2.9469480889903024e-05, "loss": 2.134, "step": 5172 }, { "epoch": 0.8853328769467739, "grad_norm": 24.10101890563965, "learning_rate": 2.9475185396463208e-05, "loss": 2.071, "step": 5173 }, { "epoch": 0.8855040219065549, "grad_norm": 18.05703353881836, "learning_rate": 
2.9480889903023388e-05, "loss": 1.558, "step": 5174 }, { "epoch": 0.8856751668663357, "grad_norm": 26.822391510009766, "learning_rate": 2.948659440958357e-05, "loss": 2.3976, "step": 5175 }, { "epoch": 0.8858463118261167, "grad_norm": 18.833152770996094, "learning_rate": 2.9492298916143755e-05, "loss": 1.6977, "step": 5176 }, { "epoch": 0.8860174567858976, "grad_norm": 19.91875648498535, "learning_rate": 2.9498003422703938e-05, "loss": 1.5125, "step": 5177 }, { "epoch": 0.8861886017456786, "grad_norm": 20.179113388061523, "learning_rate": 2.9503707929264118e-05, "loss": 1.9563, "step": 5178 }, { "epoch": 0.8863597467054595, "grad_norm": 22.212738037109375, "learning_rate": 2.95094124358243e-05, "loss": 1.8149, "step": 5179 }, { "epoch": 0.8865308916652405, "grad_norm": 19.415225982666016, "learning_rate": 2.9515116942384485e-05, "loss": 1.6777, "step": 5180 }, { "epoch": 0.8867020366250213, "grad_norm": 31.313318252563477, "learning_rate": 2.9520821448944665e-05, "loss": 4.1531, "step": 5181 }, { "epoch": 0.8868731815848023, "grad_norm": 34.813720703125, "learning_rate": 2.9526525955504848e-05, "loss": 5.9337, "step": 5182 }, { "epoch": 0.8870443265445833, "grad_norm": 5.619294166564941, "learning_rate": 2.953223046206503e-05, "loss": 0.4206, "step": 5183 }, { "epoch": 0.8872154715043642, "grad_norm": 18.10093116760254, "learning_rate": 2.953793496862521e-05, "loss": 1.8818, "step": 5184 }, { "epoch": 0.8873866164641452, "grad_norm": 22.031768798828125, "learning_rate": 2.9543639475185398e-05, "loss": 2.6025, "step": 5185 }, { "epoch": 0.8875577614239261, "grad_norm": 19.005178451538086, "learning_rate": 2.954934398174558e-05, "loss": 2.3032, "step": 5186 }, { "epoch": 0.887728906383707, "grad_norm": 2.4968888759613037, "learning_rate": 2.9555048488305765e-05, "loss": 0.2314, "step": 5187 }, { "epoch": 0.8879000513434879, "grad_norm": 15.055726051330566, "learning_rate": 2.9560752994865945e-05, "loss": 1.2134, "step": 5188 }, { "epoch": 0.8880711963032689, 
"grad_norm": 16.972787857055664, "learning_rate": 2.956645750142613e-05, "loss": 1.5495, "step": 5189 }, { "epoch": 0.8882423412630498, "grad_norm": 5.097226142883301, "learning_rate": 2.9572162007986312e-05, "loss": 0.4263, "step": 5190 }, { "epoch": 0.8884134862228308, "grad_norm": 23.945755004882812, "learning_rate": 2.9577866514546492e-05, "loss": 1.9982, "step": 5191 }, { "epoch": 0.8885846311826117, "grad_norm": 18.41358184814453, "learning_rate": 2.9583571021106675e-05, "loss": 1.5775, "step": 5192 }, { "epoch": 0.8887557761423927, "grad_norm": 20.26495361328125, "learning_rate": 2.958927552766686e-05, "loss": 2.0699, "step": 5193 }, { "epoch": 0.8889269211021735, "grad_norm": 21.891618728637695, "learning_rate": 2.9594980034227042e-05, "loss": 1.8668, "step": 5194 }, { "epoch": 0.8890980660619545, "grad_norm": 18.51753807067871, "learning_rate": 2.9600684540787222e-05, "loss": 1.7949, "step": 5195 }, { "epoch": 0.8892692110217354, "grad_norm": 21.540264129638672, "learning_rate": 2.9606389047347405e-05, "loss": 2.2159, "step": 5196 }, { "epoch": 0.8894403559815164, "grad_norm": 25.46014976501465, "learning_rate": 2.961209355390759e-05, "loss": 2.5065, "step": 5197 }, { "epoch": 0.8896115009412973, "grad_norm": 37.4268798828125, "learning_rate": 2.961779806046777e-05, "loss": 1.9917, "step": 5198 }, { "epoch": 0.8897826459010783, "grad_norm": 25.361825942993164, "learning_rate": 2.9623502567027952e-05, "loss": 2.6214, "step": 5199 }, { "epoch": 0.8899537908608591, "grad_norm": 8.111501693725586, "learning_rate": 2.9629207073588135e-05, "loss": 0.8144, "step": 5200 }, { "epoch": 0.8901249358206401, "grad_norm": 18.618261337280273, "learning_rate": 2.963491158014832e-05, "loss": 1.9664, "step": 5201 }, { "epoch": 0.890296080780421, "grad_norm": 17.81608009338379, "learning_rate": 2.96406160867085e-05, "loss": 1.5596, "step": 5202 }, { "epoch": 0.890467225740202, "grad_norm": 5.169036388397217, "learning_rate": 2.9646320593268682e-05, "loss": 0.4681, "step": 
5203 }, { "epoch": 0.8906383706999829, "grad_norm": 21.34773826599121, "learning_rate": 2.9652025099828865e-05, "loss": 2.1088, "step": 5204 }, { "epoch": 0.8908095156597639, "grad_norm": 13.635762214660645, "learning_rate": 2.9657729606389045e-05, "loss": 1.147, "step": 5205 }, { "epoch": 0.8909806606195447, "grad_norm": 21.587596893310547, "learning_rate": 2.966343411294923e-05, "loss": 2.0986, "step": 5206 }, { "epoch": 0.8911518055793257, "grad_norm": 1.29149329662323, "learning_rate": 2.9669138619509412e-05, "loss": 0.2093, "step": 5207 }, { "epoch": 0.8913229505391066, "grad_norm": 20.227731704711914, "learning_rate": 2.96748431260696e-05, "loss": 1.682, "step": 5208 }, { "epoch": 0.8914940954988876, "grad_norm": 12.934089660644531, "learning_rate": 2.968054763262978e-05, "loss": 0.6278, "step": 5209 }, { "epoch": 0.8916652404586685, "grad_norm": 18.040390014648438, "learning_rate": 2.9686252139189962e-05, "loss": 1.561, "step": 5210 }, { "epoch": 0.8918363854184495, "grad_norm": 24.204835891723633, "learning_rate": 2.9691956645750146e-05, "loss": 1.7786, "step": 5211 }, { "epoch": 0.8920075303782303, "grad_norm": 23.571611404418945, "learning_rate": 2.9697661152310326e-05, "loss": 2.0676, "step": 5212 }, { "epoch": 0.8921786753380113, "grad_norm": 22.09473991394043, "learning_rate": 2.970336565887051e-05, "loss": 2.099, "step": 5213 }, { "epoch": 0.8923498202977922, "grad_norm": 1.3482792377471924, "learning_rate": 2.9709070165430692e-05, "loss": 0.2145, "step": 5214 }, { "epoch": 0.8925209652575732, "grad_norm": 21.764923095703125, "learning_rate": 2.9714774671990872e-05, "loss": 2.0434, "step": 5215 }, { "epoch": 0.8926921102173541, "grad_norm": 19.195314407348633, "learning_rate": 2.9720479178551056e-05, "loss": 1.4061, "step": 5216 }, { "epoch": 0.892863255177135, "grad_norm": 87.61866760253906, "learning_rate": 2.972618368511124e-05, "loss": 7.9821, "step": 5217 }, { "epoch": 0.8930344001369159, "grad_norm": 19.71204376220703, "learning_rate": 
2.9731888191671423e-05, "loss": 1.7201, "step": 5218 }, { "epoch": 0.8932055450966969, "grad_norm": 25.464221954345703, "learning_rate": 2.9737592698231603e-05, "loss": 2.8268, "step": 5219 }, { "epoch": 0.8933766900564778, "grad_norm": 1.361387014389038, "learning_rate": 2.9743297204791786e-05, "loss": 0.2127, "step": 5220 }, { "epoch": 0.8935478350162588, "grad_norm": 18.932161331176758, "learning_rate": 2.974900171135197e-05, "loss": 1.7737, "step": 5221 }, { "epoch": 0.8937189799760397, "grad_norm": 6.523068904876709, "learning_rate": 2.975470621791215e-05, "loss": 0.5011, "step": 5222 }, { "epoch": 0.8938901249358207, "grad_norm": 21.28632926940918, "learning_rate": 2.9760410724472333e-05, "loss": 2.0839, "step": 5223 }, { "epoch": 0.8940612698956015, "grad_norm": 2.508774995803833, "learning_rate": 2.9766115231032516e-05, "loss": 0.2127, "step": 5224 }, { "epoch": 0.8942324148553825, "grad_norm": 22.384605407714844, "learning_rate": 2.97718197375927e-05, "loss": 1.7595, "step": 5225 }, { "epoch": 0.8944035598151634, "grad_norm": 13.072036743164062, "learning_rate": 2.977752424415288e-05, "loss": 1.0962, "step": 5226 }, { "epoch": 0.8945747047749444, "grad_norm": 27.329818725585938, "learning_rate": 2.9783228750713063e-05, "loss": 2.492, "step": 5227 }, { "epoch": 0.8947458497347253, "grad_norm": 0.9053159952163696, "learning_rate": 2.9788933257273246e-05, "loss": 0.1874, "step": 5228 }, { "epoch": 0.8949169946945063, "grad_norm": 17.836803436279297, "learning_rate": 2.9794637763833426e-05, "loss": 1.5566, "step": 5229 }, { "epoch": 0.8950881396542871, "grad_norm": 15.548909187316895, "learning_rate": 2.980034227039361e-05, "loss": 1.2955, "step": 5230 }, { "epoch": 0.8952592846140681, "grad_norm": 19.422529220581055, "learning_rate": 2.9806046776953796e-05, "loss": 2.5122, "step": 5231 }, { "epoch": 0.895430429573849, "grad_norm": 0.8730620741844177, "learning_rate": 2.981175128351398e-05, "loss": 0.1783, "step": 5232 }, { "epoch": 0.89560157453363, 
"grad_norm": 22.175594329833984, "learning_rate": 2.981745579007416e-05, "loss": 1.6344, "step": 5233 }, { "epoch": 0.895772719493411, "grad_norm": 4.08912467956543, "learning_rate": 2.9823160296634343e-05, "loss": 0.3297, "step": 5234 }, { "epoch": 0.8959438644531919, "grad_norm": 20.610801696777344, "learning_rate": 2.9828864803194527e-05, "loss": 2.0051, "step": 5235 }, { "epoch": 0.8961150094129728, "grad_norm": 19.67643165588379, "learning_rate": 2.9834569309754706e-05, "loss": 1.281, "step": 5236 }, { "epoch": 0.8962861543727537, "grad_norm": 22.13687515258789, "learning_rate": 2.984027381631489e-05, "loss": 2.5087, "step": 5237 }, { "epoch": 0.8964572993325347, "grad_norm": 5.210666656494141, "learning_rate": 2.9845978322875073e-05, "loss": 0.4318, "step": 5238 }, { "epoch": 0.8966284442923156, "grad_norm": 31.837879180908203, "learning_rate": 2.9851682829435257e-05, "loss": 3.9733, "step": 5239 }, { "epoch": 0.8967995892520966, "grad_norm": 24.229183197021484, "learning_rate": 2.9857387335995437e-05, "loss": 2.0104, "step": 5240 }, { "epoch": 0.8969707342118775, "grad_norm": 62.011810302734375, "learning_rate": 2.986309184255562e-05, "loss": 7.4416, "step": 5241 }, { "epoch": 0.8971418791716584, "grad_norm": 23.459850311279297, "learning_rate": 2.9868796349115803e-05, "loss": 2.7344, "step": 5242 }, { "epoch": 0.8973130241314393, "grad_norm": 106.41157531738281, "learning_rate": 2.9874500855675983e-05, "loss": 8.8184, "step": 5243 }, { "epoch": 0.8974841690912203, "grad_norm": 18.093460083007812, "learning_rate": 2.9880205362236167e-05, "loss": 1.3483, "step": 5244 }, { "epoch": 0.8976553140510012, "grad_norm": 6.958301544189453, "learning_rate": 2.988590986879635e-05, "loss": 0.5888, "step": 5245 }, { "epoch": 0.8978264590107822, "grad_norm": 28.051706314086914, "learning_rate": 2.9891614375356534e-05, "loss": 3.8837, "step": 5246 }, { "epoch": 0.897997603970563, "grad_norm": 16.499065399169922, "learning_rate": 2.9897318881916713e-05, "loss": 1.548, 
"step": 5247 }, { "epoch": 0.898168748930344, "grad_norm": 24.6423282623291, "learning_rate": 2.9903023388476897e-05, "loss": 2.0936, "step": 5248 }, { "epoch": 0.8983398938901249, "grad_norm": 23.600177764892578, "learning_rate": 2.990872789503708e-05, "loss": 2.4881, "step": 5249 }, { "epoch": 0.8985110388499059, "grad_norm": 6.3298163414001465, "learning_rate": 2.991443240159726e-05, "loss": 0.8614, "step": 5250 }, { "epoch": 0.8986821838096868, "grad_norm": 84.1204833984375, "learning_rate": 2.9920136908157444e-05, "loss": 7.2037, "step": 5251 }, { "epoch": 0.8988533287694678, "grad_norm": 13.504063606262207, "learning_rate": 2.9925841414717627e-05, "loss": 1.2107, "step": 5252 }, { "epoch": 0.8990244737292487, "grad_norm": 10.01652717590332, "learning_rate": 2.9931545921277807e-05, "loss": 0.6106, "step": 5253 }, { "epoch": 0.8991956186890296, "grad_norm": 139.29315185546875, "learning_rate": 2.9937250427837994e-05, "loss": 8.8993, "step": 5254 }, { "epoch": 0.8993667636488105, "grad_norm": 31.561298370361328, "learning_rate": 2.9942954934398177e-05, "loss": 3.2313, "step": 5255 }, { "epoch": 0.8995379086085915, "grad_norm": 2.1672093868255615, "learning_rate": 2.994865944095836e-05, "loss": 0.2206, "step": 5256 }, { "epoch": 0.8997090535683724, "grad_norm": 6.4866414070129395, "learning_rate": 2.995436394751854e-05, "loss": 0.5123, "step": 5257 }, { "epoch": 0.8998801985281534, "grad_norm": 12.993927955627441, "learning_rate": 2.9960068454078724e-05, "loss": 1.0112, "step": 5258 }, { "epoch": 0.9000513434879343, "grad_norm": 27.867324829101562, "learning_rate": 2.9965772960638907e-05, "loss": 3.0155, "step": 5259 }, { "epoch": 0.9002224884477152, "grad_norm": 27.61526870727539, "learning_rate": 2.9971477467199087e-05, "loss": 3.0738, "step": 5260 }, { "epoch": 0.9003936334074961, "grad_norm": 38.995018005371094, "learning_rate": 2.997718197375927e-05, "loss": 1.5856, "step": 5261 }, { "epoch": 0.9005647783672771, "grad_norm": 1.1415177583694458, 
"learning_rate": 2.9982886480319454e-05, "loss": 0.2026, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_nli-pairs_loss": 1.9112855195999146, "eval_nli-pairs_runtime": 4.5969, "eval_nli-pairs_samples_per_second": 43.508, "eval_nli-pairs_steps_per_second": 1.523, "eval_sts-test_pearson_cosine": 0.7624463325036241, "eval_sts-test_pearson_dot": 0.6376176200770809, "eval_sts-test_pearson_euclidean": 0.7595445366220308, "eval_sts-test_pearson_manhattan": 0.7665949852672425, "eval_sts-test_pearson_max": 0.7665949852672425, "eval_sts-test_spearman_cosine": 0.7585611029472056, "eval_sts-test_spearman_dot": 0.6199386840754815, "eval_sts-test_spearman_euclidean": 0.7459963199554185, "eval_sts-test_spearman_manhattan": 0.755338152133313, "eval_sts-test_spearman_max": 0.7585611029472056, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_vitaminc-pairs_loss": 1.1131478548049927, "eval_vitaminc-pairs_runtime": 2.815, "eval_vitaminc-pairs_samples_per_second": 71.047, "eval_vitaminc-pairs_steps_per_second": 2.487, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_qnli-contrastive_loss": 2.1876909732818604, "eval_qnli-contrastive_runtime": 0.6866, "eval_qnli-contrastive_samples_per_second": 291.295, "eval_qnli-contrastive_steps_per_second": 10.195, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_scitail-pairs-qa_loss": 0.16065949201583862, "eval_scitail-pairs-qa_runtime": 1.7154, "eval_scitail-pairs-qa_samples_per_second": 116.59, "eval_scitail-pairs-qa_steps_per_second": 4.081, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_scitail-pairs-pos_loss": 0.7642461061477661, "eval_scitail-pairs-pos_runtime": 3.0222, "eval_scitail-pairs-pos_samples_per_second": 66.177, "eval_scitail-pairs-pos_steps_per_second": 2.316, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_xsum-pairs_loss": 0.9787455797195435, "eval_xsum-pairs_runtime": 2.6515, "eval_xsum-pairs_samples_per_second": 66.001, "eval_xsum-pairs_steps_per_second": 2.263, "step": 5262 }, { "epoch": 
0.9005647783672771, "eval_compression-pairs_loss": 0.35531559586524963, "eval_compression-pairs_runtime": 0.519, "eval_compression-pairs_samples_per_second": 385.368, "eval_compression-pairs_steps_per_second": 13.488, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_sciq_pairs_loss": 5.063950538635254, "eval_sciq_pairs_runtime": 9.7486, "eval_sciq_pairs_samples_per_second": 20.516, "eval_sciq_pairs_steps_per_second": 0.718, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_qasc_pairs_loss": 6.048434257507324, "eval_qasc_pairs_runtime": 2.9175, "eval_qasc_pairs_samples_per_second": 68.551, "eval_qasc_pairs_steps_per_second": 2.399, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_openbookqa_pairs_loss": 3.0121548175811768, "eval_openbookqa_pairs_runtime": 0.6555, "eval_openbookqa_pairs_samples_per_second": 105.258, "eval_openbookqa_pairs_steps_per_second": 4.576, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_msmarco_pairs_loss": 1.5317801237106323, "eval_msmarco_pairs_runtime": 3.9921, "eval_msmarco_pairs_samples_per_second": 50.099, "eval_msmarco_pairs_steps_per_second": 1.753, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_nq_pairs_loss": 1.7686961889266968, "eval_nq_pairs_runtime": 8.7237, "eval_nq_pairs_samples_per_second": 22.926, "eval_nq_pairs_steps_per_second": 0.802, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_trivia_pairs_loss": 2.2582404613494873, "eval_trivia_pairs_runtime": 12.9183, "eval_trivia_pairs_samples_per_second": 15.482, "eval_trivia_pairs_steps_per_second": 0.542, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_quora_pairs_loss": 0.30566632747650146, "eval_quora_pairs_runtime": 1.5883, "eval_quora_pairs_samples_per_second": 125.918, "eval_quora_pairs_steps_per_second": 4.407, "step": 5262 }, { "epoch": 0.9005647783672771, "eval_gooaq_pairs_loss": 1.1452974081039429, "eval_gooaq_pairs_runtime": 2.6271, "eval_gooaq_pairs_samples_per_second": 76.128, "eval_gooaq_pairs_steps_per_second": 2.664, "step": 5262 }, 
{ "epoch": 0.900735923327058, "grad_norm": 17.04757308959961, "learning_rate": 2.9988590986879637e-05, "loss": 1.4081, "step": 5263 }, { "epoch": 0.900907068286839, "grad_norm": 30.77292823791504, "learning_rate": 2.9994295493439817e-05, "loss": 5.9519, "step": 5264 }, { "epoch": 0.9010782132466199, "grad_norm": 18.369760513305664, "learning_rate": 3e-05, "loss": 1.7582, "step": 5265 }, { "epoch": 0.9012493582064008, "grad_norm": 33.92514419555664, "learning_rate": 2.999999557498883e-05, "loss": 5.7165, "step": 5266 }, { "epoch": 0.9014205031661817, "grad_norm": 25.037368774414062, "learning_rate": 2.999998229995793e-05, "loss": 3.4615, "step": 5267 }, { "epoch": 0.9015916481259627, "grad_norm": 1.5256587266921997, "learning_rate": 2.9999960174915127e-05, "loss": 0.2069, "step": 5268 }, { "epoch": 0.9017627930857436, "grad_norm": 16.602886199951172, "learning_rate": 2.999992919987348e-05, "loss": 1.3721, "step": 5269 }, { "epoch": 0.9019339380455246, "grad_norm": 16.512067794799805, "learning_rate": 2.9999889374851267e-05, "loss": 1.4327, "step": 5270 }, { "epoch": 0.9021050830053055, "grad_norm": 36.09376525878906, "learning_rate": 2.999984069987198e-05, "loss": 5.6646, "step": 5271 }, { "epoch": 0.9022762279650864, "grad_norm": 18.73316192626953, "learning_rate": 2.9999783174964336e-05, "loss": 1.9157, "step": 5272 }, { "epoch": 0.9024473729248673, "grad_norm": 24.654346466064453, "learning_rate": 2.9999716800162275e-05, "loss": 2.7708, "step": 5273 }, { "epoch": 0.9026185178846483, "grad_norm": 7.934967517852783, "learning_rate": 2.9999641575504964e-05, "loss": 0.9629, "step": 5274 }, { "epoch": 0.9027896628444292, "grad_norm": 13.806313514709473, "learning_rate": 2.9999557501036782e-05, "loss": 1.181, "step": 5275 }, { "epoch": 0.9029608078042102, "grad_norm": 21.018030166625977, "learning_rate": 2.999946457680733e-05, "loss": 2.4935, "step": 5276 }, { "epoch": 0.903131952763991, "grad_norm": 27.857648849487305, "learning_rate": 2.999936280287144e-05, "loss": 
3.6943, "step": 5277 }, { "epoch": 0.903303097723772, "grad_norm": 16.297943115234375, "learning_rate": 2.9999252179289158e-05, "loss": 1.3468, "step": 5278 }, { "epoch": 0.9034742426835529, "grad_norm": 24.035676956176758, "learning_rate": 2.9999132706125743e-05, "loss": 2.1091, "step": 5279 }, { "epoch": 0.9036453876433339, "grad_norm": 26.095598220825195, "learning_rate": 2.9999004383451696e-05, "loss": 5.4428, "step": 5280 }, { "epoch": 0.9038165326031148, "grad_norm": 8.601061820983887, "learning_rate": 2.9998867211342718e-05, "loss": 0.7404, "step": 5281 }, { "epoch": 0.9039876775628958, "grad_norm": 95.90294647216797, "learning_rate": 2.999872118987975e-05, "loss": 7.0901, "step": 5282 }, { "epoch": 0.9041588225226767, "grad_norm": 20.32706069946289, "learning_rate": 2.9998566319148938e-05, "loss": 2.4138, "step": 5283 }, { "epoch": 0.9043299674824576, "grad_norm": 16.315454483032227, "learning_rate": 2.9998402599241654e-05, "loss": 1.3743, "step": 5284 }, { "epoch": 0.9045011124422386, "grad_norm": 28.84542465209961, "learning_rate": 2.99982300302545e-05, "loss": 5.7125, "step": 5285 }, { "epoch": 0.9046722574020195, "grad_norm": 17.301790237426758, "learning_rate": 2.9998048612289287e-05, "loss": 1.1816, "step": 5286 }, { "epoch": 0.9048434023618005, "grad_norm": 21.75240135192871, "learning_rate": 2.9997858345453058e-05, "loss": 1.8483, "step": 5287 }, { "epoch": 0.9050145473215814, "grad_norm": 12.841358184814453, "learning_rate": 2.9997659229858064e-05, "loss": 1.3388, "step": 5288 }, { "epoch": 0.9051856922813624, "grad_norm": 20.186582565307617, "learning_rate": 2.999745126562179e-05, "loss": 1.8119, "step": 5289 }, { "epoch": 0.9053568372411432, "grad_norm": 18.223058700561523, "learning_rate": 2.9997234452866925e-05, "loss": 1.2181, "step": 5290 }, { "epoch": 0.9055279822009242, "grad_norm": 8.343592643737793, "learning_rate": 2.9997008791721397e-05, "loss": 0.4821, "step": 5291 }, { "epoch": 0.9056991271607051, "grad_norm": 23.329736709594727, 
"learning_rate": 2.9996774282318344e-05, "loss": 1.6308, "step": 5292 }, { "epoch": 0.9058702721204861, "grad_norm": 18.162965774536133, "learning_rate": 2.9996530924796127e-05, "loss": 1.6218, "step": 5293 }, { "epoch": 0.906041417080267, "grad_norm": 12.122421264648438, "learning_rate": 2.999627871929833e-05, "loss": 1.0583, "step": 5294 }, { "epoch": 0.906212562040048, "grad_norm": 84.63624572753906, "learning_rate": 2.999601766597375e-05, "loss": 7.7833, "step": 5295 }, { "epoch": 0.9063837069998288, "grad_norm": 8.922574043273926, "learning_rate": 2.9995747764976414e-05, "loss": 0.567, "step": 5296 }, { "epoch": 0.9065548519596098, "grad_norm": 50.66462707519531, "learning_rate": 2.999546901646556e-05, "loss": 2.4272, "step": 5297 }, { "epoch": 0.9067259969193907, "grad_norm": 25.654756546020508, "learning_rate": 2.9995181420605653e-05, "loss": 3.151, "step": 5298 }, { "epoch": 0.9068971418791717, "grad_norm": 20.01262664794922, "learning_rate": 2.9994884977566372e-05, "loss": 2.04, "step": 5299 }, { "epoch": 0.9070682868389526, "grad_norm": 20.105493545532227, "learning_rate": 2.9994579687522615e-05, "loss": 2.3112, "step": 5300 }, { "epoch": 0.9072394317987336, "grad_norm": 18.445158004760742, "learning_rate": 2.9994265550654512e-05, "loss": 1.6806, "step": 5301 }, { "epoch": 0.9074105767585144, "grad_norm": 18.450517654418945, "learning_rate": 2.9993942567147402e-05, "loss": 1.6844, "step": 5302 }, { "epoch": 0.9075817217182954, "grad_norm": 1.4754540920257568, "learning_rate": 2.999361073719184e-05, "loss": 0.2101, "step": 5303 }, { "epoch": 0.9077528666780763, "grad_norm": 1.742583990097046, "learning_rate": 2.999327006098362e-05, "loss": 0.2184, "step": 5304 }, { "epoch": 0.9079240116378573, "grad_norm": 55.38993835449219, "learning_rate": 2.9992920538723722e-05, "loss": 7.8441, "step": 5305 }, { "epoch": 0.9080951565976382, "grad_norm": 18.097368240356445, "learning_rate": 2.999256217061838e-05, "loss": 2.0351, "step": 5306 }, { "epoch": 
0.9082663015574192, "grad_norm": 18.082368850708008, "learning_rate": 2.9992194956879027e-05, "loss": 1.5539, "step": 5307 }, { "epoch": 0.9084374465172, "grad_norm": 25.358264923095703, "learning_rate": 2.9991818897722315e-05, "loss": 2.4061, "step": 5308 }, { "epoch": 0.908608591476981, "grad_norm": 25.521377563476562, "learning_rate": 2.9991433993370126e-05, "loss": 2.4613, "step": 5309 }, { "epoch": 0.9087797364367619, "grad_norm": 18.244693756103516, "learning_rate": 2.9991040244049556e-05, "loss": 1.9235, "step": 5310 }, { "epoch": 0.9089508813965429, "grad_norm": 1.618678331375122, "learning_rate": 2.999063764999291e-05, "loss": 0.2182, "step": 5311 }, { "epoch": 0.9091220263563238, "grad_norm": 15.342190742492676, "learning_rate": 2.9990226211437717e-05, "loss": 1.4192, "step": 5312 }, { "epoch": 0.9092931713161048, "grad_norm": 19.77103614807129, "learning_rate": 2.9989805928626736e-05, "loss": 1.673, "step": 5313 }, { "epoch": 0.9094643162758856, "grad_norm": 22.868837356567383, "learning_rate": 2.9989376801807933e-05, "loss": 2.5452, "step": 5314 }, { "epoch": 0.9096354612356666, "grad_norm": 106.19268798828125, "learning_rate": 2.998893883123449e-05, "loss": 8.6557, "step": 5315 }, { "epoch": 0.9098066061954475, "grad_norm": 17.946029663085938, "learning_rate": 2.9988492017164812e-05, "loss": 1.928, "step": 5316 }, { "epoch": 0.9099777511552285, "grad_norm": 13.810042381286621, "learning_rate": 2.9988036359862517e-05, "loss": 1.3295, "step": 5317 }, { "epoch": 0.9101488961150094, "grad_norm": 13.559564590454102, "learning_rate": 2.9987571859596446e-05, "loss": 0.9588, "step": 5318 }, { "epoch": 0.9103200410747904, "grad_norm": 1.3217213153839111, "learning_rate": 2.9987098516640656e-05, "loss": 0.193, "step": 5319 }, { "epoch": 0.9104911860345712, "grad_norm": 17.758501052856445, "learning_rate": 2.9986616331274415e-05, "loss": 1.5314, "step": 5320 }, { "epoch": 0.9106623309943522, "grad_norm": 9.879485130310059, "learning_rate": 2.998612530378222e-05, 
"loss": 0.6681, "step": 5321 }, { "epoch": 0.9108334759541331, "grad_norm": 8.769416809082031, "learning_rate": 2.9985625434453774e-05, "loss": 0.6281, "step": 5322 }, { "epoch": 0.9110046209139141, "grad_norm": 20.583568572998047, "learning_rate": 2.9985116723584e-05, "loss": 2.0724, "step": 5323 }, { "epoch": 0.911175765873695, "grad_norm": 1.0460354089736938, "learning_rate": 2.998459917147304e-05, "loss": 0.1869, "step": 5324 }, { "epoch": 0.911346910833476, "grad_norm": 26.880903244018555, "learning_rate": 2.9984072778426246e-05, "loss": 2.9832, "step": 5325 }, { "epoch": 0.9115180557932568, "grad_norm": 5.727287292480469, "learning_rate": 2.99835375447542e-05, "loss": 0.5114, "step": 5326 }, { "epoch": 0.9116892007530378, "grad_norm": 76.71002960205078, "learning_rate": 2.9982993470772684e-05, "loss": 7.6678, "step": 5327 }, { "epoch": 0.9118603457128187, "grad_norm": 15.319390296936035, "learning_rate": 2.99824405568027e-05, "loss": 1.161, "step": 5328 }, { "epoch": 0.9120314906725997, "grad_norm": 16.707138061523438, "learning_rate": 2.9981878803170476e-05, "loss": 1.7732, "step": 5329 }, { "epoch": 0.9122026356323806, "grad_norm": 21.353525161743164, "learning_rate": 2.9981308210207444e-05, "loss": 1.9459, "step": 5330 }, { "epoch": 0.9123737805921616, "grad_norm": 26.289155960083008, "learning_rate": 2.998072877825025e-05, "loss": 2.7606, "step": 5331 }, { "epoch": 0.9125449255519424, "grad_norm": 1.3349337577819824, "learning_rate": 2.9980140507640764e-05, "loss": 0.1861, "step": 5332 }, { "epoch": 0.9127160705117234, "grad_norm": 15.343520164489746, "learning_rate": 2.9979543398726073e-05, "loss": 1.3217, "step": 5333 }, { "epoch": 0.9128872154715043, "grad_norm": 160.10142517089844, "learning_rate": 2.9978937451858457e-05, "loss": 9.5253, "step": 5334 }, { "epoch": 0.9130583604312853, "grad_norm": 19.802030563354492, "learning_rate": 2.997832266739544e-05, "loss": 1.7929, "step": 5335 }, { "epoch": 0.9132295053910663, "grad_norm": 0.8016830086708069, 
"learning_rate": 2.9977699045699735e-05, "loss": 0.1808, "step": 5336 }, { "epoch": 0.9134006503508472, "grad_norm": 12.472278594970703, "learning_rate": 2.9977066587139287e-05, "loss": 0.8948, "step": 5337 }, { "epoch": 0.9135717953106282, "grad_norm": 16.328351974487305, "learning_rate": 2.9976425292087245e-05, "loss": 1.5154, "step": 5338 }, { "epoch": 0.913742940270409, "grad_norm": 15.434953689575195, "learning_rate": 2.9975775160921972e-05, "loss": 1.3687, "step": 5339 }, { "epoch": 0.91391408523019, "grad_norm": 18.20110321044922, "learning_rate": 2.9975116194027046e-05, "loss": 2.0817, "step": 5340 }, { "epoch": 0.9140852301899709, "grad_norm": 10.015130043029785, "learning_rate": 2.9974448391791268e-05, "loss": 0.6482, "step": 5341 }, { "epoch": 0.9142563751497519, "grad_norm": 11.071905136108398, "learning_rate": 2.997377175460863e-05, "loss": 0.5993, "step": 5342 }, { "epoch": 0.9144275201095328, "grad_norm": 15.070958137512207, "learning_rate": 2.9973086282878353e-05, "loss": 1.5188, "step": 5343 }, { "epoch": 0.9145986650693138, "grad_norm": 6.0472917556762695, "learning_rate": 2.9972391977004867e-05, "loss": 0.4436, "step": 5344 }, { "epoch": 0.9147698100290946, "grad_norm": 21.765090942382812, "learning_rate": 2.9971688837397816e-05, "loss": 1.858, "step": 5345 }, { "epoch": 0.9149409549888756, "grad_norm": 23.28327178955078, "learning_rate": 2.997097686447205e-05, "loss": 2.1091, "step": 5346 }, { "epoch": 0.9151120999486565, "grad_norm": 11.79105281829834, "learning_rate": 2.9970256058647636e-05, "loss": 0.8249, "step": 5347 }, { "epoch": 0.9152832449084375, "grad_norm": 41.14191436767578, "learning_rate": 2.996952642034985e-05, "loss": 6.36, "step": 5348 }, { "epoch": 0.9154543898682184, "grad_norm": 20.3240966796875, "learning_rate": 2.996878795000918e-05, "loss": 1.5731, "step": 5349 }, { "epoch": 0.9156255348279994, "grad_norm": 20.114635467529297, "learning_rate": 2.9968040648061324e-05, "loss": 1.6892, "step": 5350 }, { "epoch": 
0.9157966797877802, "grad_norm": 26.455171585083008, "learning_rate": 2.9967284514947192e-05, "loss": 2.6576, "step": 5351 }, { "epoch": 0.9159678247475612, "grad_norm": 20.05623435974121, "learning_rate": 2.9966519551112904e-05, "loss": 1.7239, "step": 5352 }, { "epoch": 0.9161389697073421, "grad_norm": 16.712665557861328, "learning_rate": 2.9965745757009784e-05, "loss": 1.3935, "step": 5353 }, { "epoch": 0.9163101146671231, "grad_norm": 24.99353790283203, "learning_rate": 2.996496313309438e-05, "loss": 2.6955, "step": 5354 }, { "epoch": 0.916481259626904, "grad_norm": 28.002347946166992, "learning_rate": 2.9964171679828438e-05, "loss": 2.7177, "step": 5355 }, { "epoch": 0.916652404586685, "grad_norm": 26.17693328857422, "learning_rate": 2.9963371397678915e-05, "loss": 2.1296, "step": 5356 }, { "epoch": 0.9168235495464658, "grad_norm": 9.675960540771484, "learning_rate": 2.9962562287117978e-05, "loss": 0.6918, "step": 5357 }, { "epoch": 0.9169946945062468, "grad_norm": 7.958675861358643, "learning_rate": 2.996174434862301e-05, "loss": 0.5486, "step": 5358 }, { "epoch": 0.9171658394660277, "grad_norm": 26.32320785522461, "learning_rate": 2.9960917582676586e-05, "loss": 3.7321, "step": 5359 }, { "epoch": 0.9173369844258087, "grad_norm": 24.980010986328125, "learning_rate": 2.996008198976651e-05, "loss": 2.9858, "step": 5360 }, { "epoch": 0.9175081293855896, "grad_norm": 21.72593879699707, "learning_rate": 2.9959237570385777e-05, "loss": 1.9385, "step": 5361 }, { "epoch": 0.9176792743453706, "grad_norm": 4.216024398803711, "learning_rate": 2.995838432503259e-05, "loss": 0.4022, "step": 5362 }, { "epoch": 0.9178504193051514, "grad_norm": 7.37491512298584, "learning_rate": 2.9957522254210375e-05, "loss": 0.5214, "step": 5363 }, { "epoch": 0.9180215642649324, "grad_norm": 18.15989875793457, "learning_rate": 2.9956651358427758e-05, "loss": 1.3646, "step": 5364 }, { "epoch": 0.9181927092247133, "grad_norm": 1.2965679168701172, "learning_rate": 2.9955771638198556e-05, 
"loss": 0.2127, "step": 5365 }, { "epoch": 0.9183638541844943, "grad_norm": 17.309446334838867, "learning_rate": 2.9954883094041813e-05, "loss": 1.3875, "step": 5366 }, { "epoch": 0.9185349991442752, "grad_norm": 31.739850997924805, "learning_rate": 2.9953985726481765e-05, "loss": 5.3984, "step": 5367 }, { "epoch": 0.9187061441040562, "grad_norm": 1.0152965784072876, "learning_rate": 2.995307953604787e-05, "loss": 0.2038, "step": 5368 }, { "epoch": 0.918877289063837, "grad_norm": 26.772960662841797, "learning_rate": 2.9952164523274775e-05, "loss": 1.5391, "step": 5369 }, { "epoch": 0.919048434023618, "grad_norm": 22.822797775268555, "learning_rate": 2.9951240688702342e-05, "loss": 2.2907, "step": 5370 }, { "epoch": 0.9192195789833989, "grad_norm": 25.479511260986328, "learning_rate": 2.9950308032875634e-05, "loss": 3.1916, "step": 5371 }, { "epoch": 0.9193907239431799, "grad_norm": 17.20477294921875, "learning_rate": 2.9949366556344916e-05, "loss": 1.8385, "step": 5372 }, { "epoch": 0.9195618689029608, "grad_norm": 71.76097869873047, "learning_rate": 2.9948416259665665e-05, "loss": 7.5777, "step": 5373 }, { "epoch": 0.9197330138627418, "grad_norm": 24.833290100097656, "learning_rate": 2.9947457143398554e-05, "loss": 2.549, "step": 5374 }, { "epoch": 0.9199041588225226, "grad_norm": 7.05638313293457, "learning_rate": 2.9946489208109468e-05, "loss": 0.6932, "step": 5375 }, { "epoch": 0.9200753037823036, "grad_norm": 20.02423858642578, "learning_rate": 2.9945512454369485e-05, "loss": 2.2629, "step": 5376 }, { "epoch": 0.9202464487420845, "grad_norm": 5.115697860717773, "learning_rate": 2.9944526882754894e-05, "loss": 0.4451, "step": 5377 }, { "epoch": 0.9204175937018655, "grad_norm": 21.760183334350586, "learning_rate": 2.994353249384718e-05, "loss": 1.7632, "step": 5378 }, { "epoch": 0.9205887386616464, "grad_norm": 19.334026336669922, "learning_rate": 2.994252928823304e-05, "loss": 1.6726, "step": 5379 }, { "epoch": 0.9207598836214274, "grad_norm": 
8.374558448791504, "learning_rate": 2.9941517266504363e-05, "loss": 0.7515, "step": 5380 }, { "epoch": 0.9209310285812082, "grad_norm": 17.895437240600586, "learning_rate": 2.994049642925824e-05, "loss": 1.7334, "step": 5381 }, { "epoch": 0.9211021735409892, "grad_norm": 22.024200439453125, "learning_rate": 2.9939466777096975e-05, "loss": 2.2925, "step": 5382 }, { "epoch": 0.9212733185007701, "grad_norm": 13.27621841430664, "learning_rate": 2.9938428310628057e-05, "loss": 1.1169, "step": 5383 }, { "epoch": 0.9214444634605511, "grad_norm": 16.63943099975586, "learning_rate": 2.993738103046419e-05, "loss": 1.4458, "step": 5384 }, { "epoch": 0.921615608420332, "grad_norm": 14.45860767364502, "learning_rate": 2.9936324937223263e-05, "loss": 1.2044, "step": 5385 }, { "epoch": 0.921786753380113, "grad_norm": 23.660669326782227, "learning_rate": 2.9935260031528377e-05, "loss": 2.323, "step": 5386 }, { "epoch": 0.9219578983398939, "grad_norm": 5.464766025543213, "learning_rate": 2.993418631400783e-05, "loss": 0.5573, "step": 5387 }, { "epoch": 0.9221290432996748, "grad_norm": 4.731202125549316, "learning_rate": 2.993310378529511e-05, "loss": 0.4096, "step": 5388 }, { "epoch": 0.9223001882594558, "grad_norm": 0.8416853547096252, "learning_rate": 2.9932012446028916e-05, "loss": 0.1837, "step": 5389 }, { "epoch": 0.9224713332192367, "grad_norm": 84.00711822509766, "learning_rate": 2.9930912296853136e-05, "loss": 9.7173, "step": 5390 }, { "epoch": 0.9226424781790177, "grad_norm": 19.461177825927734, "learning_rate": 2.9929803338416863e-05, "loss": 1.7145, "step": 5391 }, { "epoch": 0.9228136231387986, "grad_norm": 18.703411102294922, "learning_rate": 2.992868557137438e-05, "loss": 2.2235, "step": 5392 }, { "epoch": 0.9229847680985795, "grad_norm": 29.23065185546875, "learning_rate": 2.9927558996385178e-05, "loss": 5.9163, "step": 5393 }, { "epoch": 0.9231559130583604, "grad_norm": 3.1618072986602783, "learning_rate": 2.9926423614113935e-05, "loss": 0.3965, "step": 5394 }, { 
"epoch": 0.9233270580181414, "grad_norm": 22.468812942504883, "learning_rate": 2.9925279425230525e-05, "loss": 2.0604, "step": 5395 }, { "epoch": 0.9234982029779223, "grad_norm": 18.728565216064453, "learning_rate": 2.9924126430410022e-05, "loss": 1.6515, "step": 5396 }, { "epoch": 0.9236693479377033, "grad_norm": 24.218313217163086, "learning_rate": 2.99229646303327e-05, "loss": 3.4235, "step": 5397 }, { "epoch": 0.9238404928974842, "grad_norm": 22.003061294555664, "learning_rate": 2.992179402568402e-05, "loss": 2.5075, "step": 5398 }, { "epoch": 0.9240116378572651, "grad_norm": 22.205307006835938, "learning_rate": 2.9920614617154634e-05, "loss": 2.9202, "step": 5399 }, { "epoch": 0.924182782817046, "grad_norm": 8.464454650878906, "learning_rate": 2.9919426405440406e-05, "loss": 0.5435, "step": 5400 }, { "epoch": 0.924353927776827, "grad_norm": 21.94829559326172, "learning_rate": 2.991822939124237e-05, "loss": 2.3394, "step": 5401 }, { "epoch": 0.9245250727366079, "grad_norm": 17.917320251464844, "learning_rate": 2.991702357526678e-05, "loss": 1.4916, "step": 5402 }, { "epoch": 0.9246962176963889, "grad_norm": 7.415956497192383, "learning_rate": 2.9915808958225057e-05, "loss": 0.5225, "step": 5403 }, { "epoch": 0.9248673626561698, "grad_norm": 25.18984031677246, "learning_rate": 2.9914585540833836e-05, "loss": 1.8023, "step": 5404 }, { "epoch": 0.9250385076159507, "grad_norm": 17.561384201049805, "learning_rate": 2.9913353323814928e-05, "loss": 1.46, "step": 5405 }, { "epoch": 0.9252096525757316, "grad_norm": 4.7073588371276855, "learning_rate": 2.9912112307895352e-05, "loss": 0.3232, "step": 5406 }, { "epoch": 0.9253807975355126, "grad_norm": 19.66071319580078, "learning_rate": 2.9910862493807297e-05, "loss": 2.3367, "step": 5407 }, { "epoch": 0.9255519424952935, "grad_norm": 6.795912742614746, "learning_rate": 2.9909603882288167e-05, "loss": 0.5667, "step": 5408 }, { "epoch": 0.9257230874550745, "grad_norm": 21.672094345092773, "learning_rate": 
2.9908336474080534e-05, "loss": 1.8804, "step": 5409 }, { "epoch": 0.9258942324148554, "grad_norm": 14.29305648803711, "learning_rate": 2.9907060269932176e-05, "loss": 1.4349, "step": 5410 }, { "epoch": 0.9260653773746363, "grad_norm": 5.773280143737793, "learning_rate": 2.9905775270596058e-05, "loss": 0.36, "step": 5411 }, { "epoch": 0.9262365223344172, "grad_norm": 8.370780944824219, "learning_rate": 2.990448147683033e-05, "loss": 0.5075, "step": 5412 }, { "epoch": 0.9264076672941982, "grad_norm": 18.59760093688965, "learning_rate": 2.9903178889398325e-05, "loss": 1.7468, "step": 5413 }, { "epoch": 0.9265788122539791, "grad_norm": 10.879481315612793, "learning_rate": 2.9901867509068582e-05, "loss": 0.5504, "step": 5414 }, { "epoch": 0.9267499572137601, "grad_norm": 95.23054504394531, "learning_rate": 2.9900547336614815e-05, "loss": 2.7427, "step": 5415 }, { "epoch": 0.926921102173541, "grad_norm": 24.241077423095703, "learning_rate": 2.9899218372815923e-05, "loss": 2.9172, "step": 5416 }, { "epoch": 0.927092247133322, "grad_norm": 19.38135528564453, "learning_rate": 2.9897880618456e-05, "loss": 1.7318, "step": 5417 }, { "epoch": 0.9272633920931028, "grad_norm": 19.093875885009766, "learning_rate": 2.989653407432432e-05, "loss": 1.5689, "step": 5418 }, { "epoch": 0.9274345370528838, "grad_norm": 15.81525707244873, "learning_rate": 2.989517874121535e-05, "loss": 1.4285, "step": 5419 }, { "epoch": 0.9276056820126647, "grad_norm": 6.848993301391602, "learning_rate": 2.9893814619928737e-05, "loss": 0.5049, "step": 5420 }, { "epoch": 0.9277768269724457, "grad_norm": 45.37533187866211, "learning_rate": 2.9892441711269315e-05, "loss": 2.325, "step": 5421 }, { "epoch": 0.9279479719322266, "grad_norm": 15.195178031921387, "learning_rate": 2.9891060016047097e-05, "loss": 1.2322, "step": 5422 }, { "epoch": 0.9281191168920075, "grad_norm": 9.455146789550781, "learning_rate": 2.9889669535077297e-05, "loss": 1.2928, "step": 5423 }, { "epoch": 0.9282902618517884, "grad_norm": 
25.632465362548828, "learning_rate": 2.988827026918029e-05, "loss": 2.4087, "step": 5424 }, { "epoch": 0.9284614068115694, "grad_norm": 16.109840393066406, "learning_rate": 2.988686221918165e-05, "loss": 1.2142, "step": 5425 }, { "epoch": 0.9286325517713503, "grad_norm": 20.72352409362793, "learning_rate": 2.9885445385912124e-05, "loss": 1.6889, "step": 5426 }, { "epoch": 0.9288036967311313, "grad_norm": 7.68108606338501, "learning_rate": 2.9884019770207654e-05, "loss": 0.7547, "step": 5427 }, { "epoch": 0.9289748416909122, "grad_norm": 19.388324737548828, "learning_rate": 2.9882585372909345e-05, "loss": 1.7884, "step": 5428 }, { "epoch": 0.9291459866506931, "grad_norm": 20.230783462524414, "learning_rate": 2.9881142194863503e-05, "loss": 2.2057, "step": 5429 }, { "epoch": 0.929317131610474, "grad_norm": 1.923845648765564, "learning_rate": 2.9879690236921604e-05, "loss": 0.2006, "step": 5430 }, { "epoch": 0.929488276570255, "grad_norm": 15.715072631835938, "learning_rate": 2.98782294999403e-05, "loss": 1.6502, "step": 5431 }, { "epoch": 0.9296594215300359, "grad_norm": 135.404296875, "learning_rate": 2.987675998478143e-05, "loss": 9.1465, "step": 5432 }, { "epoch": 0.9298305664898169, "grad_norm": 3.6420836448669434, "learning_rate": 2.9875281692312005e-05, "loss": 0.3347, "step": 5433 }, { "epoch": 0.9300017114495978, "grad_norm": 23.983654022216797, "learning_rate": 2.987379462340423e-05, "loss": 2.8753, "step": 5434 }, { "epoch": 0.9301728564093787, "grad_norm": 13.618528366088867, "learning_rate": 2.9872298778935472e-05, "loss": 1.1736, "step": 5435 }, { "epoch": 0.9303440013691596, "grad_norm": 31.96976661682129, "learning_rate": 2.9870794159788285e-05, "loss": 3.0737, "step": 5436 }, { "epoch": 0.9305151463289406, "grad_norm": 0.8422608971595764, "learning_rate": 2.9869280766850397e-05, "loss": 0.1952, "step": 5437 }, { "epoch": 0.9306862912887216, "grad_norm": 12.685113906860352, "learning_rate": 2.986775860101471e-05, "loss": 0.9543, "step": 5438 }, { 
"epoch": 0.9308574362485025, "grad_norm": 17.57579231262207, "learning_rate": 2.9866227663179295e-05, "loss": 1.5615, "step": 5439 }, { "epoch": 0.9310285812082835, "grad_norm": 166.62765502929688, "learning_rate": 2.986468795424742e-05, "loss": 8.5544, "step": 5440 }, { "epoch": 0.9311997261680643, "grad_norm": 0.7508867383003235, "learning_rate": 2.9863139475127515e-05, "loss": 0.1882, "step": 5441 }, { "epoch": 0.9313708711278453, "grad_norm": 16.12635612487793, "learning_rate": 2.9861582226733176e-05, "loss": 1.2267, "step": 5442 }, { "epoch": 0.9315420160876262, "grad_norm": 19.12047576904297, "learning_rate": 2.986001620998319e-05, "loss": 2.0048, "step": 5443 }, { "epoch": 0.9317131610474072, "grad_norm": 27.335081100463867, "learning_rate": 2.985844142580151e-05, "loss": 1.6885, "step": 5444 }, { "epoch": 0.9318843060071881, "grad_norm": 7.98264217376709, "learning_rate": 2.9856857875117254e-05, "loss": 0.6538, "step": 5445 }, { "epoch": 0.9320554509669691, "grad_norm": 29.969785690307617, "learning_rate": 2.985526555886472e-05, "loss": 5.6321, "step": 5446 }, { "epoch": 0.93222659592675, "grad_norm": 12.692312240600586, "learning_rate": 2.9853664477983386e-05, "loss": 1.3322, "step": 5447 }, { "epoch": 0.9323977408865309, "grad_norm": 0.7580317258834839, "learning_rate": 2.9852054633417885e-05, "loss": 0.1724, "step": 5448 }, { "epoch": 0.9325688858463118, "grad_norm": 12.668038368225098, "learning_rate": 2.9850436026118027e-05, "loss": 1.2257, "step": 5449 }, { "epoch": 0.9327400308060928, "grad_norm": 23.302888870239258, "learning_rate": 2.9848808657038795e-05, "loss": 1.729, "step": 5450 }, { "epoch": 0.9329111757658737, "grad_norm": 0.7295454740524292, "learning_rate": 2.9847172527140338e-05, "loss": 0.1795, "step": 5451 }, { "epoch": 0.9330823207256547, "grad_norm": 14.082499504089355, "learning_rate": 2.9845527637387974e-05, "loss": 2.0225, "step": 5452 }, { "epoch": 0.9332534656854355, "grad_norm": 17.303613662719727, "learning_rate": 
2.9843873988752195e-05, "loss": 1.432, "step": 5453 }, { "epoch": 0.9334246106452165, "grad_norm": 14.602163314819336, "learning_rate": 2.9842211582208652e-05, "loss": 1.3601, "step": 5454 }, { "epoch": 0.9335957556049974, "grad_norm": 17.85161018371582, "learning_rate": 2.984054041873817e-05, "loss": 1.72, "step": 5455 }, { "epoch": 0.9337669005647784, "grad_norm": 19.920385360717773, "learning_rate": 2.983886049932674e-05, "loss": 2.7301, "step": 5456 }, { "epoch": 0.9339380455245593, "grad_norm": 24.145822525024414, "learning_rate": 2.9837171824965506e-05, "loss": 3.4391, "step": 5457 }, { "epoch": 0.9341091904843403, "grad_norm": 8.437520027160645, "learning_rate": 2.9835474396650802e-05, "loss": 0.5739, "step": 5458 }, { "epoch": 0.9342803354441211, "grad_norm": 25.245956420898438, "learning_rate": 2.98337682153841e-05, "loss": 2.4175, "step": 5459 }, { "epoch": 0.9344514804039021, "grad_norm": 0.8599206805229187, "learning_rate": 2.9832053282172065e-05, "loss": 0.1786, "step": 5460 }, { "epoch": 0.934622625363683, "grad_norm": 19.534420013427734, "learning_rate": 2.9830329598026498e-05, "loss": 1.4205, "step": 5461 }, { "epoch": 0.934793770323464, "grad_norm": 22.197025299072266, "learning_rate": 2.982859716396438e-05, "loss": 2.7027, "step": 5462 }, { "epoch": 0.9349649152832449, "grad_norm": 24.305126190185547, "learning_rate": 2.9826855981007845e-05, "loss": 2.2722, "step": 5463 }, { "epoch": 0.9351360602430259, "grad_norm": 25.295196533203125, "learning_rate": 2.98251060501842e-05, "loss": 3.0028, "step": 5464 }, { "epoch": 0.9353072052028067, "grad_norm": 9.921125411987305, "learning_rate": 2.9823347372525905e-05, "loss": 0.9076, "step": 5465 }, { "epoch": 0.9354783501625877, "grad_norm": 92.25393676757812, "learning_rate": 2.9821579949070577e-05, "loss": 7.8437, "step": 5466 }, { "epoch": 0.9356494951223686, "grad_norm": 22.936864852905273, "learning_rate": 2.9819803780861006e-05, "loss": 2.5465, "step": 5467 }, { "epoch": 0.9358206400821496, 
"grad_norm": 26.565526962280273, "learning_rate": 2.9818018868945135e-05, "loss": 2.7099, "step": 5468 }, { "epoch": 0.9359917850419305, "grad_norm": 21.221967697143555, "learning_rate": 2.9816225214376052e-05, "loss": 1.9547, "step": 5469 }, { "epoch": 0.9361629300017115, "grad_norm": 15.053738594055176, "learning_rate": 2.9814422818212032e-05, "loss": 1.1589, "step": 5470 }, { "epoch": 0.9363340749614923, "grad_norm": 18.163192749023438, "learning_rate": 2.981261168151648e-05, "loss": 1.5918, "step": 5471 }, { "epoch": 0.9365052199212733, "grad_norm": 18.153732299804688, "learning_rate": 2.9810791805357972e-05, "loss": 1.503, "step": 5472 }, { "epoch": 0.9366763648810542, "grad_norm": 0.5720072984695435, "learning_rate": 2.980896319081024e-05, "loss": 0.1676, "step": 5473 }, { "epoch": 0.9368475098408352, "grad_norm": 36.6143798828125, "learning_rate": 2.9807125838952168e-05, "loss": 5.8291, "step": 5474 }, { "epoch": 0.9370186548006161, "grad_norm": 134.6314697265625, "learning_rate": 2.9805279750867796e-05, "loss": 8.5043, "step": 5475 }, { "epoch": 0.9371897997603971, "grad_norm": 17.09921646118164, "learning_rate": 2.980342492764632e-05, "loss": 1.6199, "step": 5476 }, { "epoch": 0.937360944720178, "grad_norm": 16.151418685913086, "learning_rate": 2.980156137038209e-05, "loss": 1.4363, "step": 5477 }, { "epoch": 0.9375320896799589, "grad_norm": 29.319974899291992, "learning_rate": 2.97996890801746e-05, "loss": 4.7161, "step": 5478 }, { "epoch": 0.9377032346397398, "grad_norm": 21.868122100830078, "learning_rate": 2.9797808058128513e-05, "loss": 2.1905, "step": 5479 }, { "epoch": 0.9378743795995208, "grad_norm": 19.497089385986328, "learning_rate": 2.979591830535363e-05, "loss": 2.518, "step": 5480 }, { "epoch": 0.9380455245593017, "grad_norm": 9.813490867614746, "learning_rate": 2.9794019822964908e-05, "loss": 0.8437, "step": 5481 }, { "epoch": 0.9382166695190827, "grad_norm": 15.766708374023438, "learning_rate": 2.9792112612082455e-05, "loss": 1.1947, 
"step": 5482 }, { "epoch": 0.9383878144788635, "grad_norm": 21.335594177246094, "learning_rate": 2.9790196673831532e-05, "loss": 1.9788, "step": 5483 }, { "epoch": 0.9385589594386445, "grad_norm": 20.775419235229492, "learning_rate": 2.9788272009342537e-05, "loss": 1.4183, "step": 5484 }, { "epoch": 0.9387301043984254, "grad_norm": 18.061182022094727, "learning_rate": 2.9786338619751033e-05, "loss": 1.7959, "step": 5485 }, { "epoch": 0.9389012493582064, "grad_norm": 18.276731491088867, "learning_rate": 2.978439650619772e-05, "loss": 1.7148, "step": 5486 }, { "epoch": 0.9390723943179873, "grad_norm": 1.8720197677612305, "learning_rate": 2.9782445669828445e-05, "loss": 0.2043, "step": 5487 }, { "epoch": 0.9392435392777683, "grad_norm": 25.51718521118164, "learning_rate": 2.978048611179421e-05, "loss": 2.6434, "step": 5488 }, { "epoch": 0.9394146842375493, "grad_norm": 15.08374309539795, "learning_rate": 2.977851783325115e-05, "loss": 1.2173, "step": 5489 }, { "epoch": 0.9395858291973301, "grad_norm": 13.691933631896973, "learning_rate": 2.977654083536056e-05, "loss": 1.22, "step": 5490 }, { "epoch": 0.9397569741571111, "grad_norm": 16.964563369750977, "learning_rate": 2.9774555119288868e-05, "loss": 1.5784, "step": 5491 }, { "epoch": 0.939928119116892, "grad_norm": 18.708364486694336, "learning_rate": 2.977256068620765e-05, "loss": 1.9451, "step": 5492 }, { "epoch": 0.940099264076673, "grad_norm": 81.27881622314453, "learning_rate": 2.9770557537293624e-05, "loss": 6.9958, "step": 5493 }, { "epoch": 0.9402704090364539, "grad_norm": 0.9163722395896912, "learning_rate": 2.976854567372865e-05, "loss": 0.1888, "step": 5494 }, { "epoch": 0.9404415539962349, "grad_norm": 9.509047508239746, "learning_rate": 2.976652509669973e-05, "loss": 0.5597, "step": 5495 }, { "epoch": 0.9406126989560157, "grad_norm": 6.115006923675537, "learning_rate": 2.976449580739901e-05, "loss": 0.5571, "step": 5496 }, { "epoch": 0.9407838439157967, "grad_norm": 54.508541107177734, "learning_rate": 
2.976245780702377e-05, "loss": 6.6135, "step": 5497 }, { "epoch": 0.9409549888755776, "grad_norm": 5.758550643920898, "learning_rate": 2.9760411096776442e-05, "loss": 0.7069, "step": 5498 }, { "epoch": 0.9411261338353586, "grad_norm": 16.58424949645996, "learning_rate": 2.9758355677864574e-05, "loss": 1.4041, "step": 5499 }, { "epoch": 0.9412972787951395, "grad_norm": 13.171340942382812, "learning_rate": 2.975629155150088e-05, "loss": 0.9814, "step": 5500 }, { "epoch": 0.9414684237549205, "grad_norm": 44.11324691772461, "learning_rate": 2.975421871890319e-05, "loss": 1.8447, "step": 5501 }, { "epoch": 0.9416395687147013, "grad_norm": 13.571734428405762, "learning_rate": 2.9752137181294477e-05, "loss": 1.5262, "step": 5502 }, { "epoch": 0.9418107136744823, "grad_norm": 23.19210433959961, "learning_rate": 2.975004693990286e-05, "loss": 2.9885, "step": 5503 }, { "epoch": 0.9419818586342632, "grad_norm": 19.186378479003906, "learning_rate": 2.9747947995961572e-05, "loss": 1.8989, "step": 5504 }, { "epoch": 0.9421530035940442, "grad_norm": 20.15719223022461, "learning_rate": 2.974584035070901e-05, "loss": 1.784, "step": 5505 }, { "epoch": 0.9423241485538251, "grad_norm": 22.521728515625, "learning_rate": 2.974372400538867e-05, "loss": 1.5145, "step": 5506 }, { "epoch": 0.9424952935136061, "grad_norm": 23.285200119018555, "learning_rate": 2.974159896124921e-05, "loss": 2.343, "step": 5507 }, { "epoch": 0.9426664384733869, "grad_norm": 17.868976593017578, "learning_rate": 2.97394652195444e-05, "loss": 1.7375, "step": 5508 }, { "epoch": 0.9428375834331679, "grad_norm": 5.771058559417725, "learning_rate": 2.973732278153316e-05, "loss": 0.3672, "step": 5509 }, { "epoch": 0.9430087283929488, "grad_norm": 20.794448852539062, "learning_rate": 2.9735171648479534e-05, "loss": 2.0266, "step": 5510 }, { "epoch": 0.9431798733527298, "grad_norm": 23.893619537353516, "learning_rate": 2.973301182165268e-05, "loss": 2.8642, "step": 5511 }, { "epoch": 0.9433510183125107, "grad_norm": 
26.520517349243164, "learning_rate": 2.973084330232691e-05, "loss": 2.3619, "step": 5512 }, { "epoch": 0.9435221632722917, "grad_norm": 0.8600165843963623, "learning_rate": 2.972866609178165e-05, "loss": 0.1799, "step": 5513 }, { "epoch": 0.9436933082320725, "grad_norm": 22.70649528503418, "learning_rate": 2.972648019130146e-05, "loss": 2.497, "step": 5514 }, { "epoch": 0.9438644531918535, "grad_norm": 18.863407135009766, "learning_rate": 2.972428560217602e-05, "loss": 2.2419, "step": 5515 }, { "epoch": 0.9440355981516344, "grad_norm": 21.091896057128906, "learning_rate": 2.9722082325700145e-05, "loss": 1.6153, "step": 5516 }, { "epoch": 0.9442067431114154, "grad_norm": 16.721572875976562, "learning_rate": 2.971987036317377e-05, "loss": 1.4408, "step": 5517 }, { "epoch": 0.9443778880711963, "grad_norm": 19.266984939575195, "learning_rate": 2.9717649715901956e-05, "loss": 1.6994, "step": 5518 }, { "epoch": 0.9445490330309773, "grad_norm": 105.64752197265625, "learning_rate": 2.971542038519489e-05, "loss": 7.7849, "step": 5519 }, { "epoch": 0.9447201779907581, "grad_norm": 17.627609252929688, "learning_rate": 2.9713182372367874e-05, "loss": 1.7657, "step": 5520 }, { "epoch": 0.9448913229505391, "grad_norm": 15.488409042358398, "learning_rate": 2.9710935678741347e-05, "loss": 1.5876, "step": 5521 }, { "epoch": 0.94506246791032, "grad_norm": 0.9112039804458618, "learning_rate": 2.9708680305640856e-05, "loss": 0.1914, "step": 5522 }, { "epoch": 0.945233612870101, "grad_norm": 8.6788911819458, "learning_rate": 2.9706416254397077e-05, "loss": 0.6403, "step": 5523 }, { "epoch": 0.9454047578298819, "grad_norm": 19.066268920898438, "learning_rate": 2.970414352634581e-05, "loss": 1.6665, "step": 5524 }, { "epoch": 0.9455759027896629, "grad_norm": 20.287647247314453, "learning_rate": 2.9701862122827953e-05, "loss": 1.3302, "step": 5525 }, { "epoch": 0.9457470477494437, "grad_norm": 18.065208435058594, "learning_rate": 2.969957204518955e-05, "loss": 1.5168, "step": 5526 }, { 
"epoch": 0.9459181927092247, "grad_norm": 20.927453994750977, "learning_rate": 2.969727329478174e-05, "loss": 1.8764, "step": 5527 }, { "epoch": 0.9460893376690056, "grad_norm": 13.100236892700195, "learning_rate": 2.96949658729608e-05, "loss": 1.3787, "step": 5528 }, { "epoch": 0.9462604826287866, "grad_norm": 3.962916851043701, "learning_rate": 2.969264978108811e-05, "loss": 0.3372, "step": 5529 }, { "epoch": 0.9464316275885675, "grad_norm": 24.814523696899414, "learning_rate": 2.969032502053016e-05, "loss": 3.0329, "step": 5530 }, { "epoch": 0.9466027725483485, "grad_norm": 26.95598030090332, "learning_rate": 2.9687991592658568e-05, "loss": 3.471, "step": 5531 }, { "epoch": 0.9467739175081293, "grad_norm": 18.62015724182129, "learning_rate": 2.9685649498850063e-05, "loss": 1.7259, "step": 5532 }, { "epoch": 0.9469450624679103, "grad_norm": 22.18822479248047, "learning_rate": 2.9683298740486477e-05, "loss": 2.9985, "step": 5533 }, { "epoch": 0.9471162074276912, "grad_norm": 22.694929122924805, "learning_rate": 2.968093931895476e-05, "loss": 2.6509, "step": 5534 }, { "epoch": 0.9472873523874722, "grad_norm": 19.534927368164062, "learning_rate": 2.9678571235646983e-05, "loss": 1.6884, "step": 5535 }, { "epoch": 0.9474584973472531, "grad_norm": 20.47797966003418, "learning_rate": 2.9676194491960313e-05, "loss": 1.9016, "step": 5536 }, { "epoch": 0.9476296423070341, "grad_norm": 18.410903930664062, "learning_rate": 2.9673809089297037e-05, "loss": 1.614, "step": 5537 }, { "epoch": 0.9478007872668149, "grad_norm": 1.0874691009521484, "learning_rate": 2.967141502906454e-05, "loss": 0.1918, "step": 5538 }, { "epoch": 0.9479719322265959, "grad_norm": 17.28070068359375, "learning_rate": 2.9669012312675317e-05, "loss": 1.6688, "step": 5539 }, { "epoch": 0.9481430771863769, "grad_norm": 22.55012321472168, "learning_rate": 2.966660094154699e-05, "loss": 2.8969, "step": 5540 }, { "epoch": 0.9483142221461578, "grad_norm": 7.753538608551025, "learning_rate": 
2.966418091710226e-05, "loss": 0.5489, "step": 5541 }, { "epoch": 0.9484853671059388, "grad_norm": 15.588809967041016, "learning_rate": 2.966175224076894e-05, "loss": 1.3865, "step": 5542 }, { "epoch": 0.9486565120657197, "grad_norm": 13.793673515319824, "learning_rate": 2.9659314913979966e-05, "loss": 1.036, "step": 5543 }, { "epoch": 0.9488276570255006, "grad_norm": 14.182084083557129, "learning_rate": 2.9656868938173353e-05, "loss": 1.2276, "step": 5544 }, { "epoch": 0.9489988019852815, "grad_norm": 9.446614265441895, "learning_rate": 2.965441431479224e-05, "loss": 0.8011, "step": 5545 }, { "epoch": 0.9491699469450625, "grad_norm": 39.7092170715332, "learning_rate": 2.9651951045284857e-05, "loss": 6.266, "step": 5546 }, { "epoch": 0.9493410919048434, "grad_norm": 23.74007225036621, "learning_rate": 2.9649479131104533e-05, "loss": 2.252, "step": 5547 }, { "epoch": 0.9495122368646244, "grad_norm": 18.47795867919922, "learning_rate": 2.9646998573709693e-05, "loss": 2.0481, "step": 5548 }, { "epoch": 0.9496833818244053, "grad_norm": 0.6889795660972595, "learning_rate": 2.9644509374563887e-05, "loss": 0.1743, "step": 5549 }, { "epoch": 0.9498545267841862, "grad_norm": 29.57404899597168, "learning_rate": 2.9642011535135736e-05, "loss": 1.8229, "step": 5550 }, { "epoch": 0.9500256717439671, "grad_norm": 93.30299377441406, "learning_rate": 2.963950505689897e-05, "loss": 7.0144, "step": 5551 }, { "epoch": 0.9501968167037481, "grad_norm": 17.450119018554688, "learning_rate": 2.9636989941332415e-05, "loss": 1.3122, "step": 5552 }, { "epoch": 0.950367961663529, "grad_norm": 22.121047973632812, "learning_rate": 2.9634466189919995e-05, "loss": 2.3305, "step": 5553 }, { "epoch": 0.95053910662331, "grad_norm": 13.311408042907715, "learning_rate": 2.9631933804150726e-05, "loss": 1.3286, "step": 5554 }, { "epoch": 0.9507102515830909, "grad_norm": 30.132402420043945, "learning_rate": 2.9629392785518714e-05, "loss": 5.6789, "step": 5555 }, { "epoch": 0.9508813965428718, 
"grad_norm": 15.2648286819458, "learning_rate": 2.9626843135523174e-05, "loss": 1.4216, "step": 5556 }, { "epoch": 0.9510525415026527, "grad_norm": 12.007978439331055, "learning_rate": 2.9624284855668394e-05, "loss": 1.3364, "step": 5557 }, { "epoch": 0.9512236864624337, "grad_norm": 11.762513160705566, "learning_rate": 2.9621717947463768e-05, "loss": 1.1949, "step": 5558 }, { "epoch": 0.9513948314222146, "grad_norm": 22.043636322021484, "learning_rate": 2.9619142412423775e-05, "loss": 2.9871, "step": 5559 }, { "epoch": 0.9515659763819956, "grad_norm": 18.573747634887695, "learning_rate": 2.9616558252067985e-05, "loss": 1.5664, "step": 5560 }, { "epoch": 0.9517371213417765, "grad_norm": 16.317903518676758, "learning_rate": 2.9613965467921053e-05, "loss": 2.1047, "step": 5561 }, { "epoch": 0.9519082663015574, "grad_norm": 67.44075012207031, "learning_rate": 2.9611364061512733e-05, "loss": 7.6283, "step": 5562 }, { "epoch": 0.9520794112613383, "grad_norm": 22.03914451599121, "learning_rate": 2.960875403437785e-05, "loss": 2.7385, "step": 5563 }, { "epoch": 0.9522505562211193, "grad_norm": 21.74106788635254, "learning_rate": 2.960613538805633e-05, "loss": 2.1558, "step": 5564 }, { "epoch": 0.9524217011809002, "grad_norm": 25.852855682373047, "learning_rate": 2.9603508124093173e-05, "loss": 2.4488, "step": 5565 }, { "epoch": 0.9525928461406812, "grad_norm": 8.967046737670898, "learning_rate": 2.9600872244038473e-05, "loss": 6.6295, "step": 5566 }, { "epoch": 0.9527639911004621, "grad_norm": 0.8483248353004456, "learning_rate": 2.95982277494474e-05, "loss": 0.1717, "step": 5567 }, { "epoch": 0.952935136060243, "grad_norm": 21.750349044799805, "learning_rate": 2.9595574641880213e-05, "loss": 2.1601, "step": 5568 }, { "epoch": 0.9531062810200239, "grad_norm": 16.128721237182617, "learning_rate": 2.9592912922902246e-05, "loss": 1.3876, "step": 5569 }, { "epoch": 0.9532774259798049, "grad_norm": 8.323986053466797, "learning_rate": 2.959024259408392e-05, "loss": 0.5623, 
"step": 5570 }, { "epoch": 0.9534485709395858, "grad_norm": 17.731412887573242, "learning_rate": 2.9587563657000733e-05, "loss": 1.6673, "step": 5571 }, { "epoch": 0.9536197158993668, "grad_norm": 19.352418899536133, "learning_rate": 2.9584876113233258e-05, "loss": 2.8274, "step": 5572 }, { "epoch": 0.9537908608591477, "grad_norm": 22.872282028198242, "learning_rate": 2.9582179964367155e-05, "loss": 1.9904, "step": 5573 }, { "epoch": 0.9539620058189286, "grad_norm": 26.09086036682129, "learning_rate": 2.957947521199315e-05, "loss": 2.6627, "step": 5574 }, { "epoch": 0.9541331507787095, "grad_norm": 12.8101806640625, "learning_rate": 2.957676185770706e-05, "loss": 1.1689, "step": 5575 }, { "epoch": 0.9543042957384905, "grad_norm": 8.432300567626953, "learning_rate": 2.957403990310976e-05, "loss": 0.5261, "step": 5576 }, { "epoch": 0.9544754406982714, "grad_norm": 3.770911693572998, "learning_rate": 2.957130934980721e-05, "loss": 0.3705, "step": 5577 }, { "epoch": 0.9546465856580524, "grad_norm": 14.25076675415039, "learning_rate": 2.9568570199410436e-05, "loss": 1.3667, "step": 5578 }, { "epoch": 0.9548177306178333, "grad_norm": 13.956623077392578, "learning_rate": 2.9565822453535553e-05, "loss": 7.2221, "step": 5579 }, { "epoch": 0.9549888755776142, "grad_norm": 26.3257999420166, "learning_rate": 2.956306611380372e-05, "loss": 2.8963, "step": 5580 }, { "epoch": 0.9551600205373951, "grad_norm": 19.502458572387695, "learning_rate": 2.956030118184119e-05, "loss": 1.7251, "step": 5581 }, { "epoch": 0.9553311654971761, "grad_norm": 15.120080947875977, "learning_rate": 2.955752765927928e-05, "loss": 6.4349, "step": 5582 }, { "epoch": 0.955502310456957, "grad_norm": 18.764127731323242, "learning_rate": 2.9554745547754364e-05, "loss": 1.5513, "step": 5583 }, { "epoch": 0.955673455416738, "grad_norm": 14.517118453979492, "learning_rate": 2.9551954848907897e-05, "loss": 1.4727, "step": 5584 }, { "epoch": 0.9558446003765189, "grad_norm": 4.777843952178955, "learning_rate": 
2.9549155564386396e-05, "loss": 0.5504, "step": 5585 }, { "epoch": 0.9560157453362998, "grad_norm": 16.334457397460938, "learning_rate": 2.9546347695841443e-05, "loss": 1.4043, "step": 5586 }, { "epoch": 0.9561868902960807, "grad_norm": 24.128767013549805, "learning_rate": 2.9543531244929677e-05, "loss": 2.866, "step": 5587 }, { "epoch": 0.9563580352558617, "grad_norm": 31.431903839111328, "learning_rate": 2.954070621331282e-05, "loss": 6.181, "step": 5588 }, { "epoch": 0.9565291802156427, "grad_norm": 20.100135803222656, "learning_rate": 2.9537872602657637e-05, "loss": 2.1438, "step": 5589 }, { "epoch": 0.9567003251754236, "grad_norm": 22.727962493896484, "learning_rate": 2.953503041463597e-05, "loss": 3.139, "step": 5590 }, { "epoch": 0.9568714701352046, "grad_norm": 20.488447189331055, "learning_rate": 2.9532179650924702e-05, "loss": 2.2346, "step": 5591 }, { "epoch": 0.9570426150949854, "grad_norm": 17.242015838623047, "learning_rate": 2.9529320313205797e-05, "loss": 1.763, "step": 5592 }, { "epoch": 0.9572137600547664, "grad_norm": 110.57696533203125, "learning_rate": 2.9526452403166268e-05, "loss": 8.8231, "step": 5593 }, { "epoch": 0.9573849050145473, "grad_norm": 1.8073965311050415, "learning_rate": 2.952357592249818e-05, "loss": 0.1905, "step": 5594 }, { "epoch": 0.9575560499743283, "grad_norm": 30.926698684692383, "learning_rate": 2.952069087289867e-05, "loss": 5.726, "step": 5595 }, { "epoch": 0.9577271949341092, "grad_norm": 49.94111251831055, "learning_rate": 2.9517797256069917e-05, "loss": 5.4946, "step": 5596 }, { "epoch": 0.9578983398938902, "grad_norm": 25.600858688354492, "learning_rate": 2.951489507371916e-05, "loss": 2.066, "step": 5597 }, { "epoch": 0.958069484853671, "grad_norm": 17.886932373046875, "learning_rate": 2.9511984327558687e-05, "loss": 1.5243, "step": 5598 }, { "epoch": 0.958240629813452, "grad_norm": 1.2921315431594849, "learning_rate": 2.950906501930585e-05, "loss": 0.1839, "step": 5599 }, { "epoch": 0.9584117747732329, 
"grad_norm": 15.199446678161621, "learning_rate": 2.950613715068303e-05, "loss": 1.2364, "step": 5600 }, { "epoch": 0.9585829197330139, "grad_norm": 45.226661682128906, "learning_rate": 2.9503200723417697e-05, "loss": 5.7504, "step": 5601 }, { "epoch": 0.9587540646927948, "grad_norm": 46.436668395996094, "learning_rate": 2.9500255739242333e-05, "loss": 5.2193, "step": 5602 }, { "epoch": 0.9589252096525758, "grad_norm": 23.483407974243164, "learning_rate": 2.9497302199894482e-05, "loss": 5.5494, "step": 5603 }, { "epoch": 0.9590963546123566, "grad_norm": 21.826322555541992, "learning_rate": 2.949434010711674e-05, "loss": 2.1491, "step": 5604 }, { "epoch": 0.9592674995721376, "grad_norm": 39.89952850341797, "learning_rate": 2.949136946265675e-05, "loss": 5.3737, "step": 5605 }, { "epoch": 0.9594386445319185, "grad_norm": 51.043880462646484, "learning_rate": 2.9488390268267186e-05, "loss": 6.9771, "step": 5606 }, { "epoch": 0.9596097894916995, "grad_norm": 1.1591174602508545, "learning_rate": 2.948540252570579e-05, "loss": 0.1753, "step": 5607 }, { "epoch": 0.9597809344514804, "grad_norm": 4.65718412399292, "learning_rate": 2.9482406236735328e-05, "loss": 0.353, "step": 5608 }, { "epoch": 0.9599520794112614, "grad_norm": 18.54163932800293, "learning_rate": 2.947940140312361e-05, "loss": 2.3331, "step": 5609 }, { "epoch": 0.9601232243710422, "grad_norm": 16.34014892578125, "learning_rate": 2.9476388026643504e-05, "loss": 1.4218, "step": 5610 }, { "epoch": 0.9602943693308232, "grad_norm": 5.687966346740723, "learning_rate": 2.9473366109072895e-05, "loss": 0.5198, "step": 5611 }, { "epoch": 0.9604655142906041, "grad_norm": 6.7711968421936035, "learning_rate": 2.947033565219473e-05, "loss": 0.461, "step": 5612 }, { "epoch": 0.9606366592503851, "grad_norm": 8.50647258758545, "learning_rate": 2.9467296657796975e-05, "loss": 0.8049, "step": 5613 }, { "epoch": 0.960807804210166, "grad_norm": 21.197162628173828, "learning_rate": 2.946424912767264e-05, "loss": 1.9265, "step": 
5614 }, { "epoch": 0.960978949169947, "grad_norm": 1.9009312391281128, "learning_rate": 2.9461193063619777e-05, "loss": 0.2764, "step": 5615 }, { "epoch": 0.9611500941297278, "grad_norm": 20.549768447875977, "learning_rate": 2.9458128467441473e-05, "loss": 1.8308, "step": 5616 }, { "epoch": 0.9613212390895088, "grad_norm": 17.79680061340332, "learning_rate": 2.945505534094583e-05, "loss": 1.3624, "step": 5617 }, { "epoch": 0.9614923840492897, "grad_norm": 19.88323211669922, "learning_rate": 2.945197368594601e-05, "loss": 2.3432, "step": 5618 }, { "epoch": 0.9616635290090707, "grad_norm": 48.602989196777344, "learning_rate": 2.944888350426019e-05, "loss": 1.3937, "step": 5619 }, { "epoch": 0.9618346739688516, "grad_norm": 1.3009952306747437, "learning_rate": 2.944578479771158e-05, "loss": 0.194, "step": 5620 }, { "epoch": 0.9620058189286326, "grad_norm": 29.68500328063965, "learning_rate": 2.9442677568128422e-05, "loss": 1.4026, "step": 5621 }, { "epoch": 0.9621769638884135, "grad_norm": 25.2308292388916, "learning_rate": 2.943956181734399e-05, "loss": 3.3288, "step": 5622 }, { "epoch": 0.9623481088481944, "grad_norm": 15.033297538757324, "learning_rate": 2.943643754719658e-05, "loss": 1.8449, "step": 5623 }, { "epoch": 0.9625192538079753, "grad_norm": 20.76141357421875, "learning_rate": 2.943330475952951e-05, "loss": 1.8632, "step": 5624 }, { "epoch": 0.9626903987677563, "grad_norm": 3.1637096405029297, "learning_rate": 2.9430163456191132e-05, "loss": 0.3309, "step": 5625 }, { "epoch": 0.9628615437275372, "grad_norm": 24.777233123779297, "learning_rate": 2.9427013639034825e-05, "loss": 2.8741, "step": 5626 }, { "epoch": 0.9630326886873182, "grad_norm": 0.9104720950126648, "learning_rate": 2.9423855309918986e-05, "loss": 0.1717, "step": 5627 }, { "epoch": 0.963203833647099, "grad_norm": 85.43341827392578, "learning_rate": 2.942068847070703e-05, "loss": 2.3461, "step": 5628 }, { "epoch": 0.96337497860688, "grad_norm": 18.911468505859375, "learning_rate": 
2.941751312326739e-05, "loss": 1.7253, "step": 5629 }, { "epoch": 0.9635461235666609, "grad_norm": 17.639625549316406, "learning_rate": 2.941432926947354e-05, "loss": 0.6695, "step": 5630 }, { "epoch": 0.9637172685264419, "grad_norm": 24.79645347595215, "learning_rate": 2.9411136911203945e-05, "loss": 2.3482, "step": 5631 }, { "epoch": 0.9638884134862228, "grad_norm": 18.90751838684082, "learning_rate": 2.9407936050342114e-05, "loss": 0.6091, "step": 5632 }, { "epoch": 0.9640595584460038, "grad_norm": 0.7830367684364319, "learning_rate": 2.940472668877655e-05, "loss": 0.1618, "step": 5633 }, { "epoch": 0.9642307034057847, "grad_norm": 17.63616943359375, "learning_rate": 2.940150882840079e-05, "loss": 1.4876, "step": 5634 }, { "epoch": 0.9644018483655656, "grad_norm": 13.081395149230957, "learning_rate": 2.939828247111336e-05, "loss": 1.0297, "step": 5635 }, { "epoch": 0.9645729933253465, "grad_norm": 6.1109819412231445, "learning_rate": 2.9395047618817837e-05, "loss": 0.5382, "step": 5636 }, { "epoch": 0.9647441382851275, "grad_norm": 20.527721405029297, "learning_rate": 2.939180427342277e-05, "loss": 2.6818, "step": 5637 }, { "epoch": 0.9649152832449084, "grad_norm": 18.875795364379883, "learning_rate": 2.938855243684175e-05, "loss": 1.6882, "step": 5638 }, { "epoch": 0.9650864282046894, "grad_norm": 1.780640721321106, "learning_rate": 2.938529211099336e-05, "loss": 0.2069, "step": 5639 }, { "epoch": 0.9652575731644704, "grad_norm": 76.09477233886719, "learning_rate": 2.9382023297801196e-05, "loss": 7.8139, "step": 5640 }, { "epoch": 0.9654287181242512, "grad_norm": 12.15816879272461, "learning_rate": 2.9378745999193868e-05, "loss": 0.8883, "step": 5641 }, { "epoch": 0.9655998630840322, "grad_norm": 55.28908920288086, "learning_rate": 2.9375460217104984e-05, "loss": 0.6874, "step": 5642 }, { "epoch": 0.9657710080438131, "grad_norm": 22.68217658996582, "learning_rate": 2.937216595347316e-05, "loss": 2.2194, "step": 5643 }, { "epoch": 0.9659421530035941, 
"grad_norm": 25.656339645385742, "learning_rate": 2.9368863210242015e-05, "loss": 0.8989, "step": 5644 }, { "epoch": 0.966113297963375, "grad_norm": 21.246002197265625, "learning_rate": 2.9365551989360176e-05, "loss": 1.7201, "step": 5645 }, { "epoch": 0.966284442923156, "grad_norm": 76.55830383300781, "learning_rate": 2.9362232292781267e-05, "loss": 7.3444, "step": 5646 }, { "epoch": 0.9664555878829368, "grad_norm": 0.7908893823623657, "learning_rate": 2.935890412246391e-05, "loss": 0.1723, "step": 5647 }, { "epoch": 0.9666267328427178, "grad_norm": 78.82179260253906, "learning_rate": 2.9355567480371734e-05, "loss": 8.4267, "step": 5648 }, { "epoch": 0.9667978778024987, "grad_norm": 11.044159889221191, "learning_rate": 2.9352222368473366e-05, "loss": 1.4268, "step": 5649 }, { "epoch": 0.9669690227622797, "grad_norm": 13.06053638458252, "learning_rate": 2.934886878874242e-05, "loss": 0.9995, "step": 5650 }, { "epoch": 0.9671401677220606, "grad_norm": 28.17104148864746, "learning_rate": 2.934550674315752e-05, "loss": 4.9991, "step": 5651 }, { "epoch": 0.9673113126818416, "grad_norm": 2.7776384353637695, "learning_rate": 2.9342136233702272e-05, "loss": 0.314, "step": 5652 }, { "epoch": 0.9674824576416224, "grad_norm": 2.665609836578369, "learning_rate": 2.9338757262365284e-05, "loss": 0.2768, "step": 5653 }, { "epoch": 0.9676536026014034, "grad_norm": 0.5785967707633972, "learning_rate": 2.9335369831140155e-05, "loss": 0.1556, "step": 5654 }, { "epoch": 0.9678247475611843, "grad_norm": 27.51642417907715, "learning_rate": 2.9331973942025476e-05, "loss": 2.3748, "step": 5655 }, { "epoch": 0.9679958925209653, "grad_norm": 9.160310745239258, "learning_rate": 2.932856959702482e-05, "loss": 0.5655, "step": 5656 }, { "epoch": 0.9681670374807462, "grad_norm": 4.589384078979492, "learning_rate": 2.932515679814676e-05, "loss": 0.4441, "step": 5657 }, { "epoch": 0.9683381824405272, "grad_norm": 22.23826026916504, "learning_rate": 2.9321735547404857e-05, "loss": 2.5487, "step": 
5658 }, { "epoch": 0.968509327400308, "grad_norm": 14.580739974975586, "learning_rate": 2.931830584681765e-05, "loss": 1.1825, "step": 5659 }, { "epoch": 0.968680472360089, "grad_norm": 24.317304611206055, "learning_rate": 2.931486769840866e-05, "loss": 2.8601, "step": 5660 }, { "epoch": 0.9688516173198699, "grad_norm": 16.11456871032715, "learning_rate": 2.9311421104206407e-05, "loss": 1.542, "step": 5661 }, { "epoch": 0.9690227622796509, "grad_norm": 4.138606548309326, "learning_rate": 2.9307966066244392e-05, "loss": 0.4524, "step": 5662 }, { "epoch": 0.9691939072394318, "grad_norm": 86.82868194580078, "learning_rate": 2.9304502586561086e-05, "loss": 7.8846, "step": 5663 }, { "epoch": 0.9693650521992128, "grad_norm": 6.498536109924316, "learning_rate": 2.9301030667199943e-05, "loss": 0.3622, "step": 5664 }, { "epoch": 0.9695361971589936, "grad_norm": 16.460418701171875, "learning_rate": 2.929755031020941e-05, "loss": 1.379, "step": 5665 }, { "epoch": 0.9697073421187746, "grad_norm": 4.3522539138793945, "learning_rate": 2.92940615176429e-05, "loss": 0.4806, "step": 5666 }, { "epoch": 0.9698784870785555, "grad_norm": 2.5449700355529785, "learning_rate": 2.92905642915588e-05, "loss": 0.4481, "step": 5667 }, { "epoch": 0.9700496320383365, "grad_norm": 22.875797271728516, "learning_rate": 2.928705863402048e-05, "loss": 1.7887, "step": 5668 }, { "epoch": 0.9702207769981174, "grad_norm": 1.1875667572021484, "learning_rate": 2.928354454709629e-05, "loss": 0.1802, "step": 5669 }, { "epoch": 0.9703919219578984, "grad_norm": 0.7507598400115967, "learning_rate": 2.9280022032859543e-05, "loss": 0.1708, "step": 5670 }, { "epoch": 0.9705630669176792, "grad_norm": 4.884947776794434, "learning_rate": 2.9276491093388525e-05, "loss": 0.3205, "step": 5671 }, { "epoch": 0.9707342118774602, "grad_norm": 28.726037979125977, "learning_rate": 2.9272951730766496e-05, "loss": 1.3092, "step": 5672 }, { "epoch": 0.9709053568372411, "grad_norm": 38.645179748535156, "learning_rate": 
2.9269403947081693e-05, "loss": 6.425, "step": 5673 }, { "epoch": 0.9710765017970221, "grad_norm": 30.966270446777344, "learning_rate": 2.9265847744427305e-05, "loss": 3.1593, "step": 5674 }, { "epoch": 0.971247646756803, "grad_norm": 12.976767539978027, "learning_rate": 2.92622831249015e-05, "loss": 0.7746, "step": 5675 }, { "epoch": 0.971418791716584, "grad_norm": 17.295045852661133, "learning_rate": 2.9258710090607405e-05, "loss": 1.3965, "step": 5676 }, { "epoch": 0.9715899366763648, "grad_norm": 19.398035049438477, "learning_rate": 2.925512864365312e-05, "loss": 2.2865, "step": 5677 }, { "epoch": 0.9717610816361458, "grad_norm": 0.4852750301361084, "learning_rate": 2.9251538786151702e-05, "loss": 0.1467, "step": 5678 }, { "epoch": 0.9719322265959267, "grad_norm": 0.7427716851234436, "learning_rate": 2.9247940520221176e-05, "loss": 0.1506, "step": 5679 }, { "epoch": 0.9721033715557077, "grad_norm": 16.088226318359375, "learning_rate": 2.9244333847984522e-05, "loss": 1.5801, "step": 5680 }, { "epoch": 0.9722745165154886, "grad_norm": 9.518933296203613, "learning_rate": 2.9240718771569676e-05, "loss": 0.5943, "step": 5681 }, { "epoch": 0.9724456614752696, "grad_norm": 20.686717987060547, "learning_rate": 2.923709529310955e-05, "loss": 1.8611, "step": 5682 }, { "epoch": 0.9726168064350504, "grad_norm": 13.390531539916992, "learning_rate": 2.923346341474199e-05, "loss": 1.3245, "step": 5683 }, { "epoch": 0.9727879513948314, "grad_norm": 6.539584159851074, "learning_rate": 2.922982313860982e-05, "loss": 0.3497, "step": 5684 }, { "epoch": 0.9729590963546123, "grad_norm": 32.39887619018555, "learning_rate": 2.9226174466860797e-05, "loss": 3.6324, "step": 5685 }, { "epoch": 0.9731302413143933, "grad_norm": 1.9811031818389893, "learning_rate": 2.922251740164765e-05, "loss": 0.3822, "step": 5686 }, { "epoch": 0.9733013862741742, "grad_norm": 18.403629302978516, "learning_rate": 2.921885194512806e-05, "loss": 1.9106, "step": 5687 }, { "epoch": 0.9734725312339552, 
"grad_norm": 49.918663024902344, "learning_rate": 2.921517809946464e-05, "loss": 5.6214, "step": 5688 }, { "epoch": 0.973643676193736, "grad_norm": 12.015061378479004, "learning_rate": 2.921149586682497e-05, "loss": 1.2064, "step": 5689 }, { "epoch": 0.973814821153517, "grad_norm": 19.471036911010742, "learning_rate": 2.9207805249381565e-05, "loss": 1.6467, "step": 5690 }, { "epoch": 0.973985966113298, "grad_norm": 32.368045806884766, "learning_rate": 2.9204106249311904e-05, "loss": 1.7296, "step": 5691 }, { "epoch": 0.9741571110730789, "grad_norm": 26.181180953979492, "learning_rate": 2.92003988687984e-05, "loss": 2.8505, "step": 5692 }, { "epoch": 0.9743282560328599, "grad_norm": 12.218537330627441, "learning_rate": 2.9196683110028412e-05, "loss": 0.8142, "step": 5693 }, { "epoch": 0.9744994009926408, "grad_norm": 23.380990982055664, "learning_rate": 2.9192958975194248e-05, "loss": 1.4624, "step": 5694 }, { "epoch": 0.9746705459524218, "grad_norm": 17.542766571044922, "learning_rate": 2.9189226466493143e-05, "loss": 1.4295, "step": 5695 }, { "epoch": 0.9748416909122026, "grad_norm": 13.465328216552734, "learning_rate": 2.9185485586127293e-05, "loss": 1.2529, "step": 5696 }, { "epoch": 0.9750128358719836, "grad_norm": 19.861682891845703, "learning_rate": 2.9181736336303814e-05, "loss": 2.0874, "step": 5697 }, { "epoch": 0.9751839808317645, "grad_norm": 2.460387706756592, "learning_rate": 2.9177978719234775e-05, "loss": 0.345, "step": 5698 }, { "epoch": 0.9753551257915455, "grad_norm": 11.749364852905273, "learning_rate": 2.9174212737137177e-05, "loss": 0.8893, "step": 5699 }, { "epoch": 0.9755262707513264, "grad_norm": 8.463294982910156, "learning_rate": 2.9170438392232947e-05, "loss": 0.6591, "step": 5700 }, { "epoch": 0.9756974157111074, "grad_norm": 23.321102142333984, "learning_rate": 2.9166655686748964e-05, "loss": 3.1519, "step": 5701 }, { "epoch": 0.9758685606708882, "grad_norm": 43.21113586425781, "learning_rate": 2.916286462291702e-05, "loss": 5.9953, 
"step": 5702 }, { "epoch": 0.9760397056306692, "grad_norm": 6.378363609313965, "learning_rate": 2.915906520297386e-05, "loss": 0.5454, "step": 5703 }, { "epoch": 0.9762108505904501, "grad_norm": 22.250072479248047, "learning_rate": 2.915525742916114e-05, "loss": 2.7336, "step": 5704 }, { "epoch": 0.9763819955502311, "grad_norm": 16.970355987548828, "learning_rate": 2.915144130372545e-05, "loss": 1.6552, "step": 5705 }, { "epoch": 0.976553140510012, "grad_norm": 2.4970545768737793, "learning_rate": 2.914761682891831e-05, "loss": 0.3561, "step": 5706 }, { "epoch": 0.976724285469793, "grad_norm": 24.48183822631836, "learning_rate": 2.9143784006996174e-05, "loss": 2.858, "step": 5707 }, { "epoch": 0.9768954304295738, "grad_norm": 24.73363494873047, "learning_rate": 2.9139942840220407e-05, "loss": 2.3049, "step": 5708 }, { "epoch": 0.9770665753893548, "grad_norm": 1.5006498098373413, "learning_rate": 2.91360933308573e-05, "loss": 0.2978, "step": 5709 }, { "epoch": 0.9772377203491357, "grad_norm": null, "learning_rate": 2.91360933308573e-05, "loss": 1.1356, "step": 5710 }, { "epoch": 0.9774088653089167, "grad_norm": 27.760709762573242, "learning_rate": 2.9132235481178077e-05, "loss": 2.9795, "step": 5711 }, { "epoch": 0.9775800102686976, "grad_norm": 3.4136054515838623, "learning_rate": 2.912836929345887e-05, "loss": 0.313, "step": 5712 }, { "epoch": 0.9777511552284786, "grad_norm": 7.273303031921387, "learning_rate": 2.912449476998073e-05, "loss": 1.0696, "step": 5713 }, { "epoch": 0.9779223001882594, "grad_norm": 13.993139266967773, "learning_rate": 2.9120611913029645e-05, "loss": 1.2567, "step": 5714 }, { "epoch": 0.9780934451480404, "grad_norm": 14.55916690826416, "learning_rate": 2.9116720724896495e-05, "loss": 1.4643, "step": 5715 }, { "epoch": 0.9782645901078213, "grad_norm": 19.891088485717773, "learning_rate": 2.9112821207877092e-05, "loss": 2.3964, "step": 5716 }, { "epoch": 0.9784357350676023, "grad_norm": 19.82097625732422, "learning_rate": 
2.9108913364272157e-05, "loss": 2.4607, "step": 5717 }, { "epoch": 0.9786068800273832, "grad_norm": 23.450258255004883, "learning_rate": 2.9104997196387325e-05, "loss": 1.9309, "step": 5718 }, { "epoch": 0.9787780249871642, "grad_norm": 55.61565017700195, "learning_rate": 2.9101072706533134e-05, "loss": 2.2202, "step": 5719 }, { "epoch": 0.978949169946945, "grad_norm": 10.280345916748047, "learning_rate": 2.9097139897025045e-05, "loss": 1.3173, "step": 5720 }, { "epoch": 0.979120314906726, "grad_norm": 24.033308029174805, "learning_rate": 2.9093198770183416e-05, "loss": 2.455, "step": 5721 }, { "epoch": 0.9792914598665069, "grad_norm": 8.36307430267334, "learning_rate": 2.9089249328333528e-05, "loss": 0.7162, "step": 5722 }, { "epoch": 0.9794626048262879, "grad_norm": 16.257606506347656, "learning_rate": 2.9085291573805546e-05, "loss": 1.3828, "step": 5723 }, { "epoch": 0.9796337497860688, "grad_norm": 7.985149383544922, "learning_rate": 2.9081325508934556e-05, "loss": 0.4998, "step": 5724 }, { "epoch": 0.9798048947458498, "grad_norm": 11.047928810119629, "learning_rate": 2.9077351136060545e-05, "loss": 0.8385, "step": 5725 }, { "epoch": 0.9799760397056306, "grad_norm": 7.28739070892334, "learning_rate": 2.907336845752839e-05, "loss": 0.4586, "step": 5726 }, { "epoch": 0.9801471846654116, "grad_norm": 90.9510269165039, "learning_rate": 2.906937747568789e-05, "loss": 0.9434, "step": 5727 }, { "epoch": 0.9803183296251925, "grad_norm": 1.1575900316238403, "learning_rate": 2.906537819289372e-05, "loss": 0.1922, "step": 5728 }, { "epoch": 0.9804894745849735, "grad_norm": 19.945966720581055, "learning_rate": 2.906137061150547e-05, "loss": 2.3853, "step": 5729 }, { "epoch": 0.9806606195447544, "grad_norm": 22.720388412475586, "learning_rate": 2.9057354733887612e-05, "loss": 2.3841, "step": 5730 }, { "epoch": 0.9808317645045354, "grad_norm": 22.19735336303711, "learning_rate": 2.9053330562409525e-05, "loss": 2.4123, "step": 5731 }, { "epoch": 0.9810029094643162, 
"grad_norm": 0.8141705393791199, "learning_rate": 2.9049298099445474e-05, "loss": 0.1741, "step": 5732 }, { "epoch": 0.9811740544240972, "grad_norm": 109.93119049072266, "learning_rate": 2.9045257347374616e-05, "loss": 7.3922, "step": 5733 }, { "epoch": 0.9813451993838781, "grad_norm": 10.224390029907227, "learning_rate": 2.9041208308581005e-05, "loss": 0.6587, "step": 5734 }, { "epoch": 0.9815163443436591, "grad_norm": 28.685192108154297, "learning_rate": 2.903715098545358e-05, "loss": 1.9348, "step": 5735 }, { "epoch": 0.98168748930344, "grad_norm": 27.049457550048828, "learning_rate": 2.9033085380386163e-05, "loss": 3.427, "step": 5736 }, { "epoch": 0.981858634263221, "grad_norm": 15.711125373840332, "learning_rate": 2.902901149577747e-05, "loss": 1.3106, "step": 5737 }, { "epoch": 0.9820297792230018, "grad_norm": 28.39513397216797, "learning_rate": 2.9024929334031102e-05, "loss": 2.721, "step": 5738 }, { "epoch": 0.9822009241827828, "grad_norm": 0.9611316919326782, "learning_rate": 2.9020838897555534e-05, "loss": 0.1634, "step": 5739 }, { "epoch": 0.9823720691425637, "grad_norm": 20.618175506591797, "learning_rate": 2.9016740188764137e-05, "loss": 2.3602, "step": 5740 }, { "epoch": 0.9825432141023447, "grad_norm": 16.827938079833984, "learning_rate": 2.9012633210075146e-05, "loss": 1.5323, "step": 5741 }, { "epoch": 0.9827143590621257, "grad_norm": 10.538216590881348, "learning_rate": 2.900851796391169e-05, "loss": 0.8438, "step": 5742 }, { "epoch": 0.9828855040219066, "grad_norm": 17.156232833862305, "learning_rate": 2.9004394452701773e-05, "loss": 1.5221, "step": 5743 }, { "epoch": 0.9830566489816875, "grad_norm": 17.9312801361084, "learning_rate": 2.9000262678878273e-05, "loss": 1.7531, "step": 5744 }, { "epoch": 0.9832277939414684, "grad_norm": 14.673002243041992, "learning_rate": 2.8996122644878934e-05, "loss": 1.2513, "step": 5745 }, { "epoch": 0.9833989389012494, "grad_norm": 23.0510196685791, "learning_rate": 2.899197435314639e-05, "loss": 2.9687, 
"step": 5746 }, { "epoch": 0.9835700838610303, "grad_norm": 10.26144790649414, "learning_rate": 2.8987817806128138e-05, "loss": 1.1219, "step": 5747 }, { "epoch": 0.9837412288208113, "grad_norm": 29.97235107421875, "learning_rate": 2.8983653006276544e-05, "loss": 1.2083, "step": 5748 }, { "epoch": 0.9839123737805922, "grad_norm": 36.61714172363281, "learning_rate": 2.897947995604885e-05, "loss": 2.4298, "step": 5749 }, { "epoch": 0.9840835187403731, "grad_norm": 22.347938537597656, "learning_rate": 2.8975298657907158e-05, "loss": 2.2842, "step": 5750 }, { "epoch": 0.984254663700154, "grad_norm": 12.373258590698242, "learning_rate": 2.8971109114318446e-05, "loss": 0.78, "step": 5751 }, { "epoch": 0.984425808659935, "grad_norm": 22.580699920654297, "learning_rate": 2.8966911327754543e-05, "loss": 3.0575, "step": 5752 }, { "epoch": 0.9845969536197159, "grad_norm": 7.398533344268799, "learning_rate": 2.8962705300692156e-05, "loss": 0.9823, "step": 5753 }, { "epoch": 0.9847680985794969, "grad_norm": 25.29918098449707, "learning_rate": 2.8958491035612842e-05, "loss": 2.2433, "step": 5754 }, { "epoch": 0.9849392435392778, "grad_norm": 24.103534698486328, "learning_rate": 2.895426853500303e-05, "loss": 2.4689, "step": 5755 }, { "epoch": 0.9851103884990587, "grad_norm": 20.637611389160156, "learning_rate": 2.8950037801353995e-05, "loss": 1.447, "step": 5756 }, { "epoch": 0.9852815334588396, "grad_norm": 21.759105682373047, "learning_rate": 2.894579883716188e-05, "loss": 2.3284, "step": 5757 }, { "epoch": 0.9854526784186206, "grad_norm": 15.341378211975098, "learning_rate": 2.894155164492768e-05, "loss": 1.1234, "step": 5758 }, { "epoch": 0.9856238233784015, "grad_norm": 3.0613319873809814, "learning_rate": 2.8937296227157242e-05, "loss": 0.3687, "step": 5759 }, { "epoch": 0.9857949683381825, "grad_norm": 13.759583473205566, "learning_rate": 2.8933032586361275e-05, "loss": 1.0151, "step": 5760 }, { "epoch": 0.9859661132979634, "grad_norm": 17.648202896118164, 
"learning_rate": 2.8928760725055328e-05, "loss": 1.4579, "step": 5761 }, { "epoch": 0.9861372582577443, "grad_norm": 11.312187194824219, "learning_rate": 2.8924480645759805e-05, "loss": 1.0932, "step": 5762 }, { "epoch": 0.9863084032175252, "grad_norm": 27.215620040893555, "learning_rate": 2.892019235099996e-05, "loss": 2.8235, "step": 5763 }, { "epoch": 0.9864795481773062, "grad_norm": 4.202890872955322, "learning_rate": 2.8915895843305896e-05, "loss": 0.3119, "step": 5764 }, { "epoch": 0.9866506931370871, "grad_norm": 4.855360984802246, "learning_rate": 2.891159112521256e-05, "loss": 0.3622, "step": 5765 }, { "epoch": 0.9868218380968681, "grad_norm": 21.578210830688477, "learning_rate": 2.8907278199259737e-05, "loss": 1.6091, "step": 5766 }, { "epoch": 0.986992983056649, "grad_norm": 4.96875524520874, "learning_rate": 2.8902957067992063e-05, "loss": 0.5169, "step": 5767 }, { "epoch": 0.9871641280164299, "grad_norm": 26.378623962402344, "learning_rate": 2.8898627733959008e-05, "loss": 3.8767, "step": 5768 }, { "epoch": 0.9873352729762108, "grad_norm": 22.44243621826172, "learning_rate": 2.8894290199714893e-05, "loss": 2.9247, "step": 5769 }, { "epoch": 0.9875064179359918, "grad_norm": 17.894241333007812, "learning_rate": 2.888994446781886e-05, "loss": 2.0144, "step": 5770 }, { "epoch": 0.9876775628957727, "grad_norm": 19.04180145263672, "learning_rate": 2.888559054083491e-05, "loss": 1.5276, "step": 5771 }, { "epoch": 0.9878487078555537, "grad_norm": 16.866159439086914, "learning_rate": 2.8881228421331854e-05, "loss": 1.3376, "step": 5772 }, { "epoch": 0.9880198528153346, "grad_norm": 18.463472366333008, "learning_rate": 2.8876858111883352e-05, "loss": 1.501, "step": 5773 }, { "epoch": 0.9881909977751155, "grad_norm": 52.910152435302734, "learning_rate": 2.88724796150679e-05, "loss": 0.5458, "step": 5774 }, { "epoch": 0.9883621427348964, "grad_norm": 15.363899230957031, "learning_rate": 2.8868092933468808e-05, "loss": 1.302, "step": 5775 }, { "epoch": 
0.9885332876946774, "grad_norm": 22.834556579589844, "learning_rate": 2.886369806967423e-05, "loss": 2.5358, "step": 5776 }, { "epoch": 0.9887044326544583, "grad_norm": 7.162318229675293, "learning_rate": 2.885929502627714e-05, "loss": 0.4514, "step": 5777 }, { "epoch": 0.9888755776142393, "grad_norm": 5.512144565582275, "learning_rate": 2.8854883805875346e-05, "loss": 0.2699, "step": 5778 }, { "epoch": 0.9890467225740202, "grad_norm": 8.424713134765625, "learning_rate": 2.8850464411071465e-05, "loss": 1.254, "step": 5779 }, { "epoch": 0.9892178675338011, "grad_norm": 9.863699913024902, "learning_rate": 2.884603684447296e-05, "loss": 0.6334, "step": 5780 }, { "epoch": 0.989389012493582, "grad_norm": 7.864769458770752, "learning_rate": 2.8841601108692086e-05, "loss": 0.5012, "step": 5781 }, { "epoch": 0.989560157453363, "grad_norm": 28.139631271362305, "learning_rate": 2.883715720634594e-05, "loss": 2.9025, "step": 5782 }, { "epoch": 0.9897313024131439, "grad_norm": 20.099287033081055, "learning_rate": 2.883270514005644e-05, "loss": 1.7654, "step": 5783 }, { "epoch": 0.9899024473729249, "grad_norm": 11.549965858459473, "learning_rate": 2.8828244912450305e-05, "loss": 0.3662, "step": 5784 }, { "epoch": 0.9900735923327058, "grad_norm": 15.588282585144043, "learning_rate": 2.8823776526159067e-05, "loss": 1.2371, "step": 5785 }, { "epoch": 0.9902447372924867, "grad_norm": 14.879979133605957, "learning_rate": 2.8819299983819093e-05, "loss": 1.2031, "step": 5786 }, { "epoch": 0.9904158822522676, "grad_norm": 16.02366828918457, "learning_rate": 2.8814815288071547e-05, "loss": 1.3539, "step": 5787 }, { "epoch": 0.9905870272120486, "grad_norm": 1.5881539583206177, "learning_rate": 2.8810322441562403e-05, "loss": 0.2036, "step": 5788 }, { "epoch": 0.9907581721718295, "grad_norm": 106.81831359863281, "learning_rate": 2.8805821446942446e-05, "loss": 8.9651, "step": 5789 }, { "epoch": 0.9909293171316105, "grad_norm": 20.03091812133789, "learning_rate": 2.8801312306867275e-05, 
"loss": 1.7219, "step": 5790 }, { "epoch": 0.9911004620913914, "grad_norm": 30.344528198242188, "learning_rate": 2.8796795023997282e-05, "loss": 5.7801, "step": 5791 }, { "epoch": 0.9912716070511723, "grad_norm": 21.556596755981445, "learning_rate": 2.879226960099768e-05, "loss": 1.8613, "step": 5792 }, { "epoch": 0.9914427520109533, "grad_norm": 21.872007369995117, "learning_rate": 2.8787736040538466e-05, "loss": 1.746, "step": 5793 }, { "epoch": 0.9916138969707342, "grad_norm": 25.079710006713867, "learning_rate": 2.878319434529445e-05, "loss": 3.4592, "step": 5794 }, { "epoch": 0.9917850419305152, "grad_norm": 20.306957244873047, "learning_rate": 2.877864451794525e-05, "loss": 2.257, "step": 5795 }, { "epoch": 0.9919561868902961, "grad_norm": 9.488975524902344, "learning_rate": 2.8774086561175256e-05, "loss": 0.6263, "step": 5796 }, { "epoch": 0.9921273318500771, "grad_norm": 18.889690399169922, "learning_rate": 2.8769520477673678e-05, "loss": 2.3502, "step": 5797 }, { "epoch": 0.9922984768098579, "grad_norm": 8.002593994140625, "learning_rate": 2.8764946270134506e-05, "loss": 0.3844, "step": 5798 }, { "epoch": 0.9924696217696389, "grad_norm": 23.74399757385254, "learning_rate": 2.8760363941256532e-05, "loss": 3.349, "step": 5799 }, { "epoch": 0.9926407667294198, "grad_norm": 7.996278285980225, "learning_rate": 2.875577349374334e-05, "loss": 0.9081, "step": 5800 }, { "epoch": 0.9928119116892008, "grad_norm": 50.602272033691406, "learning_rate": 2.8751174930303295e-05, "loss": 1.317, "step": 5801 }, { "epoch": 0.9929830566489817, "grad_norm": 3.7856621742248535, "learning_rate": 2.8746568253649562e-05, "loss": 0.3388, "step": 5802 }, { "epoch": 0.9931542016087627, "grad_norm": 7.5464091300964355, "learning_rate": 2.874195346650008e-05, "loss": 0.9043, "step": 5803 }, { "epoch": 0.9933253465685435, "grad_norm": 3.0814945697784424, "learning_rate": 2.8737330571577588e-05, "loss": 0.3952, "step": 5804 }, { "epoch": 0.9934964915283245, "grad_norm": 
22.453025817871094, "learning_rate": 2.8732699571609594e-05, "loss": 2.323, "step": 5805 }, { "epoch": 0.9936676364881054, "grad_norm": 26.30789566040039, "learning_rate": 2.8728060469328404e-05, "loss": 2.5883, "step": 5806 }, { "epoch": 0.9938387814478864, "grad_norm": 19.208297729492188, "learning_rate": 2.8723413267471086e-05, "loss": 1.9291, "step": 5807 }, { "epoch": 0.9940099264076673, "grad_norm": 6.7294206619262695, "learning_rate": 2.8718757968779503e-05, "loss": 0.546, "step": 5808 }, { "epoch": 0.9941810713674483, "grad_norm": 3.1863274574279785, "learning_rate": 2.871409457600028e-05, "loss": 0.2827, "step": 5809 }, { "epoch": 0.9943522163272291, "grad_norm": 15.646027565002441, "learning_rate": 2.8709423091884836e-05, "loss": 1.2538, "step": 5810 }, { "epoch": 0.9945233612870101, "grad_norm": 4.021656513214111, "learning_rate": 2.8704743519189347e-05, "loss": 0.3942, "step": 5811 }, { "epoch": 0.994694506246791, "grad_norm": 6.282193183898926, "learning_rate": 2.8700055860674765e-05, "loss": 0.5794, "step": 5812 }, { "epoch": 0.994865651206572, "grad_norm": 31.727582931518555, "learning_rate": 2.8695360119106822e-05, "loss": 1.9632, "step": 5813 }, { "epoch": 0.9950367961663529, "grad_norm": 62.91410827636719, "learning_rate": 2.8690656297256014e-05, "loss": 6.5346, "step": 5814 }, { "epoch": 0.9952079411261339, "grad_norm": 131.5074005126953, "learning_rate": 2.868594439789759e-05, "loss": 2.5371, "step": 5815 }, { "epoch": 0.9953790860859147, "grad_norm": 9.428857803344727, "learning_rate": 2.8681224423811595e-05, "loss": 0.8309, "step": 5816 }, { "epoch": 0.9955502310456957, "grad_norm": 6.470557689666748, "learning_rate": 2.8676496377782808e-05, "loss": 0.498, "step": 5817 }, { "epoch": 0.9957213760054766, "grad_norm": 138.75096130371094, "learning_rate": 2.867176026260079e-05, "loss": 2.177, "step": 5818 }, { "epoch": 0.9958925209652576, "grad_norm": 24.097305297851562, "learning_rate": 2.8667016081059853e-05, "loss": 5.9194, "step": 5819 }, { 
"epoch": 0.9960636659250385, "grad_norm": 26.96247673034668, "learning_rate": 2.866226383595907e-05, "loss": 5.791, "step": 5820 }, { "epoch": 0.9962348108848195, "grad_norm": 18.310457229614258, "learning_rate": 2.865750353010227e-05, "loss": 1.7159, "step": 5821 }, { "epoch": 0.9964059558446003, "grad_norm": 10.174948692321777, "learning_rate": 2.8652735166298053e-05, "loss": 0.6524, "step": 5822 }, { "epoch": 0.9965771008043813, "grad_norm": 14.041560173034668, "learning_rate": 2.864795874735975e-05, "loss": 1.3189, "step": 5823 }, { "epoch": 0.9967482457641622, "grad_norm": 25.18544578552246, "learning_rate": 2.8643174276105456e-05, "loss": 2.4232, "step": 5824 }, { "epoch": 0.9969193907239432, "grad_norm": 15.618341445922852, "learning_rate": 2.863838175535802e-05, "loss": 0.35, "step": 5825 }, { "epoch": 0.9970905356837241, "grad_norm": 7.583703517913818, "learning_rate": 2.8633581187945035e-05, "loss": 0.9977, "step": 5826 }, { "epoch": 0.9972616806435051, "grad_norm": 30.84953498840332, "learning_rate": 2.862877257669884e-05, "loss": 5.2481, "step": 5827 }, { "epoch": 0.9974328256032859, "grad_norm": 1.524729609489441, "learning_rate": 2.8623955924456532e-05, "loss": 0.1755, "step": 5828 }, { "epoch": 0.9976039705630669, "grad_norm": 13.6453857421875, "learning_rate": 2.8619131234059926e-05, "loss": 1.3664, "step": 5829 }, { "epoch": 0.9977751155228478, "grad_norm": 48.88385009765625, "learning_rate": 2.8614298508355615e-05, "loss": 1.7573, "step": 5830 }, { "epoch": 0.9979462604826288, "grad_norm": 3.5198447704315186, "learning_rate": 2.8609457750194903e-05, "loss": 0.3419, "step": 5831 }, { "epoch": 0.9981174054424097, "grad_norm": 11.469450950622559, "learning_rate": 2.8604608962433847e-05, "loss": 0.8417, "step": 5832 }, { "epoch": 0.9982885504021907, "grad_norm": 13.271893501281738, "learning_rate": 2.859975214793324e-05, "loss": 1.218, "step": 5833 }, { "epoch": 0.9984596953619715, "grad_norm": 82.31917572021484, "learning_rate": 
2.859488730955861e-05, "loss": 6.6491, "step": 5834 }, { "epoch": 0.9986308403217525, "grad_norm": 7.952483654022217, "learning_rate": 2.8590014450180218e-05, "loss": 0.4058, "step": 5835 }, { "epoch": 0.9988019852815334, "grad_norm": 1.028527855873108, "learning_rate": 2.858513357267306e-05, "loss": 0.1751, "step": 5836 }, { "epoch": 0.9989731302413144, "grad_norm": 10.787403106689453, "learning_rate": 2.858024467991686e-05, "loss": 0.5362, "step": 5837 }, { "epoch": 0.9991442752010953, "grad_norm": 17.176511764526367, "learning_rate": 2.8575347774796066e-05, "loss": 1.5915, "step": 5838 }, { "epoch": 0.9993154201608763, "grad_norm": 1.1093616485595703, "learning_rate": 2.857044286019987e-05, "loss": 0.1634, "step": 5839 }, { "epoch": 0.9994865651206571, "grad_norm": 118.06546020507812, "learning_rate": 2.8565529939022174e-05, "loss": 5.0138, "step": 5840 }, { "epoch": 0.9996577100804381, "grad_norm": 9.499468803405762, "learning_rate": 2.856060901416161e-05, "loss": 0.4036, "step": 5841 }, { "epoch": 0.999828855040219, "grad_norm": 8.388986587524414, "learning_rate": 2.8555680088521526e-05, "loss": 0.6442, "step": 5842 }, { "epoch": 1.0, "grad_norm": 7.313854217529297, "learning_rate": 2.855074316501e-05, "loss": 0.3924, "step": 5843 }, { "epoch": 1.000171144959781, "grad_norm": 17.557804107666016, "learning_rate": 2.8545798246539824e-05, "loss": 1.6257, "step": 5844 }, { "epoch": 1.000342289919562, "grad_norm": 21.283592224121094, "learning_rate": 2.8540845336028503e-05, "loss": 1.7384, "step": 5845 }, { "epoch": 1.0005134348793427, "grad_norm": 40.37034606933594, "learning_rate": 2.8535884436398268e-05, "loss": 1.7387, "step": 5846 }, { "epoch": 1.0006845798391237, "grad_norm": 11.7813081741333, "learning_rate": 2.8530915550576057e-05, "loss": 0.9925, "step": 5847 }, { "epoch": 1.0008557247989047, "grad_norm": 30.10382843017578, "learning_rate": 2.8525938681493515e-05, "loss": 6.0054, "step": 5848 }, { "epoch": 1.0010268697586857, "grad_norm": 
23.582857131958008, "learning_rate": 2.8520953832087005e-05, "loss": 2.8579, "step": 5849 }, { "epoch": 1.0011980147184665, "grad_norm": 5.103301048278809, "learning_rate": 2.8515961005297594e-05, "loss": 0.45, "step": 5850 }, { "epoch": 1.0013691596782475, "grad_norm": 148.56295776367188, "learning_rate": 2.851096020407106e-05, "loss": 9.776, "step": 5851 }, { "epoch": 1.0015403046380285, "grad_norm": 21.1070499420166, "learning_rate": 2.850595143135788e-05, "loss": 1.6187, "step": 5852 }, { "epoch": 1.0017114495978094, "grad_norm": 0.8959584832191467, "learning_rate": 2.850093469011324e-05, "loss": 0.1585, "step": 5853 }, { "epoch": 1.0018825945575902, "grad_norm": 27.73525619506836, "learning_rate": 2.8495909983297022e-05, "loss": 1.3172, "step": 5854 }, { "epoch": 1.0020537395173712, "grad_norm": 8.246828079223633, "learning_rate": 2.8490877313873814e-05, "loss": 0.4995, "step": 5855 }, { "epoch": 1.0022248844771522, "grad_norm": 26.056087493896484, "learning_rate": 2.8485836684812896e-05, "loss": 2.0727, "step": 5856 }, { "epoch": 1.0023960294369332, "grad_norm": 15.057696342468262, "learning_rate": 2.848078809908825e-05, "loss": 1.5215, "step": 5857 }, { "epoch": 1.002567174396714, "grad_norm": 8.138750076293945, "learning_rate": 2.8475731559678545e-05, "loss": 0.7365, "step": 5858 }, { "epoch": 1.002738319356495, "grad_norm": 6.88981819152832, "learning_rate": 2.8470667069567146e-05, "loss": 0.5962, "step": 5859 }, { "epoch": 1.002909464316276, "grad_norm": 22.670846939086914, "learning_rate": 2.8465594631742116e-05, "loss": 2.5846, "step": 5860 }, { "epoch": 1.003080609276057, "grad_norm": 8.0685453414917, "learning_rate": 2.846051424919619e-05, "loss": 1.0937, "step": 5861 }, { "epoch": 1.0032517542358377, "grad_norm": 26.379934310913086, "learning_rate": 2.8455425924926812e-05, "loss": 2.7335, "step": 5862 }, { "epoch": 1.0034228991956187, "grad_norm": 12.378397941589355, "learning_rate": 2.84503296619361e-05, "loss": 1.1461, "step": 5863 }, { "epoch": 
1.0035940441553997, "grad_norm": 27.61152458190918, "learning_rate": 2.8445225463230852e-05, "loss": 2.5595, "step": 5864 }, { "epoch": 1.0037651891151806, "grad_norm": 27.199756622314453, "learning_rate": 2.8440113331822553e-05, "loss": 4.2857, "step": 5865 }, { "epoch": 1.0039363340749614, "grad_norm": 5.878708362579346, "learning_rate": 2.843499327072737e-05, "loss": 0.4147, "step": 5866 }, { "epoch": 1.0041074790347424, "grad_norm": 15.698190689086914, "learning_rate": 2.8429865282966146e-05, "loss": 1.2913, "step": 5867 }, { "epoch": 1.0042786239945234, "grad_norm": 18.01776123046875, "learning_rate": 2.8424729371564404e-05, "loss": 1.56, "step": 5868 }, { "epoch": 1.0044497689543044, "grad_norm": 15.176900863647461, "learning_rate": 2.8419585539552334e-05, "loss": 1.3356, "step": 5869 }, { "epoch": 1.0046209139140851, "grad_norm": 21.878238677978516, "learning_rate": 2.841443378996481e-05, "loss": 2.5368, "step": 5870 }, { "epoch": 1.0047920588738661, "grad_norm": 101.53121948242188, "learning_rate": 2.840927412584137e-05, "loss": 6.9461, "step": 5871 }, { "epoch": 1.0049632038336471, "grad_norm": 16.709596633911133, "learning_rate": 2.840410655022622e-05, "loss": 1.3958, "step": 5872 }, { "epoch": 1.0051343487934281, "grad_norm": 14.55434799194336, "learning_rate": 2.8398931066168244e-05, "loss": 1.1612, "step": 5873 }, { "epoch": 1.0053054937532089, "grad_norm": 3.6406946182250977, "learning_rate": 2.839374767672098e-05, "loss": 0.3982, "step": 5874 }, { "epoch": 1.0054766387129899, "grad_norm": 14.200058937072754, "learning_rate": 2.8388556384942638e-05, "loss": 1.1531, "step": 5875 }, { "epoch": 1.0056477836727709, "grad_norm": 6.786914825439453, "learning_rate": 2.8383357193896086e-05, "loss": 0.4971, "step": 5876 }, { "epoch": 1.0058189286325518, "grad_norm": 18.153135299682617, "learning_rate": 2.8378150106648857e-05, "loss": 1.4036, "step": 5877 }, { "epoch": 1.0059900735923326, "grad_norm": 19.425262451171875, "learning_rate": 2.837293512627314e-05, 
"loss": 1.226, "step": 5878 }, { "epoch": 1.0061612185521136, "grad_norm": 26.178579330444336, "learning_rate": 2.8367712255845776e-05, "loss": 2.1104, "step": 5879 }, { "epoch": 1.0063323635118946, "grad_norm": 21.157712936401367, "learning_rate": 2.8362481498448274e-05, "loss": 1.9276, "step": 5880 }, { "epoch": 1.0065035084716756, "grad_norm": 9.509711265563965, "learning_rate": 2.8357242857166787e-05, "loss": 0.6173, "step": 5881 }, { "epoch": 1.0066746534314563, "grad_norm": 22.75120735168457, "learning_rate": 2.8351996335092114e-05, "loss": 2.034, "step": 5882 }, { "epoch": 1.0068457983912373, "grad_norm": 14.869056701660156, "learning_rate": 2.8346741935319716e-05, "loss": 1.3493, "step": 5883 }, { "epoch": 1.0070169433510183, "grad_norm": 21.23749351501465, "learning_rate": 2.8341479660949704e-05, "loss": 1.853, "step": 5884 }, { "epoch": 1.0071880883107993, "grad_norm": 24.229785919189453, "learning_rate": 2.8336209515086813e-05, "loss": 2.0553, "step": 5885 }, { "epoch": 1.00735923327058, "grad_norm": 20.623104095458984, "learning_rate": 2.8330931500840446e-05, "loss": 2.5378, "step": 5886 }, { "epoch": 1.007530378230361, "grad_norm": 17.120683670043945, "learning_rate": 2.8325645621324642e-05, "loss": 1.6522, "step": 5887 }, { "epoch": 1.007701523190142, "grad_norm": 21.083953857421875, "learning_rate": 2.8320351879658065e-05, "loss": 2.0134, "step": 5888 }, { "epoch": 1.007872668149923, "grad_norm": 15.527298927307129, "learning_rate": 2.8315050278964046e-05, "loss": 1.5302, "step": 5889 }, { "epoch": 1.0080438131097038, "grad_norm": 25.979265213012695, "learning_rate": 2.830974082237053e-05, "loss": 1.8001, "step": 5890 }, { "epoch": 1.0082149580694848, "grad_norm": 19.79598045349121, "learning_rate": 2.8304423513010098e-05, "loss": 1.5617, "step": 5891 }, { "epoch": 1.0083861030292658, "grad_norm": 15.19708251953125, "learning_rate": 2.8299098354019984e-05, "loss": 1.3218, "step": 5892 }, { "epoch": 1.0085572479890468, "grad_norm": 16.328197479248047, 
"learning_rate": 2.8293765348542028e-05, "loss": 1.3238, "step": 5893 }, { "epoch": 1.0087283929488278, "grad_norm": 14.854755401611328, "learning_rate": 2.8288424499722717e-05, "loss": 1.0872, "step": 5894 }, { "epoch": 1.0088995379086085, "grad_norm": 14.211106300354004, "learning_rate": 2.828307581071316e-05, "loss": 1.2177, "step": 5895 }, { "epoch": 1.0090706828683895, "grad_norm": 34.42771911621094, "learning_rate": 2.827771928466909e-05, "loss": 5.7955, "step": 5896 }, { "epoch": 1.0092418278281705, "grad_norm": 25.623085021972656, "learning_rate": 2.8272354924750864e-05, "loss": 2.0299, "step": 5897 }, { "epoch": 1.0094129727879515, "grad_norm": 14.502741813659668, "learning_rate": 2.8266982734123462e-05, "loss": 1.0217, "step": 5898 }, { "epoch": 1.0095841177477323, "grad_norm": 7.936575889587402, "learning_rate": 2.826160271595649e-05, "loss": 0.896, "step": 5899 }, { "epoch": 1.0097552627075133, "grad_norm": 7.099348545074463, "learning_rate": 2.8256214873424163e-05, "loss": 0.6054, "step": 5900 }, { "epoch": 1.0099264076672942, "grad_norm": 23.936155319213867, "learning_rate": 2.8250819209705313e-05, "loss": 1.9715, "step": 5901 }, { "epoch": 1.0100975526270752, "grad_norm": 29.676971435546875, "learning_rate": 2.8245415727983395e-05, "loss": 1.3108, "step": 5902 }, { "epoch": 1.010268697586856, "grad_norm": 8.874768257141113, "learning_rate": 2.8240004431446472e-05, "loss": 0.5545, "step": 5903 }, { "epoch": 1.010439842546637, "grad_norm": 26.877277374267578, "learning_rate": 2.823458532328721e-05, "loss": 5.8937, "step": 5904 }, { "epoch": 1.010610987506418, "grad_norm": 17.6776123046875, "learning_rate": 2.822915840670289e-05, "loss": 1.5149, "step": 5905 }, { "epoch": 1.010782132466199, "grad_norm": 100.99117279052734, "learning_rate": 2.8223723684895413e-05, "loss": 8.8381, "step": 5906 }, { "epoch": 1.0109532774259797, "grad_norm": 12.721230506896973, "learning_rate": 2.8218281161071265e-05, "loss": 1.1296, "step": 5907 }, { "epoch": 
1.0111244223857607, "grad_norm": 20.310392379760742, "learning_rate": 2.8212830838441544e-05, "loss": 1.5951, "step": 5908 }, { "epoch": 1.0112955673455417, "grad_norm": 24.384496688842773, "learning_rate": 2.8207372720221944e-05, "loss": 1.6431, "step": 5909 }, { "epoch": 1.0114667123053227, "grad_norm": 10.639677047729492, "learning_rate": 2.820190680963277e-05, "loss": 1.4176, "step": 5910 }, { "epoch": 1.0116378572651035, "grad_norm": 4.812514305114746, "learning_rate": 2.8196433109898917e-05, "loss": 0.4569, "step": 5911 }, { "epoch": 1.0118090022248845, "grad_norm": 15.104774475097656, "learning_rate": 2.819095162424987e-05, "loss": 0.9118, "step": 5912 }, { "epoch": 1.0119801471846654, "grad_norm": 16.543947219848633, "learning_rate": 2.8185462355919717e-05, "loss": 1.4082, "step": 5913 }, { "epoch": 1.0121512921444464, "grad_norm": 10.53313159942627, "learning_rate": 2.817996530814714e-05, "loss": 0.6856, "step": 5914 }, { "epoch": 1.0123224371042272, "grad_norm": 5.993205547332764, "learning_rate": 2.8174460484175396e-05, "loss": 0.4012, "step": 5915 }, { "epoch": 1.0124935820640082, "grad_norm": 28.994531631469727, "learning_rate": 2.8168947887252344e-05, "loss": 5.966, "step": 5916 }, { "epoch": 1.0126647270237892, "grad_norm": 16.274168014526367, "learning_rate": 2.8163427520630427e-05, "loss": 1.4668, "step": 5917 }, { "epoch": 1.0128358719835702, "grad_norm": 19.380239486694336, "learning_rate": 2.815789938756666e-05, "loss": 1.4762, "step": 5918 }, { "epoch": 1.013007016943351, "grad_norm": 11.244359016418457, "learning_rate": 2.8152363491322658e-05, "loss": 0.8843, "step": 5919 }, { "epoch": 1.013178161903132, "grad_norm": 22.26492691040039, "learning_rate": 2.814681983516461e-05, "loss": 2.1597, "step": 5920 }, { "epoch": 1.013349306862913, "grad_norm": 19.49887466430664, "learning_rate": 2.8141268422363276e-05, "loss": 1.7275, "step": 5921 }, { "epoch": 1.013520451822694, "grad_norm": 22.942861557006836, "learning_rate": 2.8135709256194e-05, 
"loss": 2.0654, "step": 5922 }, { "epoch": 1.0136915967824747, "grad_norm": 19.50592803955078, "learning_rate": 2.81301423399367e-05, "loss": 1.7861, "step": 5923 }, { "epoch": 1.0138627417422557, "grad_norm": 16.96805763244629, "learning_rate": 2.8124567676875854e-05, "loss": 1.3677, "step": 5924 }, { "epoch": 1.0140338867020366, "grad_norm": 28.91179084777832, "learning_rate": 2.8118985270300535e-05, "loss": 2.7091, "step": 5925 }, { "epoch": 1.0142050316618176, "grad_norm": 30.7325496673584, "learning_rate": 2.811339512350437e-05, "loss": 4.6992, "step": 5926 }, { "epoch": 1.0143761766215984, "grad_norm": 7.521573543548584, "learning_rate": 2.8107797239785545e-05, "loss": 0.6563, "step": 5927 }, { "epoch": 1.0145473215813794, "grad_norm": 8.949661254882812, "learning_rate": 2.8102191622446825e-05, "loss": 0.7485, "step": 5928 }, { "epoch": 1.0147184665411604, "grad_norm": 17.26093292236328, "learning_rate": 2.8096578274795534e-05, "loss": 1.5096, "step": 5929 }, { "epoch": 1.0148896115009414, "grad_norm": 15.956555366516113, "learning_rate": 2.809095720014356e-05, "loss": 1.6165, "step": 5930 }, { "epoch": 1.0150607564607221, "grad_norm": 27.52875518798828, "learning_rate": 2.8085328401807334e-05, "loss": 3.6796, "step": 5931 }, { "epoch": 1.0152319014205031, "grad_norm": 11.933248519897461, "learning_rate": 2.8079691883107857e-05, "loss": 1.0633, "step": 5932 }, { "epoch": 1.0154030463802841, "grad_norm": 20.19046974182129, "learning_rate": 2.807404764737069e-05, "loss": 1.8556, "step": 5933 }, { "epoch": 1.015574191340065, "grad_norm": 17.771652221679688, "learning_rate": 2.806839569792594e-05, "loss": 1.6362, "step": 5934 }, { "epoch": 1.0157453362998459, "grad_norm": 21.638553619384766, "learning_rate": 2.8062736038108263e-05, "loss": 2.0634, "step": 5935 }, { "epoch": 1.0159164812596269, "grad_norm": 9.296874046325684, "learning_rate": 2.805706867125687e-05, "loss": 0.6169, "step": 5936 }, { "epoch": 1.0160876262194078, "grad_norm": 36.52355194091797, 
"learning_rate": 2.8051393600715507e-05, "loss": 5.7418, "step": 5937 }, { "epoch": 1.0162587711791888, "grad_norm": 20.66568946838379, "learning_rate": 2.8045710829832482e-05, "loss": 1.9262, "step": 5938 }, { "epoch": 1.0164299161389696, "grad_norm": 16.43720245361328, "learning_rate": 2.804002036196064e-05, "loss": 1.259, "step": 5939 }, { "epoch": 1.0166010610987506, "grad_norm": 11.791168212890625, "learning_rate": 2.8034322200457354e-05, "loss": 0.7025, "step": 5940 }, { "epoch": 1.0167722060585316, "grad_norm": 3.836411476135254, "learning_rate": 2.802861634868456e-05, "loss": 0.3997, "step": 5941 }, { "epoch": 1.0169433510183126, "grad_norm": 7.429993152618408, "learning_rate": 2.8022902810008718e-05, "loss": 0.5388, "step": 5942 }, { "epoch": 1.0171144959780933, "grad_norm": 16.87652587890625, "learning_rate": 2.8017181587800816e-05, "loss": 1.3096, "step": 5943 }, { "epoch": 1.0172856409378743, "grad_norm": 22.756383895874023, "learning_rate": 2.801145268543639e-05, "loss": 2.1821, "step": 5944 }, { "epoch": 1.0174567858976553, "grad_norm": 3.1953980922698975, "learning_rate": 2.8005716106295502e-05, "loss": 0.3136, "step": 5945 }, { "epoch": 1.0176279308574363, "grad_norm": 19.231718063354492, "learning_rate": 2.799997185376274e-05, "loss": 2.0442, "step": 5946 }, { "epoch": 1.0177990758172173, "grad_norm": 15.215984344482422, "learning_rate": 2.7994219931227218e-05, "loss": 1.3251, "step": 5947 }, { "epoch": 1.017970220776998, "grad_norm": 19.25925064086914, "learning_rate": 2.7988460342082582e-05, "loss": 1.6565, "step": 5948 }, { "epoch": 1.018141365736779, "grad_norm": 22.894515991210938, "learning_rate": 2.7982693089727e-05, "loss": 2.3551, "step": 5949 }, { "epoch": 1.01831251069656, "grad_norm": 27.23731803894043, "learning_rate": 2.7976918177563157e-05, "loss": 2.5454, "step": 5950 }, { "epoch": 1.018483655656341, "grad_norm": 26.405460357666016, "learning_rate": 2.797113560899826e-05, "loss": 2.8609, "step": 5951 }, { "epoch": 
1.0186548006161218, "grad_norm": 15.821601867675781, "learning_rate": 2.7965345387444035e-05, "loss": 1.1841, "step": 5952 }, { "epoch": 1.0188259455759028, "grad_norm": 23.906511306762695, "learning_rate": 2.7959547516316723e-05, "loss": 1.3018, "step": 5953 }, { "epoch": 1.0189970905356838, "grad_norm": 26.260948181152344, "learning_rate": 2.7953741999037074e-05, "loss": 2.1539, "step": 5954 }, { "epoch": 1.0191682354954648, "grad_norm": 15.948780059814453, "learning_rate": 2.7947928839030343e-05, "loss": 1.423, "step": 5955 }, { "epoch": 1.0193393804552455, "grad_norm": 21.961246490478516, "learning_rate": 2.794210803972632e-05, "loss": 1.9959, "step": 5956 }, { "epoch": 1.0195105254150265, "grad_norm": 20.09709358215332, "learning_rate": 2.7936279604559274e-05, "loss": 1.9793, "step": 5957 }, { "epoch": 1.0196816703748075, "grad_norm": 14.262060165405273, "learning_rate": 2.793044353696799e-05, "loss": 1.1946, "step": 5958 }, { "epoch": 1.0198528153345885, "grad_norm": 20.70449447631836, "learning_rate": 2.7924599840395764e-05, "loss": 1.7147, "step": 5959 }, { "epoch": 1.0200239602943693, "grad_norm": 9.10728645324707, "learning_rate": 2.791874851829038e-05, "loss": 1.2659, "step": 5960 }, { "epoch": 1.0201951052541502, "grad_norm": 14.673197746276855, "learning_rate": 2.791288957410413e-05, "loss": 1.3024, "step": 5961 }, { "epoch": 1.0203662502139312, "grad_norm": 19.009380340576172, "learning_rate": 2.7907023011293794e-05, "loss": 1.7915, "step": 5962 }, { "epoch": 1.0205373951737122, "grad_norm": 22.03476333618164, "learning_rate": 2.790114883332066e-05, "loss": 1.9748, "step": 5963 }, { "epoch": 1.020708540133493, "grad_norm": 24.187057495117188, "learning_rate": 2.7895267043650498e-05, "loss": 2.6434, "step": 5964 }, { "epoch": 1.020879685093274, "grad_norm": 15.004847526550293, "learning_rate": 2.7889377645753573e-05, "loss": 1.1828, "step": 5965 }, { "epoch": 1.021050830053055, "grad_norm": 21.534664154052734, "learning_rate": 2.7883480643104636e-05, 
"loss": 2.6369, "step": 5966 }, { "epoch": 1.021221975012836, "grad_norm": 16.689029693603516, "learning_rate": 2.7877576039182934e-05, "loss": 1.5116, "step": 5967 }, { "epoch": 1.0213931199726167, "grad_norm": 18.7231502532959, "learning_rate": 2.7871663837472186e-05, "loss": 1.4577, "step": 5968 }, { "epoch": 1.0215642649323977, "grad_norm": 21.580528259277344, "learning_rate": 2.7865744041460606e-05, "loss": 2.8208, "step": 5969 }, { "epoch": 1.0217354098921787, "grad_norm": 5.064426422119141, "learning_rate": 2.7859816654640876e-05, "loss": 0.3887, "step": 5970 }, { "epoch": 1.0219065548519597, "grad_norm": 12.983659744262695, "learning_rate": 2.7853881680510165e-05, "loss": 1.1746, "step": 5971 }, { "epoch": 1.0220776998117405, "grad_norm": 5.966195106506348, "learning_rate": 2.784793912257012e-05, "loss": 0.4734, "step": 5972 }, { "epoch": 1.0222488447715214, "grad_norm": 14.946609497070312, "learning_rate": 2.784198898432685e-05, "loss": 1.2327, "step": 5973 }, { "epoch": 1.0224199897313024, "grad_norm": 17.650800704956055, "learning_rate": 2.7836031269290958e-05, "loss": 1.3377, "step": 5974 }, { "epoch": 1.0225911346910834, "grad_norm": 0.8784258365631104, "learning_rate": 2.7830065980977493e-05, "loss": 0.1799, "step": 5975 }, { "epoch": 1.0227622796508642, "grad_norm": 1.385658621788025, "learning_rate": 2.7824093122905993e-05, "loss": 0.1917, "step": 5976 }, { "epoch": 1.0229334246106452, "grad_norm": 22.822864532470703, "learning_rate": 2.7818112698600452e-05, "loss": 2.8015, "step": 5977 }, { "epoch": 1.0231045695704262, "grad_norm": 16.914342880249023, "learning_rate": 2.7812124711589323e-05, "loss": 1.241, "step": 5978 }, { "epoch": 1.0232757145302072, "grad_norm": 15.315549850463867, "learning_rate": 2.7806129165405535e-05, "loss": 1.1594, "step": 5979 }, { "epoch": 1.023446859489988, "grad_norm": 43.14786148071289, "learning_rate": 2.780012606358646e-05, "loss": 1.5922, "step": 5980 }, { "epoch": 1.023618004449769, "grad_norm": 
19.019437789916992, "learning_rate": 2.7794115409673942e-05, "loss": 2.0881, "step": 5981 }, { "epoch": 1.02378914940955, "grad_norm": 28.388994216918945, "learning_rate": 2.778809720721428e-05, "loss": 3.914, "step": 5982 }, { "epoch": 1.023960294369331, "grad_norm": 27.33300018310547, "learning_rate": 2.7782071459758215e-05, "loss": 3.924, "step": 5983 }, { "epoch": 1.0241314393291117, "grad_norm": 14.441838264465332, "learning_rate": 2.7776038170860952e-05, "loss": 1.2082, "step": 5984 }, { "epoch": 1.0243025842888926, "grad_norm": 16.712003707885742, "learning_rate": 2.776999734408214e-05, "loss": 2.1091, "step": 5985 }, { "epoch": 1.0244737292486736, "grad_norm": 20.41153335571289, "learning_rate": 2.776394898298587e-05, "loss": 1.7813, "step": 5986 }, { "epoch": 1.0246448742084546, "grad_norm": 11.618141174316406, "learning_rate": 2.7757893091140692e-05, "loss": 1.3035, "step": 5987 }, { "epoch": 1.0248160191682354, "grad_norm": 17.35718536376953, "learning_rate": 2.775182967211959e-05, "loss": 1.7141, "step": 5988 }, { "epoch": 1.0249871641280164, "grad_norm": 16.45880699157715, "learning_rate": 2.7745758729499983e-05, "loss": 1.7197, "step": 5989 }, { "epoch": 1.0251583090877974, "grad_norm": 10.551615715026855, "learning_rate": 2.7739680266863744e-05, "loss": 1.0905, "step": 5990 }, { "epoch": 1.0253294540475784, "grad_norm": 26.532712936401367, "learning_rate": 2.7733594287797172e-05, "loss": 3.3279, "step": 5991 }, { "epoch": 1.0255005990073591, "grad_norm": 19.217823028564453, "learning_rate": 2.7727500795891007e-05, "loss": 1.7208, "step": 5992 }, { "epoch": 1.0256717439671401, "grad_norm": 12.649834632873535, "learning_rate": 2.7721399794740412e-05, "loss": 1.4955, "step": 5993 }, { "epoch": 1.025842888926921, "grad_norm": 32.663753509521484, "learning_rate": 2.771529128794499e-05, "loss": 5.9865, "step": 5994 }, { "epoch": 1.026014033886702, "grad_norm": 14.022912979125977, "learning_rate": 2.770917527910877e-05, "loss": 1.1403, "step": 5995 }, { 
"epoch": 1.0261851788464829, "grad_norm": 20.389646530151367, "learning_rate": 2.7703051771840205e-05, "loss": 1.536, "step": 5996 }, { "epoch": 1.0263563238062638, "grad_norm": 11.238503456115723, "learning_rate": 2.7696920769752176e-05, "loss": 0.8791, "step": 5997 }, { "epoch": 1.0265274687660448, "grad_norm": 22.10092544555664, "learning_rate": 2.7690782276461976e-05, "loss": 1.9444, "step": 5998 }, { "epoch": 1.0266986137258258, "grad_norm": 23.456623077392578, "learning_rate": 2.768463629559134e-05, "loss": 3.2625, "step": 5999 }, { "epoch": 1.0268697586856068, "grad_norm": 3.355825901031494, "learning_rate": 2.7678482830766388e-05, "loss": 0.3481, "step": 6000 }, { "epoch": 1.0270409036453876, "grad_norm": 3.933943510055542, "learning_rate": 2.7672321885617683e-05, "loss": 0.3642, "step": 6001 }, { "epoch": 1.0272120486051686, "grad_norm": 22.027666091918945, "learning_rate": 2.7666153463780192e-05, "loss": 3.2508, "step": 6002 }, { "epoch": 1.0273831935649496, "grad_norm": 28.67835807800293, "learning_rate": 2.7659977568893294e-05, "loss": 3.81, "step": 6003 }, { "epoch": 1.0275543385247305, "grad_norm": 18.57536506652832, "learning_rate": 2.765379420460077e-05, "loss": 1.6178, "step": 6004 }, { "epoch": 1.0277254834845113, "grad_norm": 17.354440689086914, "learning_rate": 2.7647603374550814e-05, "loss": 1.696, "step": 6005 }, { "epoch": 1.0278966284442923, "grad_norm": 22.169416427612305, "learning_rate": 2.7641405082396038e-05, "loss": 1.9527, "step": 6006 }, { "epoch": 1.0280677734040733, "grad_norm": 18.655746459960938, "learning_rate": 2.7635199331793434e-05, "loss": 1.5853, "step": 6007 }, { "epoch": 1.0282389183638543, "grad_norm": 13.884475708007812, "learning_rate": 2.7628986126404398e-05, "loss": 1.1785, "step": 6008 }, { "epoch": 1.028410063323635, "grad_norm": 5.888846397399902, "learning_rate": 2.762276546989474e-05, "loss": 0.5917, "step": 6009 }, { "epoch": 1.028581208283416, "grad_norm": 3.593951940536499, "learning_rate": 
2.7616537365934652e-05, "loss": 0.3687, "step": 6010 }, { "epoch": 1.028752353243197, "grad_norm": 22.454221725463867, "learning_rate": 2.761030181819873e-05, "loss": 2.4793, "step": 6011 }, { "epoch": 1.028923498202978, "grad_norm": 2.3373348712921143, "learning_rate": 2.7604058830365952e-05, "loss": 0.3585, "step": 6012 }, { "epoch": 1.0290946431627588, "grad_norm": 1.7826164960861206, "learning_rate": 2.759780840611969e-05, "loss": 0.2326, "step": 6013 }, { "epoch": 1.0292657881225398, "grad_norm": 29.18971824645996, "learning_rate": 2.7591550549147704e-05, "loss": 2.5937, "step": 6014 }, { "epoch": 1.0294369330823208, "grad_norm": 26.53842544555664, "learning_rate": 2.7585285263142143e-05, "loss": 5.4126, "step": 6015 }, { "epoch": 1.0296080780421017, "grad_norm": 20.951757431030273, "learning_rate": 2.7579012551799526e-05, "loss": 2.4848, "step": 6016 }, { "epoch": 1.0297792230018825, "grad_norm": 21.251033782958984, "learning_rate": 2.757273241882077e-05, "loss": 2.3906, "step": 6017 }, { "epoch": 1.0299503679616635, "grad_norm": 19.32236099243164, "learning_rate": 2.756644486791116e-05, "loss": 1.5559, "step": 6018 }, { "epoch": 1.0301215129214445, "grad_norm": 9.028992652893066, "learning_rate": 2.7560149902780358e-05, "loss": 1.0507, "step": 6019 }, { "epoch": 1.0302926578812255, "grad_norm": 3.18945574760437, "learning_rate": 2.7553847527142412e-05, "loss": 0.3364, "step": 6020 }, { "epoch": 1.0304638028410062, "grad_norm": 0.8448656797409058, "learning_rate": 2.7547537744715722e-05, "loss": 0.1704, "step": 6021 }, { "epoch": 1.0306349478007872, "grad_norm": 16.985177993774414, "learning_rate": 2.7541220559223072e-05, "loss": 1.6313, "step": 6022 }, { "epoch": 1.0308060927605682, "grad_norm": 17.1005802154541, "learning_rate": 2.7534895974391614e-05, "loss": 1.7317, "step": 6023 }, { "epoch": 1.0309772377203492, "grad_norm": 4.488323211669922, "learning_rate": 2.7528563993952863e-05, "loss": 0.386, "step": 6024 }, { "epoch": 1.03114838268013, "grad_norm": 
20.139062881469727, "learning_rate": 2.7522224621642692e-05, "loss": 2.5811, "step": 6025 }, { "epoch": 1.031319527639911, "grad_norm": 19.923547744750977, "learning_rate": 2.7515877861201348e-05, "loss": 2.1045, "step": 6026 }, { "epoch": 1.031490672599692, "grad_norm": 24.75724220275879, "learning_rate": 2.7509523716373417e-05, "loss": 2.4101, "step": 6027 }, { "epoch": 1.031661817559473, "grad_norm": 13.764646530151367, "learning_rate": 2.7503162190907868e-05, "loss": 1.1857, "step": 6028 }, { "epoch": 1.0318329625192537, "grad_norm": 0.987221896648407, "learning_rate": 2.7496793288557997e-05, "loss": 0.1712, "step": 6029 }, { "epoch": 1.0320041074790347, "grad_norm": 21.300790786743164, "learning_rate": 2.7490417013081475e-05, "loss": 2.4975, "step": 6030 }, { "epoch": 1.0321752524388157, "grad_norm": 17.621726989746094, "learning_rate": 2.7484033368240313e-05, "loss": 1.2443, "step": 6031 }, { "epoch": 1.0323463973985967, "grad_norm": 5.2010579109191895, "learning_rate": 2.7477642357800867e-05, "loss": 0.3915, "step": 6032 }, { "epoch": 1.0325175423583774, "grad_norm": 20.610048294067383, "learning_rate": 2.7471243985533842e-05, "loss": 2.4583, "step": 6033 }, { "epoch": 1.0326886873181584, "grad_norm": 16.952592849731445, "learning_rate": 2.7464838255214296e-05, "loss": 1.373, "step": 6034 }, { "epoch": 1.0328598322779394, "grad_norm": 20.06004524230957, "learning_rate": 2.745842517062161e-05, "loss": 1.711, "step": 6035 }, { "epoch": 1.0330309772377204, "grad_norm": 12.473008155822754, "learning_rate": 2.7452004735539523e-05, "loss": 0.9459, "step": 6036 }, { "epoch": 1.0332021221975012, "grad_norm": 15.336243629455566, "learning_rate": 2.7445576953756088e-05, "loss": 1.5641, "step": 6037 }, { "epoch": 1.0333732671572822, "grad_norm": 7.581173896789551, "learning_rate": 2.7439141829063718e-05, "loss": 0.4988, "step": 6038 }, { "epoch": 1.0335444121170632, "grad_norm": 4.762014389038086, "learning_rate": 2.7432699365259136e-05, "loss": 0.3329, "step": 6039 }, 
{ "epoch": 1.0337155570768441, "grad_norm": 10.778675079345703, "learning_rate": 2.742624956614341e-05, "loss": 0.9549, "step": 6040 }, { "epoch": 1.033886702036625, "grad_norm": 22.29326820373535, "learning_rate": 2.7419792435521935e-05, "loss": 1.9647, "step": 6041 }, { "epoch": 1.034057846996406, "grad_norm": 26.956926345825195, "learning_rate": 2.7413327977204426e-05, "loss": 2.3277, "step": 6042 }, { "epoch": 1.034228991956187, "grad_norm": 13.276328086853027, "learning_rate": 2.740685619500492e-05, "loss": 0.9316, "step": 6043 }, { "epoch": 1.0344001369159679, "grad_norm": 38.53458023071289, "learning_rate": 2.740037709274178e-05, "loss": 1.8594, "step": 6044 }, { "epoch": 1.0345712818757486, "grad_norm": 33.913692474365234, "learning_rate": 2.7393890674237685e-05, "loss": 1.6477, "step": 6045 }, { "epoch": 1.0347424268355296, "grad_norm": 18.45242691040039, "learning_rate": 2.7387396943319628e-05, "loss": 2.0229, "step": 6046 }, { "epoch": 1.0349135717953106, "grad_norm": 17.105792999267578, "learning_rate": 2.7380895903818927e-05, "loss": 1.2288, "step": 6047 }, { "epoch": 1.0350847167550916, "grad_norm": 26.542585372924805, "learning_rate": 2.7374387559571203e-05, "loss": 5.4436, "step": 6048 }, { "epoch": 1.0352558617148726, "grad_norm": 9.220451354980469, "learning_rate": 2.7367871914416383e-05, "loss": 1.0206, "step": 6049 }, { "epoch": 1.0354270066746534, "grad_norm": 18.158157348632812, "learning_rate": 2.736134897219872e-05, "loss": 2.1711, "step": 6050 }, { "epoch": 1.0355981516344344, "grad_norm": 11.473002433776855, "learning_rate": 2.7354818736766747e-05, "loss": 0.9204, "step": 6051 }, { "epoch": 1.0357692965942153, "grad_norm": 4.108072757720947, "learning_rate": 2.7348281211973317e-05, "loss": 0.5207, "step": 6052 }, { "epoch": 1.0359404415539963, "grad_norm": 26.6224308013916, "learning_rate": 2.7341736401675578e-05, "loss": 5.5512, "step": 6053 }, { "epoch": 1.036111586513777, "grad_norm": 17.62623405456543, "learning_rate": 
2.7335184309734983e-05, "loss": 1.689, "step": 6054 }, { "epoch": 1.036282731473558, "grad_norm": 11.85800552368164, "learning_rate": 2.732862494001727e-05, "loss": 1.0557, "step": 6055 }, { "epoch": 1.036453876433339, "grad_norm": 5.9652485847473145, "learning_rate": 2.7322058296392484e-05, "loss": 0.413, "step": 6056 }, { "epoch": 1.03662502139312, "grad_norm": 14.44044303894043, "learning_rate": 2.7315484382734947e-05, "loss": 1.4248, "step": 6057 }, { "epoch": 1.0367961663529008, "grad_norm": 7.914107799530029, "learning_rate": 2.730890320292328e-05, "loss": 0.7368, "step": 6058 }, { "epoch": 1.0369673113126818, "grad_norm": 22.765840530395508, "learning_rate": 2.730231476084039e-05, "loss": 2.4051, "step": 6059 }, { "epoch": 1.0371384562724628, "grad_norm": 22.832996368408203, "learning_rate": 2.7295719060373468e-05, "loss": 2.212, "step": 6060 }, { "epoch": 1.0373096012322438, "grad_norm": 15.181253433227539, "learning_rate": 2.7289116105413985e-05, "loss": 1.2104, "step": 6061 }, { "epoch": 1.0374807461920246, "grad_norm": 14.0794095993042, "learning_rate": 2.7282505899857695e-05, "loss": 1.2968, "step": 6062 }, { "epoch": 1.0376518911518056, "grad_norm": 4.96496057510376, "learning_rate": 2.7275888447604632e-05, "loss": 0.3762, "step": 6063 }, { "epoch": 1.0378230361115865, "grad_norm": 6.760622024536133, "learning_rate": 2.7269263752559102e-05, "loss": 0.4875, "step": 6064 }, { "epoch": 1.0379941810713675, "grad_norm": 19.93670654296875, "learning_rate": 2.7262631818629676e-05, "loss": 1.7135, "step": 6065 }, { "epoch": 1.0381653260311483, "grad_norm": 83.41024017333984, "learning_rate": 2.7255992649729222e-05, "loss": 6.0933, "step": 6066 }, { "epoch": 1.0383364709909293, "grad_norm": 13.883889198303223, "learning_rate": 2.7249346249774843e-05, "loss": 1.3731, "step": 6067 }, { "epoch": 1.0385076159507103, "grad_norm": 3.8786051273345947, "learning_rate": 2.7242692622687934e-05, "loss": 0.3786, "step": 6068 }, { "epoch": 1.0386787609104913, "grad_norm": 
4.438342571258545, "learning_rate": 2.723603177239415e-05, "loss": 0.3986, "step": 6069 }, { "epoch": 1.038849905870272, "grad_norm": 11.963251113891602, "learning_rate": 2.722936370282338e-05, "loss": 1.0745, "step": 6070 }, { "epoch": 1.039021050830053, "grad_norm": 13.7601957321167, "learning_rate": 2.7222688417909817e-05, "loss": 1.1668, "step": 6071 }, { "epoch": 1.039192195789834, "grad_norm": 23.49478530883789, "learning_rate": 2.7216005921591886e-05, "loss": 2.3047, "step": 6072 }, { "epoch": 1.039363340749615, "grad_norm": 20.574737548828125, "learning_rate": 2.7209316217812263e-05, "loss": 1.4941, "step": 6073 }, { "epoch": 1.0395344857093958, "grad_norm": 94.0860824584961, "learning_rate": 2.720261931051789e-05, "loss": 8.5413, "step": 6074 }, { "epoch": 1.0397056306691768, "grad_norm": 7.629838943481445, "learning_rate": 2.719591520365994e-05, "loss": 0.4664, "step": 6075 }, { "epoch": 1.0398767756289578, "grad_norm": 14.49278450012207, "learning_rate": 2.718920390119386e-05, "loss": 1.3559, "step": 6076 }, { "epoch": 1.0400479205887387, "grad_norm": 14.303519248962402, "learning_rate": 2.7182485407079323e-05, "loss": 1.189, "step": 6077 }, { "epoch": 1.0402190655485195, "grad_norm": 5.337088584899902, "learning_rate": 2.717575972528025e-05, "loss": 0.4424, "step": 6078 }, { "epoch": 1.0403902105083005, "grad_norm": 19.05851173400879, "learning_rate": 2.7169026859764806e-05, "loss": 1.741, "step": 6079 }, { "epoch": 1.0405613554680815, "grad_norm": 20.25058364868164, "learning_rate": 2.7162286814505385e-05, "loss": 2.3799, "step": 6080 }, { "epoch": 1.0407325004278625, "grad_norm": 23.0300350189209, "learning_rate": 2.7155539593478633e-05, "loss": 0.8189, "step": 6081 }, { "epoch": 1.0409036453876432, "grad_norm": 43.651771545410156, "learning_rate": 2.714878520066541e-05, "loss": 6.1888, "step": 6082 }, { "epoch": 1.0410747903474242, "grad_norm": 16.711088180541992, "learning_rate": 2.7142023640050826e-05, "loss": 1.1842, "step": 6083 }, { "epoch": 
1.0412459353072052, "grad_norm": 16.65302085876465, "learning_rate": 2.7135254915624213e-05, "loss": 1.3284, "step": 6084 }, { "epoch": 1.0414170802669862, "grad_norm": 14.885180473327637, "learning_rate": 2.712847903137912e-05, "loss": 1.1705, "step": 6085 }, { "epoch": 1.041588225226767, "grad_norm": 14.422985076904297, "learning_rate": 2.7121695991313332e-05, "loss": 1.3098, "step": 6086 }, { "epoch": 1.041759370186548, "grad_norm": 23.700284957885742, "learning_rate": 2.7114905799428863e-05, "loss": 1.958, "step": 6087 }, { "epoch": 1.041930515146329, "grad_norm": 17.216182708740234, "learning_rate": 2.710810845973192e-05, "loss": 1.7762, "step": 6088 }, { "epoch": 1.04210166010611, "grad_norm": 15.392316818237305, "learning_rate": 2.7101303976232955e-05, "loss": 1.3241, "step": 6089 }, { "epoch": 1.0422728050658907, "grad_norm": 18.830110549926758, "learning_rate": 2.709449235294662e-05, "loss": 1.5202, "step": 6090 }, { "epoch": 1.0424439500256717, "grad_norm": 1.7132902145385742, "learning_rate": 2.708767359389178e-05, "loss": 0.2298, "step": 6091 }, { "epoch": 1.0426150949854527, "grad_norm": 20.93419075012207, "learning_rate": 2.7080847703091514e-05, "loss": 2.1347, "step": 6092 }, { "epoch": 1.0427862399452337, "grad_norm": 9.760008811950684, "learning_rate": 2.7074014684573112e-05, "loss": 1.3687, "step": 6093 }, { "epoch": 1.0429573849050144, "grad_norm": 16.97105598449707, "learning_rate": 2.7067174542368064e-05, "loss": 1.5609, "step": 6094 }, { "epoch": 1.0431285298647954, "grad_norm": 18.94205665588379, "learning_rate": 2.7060327280512057e-05, "loss": 1.3384, "step": 6095 }, { "epoch": 1.0432996748245764, "grad_norm": 1.1413235664367676, "learning_rate": 2.7053472903044994e-05, "loss": 0.1765, "step": 6096 }, { "epoch": 1.0434708197843574, "grad_norm": 3.6719653606414795, "learning_rate": 2.7046611414010968e-05, "loss": 0.3251, "step": 6097 }, { "epoch": 1.0436419647441384, "grad_norm": 21.025039672851562, "learning_rate": 2.703974281745826e-05, 
"loss": 2.9787, "step": 6098 }, { "epoch": 1.0438131097039192, "grad_norm": 32.26662063598633, "learning_rate": 2.703286711743936e-05, "loss": 5.5577, "step": 6099 }, { "epoch": 1.0439842546637002, "grad_norm": 4.281724452972412, "learning_rate": 2.702598431801094e-05, "loss": 0.3655, "step": 6100 }, { "epoch": 1.0441553996234811, "grad_norm": 24.29343032836914, "learning_rate": 2.7019094423233853e-05, "loss": 1.8119, "step": 6101 }, { "epoch": 1.0443265445832621, "grad_norm": 22.842308044433594, "learning_rate": 2.701219743717316e-05, "loss": 2.8777, "step": 6102 }, { "epoch": 1.044497689543043, "grad_norm": 15.586821556091309, "learning_rate": 2.7005293363898085e-05, "loss": 1.6434, "step": 6103 }, { "epoch": 1.0446688345028239, "grad_norm": 23.474470138549805, "learning_rate": 2.6998382207482045e-05, "loss": 2.3843, "step": 6104 }, { "epoch": 1.0448399794626049, "grad_norm": 37.05857849121094, "learning_rate": 2.6991463972002633e-05, "loss": 5.6854, "step": 6105 }, { "epoch": 1.0450111244223859, "grad_norm": 25.402162551879883, "learning_rate": 2.6984538661541615e-05, "loss": 3.2815, "step": 6106 }, { "epoch": 1.0451822693821666, "grad_norm": 13.91854190826416, "learning_rate": 2.6977606280184937e-05, "loss": 1.1625, "step": 6107 }, { "epoch": 1.0453534143419476, "grad_norm": 28.982385635375977, "learning_rate": 2.6970666832022713e-05, "loss": 2.5425, "step": 6108 }, { "epoch": 1.0455245593017286, "grad_norm": 21.270137786865234, "learning_rate": 2.696372032114923e-05, "loss": 2.7429, "step": 6109 }, { "epoch": 1.0456957042615096, "grad_norm": 18.849096298217773, "learning_rate": 2.6956766751662936e-05, "loss": 2.0294, "step": 6110 }, { "epoch": 1.0458668492212904, "grad_norm": 23.60141944885254, "learning_rate": 2.694980612766645e-05, "loss": 2.7659, "step": 6111 }, { "epoch": 1.0460379941810714, "grad_norm": 14.735956192016602, "learning_rate": 2.6942838453266547e-05, "loss": 1.3023, "step": 6112 }, { "epoch": 1.0462091391408523, "grad_norm": 
0.9047157168388367, "learning_rate": 2.693586373257417e-05, "loss": 0.1675, "step": 6113 }, { "epoch": 1.0463802841006333, "grad_norm": 8.82510757446289, "learning_rate": 2.6928881969704416e-05, "loss": 0.6863, "step": 6114 }, { "epoch": 1.046551429060414, "grad_norm": 25.968671798706055, "learning_rate": 2.692189316877653e-05, "loss": 1.1619, "step": 6115 }, { "epoch": 1.046722574020195, "grad_norm": 18.711915969848633, "learning_rate": 2.6914897333913914e-05, "loss": 1.6528, "step": 6116 }, { "epoch": 1.046893718979976, "grad_norm": 15.643209457397461, "learning_rate": 2.6907894469244127e-05, "loss": 1.3218, "step": 6117 }, { "epoch": 1.047064863939757, "grad_norm": 22.364917755126953, "learning_rate": 2.6900884578898872e-05, "loss": 2.1141, "step": 6118 }, { "epoch": 1.0472360088995378, "grad_norm": 21.417695999145508, "learning_rate": 2.6893867667013983e-05, "loss": 2.0941, "step": 6119 }, { "epoch": 1.0474071538593188, "grad_norm": 14.733633995056152, "learning_rate": 2.6886843737729457e-05, "loss": 1.5188, "step": 6120 }, { "epoch": 1.0475782988190998, "grad_norm": 23.66362953186035, "learning_rate": 2.687981279518942e-05, "loss": 2.0821, "step": 6121 }, { "epoch": 1.0477494437788808, "grad_norm": 24.078081130981445, "learning_rate": 2.687277484354214e-05, "loss": 2.9703, "step": 6122 }, { "epoch": 1.0479205887386616, "grad_norm": 21.633882522583008, "learning_rate": 2.686572988694002e-05, "loss": 1.7227, "step": 6123 }, { "epoch": 1.0480917336984426, "grad_norm": 15.577491760253906, "learning_rate": 2.685867792953959e-05, "loss": 1.2526, "step": 6124 }, { "epoch": 1.0482628786582235, "grad_norm": 0.8374304175376892, "learning_rate": 2.685161897550152e-05, "loss": 0.165, "step": 6125 }, { "epoch": 1.0484340236180045, "grad_norm": 15.729249000549316, "learning_rate": 2.68445530289906e-05, "loss": 1.5607, "step": 6126 }, { "epoch": 1.0486051685777853, "grad_norm": 3.7719273567199707, "learning_rate": 2.6837480094175753e-05, "loss": 0.3735, "step": 6127 }, { 
"epoch": 1.0487763135375663, "grad_norm": 22.409833908081055, "learning_rate": 2.6830400175230022e-05, "loss": 2.0298, "step": 6128 }, { "epoch": 1.0489474584973473, "grad_norm": 27.7617244720459, "learning_rate": 2.682331327633056e-05, "loss": 3.3434, "step": 6129 }, { "epoch": 1.0491186034571283, "grad_norm": 20.888198852539062, "learning_rate": 2.681621940165866e-05, "loss": 2.0731, "step": 6130 }, { "epoch": 1.049289748416909, "grad_norm": 17.169727325439453, "learning_rate": 2.680911855539971e-05, "loss": 1.4024, "step": 6131 }, { "epoch": 1.04946089337669, "grad_norm": 32.940948486328125, "learning_rate": 2.6802010741743227e-05, "loss": 5.7171, "step": 6132 }, { "epoch": 1.049632038336471, "grad_norm": 6.877800464630127, "learning_rate": 2.6794895964882826e-05, "loss": 0.4265, "step": 6133 }, { "epoch": 1.049803183296252, "grad_norm": 31.221891403198242, "learning_rate": 2.678777422901623e-05, "loss": 1.3258, "step": 6134 }, { "epoch": 1.0499743282560328, "grad_norm": 18.083770751953125, "learning_rate": 2.6780645538345294e-05, "loss": 1.3423, "step": 6135 }, { "epoch": 1.0501454732158138, "grad_norm": 5.254927158355713, "learning_rate": 2.677350989707594e-05, "loss": 0.3645, "step": 6136 }, { "epoch": 1.0503166181755947, "grad_norm": 20.59516143798828, "learning_rate": 2.6766367309418206e-05, "loss": 2.2506, "step": 6137 }, { "epoch": 1.0504877631353757, "grad_norm": 1.0191950798034668, "learning_rate": 2.6759217779586237e-05, "loss": 0.1677, "step": 6138 }, { "epoch": 1.0506589080951565, "grad_norm": 123.47958374023438, "learning_rate": 2.675206131179826e-05, "loss": 9.1588, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_nli-pairs_loss": 1.7010418176651, "eval_nli-pairs_runtime": 4.3443, "eval_nli-pairs_samples_per_second": 46.038, "eval_nli-pairs_steps_per_second": 1.611, "eval_sts-test_pearson_cosine": 0.7625267011985226, "eval_sts-test_pearson_dot": 0.6259248949371231, "eval_sts-test_pearson_euclidean": 0.7619764658843026, 
"eval_sts-test_pearson_manhattan": 0.7680853132392583, "eval_sts-test_pearson_max": 0.7680853132392583, "eval_sts-test_spearman_cosine": 0.7625076199036728, "eval_sts-test_spearman_dot": 0.6009276916665572, "eval_sts-test_spearman_euclidean": 0.7523681562432721, "eval_sts-test_spearman_manhattan": 0.7608907210750292, "eval_sts-test_spearman_max": 0.7625076199036728, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_vitaminc-pairs_loss": 1.0161106586456299, "eval_vitaminc-pairs_runtime": 2.7397, "eval_vitaminc-pairs_samples_per_second": 73.0, "eval_vitaminc-pairs_steps_per_second": 2.555, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_qnli-contrastive_loss": 1.7974004745483398, "eval_qnli-contrastive_runtime": 0.6358, "eval_qnli-contrastive_samples_per_second": 314.542, "eval_qnli-contrastive_steps_per_second": 11.009, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_scitail-pairs-qa_loss": 0.1420755535364151, "eval_scitail-pairs-qa_runtime": 1.5961, "eval_scitail-pairs-qa_samples_per_second": 125.302, "eval_scitail-pairs-qa_steps_per_second": 4.386, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_scitail-pairs-pos_loss": 0.7034513354301453, "eval_scitail-pairs-pos_runtime": 2.627, "eval_scitail-pairs-pos_samples_per_second": 76.133, "eval_scitail-pairs-pos_steps_per_second": 2.665, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_xsum-pairs_loss": 0.8569247126579285, "eval_xsum-pairs_runtime": 2.6434, "eval_xsum-pairs_samples_per_second": 66.204, "eval_xsum-pairs_steps_per_second": 2.27, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_compression-pairs_loss": 0.3054618835449219, "eval_compression-pairs_runtime": 0.5142, "eval_compression-pairs_samples_per_second": 388.927, "eval_compression-pairs_steps_per_second": 13.612, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_sciq_pairs_loss": 0.5279684066772461, "eval_sciq_pairs_runtime": 9.1732, "eval_sciq_pairs_samples_per_second": 21.803, "eval_sciq_pairs_steps_per_second": 0.763, 
"step": 6139 }, { "epoch": 1.0506589080951565, "eval_qasc_pairs_loss": 5.473604679107666, "eval_qasc_pairs_runtime": 2.6467, "eval_qasc_pairs_samples_per_second": 75.564, "eval_qasc_pairs_steps_per_second": 2.645, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_openbookqa_pairs_loss": 2.757842779159546, "eval_openbookqa_pairs_runtime": 0.6388, "eval_openbookqa_pairs_samples_per_second": 108.01, "eval_openbookqa_pairs_steps_per_second": 4.696, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_msmarco_pairs_loss": 1.4087409973144531, "eval_msmarco_pairs_runtime": 3.9825, "eval_msmarco_pairs_samples_per_second": 50.22, "eval_msmarco_pairs_steps_per_second": 1.758, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_nq_pairs_loss": 1.5596331357955933, "eval_nq_pairs_runtime": 8.6663, "eval_nq_pairs_samples_per_second": 23.078, "eval_nq_pairs_steps_per_second": 0.808, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_trivia_pairs_loss": 1.9876388311386108, "eval_trivia_pairs_runtime": 12.8237, "eval_trivia_pairs_samples_per_second": 15.596, "eval_trivia_pairs_steps_per_second": 0.546, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_quora_pairs_loss": 0.2655409276485443, "eval_quora_pairs_runtime": 1.5911, "eval_quora_pairs_samples_per_second": 125.702, "eval_quora_pairs_steps_per_second": 4.4, "step": 6139 }, { "epoch": 1.0506589080951565, "eval_gooaq_pairs_loss": 1.0326839685440063, "eval_gooaq_pairs_runtime": 2.6507, "eval_gooaq_pairs_samples_per_second": 75.453, "eval_gooaq_pairs_steps_per_second": 2.641, "step": 6139 }, { "epoch": 1.0508300530549375, "grad_norm": 1.1939899921417236, "learning_rate": 2.674489791027661e-05, "loss": 0.1777, "step": 6140 }, { "epoch": 1.0510011980147185, "grad_norm": 7.697817802429199, "learning_rate": 2.6737727579247696e-05, "loss": 0.5119, "step": 6141 }, { "epoch": 1.0511723429744995, "grad_norm": 24.694753646850586, "learning_rate": 2.673055032294203e-05, "loss": 2.7351, "step": 6142 }, { "epoch": 
1.0513434879342802, "grad_norm": 25.370515823364258, "learning_rate": 2.67233661455942e-05, "loss": 2.9894, "step": 6143 }, { "epoch": 1.0515146328940612, "grad_norm": 29.25603485107422, "learning_rate": 2.671617505144288e-05, "loss": 5.7828, "step": 6144 }, { "epoch": 1.0516857778538422, "grad_norm": 21.737051010131836, "learning_rate": 2.6708977044730832e-05, "loss": 2.0899, "step": 6145 }, { "epoch": 1.0518569228136232, "grad_norm": 5.30400276184082, "learning_rate": 2.6701772129704884e-05, "loss": 0.4699, "step": 6146 }, { "epoch": 1.0520280677734042, "grad_norm": 20.10776710510254, "learning_rate": 2.669456031061595e-05, "loss": 2.4938, "step": 6147 }, { "epoch": 1.052199212733185, "grad_norm": 10.818122863769531, "learning_rate": 2.668734159171902e-05, "loss": 0.9672, "step": 6148 }, { "epoch": 1.052370357692966, "grad_norm": 15.619196891784668, "learning_rate": 2.668011597727314e-05, "loss": 1.3626, "step": 6149 }, { "epoch": 1.052541502652747, "grad_norm": 12.81397819519043, "learning_rate": 2.6672883471541436e-05, "loss": 1.1308, "step": 6150 }, { "epoch": 1.0527126476125277, "grad_norm": 7.247302055358887, "learning_rate": 2.6665644078791098e-05, "loss": 0.8993, "step": 6151 }, { "epoch": 1.0528837925723087, "grad_norm": 16.9379825592041, "learning_rate": 2.6658397803293376e-05, "loss": 1.2732, "step": 6152 }, { "epoch": 1.0530549375320897, "grad_norm": 0.6092934608459473, "learning_rate": 2.665114464932359e-05, "loss": 0.1539, "step": 6153 }, { "epoch": 1.0532260824918707, "grad_norm": 9.06037425994873, "learning_rate": 2.6643884621161102e-05, "loss": 0.6905, "step": 6154 }, { "epoch": 1.0533972274516517, "grad_norm": 6.986600875854492, "learning_rate": 2.6636617723089345e-05, "loss": 0.5406, "step": 6155 }, { "epoch": 1.0535683724114324, "grad_norm": 18.403789520263672, "learning_rate": 2.6629343959395805e-05, "loss": 1.4752, "step": 6156 }, { "epoch": 1.0537395173712134, "grad_norm": 11.036933898925781, "learning_rate": 2.6622063334372e-05, "loss": 
0.8559, "step": 6157 }, { "epoch": 1.0539106623309944, "grad_norm": 21.17281150817871, "learning_rate": 2.661477585231352e-05, "loss": 2.043, "step": 6158 }, { "epoch": 1.0540818072907754, "grad_norm": 17.409849166870117, "learning_rate": 2.6607481517519984e-05, "loss": 1.0679, "step": 6159 }, { "epoch": 1.0542529522505562, "grad_norm": 4.291676044464111, "learning_rate": 2.6600180334295073e-05, "loss": 0.5096, "step": 6160 }, { "epoch": 1.0544240972103371, "grad_norm": 20.648345947265625, "learning_rate": 2.659287230694648e-05, "loss": 1.899, "step": 6161 }, { "epoch": 1.0545952421701181, "grad_norm": 0.6559057235717773, "learning_rate": 2.658555743978596e-05, "loss": 0.1645, "step": 6162 }, { "epoch": 1.0547663871298991, "grad_norm": 15.449231147766113, "learning_rate": 2.6578235737129292e-05, "loss": 1.2553, "step": 6163 }, { "epoch": 1.0549375320896799, "grad_norm": 11.673152923583984, "learning_rate": 2.65709072032963e-05, "loss": 1.1081, "step": 6164 }, { "epoch": 1.0551086770494609, "grad_norm": 110.93241882324219, "learning_rate": 2.6563571842610817e-05, "loss": 9.2153, "step": 6165 }, { "epoch": 1.0552798220092419, "grad_norm": 13.813591957092285, "learning_rate": 2.6556229659400724e-05, "loss": 1.1657, "step": 6166 }, { "epoch": 1.0554509669690229, "grad_norm": 20.899248123168945, "learning_rate": 2.6548880657997922e-05, "loss": 2.6031, "step": 6167 }, { "epoch": 1.0556221119288036, "grad_norm": 9.75277042388916, "learning_rate": 2.6541524842738333e-05, "loss": 0.9529, "step": 6168 }, { "epoch": 1.0557932568885846, "grad_norm": 2.945573568344116, "learning_rate": 2.653416221796189e-05, "loss": 0.2978, "step": 6169 }, { "epoch": 1.0559644018483656, "grad_norm": 24.812198638916016, "learning_rate": 2.6526792788012562e-05, "loss": 2.9515, "step": 6170 }, { "epoch": 1.0561355468081466, "grad_norm": 3.9069230556488037, "learning_rate": 2.651941655723832e-05, "loss": 0.3683, "step": 6171 }, { "epoch": 1.0563066917679274, "grad_norm": 27.411827087402344, 
"learning_rate": 2.6512033529991148e-05, "loss": 2.3892, "step": 6172 }, { "epoch": 1.0564778367277083, "grad_norm": 18.866891860961914, "learning_rate": 2.6504643710627054e-05, "loss": 2.1363, "step": 6173 }, { "epoch": 1.0566489816874893, "grad_norm": 23.880348205566406, "learning_rate": 2.649724710350603e-05, "loss": 2.8926, "step": 6174 }, { "epoch": 1.0568201266472703, "grad_norm": 19.74806022644043, "learning_rate": 2.6489843712992093e-05, "loss": 1.891, "step": 6175 }, { "epoch": 1.056991271607051, "grad_norm": 15.93543815612793, "learning_rate": 2.648243354345325e-05, "loss": 1.4833, "step": 6176 }, { "epoch": 1.057162416566832, "grad_norm": 19.573871612548828, "learning_rate": 2.6475016599261512e-05, "loss": 1.5903, "step": 6177 }, { "epoch": 1.057333561526613, "grad_norm": 13.658683776855469, "learning_rate": 2.6467592884792892e-05, "loss": 1.3537, "step": 6178 }, { "epoch": 1.057504706486394, "grad_norm": 8.274129867553711, "learning_rate": 2.6460162404427387e-05, "loss": 1.3073, "step": 6179 }, { "epoch": 1.0576758514461748, "grad_norm": 13.96843147277832, "learning_rate": 2.6452725162548994e-05, "loss": 1.0987, "step": 6180 }, { "epoch": 1.0578469964059558, "grad_norm": 18.048133850097656, "learning_rate": 2.6445281163545698e-05, "loss": 1.5427, "step": 6181 }, { "epoch": 1.0580181413657368, "grad_norm": 51.59361267089844, "learning_rate": 2.643783041180947e-05, "loss": 5.8192, "step": 6182 }, { "epoch": 1.0581892863255178, "grad_norm": 6.808162212371826, "learning_rate": 2.643037291173626e-05, "loss": 0.4664, "step": 6183 }, { "epoch": 1.0583604312852986, "grad_norm": 24.485843658447266, "learning_rate": 2.6422908667726006e-05, "loss": 2.1158, "step": 6184 }, { "epoch": 1.0585315762450795, "grad_norm": 3.2217724323272705, "learning_rate": 2.6415437684182626e-05, "loss": 0.383, "step": 6185 }, { "epoch": 1.0587027212048605, "grad_norm": 13.59959602355957, "learning_rate": 2.640795996551401e-05, "loss": 1.0761, "step": 6186 }, { "epoch": 
1.0588738661646415, "grad_norm": 15.931357383728027, "learning_rate": 2.6400475516132022e-05, "loss": 1.5406, "step": 6187 }, { "epoch": 1.0590450111244223, "grad_norm": 24.72441864013672, "learning_rate": 2.63929843404525e-05, "loss": 2.225, "step": 6188 }, { "epoch": 1.0592161560842033, "grad_norm": 4.713048458099365, "learning_rate": 2.6385486442895244e-05, "loss": 0.3465, "step": 6189 }, { "epoch": 1.0593873010439843, "grad_norm": 16.011383056640625, "learning_rate": 2.637798182788403e-05, "loss": 1.3794, "step": 6190 }, { "epoch": 1.0595584460037653, "grad_norm": 32.27912521362305, "learning_rate": 2.637047049984659e-05, "loss": 5.8596, "step": 6191 }, { "epoch": 1.059729590963546, "grad_norm": 6.318559646606445, "learning_rate": 2.6362952463214628e-05, "loss": 0.2795, "step": 6192 }, { "epoch": 1.059900735923327, "grad_norm": 16.79538345336914, "learning_rate": 2.6355427722423774e-05, "loss": 1.5498, "step": 6193 }, { "epoch": 1.060071880883108, "grad_norm": 22.450878143310547, "learning_rate": 2.634789628191366e-05, "loss": 2.1571, "step": 6194 }, { "epoch": 1.060243025842889, "grad_norm": 17.413171768188477, "learning_rate": 2.6340358146127835e-05, "loss": 1.6402, "step": 6195 }, { "epoch": 1.06041417080267, "grad_norm": 20.46025276184082, "learning_rate": 2.6332813319513813e-05, "loss": 1.7809, "step": 6196 }, { "epoch": 1.0605853157624507, "grad_norm": 19.27883529663086, "learning_rate": 2.6325261806523055e-05, "loss": 1.7623, "step": 6197 }, { "epoch": 1.0607564607222317, "grad_norm": 177.13131713867188, "learning_rate": 2.6317703611610957e-05, "loss": 8.661, "step": 6198 }, { "epoch": 1.0609276056820127, "grad_norm": 14.9853515625, "learning_rate": 2.6310138739236873e-05, "loss": 1.2864, "step": 6199 }, { "epoch": 1.0610987506417935, "grad_norm": 17.604280471801758, "learning_rate": 2.6302567193864087e-05, "loss": 1.5651, "step": 6200 }, { "epoch": 1.0612698956015745, "grad_norm": 9.68821907043457, "learning_rate": 2.6294988979959822e-05, "loss": 
0.5439, "step": 6201 }, { "epoch": 1.0614410405613555, "grad_norm": 9.67438793182373, "learning_rate": 2.6287404101995235e-05, "loss": 1.3076, "step": 6202 }, { "epoch": 1.0616121855211365, "grad_norm": 18.51534652709961, "learning_rate": 2.627981256444542e-05, "loss": 1.3825, "step": 6203 }, { "epoch": 1.0617833304809174, "grad_norm": 18.125104904174805, "learning_rate": 2.6272214371789385e-05, "loss": 1.5245, "step": 6204 }, { "epoch": 1.0619544754406982, "grad_norm": 19.579633712768555, "learning_rate": 2.6264609528510084e-05, "loss": 1.8224, "step": 6205 }, { "epoch": 1.0621256204004792, "grad_norm": 23.845556259155273, "learning_rate": 2.6256998039094383e-05, "loss": 2.6739, "step": 6206 }, { "epoch": 1.0622967653602602, "grad_norm": 22.428180694580078, "learning_rate": 2.6249379908033074e-05, "loss": 3.1672, "step": 6207 }, { "epoch": 1.0624679103200412, "grad_norm": 29.91859245300293, "learning_rate": 2.624175513982086e-05, "loss": 3.3232, "step": 6208 }, { "epoch": 1.062639055279822, "grad_norm": 17.00956153869629, "learning_rate": 2.623412373895637e-05, "loss": 1.4198, "step": 6209 }, { "epoch": 1.062810200239603, "grad_norm": 11.492415428161621, "learning_rate": 2.622648570994214e-05, "loss": 0.934, "step": 6210 }, { "epoch": 1.062981345199384, "grad_norm": 21.991470336914062, "learning_rate": 2.6218841057284624e-05, "loss": 1.5342, "step": 6211 }, { "epoch": 1.063152490159165, "grad_norm": 23.230363845825195, "learning_rate": 2.621118978549417e-05, "loss": 1.2235, "step": 6212 }, { "epoch": 1.0633236351189457, "grad_norm": 14.931122779846191, "learning_rate": 2.620353189908505e-05, "loss": 1.4041, "step": 6213 }, { "epoch": 1.0634947800787267, "grad_norm": 24.851787567138672, "learning_rate": 2.6195867402575414e-05, "loss": 3.0139, "step": 6214 }, { "epoch": 1.0636659250385077, "grad_norm": 16.248435974121094, "learning_rate": 2.618819630048734e-05, "loss": 1.4302, "step": 6215 }, { "epoch": 1.0638370699982886, "grad_norm": 16.73040771484375, 
"learning_rate": 2.6180518597346788e-05, "loss": 1.613, "step": 6216 }, { "epoch": 1.0640082149580694, "grad_norm": 19.11425018310547, "learning_rate": 2.61728342976836e-05, "loss": 1.8297, "step": 6217 }, { "epoch": 1.0641793599178504, "grad_norm": 15.915562629699707, "learning_rate": 2.6165143406031547e-05, "loss": 1.2021, "step": 6218 }, { "epoch": 1.0643505048776314, "grad_norm": 16.527769088745117, "learning_rate": 2.6157445926928247e-05, "loss": 1.3672, "step": 6219 }, { "epoch": 1.0645216498374124, "grad_norm": 20.64341163635254, "learning_rate": 2.614974186491523e-05, "loss": 2.6897, "step": 6220 }, { "epoch": 1.0646927947971931, "grad_norm": 0.6564233899116516, "learning_rate": 2.6142031224537907e-05, "loss": 0.1614, "step": 6221 }, { "epoch": 1.0648639397569741, "grad_norm": 20.409786224365234, "learning_rate": 2.6134314010345565e-05, "loss": 2.0853, "step": 6222 }, { "epoch": 1.0650350847167551, "grad_norm": 2.155768394470215, "learning_rate": 2.612659022689138e-05, "loss": 0.336, "step": 6223 }, { "epoch": 1.065206229676536, "grad_norm": 20.48274803161621, "learning_rate": 2.6118859878732382e-05, "loss": 2.0026, "step": 6224 }, { "epoch": 1.0653773746363169, "grad_norm": 22.327909469604492, "learning_rate": 2.6111122970429495e-05, "loss": 2.4422, "step": 6225 }, { "epoch": 1.0655485195960979, "grad_norm": 1.1469937562942505, "learning_rate": 2.6103379506547513e-05, "loss": 0.1708, "step": 6226 }, { "epoch": 1.0657196645558789, "grad_norm": 106.68755340576172, "learning_rate": 2.6095629491655074e-05, "loss": 8.7931, "step": 6227 }, { "epoch": 1.0658908095156598, "grad_norm": 1.095432162284851, "learning_rate": 2.6087872930324714e-05, "loss": 0.1697, "step": 6228 }, { "epoch": 1.0660619544754406, "grad_norm": 5.803622722625732, "learning_rate": 2.608010982713281e-05, "loss": 0.5993, "step": 6229 }, { "epoch": 1.0662330994352216, "grad_norm": 14.511138916015625, "learning_rate": 2.60723401866596e-05, "loss": 1.4565, "step": 6230 }, { "epoch": 
1.0664042443950026, "grad_norm": 5.559805870056152, "learning_rate": 2.6064564013489195e-05, "loss": 0.4986, "step": 6231 }, { "epoch": 1.0665753893547836, "grad_norm": 24.374418258666992, "learning_rate": 2.6056781312209537e-05, "loss": 3.2554, "step": 6232 }, { "epoch": 1.0667465343145643, "grad_norm": 0.6469919085502625, "learning_rate": 2.6048992087412437e-05, "loss": 0.1491, "step": 6233 }, { "epoch": 1.0669176792743453, "grad_norm": 1.050114393234253, "learning_rate": 2.604119634369355e-05, "loss": 0.1731, "step": 6234 }, { "epoch": 1.0670888242341263, "grad_norm": 17.869945526123047, "learning_rate": 2.603339408565237e-05, "loss": 1.8747, "step": 6235 }, { "epoch": 1.0672599691939073, "grad_norm": 65.78507995605469, "learning_rate": 2.602558531789225e-05, "loss": 6.7831, "step": 6236 }, { "epoch": 1.067431114153688, "grad_norm": 24.211278915405273, "learning_rate": 2.601777004502037e-05, "loss": 3.1394, "step": 6237 }, { "epoch": 1.067602259113469, "grad_norm": 11.33743953704834, "learning_rate": 2.6009948271647753e-05, "loss": 0.8017, "step": 6238 }, { "epoch": 1.06777340407325, "grad_norm": 13.077141761779785, "learning_rate": 2.6002120002389257e-05, "loss": 1.0105, "step": 6239 }, { "epoch": 1.067944549033031, "grad_norm": 21.057144165039062, "learning_rate": 2.5994285241863572e-05, "loss": 2.6367, "step": 6240 }, { "epoch": 1.0681156939928118, "grad_norm": 21.67076301574707, "learning_rate": 2.5986443994693216e-05, "loss": 1.9473, "step": 6241 }, { "epoch": 1.0682868389525928, "grad_norm": 3.669816255569458, "learning_rate": 2.5978596265504542e-05, "loss": 0.3221, "step": 6242 }, { "epoch": 1.0684579839123738, "grad_norm": 14.505615234375, "learning_rate": 2.597074205892772e-05, "loss": 1.5707, "step": 6243 }, { "epoch": 1.0686291288721548, "grad_norm": 17.163841247558594, "learning_rate": 2.5962881379596738e-05, "loss": 1.9492, "step": 6244 }, { "epoch": 1.0688002738319358, "grad_norm": 22.28460693359375, "learning_rate": 2.595501423214942e-05, "loss": 
2.6652, "step": 6245 }, { "epoch": 1.0689714187917165, "grad_norm": 24.26947593688965, "learning_rate": 2.5947140621227384e-05, "loss": 2.0312, "step": 6246 }, { "epoch": 1.0691425637514975, "grad_norm": 17.7855224609375, "learning_rate": 2.5939260551476075e-05, "loss": 1.8179, "step": 6247 }, { "epoch": 1.0693137087112785, "grad_norm": 10.421858787536621, "learning_rate": 2.5931374027544752e-05, "loss": 0.894, "step": 6248 }, { "epoch": 1.0694848536710593, "grad_norm": 11.617718696594238, "learning_rate": 2.5923481054086467e-05, "loss": 1.0684, "step": 6249 }, { "epoch": 1.0696559986308403, "grad_norm": 3.5524022579193115, "learning_rate": 2.5915581635758093e-05, "loss": 0.3627, "step": 6250 }, { "epoch": 1.0698271435906213, "grad_norm": 24.77372169494629, "learning_rate": 2.5907675777220293e-05, "loss": 2.3108, "step": 6251 }, { "epoch": 1.0699982885504022, "grad_norm": 16.17108917236328, "learning_rate": 2.5899763483137538e-05, "loss": 1.2411, "step": 6252 }, { "epoch": 1.0701694335101832, "grad_norm": 18.850309371948242, "learning_rate": 2.5891844758178092e-05, "loss": 1.6959, "step": 6253 }, { "epoch": 1.070340578469964, "grad_norm": 18.877622604370117, "learning_rate": 2.588391960701402e-05, "loss": 1.4854, "step": 6254 }, { "epoch": 1.070511723429745, "grad_norm": 16.1739501953125, "learning_rate": 2.5875988034321163e-05, "loss": 1.3473, "step": 6255 }, { "epoch": 1.070682868389526, "grad_norm": 44.91740036010742, "learning_rate": 2.586805004477917e-05, "loss": 1.6482, "step": 6256 }, { "epoch": 1.070854013349307, "grad_norm": 3.3182432651519775, "learning_rate": 2.5860105643071463e-05, "loss": 0.3658, "step": 6257 }, { "epoch": 1.0710251583090877, "grad_norm": 17.573970794677734, "learning_rate": 2.585215483388525e-05, "loss": 1.5352, "step": 6258 }, { "epoch": 1.0711963032688687, "grad_norm": 18.447710037231445, "learning_rate": 2.584419762191152e-05, "loss": 1.7313, "step": 6259 }, { "epoch": 1.0713674482286497, "grad_norm": 25.56972885131836, 
"learning_rate": 2.5836234011845046e-05, "loss": 2.1232, "step": 6260 }, { "epoch": 1.0715385931884307, "grad_norm": 24.050052642822266, "learning_rate": 2.5828264008384362e-05, "loss": 2.7981, "step": 6261 }, { "epoch": 1.0717097381482115, "grad_norm": 21.025928497314453, "learning_rate": 2.5820287616231785e-05, "loss": 1.5964, "step": 6262 }, { "epoch": 1.0718808831079925, "grad_norm": 3.2581591606140137, "learning_rate": 2.5812304840093397e-05, "loss": 0.3287, "step": 6263 }, { "epoch": 1.0720520280677734, "grad_norm": 0.9607375860214233, "learning_rate": 2.5804315684679046e-05, "loss": 0.1687, "step": 6264 }, { "epoch": 1.0722231730275544, "grad_norm": 19.35166358947754, "learning_rate": 2.5796320154702352e-05, "loss": 1.6589, "step": 6265 }, { "epoch": 1.0723943179873352, "grad_norm": 13.92859935760498, "learning_rate": 2.578831825488069e-05, "loss": 1.231, "step": 6266 }, { "epoch": 1.0725654629471162, "grad_norm": 18.011184692382812, "learning_rate": 2.578030998993518e-05, "loss": 1.5781, "step": 6267 }, { "epoch": 1.0727366079068972, "grad_norm": 18.9428768157959, "learning_rate": 2.5772295364590726e-05, "loss": 1.5683, "step": 6268 }, { "epoch": 1.0729077528666782, "grad_norm": 0.7132593989372253, "learning_rate": 2.576427438357596e-05, "loss": 0.1564, "step": 6269 }, { "epoch": 1.073078897826459, "grad_norm": 19.910579681396484, "learning_rate": 2.5756247051623274e-05, "loss": 2.2763, "step": 6270 }, { "epoch": 1.07325004278624, "grad_norm": 1.4179282188415527, "learning_rate": 2.5748213373468808e-05, "loss": 0.1777, "step": 6271 }, { "epoch": 1.073421187746021, "grad_norm": 4.847038269042969, "learning_rate": 2.574017335385244e-05, "loss": 0.3808, "step": 6272 }, { "epoch": 1.073592332705802, "grad_norm": 6.342759132385254, "learning_rate": 2.5732126997517798e-05, "loss": 0.4684, "step": 6273 }, { "epoch": 1.0737634776655827, "grad_norm": 1.4660619497299194, "learning_rate": 2.5724074309212243e-05, "loss": 0.1848, "step": 6274 }, { "epoch": 
1.0739346226253637, "grad_norm": 18.226266860961914, "learning_rate": 2.571601529368687e-05, "loss": 1.4087, "step": 6275 }, { "epoch": 1.0741057675851446, "grad_norm": 35.625450134277344, "learning_rate": 2.5707949955696513e-05, "loss": 1.2727, "step": 6276 }, { "epoch": 1.0742769125449256, "grad_norm": 8.416872024536133, "learning_rate": 2.5699878299999738e-05, "loss": 0.4758, "step": 6277 }, { "epoch": 1.0744480575047064, "grad_norm": 17.871906280517578, "learning_rate": 2.569180033135882e-05, "loss": 1.6863, "step": 6278 }, { "epoch": 1.0746192024644874, "grad_norm": 22.277774810791016, "learning_rate": 2.5683716054539787e-05, "loss": 2.9435, "step": 6279 }, { "epoch": 1.0747903474242684, "grad_norm": 20.06029510498047, "learning_rate": 2.5675625474312372e-05, "loss": 1.9097, "step": 6280 }, { "epoch": 1.0749614923840494, "grad_norm": 24.28558349609375, "learning_rate": 2.5667528595450024e-05, "loss": 2.217, "step": 6281 }, { "epoch": 1.0751326373438301, "grad_norm": 10.94339370727539, "learning_rate": 2.565942542272991e-05, "loss": 1.2658, "step": 6282 }, { "epoch": 1.0753037823036111, "grad_norm": 17.108922958374023, "learning_rate": 2.5651315960932926e-05, "loss": 1.2507, "step": 6283 }, { "epoch": 1.075474927263392, "grad_norm": 3.6949167251586914, "learning_rate": 2.5643200214843658e-05, "loss": 0.3683, "step": 6284 }, { "epoch": 1.075646072223173, "grad_norm": 3.6829192638397217, "learning_rate": 2.5635078189250414e-05, "loss": 0.3396, "step": 6285 }, { "epoch": 1.0758172171829539, "grad_norm": 14.598087310791016, "learning_rate": 2.5626949888945196e-05, "loss": 1.435, "step": 6286 }, { "epoch": 1.0759883621427349, "grad_norm": 20.382143020629883, "learning_rate": 2.5618815318723713e-05, "loss": 1.7844, "step": 6287 }, { "epoch": 1.0761595071025158, "grad_norm": 5.522397994995117, "learning_rate": 2.5610674483385373e-05, "loss": 0.5656, "step": 6288 }, { "epoch": 1.0763306520622968, "grad_norm": 31.37264633178711, "learning_rate": 2.560252738773329e-05, 
"loss": 6.2005, "step": 6289 }, { "epoch": 1.0765017970220776, "grad_norm": 18.120845794677734, "learning_rate": 2.559437403657425e-05, "loss": 1.6842, "step": 6290 }, { "epoch": 1.0766729419818586, "grad_norm": 10.900449752807617, "learning_rate": 2.558621443471876e-05, "loss": 0.798, "step": 6291 }, { "epoch": 1.0768440869416396, "grad_norm": 20.480613708496094, "learning_rate": 2.5578048586980974e-05, "loss": 2.6181, "step": 6292 }, { "epoch": 1.0770152319014206, "grad_norm": 14.86552619934082, "learning_rate": 2.5569876498178774e-05, "loss": 1.0591, "step": 6293 }, { "epoch": 1.0771863768612016, "grad_norm": 23.034835815429688, "learning_rate": 2.556169817313369e-05, "loss": 2.9569, "step": 6294 }, { "epoch": 1.0773575218209823, "grad_norm": 7.4421868324279785, "learning_rate": 2.5553513616670957e-05, "loss": 0.7049, "step": 6295 }, { "epoch": 1.0775286667807633, "grad_norm": 4.225881576538086, "learning_rate": 2.554532283361947e-05, "loss": 0.4169, "step": 6296 }, { "epoch": 1.0776998117405443, "grad_norm": 14.406049728393555, "learning_rate": 2.5537125828811803e-05, "loss": 1.1218, "step": 6297 }, { "epoch": 1.077870956700325, "grad_norm": 10.225951194763184, "learning_rate": 2.5528922607084203e-05, "loss": 1.007, "step": 6298 }, { "epoch": 1.078042101660106, "grad_norm": 14.981630325317383, "learning_rate": 2.552071317327658e-05, "loss": 1.2863, "step": 6299 }, { "epoch": 1.078213246619887, "grad_norm": 13.425951957702637, "learning_rate": 2.5512497532232517e-05, "loss": 1.098, "step": 6300 }, { "epoch": 1.078384391579668, "grad_norm": 20.084674835205078, "learning_rate": 2.550427568879925e-05, "loss": 2.7397, "step": 6301 }, { "epoch": 1.078555536539449, "grad_norm": 16.5575008392334, "learning_rate": 2.5496047647827688e-05, "loss": 1.2174, "step": 6302 }, { "epoch": 1.0787266814992298, "grad_norm": 21.359878540039062, "learning_rate": 2.5487813414172378e-05, "loss": 2.3573, "step": 6303 }, { "epoch": 1.0788978264590108, "grad_norm": 12.596624374389648, 
"learning_rate": 2.547957299269153e-05, "loss": 1.2493, "step": 6304 }, { "epoch": 1.0790689714187918, "grad_norm": 22.200517654418945, "learning_rate": 2.547132638824701e-05, "loss": 2.6415, "step": 6305 }, { "epoch": 1.0792401163785725, "grad_norm": 20.53194808959961, "learning_rate": 2.546307360570432e-05, "loss": 0.7938, "step": 6306 }, { "epoch": 1.0794112613383535, "grad_norm": 13.094803810119629, "learning_rate": 2.545481464993262e-05, "loss": 0.958, "step": 6307 }, { "epoch": 1.0795824062981345, "grad_norm": 13.0464506149292, "learning_rate": 2.5446549525804703e-05, "loss": 1.1583, "step": 6308 }, { "epoch": 1.0797535512579155, "grad_norm": 21.018848419189453, "learning_rate": 2.5438278238197005e-05, "loss": 2.0161, "step": 6309 }, { "epoch": 1.0799246962176965, "grad_norm": 11.44277286529541, "learning_rate": 2.5430000791989604e-05, "loss": 0.957, "step": 6310 }, { "epoch": 1.0800958411774773, "grad_norm": 19.248382568359375, "learning_rate": 2.5421717192066202e-05, "loss": 1.7125, "step": 6311 }, { "epoch": 1.0802669861372582, "grad_norm": 17.87828826904297, "learning_rate": 2.541342744331413e-05, "loss": 1.6807, "step": 6312 }, { "epoch": 1.0804381310970392, "grad_norm": 14.122018814086914, "learning_rate": 2.5405131550624355e-05, "loss": 1.1676, "step": 6313 }, { "epoch": 1.0806092760568202, "grad_norm": 18.704439163208008, "learning_rate": 2.539682951889147e-05, "loss": 1.4403, "step": 6314 }, { "epoch": 1.080780421016601, "grad_norm": 18.118362426757812, "learning_rate": 2.538852135301368e-05, "loss": 1.4984, "step": 6315 }, { "epoch": 1.080951565976382, "grad_norm": 20.102548599243164, "learning_rate": 2.5380207057892822e-05, "loss": 1.7471, "step": 6316 }, { "epoch": 1.081122710936163, "grad_norm": 7.387364387512207, "learning_rate": 2.5371886638434335e-05, "loss": 0.6007, "step": 6317 }, { "epoch": 1.081293855895944, "grad_norm": 20.12532615661621, "learning_rate": 2.5363560099547282e-05, "loss": 2.5947, "step": 6318 }, { "epoch": 
1.0814650008557247, "grad_norm": 12.348640441894531, "learning_rate": 2.5355227446144337e-05, "loss": 0.9146, "step": 6319 }, { "epoch": 1.0816361458155057, "grad_norm": 1.4093433618545532, "learning_rate": 2.5346888683141776e-05, "loss": 0.1879, "step": 6320 }, { "epoch": 1.0818072907752867, "grad_norm": 21.66387939453125, "learning_rate": 2.533854381545948e-05, "loss": 1.9458, "step": 6321 }, { "epoch": 1.0819784357350677, "grad_norm": 29.38831329345703, "learning_rate": 2.5330192848020935e-05, "loss": 5.4342, "step": 6322 }, { "epoch": 1.0821495806948485, "grad_norm": 15.972040176391602, "learning_rate": 2.532183578575322e-05, "loss": 1.4628, "step": 6323 }, { "epoch": 1.0823207256546294, "grad_norm": 22.438627243041992, "learning_rate": 2.5313472633587025e-05, "loss": 1.9986, "step": 6324 }, { "epoch": 1.0824918706144104, "grad_norm": 99.13875579833984, "learning_rate": 2.5305103396456608e-05, "loss": 7.4459, "step": 6325 }, { "epoch": 1.0826630155741914, "grad_norm": 0.6867462396621704, "learning_rate": 2.529672807929984e-05, "loss": 0.1632, "step": 6326 }, { "epoch": 1.0828341605339722, "grad_norm": 26.213483810424805, "learning_rate": 2.5288346687058167e-05, "loss": 2.6063, "step": 6327 }, { "epoch": 1.0830053054937532, "grad_norm": 32.9008674621582, "learning_rate": 2.5279959224676624e-05, "loss": 6.1089, "step": 6328 }, { "epoch": 1.0831764504535342, "grad_norm": 7.711669921875, "learning_rate": 2.5271565697103828e-05, "loss": 0.5029, "step": 6329 }, { "epoch": 1.0833475954133152, "grad_norm": 0.9175840020179749, "learning_rate": 2.526316610929197e-05, "loss": 0.1591, "step": 6330 }, { "epoch": 1.083518740373096, "grad_norm": 15.961565017700195, "learning_rate": 2.5254760466196825e-05, "loss": 1.2495, "step": 6331 }, { "epoch": 1.083689885332877, "grad_norm": 14.952744483947754, "learning_rate": 2.524634877277773e-05, "loss": 1.2988, "step": 6332 }, { "epoch": 1.083861030292658, "grad_norm": 21.237443923950195, "learning_rate": 2.5237931033997598e-05, 
"loss": 1.5518, "step": 6333 }, { "epoch": 1.084032175252439, "grad_norm": 0.7877843379974365, "learning_rate": 2.5229507254822905e-05, "loss": 0.1497, "step": 6334 }, { "epoch": 1.0842033202122197, "grad_norm": 20.07823371887207, "learning_rate": 2.5221077440223696e-05, "loss": 2.3132, "step": 6335 }, { "epoch": 1.0843744651720006, "grad_norm": 24.466449737548828, "learning_rate": 2.521264159517357e-05, "loss": 1.4605, "step": 6336 }, { "epoch": 1.0845456101317816, "grad_norm": 6.049923896789551, "learning_rate": 2.52041997246497e-05, "loss": 0.5203, "step": 6337 }, { "epoch": 1.0847167550915626, "grad_norm": 0.8076518774032593, "learning_rate": 2.5195751833632784e-05, "loss": 0.1521, "step": 6338 }, { "epoch": 1.0848879000513434, "grad_norm": 5.725604057312012, "learning_rate": 2.5187297927107106e-05, "loss": 0.4024, "step": 6339 }, { "epoch": 1.0850590450111244, "grad_norm": 1.9877190589904785, "learning_rate": 2.5178838010060472e-05, "loss": 0.2543, "step": 6340 }, { "epoch": 1.0852301899709054, "grad_norm": 20.83760643005371, "learning_rate": 2.517037208748426e-05, "loss": 2.1836, "step": 6341 }, { "epoch": 1.0854013349306864, "grad_norm": 19.120075225830078, "learning_rate": 2.516190016437336e-05, "loss": 1.5214, "step": 6342 }, { "epoch": 1.0855724798904671, "grad_norm": 20.080074310302734, "learning_rate": 2.5153422245726225e-05, "loss": 1.8331, "step": 6343 }, { "epoch": 1.0857436248502481, "grad_norm": 19.490802764892578, "learning_rate": 2.514493833654485e-05, "loss": 2.3582, "step": 6344 }, { "epoch": 1.085914769810029, "grad_norm": 25.15721321105957, "learning_rate": 2.5136448441834744e-05, "loss": 3.3368, "step": 6345 }, { "epoch": 1.08608591476981, "grad_norm": 30.424531936645508, "learning_rate": 2.512795256660496e-05, "loss": 5.8034, "step": 6346 }, { "epoch": 1.0862570597295909, "grad_norm": 13.79897689819336, "learning_rate": 2.511945071586807e-05, "loss": 1.112, "step": 6347 }, { "epoch": 1.0864282046893718, "grad_norm": 8.542878150939941, 
"learning_rate": 2.5110942894640192e-05, "loss": 0.6739, "step": 6348 }, { "epoch": 1.0865993496491528, "grad_norm": 29.626609802246094, "learning_rate": 2.5102429107940947e-05, "loss": 5.7867, "step": 6349 }, { "epoch": 1.0867704946089338, "grad_norm": 15.035479545593262, "learning_rate": 2.509390936079348e-05, "loss": 1.1878, "step": 6350 }, { "epoch": 1.0869416395687148, "grad_norm": 15.836331367492676, "learning_rate": 2.5085383658224454e-05, "loss": 1.4794, "step": 6351 }, { "epoch": 1.0871127845284956, "grad_norm": 16.121477127075195, "learning_rate": 2.5076852005264045e-05, "loss": 1.2524, "step": 6352 }, { "epoch": 1.0872839294882766, "grad_norm": 18.068601608276367, "learning_rate": 2.5068314406945948e-05, "loss": 1.4017, "step": 6353 }, { "epoch": 1.0874550744480576, "grad_norm": 20.634361267089844, "learning_rate": 2.5059770868307353e-05, "loss": 1.8565, "step": 6354 }, { "epoch": 1.0876262194078383, "grad_norm": 156.67971801757812, "learning_rate": 2.5051221394388965e-05, "loss": 7.3016, "step": 6355 }, { "epoch": 1.0877973643676193, "grad_norm": 1.6159067153930664, "learning_rate": 2.5042665990234978e-05, "loss": 0.259, "step": 6356 }, { "epoch": 1.0879685093274003, "grad_norm": 21.53618049621582, "learning_rate": 2.5034104660893102e-05, "loss": 2.9373, "step": 6357 }, { "epoch": 1.0881396542871813, "grad_norm": 20.488115310668945, "learning_rate": 2.5025537411414532e-05, "loss": 2.6575, "step": 6358 }, { "epoch": 1.0883107992469623, "grad_norm": 12.624834060668945, "learning_rate": 2.5016964246853952e-05, "loss": 0.9089, "step": 6359 }, { "epoch": 1.088481944206743, "grad_norm": 22.495460510253906, "learning_rate": 2.500838517226955e-05, "loss": 2.379, "step": 6360 }, { "epoch": 1.088653089166524, "grad_norm": 18.44520378112793, "learning_rate": 2.4999800192722988e-05, "loss": 1.4829, "step": 6361 }, { "epoch": 1.088824234126305, "grad_norm": 8.016738891601562, "learning_rate": 2.499120931327942e-05, "loss": 0.571, "step": 6362 }, { "epoch": 
1.088995379086086, "grad_norm": 24.428014755249023, "learning_rate": 2.4982612539007474e-05, "loss": 5.3953, "step": 6363 }, { "epoch": 1.0891665240458668, "grad_norm": 20.098068237304688, "learning_rate": 2.497400987497926e-05, "loss": 2.2569, "step": 6364 }, { "epoch": 1.0893376690056478, "grad_norm": 23.772600173950195, "learning_rate": 2.4965401326270365e-05, "loss": 3.1548, "step": 6365 }, { "epoch": 1.0895088139654288, "grad_norm": 13.880364418029785, "learning_rate": 2.4956786897959844e-05, "loss": 1.3489, "step": 6366 }, { "epoch": 1.0896799589252097, "grad_norm": 4.314807891845703, "learning_rate": 2.4948166595130227e-05, "loss": 0.3581, "step": 6367 }, { "epoch": 1.0898511038849905, "grad_norm": 16.17949104309082, "learning_rate": 2.4939540422867497e-05, "loss": 1.3273, "step": 6368 }, { "epoch": 1.0900222488447715, "grad_norm": 26.82145118713379, "learning_rate": 2.493090838626112e-05, "loss": 2.7974, "step": 6369 }, { "epoch": 1.0901933938045525, "grad_norm": 18.014739990234375, "learning_rate": 2.4922270490403994e-05, "loss": 1.6901, "step": 6370 }, { "epoch": 1.0903645387643335, "grad_norm": 14.148981094360352, "learning_rate": 2.49136267403925e-05, "loss": 1.2315, "step": 6371 }, { "epoch": 1.0905356837241142, "grad_norm": 1.2226158380508423, "learning_rate": 2.4904977141326468e-05, "loss": 0.1799, "step": 6372 }, { "epoch": 1.0907068286838952, "grad_norm": 0.6526600122451782, "learning_rate": 2.489632169830917e-05, "loss": 0.1585, "step": 6373 }, { "epoch": 1.0908779736436762, "grad_norm": 19.312833786010742, "learning_rate": 2.4887660416447326e-05, "loss": 1.8635, "step": 6374 }, { "epoch": 1.0910491186034572, "grad_norm": 16.932435989379883, "learning_rate": 2.4878993300851115e-05, "loss": 1.2672, "step": 6375 }, { "epoch": 1.091220263563238, "grad_norm": 18.97432518005371, "learning_rate": 2.4870320356634138e-05, "loss": 1.2766, "step": 6376 }, { "epoch": 1.091391408523019, "grad_norm": 37.76401901245117, "learning_rate": 2.486164158891345e-05, 
"loss": 5.9113, "step": 6377 }, { "epoch": 1.0915625534828, "grad_norm": 20.928302764892578, "learning_rate": 2.4852957002809534e-05, "loss": 1.8063, "step": 6378 }, { "epoch": 1.091733698442581, "grad_norm": 0.7269101142883301, "learning_rate": 2.484426660344631e-05, "loss": 0.1545, "step": 6379 }, { "epoch": 1.0919048434023617, "grad_norm": 105.674072265625, "learning_rate": 2.4835570395951133e-05, "loss": 9.2848, "step": 6380 }, { "epoch": 1.0920759883621427, "grad_norm": 19.896669387817383, "learning_rate": 2.4826868385454767e-05, "loss": 1.8101, "step": 6381 }, { "epoch": 1.0922471333219237, "grad_norm": 13.241930961608887, "learning_rate": 2.4818160577091417e-05, "loss": 1.3192, "step": 6382 }, { "epoch": 1.0924182782817047, "grad_norm": 13.864144325256348, "learning_rate": 2.4809446975998707e-05, "loss": 1.1316, "step": 6383 }, { "epoch": 1.0925894232414854, "grad_norm": 17.57133674621582, "learning_rate": 2.480072758731767e-05, "loss": 1.6042, "step": 6384 }, { "epoch": 1.0927605682012664, "grad_norm": 14.61141300201416, "learning_rate": 2.4792002416192754e-05, "loss": 1.2322, "step": 6385 }, { "epoch": 1.0929317131610474, "grad_norm": 24.17422866821289, "learning_rate": 2.4783271467771832e-05, "loss": 2.9147, "step": 6386 }, { "epoch": 1.0931028581208284, "grad_norm": 14.182536125183105, "learning_rate": 2.477453474720617e-05, "loss": 1.534, "step": 6387 }, { "epoch": 1.0932740030806092, "grad_norm": 142.10279846191406, "learning_rate": 2.476579225965045e-05, "loss": 7.7581, "step": 6388 }, { "epoch": 1.0934451480403902, "grad_norm": 13.02221965789795, "learning_rate": 2.475704401026275e-05, "loss": 1.2573, "step": 6389 }, { "epoch": 1.0936162930001712, "grad_norm": 14.556048393249512, "learning_rate": 2.474829000420455e-05, "loss": 1.2619, "step": 6390 }, { "epoch": 1.0937874379599521, "grad_norm": 28.360790252685547, "learning_rate": 2.473953024664073e-05, "loss": 5.3109, "step": 6391 }, { "epoch": 1.093958582919733, "grad_norm": 17.71285629272461, 
"learning_rate": 2.4730764742739554e-05, "loss": 1.9739, "step": 6392 }, { "epoch": 1.094129727879514, "grad_norm": 16.858354568481445, "learning_rate": 2.4721993497672693e-05, "loss": 1.7266, "step": 6393 }, { "epoch": 1.094300872839295, "grad_norm": 20.677833557128906, "learning_rate": 2.4713216516615182e-05, "loss": 2.622, "step": 6394 }, { "epoch": 1.0944720177990759, "grad_norm": 25.51704978942871, "learning_rate": 2.4704433804745465e-05, "loss": 2.3596, "step": 6395 }, { "epoch": 1.0946431627588566, "grad_norm": 22.32132339477539, "learning_rate": 2.469564536724534e-05, "loss": 2.2435, "step": 6396 }, { "epoch": 1.0948143077186376, "grad_norm": 20.751523971557617, "learning_rate": 2.4686851209300017e-05, "loss": 1.7518, "step": 6397 }, { "epoch": 1.0949854526784186, "grad_norm": 18.10664939880371, "learning_rate": 2.4678051336098048e-05, "loss": 1.968, "step": 6398 }, { "epoch": 1.0951565976381996, "grad_norm": 17.6309814453125, "learning_rate": 2.4669245752831375e-05, "loss": 1.504, "step": 6399 }, { "epoch": 1.0953277425979806, "grad_norm": 17.307374954223633, "learning_rate": 2.4660434464695304e-05, "loss": 1.3444, "step": 6400 }, { "epoch": 1.0954988875577614, "grad_norm": 19.310840606689453, "learning_rate": 2.465161747688851e-05, "loss": 2.3097, "step": 6401 }, { "epoch": 1.0956700325175424, "grad_norm": 26.530712127685547, "learning_rate": 2.4642794794613027e-05, "loss": 2.4516, "step": 6402 }, { "epoch": 1.0958411774773233, "grad_norm": 14.693706512451172, "learning_rate": 2.463396642307426e-05, "loss": 1.1521, "step": 6403 }, { "epoch": 1.0960123224371041, "grad_norm": 45.52013397216797, "learning_rate": 2.4625132367480948e-05, "loss": 6.0723, "step": 6404 }, { "epoch": 1.096183467396885, "grad_norm": 21.434844970703125, "learning_rate": 2.4616292633045203e-05, "loss": 2.6903, "step": 6405 }, { "epoch": 1.096354612356666, "grad_norm": 16.72518539428711, "learning_rate": 2.4607447224982487e-05, "loss": 1.3194, "step": 6406 }, { "epoch": 
1.096525757316447, "grad_norm": 14.632075309753418, "learning_rate": 2.4598596148511592e-05, "loss": 1.5802, "step": 6407 }, { "epoch": 1.096696902276228, "grad_norm": 5.6426167488098145, "learning_rate": 2.4589739408854678e-05, "loss": 0.4257, "step": 6408 }, { "epoch": 1.0968680472360088, "grad_norm": 14.361030578613281, "learning_rate": 2.4580877011237228e-05, "loss": 1.1946, "step": 6409 }, { "epoch": 1.0970391921957898, "grad_norm": 0.7481557130813599, "learning_rate": 2.457200896088807e-05, "loss": 0.1552, "step": 6410 }, { "epoch": 1.0972103371555708, "grad_norm": 11.047932624816895, "learning_rate": 2.4563135263039368e-05, "loss": 1.1797, "step": 6411 }, { "epoch": 1.0973814821153518, "grad_norm": 32.6690788269043, "learning_rate": 2.4554255922926618e-05, "loss": 5.7241, "step": 6412 }, { "epoch": 1.0975526270751326, "grad_norm": 6.154735565185547, "learning_rate": 2.4545370945788642e-05, "loss": 0.4541, "step": 6413 }, { "epoch": 1.0977237720349136, "grad_norm": 20.97507095336914, "learning_rate": 2.453648033686759e-05, "loss": 2.084, "step": 6414 }, { "epoch": 1.0978949169946945, "grad_norm": 20.436906814575195, "learning_rate": 2.452758410140893e-05, "loss": 1.9525, "step": 6415 }, { "epoch": 1.0980660619544755, "grad_norm": 13.217822074890137, "learning_rate": 2.4518682244661466e-05, "loss": 1.068, "step": 6416 }, { "epoch": 1.0982372069142563, "grad_norm": 26.957794189453125, "learning_rate": 2.45097747718773e-05, "loss": 5.2028, "step": 6417 }, { "epoch": 1.0984083518740373, "grad_norm": 26.779298782348633, "learning_rate": 2.4500861688311852e-05, "loss": 2.2781, "step": 6418 }, { "epoch": 1.0985794968338183, "grad_norm": 18.87523651123047, "learning_rate": 2.4491942999223856e-05, "loss": 1.7808, "step": 6419 }, { "epoch": 1.0987506417935993, "grad_norm": 18.582653045654297, "learning_rate": 2.4483018709875357e-05, "loss": 2.6275, "step": 6420 }, { "epoch": 1.09892178675338, "grad_norm": 20.106109619140625, "learning_rate": 2.4474088825531687e-05, 
"loss": 2.4183, "step": 6421 }, { "epoch": 1.099092931713161, "grad_norm": 23.127412796020508, "learning_rate": 2.4465153351461507e-05, "loss": 3.2691, "step": 6422 }, { "epoch": 1.099264076672942, "grad_norm": 26.415891647338867, "learning_rate": 2.4456212292936747e-05, "loss": 3.624, "step": 6423 }, { "epoch": 1.099435221632723, "grad_norm": 8.812840461730957, "learning_rate": 2.444726565523265e-05, "loss": 0.6166, "step": 6424 }, { "epoch": 1.0996063665925038, "grad_norm": 18.31468963623047, "learning_rate": 2.4438313443627748e-05, "loss": 2.1999, "step": 6425 }, { "epoch": 1.0997775115522848, "grad_norm": 0.5593311786651611, "learning_rate": 2.442935566340385e-05, "loss": 0.1435, "step": 6426 }, { "epoch": 1.0999486565120657, "grad_norm": 5.1220011711120605, "learning_rate": 2.4420392319846067e-05, "loss": 0.3795, "step": 6427 }, { "epoch": 1.1001198014718467, "grad_norm": 17.488069534301758, "learning_rate": 2.441142341824279e-05, "loss": 1.6355, "step": 6428 }, { "epoch": 1.1002909464316275, "grad_norm": 14.01169490814209, "learning_rate": 2.4402448963885672e-05, "loss": 1.5106, "step": 6429 }, { "epoch": 1.1004620913914085, "grad_norm": 21.192691802978516, "learning_rate": 2.4393468962069663e-05, "loss": 2.3361, "step": 6430 }, { "epoch": 1.1006332363511895, "grad_norm": 15.216924667358398, "learning_rate": 2.4384483418092976e-05, "loss": 1.3677, "step": 6431 }, { "epoch": 1.1008043813109705, "grad_norm": 16.910829544067383, "learning_rate": 2.4375492337257097e-05, "loss": 1.5384, "step": 6432 }, { "epoch": 1.1009755262707512, "grad_norm": 6.67744779586792, "learning_rate": 2.4366495724866772e-05, "loss": 0.8838, "step": 6433 }, { "epoch": 1.1011466712305322, "grad_norm": 18.879968643188477, "learning_rate": 2.4357493586230018e-05, "loss": 1.6657, "step": 6434 }, { "epoch": 1.1013178161903132, "grad_norm": 23.374927520751953, "learning_rate": 2.4348485926658108e-05, "loss": 2.5072, "step": 6435 }, { "epoch": 1.1014889611500942, "grad_norm": 
5.638739109039307, "learning_rate": 2.4339472751465584e-05, "loss": 0.3209, "step": 6436 }, { "epoch": 1.101660106109875, "grad_norm": 8.479720115661621, "learning_rate": 2.433045406597022e-05, "loss": 0.517, "step": 6437 }, { "epoch": 1.101831251069656, "grad_norm": 10.97712230682373, "learning_rate": 2.432142987549306e-05, "loss": 1.1031, "step": 6438 }, { "epoch": 1.102002396029437, "grad_norm": 5.807440280914307, "learning_rate": 2.4312400185358393e-05, "loss": 0.359, "step": 6439 }, { "epoch": 1.102173540989218, "grad_norm": 0.7325602173805237, "learning_rate": 2.4303365000893744e-05, "loss": 0.1568, "step": 6440 }, { "epoch": 1.1023446859489987, "grad_norm": 20.65593910217285, "learning_rate": 2.4294324327429887e-05, "loss": 1.6213, "step": 6441 }, { "epoch": 1.1025158309087797, "grad_norm": 22.244678497314453, "learning_rate": 2.4285278170300835e-05, "loss": 2.1494, "step": 6442 }, { "epoch": 1.1026869758685607, "grad_norm": 14.809113502502441, "learning_rate": 2.4276226534843827e-05, "loss": 0.8485, "step": 6443 }, { "epoch": 1.1028581208283417, "grad_norm": 22.746294021606445, "learning_rate": 2.4267169426399356e-05, "loss": 1.7695, "step": 6444 }, { "epoch": 1.1030292657881224, "grad_norm": 6.71605920791626, "learning_rate": 2.425810685031111e-05, "loss": 0.4067, "step": 6445 }, { "epoch": 1.1032004107479034, "grad_norm": 4.4305524826049805, "learning_rate": 2.4249038811926042e-05, "loss": 0.3609, "step": 6446 }, { "epoch": 1.1033715557076844, "grad_norm": 8.703330039978027, "learning_rate": 2.4239965316594294e-05, "loss": 0.8769, "step": 6447 }, { "epoch": 1.1035427006674654, "grad_norm": 9.909598350524902, "learning_rate": 2.4230886369669248e-05, "loss": 0.5658, "step": 6448 }, { "epoch": 1.1037138456272464, "grad_norm": 0.643650233745575, "learning_rate": 2.4221801976507495e-05, "loss": 0.1524, "step": 6449 }, { "epoch": 1.1038849905870272, "grad_norm": 11.157785415649414, "learning_rate": 2.421271214246884e-05, "loss": 0.4894, "step": 6450 }, { 
"epoch": 1.1040561355468081, "grad_norm": 18.122224807739258, "learning_rate": 2.42036168729163e-05, "loss": 1.4929, "step": 6451 }, { "epoch": 1.1042272805065891, "grad_norm": 3.4521541595458984, "learning_rate": 2.4194516173216097e-05, "loss": 0.3654, "step": 6452 }, { "epoch": 1.10439842546637, "grad_norm": 21.733537673950195, "learning_rate": 2.4185410048737654e-05, "loss": 2.376, "step": 6453 }, { "epoch": 1.104569570426151, "grad_norm": 7.599557876586914, "learning_rate": 2.4176298504853604e-05, "loss": 0.9135, "step": 6454 }, { "epoch": 1.1047407153859319, "grad_norm": 0.5854821801185608, "learning_rate": 2.4167181546939765e-05, "loss": 0.1569, "step": 6455 }, { "epoch": 1.1049118603457129, "grad_norm": 14.26248550415039, "learning_rate": 2.415805918037516e-05, "loss": 1.2171, "step": 6456 }, { "epoch": 1.1050830053054939, "grad_norm": 21.380966186523438, "learning_rate": 2.4148931410542e-05, "loss": 1.8418, "step": 6457 }, { "epoch": 1.1052541502652746, "grad_norm": 0.7476603984832764, "learning_rate": 2.413979824282568e-05, "loss": 0.1601, "step": 6458 }, { "epoch": 1.1054252952250556, "grad_norm": 1.1973159313201904, "learning_rate": 2.4130659682614783e-05, "loss": 0.1709, "step": 6459 }, { "epoch": 1.1055964401848366, "grad_norm": 24.046266555786133, "learning_rate": 2.4121515735301076e-05, "loss": 2.5584, "step": 6460 }, { "epoch": 1.1057675851446176, "grad_norm": 7.6598052978515625, "learning_rate": 2.4112366406279492e-05, "loss": 0.5179, "step": 6461 }, { "epoch": 1.1059387301043984, "grad_norm": 91.56566619873047, "learning_rate": 2.4103211700948163e-05, "loss": 6.9132, "step": 6462 }, { "epoch": 1.1061098750641793, "grad_norm": 18.259281158447266, "learning_rate": 2.4094051624708374e-05, "loss": 1.5434, "step": 6463 }, { "epoch": 1.1062810200239603, "grad_norm": 14.358467102050781, "learning_rate": 2.4084886182964574e-05, "loss": 1.3149, "step": 6464 }, { "epoch": 1.1064521649837413, "grad_norm": 8.986652374267578, "learning_rate": 
2.4075715381124397e-05, "loss": 0.6369, "step": 6465 }, { "epoch": 1.106623309943522, "grad_norm": 20.910594940185547, "learning_rate": 2.4066539224598623e-05, "loss": 1.8807, "step": 6466 }, { "epoch": 1.106794454903303, "grad_norm": 14.152681350708008, "learning_rate": 2.405735771880121e-05, "loss": 1.3497, "step": 6467 }, { "epoch": 1.106965599863084, "grad_norm": 2.2874717712402344, "learning_rate": 2.4048170869149248e-05, "loss": 0.3023, "step": 6468 }, { "epoch": 1.107136744822865, "grad_norm": 9.741914749145508, "learning_rate": 2.4038978681062995e-05, "loss": 1.1822, "step": 6469 }, { "epoch": 1.1073078897826458, "grad_norm": 14.803202629089355, "learning_rate": 2.402978115996586e-05, "loss": 1.8127, "step": 6470 }, { "epoch": 1.1074790347424268, "grad_norm": 16.078153610229492, "learning_rate": 2.402057831128439e-05, "loss": 1.2776, "step": 6471 }, { "epoch": 1.1076501797022078, "grad_norm": 14.220911026000977, "learning_rate": 2.4011370140448278e-05, "loss": 1.0287, "step": 6472 }, { "epoch": 1.1078213246619888, "grad_norm": 16.646638870239258, "learning_rate": 2.4002156652890368e-05, "loss": 1.501, "step": 6473 }, { "epoch": 1.1079924696217696, "grad_norm": 14.892061233520508, "learning_rate": 2.399293785404662e-05, "loss": 1.0803, "step": 6474 }, { "epoch": 1.1081636145815505, "grad_norm": 25.23822021484375, "learning_rate": 2.398371374935614e-05, "loss": 2.2634, "step": 6475 }, { "epoch": 1.1083347595413315, "grad_norm": 164.55755615234375, "learning_rate": 2.3974484344261175e-05, "loss": 13.2938, "step": 6476 }, { "epoch": 1.1085059045011125, "grad_norm": 18.659744262695312, "learning_rate": 2.3965249644207072e-05, "loss": 1.6686, "step": 6477 }, { "epoch": 1.1086770494608933, "grad_norm": 17.61865997314453, "learning_rate": 2.3956009654642333e-05, "loss": 1.625, "step": 6478 }, { "epoch": 1.1088481944206743, "grad_norm": 22.407318115234375, "learning_rate": 2.394676438101855e-05, "loss": 1.9881, "step": 6479 }, { "epoch": 1.1090193393804553, 
"grad_norm": 10.573683738708496, "learning_rate": 2.393751382879046e-05, "loss": 0.6952, "step": 6480 }, { "epoch": 1.1091904843402363, "grad_norm": 15.807626724243164, "learning_rate": 2.3928258003415902e-05, "loss": 1.4837, "step": 6481 }, { "epoch": 1.109361629300017, "grad_norm": 8.468942642211914, "learning_rate": 2.391899691035582e-05, "loss": 0.5184, "step": 6482 }, { "epoch": 1.109532774259798, "grad_norm": 14.21964168548584, "learning_rate": 2.390973055507428e-05, "loss": 1.099, "step": 6483 }, { "epoch": 1.109703919219579, "grad_norm": 11.065580368041992, "learning_rate": 2.3900458943038437e-05, "loss": 0.8836, "step": 6484 }, { "epoch": 1.10987506417936, "grad_norm": 23.09280776977539, "learning_rate": 2.3891182079718563e-05, "loss": 2.7036, "step": 6485 }, { "epoch": 1.1100462091391408, "grad_norm": 13.219036102294922, "learning_rate": 2.388189997058802e-05, "loss": 1.2623, "step": 6486 }, { "epoch": 1.1102173540989217, "grad_norm": 22.590919494628906, "learning_rate": 2.3872612621123265e-05, "loss": 2.8811, "step": 6487 }, { "epoch": 1.1103884990587027, "grad_norm": 2.921581268310547, "learning_rate": 2.3863320036803846e-05, "loss": 0.3436, "step": 6488 }, { "epoch": 1.1105596440184837, "grad_norm": 5.190805912017822, "learning_rate": 2.3854022223112404e-05, "loss": 0.4765, "step": 6489 }, { "epoch": 1.1107307889782645, "grad_norm": 8.173125267028809, "learning_rate": 2.3844719185534663e-05, "loss": 1.0327, "step": 6490 }, { "epoch": 1.1109019339380455, "grad_norm": 21.659151077270508, "learning_rate": 2.383541092955943e-05, "loss": 2.0525, "step": 6491 }, { "epoch": 1.1110730788978265, "grad_norm": 16.777814865112305, "learning_rate": 2.3826097460678588e-05, "loss": 1.1159, "step": 6492 }, { "epoch": 1.1112442238576075, "grad_norm": 16.834827423095703, "learning_rate": 2.3816778784387094e-05, "loss": 1.4569, "step": 6493 }, { "epoch": 1.1114153688173882, "grad_norm": 6.3645853996276855, "learning_rate": 2.3807454906182992e-05, "loss": 0.6127, "step": 
6494 }, { "epoch": 1.1115865137771692, "grad_norm": 18.301231384277344, "learning_rate": 2.3798125831567373e-05, "loss": 1.6497, "step": 6495 }, { "epoch": 1.1117576587369502, "grad_norm": 46.49019241333008, "learning_rate": 2.378879156604441e-05, "loss": 6.3514, "step": 6496 }, { "epoch": 1.1119288036967312, "grad_norm": 13.724831581115723, "learning_rate": 2.3779452115121332e-05, "loss": 1.2679, "step": 6497 }, { "epoch": 1.1120999486565122, "grad_norm": 13.0844144821167, "learning_rate": 2.3770107484308435e-05, "loss": 1.3047, "step": 6498 }, { "epoch": 1.112271093616293, "grad_norm": 23.980138778686523, "learning_rate": 2.376075767911905e-05, "loss": 3.0273, "step": 6499 }, { "epoch": 1.112442238576074, "grad_norm": 28.75403594970703, "learning_rate": 2.375140270506959e-05, "loss": 5.4808, "step": 6500 }, { "epoch": 1.112613383535855, "grad_norm": 23.089555740356445, "learning_rate": 2.3742042567679498e-05, "loss": 2.0221, "step": 6501 }, { "epoch": 1.1127845284956357, "grad_norm": 18.15275001525879, "learning_rate": 2.373267727247127e-05, "loss": 1.4721, "step": 6502 }, { "epoch": 1.1129556734554167, "grad_norm": 2.7961483001708984, "learning_rate": 2.372330682497045e-05, "loss": 0.3007, "step": 6503 }, { "epoch": 1.1131268184151977, "grad_norm": 6.274376392364502, "learning_rate": 2.3713931230705603e-05, "loss": 0.4561, "step": 6504 }, { "epoch": 1.1132979633749787, "grad_norm": 15.660245895385742, "learning_rate": 2.3704550495208356e-05, "loss": 1.3623, "step": 6505 }, { "epoch": 1.1134691083347596, "grad_norm": 25.518728256225586, "learning_rate": 2.369516462401335e-05, "loss": 5.5308, "step": 6506 }, { "epoch": 1.1136402532945404, "grad_norm": 20.65359878540039, "learning_rate": 2.3685773622658262e-05, "loss": 1.6647, "step": 6507 }, { "epoch": 1.1138113982543214, "grad_norm": 22.48552703857422, "learning_rate": 2.3676377496683803e-05, "loss": 1.9629, "step": 6508 }, { "epoch": 1.1139825432141024, "grad_norm": 11.458088874816895, "learning_rate": 
2.366697625163369e-05, "loss": 1.1358, "step": 6509 }, { "epoch": 1.1141536881738834, "grad_norm": 21.160249710083008, "learning_rate": 2.365756989305469e-05, "loss": 1.4879, "step": 6510 }, { "epoch": 1.1143248331336641, "grad_norm": 17.153186798095703, "learning_rate": 2.3648158426496556e-05, "loss": 2.1674, "step": 6511 }, { "epoch": 1.1144959780934451, "grad_norm": 21.04764175415039, "learning_rate": 2.3638741857512063e-05, "loss": 1.7981, "step": 6512 }, { "epoch": 1.1146671230532261, "grad_norm": 0.6828592419624329, "learning_rate": 2.3629320191657012e-05, "loss": 0.1629, "step": 6513 }, { "epoch": 1.1148382680130071, "grad_norm": 17.496509552001953, "learning_rate": 2.3619893434490194e-05, "loss": 1.4577, "step": 6514 }, { "epoch": 1.1150094129727879, "grad_norm": 18.580137252807617, "learning_rate": 2.3610461591573408e-05, "loss": 1.6204, "step": 6515 }, { "epoch": 1.1151805579325689, "grad_norm": 24.80152702331543, "learning_rate": 2.3601024668471462e-05, "loss": 2.0046, "step": 6516 }, { "epoch": 1.1153517028923499, "grad_norm": 14.06851577758789, "learning_rate": 2.359158267075215e-05, "loss": 1.3292, "step": 6517 }, { "epoch": 1.1155228478521308, "grad_norm": 31.47075653076172, "learning_rate": 2.3582135603986267e-05, "loss": 5.1749, "step": 6518 }, { "epoch": 1.1156939928119116, "grad_norm": 19.141130447387695, "learning_rate": 2.3572683473747593e-05, "loss": 1.5537, "step": 6519 }, { "epoch": 1.1158651377716926, "grad_norm": 20.078285217285156, "learning_rate": 2.35632262856129e-05, "loss": 1.798, "step": 6520 }, { "epoch": 1.1160362827314736, "grad_norm": 25.23849105834961, "learning_rate": 2.3553764045161943e-05, "loss": 2.3348, "step": 6521 }, { "epoch": 1.1162074276912546, "grad_norm": 0.8329445719718933, "learning_rate": 2.3544296757977465e-05, "loss": 0.162, "step": 6522 }, { "epoch": 1.1163785726510354, "grad_norm": 3.9662768840789795, "learning_rate": 2.3534824429645163e-05, "loss": 0.5279, "step": 6523 }, { "epoch": 1.1165497176108163, 
"grad_norm": 16.716726303100586, "learning_rate": 2.352534706575374e-05, "loss": 1.6044, "step": 6524 }, { "epoch": 1.1167208625705973, "grad_norm": 0.695058286190033, "learning_rate": 2.3515864671894853e-05, "loss": 0.1527, "step": 6525 }, { "epoch": 1.1168920075303783, "grad_norm": 13.73619270324707, "learning_rate": 2.3506377253663125e-05, "loss": 1.3362, "step": 6526 }, { "epoch": 1.117063152490159, "grad_norm": 23.782678604125977, "learning_rate": 2.3496884816656145e-05, "loss": 0.9277, "step": 6527 }, { "epoch": 1.11723429744994, "grad_norm": 14.975417137145996, "learning_rate": 2.348738736647447e-05, "loss": 1.1147, "step": 6528 }, { "epoch": 1.117405442409721, "grad_norm": 24.206743240356445, "learning_rate": 2.3477884908721605e-05, "loss": 3.5489, "step": 6529 }, { "epoch": 1.117576587369502, "grad_norm": 14.4812650680542, "learning_rate": 2.3468377449004014e-05, "loss": 1.1981, "step": 6530 }, { "epoch": 1.1177477323292828, "grad_norm": 18.3228702545166, "learning_rate": 2.3458864992931118e-05, "loss": 1.5354, "step": 6531 }, { "epoch": 1.1179188772890638, "grad_norm": 8.11532211303711, "learning_rate": 2.3449347546115275e-05, "loss": 0.8612, "step": 6532 }, { "epoch": 1.1180900222488448, "grad_norm": 22.54753303527832, "learning_rate": 2.34398251141718e-05, "loss": 1.9849, "step": 6533 }, { "epoch": 1.1182611672086258, "grad_norm": 8.902283668518066, "learning_rate": 2.343029770271893e-05, "loss": 0.74, "step": 6534 }, { "epoch": 1.1184323121684066, "grad_norm": 20.976781845092773, "learning_rate": 2.3420765317377864e-05, "loss": 1.906, "step": 6535 }, { "epoch": 1.1186034571281875, "grad_norm": 18.906484603881836, "learning_rate": 2.3411227963772713e-05, "loss": 2.3091, "step": 6536 }, { "epoch": 1.1187746020879685, "grad_norm": 13.609125137329102, "learning_rate": 2.340168564753054e-05, "loss": 1.0348, "step": 6537 }, { "epoch": 1.1189457470477495, "grad_norm": 17.64080047607422, "learning_rate": 2.339213837428132e-05, "loss": 1.4005, "step": 6538 }, { 
"epoch": 1.1191168920075303, "grad_norm": 12.737320899963379, "learning_rate": 2.338258614965796e-05, "loss": 1.1334, "step": 6539 }, { "epoch": 1.1192880369673113, "grad_norm": 4.324669361114502, "learning_rate": 2.3373028979296286e-05, "loss": 0.4197, "step": 6540 }, { "epoch": 1.1194591819270923, "grad_norm": 24.128999710083008, "learning_rate": 2.336346686883504e-05, "loss": 3.4262, "step": 6541 }, { "epoch": 1.1196303268868733, "grad_norm": 8.432863235473633, "learning_rate": 2.3353899823915887e-05, "loss": 0.7029, "step": 6542 }, { "epoch": 1.119801471846654, "grad_norm": 21.715240478515625, "learning_rate": 2.3344327850183395e-05, "loss": 1.9738, "step": 6543 }, { "epoch": 1.119972616806435, "grad_norm": 15.863061904907227, "learning_rate": 2.3334750953285044e-05, "loss": 1.254, "step": 6544 }, { "epoch": 1.120143761766216, "grad_norm": 13.353907585144043, "learning_rate": 2.3325169138871214e-05, "loss": 0.964, "step": 6545 }, { "epoch": 1.120314906725997, "grad_norm": 11.292017936706543, "learning_rate": 2.3315582412595195e-05, "loss": 0.841, "step": 6546 }, { "epoch": 1.1204860516857778, "grad_norm": 12.882715225219727, "learning_rate": 2.3305990780113163e-05, "loss": 0.9168, "step": 6547 }, { "epoch": 1.1206571966455587, "grad_norm": 2.7862045764923096, "learning_rate": 2.3296394247084206e-05, "loss": 0.3106, "step": 6548 }, { "epoch": 1.1208283416053397, "grad_norm": 2.7119204998016357, "learning_rate": 2.328679281917028e-05, "loss": 0.3005, "step": 6549 }, { "epoch": 1.1209994865651207, "grad_norm": 17.22968864440918, "learning_rate": 2.327718650203624e-05, "loss": 1.2516, "step": 6550 }, { "epoch": 1.1211706315249015, "grad_norm": 11.979629516601562, "learning_rate": 2.3267575301349852e-05, "loss": 0.9529, "step": 6551 }, { "epoch": 1.1213417764846825, "grad_norm": 103.80817413330078, "learning_rate": 2.3257959222781708e-05, "loss": 8.1736, "step": 6552 }, { "epoch": 1.1215129214444635, "grad_norm": 23.488616943359375, "learning_rate": 
2.3248338272005332e-05, "loss": 3.1435, "step": 6553 }, { "epoch": 1.1216840664042445, "grad_norm": 23.113727569580078, "learning_rate": 2.323871245469709e-05, "loss": 3.0527, "step": 6554 }, { "epoch": 1.1218552113640254, "grad_norm": 20.720781326293945, "learning_rate": 2.3229081776536224e-05, "loss": 2.6047, "step": 6555 }, { "epoch": 1.1220263563238062, "grad_norm": 23.30956268310547, "learning_rate": 2.3219446243204853e-05, "loss": 2.7747, "step": 6556 }, { "epoch": 1.1221975012835872, "grad_norm": 20.627832412719727, "learning_rate": 2.3209805860387956e-05, "loss": 2.2667, "step": 6557 }, { "epoch": 1.1223686462433682, "grad_norm": 13.168161392211914, "learning_rate": 2.3200160633773374e-05, "loss": 1.1686, "step": 6558 }, { "epoch": 1.122539791203149, "grad_norm": 0.711658239364624, "learning_rate": 2.3190510569051806e-05, "loss": 0.1682, "step": 6559 }, { "epoch": 1.12271093616293, "grad_norm": 16.090927124023438, "learning_rate": 2.31808556719168e-05, "loss": 1.4364, "step": 6560 }, { "epoch": 1.122882081122711, "grad_norm": 23.919294357299805, "learning_rate": 2.3171195948064766e-05, "loss": 3.6051, "step": 6561 }, { "epoch": 1.123053226082492, "grad_norm": 21.282033920288086, "learning_rate": 2.316153140319495e-05, "loss": 2.1329, "step": 6562 }, { "epoch": 1.123224371042273, "grad_norm": 17.056264877319336, "learning_rate": 2.315186204300945e-05, "loss": 1.4888, "step": 6563 }, { "epoch": 1.1233955160020537, "grad_norm": 3.378387689590454, "learning_rate": 2.3142187873213202e-05, "loss": 0.3428, "step": 6564 }, { "epoch": 1.1235666609618347, "grad_norm": 5.910409450531006, "learning_rate": 2.313250889951398e-05, "loss": 0.6965, "step": 6565 }, { "epoch": 1.1237378059216157, "grad_norm": 15.086389541625977, "learning_rate": 2.3122825127622397e-05, "loss": 1.2835, "step": 6566 }, { "epoch": 1.1239089508813966, "grad_norm": 13.626158714294434, "learning_rate": 2.311313656325189e-05, "loss": 0.9983, "step": 6567 }, { "epoch": 1.1240800958411774, 
"grad_norm": 25.3629207611084, "learning_rate": 2.3103443212118728e-05, "loss": 3.3747, "step": 6568 }, { "epoch": 1.1242512408009584, "grad_norm": 4.067588806152344, "learning_rate": 2.3093745079942e-05, "loss": 0.3772, "step": 6569 }, { "epoch": 1.1244223857607394, "grad_norm": 2.650106906890869, "learning_rate": 2.3084042172443615e-05, "loss": 0.4619, "step": 6570 }, { "epoch": 1.1245935307205204, "grad_norm": 23.48897933959961, "learning_rate": 2.307433449534831e-05, "loss": 3.1404, "step": 6571 }, { "epoch": 1.1247646756803011, "grad_norm": 18.580120086669922, "learning_rate": 2.3064622054383628e-05, "loss": 1.5701, "step": 6572 }, { "epoch": 1.1249358206400821, "grad_norm": 9.494951248168945, "learning_rate": 2.3054904855279924e-05, "loss": 0.648, "step": 6573 }, { "epoch": 1.1251069655998631, "grad_norm": 24.08775520324707, "learning_rate": 2.304518290377035e-05, "loss": 2.1751, "step": 6574 }, { "epoch": 1.125278110559644, "grad_norm": 21.949031829833984, "learning_rate": 2.3035456205590884e-05, "loss": 2.5299, "step": 6575 }, { "epoch": 1.1254492555194249, "grad_norm": 15.016345024108887, "learning_rate": 2.3025724766480288e-05, "loss": 1.218, "step": 6576 }, { "epoch": 1.1256204004792059, "grad_norm": 23.55092430114746, "learning_rate": 2.301598859218013e-05, "loss": 2.9416, "step": 6577 }, { "epoch": 1.1257915454389869, "grad_norm": 8.029715538024902, "learning_rate": 2.300624768843476e-05, "loss": 0.5919, "step": 6578 }, { "epoch": 1.1259626903987678, "grad_norm": 13.06380844116211, "learning_rate": 2.2996502060991327e-05, "loss": 1.0507, "step": 6579 }, { "epoch": 1.1261338353585486, "grad_norm": 15.745100021362305, "learning_rate": 2.2986751715599767e-05, "loss": 1.4026, "step": 6580 }, { "epoch": 1.1263049803183296, "grad_norm": 6.315836429595947, "learning_rate": 2.2976996658012805e-05, "loss": 0.5273, "step": 6581 }, { "epoch": 1.1264761252781106, "grad_norm": 13.709003448486328, "learning_rate": 2.296723689398593e-05, "loss": 1.1541, "step": 6582 
}, { "epoch": 1.1266472702378916, "grad_norm": 19.77781867980957, "learning_rate": 2.295747242927742e-05, "loss": 1.7555, "step": 6583 }, { "epoch": 1.1268184151976723, "grad_norm": 19.866291046142578, "learning_rate": 2.2947703269648323e-05, "loss": 2.42, "step": 6584 }, { "epoch": 1.1269895601574533, "grad_norm": 2.5817923545837402, "learning_rate": 2.293792942086246e-05, "loss": 0.3039, "step": 6585 }, { "epoch": 1.1271607051172343, "grad_norm": 24.967405319213867, "learning_rate": 2.2928150888686418e-05, "loss": 3.0059, "step": 6586 }, { "epoch": 1.1273318500770153, "grad_norm": 15.376130104064941, "learning_rate": 2.2918367678889542e-05, "loss": 1.2198, "step": 6587 }, { "epoch": 1.127502995036796, "grad_norm": 23.588638305664062, "learning_rate": 2.290857979724394e-05, "loss": 2.7913, "step": 6588 }, { "epoch": 1.127674139996577, "grad_norm": 13.602372169494629, "learning_rate": 2.289878724952448e-05, "loss": 1.3017, "step": 6589 }, { "epoch": 1.127845284956358, "grad_norm": 12.126798629760742, "learning_rate": 2.2888990041508775e-05, "loss": 0.86, "step": 6590 }, { "epoch": 1.128016429916139, "grad_norm": 25.87506866455078, "learning_rate": 2.28791881789772e-05, "loss": 2.8479, "step": 6591 }, { "epoch": 1.1281875748759198, "grad_norm": 2.148634195327759, "learning_rate": 2.2869381667712853e-05, "loss": 0.3244, "step": 6592 }, { "epoch": 1.1283587198357008, "grad_norm": 23.239213943481445, "learning_rate": 2.2859570513501617e-05, "loss": 2.9426, "step": 6593 }, { "epoch": 1.1285298647954818, "grad_norm": 15.510618209838867, "learning_rate": 2.2849754722132058e-05, "loss": 1.3888, "step": 6594 }, { "epoch": 1.1287010097552628, "grad_norm": 5.460433006286621, "learning_rate": 2.2839934299395526e-05, "loss": 0.3742, "step": 6595 }, { "epoch": 1.1288721547150438, "grad_norm": 15.822589874267578, "learning_rate": 2.283010925108608e-05, "loss": 1.7936, "step": 6596 }, { "epoch": 1.1290432996748245, "grad_norm": 19.703838348388672, "learning_rate": 
2.2820279583000514e-05, "loss": 2.0802, "step": 6597 }, { "epoch": 1.1292144446346055, "grad_norm": 5.518841743469238, "learning_rate": 2.2810445300938342e-05, "loss": 0.4586, "step": 6598 }, { "epoch": 1.1293855895943865, "grad_norm": 15.560166358947754, "learning_rate": 2.2800606410701813e-05, "loss": 1.2736, "step": 6599 }, { "epoch": 1.1295567345541673, "grad_norm": 1.1018235683441162, "learning_rate": 2.279076291809588e-05, "loss": 0.2273, "step": 6600 }, { "epoch": 1.1297278795139483, "grad_norm": 6.1512064933776855, "learning_rate": 2.2780914828928223e-05, "loss": 0.5965, "step": 6601 }, { "epoch": 1.1298990244737293, "grad_norm": 7.797390937805176, "learning_rate": 2.277106214900923e-05, "loss": 0.8642, "step": 6602 }, { "epoch": 1.1300701694335102, "grad_norm": 4.10001277923584, "learning_rate": 2.2761204884151997e-05, "loss": 0.3231, "step": 6603 }, { "epoch": 1.1302413143932912, "grad_norm": 5.270298957824707, "learning_rate": 2.2751343040172314e-05, "loss": 0.4003, "step": 6604 }, { "epoch": 1.130412459353072, "grad_norm": 12.963715553283691, "learning_rate": 2.2741476622888697e-05, "loss": 1.1548, "step": 6605 }, { "epoch": 1.130583604312853, "grad_norm": 49.15763473510742, "learning_rate": 2.2731605638122342e-05, "loss": 6.4104, "step": 6606 }, { "epoch": 1.130754749272634, "grad_norm": 17.410070419311523, "learning_rate": 2.2721730091697142e-05, "loss": 2.0867, "step": 6607 }, { "epoch": 1.1309258942324147, "grad_norm": 25.885358810424805, "learning_rate": 2.271184998943969e-05, "loss": 5.2408, "step": 6608 }, { "epoch": 1.1310970391921957, "grad_norm": 17.080461502075195, "learning_rate": 2.2701965337179254e-05, "loss": 1.3972, "step": 6609 }, { "epoch": 1.1312681841519767, "grad_norm": 37.435577392578125, "learning_rate": 2.26920761407478e-05, "loss": 1.8542, "step": 6610 }, { "epoch": 1.1314393291117577, "grad_norm": 16.12346649169922, "learning_rate": 2.2682182405979963e-05, "loss": 1.6343, "step": 6611 }, { "epoch": 1.1316104740715387, 
"grad_norm": 5.131670951843262, "learning_rate": 2.2672284138713066e-05, "loss": 0.3993, "step": 6612 }, { "epoch": 1.1317816190313195, "grad_norm": 16.831790924072266, "learning_rate": 2.2662381344787106e-05, "loss": 1.0834, "step": 6613 }, { "epoch": 1.1319527639911005, "grad_norm": 2.409867286682129, "learning_rate": 2.265247403004473e-05, "loss": 0.1754, "step": 6614 }, { "epoch": 1.1321239089508814, "grad_norm": 25.386507034301758, "learning_rate": 2.264256220033128e-05, "loss": 2.5655, "step": 6615 }, { "epoch": 1.1322950539106622, "grad_norm": 5.757489204406738, "learning_rate": 2.263264586149475e-05, "loss": 0.5487, "step": 6616 }, { "epoch": 1.1324661988704432, "grad_norm": 10.322013854980469, "learning_rate": 2.2622725019385794e-05, "loss": 0.7429, "step": 6617 }, { "epoch": 1.1326373438302242, "grad_norm": 22.98650550842285, "learning_rate": 2.2612799679857722e-05, "loss": 2.7043, "step": 6618 }, { "epoch": 1.1328084887900052, "grad_norm": 14.2614164352417, "learning_rate": 2.2602869848766497e-05, "loss": 1.2144, "step": 6619 }, { "epoch": 1.1329796337497862, "grad_norm": 13.950614929199219, "learning_rate": 2.2592935531970742e-05, "loss": 1.2144, "step": 6620 }, { "epoch": 1.133150778709567, "grad_norm": 16.276235580444336, "learning_rate": 2.2582996735331717e-05, "loss": 1.7681, "step": 6621 }, { "epoch": 1.133321923669348, "grad_norm": 22.497079849243164, "learning_rate": 2.2573053464713314e-05, "loss": 2.842, "step": 6622 }, { "epoch": 1.133493068629129, "grad_norm": 20.166521072387695, "learning_rate": 2.2563105725982094e-05, "loss": 1.5608, "step": 6623 }, { "epoch": 1.13366421358891, "grad_norm": 5.348848819732666, "learning_rate": 2.2553153525007227e-05, "loss": 0.3772, "step": 6624 }, { "epoch": 1.1338353585486907, "grad_norm": 9.403623580932617, "learning_rate": 2.2543196867660534e-05, "loss": 0.9717, "step": 6625 }, { "epoch": 1.1340065035084717, "grad_norm": 5.1142497062683105, "learning_rate": 2.2533235759816454e-05, "loss": 0.4947, "step": 
6626 }, { "epoch": 1.1341776484682526, "grad_norm": 14.59904956817627, "learning_rate": 2.2523270207352046e-05, "loss": 1.3381, "step": 6627 }, { "epoch": 1.1343487934280336, "grad_norm": 54.968746185302734, "learning_rate": 2.251330021614702e-05, "loss": 6.1858, "step": 6628 }, { "epoch": 1.1345199383878144, "grad_norm": 23.6578311920166, "learning_rate": 2.250332579208367e-05, "loss": 1.9513, "step": 6629 }, { "epoch": 1.1346910833475954, "grad_norm": 6.994755268096924, "learning_rate": 2.249334694104693e-05, "loss": 0.7519, "step": 6630 }, { "epoch": 1.1348622283073764, "grad_norm": 17.739059448242188, "learning_rate": 2.2483363668924332e-05, "loss": 1.5266, "step": 6631 }, { "epoch": 1.1350333732671574, "grad_norm": 18.796348571777344, "learning_rate": 2.2473375981606027e-05, "loss": 1.5246, "step": 6632 }, { "epoch": 1.1352045182269381, "grad_norm": 14.201302528381348, "learning_rate": 2.246338388498476e-05, "loss": 1.2143, "step": 6633 }, { "epoch": 1.1353756631867191, "grad_norm": 25.871665954589844, "learning_rate": 2.245338738495588e-05, "loss": 5.4664, "step": 6634 }, { "epoch": 1.1355468081465, "grad_norm": 13.160964012145996, "learning_rate": 2.2443386487417345e-05, "loss": 0.9775, "step": 6635 }, { "epoch": 1.135717953106281, "grad_norm": 18.273212432861328, "learning_rate": 2.2433381198269694e-05, "loss": 1.9524, "step": 6636 }, { "epoch": 1.1358890980660619, "grad_norm": 12.089645385742188, "learning_rate": 2.2423371523416068e-05, "loss": 1.1928, "step": 6637 }, { "epoch": 1.1360602430258429, "grad_norm": 19.688276290893555, "learning_rate": 2.2413357468762182e-05, "loss": 2.09, "step": 6638 }, { "epoch": 1.1362313879856238, "grad_norm": 20.712020874023438, "learning_rate": 2.2403339040216348e-05, "loss": 2.2678, "step": 6639 }, { "epoch": 1.1364025329454048, "grad_norm": 4.0690693855285645, "learning_rate": 2.239331624368946e-05, "loss": 0.3117, "step": 6640 }, { "epoch": 1.1365736779051856, "grad_norm": 14.07337474822998, "learning_rate": 
2.2383289085094966e-05, "loss": 1.1598, "step": 6641 }, { "epoch": 1.1367448228649666, "grad_norm": 21.29363441467285, "learning_rate": 2.2373257570348917e-05, "loss": 1.1275, "step": 6642 }, { "epoch": 1.1369159678247476, "grad_norm": 11.619843482971191, "learning_rate": 2.2363221705369926e-05, "loss": 1.3518, "step": 6643 }, { "epoch": 1.1370871127845286, "grad_norm": 16.24192237854004, "learning_rate": 2.2353181496079156e-05, "loss": 1.3699, "step": 6644 }, { "epoch": 1.1372582577443096, "grad_norm": 20.99095916748047, "learning_rate": 2.234313694840035e-05, "loss": 1.9589, "step": 6645 }, { "epoch": 1.1374294027040903, "grad_norm": 12.040764808654785, "learning_rate": 2.2333088068259812e-05, "loss": 0.842, "step": 6646 }, { "epoch": 1.1376005476638713, "grad_norm": 62.060768127441406, "learning_rate": 2.2323034861586392e-05, "loss": 6.5968, "step": 6647 }, { "epoch": 1.1377716926236523, "grad_norm": 14.743884086608887, "learning_rate": 2.2312977334311492e-05, "loss": 1.351, "step": 6648 }, { "epoch": 1.137942837583433, "grad_norm": 16.910991668701172, "learning_rate": 2.2302915492369072e-05, "loss": 1.6767, "step": 6649 }, { "epoch": 1.138113982543214, "grad_norm": 25.29916763305664, "learning_rate": 2.2292849341695637e-05, "loss": 2.5186, "step": 6650 }, { "epoch": 1.138285127502995, "grad_norm": 10.531685829162598, "learning_rate": 2.2282778888230224e-05, "loss": 0.8632, "step": 6651 }, { "epoch": 1.138456272462776, "grad_norm": 36.49444580078125, "learning_rate": 2.227270413791442e-05, "loss": 5.9198, "step": 6652 }, { "epoch": 1.138627417422557, "grad_norm": 2.5288755893707275, "learning_rate": 2.226262509669235e-05, "loss": 0.2905, "step": 6653 }, { "epoch": 1.1387985623823378, "grad_norm": 1.1701017618179321, "learning_rate": 2.225254177051065e-05, "loss": 0.2359, "step": 6654 }, { "epoch": 1.1389697073421188, "grad_norm": 0.9904645085334778, "learning_rate": 2.2242454165318507e-05, "loss": 0.169, "step": 6655 }, { "epoch": 1.1391408523018998, 
"grad_norm": 18.088199615478516, "learning_rate": 2.223236228706761e-05, "loss": 1.7993, "step": 6656 }, { "epoch": 1.1393119972616805, "grad_norm": 25.48227882385254, "learning_rate": 2.2222266141712196e-05, "loss": 3.7581, "step": 6657 }, { "epoch": 1.1394831422214615, "grad_norm": 17.415260314941406, "learning_rate": 2.2212165735209e-05, "loss": 1.6087, "step": 6658 }, { "epoch": 1.1396542871812425, "grad_norm": 12.266166687011719, "learning_rate": 2.2202061073517285e-05, "loss": 1.001, "step": 6659 }, { "epoch": 1.1398254321410235, "grad_norm": 10.487218856811523, "learning_rate": 2.21919521625988e-05, "loss": 0.927, "step": 6660 }, { "epoch": 1.1399965771008045, "grad_norm": 15.511195182800293, "learning_rate": 2.2181839008417832e-05, "loss": 1.3514, "step": 6661 }, { "epoch": 1.1401677220605853, "grad_norm": 19.942005157470703, "learning_rate": 2.217172161694115e-05, "loss": 1.5787, "step": 6662 }, { "epoch": 1.1403388670203662, "grad_norm": 21.329761505126953, "learning_rate": 2.2161599994138035e-05, "loss": 2.9634, "step": 6663 }, { "epoch": 1.1405100119801472, "grad_norm": 3.755098342895508, "learning_rate": 2.2151474145980255e-05, "loss": 0.3287, "step": 6664 }, { "epoch": 1.140681156939928, "grad_norm": 5.238820552825928, "learning_rate": 2.2141344078442076e-05, "loss": 0.3482, "step": 6665 }, { "epoch": 1.140852301899709, "grad_norm": 21.611568450927734, "learning_rate": 2.2131209797500253e-05, "loss": 2.0606, "step": 6666 }, { "epoch": 1.14102344685949, "grad_norm": 22.647153854370117, "learning_rate": 2.2121071309134033e-05, "loss": 2.3558, "step": 6667 }, { "epoch": 1.141194591819271, "grad_norm": 14.586101531982422, "learning_rate": 2.211092861932513e-05, "loss": 1.4945, "step": 6668 }, { "epoch": 1.141365736779052, "grad_norm": 17.73663330078125, "learning_rate": 2.210078173405775e-05, "loss": 1.4825, "step": 6669 }, { "epoch": 1.1415368817388327, "grad_norm": 8.52775764465332, "learning_rate": 2.209063065931857e-05, "loss": 0.9648, "step": 6670 }, 
{ "epoch": 1.1417080266986137, "grad_norm": 22.95386505126953, "learning_rate": 2.2080475401096743e-05, "loss": 3.0247, "step": 6671 }, { "epoch": 1.1418791716583947, "grad_norm": 22.214920043945312, "learning_rate": 2.2070315965383883e-05, "loss": 2.5544, "step": 6672 }, { "epoch": 1.1420503166181757, "grad_norm": 12.52978515625, "learning_rate": 2.2060152358174066e-05, "loss": 1.3018, "step": 6673 }, { "epoch": 1.1422214615779565, "grad_norm": 15.886680603027344, "learning_rate": 2.204998458546385e-05, "loss": 1.4821, "step": 6674 }, { "epoch": 1.1423926065377374, "grad_norm": 19.420021057128906, "learning_rate": 2.2039812653252227e-05, "loss": 1.6811, "step": 6675 }, { "epoch": 1.1425637514975184, "grad_norm": 6.738640785217285, "learning_rate": 2.202963656754065e-05, "loss": 0.4493, "step": 6676 }, { "epoch": 1.1427348964572994, "grad_norm": 15.929643630981445, "learning_rate": 2.2019456334333026e-05, "loss": 1.3787, "step": 6677 }, { "epoch": 1.1429060414170802, "grad_norm": 15.84787654876709, "learning_rate": 2.2009271959635712e-05, "loss": 1.482, "step": 6678 }, { "epoch": 1.1430771863768612, "grad_norm": 18.48777961730957, "learning_rate": 2.19990834494575e-05, "loss": 2.077, "step": 6679 }, { "epoch": 1.1432483313366422, "grad_norm": 5.023829936981201, "learning_rate": 2.1988890809809632e-05, "loss": 0.4019, "step": 6680 }, { "epoch": 1.1434194762964232, "grad_norm": 3.970644474029541, "learning_rate": 2.1978694046705773e-05, "loss": 0.3509, "step": 6681 }, { "epoch": 1.143590621256204, "grad_norm": 1.667288064956665, "learning_rate": 2.1968493166162032e-05, "loss": 0.1854, "step": 6682 }, { "epoch": 1.143761766215985, "grad_norm": 21.447723388671875, "learning_rate": 2.1958288174196947e-05, "loss": 3.0801, "step": 6683 }, { "epoch": 1.143932911175766, "grad_norm": 27.4541072845459, "learning_rate": 2.1948079076831472e-05, "loss": 5.3531, "step": 6684 }, { "epoch": 1.1441040561355469, "grad_norm": 5.7024312019348145, "learning_rate": 2.193786588008899e-05, 
"loss": 0.3647, "step": 6685 }, { "epoch": 1.1442752010953277, "grad_norm": 0.5887414813041687, "learning_rate": 2.1927648589995312e-05, "loss": 0.1526, "step": 6686 }, { "epoch": 1.1444463460551086, "grad_norm": 104.833984375, "learning_rate": 2.1917427212578644e-05, "loss": 6.8034, "step": 6687 }, { "epoch": 1.1446174910148896, "grad_norm": 43.65744400024414, "learning_rate": 2.1907201753869618e-05, "loss": 1.8006, "step": 6688 }, { "epoch": 1.1447886359746706, "grad_norm": 12.72282886505127, "learning_rate": 2.189697221990127e-05, "loss": 1.0824, "step": 6689 }, { "epoch": 1.1449597809344514, "grad_norm": 22.606210708618164, "learning_rate": 2.1886738616709038e-05, "loss": 2.2436, "step": 6690 }, { "epoch": 1.1451309258942324, "grad_norm": 7.099259376525879, "learning_rate": 2.1876500950330766e-05, "loss": 0.4624, "step": 6691 }, { "epoch": 1.1453020708540134, "grad_norm": 102.5890884399414, "learning_rate": 2.186625922680669e-05, "loss": 8.1622, "step": 6692 }, { "epoch": 1.1454732158137944, "grad_norm": 11.016797065734863, "learning_rate": 2.1856013452179443e-05, "loss": 0.7039, "step": 6693 }, { "epoch": 1.1456443607735751, "grad_norm": 15.703362464904785, "learning_rate": 2.1845763632494046e-05, "loss": 1.5509, "step": 6694 }, { "epoch": 1.145815505733356, "grad_norm": 70.36822509765625, "learning_rate": 2.183550977379791e-05, "loss": 8.0359, "step": 6695 }, { "epoch": 1.145986650693137, "grad_norm": 22.124250411987305, "learning_rate": 2.182525188214083e-05, "loss": 2.2052, "step": 6696 }, { "epoch": 1.146157795652918, "grad_norm": 18.040904998779297, "learning_rate": 2.181498996357497e-05, "loss": 2.0674, "step": 6697 }, { "epoch": 1.1463289406126989, "grad_norm": 3.606860399246216, "learning_rate": 2.1804724024154883e-05, "loss": 0.3582, "step": 6698 }, { "epoch": 1.1465000855724798, "grad_norm": 71.82786560058594, "learning_rate": 2.1794454069937485e-05, "loss": 7.7528, "step": 6699 }, { "epoch": 1.1466712305322608, "grad_norm": 35.79607009887695, 
"learning_rate": 2.1784180106982063e-05, "loss": 5.6748, "step": 6700 }, { "epoch": 1.1468423754920418, "grad_norm": 12.36203670501709, "learning_rate": 2.1773902141350277e-05, "loss": 1.2623, "step": 6701 }, { "epoch": 1.1470135204518228, "grad_norm": 18.953035354614258, "learning_rate": 2.1763620179106137e-05, "loss": 1.9056, "step": 6702 }, { "epoch": 1.1471846654116036, "grad_norm": 5.2524189949035645, "learning_rate": 2.175333422631602e-05, "loss": 0.4216, "step": 6703 }, { "epoch": 1.1473558103713846, "grad_norm": 16.611282348632812, "learning_rate": 2.1743044289048647e-05, "loss": 1.5661, "step": 6704 }, { "epoch": 1.1475269553311656, "grad_norm": 20.023853302001953, "learning_rate": 2.1732750373375098e-05, "loss": 1.8265, "step": 6705 }, { "epoch": 1.1476981002909463, "grad_norm": 4.804213523864746, "learning_rate": 2.1722452485368808e-05, "loss": 0.3534, "step": 6706 }, { "epoch": 1.1478692452507273, "grad_norm": 17.737071990966797, "learning_rate": 2.171215063110553e-05, "loss": 1.6854, "step": 6707 }, { "epoch": 1.1480403902105083, "grad_norm": 11.945717811584473, "learning_rate": 2.1701844816663387e-05, "loss": 0.9422, "step": 6708 }, { "epoch": 1.1482115351702893, "grad_norm": 13.96307373046875, "learning_rate": 2.1691535048122818e-05, "loss": 0.9708, "step": 6709 }, { "epoch": 1.1483826801300703, "grad_norm": 35.298011779785156, "learning_rate": 2.1681221331566605e-05, "loss": 1.4859, "step": 6710 }, { "epoch": 1.148553825089851, "grad_norm": 2.048851728439331, "learning_rate": 2.167090367307986e-05, "loss": 0.2684, "step": 6711 }, { "epoch": 1.148724970049632, "grad_norm": 9.77960205078125, "learning_rate": 2.1660582078750006e-05, "loss": 0.5625, "step": 6712 }, { "epoch": 1.148896115009413, "grad_norm": 15.336026191711426, "learning_rate": 2.1650256554666804e-05, "loss": 1.2984, "step": 6713 }, { "epoch": 1.1490672599691938, "grad_norm": 15.743756294250488, "learning_rate": 2.163992710692233e-05, "loss": 1.1362, "step": 6714 }, { "epoch": 
1.1492384049289748, "grad_norm": 17.978227615356445, "learning_rate": 2.1629593741610977e-05, "loss": 1.5398, "step": 6715 }, { "epoch": 1.1494095498887558, "grad_norm": 22.40999412536621, "learning_rate": 2.1619256464829436e-05, "loss": 1.8867, "step": 6716 }, { "epoch": 1.1495806948485368, "grad_norm": 16.83521842956543, "learning_rate": 2.1608915282676728e-05, "loss": 1.4942, "step": 6717 }, { "epoch": 1.1497518398083177, "grad_norm": 23.83708381652832, "learning_rate": 2.1598570201254156e-05, "loss": 2.1358, "step": 6718 }, { "epoch": 1.1499229847680985, "grad_norm": 31.449512481689453, "learning_rate": 2.1588221226665338e-05, "loss": 5.8379, "step": 6719 }, { "epoch": 1.1500941297278795, "grad_norm": 7.079745292663574, "learning_rate": 2.1577868365016182e-05, "loss": 0.675, "step": 6720 }, { "epoch": 1.1502652746876605, "grad_norm": 21.983989715576172, "learning_rate": 2.156751162241489e-05, "loss": 1.9488, "step": 6721 }, { "epoch": 1.1504364196474415, "grad_norm": 17.203046798706055, "learning_rate": 2.155715100497197e-05, "loss": 1.6076, "step": 6722 }, { "epoch": 1.1506075646072222, "grad_norm": 12.79857063293457, "learning_rate": 2.1546786518800182e-05, "loss": 1.0769, "step": 6723 }, { "epoch": 1.1507787095670032, "grad_norm": 8.416800498962402, "learning_rate": 2.1536418170014595e-05, "loss": 0.6304, "step": 6724 }, { "epoch": 1.1509498545267842, "grad_norm": 24.620590209960938, "learning_rate": 2.1526045964732556e-05, "loss": 2.5394, "step": 6725 }, { "epoch": 1.1511209994865652, "grad_norm": 14.570639610290527, "learning_rate": 2.1515669909073675e-05, "loss": 1.0799, "step": 6726 }, { "epoch": 1.151292144446346, "grad_norm": 34.030250549316406, "learning_rate": 2.1505290009159843e-05, "loss": 5.7075, "step": 6727 }, { "epoch": 1.151463289406127, "grad_norm": 0.49403926730155945, "learning_rate": 2.149490627111522e-05, "loss": 0.1496, "step": 6728 }, { "epoch": 1.151634434365908, "grad_norm": 17.340473175048828, "learning_rate": 2.148451870106622e-05, 
"loss": 1.7152, "step": 6729 }, { "epoch": 1.151805579325689, "grad_norm": 10.648886680603027, "learning_rate": 2.1474127305141524e-05, "loss": 0.7918, "step": 6730 }, { "epoch": 1.1519767242854697, "grad_norm": 5.201786518096924, "learning_rate": 2.1463732089472083e-05, "loss": 0.6296, "step": 6731 }, { "epoch": 1.1521478692452507, "grad_norm": 5.574512481689453, "learning_rate": 2.145333306019108e-05, "loss": 0.4166, "step": 6732 }, { "epoch": 1.1523190142050317, "grad_norm": 9.512883186340332, "learning_rate": 2.144293022343396e-05, "loss": 0.553, "step": 6733 }, { "epoch": 1.1524901591648127, "grad_norm": 22.8580379486084, "learning_rate": 2.1432523585338406e-05, "loss": 3.0396, "step": 6734 }, { "epoch": 1.1526613041245934, "grad_norm": 11.338586807250977, "learning_rate": 2.142211315204436e-05, "loss": 1.0252, "step": 6735 }, { "epoch": 1.1528324490843744, "grad_norm": 16.34046173095703, "learning_rate": 2.141169892969399e-05, "loss": 1.2929, "step": 6736 }, { "epoch": 1.1530035940441554, "grad_norm": 0.7246559858322144, "learning_rate": 2.1401280924431694e-05, "loss": 0.1495, "step": 6737 }, { "epoch": 1.1531747390039364, "grad_norm": 5.994287967681885, "learning_rate": 2.1390859142404124e-05, "loss": 0.43, "step": 6738 }, { "epoch": 1.1533458839637172, "grad_norm": 21.98945426940918, "learning_rate": 2.1380433589760144e-05, "loss": 1.791, "step": 6739 }, { "epoch": 1.1535170289234982, "grad_norm": 0.6687926650047302, "learning_rate": 2.1370004272650837e-05, "loss": 0.1457, "step": 6740 }, { "epoch": 1.1536881738832792, "grad_norm": 20.061038970947266, "learning_rate": 2.1359571197229526e-05, "loss": 2.5826, "step": 6741 }, { "epoch": 1.1538593188430601, "grad_norm": 2.983421802520752, "learning_rate": 2.1349134369651732e-05, "loss": 0.2926, "step": 6742 }, { "epoch": 1.154030463802841, "grad_norm": 6.651993751525879, "learning_rate": 2.1338693796075205e-05, "loss": 0.42, "step": 6743 }, { "epoch": 1.154201608762622, "grad_norm": 11.948166847229004, 
"learning_rate": 2.13282494826599e-05, "loss": 1.0267, "step": 6744 }, { "epoch": 1.1543727537224029, "grad_norm": 27.58370590209961, "learning_rate": 2.1317801435567967e-05, "loss": 5.8972, "step": 6745 }, { "epoch": 1.1545438986821839, "grad_norm": 8.554266929626465, "learning_rate": 2.1307349660963782e-05, "loss": 0.5493, "step": 6746 }, { "epoch": 1.1547150436419646, "grad_norm": 18.07694435119629, "learning_rate": 2.12968941650139e-05, "loss": 0.9702, "step": 6747 }, { "epoch": 1.1548861886017456, "grad_norm": 14.886399269104004, "learning_rate": 2.128643495388709e-05, "loss": 1.5393, "step": 6748 }, { "epoch": 1.1550573335615266, "grad_norm": 19.339351654052734, "learning_rate": 2.1275972033754284e-05, "loss": 1.6208, "step": 6749 }, { "epoch": 1.1552284785213076, "grad_norm": 11.952751159667969, "learning_rate": 2.1265505410788633e-05, "loss": 0.77, "step": 6750 }, { "epoch": 1.1553996234810886, "grad_norm": 15.437543869018555, "learning_rate": 2.1255035091165456e-05, "loss": 1.1533, "step": 6751 }, { "epoch": 1.1555707684408694, "grad_norm": 10.964115142822266, "learning_rate": 2.1244561081062262e-05, "loss": 0.8684, "step": 6752 }, { "epoch": 1.1557419134006504, "grad_norm": 3.070553779602051, "learning_rate": 2.123408338665873e-05, "loss": 0.3276, "step": 6753 }, { "epoch": 1.1559130583604313, "grad_norm": 12.972532272338867, "learning_rate": 2.1223602014136712e-05, "loss": 1.2192, "step": 6754 }, { "epoch": 1.1560842033202121, "grad_norm": 18.17832374572754, "learning_rate": 2.1213116969680237e-05, "loss": 1.6733, "step": 6755 }, { "epoch": 1.156255348279993, "grad_norm": 1.0295546054840088, "learning_rate": 2.1202628259475495e-05, "loss": 0.1569, "step": 6756 }, { "epoch": 1.156426493239774, "grad_norm": 16.78033447265625, "learning_rate": 2.119213588971084e-05, "loss": 1.4867, "step": 6757 }, { "epoch": 1.156597638199555, "grad_norm": 2.5894768238067627, "learning_rate": 2.118163986657679e-05, "loss": 0.3114, "step": 6758 }, { "epoch": 
1.156768783159336, "grad_norm": 21.40312385559082, "learning_rate": 2.1171140196266012e-05, "loss": 2.1186, "step": 6759 }, { "epoch": 1.1569399281191168, "grad_norm": 24.32610511779785, "learning_rate": 2.1160636884973322e-05, "loss": 2.1989, "step": 6760 }, { "epoch": 1.1571110730788978, "grad_norm": 8.965644836425781, "learning_rate": 2.1150129938895695e-05, "loss": 1.1102, "step": 6761 }, { "epoch": 1.1572822180386788, "grad_norm": 22.843244552612305, "learning_rate": 2.1139619364232247e-05, "loss": 1.2688, "step": 6762 }, { "epoch": 1.1574533629984596, "grad_norm": 19.694812774658203, "learning_rate": 2.1129105167184227e-05, "loss": 2.3136, "step": 6763 }, { "epoch": 1.1576245079582406, "grad_norm": 3.668747663497925, "learning_rate": 2.111858735395503e-05, "loss": 0.4309, "step": 6764 }, { "epoch": 1.1577956529180216, "grad_norm": 15.669942855834961, "learning_rate": 2.110806593075018e-05, "loss": 1.0937, "step": 6765 }, { "epoch": 1.1579667978778025, "grad_norm": 12.45332145690918, "learning_rate": 2.1097540903777333e-05, "loss": 1.0317, "step": 6766 }, { "epoch": 1.1581379428375835, "grad_norm": 20.682071685791016, "learning_rate": 2.108701227924627e-05, "loss": 2.0858, "step": 6767 }, { "epoch": 1.1583090877973643, "grad_norm": 20.13377571105957, "learning_rate": 2.10764800633689e-05, "loss": 2.533, "step": 6768 }, { "epoch": 1.1584802327571453, "grad_norm": 16.149690628051758, "learning_rate": 2.1065944262359234e-05, "loss": 1.3122, "step": 6769 }, { "epoch": 1.1586513777169263, "grad_norm": 14.939237594604492, "learning_rate": 2.1055404882433428e-05, "loss": 1.205, "step": 6770 }, { "epoch": 1.1588225226767073, "grad_norm": 12.55698299407959, "learning_rate": 2.1044861929809712e-05, "loss": 0.9223, "step": 6771 }, { "epoch": 1.158993667636488, "grad_norm": 18.289710998535156, "learning_rate": 2.103431541070846e-05, "loss": 1.6357, "step": 6772 }, { "epoch": 1.159164812596269, "grad_norm": 16.596107482910156, "learning_rate": 2.102376533135213e-05, 
"loss": 1.1197, "step": 6773 }, { "epoch": 1.15933595755605, "grad_norm": 17.402799606323242, "learning_rate": 2.101321169796528e-05, "loss": 1.4677, "step": 6774 }, { "epoch": 1.159507102515831, "grad_norm": 21.732892990112305, "learning_rate": 2.100265451677457e-05, "loss": 1.8113, "step": 6775 }, { "epoch": 1.1596782474756118, "grad_norm": 15.907790184020996, "learning_rate": 2.0992093794008755e-05, "loss": 1.3312, "step": 6776 }, { "epoch": 1.1598493924353928, "grad_norm": 18.397964477539062, "learning_rate": 2.0981529535898676e-05, "loss": 2.3183, "step": 6777 }, { "epoch": 1.1600205373951737, "grad_norm": 28.13370704650879, "learning_rate": 2.0970961748677267e-05, "loss": 1.3794, "step": 6778 }, { "epoch": 1.1601916823549547, "grad_norm": 0.6629721522331238, "learning_rate": 2.0960390438579528e-05, "loss": 0.1526, "step": 6779 }, { "epoch": 1.1603628273147355, "grad_norm": 20.983154296875, "learning_rate": 2.094981561184255e-05, "loss": 2.3348, "step": 6780 }, { "epoch": 1.1605339722745165, "grad_norm": 13.373003959655762, "learning_rate": 2.09392372747055e-05, "loss": 0.8657, "step": 6781 }, { "epoch": 1.1607051172342975, "grad_norm": 12.408032417297363, "learning_rate": 2.0928655433409614e-05, "loss": 0.9496, "step": 6782 }, { "epoch": 1.1608762621940785, "grad_norm": 28.76824378967285, "learning_rate": 2.0918070094198195e-05, "loss": 3.9569, "step": 6783 }, { "epoch": 1.1610474071538592, "grad_norm": 2.336712121963501, "learning_rate": 2.09074812633166e-05, "loss": 0.2164, "step": 6784 }, { "epoch": 1.1612185521136402, "grad_norm": 18.542802810668945, "learning_rate": 2.0896888947012265e-05, "loss": 1.59, "step": 6785 }, { "epoch": 1.1613896970734212, "grad_norm": 22.71904945373535, "learning_rate": 2.0886293151534663e-05, "loss": 1.7135, "step": 6786 }, { "epoch": 1.1615608420332022, "grad_norm": 21.552074432373047, "learning_rate": 2.0875693883135336e-05, "loss": 1.9509, "step": 6787 }, { "epoch": 1.161731986992983, "grad_norm": 16.57906723022461, 
"learning_rate": 2.0865091148067868e-05, "loss": 1.3288, "step": 6788 }, { "epoch": 1.161903131952764, "grad_norm": 3.9160079956054688, "learning_rate": 2.085448495258789e-05, "loss": 0.303, "step": 6789 }, { "epoch": 1.162074276912545, "grad_norm": 15.484481811523438, "learning_rate": 2.0843875302953064e-05, "loss": 1.2086, "step": 6790 }, { "epoch": 1.162245421872326, "grad_norm": 10.972047805786133, "learning_rate": 2.0833262205423103e-05, "loss": 1.0228, "step": 6791 }, { "epoch": 1.1624165668321067, "grad_norm": 15.809880256652832, "learning_rate": 2.0822645666259758e-05, "loss": 1.0541, "step": 6792 }, { "epoch": 1.1625877117918877, "grad_norm": 15.736574172973633, "learning_rate": 2.0812025691726795e-05, "loss": 1.5631, "step": 6793 }, { "epoch": 1.1627588567516687, "grad_norm": 13.390948295593262, "learning_rate": 2.080140228809002e-05, "loss": 1.1472, "step": 6794 }, { "epoch": 1.1629300017114497, "grad_norm": 18.913244247436523, "learning_rate": 2.079077546161725e-05, "loss": 2.0747, "step": 6795 }, { "epoch": 1.1631011466712304, "grad_norm": 35.074134826660156, "learning_rate": 2.0780145218578337e-05, "loss": 6.1981, "step": 6796 }, { "epoch": 1.1632722916310114, "grad_norm": 15.795833587646484, "learning_rate": 2.076951156524513e-05, "loss": 1.447, "step": 6797 }, { "epoch": 1.1634434365907924, "grad_norm": 21.82452964782715, "learning_rate": 2.0758874507891514e-05, "loss": 1.8627, "step": 6798 }, { "epoch": 1.1636145815505734, "grad_norm": 15.347325325012207, "learning_rate": 2.0748234052793353e-05, "loss": 1.3685, "step": 6799 }, { "epoch": 1.1637857265103544, "grad_norm": 18.640275955200195, "learning_rate": 2.0737590206228544e-05, "loss": 1.7358, "step": 6800 }, { "epoch": 1.1639568714701352, "grad_norm": 15.257899284362793, "learning_rate": 2.0726942974476967e-05, "loss": 1.3593, "step": 6801 }, { "epoch": 1.1641280164299161, "grad_norm": 22.601909637451172, "learning_rate": 2.0716292363820504e-05, "loss": 2.0876, "step": 6802 }, { "epoch": 
1.1642991613896971, "grad_norm": 15.543705940246582, "learning_rate": 2.0705638380543027e-05, "loss": 0.8575, "step": 6803 }, { "epoch": 1.164470306349478, "grad_norm": 16.10650062561035, "learning_rate": 2.0694981030930417e-05, "loss": 1.301, "step": 6804 }, { "epoch": 1.164641451309259, "grad_norm": 14.519549369812012, "learning_rate": 2.068432032127051e-05, "loss": 1.2924, "step": 6805 }, { "epoch": 1.1648125962690399, "grad_norm": 14.380958557128906, "learning_rate": 2.0673656257853148e-05, "loss": 1.3454, "step": 6806 }, { "epoch": 1.1649837412288209, "grad_norm": 18.531217575073242, "learning_rate": 2.0662988846970144e-05, "loss": 1.6529, "step": 6807 }, { "epoch": 1.1651548861886019, "grad_norm": 11.520040512084961, "learning_rate": 2.065231809491528e-05, "loss": 1.0933, "step": 6808 }, { "epoch": 1.1653260311483826, "grad_norm": 11.549054145812988, "learning_rate": 2.064164400798433e-05, "loss": 0.9791, "step": 6809 }, { "epoch": 1.1654971761081636, "grad_norm": 24.388668060302734, "learning_rate": 2.0630966592475006e-05, "loss": 2.5074, "step": 6810 }, { "epoch": 1.1656683210679446, "grad_norm": 2.8475899696350098, "learning_rate": 2.062028585468701e-05, "loss": 0.31, "step": 6811 }, { "epoch": 1.1658394660277254, "grad_norm": 21.80243682861328, "learning_rate": 2.0609601800921984e-05, "loss": 1.9436, "step": 6812 }, { "epoch": 1.1660106109875064, "grad_norm": 14.159065246582031, "learning_rate": 2.0598914437483544e-05, "loss": 1.2508, "step": 6813 }, { "epoch": 1.1661817559472873, "grad_norm": 18.48175811767578, "learning_rate": 2.0588223770677244e-05, "loss": 2.2716, "step": 6814 }, { "epoch": 1.1663529009070683, "grad_norm": 13.813920974731445, "learning_rate": 2.0577529806810595e-05, "loss": 1.0515, "step": 6815 }, { "epoch": 1.1665240458668493, "grad_norm": 11.2750244140625, "learning_rate": 2.0566832552193052e-05, "loss": 0.9192, "step": 6816 }, { "epoch": 1.16669519082663, "grad_norm": 19.322492599487305, "learning_rate": 2.0556132013136013e-05, 
"loss": 1.9372, "step": 6817 }, { "epoch": 1.166866335786411, "grad_norm": 14.883321762084961, "learning_rate": 2.0545428195952814e-05, "loss": 1.647, "step": 6818 }, { "epoch": 1.167037480746192, "grad_norm": 22.583385467529297, "learning_rate": 2.0534721106958715e-05, "loss": 1.8467, "step": 6819 }, { "epoch": 1.1672086257059728, "grad_norm": 7.125509262084961, "learning_rate": 2.0524010752470924e-05, "loss": 0.4615, "step": 6820 }, { "epoch": 1.1673797706657538, "grad_norm": 30.93987274169922, "learning_rate": 2.051329713880856e-05, "loss": 5.5572, "step": 6821 }, { "epoch": 1.1675509156255348, "grad_norm": 10.377395629882812, "learning_rate": 2.050258027229267e-05, "loss": 0.8635, "step": 6822 }, { "epoch": 1.1677220605853158, "grad_norm": 7.3766093254089355, "learning_rate": 2.0491860159246226e-05, "loss": 0.485, "step": 6823 }, { "epoch": 1.1678932055450968, "grad_norm": 7.117273330688477, "learning_rate": 2.0481136805994104e-05, "loss": 0.3838, "step": 6824 }, { "epoch": 1.1680643505048776, "grad_norm": 18.087154388427734, "learning_rate": 2.0470410218863106e-05, "loss": 1.3246, "step": 6825 }, { "epoch": 1.1682354954646585, "grad_norm": 21.010772705078125, "learning_rate": 2.045968040418193e-05, "loss": 0.9737, "step": 6826 }, { "epoch": 1.1684066404244395, "grad_norm": 16.369413375854492, "learning_rate": 2.0448947368281183e-05, "loss": 1.1337, "step": 6827 }, { "epoch": 1.1685777853842205, "grad_norm": 14.726228713989258, "learning_rate": 2.0438211117493374e-05, "loss": 1.2854, "step": 6828 }, { "epoch": 1.1687489303440013, "grad_norm": 20.079736709594727, "learning_rate": 2.0427471658152902e-05, "loss": 2.8156, "step": 6829 }, { "epoch": 1.1689200753037823, "grad_norm": 18.067716598510742, "learning_rate": 2.041672899659607e-05, "loss": 1.5892, "step": 6830 }, { "epoch": 1.1690912202635633, "grad_norm": 24.576330184936523, "learning_rate": 2.0405983139161063e-05, "loss": 5.732, "step": 6831 }, { "epoch": 1.1692623652233443, "grad_norm": 
18.32451057434082, "learning_rate": 2.0395234092187953e-05, "loss": 2.2328, "step": 6832 }, { "epoch": 1.169433510183125, "grad_norm": 22.1658935546875, "learning_rate": 2.038448186201869e-05, "loss": 2.32, "step": 6833 }, { "epoch": 1.169604655142906, "grad_norm": 9.66140365600586, "learning_rate": 2.037372645499711e-05, "loss": 0.6548, "step": 6834 }, { "epoch": 1.169775800102687, "grad_norm": 0.6341655850410461, "learning_rate": 2.0362967877468916e-05, "loss": 0.1526, "step": 6835 }, { "epoch": 1.169946945062468, "grad_norm": 10.674327850341797, "learning_rate": 2.0352206135781683e-05, "loss": 0.9537, "step": 6836 }, { "epoch": 1.1701180900222488, "grad_norm": 8.676131248474121, "learning_rate": 2.0341441236284865e-05, "loss": 1.0683, "step": 6837 }, { "epoch": 1.1702892349820297, "grad_norm": 18.046106338500977, "learning_rate": 2.033067318532976e-05, "loss": 1.4942, "step": 6838 }, { "epoch": 1.1704603799418107, "grad_norm": 14.745805740356445, "learning_rate": 2.0319901989269536e-05, "loss": 1.0318, "step": 6839 }, { "epoch": 1.1706315249015917, "grad_norm": 22.62310028076172, "learning_rate": 2.0309127654459213e-05, "loss": 2.3572, "step": 6840 }, { "epoch": 1.1708026698613725, "grad_norm": 8.969188690185547, "learning_rate": 2.0298350187255666e-05, "loss": 1.2112, "step": 6841 }, { "epoch": 1.1709738148211535, "grad_norm": 21.498462677001953, "learning_rate": 2.0287569594017617e-05, "loss": 2.7483, "step": 6842 }, { "epoch": 1.1711449597809345, "grad_norm": 5.7461981773376465, "learning_rate": 2.0276785881105635e-05, "loss": 0.6604, "step": 6843 }, { "epoch": 1.1713161047407155, "grad_norm": 10.238706588745117, "learning_rate": 2.026599905488212e-05, "loss": 0.912, "step": 6844 }, { "epoch": 1.1714872497004962, "grad_norm": 14.441835403442383, "learning_rate": 2.025520912171132e-05, "loss": 1.1801, "step": 6845 }, { "epoch": 1.1716583946602772, "grad_norm": 19.147550582885742, "learning_rate": 2.024441608795931e-05, "loss": 2.3638, "step": 6846 }, { 
"epoch": 1.1718295396200582, "grad_norm": 6.082062244415283, "learning_rate": 2.0233619959993997e-05, "loss": 0.4729, "step": 6847 }, { "epoch": 1.1720006845798392, "grad_norm": 39.79881286621094, "learning_rate": 2.0222820744185113e-05, "loss": 5.313, "step": 6848 }, { "epoch": 1.1721718295396202, "grad_norm": 21.230663299560547, "learning_rate": 2.0212018446904214e-05, "loss": 1.9576, "step": 6849 }, { "epoch": 1.172342974499401, "grad_norm": 19.801790237426758, "learning_rate": 2.0201213074524664e-05, "loss": 2.0392, "step": 6850 }, { "epoch": 1.172514119459182, "grad_norm": 11.94463062286377, "learning_rate": 2.019040463342165e-05, "loss": 0.8956, "step": 6851 }, { "epoch": 1.172685264418963, "grad_norm": 18.809799194335938, "learning_rate": 2.0179593129972178e-05, "loss": 2.1701, "step": 6852 }, { "epoch": 1.1728564093787437, "grad_norm": 18.971500396728516, "learning_rate": 2.016877857055504e-05, "loss": 1.7869, "step": 6853 }, { "epoch": 1.1730275543385247, "grad_norm": 14.83297061920166, "learning_rate": 2.015796096155085e-05, "loss": 1.3335, "step": 6854 }, { "epoch": 1.1731986992983057, "grad_norm": 21.98466682434082, "learning_rate": 2.0147140309342008e-05, "loss": 1.9541, "step": 6855 }, { "epoch": 1.1733698442580867, "grad_norm": 14.660383224487305, "learning_rate": 2.0136316620312723e-05, "loss": 1.2516, "step": 6856 }, { "epoch": 1.1735409892178676, "grad_norm": 23.2386474609375, "learning_rate": 2.012548990084897e-05, "loss": 5.376, "step": 6857 }, { "epoch": 1.1737121341776484, "grad_norm": 16.182527542114258, "learning_rate": 2.0114660157338545e-05, "loss": 1.6976, "step": 6858 }, { "epoch": 1.1738832791374294, "grad_norm": 14.8099946975708, "learning_rate": 2.0103827396171014e-05, "loss": 1.2173, "step": 6859 }, { "epoch": 1.1740544240972104, "grad_norm": 22.097217559814453, "learning_rate": 2.0092991623737716e-05, "loss": 2.3721, "step": 6860 }, { "epoch": 1.1742255690569912, "grad_norm": 5.707027435302734, "learning_rate": 
2.0082152846431775e-05, "loss": 0.4328, "step": 6861 }, { "epoch": 1.1743967140167721, "grad_norm": 8.321752548217773, "learning_rate": 2.0071311070648083e-05, "loss": 0.635, "step": 6862 }, { "epoch": 1.1745678589765531, "grad_norm": 13.2294282913208, "learning_rate": 2.0060466302783303e-05, "loss": 0.9935, "step": 6863 }, { "epoch": 1.1747390039363341, "grad_norm": 0.649167537689209, "learning_rate": 2.0049618549235873e-05, "loss": 0.1585, "step": 6864 }, { "epoch": 1.1749101488961151, "grad_norm": 17.88936424255371, "learning_rate": 2.0038767816405972e-05, "loss": 1.5718, "step": 6865 }, { "epoch": 1.1750812938558959, "grad_norm": 14.337617874145508, "learning_rate": 2.0027914110695558e-05, "loss": 1.1434, "step": 6866 }, { "epoch": 1.1752524388156769, "grad_norm": 17.345027923583984, "learning_rate": 2.001705743850833e-05, "loss": 1.4073, "step": 6867 }, { "epoch": 1.1754235837754579, "grad_norm": 24.623624801635742, "learning_rate": 2.0006197806249737e-05, "loss": 2.578, "step": 6868 }, { "epoch": 1.1755947287352386, "grad_norm": 21.75896644592285, "learning_rate": 1.9995335220326985e-05, "loss": 1.8332, "step": 6869 }, { "epoch": 1.1757658736950196, "grad_norm": 9.989360809326172, "learning_rate": 1.998446968714901e-05, "loss": 0.9041, "step": 6870 }, { "epoch": 1.1759370186548006, "grad_norm": 18.148115158081055, "learning_rate": 1.99736012131265e-05, "loss": 1.3532, "step": 6871 }, { "epoch": 1.1761081636145816, "grad_norm": 16.34587287902832, "learning_rate": 1.9962729804671868e-05, "loss": 1.2796, "step": 6872 }, { "epoch": 1.1762793085743626, "grad_norm": 8.132262229919434, "learning_rate": 1.995185546819925e-05, "loss": 0.8756, "step": 6873 }, { "epoch": 1.1764504535341433, "grad_norm": 8.244163513183594, "learning_rate": 1.994097821012453e-05, "loss": 0.7777, "step": 6874 }, { "epoch": 1.1766215984939243, "grad_norm": 0.7307465672492981, "learning_rate": 1.9930098036865315e-05, "loss": 0.1506, "step": 6875 }, { "epoch": 1.1767927434537053, "grad_norm": 
26.213361740112305, "learning_rate": 1.991921495484091e-05, "loss": 2.6735, "step": 6876 }, { "epoch": 1.1769638884134863, "grad_norm": 0.6290190815925598, "learning_rate": 1.9908328970472363e-05, "loss": 0.151, "step": 6877 }, { "epoch": 1.177135033373267, "grad_norm": 12.63049602508545, "learning_rate": 1.98974400901824e-05, "loss": 0.8783, "step": 6878 }, { "epoch": 1.177306178333048, "grad_norm": 149.73031616210938, "learning_rate": 1.9886548320395496e-05, "loss": 9.0417, "step": 6879 }, { "epoch": 1.177477323292829, "grad_norm": 5.016288757324219, "learning_rate": 1.9875653667537804e-05, "loss": 0.3981, "step": 6880 }, { "epoch": 1.17764846825261, "grad_norm": 19.39210319519043, "learning_rate": 1.9864756138037188e-05, "loss": 1.8209, "step": 6881 }, { "epoch": 1.1778196132123908, "grad_norm": 14.559964179992676, "learning_rate": 1.9853855738323204e-05, "loss": 1.193, "step": 6882 }, { "epoch": 1.1779907581721718, "grad_norm": 16.556758880615234, "learning_rate": 1.9842952474827102e-05, "loss": 1.4446, "step": 6883 }, { "epoch": 1.1781619031319528, "grad_norm": 12.186580657958984, "learning_rate": 1.9832046353981826e-05, "loss": 0.8997, "step": 6884 }, { "epoch": 1.1783330480917338, "grad_norm": 18.176359176635742, "learning_rate": 1.982113738222201e-05, "loss": 1.4875, "step": 6885 }, { "epoch": 1.1785041930515145, "grad_norm": 14.406021118164062, "learning_rate": 1.981022556598395e-05, "loss": 1.4021, "step": 6886 }, { "epoch": 1.1786753380112955, "grad_norm": 16.016162872314453, "learning_rate": 1.9799310911705654e-05, "loss": 1.7261, "step": 6887 }, { "epoch": 1.1788464829710765, "grad_norm": 20.404844284057617, "learning_rate": 1.978839342582676e-05, "loss": 1.9895, "step": 6888 }, { "epoch": 1.1790176279308575, "grad_norm": 2.6637179851531982, "learning_rate": 1.9777473114788612e-05, "loss": 0.2496, "step": 6889 }, { "epoch": 1.1791887728906383, "grad_norm": 25.14553451538086, "learning_rate": 1.9766549985034213e-05, "loss": 2.5686, "step": 6890 }, { 
"epoch": 1.1793599178504193, "grad_norm": 0.7727463245391846, "learning_rate": 1.9755624043008223e-05, "loss": 0.1432, "step": 6891 }, { "epoch": 1.1795310628102003, "grad_norm": 93.68418884277344, "learning_rate": 1.9744695295156966e-05, "loss": 8.0592, "step": 6892 }, { "epoch": 1.1797022077699812, "grad_norm": 21.706905364990234, "learning_rate": 1.973376374792842e-05, "loss": 2.8132, "step": 6893 }, { "epoch": 1.179873352729762, "grad_norm": 12.288810729980469, "learning_rate": 1.9722829407772208e-05, "loss": 1.0199, "step": 6894 }, { "epoch": 1.180044497689543, "grad_norm": 14.261706352233887, "learning_rate": 1.971189228113961e-05, "loss": 1.2732, "step": 6895 }, { "epoch": 1.180215642649324, "grad_norm": 37.62468719482422, "learning_rate": 1.970095237448355e-05, "loss": 5.9513, "step": 6896 }, { "epoch": 1.180386787609105, "grad_norm": 16.761072158813477, "learning_rate": 1.9690009694258593e-05, "loss": 1.2978, "step": 6897 }, { "epoch": 1.180557932568886, "grad_norm": 24.69586944580078, "learning_rate": 1.9679064246920923e-05, "loss": 5.4448, "step": 6898 }, { "epoch": 1.1807290775286667, "grad_norm": 0.5545434355735779, "learning_rate": 1.9668116038928377e-05, "loss": 0.1292, "step": 6899 }, { "epoch": 1.1809002224884477, "grad_norm": 13.614893913269043, "learning_rate": 1.965716507674042e-05, "loss": 1.2286, "step": 6900 }, { "epoch": 1.1810713674482287, "grad_norm": 17.071117401123047, "learning_rate": 1.964621136681813e-05, "loss": 2.147, "step": 6901 }, { "epoch": 1.1812425124080095, "grad_norm": 18.16098976135254, "learning_rate": 1.963525491562421e-05, "loss": 1.7085, "step": 6902 }, { "epoch": 1.1814136573677905, "grad_norm": 18.80036163330078, "learning_rate": 1.962429572962299e-05, "loss": 1.4874, "step": 6903 }, { "epoch": 1.1815848023275715, "grad_norm": 20.31178855895996, "learning_rate": 1.9613333815280404e-05, "loss": 1.838, "step": 6904 }, { "epoch": 1.1817559472873524, "grad_norm": 15.093534469604492, "learning_rate": 
1.9602369179063987e-05, "loss": 1.3441, "step": 6905 }, { "epoch": 1.1819270922471334, "grad_norm": 1.2397549152374268, "learning_rate": 1.9591401827442904e-05, "loss": 0.1521, "step": 6906 }, { "epoch": 1.1820982372069142, "grad_norm": 6.578031063079834, "learning_rate": 1.9580431766887904e-05, "loss": 0.4449, "step": 6907 }, { "epoch": 1.1822693821666952, "grad_norm": 0.47265729308128357, "learning_rate": 1.956945900387134e-05, "loss": 0.1383, "step": 6908 }, { "epoch": 1.1824405271264762, "grad_norm": 7.973837375640869, "learning_rate": 1.955848354486716e-05, "loss": 0.8364, "step": 6909 }, { "epoch": 1.182611672086257, "grad_norm": 24.104881286621094, "learning_rate": 1.9547505396350893e-05, "loss": 3.1458, "step": 6910 }, { "epoch": 1.182782817046038, "grad_norm": 12.037428855895996, "learning_rate": 1.9536524564799673e-05, "loss": 0.9171, "step": 6911 }, { "epoch": 1.182953962005819, "grad_norm": 27.550291061401367, "learning_rate": 1.95255410566922e-05, "loss": 3.0115, "step": 6912 }, { "epoch": 1.1831251069656, "grad_norm": 33.36637496948242, "learning_rate": 1.951455487850877e-05, "loss": 5.617, "step": 6913 }, { "epoch": 1.183296251925381, "grad_norm": 0.5798065066337585, "learning_rate": 1.950356603673123e-05, "loss": 0.1496, "step": 6914 }, { "epoch": 1.1834673968851617, "grad_norm": 15.993162155151367, "learning_rate": 1.9492574537843024e-05, "loss": 1.5259, "step": 6915 }, { "epoch": 1.1836385418449427, "grad_norm": 16.15716552734375, "learning_rate": 1.948158038832914e-05, "loss": 1.3994, "step": 6916 }, { "epoch": 1.1838096868047236, "grad_norm": 6.515031337738037, "learning_rate": 1.9470583594676164e-05, "loss": 0.5043, "step": 6917 }, { "epoch": 1.1839808317645044, "grad_norm": 16.923002243041992, "learning_rate": 1.9459584163372203e-05, "loss": 1.3346, "step": 6918 }, { "epoch": 1.1841519767242854, "grad_norm": 5.997356414794922, "learning_rate": 1.9448582100906943e-05, "loss": 0.3956, "step": 6919 }, { "epoch": 1.1843231216840664, "grad_norm": 
8.468660354614258, "learning_rate": 1.9437577413771623e-05, "loss": 0.4804, "step": 6920 }, { "epoch": 1.1844942666438474, "grad_norm": 2.0181360244750977, "learning_rate": 1.9426570108459007e-05, "loss": 0.2875, "step": 6921 }, { "epoch": 1.1846654116036284, "grad_norm": 20.663789749145508, "learning_rate": 1.9415560191463444e-05, "loss": 2.119, "step": 6922 }, { "epoch": 1.1848365565634091, "grad_norm": 16.835023880004883, "learning_rate": 1.940454766928079e-05, "loss": 1.2742, "step": 6923 }, { "epoch": 1.1850077015231901, "grad_norm": 15.038171768188477, "learning_rate": 1.9393532548408447e-05, "loss": 1.3088, "step": 6924 }, { "epoch": 1.1851788464829711, "grad_norm": 12.88936996459961, "learning_rate": 1.938251483534536e-05, "loss": 1.0233, "step": 6925 }, { "epoch": 1.185349991442752, "grad_norm": 17.261011123657227, "learning_rate": 1.937149453659199e-05, "loss": 1.4455, "step": 6926 }, { "epoch": 1.1855211364025329, "grad_norm": 18.969623565673828, "learning_rate": 1.9360471658650336e-05, "loss": 1.8209, "step": 6927 }, { "epoch": 1.1856922813623139, "grad_norm": 16.013376235961914, "learning_rate": 1.9349446208023903e-05, "loss": 1.4429, "step": 6928 }, { "epoch": 1.1858634263220948, "grad_norm": 0.6081182360649109, "learning_rate": 1.9338418191217732e-05, "loss": 0.1424, "step": 6929 }, { "epoch": 1.1860345712818758, "grad_norm": 27.403413772583008, "learning_rate": 1.932738761473837e-05, "loss": 4.3191, "step": 6930 }, { "epoch": 1.1862057162416566, "grad_norm": 1.8343784809112549, "learning_rate": 1.9316354485093866e-05, "loss": 0.2407, "step": 6931 }, { "epoch": 1.1863768612014376, "grad_norm": 12.688108444213867, "learning_rate": 1.9305318808793783e-05, "loss": 0.9057, "step": 6932 }, { "epoch": 1.1865480061612186, "grad_norm": 2.889329433441162, "learning_rate": 1.92942805923492e-05, "loss": 0.4465, "step": 6933 }, { "epoch": 1.1867191511209996, "grad_norm": 18.517244338989258, "learning_rate": 1.9283239842272665e-05, "loss": 1.6817, "step": 6934 }, 
{ "epoch": 1.1868902960807803, "grad_norm": 17.190340042114258, "learning_rate": 1.9272196565078245e-05, "loss": 2.0391, "step": 6935 }, { "epoch": 1.1870614410405613, "grad_norm": 22.873920440673828, "learning_rate": 1.9261150767281486e-05, "loss": 1.8831, "step": 6936 }, { "epoch": 1.1872325860003423, "grad_norm": 16.942474365234375, "learning_rate": 1.9250102455399427e-05, "loss": 1.4123, "step": 6937 }, { "epoch": 1.1874037309601233, "grad_norm": 17.837316513061523, "learning_rate": 1.9239051635950588e-05, "loss": 1.5212, "step": 6938 }, { "epoch": 1.187574875919904, "grad_norm": 20.059364318847656, "learning_rate": 1.9227998315454976e-05, "loss": 1.6603, "step": 6939 }, { "epoch": 1.187746020879685, "grad_norm": 22.977336883544922, "learning_rate": 1.9216942500434055e-05, "loss": 2.8594, "step": 6940 }, { "epoch": 1.187917165839466, "grad_norm": 15.848212242126465, "learning_rate": 1.920588419741078e-05, "loss": 1.3736, "step": 6941 }, { "epoch": 1.188088310799247, "grad_norm": 13.386641502380371, "learning_rate": 1.9194823412909562e-05, "loss": 1.0159, "step": 6942 }, { "epoch": 1.1882594557590278, "grad_norm": 4.138679027557373, "learning_rate": 1.9183760153456286e-05, "loss": 0.3584, "step": 6943 }, { "epoch": 1.1884306007188088, "grad_norm": 3.90535306930542, "learning_rate": 1.9172694425578288e-05, "loss": 0.3002, "step": 6944 }, { "epoch": 1.1886017456785898, "grad_norm": 75.9722900390625, "learning_rate": 1.916162623580436e-05, "loss": 5.6064, "step": 6945 }, { "epoch": 1.1887728906383708, "grad_norm": 12.245502471923828, "learning_rate": 1.9150555590664758e-05, "loss": 0.9652, "step": 6946 }, { "epoch": 1.1889440355981515, "grad_norm": 3.0320892333984375, "learning_rate": 1.913948249669117e-05, "loss": 0.3505, "step": 6947 }, { "epoch": 1.1891151805579325, "grad_norm": 12.209064483642578, "learning_rate": 1.912840696041675e-05, "loss": 1.0553, "step": 6948 }, { "epoch": 1.1892863255177135, "grad_norm": 0.6309384703636169, "learning_rate": 
1.9117328988376072e-05, "loss": 0.1423, "step": 6949 }, { "epoch": 1.1894574704774945, "grad_norm": 31.398828506469727, "learning_rate": 1.9106248587105158e-05, "loss": 1.1428, "step": 6950 }, { "epoch": 1.1896286154372753, "grad_norm": 13.952041625976562, "learning_rate": 1.9095165763141463e-05, "loss": 1.3932, "step": 6951 }, { "epoch": 1.1897997603970563, "grad_norm": 18.571762084960938, "learning_rate": 1.9084080523023866e-05, "loss": 1.4609, "step": 6952 }, { "epoch": 1.1899709053568372, "grad_norm": 13.063371658325195, "learning_rate": 1.9072992873292676e-05, "loss": 1.0676, "step": 6953 }, { "epoch": 1.1901420503166182, "grad_norm": 20.675588607788086, "learning_rate": 1.9061902820489628e-05, "loss": 2.3088, "step": 6954 }, { "epoch": 1.1903131952763992, "grad_norm": 18.69338607788086, "learning_rate": 1.9050810371157865e-05, "loss": 2.1148, "step": 6955 }, { "epoch": 1.19048434023618, "grad_norm": 23.922927856445312, "learning_rate": 1.9039715531841946e-05, "loss": 2.4482, "step": 6956 }, { "epoch": 1.190655485195961, "grad_norm": 21.081663131713867, "learning_rate": 1.902861830908785e-05, "loss": 2.1831, "step": 6957 }, { "epoch": 1.190826630155742, "grad_norm": 20.321006774902344, "learning_rate": 1.9017518709442946e-05, "loss": 2.0846, "step": 6958 }, { "epoch": 1.1909977751155227, "grad_norm": 2.0978660583496094, "learning_rate": 1.9006416739456024e-05, "loss": 0.3047, "step": 6959 }, { "epoch": 1.1911689200753037, "grad_norm": 0.8055472373962402, "learning_rate": 1.899531240567726e-05, "loss": 0.1479, "step": 6960 }, { "epoch": 1.1913400650350847, "grad_norm": 22.558717727661133, "learning_rate": 1.8984205714658222e-05, "loss": 2.9147, "step": 6961 }, { "epoch": 1.1915112099948657, "grad_norm": 23.008995056152344, "learning_rate": 1.897309667295188e-05, "loss": 2.2334, "step": 6962 }, { "epoch": 1.1916823549546467, "grad_norm": 19.32323455810547, "learning_rate": 1.8961985287112583e-05, "loss": 2.2991, "step": 6963 }, { "epoch": 1.1918534999144275, 
"grad_norm": 14.19498348236084, "learning_rate": 1.8950871563696058e-05, "loss": 1.4139, "step": 6964 }, { "epoch": 1.1920246448742084, "grad_norm": 15.657463073730469, "learning_rate": 1.893975550925943e-05, "loss": 1.2715, "step": 6965 }, { "epoch": 1.1921957898339894, "grad_norm": 3.751906394958496, "learning_rate": 1.892863713036119e-05, "loss": 0.3137, "step": 6966 }, { "epoch": 1.1923669347937702, "grad_norm": 13.209271430969238, "learning_rate": 1.891751643356119e-05, "loss": 1.2306, "step": 6967 }, { "epoch": 1.1925380797535512, "grad_norm": 14.844564437866211, "learning_rate": 1.8906393425420654e-05, "loss": 1.1242, "step": 6968 }, { "epoch": 1.1927092247133322, "grad_norm": 19.91156768798828, "learning_rate": 1.8895268112502185e-05, "loss": 1.7784, "step": 6969 }, { "epoch": 1.1928803696731132, "grad_norm": 15.727069854736328, "learning_rate": 1.8884140501369725e-05, "loss": 1.2447, "step": 6970 }, { "epoch": 1.1930515146328942, "grad_norm": 24.86445426940918, "learning_rate": 1.887301059858858e-05, "loss": 5.1445, "step": 6971 }, { "epoch": 1.193222659592675, "grad_norm": 11.971946716308594, "learning_rate": 1.886187841072542e-05, "loss": 1.1217, "step": 6972 }, { "epoch": 1.193393804552456, "grad_norm": 6.445328712463379, "learning_rate": 1.885074394434824e-05, "loss": 0.4316, "step": 6973 }, { "epoch": 1.193564949512237, "grad_norm": 6.6650261878967285, "learning_rate": 1.88396072060264e-05, "loss": 0.5171, "step": 6974 }, { "epoch": 1.193736094472018, "grad_norm": 19.152889251708984, "learning_rate": 1.8828468202330588e-05, "loss": 1.4326, "step": 6975 }, { "epoch": 1.1939072394317987, "grad_norm": 23.997262954711914, "learning_rate": 1.8817326939832835e-05, "loss": 3.1506, "step": 6976 }, { "epoch": 1.1940783843915797, "grad_norm": 17.117610931396484, "learning_rate": 1.88061834251065e-05, "loss": 1.6154, "step": 6977 }, { "epoch": 1.1942495293513606, "grad_norm": 14.683150291442871, "learning_rate": 1.879503766472628e-05, "loss": 1.2889, "step": 
6978 }, { "epoch": 1.1944206743111416, "grad_norm": 22.7612361907959, "learning_rate": 1.8783889665268182e-05, "loss": 2.5793, "step": 6979 }, { "epoch": 1.1945918192709224, "grad_norm": 14.981850624084473, "learning_rate": 1.877273943330954e-05, "loss": 1.3815, "step": 6980 }, { "epoch": 1.1947629642307034, "grad_norm": 6.467514991760254, "learning_rate": 1.8761586975429022e-05, "loss": 0.4525, "step": 6981 }, { "epoch": 1.1949341091904844, "grad_norm": 26.042551040649414, "learning_rate": 1.875043229820658e-05, "loss": 2.9506, "step": 6982 }, { "epoch": 1.1951052541502654, "grad_norm": 18.11719512939453, "learning_rate": 1.8739275408223497e-05, "loss": 2.2709, "step": 6983 }, { "epoch": 1.1952763991100461, "grad_norm": 23.576017379760742, "learning_rate": 1.872811631206236e-05, "loss": 1.9806, "step": 6984 }, { "epoch": 1.1954475440698271, "grad_norm": 16.061267852783203, "learning_rate": 1.8716955016307035e-05, "loss": 1.204, "step": 6985 }, { "epoch": 1.195618689029608, "grad_norm": 18.873077392578125, "learning_rate": 1.8705791527542723e-05, "loss": 1.5474, "step": 6986 }, { "epoch": 1.195789833989389, "grad_norm": 11.208637237548828, "learning_rate": 1.8694625852355886e-05, "loss": 0.8615, "step": 6987 }, { "epoch": 1.1959609789491699, "grad_norm": 16.651269912719727, "learning_rate": 1.8683457997334292e-05, "loss": 1.2784, "step": 6988 }, { "epoch": 1.1961321239089509, "grad_norm": 153.02740478515625, "learning_rate": 1.8672287969067002e-05, "loss": 7.5781, "step": 6989 }, { "epoch": 1.1963032688687318, "grad_norm": 0.6318583488464355, "learning_rate": 1.8661115774144333e-05, "loss": 0.1455, "step": 6990 }, { "epoch": 1.1964744138285128, "grad_norm": 18.384014129638672, "learning_rate": 1.86499414191579e-05, "loss": 2.5856, "step": 6991 }, { "epoch": 1.1966455587882936, "grad_norm": 6.968575954437256, "learning_rate": 1.86387649107006e-05, "loss": 0.3881, "step": 6992 }, { "epoch": 1.1968167037480746, "grad_norm": 17.390628814697266, "learning_rate": 
1.862758625536658e-05, "loss": 1.4007, "step": 6993 }, { "epoch": 1.1969878487078556, "grad_norm": 4.716192245483398, "learning_rate": 1.861640545975127e-05, "loss": 0.5143, "step": 6994 }, { "epoch": 1.1971589936676366, "grad_norm": 20.03127670288086, "learning_rate": 1.8605222530451354e-05, "loss": 2.2603, "step": 6995 }, { "epoch": 1.1973301386274173, "grad_norm": 18.754140853881836, "learning_rate": 1.8594037474064767e-05, "loss": 1.5199, "step": 6996 }, { "epoch": 1.1975012835871983, "grad_norm": 19.000524520874023, "learning_rate": 1.858285029719072e-05, "loss": 1.7049, "step": 6997 }, { "epoch": 1.1976724285469793, "grad_norm": 15.121638298034668, "learning_rate": 1.857166100642966e-05, "loss": 1.1847, "step": 6998 }, { "epoch": 1.1978435735067603, "grad_norm": 0.5012795329093933, "learning_rate": 1.8560469608383293e-05, "loss": 0.1359, "step": 6999 }, { "epoch": 1.198014718466541, "grad_norm": 35.48890686035156, "learning_rate": 1.854927610965455e-05, "loss": 5.816, "step": 7000 }, { "epoch": 1.198185863426322, "grad_norm": 2.8732776641845703, "learning_rate": 1.853808051684761e-05, "loss": 0.2891, "step": 7001 }, { "epoch": 1.198357008386103, "grad_norm": 3.7505767345428467, "learning_rate": 1.8526882836567904e-05, "loss": 0.3253, "step": 7002 }, { "epoch": 1.198528153345884, "grad_norm": 19.846040725708008, "learning_rate": 1.8515683075422066e-05, "loss": 1.6157, "step": 7003 }, { "epoch": 1.198699298305665, "grad_norm": 10.942571640014648, "learning_rate": 1.8504481240017977e-05, "loss": 0.9332, "step": 7004 }, { "epoch": 1.1988704432654458, "grad_norm": 0.8276537656784058, "learning_rate": 1.849327733696474e-05, "loss": 0.1593, "step": 7005 }, { "epoch": 1.1990415882252268, "grad_norm": 6.18898344039917, "learning_rate": 1.8482071372872673e-05, "loss": 0.4576, "step": 7006 }, { "epoch": 1.1992127331850078, "grad_norm": 22.084545135498047, "learning_rate": 1.84708633543533e-05, "loss": 1.9511, "step": 7007 }, { "epoch": 1.1993838781447885, "grad_norm": 
33.36897659301758, "learning_rate": 1.8459653288019385e-05, "loss": 1.3345, "step": 7008 }, { "epoch": 1.1995550231045695, "grad_norm": 14.181174278259277, "learning_rate": 1.8448441180484876e-05, "loss": 1.3218, "step": 7009 }, { "epoch": 1.1997261680643505, "grad_norm": 1.7246726751327515, "learning_rate": 1.8437227038364935e-05, "loss": 0.1508, "step": 7010 }, { "epoch": 1.1998973130241315, "grad_norm": 14.34754467010498, "learning_rate": 1.842601086827592e-05, "loss": 1.3846, "step": 7011 }, { "epoch": 1.2000684579839125, "grad_norm": 24.781330108642578, "learning_rate": 1.8414792676835392e-05, "loss": 5.2621, "step": 7012 }, { "epoch": 1.2002396029436933, "grad_norm": 2.6087961196899414, "learning_rate": 1.8403572470662098e-05, "loss": 0.2925, "step": 7013 }, { "epoch": 1.2004107479034742, "grad_norm": 15.484484672546387, "learning_rate": 1.8392350256375975e-05, "loss": 1.1175, "step": 7014 }, { "epoch": 1.2005818928632552, "grad_norm": 9.616670608520508, "learning_rate": 1.838112604059815e-05, "loss": 0.7138, "step": 7015 }, { "epoch": 1.200753037823036, "grad_norm": 16.37866973876953, "learning_rate": 1.8369899829950928e-05, "loss": 1.4261, "step": 7016 }, { "epoch": 1.200753037823036, "eval_nli-pairs_loss": 1.4953010082244873, "eval_nli-pairs_runtime": 4.5627, "eval_nli-pairs_samples_per_second": 43.833, "eval_nli-pairs_steps_per_second": 1.534, "eval_sts-test_pearson_cosine": 0.7630899675412521, "eval_sts-test_pearson_dot": 0.6254421112157904, "eval_sts-test_pearson_euclidean": 0.7580382057295076, "eval_sts-test_pearson_manhattan": 0.7623571616238837, "eval_sts-test_pearson_max": 0.7630899675412521, "eval_sts-test_spearman_cosine": 0.7635056711829842, "eval_sts-test_spearman_dot": 0.6093963604051945, "eval_sts-test_spearman_euclidean": 0.7487226311935559, "eval_sts-test_spearman_manhattan": 0.7547159555492929, "eval_sts-test_spearman_max": 0.7635056711829842, "step": 7016 }, { "epoch": 1.200753037823036, "eval_vitaminc-pairs_loss": 0.8603028655052185, 
"eval_vitaminc-pairs_runtime": 2.8678, "eval_vitaminc-pairs_samples_per_second": 69.74, "eval_vitaminc-pairs_steps_per_second": 2.441, "step": 7016 }, { "epoch": 1.200753037823036, "eval_qnli-contrastive_loss": 1.7720210552215576, "eval_qnli-contrastive_runtime": 0.7553, "eval_qnli-contrastive_samples_per_second": 264.8, "eval_qnli-contrastive_steps_per_second": 9.268, "step": 7016 }, { "epoch": 1.200753037823036, "eval_scitail-pairs-qa_loss": 0.12096086144447327, "eval_scitail-pairs-qa_runtime": 1.8473, "eval_scitail-pairs-qa_samples_per_second": 108.263, "eval_scitail-pairs-qa_steps_per_second": 3.789, "step": 7016 }, { "epoch": 1.200753037823036, "eval_scitail-pairs-pos_loss": 0.6861357092857361, "eval_scitail-pairs-pos_runtime": 2.8905, "eval_scitail-pairs-pos_samples_per_second": 69.193, "eval_scitail-pairs-pos_steps_per_second": 2.422, "step": 7016 }, { "epoch": 1.200753037823036, "eval_xsum-pairs_loss": 0.8077878952026367, "eval_xsum-pairs_runtime": 2.6762, "eval_xsum-pairs_samples_per_second": 65.392, "eval_xsum-pairs_steps_per_second": 2.242, "step": 7016 }, { "epoch": 1.200753037823036, "eval_compression-pairs_loss": 0.26878783106803894, "eval_compression-pairs_runtime": 0.5264, "eval_compression-pairs_samples_per_second": 379.949, "eval_compression-pairs_steps_per_second": 13.298, "step": 7016 }, { "epoch": 1.200753037823036, "eval_sciq_pairs_loss": 0.4642964005470276, "eval_sciq_pairs_runtime": 9.6271, "eval_sciq_pairs_samples_per_second": 20.775, "eval_sciq_pairs_steps_per_second": 0.727, "step": 7016 }, { "epoch": 1.200753037823036, "eval_qasc_pairs_loss": 5.549177169799805, "eval_qasc_pairs_runtime": 2.7374, "eval_qasc_pairs_samples_per_second": 73.063, "eval_qasc_pairs_steps_per_second": 2.557, "step": 7016 }, { "epoch": 1.200753037823036, "eval_openbookqa_pairs_loss": 2.5830934047698975, "eval_openbookqa_pairs_runtime": 0.6669, "eval_openbookqa_pairs_samples_per_second": 103.471, "eval_openbookqa_pairs_steps_per_second": 4.499, "step": 7016 }, { 
"epoch": 1.200753037823036, "eval_msmarco_pairs_loss": 1.228713870048523, "eval_msmarco_pairs_runtime": 4.1215, "eval_msmarco_pairs_samples_per_second": 48.526, "eval_msmarco_pairs_steps_per_second": 1.698, "step": 7016 }, { "epoch": 1.200753037823036, "eval_nq_pairs_loss": 1.4215295314788818, "eval_nq_pairs_runtime": 8.7787, "eval_nq_pairs_samples_per_second": 22.782, "eval_nq_pairs_steps_per_second": 0.797, "step": 7016 }, { "epoch": 1.200753037823036, "eval_trivia_pairs_loss": 1.794838309288025, "eval_trivia_pairs_runtime": 12.9923, "eval_trivia_pairs_samples_per_second": 15.394, "eval_trivia_pairs_steps_per_second": 0.539, "step": 7016 }, { "epoch": 1.200753037823036, "eval_quora_pairs_loss": 0.23021991550922394, "eval_quora_pairs_runtime": 1.5922, "eval_quora_pairs_samples_per_second": 125.616, "eval_quora_pairs_steps_per_second": 4.397, "step": 7016 }, { "epoch": 1.200753037823036, "eval_gooaq_pairs_loss": 0.9434043765068054, "eval_gooaq_pairs_runtime": 2.665, "eval_gooaq_pairs_samples_per_second": 75.047, "eval_gooaq_pairs_steps_per_second": 2.627, "step": 7016 }, { "epoch": 1.200924182782817, "grad_norm": 17.212543487548828, "learning_rate": 1.835867163105778e-05, "loss": 1.3922, "step": 7017 }, { "epoch": 1.201095327742598, "grad_norm": 5.500203609466553, "learning_rate": 1.8347441450543373e-05, "loss": 0.4196, "step": 7018 }, { "epoch": 1.201266472702379, "grad_norm": 21.175825119018555, "learning_rate": 1.833620929503352e-05, "loss": 2.5103, "step": 7019 }, { "epoch": 1.20143761766216, "grad_norm": 23.716127395629883, "learning_rate": 1.8324975171155218e-05, "loss": 2.388, "step": 7020 }, { "epoch": 1.2016087626219407, "grad_norm": 22.948490142822266, "learning_rate": 1.8313739085536613e-05, "loss": 3.5063, "step": 7021 }, { "epoch": 1.2017799075817217, "grad_norm": 21.151857376098633, "learning_rate": 1.830250104480701e-05, "loss": 2.0793, "step": 7022 }, { "epoch": 1.2019510525415027, "grad_norm": 6.868299961090088, "learning_rate": 
1.8291261055596863e-05, "loss": 0.6836, "step": 7023 }, { "epoch": 1.2021221975012835, "grad_norm": 11.07442569732666, "learning_rate": 1.82800191245378e-05, "loss": 1.1184, "step": 7024 }, { "epoch": 1.2022933424610645, "grad_norm": 17.20339012145996, "learning_rate": 1.8268775258262567e-05, "loss": 1.8645, "step": 7025 }, { "epoch": 1.2024644874208454, "grad_norm": 8.099205017089844, "learning_rate": 1.8257529463405063e-05, "loss": 1.0813, "step": 7026 }, { "epoch": 1.2026356323806264, "grad_norm": 10.576147079467773, "learning_rate": 1.8246281746600325e-05, "loss": 1.1647, "step": 7027 }, { "epoch": 1.2028067773404074, "grad_norm": 23.977031707763672, "learning_rate": 1.823503211448451e-05, "loss": 2.1884, "step": 7028 }, { "epoch": 1.2029779223001882, "grad_norm": 17.264558792114258, "learning_rate": 1.8223780573694943e-05, "loss": 1.5955, "step": 7029 }, { "epoch": 1.2031490672599692, "grad_norm": 3.9181175231933594, "learning_rate": 1.8212527130870032e-05, "loss": 0.3104, "step": 7030 }, { "epoch": 1.2033202122197502, "grad_norm": 28.664770126342773, "learning_rate": 1.820127179264933e-05, "loss": 5.3973, "step": 7031 }, { "epoch": 1.2034913571795312, "grad_norm": 15.21064281463623, "learning_rate": 1.81900145656735e-05, "loss": 1.5112, "step": 7032 }, { "epoch": 1.203662502139312, "grad_norm": 86.03211975097656, "learning_rate": 1.8178755456584325e-05, "loss": 7.2344, "step": 7033 }, { "epoch": 1.203833647099093, "grad_norm": 3.1570894718170166, "learning_rate": 1.8167494472024694e-05, "loss": 0.2625, "step": 7034 }, { "epoch": 1.204004792058874, "grad_norm": 14.07559585571289, "learning_rate": 1.815623161863861e-05, "loss": 1.3018, "step": 7035 }, { "epoch": 1.2041759370186549, "grad_norm": 17.405441284179688, "learning_rate": 1.814496690307117e-05, "loss": 1.9948, "step": 7036 }, { "epoch": 1.2043470819784357, "grad_norm": 4.415936470031738, "learning_rate": 1.8133700331968572e-05, "loss": 0.3861, "step": 7037 }, { "epoch": 1.2045182269382166, "grad_norm": 
18.057254791259766, "learning_rate": 1.8122431911978114e-05, "loss": 1.4661, "step": 7038 }, { "epoch": 1.2046893718979976, "grad_norm": 9.419878005981445, "learning_rate": 1.811116164974817e-05, "loss": 0.5912, "step": 7039 }, { "epoch": 1.2048605168577786, "grad_norm": 4.697745323181152, "learning_rate": 1.809988955192822e-05, "loss": 0.3498, "step": 7040 }, { "epoch": 1.2050316618175594, "grad_norm": 13.607497215270996, "learning_rate": 1.808861562516882e-05, "loss": 1.0024, "step": 7041 }, { "epoch": 1.2052028067773404, "grad_norm": 17.339616775512695, "learning_rate": 1.8077339876121604e-05, "loss": 1.295, "step": 7042 }, { "epoch": 1.2053739517371214, "grad_norm": 21.256227493286133, "learning_rate": 1.8066062311439275e-05, "loss": 1.472, "step": 7043 }, { "epoch": 1.2055450966969024, "grad_norm": 13.709125518798828, "learning_rate": 1.8054782937775607e-05, "loss": 0.9876, "step": 7044 }, { "epoch": 1.2057162416566831, "grad_norm": 21.412246704101562, "learning_rate": 1.804350176178546e-05, "loss": 1.8479, "step": 7045 }, { "epoch": 1.205887386616464, "grad_norm": 15.640390396118164, "learning_rate": 1.803221879012475e-05, "loss": 1.6526, "step": 7046 }, { "epoch": 1.206058531576245, "grad_norm": 22.768157958984375, "learning_rate": 1.802093402945043e-05, "loss": 1.0484, "step": 7047 }, { "epoch": 1.206229676536026, "grad_norm": 12.367416381835938, "learning_rate": 1.8009647486420535e-05, "loss": 1.1592, "step": 7048 }, { "epoch": 1.2064008214958069, "grad_norm": 11.553193092346191, "learning_rate": 1.799835916769414e-05, "loss": 0.9264, "step": 7049 }, { "epoch": 1.2065719664555878, "grad_norm": 16.40020751953125, "learning_rate": 1.7987069079931363e-05, "loss": 1.5556, "step": 7050 }, { "epoch": 1.2067431114153688, "grad_norm": 15.786770820617676, "learning_rate": 1.7975777229793386e-05, "loss": 1.2909, "step": 7051 }, { "epoch": 1.2069142563751498, "grad_norm": 23.682838439941406, "learning_rate": 1.7964483623942413e-05, "loss": 2.1692, "step": 7052 }, { 
"epoch": 1.2070854013349308, "grad_norm": 17.22081184387207, "learning_rate": 1.7953188269041686e-05, "loss": 1.985, "step": 7053 }, { "epoch": 1.2072565462947116, "grad_norm": 33.2260627746582, "learning_rate": 1.794189117175548e-05, "loss": 5.3859, "step": 7054 }, { "epoch": 1.2074276912544926, "grad_norm": 12.188201904296875, "learning_rate": 1.7930592338749095e-05, "loss": 1.0235, "step": 7055 }, { "epoch": 1.2075988362142736, "grad_norm": 20.444673538208008, "learning_rate": 1.7919291776688875e-05, "loss": 2.1585, "step": 7056 }, { "epoch": 1.2077699811740543, "grad_norm": 19.24138832092285, "learning_rate": 1.7907989492242157e-05, "loss": 1.724, "step": 7057 }, { "epoch": 1.2079411261338353, "grad_norm": 3.7893197536468506, "learning_rate": 1.7896685492077302e-05, "loss": 0.3529, "step": 7058 }, { "epoch": 1.2081122710936163, "grad_norm": 12.841107368469238, "learning_rate": 1.7885379782863695e-05, "loss": 1.0369, "step": 7059 }, { "epoch": 1.2082834160533973, "grad_norm": 0.8358324766159058, "learning_rate": 1.7874072371271714e-05, "loss": 0.1581, "step": 7060 }, { "epoch": 1.2084545610131783, "grad_norm": 10.016582489013672, "learning_rate": 1.786276326397276e-05, "loss": 0.8467, "step": 7061 }, { "epoch": 1.208625705972959, "grad_norm": 14.59321403503418, "learning_rate": 1.7851452467639218e-05, "loss": 1.0185, "step": 7062 }, { "epoch": 1.20879685093274, "grad_norm": 18.010021209716797, "learning_rate": 1.784013998894447e-05, "loss": 1.4267, "step": 7063 }, { "epoch": 1.208967995892521, "grad_norm": 15.748735427856445, "learning_rate": 1.7828825834562897e-05, "loss": 1.1944, "step": 7064 }, { "epoch": 1.2091391408523018, "grad_norm": 14.062028884887695, "learning_rate": 1.7817510011169865e-05, "loss": 1.2241, "step": 7065 }, { "epoch": 1.2093102858120828, "grad_norm": 18.22197151184082, "learning_rate": 1.7806192525441734e-05, "loss": 1.3791, "step": 7066 }, { "epoch": 1.2094814307718638, "grad_norm": 6.379945755004883, "learning_rate": 
1.7794873384055832e-05, "loss": 0.4092, "step": 7067 }, { "epoch": 1.2096525757316448, "grad_norm": 2.92317795753479, "learning_rate": 1.778355259369047e-05, "loss": 0.2682, "step": 7068 }, { "epoch": 1.2098237206914257, "grad_norm": 19.18209457397461, "learning_rate": 1.7772230161024935e-05, "loss": 1.602, "step": 7069 }, { "epoch": 1.2099948656512065, "grad_norm": 13.44565486907959, "learning_rate": 1.776090609273947e-05, "loss": 1.0468, "step": 7070 }, { "epoch": 1.2101660106109875, "grad_norm": 20.34478187561035, "learning_rate": 1.77495803955153e-05, "loss": 2.189, "step": 7071 }, { "epoch": 1.2103371555707685, "grad_norm": 2.485504388809204, "learning_rate": 1.7738253076034604e-05, "loss": 0.2954, "step": 7072 }, { "epoch": 1.2105083005305493, "grad_norm": 17.274112701416016, "learning_rate": 1.7726924140980513e-05, "loss": 1.3465, "step": 7073 }, { "epoch": 1.2106794454903302, "grad_norm": 13.669708251953125, "learning_rate": 1.771559359703712e-05, "loss": 1.1574, "step": 7074 }, { "epoch": 1.2108505904501112, "grad_norm": 26.553890228271484, "learning_rate": 1.7704261450889457e-05, "loss": 1.2326, "step": 7075 }, { "epoch": 1.2110217354098922, "grad_norm": 20.33255386352539, "learning_rate": 1.769292770922351e-05, "loss": 1.7637, "step": 7076 }, { "epoch": 1.2111928803696732, "grad_norm": 13.613040924072266, "learning_rate": 1.7681592378726203e-05, "loss": 1.3363, "step": 7077 }, { "epoch": 1.211364025329454, "grad_norm": 14.0830078125, "learning_rate": 1.76702554660854e-05, "loss": 1.0899, "step": 7078 }, { "epoch": 1.211535170289235, "grad_norm": 12.427796363830566, "learning_rate": 1.7658916977989897e-05, "loss": 0.8537, "step": 7079 }, { "epoch": 1.211706315249016, "grad_norm": 55.171695709228516, "learning_rate": 1.7647576921129415e-05, "loss": 6.3338, "step": 7080 }, { "epoch": 1.211877460208797, "grad_norm": 23.50286293029785, "learning_rate": 1.7636235302194604e-05, "loss": 2.6936, "step": 7081 }, { "epoch": 1.2120486051685777, "grad_norm": 
14.088695526123047, "learning_rate": 1.762489212787704e-05, "loss": 1.2116, "step": 7082 }, { "epoch": 1.2122197501283587, "grad_norm": 5.875244617462158, "learning_rate": 1.7613547404869208e-05, "loss": 0.4451, "step": 7083 }, { "epoch": 1.2123908950881397, "grad_norm": 5.248507022857666, "learning_rate": 1.7602201139864518e-05, "loss": 0.3901, "step": 7084 }, { "epoch": 1.2125620400479207, "grad_norm": 16.395782470703125, "learning_rate": 1.7590853339557276e-05, "loss": 1.8082, "step": 7085 }, { "epoch": 1.2127331850077014, "grad_norm": 11.277172088623047, "learning_rate": 1.757950401064271e-05, "loss": 0.9103, "step": 7086 }, { "epoch": 1.2129043299674824, "grad_norm": 16.459196090698242, "learning_rate": 1.756815315981693e-05, "loss": 1.3341, "step": 7087 }, { "epoch": 1.2130754749272634, "grad_norm": 23.725290298461914, "learning_rate": 1.755680079377696e-05, "loss": 2.6328, "step": 7088 }, { "epoch": 1.2132466198870444, "grad_norm": 18.98377227783203, "learning_rate": 1.754544691922072e-05, "loss": 1.3258, "step": 7089 }, { "epoch": 1.2134177648468252, "grad_norm": 18.941240310668945, "learning_rate": 1.7534091542847005e-05, "loss": 1.8892, "step": 7090 }, { "epoch": 1.2135889098066062, "grad_norm": 0.5720072984695435, "learning_rate": 1.75227346713555e-05, "loss": 0.146, "step": 7091 }, { "epoch": 1.2137600547663872, "grad_norm": 10.331928253173828, "learning_rate": 1.7511376311446785e-05, "loss": 0.7565, "step": 7092 }, { "epoch": 1.2139311997261681, "grad_norm": 14.877519607543945, "learning_rate": 1.75000164698223e-05, "loss": 1.0429, "step": 7093 }, { "epoch": 1.214102344685949, "grad_norm": 23.970779418945312, "learning_rate": 1.748865515318438e-05, "loss": 3.479, "step": 7094 }, { "epoch": 1.21427348964573, "grad_norm": 16.405920028686523, "learning_rate": 1.7477292368236214e-05, "loss": 1.2732, "step": 7095 }, { "epoch": 1.2144446346055109, "grad_norm": 10.476763725280762, "learning_rate": 1.7465928121681858e-05, "loss": 0.9867, "step": 7096 }, { 
"epoch": 1.2146157795652919, "grad_norm": 7.040361404418945, "learning_rate": 1.7454562420226242e-05, "loss": 0.682, "step": 7097 }, { "epoch": 1.2147869245250726, "grad_norm": 22.748382568359375, "learning_rate": 1.7443195270575136e-05, "loss": 2.2859, "step": 7098 }, { "epoch": 1.2149580694848536, "grad_norm": 15.852254867553711, "learning_rate": 1.7431826679435186e-05, "loss": 1.3293, "step": 7099 }, { "epoch": 1.2151292144446346, "grad_norm": 14.472835540771484, "learning_rate": 1.742045665351387e-05, "loss": 1.1619, "step": 7100 }, { "epoch": 1.2153003594044156, "grad_norm": 29.56837272644043, "learning_rate": 1.7409085199519524e-05, "loss": 6.0521, "step": 7101 }, { "epoch": 1.2154715043641966, "grad_norm": 21.541980743408203, "learning_rate": 1.7397712324161322e-05, "loss": 2.0102, "step": 7102 }, { "epoch": 1.2156426493239774, "grad_norm": 17.5698184967041, "learning_rate": 1.7386338034149276e-05, "loss": 1.315, "step": 7103 }, { "epoch": 1.2158137942837584, "grad_norm": 17.530529022216797, "learning_rate": 1.737496233619424e-05, "loss": 1.2779, "step": 7104 }, { "epoch": 1.2159849392435393, "grad_norm": 9.821117401123047, "learning_rate": 1.7363585237007886e-05, "loss": 0.8769, "step": 7105 }, { "epoch": 1.21615608420332, "grad_norm": 0.9964603185653687, "learning_rate": 1.735220674330272e-05, "loss": 0.1617, "step": 7106 }, { "epoch": 1.216327229163101, "grad_norm": 18.4915714263916, "learning_rate": 1.7340826861792072e-05, "loss": 1.6341, "step": 7107 }, { "epoch": 1.216498374122882, "grad_norm": 109.80364227294922, "learning_rate": 1.732944559919009e-05, "loss": 8.1499, "step": 7108 }, { "epoch": 1.216669519082663, "grad_norm": 22.95547866821289, "learning_rate": 1.7318062962211734e-05, "loss": 2.3982, "step": 7109 }, { "epoch": 1.216840664042444, "grad_norm": 15.885193824768066, "learning_rate": 1.7306678957572778e-05, "loss": 0.9601, "step": 7110 }, { "epoch": 1.2170118090022248, "grad_norm": 13.969958305358887, "learning_rate": 1.72952935919898e-05, 
"loss": 1.2441, "step": 7111 }, { "epoch": 1.2171829539620058, "grad_norm": 0.7053208947181702, "learning_rate": 1.7283906872180185e-05, "loss": 0.1422, "step": 7112 }, { "epoch": 1.2173540989217868, "grad_norm": 20.290607452392578, "learning_rate": 1.7272518804862115e-05, "loss": 2.6781, "step": 7113 }, { "epoch": 1.2175252438815676, "grad_norm": 4.332746982574463, "learning_rate": 1.726112939675456e-05, "loss": 0.3709, "step": 7114 }, { "epoch": 1.2176963888413486, "grad_norm": 16.232742309570312, "learning_rate": 1.72497386545773e-05, "loss": 1.2398, "step": 7115 }, { "epoch": 1.2178675338011296, "grad_norm": 12.56082534790039, "learning_rate": 1.7238346585050878e-05, "loss": 1.0542, "step": 7116 }, { "epoch": 1.2180386787609105, "grad_norm": 18.8144474029541, "learning_rate": 1.722695319489664e-05, "loss": 1.6548, "step": 7117 }, { "epoch": 1.2182098237206915, "grad_norm": 14.830066680908203, "learning_rate": 1.7215558490836708e-05, "loss": 1.4143, "step": 7118 }, { "epoch": 1.2183809686804723, "grad_norm": 14.560158729553223, "learning_rate": 1.7204162479593954e-05, "loss": 0.8984, "step": 7119 }, { "epoch": 1.2185521136402533, "grad_norm": 19.058626174926758, "learning_rate": 1.7192765167892057e-05, "loss": 1.8648, "step": 7120 }, { "epoch": 1.2187232586000343, "grad_norm": 21.232755661010742, "learning_rate": 1.7181366562455445e-05, "loss": 1.7785, "step": 7121 }, { "epoch": 1.218894403559815, "grad_norm": 19.12632942199707, "learning_rate": 1.7169966670009313e-05, "loss": 1.607, "step": 7122 }, { "epoch": 1.219065548519596, "grad_norm": 4.814243316650391, "learning_rate": 1.7158565497279616e-05, "loss": 0.3387, "step": 7123 }, { "epoch": 1.219236693479377, "grad_norm": 13.105286598205566, "learning_rate": 1.714716305099306e-05, "loss": 0.941, "step": 7124 }, { "epoch": 1.219407838439158, "grad_norm": 22.902633666992188, "learning_rate": 1.7135759337877103e-05, "loss": 2.8356, "step": 7125 }, { "epoch": 1.219578983398939, "grad_norm": 15.38435173034668, 
"learning_rate": 1.7124354364659955e-05, "loss": 0.9884, "step": 7126 }, { "epoch": 1.2197501283587198, "grad_norm": 12.530359268188477, "learning_rate": 1.711294813807057e-05, "loss": 1.004, "step": 7127 }, { "epoch": 1.2199212733185008, "grad_norm": 0.6558112502098083, "learning_rate": 1.7101540664838635e-05, "loss": 0.1397, "step": 7128 }, { "epoch": 1.2200924182782817, "grad_norm": 13.729168891906738, "learning_rate": 1.7090131951694577e-05, "loss": 1.0953, "step": 7129 }, { "epoch": 1.2202635632380627, "grad_norm": 6.121183395385742, "learning_rate": 1.7078722005369552e-05, "loss": 0.3995, "step": 7130 }, { "epoch": 1.2204347081978435, "grad_norm": 19.05879020690918, "learning_rate": 1.7067310832595453e-05, "loss": 2.6498, "step": 7131 }, { "epoch": 1.2206058531576245, "grad_norm": 8.5386962890625, "learning_rate": 1.705589844010488e-05, "loss": 1.0223, "step": 7132 }, { "epoch": 1.2207769981174055, "grad_norm": 10.45544719696045, "learning_rate": 1.7044484834631167e-05, "loss": 1.5029, "step": 7133 }, { "epoch": 1.2209481430771865, "grad_norm": 17.10525894165039, "learning_rate": 1.703307002290836e-05, "loss": 1.3586, "step": 7134 }, { "epoch": 1.2211192880369672, "grad_norm": 4.275002956390381, "learning_rate": 1.70216540116712e-05, "loss": 0.3468, "step": 7135 }, { "epoch": 1.2212904329967482, "grad_norm": 16.066173553466797, "learning_rate": 1.7010236807655172e-05, "loss": 1.4318, "step": 7136 }, { "epoch": 1.2214615779565292, "grad_norm": 24.005477905273438, "learning_rate": 1.699881841759643e-05, "loss": 3.3863, "step": 7137 }, { "epoch": 1.2216327229163102, "grad_norm": 18.728178024291992, "learning_rate": 1.6987398848231845e-05, "loss": 1.5824, "step": 7138 }, { "epoch": 1.221803867876091, "grad_norm": 12.951663970947266, "learning_rate": 1.6975978106298984e-05, "loss": 1.1184, "step": 7139 }, { "epoch": 1.221975012835872, "grad_norm": 10.898070335388184, "learning_rate": 1.6964556198536093e-05, "loss": 1.1159, "step": 7140 }, { "epoch": 
1.222146157795653, "grad_norm": 19.487085342407227, "learning_rate": 1.6953133131682116e-05, "loss": 2.0669, "step": 7141 }, { "epoch": 1.222317302755434, "grad_norm": 20.508235931396484, "learning_rate": 1.6941708912476684e-05, "loss": 1.7454, "step": 7142 }, { "epoch": 1.2224884477152147, "grad_norm": 15.410446166992188, "learning_rate": 1.6930283547660102e-05, "loss": 1.3581, "step": 7143 }, { "epoch": 1.2226595926749957, "grad_norm": 22.360820770263672, "learning_rate": 1.6918857043973357e-05, "loss": 2.0329, "step": 7144 }, { "epoch": 1.2228307376347767, "grad_norm": 97.12826538085938, "learning_rate": 1.6907429408158092e-05, "loss": 8.4468, "step": 7145 }, { "epoch": 1.2230018825945577, "grad_norm": 9.912070274353027, "learning_rate": 1.6896000646956632e-05, "loss": 0.7416, "step": 7146 }, { "epoch": 1.2231730275543384, "grad_norm": 21.225231170654297, "learning_rate": 1.6884570767111972e-05, "loss": 1.7576, "step": 7147 }, { "epoch": 1.2233441725141194, "grad_norm": 23.260570526123047, "learning_rate": 1.6873139775367752e-05, "loss": 3.2608, "step": 7148 }, { "epoch": 1.2235153174739004, "grad_norm": 18.95728874206543, "learning_rate": 1.6861707678468275e-05, "loss": 1.7623, "step": 7149 }, { "epoch": 1.2236864624336814, "grad_norm": 20.349519729614258, "learning_rate": 1.6850274483158495e-05, "loss": 1.5536, "step": 7150 }, { "epoch": 1.2238576073934622, "grad_norm": 2.7749392986297607, "learning_rate": 1.683884019618401e-05, "loss": 0.2806, "step": 7151 }, { "epoch": 1.2240287523532432, "grad_norm": 5.438803195953369, "learning_rate": 1.682740482429107e-05, "loss": 0.3878, "step": 7152 }, { "epoch": 1.2241998973130241, "grad_norm": 4.200693130493164, "learning_rate": 1.6815968374226565e-05, "loss": 0.3669, "step": 7153 }, { "epoch": 1.2243710422728051, "grad_norm": 13.428351402282715, "learning_rate": 1.6804530852738016e-05, "loss": 1.1932, "step": 7154 }, { "epoch": 1.224542187232586, "grad_norm": 11.479659080505371, "learning_rate": 
1.6793092266573576e-05, "loss": 1.0441, "step": 7155 }, { "epoch": 1.2247133321923669, "grad_norm": 7.372691631317139, "learning_rate": 1.6781652622482024e-05, "loss": 0.4833, "step": 7156 }, { "epoch": 1.2248844771521479, "grad_norm": 11.623597145080566, "learning_rate": 1.677021192721277e-05, "loss": 0.8838, "step": 7157 }, { "epoch": 1.2250556221119289, "grad_norm": 1.917174220085144, "learning_rate": 1.675877018751584e-05, "loss": 0.2578, "step": 7158 }, { "epoch": 1.2252267670717099, "grad_norm": 4.783413887023926, "learning_rate": 1.674732741014189e-05, "loss": 0.3817, "step": 7159 }, { "epoch": 1.2253979120314906, "grad_norm": 19.55664825439453, "learning_rate": 1.673588360184216e-05, "loss": 1.3781, "step": 7160 }, { "epoch": 1.2255690569912716, "grad_norm": 9.297699928283691, "learning_rate": 1.6724438769368523e-05, "loss": 0.5528, "step": 7161 }, { "epoch": 1.2257402019510526, "grad_norm": 5.512182235717773, "learning_rate": 1.6712992919473437e-05, "loss": 0.7764, "step": 7162 }, { "epoch": 1.2259113469108334, "grad_norm": 13.277100563049316, "learning_rate": 1.6701546058909978e-05, "loss": 1.1193, "step": 7163 }, { "epoch": 1.2260824918706144, "grad_norm": 14.236153602600098, "learning_rate": 1.6690098194431815e-05, "loss": 1.0366, "step": 7164 }, { "epoch": 1.2262536368303953, "grad_norm": 16.89085578918457, "learning_rate": 1.66786493327932e-05, "loss": 1.3351, "step": 7165 }, { "epoch": 1.2264247817901763, "grad_norm": 4.800353527069092, "learning_rate": 1.6667199480748975e-05, "loss": 0.3385, "step": 7166 }, { "epoch": 1.2265959267499573, "grad_norm": 16.8074893951416, "learning_rate": 1.6655748645054575e-05, "loss": 1.3478, "step": 7167 }, { "epoch": 1.226767071709738, "grad_norm": 0.6463353037834167, "learning_rate": 1.6644296832466e-05, "loss": 0.1442, "step": 7168 }, { "epoch": 1.226938216669519, "grad_norm": 13.736554145812988, "learning_rate": 1.6632844049739856e-05, "loss": 0.979, "step": 7169 }, { "epoch": 1.2271093616293, "grad_norm": 
18.092864990234375, "learning_rate": 1.6621390303633287e-05, "loss": 1.633, "step": 7170 }, { "epoch": 1.2272805065890808, "grad_norm": 14.763882637023926, "learning_rate": 1.6609935600904025e-05, "loss": 1.2596, "step": 7171 }, { "epoch": 1.2274516515488618, "grad_norm": 8.309615135192871, "learning_rate": 1.659847994831036e-05, "loss": 0.9655, "step": 7172 }, { "epoch": 1.2276227965086428, "grad_norm": 12.788700103759766, "learning_rate": 1.6587023352611137e-05, "loss": 1.027, "step": 7173 }, { "epoch": 1.2277939414684238, "grad_norm": 8.727684020996094, "learning_rate": 1.657556582056578e-05, "loss": 0.7562, "step": 7174 }, { "epoch": 1.2279650864282048, "grad_norm": 18.728055953979492, "learning_rate": 1.6564107358934242e-05, "loss": 1.4026, "step": 7175 }, { "epoch": 1.2281362313879856, "grad_norm": 19.143850326538086, "learning_rate": 1.655264797447703e-05, "loss": 1.6007, "step": 7176 }, { "epoch": 1.2283073763477665, "grad_norm": 12.161046028137207, "learning_rate": 1.6541187673955203e-05, "loss": 0.929, "step": 7177 }, { "epoch": 1.2284785213075475, "grad_norm": 6.196813583374023, "learning_rate": 1.6529726464130348e-05, "loss": 0.3963, "step": 7178 }, { "epoch": 1.2286496662673285, "grad_norm": 14.087017059326172, "learning_rate": 1.6518264351764606e-05, "loss": 1.1864, "step": 7179 }, { "epoch": 1.2288208112271093, "grad_norm": 10.506908416748047, "learning_rate": 1.6506801343620635e-05, "loss": 0.8129, "step": 7180 }, { "epoch": 1.2289919561868903, "grad_norm": 13.82801342010498, "learning_rate": 1.6495337446461623e-05, "loss": 1.0856, "step": 7181 }, { "epoch": 1.2291631011466713, "grad_norm": 41.28950500488281, "learning_rate": 1.648387266705129e-05, "loss": 6.4556, "step": 7182 }, { "epoch": 1.2293342461064523, "grad_norm": 18.74609375, "learning_rate": 1.6472407012153877e-05, "loss": 2.0057, "step": 7183 }, { "epoch": 1.229505391066233, "grad_norm": 15.446449279785156, "learning_rate": 1.646094048853413e-05, "loss": 1.3507, "step": 7184 }, { 
"epoch": 1.229676536026014, "grad_norm": 20.220766067504883, "learning_rate": 1.6449473102957327e-05, "loss": 2.4399, "step": 7185 }, { "epoch": 1.229847680985795, "grad_norm": 0.7779582142829895, "learning_rate": 1.6438004862189228e-05, "loss": 0.1509, "step": 7186 }, { "epoch": 1.230018825945576, "grad_norm": 1.8531559705734253, "learning_rate": 1.642653577299612e-05, "loss": 0.2327, "step": 7187 }, { "epoch": 1.2301899709053568, "grad_norm": 0.5356231927871704, "learning_rate": 1.641506584214478e-05, "loss": 0.1463, "step": 7188 }, { "epoch": 1.2303611158651377, "grad_norm": 17.74931526184082, "learning_rate": 1.6403595076402483e-05, "loss": 1.8497, "step": 7189 }, { "epoch": 1.2305322608249187, "grad_norm": 0.6164054870605469, "learning_rate": 1.6392123482537002e-05, "loss": 0.1491, "step": 7190 }, { "epoch": 1.2307034057846997, "grad_norm": 16.2725830078125, "learning_rate": 1.6380651067316598e-05, "loss": 1.2988, "step": 7191 }, { "epoch": 1.2308745507444805, "grad_norm": 14.284119606018066, "learning_rate": 1.6369177837510003e-05, "loss": 1.3086, "step": 7192 }, { "epoch": 1.2310456957042615, "grad_norm": 23.917457580566406, "learning_rate": 1.635770379988645e-05, "loss": 1.668, "step": 7193 }, { "epoch": 1.2312168406640425, "grad_norm": 18.673789978027344, "learning_rate": 1.634622896121562e-05, "loss": 2.3279, "step": 7194 }, { "epoch": 1.2313879856238235, "grad_norm": 14.05659294128418, "learning_rate": 1.6334753328267706e-05, "loss": 1.1137, "step": 7195 }, { "epoch": 1.2315591305836042, "grad_norm": 106.70809936523438, "learning_rate": 1.632327690781334e-05, "loss": 7.4916, "step": 7196 }, { "epoch": 1.2317302755433852, "grad_norm": 1.955734133720398, "learning_rate": 1.631179970662363e-05, "loss": 0.1702, "step": 7197 }, { "epoch": 1.2319014205031662, "grad_norm": 16.24543571472168, "learning_rate": 1.6300321731470136e-05, "loss": 1.4861, "step": 7198 }, { "epoch": 1.2320725654629472, "grad_norm": 5.551748275756836, "learning_rate": 
1.6288842989124883e-05, "loss": 0.5526, "step": 7199 }, { "epoch": 1.232243710422728, "grad_norm": 26.176286697387695, "learning_rate": 1.6277363486360348e-05, "loss": 3.4829, "step": 7200 }, { "epoch": 1.232414855382509, "grad_norm": 1.213494896888733, "learning_rate": 1.626588322994945e-05, "loss": 0.1637, "step": 7201 }, { "epoch": 1.23258600034229, "grad_norm": 13.539944648742676, "learning_rate": 1.6254402226665566e-05, "loss": 0.9396, "step": 7202 }, { "epoch": 1.232757145302071, "grad_norm": 14.775613784790039, "learning_rate": 1.62429204832825e-05, "loss": 1.3005, "step": 7203 }, { "epoch": 1.2329282902618517, "grad_norm": 6.944300174713135, "learning_rate": 1.6231438006574496e-05, "loss": 0.3753, "step": 7204 }, { "epoch": 1.2330994352216327, "grad_norm": 7.613897800445557, "learning_rate": 1.6219954803316233e-05, "loss": 0.4657, "step": 7205 }, { "epoch": 1.2332705801814137, "grad_norm": 19.34268569946289, "learning_rate": 1.620847088028282e-05, "loss": 1.7217, "step": 7206 }, { "epoch": 1.2334417251411947, "grad_norm": 18.5648136138916, "learning_rate": 1.6196986244249786e-05, "loss": 2.1591, "step": 7207 }, { "epoch": 1.2336128701009756, "grad_norm": 8.385674476623535, "learning_rate": 1.6185500901993086e-05, "loss": 0.4739, "step": 7208 }, { "epoch": 1.2337840150607564, "grad_norm": 19.39516258239746, "learning_rate": 1.617401486028909e-05, "loss": 1.554, "step": 7209 }, { "epoch": 1.2339551600205374, "grad_norm": 16.113798141479492, "learning_rate": 1.6162528125914575e-05, "loss": 1.1473, "step": 7210 }, { "epoch": 1.2341263049803184, "grad_norm": 14.98845100402832, "learning_rate": 1.6151040705646737e-05, "loss": 1.1081, "step": 7211 }, { "epoch": 1.2342974499400992, "grad_norm": 16.200519561767578, "learning_rate": 1.6139552606263167e-05, "loss": 1.2292, "step": 7212 }, { "epoch": 1.2344685948998801, "grad_norm": 19.726041793823242, "learning_rate": 1.6128063834541862e-05, "loss": 1.9293, "step": 7213 }, { "epoch": 1.2346397398596611, "grad_norm": 
21.031391143798828, "learning_rate": 1.6116574397261217e-05, "loss": 2.9587, "step": 7214 }, { "epoch": 1.2348108848194421, "grad_norm": 50.508731842041016, "learning_rate": 1.610508430120001e-05, "loss": 7.2049, "step": 7215 }, { "epoch": 1.2349820297792231, "grad_norm": 0.6097683906555176, "learning_rate": 1.6093593553137416e-05, "loss": 0.1377, "step": 7216 }, { "epoch": 1.2351531747390039, "grad_norm": 29.152162551879883, "learning_rate": 1.6082102159853005e-05, "loss": 5.9683, "step": 7217 }, { "epoch": 1.2353243196987849, "grad_norm": 0.45944657921791077, "learning_rate": 1.6070610128126705e-05, "loss": 0.1373, "step": 7218 }, { "epoch": 1.2354954646585659, "grad_norm": 13.421952247619629, "learning_rate": 1.6059117464738833e-05, "loss": 1.136, "step": 7219 }, { "epoch": 1.2356666096183466, "grad_norm": 1.488674521446228, "learning_rate": 1.6047624176470083e-05, "loss": 0.2489, "step": 7220 }, { "epoch": 1.2358377545781276, "grad_norm": 21.020978927612305, "learning_rate": 1.6036130270101503e-05, "loss": 2.125, "step": 7221 }, { "epoch": 1.2360088995379086, "grad_norm": 10.7100248336792, "learning_rate": 1.6024635752414523e-05, "loss": 0.9111, "step": 7222 }, { "epoch": 1.2361800444976896, "grad_norm": 2.771242141723633, "learning_rate": 1.6013140630190924e-05, "loss": 0.295, "step": 7223 }, { "epoch": 1.2363511894574706, "grad_norm": 21.59668731689453, "learning_rate": 1.6001644910212843e-05, "loss": 2.7739, "step": 7224 }, { "epoch": 1.2365223344172513, "grad_norm": 11.862739562988281, "learning_rate": 1.5990148599262772e-05, "loss": 1.1786, "step": 7225 }, { "epoch": 1.2366934793770323, "grad_norm": 2.565239906311035, "learning_rate": 1.5978651704123557e-05, "loss": 0.2939, "step": 7226 }, { "epoch": 1.2368646243368133, "grad_norm": 15.954310417175293, "learning_rate": 1.5967154231578377e-05, "loss": 1.4216, "step": 7227 }, { "epoch": 1.2370357692965943, "grad_norm": 9.40361213684082, "learning_rate": 1.5955656188410756e-05, "loss": 0.7256, "step": 7228 }, 
{ "epoch": 1.237206914256375, "grad_norm": 10.410623550415039, "learning_rate": 1.5944157581404565e-05, "loss": 0.9274, "step": 7229 }, { "epoch": 1.237378059216156, "grad_norm": 16.96190643310547, "learning_rate": 1.5932658417343995e-05, "loss": 2.0504, "step": 7230 }, { "epoch": 1.237549204175937, "grad_norm": 16.113800048828125, "learning_rate": 1.5921158703013566e-05, "loss": 1.396, "step": 7231 }, { "epoch": 1.237720349135718, "grad_norm": 17.308685302734375, "learning_rate": 1.590965844519813e-05, "loss": 1.4615, "step": 7232 }, { "epoch": 1.2378914940954988, "grad_norm": 12.704358100891113, "learning_rate": 1.5898157650682862e-05, "loss": 1.1502, "step": 7233 }, { "epoch": 1.2380626390552798, "grad_norm": 20.537282943725586, "learning_rate": 1.5886656326253237e-05, "loss": 1.7221, "step": 7234 }, { "epoch": 1.2382337840150608, "grad_norm": 0.48646315932273865, "learning_rate": 1.5875154478695064e-05, "loss": 0.1359, "step": 7235 }, { "epoch": 1.2384049289748418, "grad_norm": 30.423362731933594, "learning_rate": 1.5863652114794442e-05, "loss": 6.2909, "step": 7236 }, { "epoch": 1.2385760739346225, "grad_norm": 0.5225798487663269, "learning_rate": 1.585214924133778e-05, "loss": 0.1348, "step": 7237 }, { "epoch": 1.2387472188944035, "grad_norm": 12.045647621154785, "learning_rate": 1.5840645865111804e-05, "loss": 0.9722, "step": 7238 }, { "epoch": 1.2389183638541845, "grad_norm": 20.3051815032959, "learning_rate": 1.5829141992903513e-05, "loss": 1.6011, "step": 7239 }, { "epoch": 1.2390895088139655, "grad_norm": 17.894704818725586, "learning_rate": 1.5817637631500213e-05, "loss": 1.9778, "step": 7240 }, { "epoch": 1.2392606537737463, "grad_norm": 22.45466423034668, "learning_rate": 1.5806132787689492e-05, "loss": 2.9642, "step": 7241 }, { "epoch": 1.2394317987335273, "grad_norm": 2.576204538345337, "learning_rate": 1.5794627468259224e-05, "loss": 0.2153, "step": 7242 }, { "epoch": 1.2396029436933083, "grad_norm": 10.791744232177734, "learning_rate": 
1.5783121679997564e-05, "loss": 1.4476, "step": 7243 }, { "epoch": 1.2397740886530892, "grad_norm": 10.57784366607666, "learning_rate": 1.577161542969295e-05, "loss": 0.9374, "step": 7244 }, { "epoch": 1.23994523361287, "grad_norm": 3.045717477798462, "learning_rate": 1.576010872413408e-05, "loss": 0.3024, "step": 7245 }, { "epoch": 1.240116378572651, "grad_norm": 11.742828369140625, "learning_rate": 1.5748601570109935e-05, "loss": 0.9983, "step": 7246 }, { "epoch": 1.240287523532432, "grad_norm": 14.008795738220215, "learning_rate": 1.573709397440975e-05, "loss": 1.2937, "step": 7247 }, { "epoch": 1.240458668492213, "grad_norm": 26.63939666748047, "learning_rate": 1.5725585943823016e-05, "loss": 3.5139, "step": 7248 }, { "epoch": 1.2406298134519937, "grad_norm": 11.621224403381348, "learning_rate": 1.57140774851395e-05, "loss": 0.84, "step": 7249 }, { "epoch": 1.2408009584117747, "grad_norm": 11.871199607849121, "learning_rate": 1.57025686051492e-05, "loss": 1.0298, "step": 7250 }, { "epoch": 1.2409721033715557, "grad_norm": 19.76806640625, "learning_rate": 1.569105931064238e-05, "loss": 2.4675, "step": 7251 }, { "epoch": 1.2411432483313367, "grad_norm": 16.911815643310547, "learning_rate": 1.567954960840953e-05, "loss": 1.4983, "step": 7252 }, { "epoch": 1.2413143932911175, "grad_norm": 11.281521797180176, "learning_rate": 1.5668039505241407e-05, "loss": 0.8902, "step": 7253 }, { "epoch": 1.2414855382508985, "grad_norm": 14.933623313903809, "learning_rate": 1.565652900792898e-05, "loss": 0.9997, "step": 7254 }, { "epoch": 1.2416566832106795, "grad_norm": 9.2579345703125, "learning_rate": 1.564501812326346e-05, "loss": 0.5748, "step": 7255 }, { "epoch": 1.2418278281704604, "grad_norm": 12.569852828979492, "learning_rate": 1.5633506858036286e-05, "loss": 0.9632, "step": 7256 }, { "epoch": 1.2419989731302414, "grad_norm": 4.103747844696045, "learning_rate": 1.5621995219039122e-05, "loss": 0.2848, "step": 7257 }, { "epoch": 1.2421701180900222, "grad_norm": 
16.797086715698242, "learning_rate": 1.561048321306385e-05, "loss": 1.4364, "step": 7258 }, { "epoch": 1.2423412630498032, "grad_norm": 3.778282642364502, "learning_rate": 1.559897084690257e-05, "loss": 0.3722, "step": 7259 }, { "epoch": 1.2425124080095842, "grad_norm": 8.077884674072266, "learning_rate": 1.5587458127347603e-05, "loss": 0.4795, "step": 7260 }, { "epoch": 1.242683552969365, "grad_norm": 7.148391246795654, "learning_rate": 1.5575945061191467e-05, "loss": 0.5597, "step": 7261 }, { "epoch": 1.242854697929146, "grad_norm": 0.840991199016571, "learning_rate": 1.5564431655226888e-05, "loss": 0.1403, "step": 7262 }, { "epoch": 1.243025842888927, "grad_norm": 21.8636531829834, "learning_rate": 1.5552917916246792e-05, "loss": 3.0481, "step": 7263 }, { "epoch": 1.243196987848708, "grad_norm": 15.274545669555664, "learning_rate": 1.5541403851044294e-05, "loss": 1.2329, "step": 7264 }, { "epoch": 1.243368132808489, "grad_norm": 21.48591423034668, "learning_rate": 1.552988946641272e-05, "loss": 1.768, "step": 7265 }, { "epoch": 1.2435392777682697, "grad_norm": 35.31312561035156, "learning_rate": 1.5518374769145577e-05, "loss": 5.3356, "step": 7266 }, { "epoch": 1.2437104227280507, "grad_norm": 12.25826644897461, "learning_rate": 1.550685976603655e-05, "loss": 0.9782, "step": 7267 }, { "epoch": 1.2438815676878316, "grad_norm": 14.4639892578125, "learning_rate": 1.5495344463879502e-05, "loss": 1.1457, "step": 7268 }, { "epoch": 1.2440527126476124, "grad_norm": 0.5839426517486572, "learning_rate": 1.5483828869468477e-05, "loss": 0.1417, "step": 7269 }, { "epoch": 1.2442238576073934, "grad_norm": 19.122604370117188, "learning_rate": 1.5472312989597703e-05, "loss": 1.7344, "step": 7270 }, { "epoch": 1.2443950025671744, "grad_norm": 19.182523727416992, "learning_rate": 1.5460796831061564e-05, "loss": 2.572, "step": 7271 }, { "epoch": 1.2445661475269554, "grad_norm": 28.435495376586914, "learning_rate": 1.5449280400654607e-05, "loss": 5.9176, "step": 7272 }, { "epoch": 
1.2447372924867364, "grad_norm": 11.444164276123047, "learning_rate": 1.543776370517155e-05, "loss": 0.8763, "step": 7273 }, { "epoch": 1.2449084374465171, "grad_norm": 15.6010160446167, "learning_rate": 1.5426246751407248e-05, "loss": 1.6153, "step": 7274 }, { "epoch": 1.2450795824062981, "grad_norm": 0.541800320148468, "learning_rate": 1.5414729546156723e-05, "loss": 0.1371, "step": 7275 }, { "epoch": 1.2452507273660791, "grad_norm": 17.38156509399414, "learning_rate": 1.5403212096215158e-05, "loss": 1.475, "step": 7276 }, { "epoch": 1.2454218723258599, "grad_norm": 19.85516929626465, "learning_rate": 1.539169440837785e-05, "loss": 1.42, "step": 7277 }, { "epoch": 1.2455930172856409, "grad_norm": 2.570357084274292, "learning_rate": 1.538017648944026e-05, "loss": 0.2723, "step": 7278 }, { "epoch": 1.2457641622454219, "grad_norm": 6.349524974822998, "learning_rate": 1.5368658346197977e-05, "loss": 0.45, "step": 7279 }, { "epoch": 1.2459353072052028, "grad_norm": 33.41670227050781, "learning_rate": 1.5357139985446712e-05, "loss": 5.7086, "step": 7280 }, { "epoch": 1.2461064521649838, "grad_norm": 20.294418334960938, "learning_rate": 1.5345621413982327e-05, "loss": 2.0751, "step": 7281 }, { "epoch": 1.2462775971247646, "grad_norm": 0.47335755825042725, "learning_rate": 1.5334102638600797e-05, "loss": 0.1342, "step": 7282 }, { "epoch": 1.2464487420845456, "grad_norm": 103.37303161621094, "learning_rate": 1.5322583666098214e-05, "loss": 7.0826, "step": 7283 }, { "epoch": 1.2466198870443266, "grad_norm": 11.035674095153809, "learning_rate": 1.5311064503270783e-05, "loss": 0.756, "step": 7284 }, { "epoch": 1.2467910320041076, "grad_norm": 2.611645460128784, "learning_rate": 1.5299545156914833e-05, "loss": 0.2622, "step": 7285 }, { "epoch": 1.2469621769638883, "grad_norm": 18.983016967773438, "learning_rate": 1.528802563382679e-05, "loss": 1.7266, "step": 7286 }, { "epoch": 1.2471333219236693, "grad_norm": 15.634206771850586, "learning_rate": 1.52765059408032e-05, "loss": 
1.177, "step": 7287 }, { "epoch": 1.2473044668834503, "grad_norm": 20.26268768310547, "learning_rate": 1.5264986084640694e-05, "loss": 2.0696, "step": 7288 }, { "epoch": 1.2474756118432313, "grad_norm": 10.667828559875488, "learning_rate": 1.5253466072136002e-05, "loss": 1.0634, "step": 7289 }, { "epoch": 1.247646756803012, "grad_norm": 14.556381225585938, "learning_rate": 1.524194591008595e-05, "loss": 1.0867, "step": 7290 }, { "epoch": 1.247817901762793, "grad_norm": 9.206250190734863, "learning_rate": 1.5230425605287444e-05, "loss": 0.5104, "step": 7291 }, { "epoch": 1.247989046722574, "grad_norm": 21.278926849365234, "learning_rate": 1.5218905164537493e-05, "loss": 1.9708, "step": 7292 }, { "epoch": 1.248160191682355, "grad_norm": 11.876038551330566, "learning_rate": 1.5207384594633174e-05, "loss": 0.917, "step": 7293 }, { "epoch": 1.2483313366421358, "grad_norm": 2.246145725250244, "learning_rate": 1.5195863902371633e-05, "loss": 0.2441, "step": 7294 }, { "epoch": 1.2485024816019168, "grad_norm": 12.991911888122559, "learning_rate": 1.5184343094550094e-05, "loss": 0.9051, "step": 7295 }, { "epoch": 1.2486736265616978, "grad_norm": 17.714027404785156, "learning_rate": 1.517282217796585e-05, "loss": 1.5865, "step": 7296 }, { "epoch": 1.2488447715214788, "grad_norm": 17.127317428588867, "learning_rate": 1.516130115941627e-05, "loss": 1.1275, "step": 7297 }, { "epoch": 1.2490159164812595, "grad_norm": 16.890031814575195, "learning_rate": 1.5149780045698768e-05, "loss": 2.2108, "step": 7298 }, { "epoch": 1.2491870614410405, "grad_norm": 0.5093104839324951, "learning_rate": 1.5138258843610814e-05, "loss": 0.1335, "step": 7299 }, { "epoch": 1.2493582064008215, "grad_norm": 10.212474822998047, "learning_rate": 1.5126737559949937e-05, "loss": 0.9708, "step": 7300 }, { "epoch": 1.2495293513606025, "grad_norm": 53.22825622558594, "learning_rate": 1.511521620151371e-05, "loss": 6.0426, "step": 7301 }, { "epoch": 1.2497004963203833, "grad_norm": 69.67967224121094, 
"learning_rate": 1.5103694775099744e-05, "loss": 7.1124, "step": 7302 }, { "epoch": 1.2498716412801643, "grad_norm": 26.3138427734375, "learning_rate": 1.5092173287505715e-05, "loss": 5.7451, "step": 7303 }, { "epoch": 1.2500427862399452, "grad_norm": 21.674297332763672, "learning_rate": 1.508065174552931e-05, "loss": 1.9393, "step": 7304 }, { "epoch": 1.2502139311997262, "grad_norm": 21.02547836303711, "learning_rate": 1.5069130155968256e-05, "loss": 2.0444, "step": 7305 }, { "epoch": 1.2503850761595072, "grad_norm": 17.467077255249023, "learning_rate": 1.5057608525620305e-05, "loss": 2.0333, "step": 7306 }, { "epoch": 1.250556221119288, "grad_norm": 0.9757794737815857, "learning_rate": 1.5046086861283228e-05, "loss": 0.1458, "step": 7307 }, { "epoch": 1.250727366079069, "grad_norm": 10.240049362182617, "learning_rate": 1.5034565169754846e-05, "loss": 0.727, "step": 7308 }, { "epoch": 1.25089851103885, "grad_norm": 21.112850189208984, "learning_rate": 1.502304345783296e-05, "loss": 1.5793, "step": 7309 }, { "epoch": 1.2510696559986307, "grad_norm": 9.824060440063477, "learning_rate": 1.50115217323154e-05, "loss": 0.7622, "step": 7310 }, { "epoch": 1.2512408009584117, "grad_norm": 24.483760833740234, "learning_rate": 1.5e-05, "loss": 2.0368, "step": 7311 }, { "epoch": 1.2514119459181927, "grad_norm": 14.989441871643066, "learning_rate": 1.49884782676846e-05, "loss": 1.1908, "step": 7312 }, { "epoch": 1.2515830908779737, "grad_norm": 14.618575096130371, "learning_rate": 1.497695654216704e-05, "loss": 1.1606, "step": 7313 }, { "epoch": 1.2517542358377547, "grad_norm": 79.52991485595703, "learning_rate": 1.4965434830245154e-05, "loss": 6.9577, "step": 7314 }, { "epoch": 1.2519253807975355, "grad_norm": 14.076753616333008, "learning_rate": 1.4953913138716775e-05, "loss": 1.0866, "step": 7315 }, { "epoch": 1.2520965257573164, "grad_norm": 17.31161880493164, "learning_rate": 1.4942391474379706e-05, "loss": 1.1286, "step": 7316 }, { "epoch": 1.2522676707170974, 
"grad_norm": 14.135196685791016, "learning_rate": 1.4930869844031755e-05, "loss": 1.2252, "step": 7317 }, { "epoch": 1.2524388156768782, "grad_norm": 25.48478126525879, "learning_rate": 1.4919348254470692e-05, "loss": 1.2945, "step": 7318 }, { "epoch": 1.2526099606366592, "grad_norm": 0.5730566382408142, "learning_rate": 1.4907826712494287e-05, "loss": 0.1365, "step": 7319 }, { "epoch": 1.2527811055964402, "grad_norm": 19.554141998291016, "learning_rate": 1.4896305224900258e-05, "loss": 1.6812, "step": 7320 }, { "epoch": 1.2529522505562212, "grad_norm": 12.831884384155273, "learning_rate": 1.4884783798486301e-05, "loss": 0.9967, "step": 7321 }, { "epoch": 1.2531233955160022, "grad_norm": 7.81449031829834, "learning_rate": 1.4873262440050072e-05, "loss": 0.9633, "step": 7322 }, { "epoch": 1.253294540475783, "grad_norm": 14.348649978637695, "learning_rate": 1.4861741156389186e-05, "loss": 1.1456, "step": 7323 }, { "epoch": 1.253465685435564, "grad_norm": 6.867240905761719, "learning_rate": 1.4850219954301236e-05, "loss": 0.4317, "step": 7324 }, { "epoch": 1.253636830395345, "grad_norm": 17.300615310668945, "learning_rate": 1.4838698840583732e-05, "loss": 1.6941, "step": 7325 }, { "epoch": 1.2538079753551257, "grad_norm": 20.17761993408203, "learning_rate": 1.4827177822034152e-05, "loss": 1.7142, "step": 7326 }, { "epoch": 1.2539791203149067, "grad_norm": 3.2199995517730713, "learning_rate": 1.4815656905449914e-05, "loss": 0.2867, "step": 7327 }, { "epoch": 1.2541502652746876, "grad_norm": 9.007351875305176, "learning_rate": 1.4804136097628372e-05, "loss": 0.6969, "step": 7328 }, { "epoch": 1.2543214102344686, "grad_norm": 21.556840896606445, "learning_rate": 1.479261540536683e-05, "loss": 2.9136, "step": 7329 }, { "epoch": 1.2544925551942496, "grad_norm": 21.820880889892578, "learning_rate": 1.4781094835462506e-05, "loss": 3.0347, "step": 7330 }, { "epoch": 1.2546637001540304, "grad_norm": 12.111786842346191, "learning_rate": 1.4769574394712555e-05, "loss": 1.0611, 
"step": 7331 }, { "epoch": 1.2548348451138114, "grad_norm": 22.66676139831543, "learning_rate": 1.4758054089914054e-05, "loss": 2.1661, "step": 7332 }, { "epoch": 1.2550059900735924, "grad_norm": 15.881113052368164, "learning_rate": 1.4746533927864e-05, "loss": 1.4002, "step": 7333 }, { "epoch": 1.2551771350333731, "grad_norm": 19.4534854888916, "learning_rate": 1.4735013915359306e-05, "loss": 1.1345, "step": 7334 }, { "epoch": 1.2553482799931541, "grad_norm": 20.17005157470703, "learning_rate": 1.4723494059196796e-05, "loss": 2.3617, "step": 7335 }, { "epoch": 1.2555194249529351, "grad_norm": 28.31744956970215, "learning_rate": 1.4711974366173207e-05, "loss": 3.8263, "step": 7336 }, { "epoch": 1.255690569912716, "grad_norm": 7.080312252044678, "learning_rate": 1.4700454843085167e-05, "loss": 0.4506, "step": 7337 }, { "epoch": 1.255861714872497, "grad_norm": 16.499441146850586, "learning_rate": 1.4688935496729218e-05, "loss": 1.4713, "step": 7338 }, { "epoch": 1.2560328598322779, "grad_norm": 22.658084869384766, "learning_rate": 1.4677416333901789e-05, "loss": 1.8541, "step": 7339 }, { "epoch": 1.2562040047920588, "grad_norm": 7.0036702156066895, "learning_rate": 1.4665897361399205e-05, "loss": 0.6589, "step": 7340 }, { "epoch": 1.2563751497518398, "grad_norm": 21.883045196533203, "learning_rate": 1.4654378586017674e-05, "loss": 2.2609, "step": 7341 }, { "epoch": 1.2565462947116208, "grad_norm": 10.533642768859863, "learning_rate": 1.4642860014553292e-05, "loss": 0.8586, "step": 7342 }, { "epoch": 1.2567174396714016, "grad_norm": 13.321760177612305, "learning_rate": 1.4631341653802032e-05, "loss": 1.0397, "step": 7343 }, { "epoch": 1.2568885846311826, "grad_norm": 3.57551908493042, "learning_rate": 1.4619823510559747e-05, "loss": 0.4809, "step": 7344 }, { "epoch": 1.2570597295909636, "grad_norm": 22.513717651367188, "learning_rate": 1.4608305591622153e-05, "loss": 2.6994, "step": 7345 }, { "epoch": 1.2572308745507446, "grad_norm": 10.400582313537598, 
"learning_rate": 1.4596787903784848e-05, "loss": 0.9155, "step": 7346 }, { "epoch": 1.2574020195105255, "grad_norm": 27.71649742126465, "learning_rate": 1.4585270453843277e-05, "loss": 4.5367, "step": 7347 }, { "epoch": 1.2575731644703063, "grad_norm": 16.393686294555664, "learning_rate": 1.457375324859276e-05, "loss": 1.3998, "step": 7348 }, { "epoch": 1.2577443094300873, "grad_norm": 15.453287124633789, "learning_rate": 1.4562236294828458e-05, "loss": 1.0674, "step": 7349 }, { "epoch": 1.2579154543898683, "grad_norm": 8.631450653076172, "learning_rate": 1.4550719599345392e-05, "loss": 0.4541, "step": 7350 }, { "epoch": 1.258086599349649, "grad_norm": 21.972387313842773, "learning_rate": 1.4539203168938434e-05, "loss": 1.7009, "step": 7351 }, { "epoch": 1.25825774430943, "grad_norm": 14.740167617797852, "learning_rate": 1.4527687010402294e-05, "loss": 0.9929, "step": 7352 }, { "epoch": 1.258428889269211, "grad_norm": 5.436156749725342, "learning_rate": 1.451617113053152e-05, "loss": 0.3797, "step": 7353 }, { "epoch": 1.258600034228992, "grad_norm": 12.183341979980469, "learning_rate": 1.4504655536120502e-05, "loss": 1.0988, "step": 7354 }, { "epoch": 1.258771179188773, "grad_norm": 0.7028123736381531, "learning_rate": 1.4493140233963452e-05, "loss": 0.1615, "step": 7355 }, { "epoch": 1.2589423241485538, "grad_norm": 16.393653869628906, "learning_rate": 1.4481625230854426e-05, "loss": 1.7488, "step": 7356 }, { "epoch": 1.2591134691083348, "grad_norm": 16.673946380615234, "learning_rate": 1.447011053358728e-05, "loss": 1.0904, "step": 7357 }, { "epoch": 1.2592846140681158, "grad_norm": 10.811583518981934, "learning_rate": 1.4458596148955709e-05, "loss": 0.6634, "step": 7358 }, { "epoch": 1.2594557590278965, "grad_norm": 1.4404106140136719, "learning_rate": 1.4447082083753217e-05, "loss": 0.2295, "step": 7359 }, { "epoch": 1.2596269039876775, "grad_norm": 1.8505138158798218, "learning_rate": 1.4435568344773118e-05, "loss": 0.1746, "step": 7360 }, { "epoch": 
1.2597980489474585, "grad_norm": 12.746377944946289, "learning_rate": 1.4424054938808534e-05, "loss": 0.913, "step": 7361 }, { "epoch": 1.2599691939072395, "grad_norm": 7.94597864151001, "learning_rate": 1.4412541872652397e-05, "loss": 0.9712, "step": 7362 }, { "epoch": 1.2601403388670205, "grad_norm": 16.183761596679688, "learning_rate": 1.440102915309743e-05, "loss": 1.7207, "step": 7363 }, { "epoch": 1.2603114838268012, "grad_norm": 15.641918182373047, "learning_rate": 1.4389516786936155e-05, "loss": 1.6151, "step": 7364 }, { "epoch": 1.2604826287865822, "grad_norm": 0.5059029459953308, "learning_rate": 1.4378004780960885e-05, "loss": 0.137, "step": 7365 }, { "epoch": 1.2606537737463632, "grad_norm": 18.027734756469727, "learning_rate": 1.4366493141963718e-05, "loss": 1.6826, "step": 7366 }, { "epoch": 1.260824918706144, "grad_norm": 9.822291374206543, "learning_rate": 1.4354981876736543e-05, "loss": 0.6362, "step": 7367 }, { "epoch": 1.260996063665925, "grad_norm": 17.900630950927734, "learning_rate": 1.4343470992071023e-05, "loss": 1.6278, "step": 7368 }, { "epoch": 1.261167208625706, "grad_norm": 17.575546264648438, "learning_rate": 1.4331960494758594e-05, "loss": 0.9771, "step": 7369 }, { "epoch": 1.261338353585487, "grad_norm": 17.916412353515625, "learning_rate": 1.4320450391590468e-05, "loss": 2.165, "step": 7370 }, { "epoch": 1.261509498545268, "grad_norm": 9.473165512084961, "learning_rate": 1.430894068935762e-05, "loss": 1.2812, "step": 7371 }, { "epoch": 1.2616806435050487, "grad_norm": 21.768396377563477, "learning_rate": 1.42974313948508e-05, "loss": 2.4049, "step": 7372 }, { "epoch": 1.2618517884648297, "grad_norm": 5.626485824584961, "learning_rate": 1.4285922514860502e-05, "loss": 0.5733, "step": 7373 }, { "epoch": 1.2620229334246107, "grad_norm": 15.431833267211914, "learning_rate": 1.4274414056176982e-05, "loss": 1.4185, "step": 7374 }, { "epoch": 1.2621940783843915, "grad_norm": 20.183462142944336, "learning_rate": 1.4262906025590251e-05, 
"loss": 1.9588, "step": 7375 }, { "epoch": 1.2623652233441724, "grad_norm": 0.3969564437866211, "learning_rate": 1.4251398429890066e-05, "loss": 0.1221, "step": 7376 }, { "epoch": 1.2625363683039534, "grad_norm": 15.512309074401855, "learning_rate": 1.4239891275865913e-05, "loss": 1.231, "step": 7377 }, { "epoch": 1.2627075132637344, "grad_norm": 28.235328674316406, "learning_rate": 1.4228384570307047e-05, "loss": 5.5077, "step": 7378 }, { "epoch": 1.2628786582235154, "grad_norm": 14.990321159362793, "learning_rate": 1.4216878320002431e-05, "loss": 1.4904, "step": 7379 }, { "epoch": 1.2630498031832962, "grad_norm": 27.102006912231445, "learning_rate": 1.4205372531740779e-05, "loss": 0.9632, "step": 7380 }, { "epoch": 1.2632209481430772, "grad_norm": 14.286447525024414, "learning_rate": 1.4193867212310512e-05, "loss": 0.8609, "step": 7381 }, { "epoch": 1.2633920931028582, "grad_norm": 27.299739837646484, "learning_rate": 1.418236236849979e-05, "loss": 5.5275, "step": 7382 }, { "epoch": 1.263563238062639, "grad_norm": 20.601612091064453, "learning_rate": 1.417085800709649e-05, "loss": 2.0682, "step": 7383 }, { "epoch": 1.26373438302242, "grad_norm": 4.8306169509887695, "learning_rate": 1.4159354134888199e-05, "loss": 0.4245, "step": 7384 }, { "epoch": 1.263905527982201, "grad_norm": 14.180475234985352, "learning_rate": 1.414785075866222e-05, "loss": 1.1286, "step": 7385 }, { "epoch": 1.264076672941982, "grad_norm": 0.5731605887413025, "learning_rate": 1.4136347885205565e-05, "loss": 0.1332, "step": 7386 }, { "epoch": 1.2642478179017629, "grad_norm": 8.598193168640137, "learning_rate": 1.4124845521304939e-05, "loss": 0.6556, "step": 7387 }, { "epoch": 1.2644189628615436, "grad_norm": 0.4574246406555176, "learning_rate": 1.4113343673746762e-05, "loss": 0.1275, "step": 7388 }, { "epoch": 1.2645901078213246, "grad_norm": 14.310580253601074, "learning_rate": 1.410184234931714e-05, "loss": 1.2369, "step": 7389 }, { "epoch": 1.2647612527811056, "grad_norm": 
28.03304100036621, "learning_rate": 1.409034155480187e-05, "loss": 5.8028, "step": 7390 }, { "epoch": 1.2649323977408864, "grad_norm": 14.480050086975098, "learning_rate": 1.4078841296986435e-05, "loss": 1.0287, "step": 7391 }, { "epoch": 1.2651035427006674, "grad_norm": 21.460359573364258, "learning_rate": 1.4067341582656011e-05, "loss": 1.7921, "step": 7392 }, { "epoch": 1.2652746876604484, "grad_norm": 1.9403401613235474, "learning_rate": 1.4055842418595432e-05, "loss": 0.2589, "step": 7393 }, { "epoch": 1.2654458326202294, "grad_norm": 7.202798843383789, "learning_rate": 1.4044343811589241e-05, "loss": 0.4405, "step": 7394 }, { "epoch": 1.2656169775800103, "grad_norm": 62.627830505371094, "learning_rate": 1.4032845768421624e-05, "loss": 7.4275, "step": 7395 }, { "epoch": 1.2657881225397913, "grad_norm": 14.698546409606934, "learning_rate": 1.4021348295876447e-05, "loss": 1.4408, "step": 7396 }, { "epoch": 1.265959267499572, "grad_norm": 22.262781143188477, "learning_rate": 1.4009851400737227e-05, "loss": 2.1098, "step": 7397 }, { "epoch": 1.266130412459353, "grad_norm": 15.667557716369629, "learning_rate": 1.399835508978716e-05, "loss": 1.3713, "step": 7398 }, { "epoch": 1.266301557419134, "grad_norm": 8.630463600158691, "learning_rate": 1.398685936980908e-05, "loss": 0.7282, "step": 7399 }, { "epoch": 1.2664727023789148, "grad_norm": 15.559676170349121, "learning_rate": 1.397536424758548e-05, "loss": 1.74, "step": 7400 }, { "epoch": 1.2666438473386958, "grad_norm": 12.456164360046387, "learning_rate": 1.3963869729898501e-05, "loss": 1.2426, "step": 7401 }, { "epoch": 1.2668149922984768, "grad_norm": 18.912338256835938, "learning_rate": 1.3952375823529925e-05, "loss": 2.2147, "step": 7402 }, { "epoch": 1.2669861372582578, "grad_norm": 19.029582977294922, "learning_rate": 1.3940882535261173e-05, "loss": 1.9542, "step": 7403 }, { "epoch": 1.2671572822180388, "grad_norm": 15.901615142822266, "learning_rate": 1.3929389871873299e-05, "loss": 1.1382, "step": 7404 }, 
{ "epoch": 1.2673284271778196, "grad_norm": 16.025964736938477, "learning_rate": 1.3917897840147e-05, "loss": 1.3947, "step": 7405 }, { "epoch": 1.2674995721376006, "grad_norm": 3.6541223526000977, "learning_rate": 1.3906406446862585e-05, "loss": 0.2928, "step": 7406 }, { "epoch": 1.2676707170973815, "grad_norm": 14.086526870727539, "learning_rate": 1.3894915698799997e-05, "loss": 1.2049, "step": 7407 }, { "epoch": 1.2678418620571623, "grad_norm": 19.24121856689453, "learning_rate": 1.3883425602738794e-05, "loss": 2.2081, "step": 7408 }, { "epoch": 1.2680130070169433, "grad_norm": 86.15853118896484, "learning_rate": 1.3871936165458139e-05, "loss": 7.7859, "step": 7409 }, { "epoch": 1.2681841519767243, "grad_norm": 3.7285234928131104, "learning_rate": 1.3860447393736834e-05, "loss": 0.323, "step": 7410 }, { "epoch": 1.2683552969365053, "grad_norm": 14.129486083984375, "learning_rate": 1.3848959294353263e-05, "loss": 1.1798, "step": 7411 }, { "epoch": 1.2685264418962863, "grad_norm": 25.551198959350586, "learning_rate": 1.3837471874085428e-05, "loss": 1.2741, "step": 7412 }, { "epoch": 1.268697586856067, "grad_norm": 19.952129364013672, "learning_rate": 1.3825985139710915e-05, "loss": 2.2252, "step": 7413 }, { "epoch": 1.268868731815848, "grad_norm": 3.0930943489074707, "learning_rate": 1.3814499098006911e-05, "loss": 0.2683, "step": 7414 }, { "epoch": 1.269039876775629, "grad_norm": 58.00749969482422, "learning_rate": 1.3803013755750211e-05, "loss": 6.766, "step": 7415 }, { "epoch": 1.2692110217354098, "grad_norm": 14.905405044555664, "learning_rate": 1.3791529119717183e-05, "loss": 1.2094, "step": 7416 }, { "epoch": 1.2693821666951908, "grad_norm": 14.11672592163086, "learning_rate": 1.3780045196683771e-05, "loss": 1.0501, "step": 7417 }, { "epoch": 1.2695533116549718, "grad_norm": 17.17655372619629, "learning_rate": 1.3768561993425508e-05, "loss": 1.3703, "step": 7418 }, { "epoch": 1.2697244566147528, "grad_norm": 23.263961791992188, "learning_rate": 
1.3757079516717503e-05, "loss": 4.9318, "step": 7419 }, { "epoch": 1.2698956015745337, "grad_norm": 10.799759864807129, "learning_rate": 1.374559777333443e-05, "loss": 1.4142, "step": 7420 }, { "epoch": 1.2700667465343145, "grad_norm": 12.609829902648926, "learning_rate": 1.3734116770050548e-05, "loss": 1.0778, "step": 7421 }, { "epoch": 1.2702378914940955, "grad_norm": 5.679696083068848, "learning_rate": 1.3722636513639654e-05, "loss": 0.4246, "step": 7422 }, { "epoch": 1.2704090364538765, "grad_norm": 15.620946884155273, "learning_rate": 1.371115701087512e-05, "loss": 1.2893, "step": 7423 }, { "epoch": 1.2705801814136573, "grad_norm": 15.387858390808105, "learning_rate": 1.3699678268529866e-05, "loss": 1.3466, "step": 7424 }, { "epoch": 1.2707513263734382, "grad_norm": 9.487750053405762, "learning_rate": 1.3688200293376372e-05, "loss": 0.9842, "step": 7425 }, { "epoch": 1.2709224713332192, "grad_norm": 0.6645063757896423, "learning_rate": 1.367672309218666e-05, "loss": 0.1334, "step": 7426 }, { "epoch": 1.2710936162930002, "grad_norm": 0.7540750503540039, "learning_rate": 1.3665246671732296e-05, "loss": 0.1474, "step": 7427 }, { "epoch": 1.2712647612527812, "grad_norm": 8.321049690246582, "learning_rate": 1.3653771038784385e-05, "loss": 0.6695, "step": 7428 }, { "epoch": 1.271435906212562, "grad_norm": 15.873809814453125, "learning_rate": 1.3642296200113564e-05, "loss": 1.4287, "step": 7429 }, { "epoch": 1.271607051172343, "grad_norm": 11.329692840576172, "learning_rate": 1.3630822162489998e-05, "loss": 1.0208, "step": 7430 }, { "epoch": 1.271778196132124, "grad_norm": 12.636488914489746, "learning_rate": 1.3619348932683404e-05, "loss": 1.0985, "step": 7431 }, { "epoch": 1.2719493410919047, "grad_norm": 23.067245483398438, "learning_rate": 1.3607876517463e-05, "loss": 2.325, "step": 7432 }, { "epoch": 1.2721204860516857, "grad_norm": 0.6290095448493958, "learning_rate": 1.3596404923597521e-05, "loss": 0.144, "step": 7433 }, { "epoch": 1.2722916310114667, 
"grad_norm": 21.06360626220703, "learning_rate": 1.3584934157855227e-05, "loss": 1.6601, "step": 7434 }, { "epoch": 1.2724627759712477, "grad_norm": 14.543760299682617, "learning_rate": 1.3573464227003888e-05, "loss": 1.2144, "step": 7435 }, { "epoch": 1.2726339209310287, "grad_norm": 0.4954783320426941, "learning_rate": 1.3561995137810771e-05, "loss": 0.1383, "step": 7436 }, { "epoch": 1.2728050658908094, "grad_norm": 8.514019012451172, "learning_rate": 1.3550526897042677e-05, "loss": 0.501, "step": 7437 }, { "epoch": 1.2729762108505904, "grad_norm": 0.661493718624115, "learning_rate": 1.3539059511465868e-05, "loss": 0.1446, "step": 7438 }, { "epoch": 1.2731473558103714, "grad_norm": 20.316226959228516, "learning_rate": 1.3527592987846124e-05, "loss": 1.8445, "step": 7439 }, { "epoch": 1.2733185007701522, "grad_norm": 0.44578152894973755, "learning_rate": 1.3516127332948709e-05, "loss": 0.1261, "step": 7440 }, { "epoch": 1.2734896457299332, "grad_norm": 11.764918327331543, "learning_rate": 1.350466255353838e-05, "loss": 0.8638, "step": 7441 }, { "epoch": 1.2736607906897142, "grad_norm": 14.545720100402832, "learning_rate": 1.349319865637937e-05, "loss": 1.0689, "step": 7442 }, { "epoch": 1.2738319356494952, "grad_norm": 92.29745483398438, "learning_rate": 1.3481735648235398e-05, "loss": 6.7581, "step": 7443 }, { "epoch": 1.2740030806092761, "grad_norm": 17.633432388305664, "learning_rate": 1.3470273535869658e-05, "loss": 1.9326, "step": 7444 }, { "epoch": 1.2741742255690571, "grad_norm": 16.83913803100586, "learning_rate": 1.3458812326044806e-05, "loss": 1.3725, "step": 7445 }, { "epoch": 1.274345370528838, "grad_norm": 3.9324121475219727, "learning_rate": 1.3447352025522971e-05, "loss": 0.6321, "step": 7446 }, { "epoch": 1.2745165154886189, "grad_norm": 0.8707903623580933, "learning_rate": 1.343589264106576e-05, "loss": 0.1474, "step": 7447 }, { "epoch": 1.2746876604483999, "grad_norm": 0.502506673336029, "learning_rate": 1.3424434179434224e-05, "loss": 0.1367, 
"step": 7448 }, { "epoch": 1.2748588054081806, "grad_norm": 18.582305908203125, "learning_rate": 1.3412976647388867e-05, "loss": 1.4433, "step": 7449 }, { "epoch": 1.2750299503679616, "grad_norm": 20.04930877685547, "learning_rate": 1.340152005168965e-05, "loss": 1.4891, "step": 7450 }, { "epoch": 1.2752010953277426, "grad_norm": 9.783509254455566, "learning_rate": 1.3390064399095984e-05, "loss": 0.8113, "step": 7451 }, { "epoch": 1.2753722402875236, "grad_norm": 20.472023010253906, "learning_rate": 1.3378609696366715e-05, "loss": 2.769, "step": 7452 }, { "epoch": 1.2755433852473046, "grad_norm": 8.195066452026367, "learning_rate": 1.3367155950260148e-05, "loss": 0.7378, "step": 7453 }, { "epoch": 1.2757145302070854, "grad_norm": 13.91455078125, "learning_rate": 1.3355703167534e-05, "loss": 0.9211, "step": 7454 }, { "epoch": 1.2758856751668664, "grad_norm": 0.5569220185279846, "learning_rate": 1.3344251354945433e-05, "loss": 0.1286, "step": 7455 }, { "epoch": 1.2760568201266473, "grad_norm": 19.011075973510742, "learning_rate": 1.3332800519251031e-05, "loss": 2.0409, "step": 7456 }, { "epoch": 1.276227965086428, "grad_norm": 21.4835262298584, "learning_rate": 1.3321350667206805e-05, "loss": 1.8533, "step": 7457 }, { "epoch": 1.276399110046209, "grad_norm": 13.103194236755371, "learning_rate": 1.3309901805568186e-05, "loss": 0.901, "step": 7458 }, { "epoch": 1.27657025500599, "grad_norm": 40.336116790771484, "learning_rate": 1.329845394109002e-05, "loss": 6.137, "step": 7459 }, { "epoch": 1.276741399965771, "grad_norm": 11.189400672912598, "learning_rate": 1.3287007080526564e-05, "loss": 0.84, "step": 7460 }, { "epoch": 1.276912544925552, "grad_norm": 2.909088611602783, "learning_rate": 1.3275561230631481e-05, "loss": 0.2344, "step": 7461 }, { "epoch": 1.2770836898853328, "grad_norm": 13.368642807006836, "learning_rate": 1.3264116398157843e-05, "loss": 1.0364, "step": 7462 }, { "epoch": 1.2772548348451138, "grad_norm": 21.844898223876953, "learning_rate": 
1.325267258985811e-05, "loss": 1.8354, "step": 7463 }, { "epoch": 1.2774259798048948, "grad_norm": 16.96022605895996, "learning_rate": 1.3241229812484153e-05, "loss": 1.4636, "step": 7464 }, { "epoch": 1.2775971247646756, "grad_norm": 3.6713128089904785, "learning_rate": 1.322978807278723e-05, "loss": 0.3187, "step": 7465 }, { "epoch": 1.2777682697244566, "grad_norm": 23.283611297607422, "learning_rate": 1.3218347377517979e-05, "loss": 1.2698, "step": 7466 }, { "epoch": 1.2779394146842376, "grad_norm": 4.398828506469727, "learning_rate": 1.320690773342643e-05, "loss": 0.2959, "step": 7467 }, { "epoch": 1.2781105596440185, "grad_norm": 21.414701461791992, "learning_rate": 1.3195469147261987e-05, "loss": 5.061, "step": 7468 }, { "epoch": 1.2782817046037995, "grad_norm": 11.26382064819336, "learning_rate": 1.318403162577344e-05, "loss": 0.8673, "step": 7469 }, { "epoch": 1.2784528495635803, "grad_norm": 7.07971715927124, "learning_rate": 1.3172595175708934e-05, "loss": 0.3555, "step": 7470 }, { "epoch": 1.2786239945233613, "grad_norm": 17.331661224365234, "learning_rate": 1.3161159803815996e-05, "loss": 1.2332, "step": 7471 }, { "epoch": 1.2787951394831423, "grad_norm": 3.61580753326416, "learning_rate": 1.3149725516841514e-05, "loss": 0.2735, "step": 7472 }, { "epoch": 1.278966284442923, "grad_norm": 17.832317352294922, "learning_rate": 1.313829232153173e-05, "loss": 1.1603, "step": 7473 }, { "epoch": 1.279137429402704, "grad_norm": 14.138060569763184, "learning_rate": 1.3126860224632252e-05, "loss": 1.0651, "step": 7474 }, { "epoch": 1.279308574362485, "grad_norm": 3.1998164653778076, "learning_rate": 1.311542923288803e-05, "loss": 0.2512, "step": 7475 }, { "epoch": 1.279479719322266, "grad_norm": 21.73483657836914, "learning_rate": 1.3103999353043369e-05, "loss": 1.6646, "step": 7476 }, { "epoch": 1.279650864282047, "grad_norm": 0.5186206102371216, "learning_rate": 1.3092570591841912e-05, "loss": 0.1297, "step": 7477 }, { "epoch": 1.2798220092418278, "grad_norm": 
5.291880130767822, "learning_rate": 1.308114295602665e-05, "loss": 0.4219, "step": 7478 }, { "epoch": 1.2799931542016088, "grad_norm": 4.802558898925781, "learning_rate": 1.3069716452339897e-05, "loss": 0.3575, "step": 7479 }, { "epoch": 1.2801642991613897, "grad_norm": 20.09796142578125, "learning_rate": 1.3058291087523315e-05, "loss": 2.2933, "step": 7480 }, { "epoch": 1.2803354441211705, "grad_norm": 17.5977840423584, "learning_rate": 1.3046866868317883e-05, "loss": 1.5162, "step": 7481 }, { "epoch": 1.2805065890809515, "grad_norm": 19.869531631469727, "learning_rate": 1.303544380146391e-05, "loss": 2.4812, "step": 7482 }, { "epoch": 1.2806777340407325, "grad_norm": 13.439567565917969, "learning_rate": 1.3024021893701019e-05, "loss": 1.1286, "step": 7483 }, { "epoch": 1.2808488790005135, "grad_norm": 0.5329412817955017, "learning_rate": 1.3012601151768157e-05, "loss": 0.1252, "step": 7484 }, { "epoch": 1.2810200239602945, "grad_norm": 27.384693145751953, "learning_rate": 1.3001181582403573e-05, "loss": 5.4171, "step": 7485 }, { "epoch": 1.2811911689200752, "grad_norm": 83.22950744628906, "learning_rate": 1.298976319234483e-05, "loss": 7.6711, "step": 7486 }, { "epoch": 1.2813623138798562, "grad_norm": 10.799430847167969, "learning_rate": 1.29783459883288e-05, "loss": 0.821, "step": 7487 }, { "epoch": 1.2815334588396372, "grad_norm": 4.837438583374023, "learning_rate": 1.296692997709165e-05, "loss": 0.3963, "step": 7488 }, { "epoch": 1.281704603799418, "grad_norm": 23.152280807495117, "learning_rate": 1.2955515165368835e-05, "loss": 2.2947, "step": 7489 }, { "epoch": 1.281875748759199, "grad_norm": 4.582759380340576, "learning_rate": 1.294410155989512e-05, "loss": 0.3968, "step": 7490 }, { "epoch": 1.28204689371898, "grad_norm": 8.332213401794434, "learning_rate": 1.293268916740455e-05, "loss": 0.5057, "step": 7491 }, { "epoch": 1.282218038678761, "grad_norm": 18.660024642944336, "learning_rate": 1.292127799463045e-05, "loss": 1.5929, "step": 7492 }, { "epoch": 
1.282389183638542, "grad_norm": 14.767051696777344, "learning_rate": 1.2909868048305429e-05, "loss": 1.321, "step": 7493 }, { "epoch": 1.2825603285983227, "grad_norm": 0.5502254962921143, "learning_rate": 1.2898459335161372e-05, "loss": 0.1237, "step": 7494 }, { "epoch": 1.2827314735581037, "grad_norm": 18.758405685424805, "learning_rate": 1.2887051861929434e-05, "loss": 1.7648, "step": 7495 }, { "epoch": 1.2829026185178847, "grad_norm": 0.6086778044700623, "learning_rate": 1.287564563534005e-05, "loss": 0.1372, "step": 7496 }, { "epoch": 1.2830737634776657, "grad_norm": 21.15096664428711, "learning_rate": 1.2864240662122903e-05, "loss": 1.7089, "step": 7497 }, { "epoch": 1.2832449084374464, "grad_norm": 16.411664962768555, "learning_rate": 1.2852836949006946e-05, "loss": 1.412, "step": 7498 }, { "epoch": 1.2834160533972274, "grad_norm": 4.890477180480957, "learning_rate": 1.2841434502720388e-05, "loss": 0.3891, "step": 7499 }, { "epoch": 1.2835871983570084, "grad_norm": 17.419998168945312, "learning_rate": 1.2830033329990685e-05, "loss": 1.6243, "step": 7500 }, { "epoch": 1.2837583433167894, "grad_norm": 15.588261604309082, "learning_rate": 1.2818633437544555e-05, "loss": 1.0956, "step": 7501 }, { "epoch": 1.2839294882765704, "grad_norm": 15.560494422912598, "learning_rate": 1.2807234832107943e-05, "loss": 1.5608, "step": 7502 }, { "epoch": 1.2841006332363512, "grad_norm": 25.251907348632812, "learning_rate": 1.2795837520406048e-05, "loss": 5.2597, "step": 7503 }, { "epoch": 1.2842717781961321, "grad_norm": 12.475388526916504, "learning_rate": 1.2784441509163297e-05, "loss": 1.0031, "step": 7504 }, { "epoch": 1.2844429231559131, "grad_norm": 12.28564167022705, "learning_rate": 1.2773046805103353e-05, "loss": 0.8902, "step": 7505 }, { "epoch": 1.284614068115694, "grad_norm": 18.319833755493164, "learning_rate": 1.2761653414949115e-05, "loss": 1.701, "step": 7506 }, { "epoch": 1.2847852130754749, "grad_norm": 18.66644859313965, "learning_rate": 
1.2750261345422696e-05, "loss": 2.0435, "step": 7507 }, { "epoch": 1.2849563580352559, "grad_norm": 2.7434170246124268, "learning_rate": 1.2738870603245434e-05, "loss": 0.2375, "step": 7508 }, { "epoch": 1.2851275029950369, "grad_norm": 16.576847076416016, "learning_rate": 1.2727481195137886e-05, "loss": 1.5716, "step": 7509 }, { "epoch": 1.2852986479548179, "grad_norm": 16.501052856445312, "learning_rate": 1.2716093127819816e-05, "loss": 1.0884, "step": 7510 }, { "epoch": 1.2854697929145986, "grad_norm": 12.034847259521484, "learning_rate": 1.2704706408010203e-05, "loss": 0.9672, "step": 7511 }, { "epoch": 1.2856409378743796, "grad_norm": 5.079814910888672, "learning_rate": 1.2693321042427225e-05, "loss": 0.3154, "step": 7512 }, { "epoch": 1.2858120828341606, "grad_norm": 0.42951419949531555, "learning_rate": 1.2681937037788272e-05, "loss": 0.1267, "step": 7513 }, { "epoch": 1.2859832277939414, "grad_norm": 24.724746704101562, "learning_rate": 1.2670554400809915e-05, "loss": 2.0771, "step": 7514 }, { "epoch": 1.2861543727537224, "grad_norm": 15.345876693725586, "learning_rate": 1.2659173138207933e-05, "loss": 1.3652, "step": 7515 }, { "epoch": 1.2863255177135033, "grad_norm": 17.33806037902832, "learning_rate": 1.2647793256697284e-05, "loss": 1.59, "step": 7516 }, { "epoch": 1.2864966626732843, "grad_norm": 17.414993286132812, "learning_rate": 1.2636414762992118e-05, "loss": 1.5101, "step": 7517 }, { "epoch": 1.2866678076330653, "grad_norm": 20.75063705444336, "learning_rate": 1.2625037663805763e-05, "loss": 1.6154, "step": 7518 }, { "epoch": 1.286838952592846, "grad_norm": 14.97973918914795, "learning_rate": 1.2613661965850725e-05, "loss": 1.1913, "step": 7519 }, { "epoch": 1.287010097552627, "grad_norm": 3.322770118713379, "learning_rate": 1.2602287675838682e-05, "loss": 0.2227, "step": 7520 }, { "epoch": 1.287181242512408, "grad_norm": 15.625368118286133, "learning_rate": 1.2590914800480482e-05, "loss": 1.1869, "step": 7521 }, { "epoch": 1.2873523874721888, 
"grad_norm": 15.407942771911621, "learning_rate": 1.2579543346486132e-05, "loss": 1.7004, "step": 7522 }, { "epoch": 1.2875235324319698, "grad_norm": 29.231836318969727, "learning_rate": 1.2568173320564815e-05, "loss": 1.3564, "step": 7523 }, { "epoch": 1.2876946773917508, "grad_norm": 19.370813369750977, "learning_rate": 1.2556804729424863e-05, "loss": 1.5538, "step": 7524 }, { "epoch": 1.2878658223515318, "grad_norm": 16.274520874023438, "learning_rate": 1.2545437579773762e-05, "loss": 1.3551, "step": 7525 }, { "epoch": 1.2880369673113128, "grad_norm": 12.730664253234863, "learning_rate": 1.2534071878318143e-05, "loss": 1.1659, "step": 7526 }, { "epoch": 1.2882081122710936, "grad_norm": 0.45628058910369873, "learning_rate": 1.252270763176379e-05, "loss": 0.1251, "step": 7527 }, { "epoch": 1.2883792572308745, "grad_norm": 26.614437103271484, "learning_rate": 1.2511344846815621e-05, "loss": 5.658, "step": 7528 }, { "epoch": 1.2885504021906555, "grad_norm": 14.866990089416504, "learning_rate": 1.24999835301777e-05, "loss": 1.4717, "step": 7529 }, { "epoch": 1.2887215471504363, "grad_norm": 67.76228332519531, "learning_rate": 1.248862368855322e-05, "loss": 7.1911, "step": 7530 }, { "epoch": 1.2888926921102173, "grad_norm": 15.273962020874023, "learning_rate": 1.2477265328644505e-05, "loss": 1.2144, "step": 7531 }, { "epoch": 1.2890638370699983, "grad_norm": 11.719128608703613, "learning_rate": 1.2465908457153e-05, "loss": 1.0968, "step": 7532 }, { "epoch": 1.2892349820297793, "grad_norm": 25.215810775756836, "learning_rate": 1.2454553080779283e-05, "loss": 1.9797, "step": 7533 }, { "epoch": 1.2894061269895603, "grad_norm": 20.87401580810547, "learning_rate": 1.244319920622304e-05, "loss": 1.9945, "step": 7534 }, { "epoch": 1.289577271949341, "grad_norm": 19.922645568847656, "learning_rate": 1.2431846840183074e-05, "loss": 1.7993, "step": 7535 }, { "epoch": 1.289748416909122, "grad_norm": 3.47524356842041, "learning_rate": 1.2420495989357297e-05, "loss": 0.2826, 
"step": 7536 }, { "epoch": 1.289919561868903, "grad_norm": 21.989328384399414, "learning_rate": 1.240914666044273e-05, "loss": 2.259, "step": 7537 }, { "epoch": 1.2900907068286838, "grad_norm": 14.711652755737305, "learning_rate": 1.2397798860135483e-05, "loss": 1.169, "step": 7538 }, { "epoch": 1.2902618517884648, "grad_norm": 16.291339874267578, "learning_rate": 1.2386452595130793e-05, "loss": 1.3224, "step": 7539 }, { "epoch": 1.2904329967482457, "grad_norm": 18.962268829345703, "learning_rate": 1.2375107872122963e-05, "loss": 1.5555, "step": 7540 }, { "epoch": 1.2906041417080267, "grad_norm": 13.209924697875977, "learning_rate": 1.2363764697805402e-05, "loss": 1.2939, "step": 7541 }, { "epoch": 1.2907752866678077, "grad_norm": 1.855237603187561, "learning_rate": 1.2352423078870592e-05, "loss": 0.2471, "step": 7542 }, { "epoch": 1.2909464316275885, "grad_norm": 0.9598819613456726, "learning_rate": 1.2341083022010103e-05, "loss": 0.1429, "step": 7543 }, { "epoch": 1.2911175765873695, "grad_norm": 13.005754470825195, "learning_rate": 1.23297445339146e-05, "loss": 0.9044, "step": 7544 }, { "epoch": 1.2912887215471505, "grad_norm": 26.620267868041992, "learning_rate": 1.2318407621273798e-05, "loss": 5.5246, "step": 7545 }, { "epoch": 1.2914598665069315, "grad_norm": 7.993296146392822, "learning_rate": 1.2307072290776492e-05, "loss": 0.6265, "step": 7546 }, { "epoch": 1.2916310114667122, "grad_norm": 28.138734817504883, "learning_rate": 1.2295738549110547e-05, "loss": 5.4191, "step": 7547 }, { "epoch": 1.2918021564264932, "grad_norm": 30.03383445739746, "learning_rate": 1.2284406402962877e-05, "loss": 5.7041, "step": 7548 }, { "epoch": 1.2919733013862742, "grad_norm": 8.908590316772461, "learning_rate": 1.2273075859019486e-05, "loss": 1.1269, "step": 7549 }, { "epoch": 1.2921444463460552, "grad_norm": 13.159024238586426, "learning_rate": 1.2261746923965395e-05, "loss": 1.0702, "step": 7550 }, { "epoch": 1.2923155913058362, "grad_norm": 22.145456314086914, 
"learning_rate": 1.2250419604484698e-05, "loss": 1.5557, "step": 7551 }, { "epoch": 1.292486736265617, "grad_norm": 17.461442947387695, "learning_rate": 1.223909390726053e-05, "loss": 1.6463, "step": 7552 }, { "epoch": 1.292657881225398, "grad_norm": 13.080147743225098, "learning_rate": 1.2227769838975069e-05, "loss": 1.0846, "step": 7553 }, { "epoch": 1.292829026185179, "grad_norm": 16.844499588012695, "learning_rate": 1.221644740630953e-05, "loss": 1.587, "step": 7554 }, { "epoch": 1.2930001711449597, "grad_norm": 1.6371265649795532, "learning_rate": 1.220512661594417e-05, "loss": 0.2438, "step": 7555 }, { "epoch": 1.2931713161047407, "grad_norm": 61.34412384033203, "learning_rate": 1.2193807474558268e-05, "loss": 6.45, "step": 7556 }, { "epoch": 1.2933424610645217, "grad_norm": 9.564046859741211, "learning_rate": 1.2182489988830141e-05, "loss": 1.0304, "step": 7557 }, { "epoch": 1.2935136060243027, "grad_norm": 8.173623085021973, "learning_rate": 1.2171174165437112e-05, "loss": 0.6847, "step": 7558 }, { "epoch": 1.2936847509840836, "grad_norm": 13.157476425170898, "learning_rate": 1.2159860011055534e-05, "loss": 1.353, "step": 7559 }, { "epoch": 1.2938558959438644, "grad_norm": 17.588411331176758, "learning_rate": 1.2148547532360786e-05, "loss": 1.2431, "step": 7560 }, { "epoch": 1.2940270409036454, "grad_norm": 10.51596450805664, "learning_rate": 1.2137236736027241e-05, "loss": 0.7703, "step": 7561 }, { "epoch": 1.2941981858634264, "grad_norm": 5.292048454284668, "learning_rate": 1.2125927628728285e-05, "loss": 0.3808, "step": 7562 }, { "epoch": 1.2943693308232072, "grad_norm": 17.15229034423828, "learning_rate": 1.2114620217136309e-05, "loss": 1.3197, "step": 7563 }, { "epoch": 1.2945404757829881, "grad_norm": 12.941659927368164, "learning_rate": 1.2103314507922697e-05, "loss": 1.134, "step": 7564 }, { "epoch": 1.2947116207427691, "grad_norm": 10.494157791137695, "learning_rate": 1.2092010507757849e-05, "loss": 0.9958, "step": 7565 }, { "epoch": 
1.2948827657025501, "grad_norm": 22.58926773071289, "learning_rate": 1.2080708223311127e-05, "loss": 5.5401, "step": 7566 }, { "epoch": 1.295053910662331, "grad_norm": 20.25670051574707, "learning_rate": 1.2069407661250903e-05, "loss": 1.9963, "step": 7567 }, { "epoch": 1.2952250556221119, "grad_norm": 14.322406768798828, "learning_rate": 1.2058108828244524e-05, "loss": 1.3138, "step": 7568 }, { "epoch": 1.2953962005818929, "grad_norm": 10.519211769104004, "learning_rate": 1.204681173095832e-05, "loss": 1.0285, "step": 7569 }, { "epoch": 1.2955673455416739, "grad_norm": 15.762115478515625, "learning_rate": 1.2035516376057591e-05, "loss": 1.5637, "step": 7570 }, { "epoch": 1.2957384905014546, "grad_norm": 0.9862691164016724, "learning_rate": 1.2024222770206614e-05, "loss": 0.1459, "step": 7571 }, { "epoch": 1.2959096354612356, "grad_norm": 7.247229099273682, "learning_rate": 1.2012930920068638e-05, "loss": 0.4876, "step": 7572 }, { "epoch": 1.2960807804210166, "grad_norm": 19.82130241394043, "learning_rate": 1.2001640832305872e-05, "loss": 2.2351, "step": 7573 }, { "epoch": 1.2962519253807976, "grad_norm": 20.365571975708008, "learning_rate": 1.1990352513579476e-05, "loss": 1.6286, "step": 7574 }, { "epoch": 1.2964230703405786, "grad_norm": 14.565311431884766, "learning_rate": 1.1979065970549573e-05, "loss": 1.146, "step": 7575 }, { "epoch": 1.2965942153003593, "grad_norm": 10.465729713439941, "learning_rate": 1.1967781209875254e-05, "loss": 0.8435, "step": 7576 }, { "epoch": 1.2967653602601403, "grad_norm": 151.20823669433594, "learning_rate": 1.195649823821454e-05, "loss": 8.9416, "step": 7577 }, { "epoch": 1.2969365052199213, "grad_norm": 5.1723103523254395, "learning_rate": 1.1945217062224398e-05, "loss": 0.5016, "step": 7578 }, { "epoch": 1.297107650179702, "grad_norm": 20.893043518066406, "learning_rate": 1.1933937688560737e-05, "loss": 1.8287, "step": 7579 }, { "epoch": 1.297278795139483, "grad_norm": 9.546403884887695, "learning_rate": 
1.1922660123878407e-05, "loss": 0.8622, "step": 7580 }, { "epoch": 1.297449940099264, "grad_norm": 3.183964490890503, "learning_rate": 1.1911384374831184e-05, "loss": 0.3136, "step": 7581 }, { "epoch": 1.297621085059045, "grad_norm": 8.555353164672852, "learning_rate": 1.1900110448071781e-05, "loss": 0.78, "step": 7582 }, { "epoch": 1.297792230018826, "grad_norm": 15.852571487426758, "learning_rate": 1.1888838350251835e-05, "loss": 1.3163, "step": 7583 }, { "epoch": 1.2979633749786068, "grad_norm": 30.275362014770508, "learning_rate": 1.1877568088021896e-05, "loss": 5.5375, "step": 7584 }, { "epoch": 1.2981345199383878, "grad_norm": 19.9348201751709, "learning_rate": 1.1866299668031434e-05, "loss": 1.3742, "step": 7585 }, { "epoch": 1.2983056648981688, "grad_norm": 4.104188919067383, "learning_rate": 1.185503309692883e-05, "loss": 0.3701, "step": 7586 }, { "epoch": 1.2984768098579496, "grad_norm": 14.718408584594727, "learning_rate": 1.184376838136139e-05, "loss": 1.0145, "step": 7587 }, { "epoch": 1.2986479548177305, "grad_norm": 16.853395462036133, "learning_rate": 1.1832505527975305e-05, "loss": 2.297, "step": 7588 }, { "epoch": 1.2988190997775115, "grad_norm": 0.6066228151321411, "learning_rate": 1.1821244543415678e-05, "loss": 0.1384, "step": 7589 }, { "epoch": 1.2989902447372925, "grad_norm": 6.714770317077637, "learning_rate": 1.1809985434326502e-05, "loss": 0.4904, "step": 7590 }, { "epoch": 1.2991613896970735, "grad_norm": 12.138726234436035, "learning_rate": 1.179872820735067e-05, "loss": 1.1924, "step": 7591 }, { "epoch": 1.2993325346568543, "grad_norm": 0.5841194987297058, "learning_rate": 1.1787472869129965e-05, "loss": 0.1313, "step": 7592 }, { "epoch": 1.2995036796166353, "grad_norm": 15.870086669921875, "learning_rate": 1.1776219426305055e-05, "loss": 1.3421, "step": 7593 }, { "epoch": 1.2996748245764163, "grad_norm": 8.11445140838623, "learning_rate": 1.1764967885515483e-05, "loss": 0.6656, "step": 7594 }, { "epoch": 1.299845969536197, "grad_norm": 
10.974652290344238, "learning_rate": 1.1753718253399677e-05, "loss": 0.9709, "step": 7595 }, { "epoch": 1.300017114495978, "grad_norm": 8.759194374084473, "learning_rate": 1.1742470536594938e-05, "loss": 0.5787, "step": 7596 }, { "epoch": 1.300188259455759, "grad_norm": 22.7238826751709, "learning_rate": 1.1731224741737437e-05, "loss": 1.7518, "step": 7597 }, { "epoch": 1.30035940441554, "grad_norm": 15.450611114501953, "learning_rate": 1.1719980875462205e-05, "loss": 1.4182, "step": 7598 }, { "epoch": 1.300530549375321, "grad_norm": 13.895708084106445, "learning_rate": 1.170873894440314e-05, "loss": 0.9097, "step": 7599 }, { "epoch": 1.300701694335102, "grad_norm": 21.733577728271484, "learning_rate": 1.1697498955193e-05, "loss": 1.898, "step": 7600 }, { "epoch": 1.3008728392948827, "grad_norm": 12.212570190429688, "learning_rate": 1.1686260914463396e-05, "loss": 1.0078, "step": 7601 }, { "epoch": 1.3010439842546637, "grad_norm": 18.94055938720703, "learning_rate": 1.1675024828844786e-05, "loss": 1.3883, "step": 7602 }, { "epoch": 1.3012151292144447, "grad_norm": 18.536867141723633, "learning_rate": 1.1663790704966482e-05, "loss": 1.6046, "step": 7603 }, { "epoch": 1.3013862741742255, "grad_norm": 10.81980037689209, "learning_rate": 1.165255854945663e-05, "loss": 0.9442, "step": 7604 }, { "epoch": 1.3015574191340065, "grad_norm": 0.4216785430908203, "learning_rate": 1.1641328368942222e-05, "loss": 0.1273, "step": 7605 }, { "epoch": 1.3017285640937875, "grad_norm": 26.57358741760254, "learning_rate": 1.1630100170049076e-05, "loss": 3.7187, "step": 7606 }, { "epoch": 1.3018997090535684, "grad_norm": 4.196712493896484, "learning_rate": 1.1618873959401848e-05, "loss": 0.3642, "step": 7607 }, { "epoch": 1.3020708540133494, "grad_norm": 3.9136526584625244, "learning_rate": 1.1607649743624024e-05, "loss": 0.5315, "step": 7608 }, { "epoch": 1.3022419989731302, "grad_norm": 2.5337984561920166, "learning_rate": 1.15964275293379e-05, "loss": 0.2766, "step": 7609 }, { 
"epoch": 1.3024131439329112, "grad_norm": 19.823795318603516, "learning_rate": 1.1585207323164607e-05, "loss": 1.5666, "step": 7610 }, { "epoch": 1.3025842888926922, "grad_norm": 17.423213958740234, "learning_rate": 1.1573989131724079e-05, "loss": 1.2708, "step": 7611 }, { "epoch": 1.302755433852473, "grad_norm": 23.07868194580078, "learning_rate": 1.1562772961635064e-05, "loss": 1.222, "step": 7612 }, { "epoch": 1.302926578812254, "grad_norm": 3.356442928314209, "learning_rate": 1.1551558819515127e-05, "loss": 0.3407, "step": 7613 }, { "epoch": 1.303097723772035, "grad_norm": 1.479873538017273, "learning_rate": 1.154034671198062e-05, "loss": 0.2154, "step": 7614 }, { "epoch": 1.303268868731816, "grad_norm": 20.11029052734375, "learning_rate": 1.1529136645646705e-05, "loss": 2.3389, "step": 7615 }, { "epoch": 1.303440013691597, "grad_norm": 6.832971096038818, "learning_rate": 1.1517928627127338e-05, "loss": 0.4051, "step": 7616 }, { "epoch": 1.3036111586513777, "grad_norm": 7.990339279174805, "learning_rate": 1.1506722663035266e-05, "loss": 0.5021, "step": 7617 }, { "epoch": 1.3037823036111587, "grad_norm": 15.064433097839355, "learning_rate": 1.1495518759982024e-05, "loss": 1.0979, "step": 7618 }, { "epoch": 1.3039534485709396, "grad_norm": 14.080619812011719, "learning_rate": 1.1484316924577938e-05, "loss": 1.0404, "step": 7619 }, { "epoch": 1.3041245935307204, "grad_norm": 25.42909812927246, "learning_rate": 1.1473117163432102e-05, "loss": 5.4308, "step": 7620 }, { "epoch": 1.3042957384905014, "grad_norm": 18.933637619018555, "learning_rate": 1.1461919483152392e-05, "loss": 2.5292, "step": 7621 }, { "epoch": 1.3044668834502824, "grad_norm": 0.5344691276550293, "learning_rate": 1.1450723890345459e-05, "loss": 0.1414, "step": 7622 }, { "epoch": 1.3046380284100634, "grad_norm": 22.04081916809082, "learning_rate": 1.1439530391616711e-05, "loss": 1.8622, "step": 7623 }, { "epoch": 1.3048091733698444, "grad_norm": 0.5450618863105774, "learning_rate": 
1.1428338993570341e-05, "loss": 0.1399, "step": 7624 }, { "epoch": 1.3049803183296251, "grad_norm": 19.328487396240234, "learning_rate": 1.1417149702809283e-05, "loss": 1.4455, "step": 7625 }, { "epoch": 1.3051514632894061, "grad_norm": 10.009942054748535, "learning_rate": 1.1405962525935237e-05, "loss": 0.7642, "step": 7626 }, { "epoch": 1.305322608249187, "grad_norm": 19.036094665527344, "learning_rate": 1.1394777469548654e-05, "loss": 1.6603, "step": 7627 }, { "epoch": 1.3054937532089679, "grad_norm": 7.515437126159668, "learning_rate": 1.1383594540248733e-05, "loss": 0.6696, "step": 7628 }, { "epoch": 1.3056648981687489, "grad_norm": 10.558402061462402, "learning_rate": 1.1372413744633417e-05, "loss": 0.7952, "step": 7629 }, { "epoch": 1.3058360431285299, "grad_norm": 12.095398902893066, "learning_rate": 1.1361235089299398e-05, "loss": 1.2338, "step": 7630 }, { "epoch": 1.3060071880883108, "grad_norm": 10.3566312789917, "learning_rate": 1.1350058580842098e-05, "loss": 0.9752, "step": 7631 }, { "epoch": 1.3061783330480918, "grad_norm": 4.167954444885254, "learning_rate": 1.133888422585567e-05, "loss": 0.384, "step": 7632 }, { "epoch": 1.3063494780078726, "grad_norm": 18.931066513061523, "learning_rate": 1.1327712030933002e-05, "loss": 2.0, "step": 7633 }, { "epoch": 1.3065206229676536, "grad_norm": 123.077392578125, "learning_rate": 1.1316542002665701e-05, "loss": 7.1118, "step": 7634 }, { "epoch": 1.3066917679274346, "grad_norm": 12.888384819030762, "learning_rate": 1.1305374147644112e-05, "loss": 1.194, "step": 7635 }, { "epoch": 1.3068629128872153, "grad_norm": 13.995150566101074, "learning_rate": 1.1294208472457276e-05, "loss": 1.0899, "step": 7636 }, { "epoch": 1.3070340578469963, "grad_norm": 7.124889373779297, "learning_rate": 1.128304498369296e-05, "loss": 0.4111, "step": 7637 }, { "epoch": 1.3072052028067773, "grad_norm": 9.083507537841797, "learning_rate": 1.1271883687937645e-05, "loss": 1.0383, "step": 7638 }, { "epoch": 1.3073763477665583, 
"grad_norm": 10.323081016540527, "learning_rate": 1.1260724591776502e-05, "loss": 0.8118, "step": 7639 }, { "epoch": 1.3075474927263393, "grad_norm": 12.912384986877441, "learning_rate": 1.1249567701793422e-05, "loss": 1.1339, "step": 7640 }, { "epoch": 1.30771863768612, "grad_norm": 63.08304214477539, "learning_rate": 1.1238413024570982e-05, "loss": 6.4905, "step": 7641 }, { "epoch": 1.307889782645901, "grad_norm": 17.389144897460938, "learning_rate": 1.122726056669046e-05, "loss": 1.6189, "step": 7642 }, { "epoch": 1.308060927605682, "grad_norm": 7.14937162399292, "learning_rate": 1.1216110334731825e-05, "loss": 0.6183, "step": 7643 }, { "epoch": 1.3082320725654628, "grad_norm": 5.7636919021606445, "learning_rate": 1.1204962335273728e-05, "loss": 0.4276, "step": 7644 }, { "epoch": 1.3084032175252438, "grad_norm": 13.262709617614746, "learning_rate": 1.1193816574893499e-05, "loss": 0.8461, "step": 7645 }, { "epoch": 1.3085743624850248, "grad_norm": 17.416807174682617, "learning_rate": 1.1182673060167168e-05, "loss": 1.6257, "step": 7646 }, { "epoch": 1.3087455074448058, "grad_norm": 21.898544311523438, "learning_rate": 1.1171531797669413e-05, "loss": 1.7339, "step": 7647 }, { "epoch": 1.3089166524045868, "grad_norm": 21.608203887939453, "learning_rate": 1.1160392793973605e-05, "loss": 2.1039, "step": 7648 }, { "epoch": 1.3090877973643678, "grad_norm": 22.45233726501465, "learning_rate": 1.1149256055651767e-05, "loss": 2.9647, "step": 7649 }, { "epoch": 1.3092589423241485, "grad_norm": 21.876420974731445, "learning_rate": 1.113812158927458e-05, "loss": 5.0497, "step": 7650 }, { "epoch": 1.3094300872839295, "grad_norm": 16.637968063354492, "learning_rate": 1.1126989401411418e-05, "loss": 1.7433, "step": 7651 }, { "epoch": 1.3096012322437105, "grad_norm": 9.169917106628418, "learning_rate": 1.1115859498630277e-05, "loss": 1.1167, "step": 7652 }, { "epoch": 1.3097723772034913, "grad_norm": 16.875045776367188, "learning_rate": 1.1104731887497817e-05, "loss": 1.5559, 
"step": 7653 }, { "epoch": 1.3099435221632723, "grad_norm": 0.782894492149353, "learning_rate": 1.1093606574579346e-05, "loss": 0.135, "step": 7654 }, { "epoch": 1.3101146671230532, "grad_norm": 19.62564468383789, "learning_rate": 1.1082483566438814e-05, "loss": 2.1235, "step": 7655 }, { "epoch": 1.3102858120828342, "grad_norm": 14.405755996704102, "learning_rate": 1.107136286963881e-05, "loss": 1.208, "step": 7656 }, { "epoch": 1.3104569570426152, "grad_norm": 13.453514099121094, "learning_rate": 1.1060244490740567e-05, "loss": 1.0998, "step": 7657 }, { "epoch": 1.310628102002396, "grad_norm": 7.483944892883301, "learning_rate": 1.1049128436303943e-05, "loss": 0.4954, "step": 7658 }, { "epoch": 1.310799246962177, "grad_norm": 139.34796142578125, "learning_rate": 1.1038014712887425e-05, "loss": 7.8605, "step": 7659 }, { "epoch": 1.310970391921958, "grad_norm": 20.454601287841797, "learning_rate": 1.1026903327048128e-05, "loss": 2.6431, "step": 7660 }, { "epoch": 1.3111415368817387, "grad_norm": 18.389663696289062, "learning_rate": 1.1015794285341782e-05, "loss": 2.1709, "step": 7661 }, { "epoch": 1.3113126818415197, "grad_norm": 39.07166290283203, "learning_rate": 1.1004687594322747e-05, "loss": 5.7758, "step": 7662 }, { "epoch": 1.3114838268013007, "grad_norm": 5.378295421600342, "learning_rate": 1.0993583260543978e-05, "loss": 0.3618, "step": 7663 }, { "epoch": 1.3116549717610817, "grad_norm": 23.937475204467773, "learning_rate": 1.0982481290557056e-05, "loss": 1.1674, "step": 7664 }, { "epoch": 1.3118261167208627, "grad_norm": 7.371132850646973, "learning_rate": 1.0971381690912159e-05, "loss": 0.7905, "step": 7665 }, { "epoch": 1.3119972616806435, "grad_norm": 9.415770530700684, "learning_rate": 1.0960284468158055e-05, "loss": 1.0082, "step": 7666 }, { "epoch": 1.3121684066404244, "grad_norm": 12.06342887878418, "learning_rate": 1.0949189628842139e-05, "loss": 0.9626, "step": 7667 }, { "epoch": 1.3123395516002054, "grad_norm": 0.5903443098068237, 
"learning_rate": 1.0938097179510376e-05, "loss": 0.1343, "step": 7668 }, { "epoch": 1.3125106965599862, "grad_norm": 7.682429790496826, "learning_rate": 1.0927007126707325e-05, "loss": 0.4691, "step": 7669 }, { "epoch": 1.3126818415197672, "grad_norm": 10.526029586791992, "learning_rate": 1.0915919476976142e-05, "loss": 1.0967, "step": 7670 }, { "epoch": 1.3128529864795482, "grad_norm": 11.364990234375, "learning_rate": 1.0904834236858544e-05, "loss": 1.022, "step": 7671 }, { "epoch": 1.3130241314393292, "grad_norm": 9.109969139099121, "learning_rate": 1.0893751412894843e-05, "loss": 0.7541, "step": 7672 }, { "epoch": 1.3131952763991102, "grad_norm": 15.1395902633667, "learning_rate": 1.0882671011623927e-05, "loss": 1.9152, "step": 7673 }, { "epoch": 1.313366421358891, "grad_norm": 17.07834243774414, "learning_rate": 1.0871593039583253e-05, "loss": 1.2759, "step": 7674 }, { "epoch": 1.313537566318672, "grad_norm": 22.273035049438477, "learning_rate": 1.086051750330883e-05, "loss": 5.0596, "step": 7675 }, { "epoch": 1.313708711278453, "grad_norm": 2.752251148223877, "learning_rate": 1.0849444409335247e-05, "loss": 0.2592, "step": 7676 }, { "epoch": 1.3138798562382337, "grad_norm": 4.555734634399414, "learning_rate": 1.0838373764195636e-05, "loss": 0.3269, "step": 7677 }, { "epoch": 1.3140510011980147, "grad_norm": 0.5538658499717712, "learning_rate": 1.0827305574421713e-05, "loss": 0.1312, "step": 7678 }, { "epoch": 1.3142221461577956, "grad_norm": 15.580399513244629, "learning_rate": 1.0816239846543714e-05, "loss": 1.2762, "step": 7679 }, { "epoch": 1.3143932911175766, "grad_norm": 4.781283378601074, "learning_rate": 1.0805176587090435e-05, "loss": 0.5832, "step": 7680 }, { "epoch": 1.3145644360773576, "grad_norm": 10.91511058807373, "learning_rate": 1.079411580258922e-05, "loss": 0.9796, "step": 7681 }, { "epoch": 1.3147355810371384, "grad_norm": 1.3821643590927124, "learning_rate": 1.0783057499565945e-05, "loss": 0.1531, "step": 7682 }, { "epoch": 
1.3149067259969194, "grad_norm": 18.001895904541016, "learning_rate": 1.0772001684545027e-05, "loss": 1.8831, "step": 7683 }, { "epoch": 1.3150778709567004, "grad_norm": 18.8228759765625, "learning_rate": 1.0760948364049413e-05, "loss": 1.4197, "step": 7684 }, { "epoch": 1.3152490159164811, "grad_norm": 15.630839347839355, "learning_rate": 1.0749897544600576e-05, "loss": 1.2747, "step": 7685 }, { "epoch": 1.3154201608762621, "grad_norm": 15.606013298034668, "learning_rate": 1.0738849232718523e-05, "loss": 1.2318, "step": 7686 }, { "epoch": 1.3155913058360431, "grad_norm": 4.676835536956787, "learning_rate": 1.0727803434921765e-05, "loss": 0.2796, "step": 7687 }, { "epoch": 1.315762450795824, "grad_norm": 26.167057037353516, "learning_rate": 1.0716760157727336e-05, "loss": 5.4689, "step": 7688 }, { "epoch": 1.315933595755605, "grad_norm": 19.37649154663086, "learning_rate": 1.0705719407650805e-05, "loss": 1.9204, "step": 7689 }, { "epoch": 1.3161047407153859, "grad_norm": 16.943334579467773, "learning_rate": 1.0694681191206218e-05, "loss": 1.5151, "step": 7690 }, { "epoch": 1.3162758856751668, "grad_norm": 12.672527313232422, "learning_rate": 1.068364551490614e-05, "loss": 0.9585, "step": 7691 }, { "epoch": 1.3164470306349478, "grad_norm": 14.246026992797852, "learning_rate": 1.0672612385261636e-05, "loss": 1.1833, "step": 7692 }, { "epoch": 1.3166181755947286, "grad_norm": 19.577896118164062, "learning_rate": 1.0661581808782264e-05, "loss": 1.6338, "step": 7693 }, { "epoch": 1.3167893205545096, "grad_norm": 18.652299880981445, "learning_rate": 1.0650553791976096e-05, "loss": 1.8381, "step": 7694 }, { "epoch": 1.3169604655142906, "grad_norm": 14.095416069030762, "learning_rate": 1.0639528341349668e-05, "loss": 1.135, "step": 7695 }, { "epoch": 1.3171316104740716, "grad_norm": 13.167633056640625, "learning_rate": 1.062850546340801e-05, "loss": 1.0979, "step": 7696 }, { "epoch": 1.3173027554338526, "grad_norm": 18.160646438598633, "learning_rate": 
1.0617485164654645e-05, "loss": 2.445, "step": 7697 }, { "epoch": 1.3174739003936335, "grad_norm": 10.149754524230957, "learning_rate": 1.0606467451591556e-05, "loss": 0.8535, "step": 7698 }, { "epoch": 1.3176450453534143, "grad_norm": 12.657750129699707, "learning_rate": 1.0595452330719214e-05, "loss": 0.8057, "step": 7699 }, { "epoch": 1.3178161903131953, "grad_norm": 19.711631774902344, "learning_rate": 1.058443980853656e-05, "loss": 2.5439, "step": 7700 }, { "epoch": 1.3179873352729763, "grad_norm": 13.596765518188477, "learning_rate": 1.0573429891540995e-05, "loss": 0.9922, "step": 7701 }, { "epoch": 1.318158480232757, "grad_norm": 11.752197265625, "learning_rate": 1.056242258622839e-05, "loss": 0.9834, "step": 7702 }, { "epoch": 1.318329625192538, "grad_norm": 13.766919136047363, "learning_rate": 1.0551417899093064e-05, "loss": 1.2064, "step": 7703 }, { "epoch": 1.318500770152319, "grad_norm": 3.4079298973083496, "learning_rate": 1.05404158366278e-05, "loss": 0.3151, "step": 7704 }, { "epoch": 1.3186719151121, "grad_norm": 17.611780166625977, "learning_rate": 1.0529416405323839e-05, "loss": 2.0194, "step": 7705 }, { "epoch": 1.318843060071881, "grad_norm": 6.004448890686035, "learning_rate": 1.0518419611670863e-05, "loss": 0.3998, "step": 7706 }, { "epoch": 1.3190142050316618, "grad_norm": 0.5409048795700073, "learning_rate": 1.0507425462156985e-05, "loss": 0.1352, "step": 7707 }, { "epoch": 1.3191853499914428, "grad_norm": 11.109127044677734, "learning_rate": 1.0496433963268778e-05, "loss": 1.2146, "step": 7708 }, { "epoch": 1.3193564949512238, "grad_norm": 13.240180969238281, "learning_rate": 1.0485445121491234e-05, "loss": 1.0734, "step": 7709 }, { "epoch": 1.3195276399110045, "grad_norm": 11.445103645324707, "learning_rate": 1.0474458943307803e-05, "loss": 0.872, "step": 7710 }, { "epoch": 1.3196987848707855, "grad_norm": 13.537281036376953, "learning_rate": 1.0463475435200332e-05, "loss": 1.038, "step": 7711 }, { "epoch": 1.3198699298305665, "grad_norm": 
100.9023666381836, "learning_rate": 1.0452494603649108e-05, "loss": 7.4707, "step": 7712 }, { "epoch": 1.3200410747903475, "grad_norm": 14.019550323486328, "learning_rate": 1.0441516455132846e-05, "loss": 1.2618, "step": 7713 }, { "epoch": 1.3202122197501285, "grad_norm": 4.731026649475098, "learning_rate": 1.0430540996128663e-05, "loss": 0.3173, "step": 7714 }, { "epoch": 1.3203833647099092, "grad_norm": 12.830647468566895, "learning_rate": 1.0419568233112095e-05, "loss": 1.0202, "step": 7715 }, { "epoch": 1.3205545096696902, "grad_norm": 8.369707107543945, "learning_rate": 1.0408598172557096e-05, "loss": 0.6916, "step": 7716 }, { "epoch": 1.3207256546294712, "grad_norm": 7.795764446258545, "learning_rate": 1.0397630820936014e-05, "loss": 0.767, "step": 7717 }, { "epoch": 1.320896799589252, "grad_norm": 15.821785926818848, "learning_rate": 1.03866661847196e-05, "loss": 1.7533, "step": 7718 }, { "epoch": 1.321067944549033, "grad_norm": 30.86501693725586, "learning_rate": 1.0375704270377012e-05, "loss": 5.3286, "step": 7719 }, { "epoch": 1.321239089508814, "grad_norm": 0.4868166446685791, "learning_rate": 1.0364745084375787e-05, "loss": 0.1375, "step": 7720 }, { "epoch": 1.321410234468595, "grad_norm": 19.305397033691406, "learning_rate": 1.035378863318187e-05, "loss": 2.0752, "step": 7721 }, { "epoch": 1.321581379428376, "grad_norm": 4.07341194152832, "learning_rate": 1.034283492325958e-05, "loss": 0.3295, "step": 7722 }, { "epoch": 1.3217525243881567, "grad_norm": 15.446962356567383, "learning_rate": 1.033188396107162e-05, "loss": 1.3605, "step": 7723 }, { "epoch": 1.3219236693479377, "grad_norm": 17.727609634399414, "learning_rate": 1.0320935753079077e-05, "loss": 1.2988, "step": 7724 }, { "epoch": 1.3220948143077187, "grad_norm": 12.297486305236816, "learning_rate": 1.0309990305741412e-05, "loss": 1.0018, "step": 7725 }, { "epoch": 1.3222659592674995, "grad_norm": 8.26491641998291, "learning_rate": 1.0299047625516452e-05, "loss": 0.8143, "step": 7726 }, { 
"epoch": 1.3224371042272804, "grad_norm": 7.907574653625488, "learning_rate": 1.028810771886039e-05, "loss": 1.0222, "step": 7727 }, { "epoch": 1.3226082491870614, "grad_norm": 0.49876004457473755, "learning_rate": 1.0277170592227796e-05, "loss": 0.1302, "step": 7728 }, { "epoch": 1.3227793941468424, "grad_norm": 13.960297584533691, "learning_rate": 1.0266236252071584e-05, "loss": 1.0474, "step": 7729 }, { "epoch": 1.3229505391066234, "grad_norm": 0.38284948468208313, "learning_rate": 1.0255304704843037e-05, "loss": 0.1207, "step": 7730 }, { "epoch": 1.3231216840664042, "grad_norm": 21.4552001953125, "learning_rate": 1.0244375956991776e-05, "loss": 1.9789, "step": 7731 }, { "epoch": 1.3232928290261852, "grad_norm": 0.8332533240318298, "learning_rate": 1.0233450014965787e-05, "loss": 0.1405, "step": 7732 }, { "epoch": 1.3234639739859662, "grad_norm": 15.524604797363281, "learning_rate": 1.022252688521139e-05, "loss": 1.1942, "step": 7733 }, { "epoch": 1.323635118945747, "grad_norm": 4.579717636108398, "learning_rate": 1.0211606574173245e-05, "loss": 0.3253, "step": 7734 }, { "epoch": 1.323806263905528, "grad_norm": 5.621901988983154, "learning_rate": 1.0200689088294356e-05, "loss": 0.4184, "step": 7735 }, { "epoch": 1.323977408865309, "grad_norm": 16.8519229888916, "learning_rate": 1.0189774434016048e-05, "loss": 1.1868, "step": 7736 }, { "epoch": 1.32414855382509, "grad_norm": 20.170482635498047, "learning_rate": 1.017886261777799e-05, "loss": 2.5078, "step": 7737 }, { "epoch": 1.3243196987848709, "grad_norm": 21.047101974487305, "learning_rate": 1.0167953646018171e-05, "loss": 2.4527, "step": 7738 }, { "epoch": 1.3244908437446516, "grad_norm": 18.5394229888916, "learning_rate": 1.0157047525172897e-05, "loss": 2.0388, "step": 7739 }, { "epoch": 1.3246619887044326, "grad_norm": 2.999378204345703, "learning_rate": 1.0146144261676798e-05, "loss": 0.2756, "step": 7740 }, { "epoch": 1.3248331336642136, "grad_norm": 5.7144269943237305, "learning_rate": 
1.0135243861962813e-05, "loss": 0.4059, "step": 7741 }, { "epoch": 1.3250042786239944, "grad_norm": 13.805132865905762, "learning_rate": 1.0124346332462198e-05, "loss": 1.1937, "step": 7742 }, { "epoch": 1.3251754235837754, "grad_norm": 12.285717964172363, "learning_rate": 1.0113451679604507e-05, "loss": 1.3567, "step": 7743 }, { "epoch": 1.3253465685435564, "grad_norm": 10.607677459716797, "learning_rate": 1.0102559909817604e-05, "loss": 0.8848, "step": 7744 }, { "epoch": 1.3255177135033374, "grad_norm": 9.427162170410156, "learning_rate": 1.0091671029527644e-05, "loss": 0.6905, "step": 7745 }, { "epoch": 1.3256888584631183, "grad_norm": 0.6724772453308105, "learning_rate": 1.0080785045159091e-05, "loss": 0.1361, "step": 7746 }, { "epoch": 1.3258600034228991, "grad_norm": 10.575011253356934, "learning_rate": 1.0069901963134687e-05, "loss": 0.7674, "step": 7747 }, { "epoch": 1.32603114838268, "grad_norm": 10.72413158416748, "learning_rate": 1.005902178987547e-05, "loss": 0.8709, "step": 7748 }, { "epoch": 1.326202293342461, "grad_norm": 21.01888656616211, "learning_rate": 1.0048144531800754e-05, "loss": 1.8226, "step": 7749 }, { "epoch": 1.326373438302242, "grad_norm": 6.55217981338501, "learning_rate": 1.0037270195328141e-05, "loss": 0.382, "step": 7750 }, { "epoch": 1.3265445832620228, "grad_norm": 10.578644752502441, "learning_rate": 1.0026398786873505e-05, "loss": 0.7, "step": 7751 }, { "epoch": 1.3267157282218038, "grad_norm": 14.880857467651367, "learning_rate": 1.0015530312850989e-05, "loss": 1.1683, "step": 7752 }, { "epoch": 1.3268868731815848, "grad_norm": 9.10450553894043, "learning_rate": 1.0004664779673017e-05, "loss": 0.4336, "step": 7753 }, { "epoch": 1.3270580181413658, "grad_norm": 7.297650337219238, "learning_rate": 9.993802193750263e-06, "loss": 0.458, "step": 7754 }, { "epoch": 1.3272291631011468, "grad_norm": 20.097755432128906, "learning_rate": 9.982942561491673e-06, "loss": 2.2245, "step": 7755 }, { "epoch": 1.3274003080609276, "grad_norm": 
19.3148136138916, "learning_rate": 9.972085889304445e-06, "loss": 1.512, "step": 7756 }, { "epoch": 1.3275714530207086, "grad_norm": 24.726036071777344, "learning_rate": 9.96123218359403e-06, "loss": 3.2435, "step": 7757 }, { "epoch": 1.3277425979804895, "grad_norm": 14.569957733154297, "learning_rate": 9.95038145076413e-06, "loss": 1.122, "step": 7758 }, { "epoch": 1.3279137429402703, "grad_norm": 16.211366653442383, "learning_rate": 9.939533697216696e-06, "loss": 1.1394, "step": 7759 }, { "epoch": 1.3280848879000513, "grad_norm": 0.47790247201919556, "learning_rate": 9.92868892935192e-06, "loss": 0.132, "step": 7760 }, { "epoch": 1.3282560328598323, "grad_norm": 10.083187103271484, "learning_rate": 9.917847153568227e-06, "loss": 0.687, "step": 7761 }, { "epoch": 1.3284271778196133, "grad_norm": 10.120450973510742, "learning_rate": 9.907008376262288e-06, "loss": 0.6699, "step": 7762 }, { "epoch": 1.3285983227793943, "grad_norm": 16.270835876464844, "learning_rate": 9.896172603828982e-06, "loss": 1.3664, "step": 7763 }, { "epoch": 1.328769467739175, "grad_norm": 15.282939910888672, "learning_rate": 9.88533984266145e-06, "loss": 1.2255, "step": 7764 }, { "epoch": 1.328940612698956, "grad_norm": 18.38252830505371, "learning_rate": 9.874510099151028e-06, "loss": 1.4593, "step": 7765 }, { "epoch": 1.329111757658737, "grad_norm": 7.455735683441162, "learning_rate": 9.863683379687281e-06, "loss": 0.8564, "step": 7766 }, { "epoch": 1.3292829026185178, "grad_norm": 7.662227630615234, "learning_rate": 9.852859690657995e-06, "loss": 0.6627, "step": 7767 }, { "epoch": 1.3294540475782988, "grad_norm": 15.670798301696777, "learning_rate": 9.842039038449153e-06, "loss": 1.2757, "step": 7768 }, { "epoch": 1.3296251925380798, "grad_norm": 33.80439758300781, "learning_rate": 9.831221429444963e-06, "loss": 5.7635, "step": 7769 }, { "epoch": 1.3297963374978607, "grad_norm": 145.04127502441406, "learning_rate": 9.820406870027826e-06, "loss": 8.283, "step": 7770 }, { "epoch": 
1.3299674824576417, "grad_norm": 17.340248107910156, "learning_rate": 9.809595366578351e-06, "loss": 1.3826, "step": 7771 }, { "epoch": 1.3301386274174225, "grad_norm": 21.254674911499023, "learning_rate": 9.798786925475342e-06, "loss": 1.8894, "step": 7772 }, { "epoch": 1.3303097723772035, "grad_norm": 11.893777847290039, "learning_rate": 9.787981553095794e-06, "loss": 0.8193, "step": 7773 }, { "epoch": 1.3304809173369845, "grad_norm": 22.362438201904297, "learning_rate": 9.777179255814888e-06, "loss": 2.1771, "step": 7774 }, { "epoch": 1.3306520622967652, "grad_norm": 21.80306053161621, "learning_rate": 9.766380040006005e-06, "loss": 3.0675, "step": 7775 }, { "epoch": 1.3308232072565462, "grad_norm": 1.6046303510665894, "learning_rate": 9.755583912040692e-06, "loss": 0.2445, "step": 7776 }, { "epoch": 1.3309943522163272, "grad_norm": 9.884397506713867, "learning_rate": 9.744790878288683e-06, "loss": 0.7851, "step": 7777 }, { "epoch": 1.3311654971761082, "grad_norm": 20.357301712036133, "learning_rate": 9.734000945117886e-06, "loss": 1.7883, "step": 7778 }, { "epoch": 1.3313366421358892, "grad_norm": 129.23085021972656, "learning_rate": 9.723214118894366e-06, "loss": 9.2255, "step": 7779 }, { "epoch": 1.33150778709567, "grad_norm": 13.197471618652344, "learning_rate": 9.712430405982382e-06, "loss": 0.8971, "step": 7780 }, { "epoch": 1.331678932055451, "grad_norm": 18.437232971191406, "learning_rate": 9.701649812744335e-06, "loss": 1.4011, "step": 7781 }, { "epoch": 1.331850077015232, "grad_norm": 12.094624519348145, "learning_rate": 9.69087234554079e-06, "loss": 0.7557, "step": 7782 }, { "epoch": 1.3320212219750127, "grad_norm": 0.46964266896247864, "learning_rate": 9.680098010730468e-06, "loss": 0.1422, "step": 7783 }, { "epoch": 1.3321923669347937, "grad_norm": 10.259994506835938, "learning_rate": 9.669326814670244e-06, "loss": 0.8023, "step": 7784 }, { "epoch": 1.3323635118945747, "grad_norm": 13.318816184997559, "learning_rate": 9.658558763715139e-06, "loss": 
1.0209, "step": 7785 }, { "epoch": 1.3325346568543557, "grad_norm": 14.060872077941895, "learning_rate": 9.647793864218318e-06, "loss": 1.2461, "step": 7786 }, { "epoch": 1.3327058018141367, "grad_norm": 40.56553649902344, "learning_rate": 9.63703212253109e-06, "loss": 5.0031, "step": 7787 }, { "epoch": 1.3328769467739174, "grad_norm": 12.271203994750977, "learning_rate": 9.626273545002897e-06, "loss": 1.0462, "step": 7788 }, { "epoch": 1.3330480917336984, "grad_norm": 12.823019027709961, "learning_rate": 9.615518137981317e-06, "loss": 0.9287, "step": 7789 }, { "epoch": 1.3332192366934794, "grad_norm": 72.58329772949219, "learning_rate": 9.604765907812051e-06, "loss": 7.3743, "step": 7790 }, { "epoch": 1.3333903816532602, "grad_norm": 17.8341064453125, "learning_rate": 9.59401686083894e-06, "loss": 2.4335, "step": 7791 }, { "epoch": 1.3335615266130412, "grad_norm": 18.05230140686035, "learning_rate": 9.583271003403932e-06, "loss": 1.3464, "step": 7792 }, { "epoch": 1.3337326715728222, "grad_norm": 22.57405662536621, "learning_rate": 9.5725283418471e-06, "loss": 2.1458, "step": 7793 }, { "epoch": 1.3339038165326031, "grad_norm": 21.594478607177734, "learning_rate": 9.561788882506636e-06, "loss": 1.1118, "step": 7794 }, { "epoch": 1.3340749614923841, "grad_norm": 20.780794143676758, "learning_rate": 9.55105263171882e-06, "loss": 2.7362, "step": 7795 }, { "epoch": 1.334246106452165, "grad_norm": 20.054019927978516, "learning_rate": 9.540319595818072e-06, "loss": 2.4585, "step": 7796 }, { "epoch": 1.334417251411946, "grad_norm": 8.055940628051758, "learning_rate": 9.529589781136899e-06, "loss": 0.6487, "step": 7797 }, { "epoch": 1.3345883963717269, "grad_norm": 13.177647590637207, "learning_rate": 9.518863194005898e-06, "loss": 0.9085, "step": 7798 }, { "epoch": 1.3347595413315076, "grad_norm": 16.22951316833496, "learning_rate": 9.508139840753782e-06, "loss": 1.3964, "step": 7799 }, { "epoch": 1.3349306862912886, "grad_norm": 23.331989288330078, "learning_rate": 
9.49741972770733e-06, "loss": 5.3252, "step": 7800 }, { "epoch": 1.3351018312510696, "grad_norm": 12.572565078735352, "learning_rate": 9.486702861191439e-06, "loss": 1.1444, "step": 7801 }, { "epoch": 1.3352729762108506, "grad_norm": 5.193902015686035, "learning_rate": 9.475989247529075e-06, "loss": 0.547, "step": 7802 }, { "epoch": 1.3354441211706316, "grad_norm": 57.83475112915039, "learning_rate": 9.465278893041285e-06, "loss": 6.9681, "step": 7803 }, { "epoch": 1.3356152661304126, "grad_norm": 17.487966537475586, "learning_rate": 9.454571804047189e-06, "loss": 1.8436, "step": 7804 }, { "epoch": 1.3357864110901934, "grad_norm": 15.3053617477417, "learning_rate": 9.443867986863986e-06, "loss": 1.2377, "step": 7805 }, { "epoch": 1.3359575560499743, "grad_norm": 15.734844207763672, "learning_rate": 9.433167447806942e-06, "loss": 1.4479, "step": 7806 }, { "epoch": 1.3361287010097553, "grad_norm": 14.855875968933105, "learning_rate": 9.422470193189406e-06, "loss": 1.3216, "step": 7807 }, { "epoch": 1.336299845969536, "grad_norm": 14.787861824035645, "learning_rate": 9.411776229322759e-06, "loss": 1.3146, "step": 7808 }, { "epoch": 1.336470990929317, "grad_norm": 22.168827056884766, "learning_rate": 9.40108556251646e-06, "loss": 5.2831, "step": 7809 }, { "epoch": 1.336642135889098, "grad_norm": 1.7247884273529053, "learning_rate": 9.390398199078018e-06, "loss": 0.2437, "step": 7810 }, { "epoch": 1.336813280848879, "grad_norm": 5.1476593017578125, "learning_rate": 9.379714145312994e-06, "loss": 0.2906, "step": 7811 }, { "epoch": 1.33698442580866, "grad_norm": 17.64693832397461, "learning_rate": 9.369033407524996e-06, "loss": 1.0584, "step": 7812 }, { "epoch": 1.3371555707684408, "grad_norm": 15.794921875, "learning_rate": 9.358355992015674e-06, "loss": 1.1463, "step": 7813 }, { "epoch": 1.3373267157282218, "grad_norm": 19.980100631713867, "learning_rate": 9.34768190508472e-06, "loss": 2.206, "step": 7814 }, { "epoch": 1.3374978606880028, "grad_norm": 
1.3618296384811401, "learning_rate": 9.337011153029864e-06, "loss": 0.2121, "step": 7815 }, { "epoch": 1.3376690056477836, "grad_norm": 11.703173637390137, "learning_rate": 9.326343742146853e-06, "loss": 0.9959, "step": 7816 }, { "epoch": 1.3378401506075646, "grad_norm": 13.13909912109375, "learning_rate": 9.315679678729492e-06, "loss": 1.4271, "step": 7817 }, { "epoch": 1.3380112955673455, "grad_norm": 23.269561767578125, "learning_rate": 9.305018969069586e-06, "loss": 1.7931, "step": 7818 }, { "epoch": 1.3381824405271265, "grad_norm": 13.245935440063477, "learning_rate": 9.294361619456975e-06, "loss": 1.0624, "step": 7819 }, { "epoch": 1.3383535854869075, "grad_norm": 17.5803279876709, "learning_rate": 9.283707636179504e-06, "loss": 1.6743, "step": 7820 }, { "epoch": 1.3385247304466883, "grad_norm": 13.90809154510498, "learning_rate": 9.273057025523039e-06, "loss": 1.1273, "step": 7821 }, { "epoch": 1.3386958754064693, "grad_norm": 0.44874170422554016, "learning_rate": 9.262409793771455e-06, "loss": 0.1323, "step": 7822 }, { "epoch": 1.3388670203662503, "grad_norm": 4.137473106384277, "learning_rate": 9.251765947206648e-06, "loss": 0.3071, "step": 7823 }, { "epoch": 1.339038165326031, "grad_norm": 7.665952205657959, "learning_rate": 9.24112549210849e-06, "loss": 0.7075, "step": 7824 }, { "epoch": 1.339209310285812, "grad_norm": 14.941093444824219, "learning_rate": 9.230488434754869e-06, "loss": 1.2963, "step": 7825 }, { "epoch": 1.339380455245593, "grad_norm": 19.078445434570312, "learning_rate": 9.219854781421665e-06, "loss": 2.2177, "step": 7826 }, { "epoch": 1.339551600205374, "grad_norm": 18.343894958496094, "learning_rate": 9.209224538382751e-06, "loss": 1.8106, "step": 7827 }, { "epoch": 1.339722745165155, "grad_norm": 0.48707735538482666, "learning_rate": 9.198597711909983e-06, "loss": 0.1278, "step": 7828 }, { "epoch": 1.3398938901249358, "grad_norm": 76.68790435791016, "learning_rate": 9.187974308273206e-06, "loss": 7.0306, "step": 7829 }, { "epoch": 
1.3400650350847167, "grad_norm": 3.141286849975586, "learning_rate": 9.177354333740248e-06, "loss": 0.2852, "step": 7830 }, { "epoch": 1.3402361800444977, "grad_norm": 9.084844589233398, "learning_rate": 9.166737794576901e-06, "loss": 0.6802, "step": 7831 }, { "epoch": 1.3404073250042785, "grad_norm": 86.43070220947266, "learning_rate": 9.156124697046946e-06, "loss": 7.8303, "step": 7832 }, { "epoch": 1.3405784699640595, "grad_norm": 7.052088737487793, "learning_rate": 9.145515047412115e-06, "loss": 0.7937, "step": 7833 }, { "epoch": 1.3407496149238405, "grad_norm": 13.691901206970215, "learning_rate": 9.134908851932133e-06, "loss": 1.1782, "step": 7834 }, { "epoch": 1.3409207598836215, "grad_norm": 5.9041290283203125, "learning_rate": 9.124306116864668e-06, "loss": 0.4507, "step": 7835 }, { "epoch": 1.3410919048434025, "grad_norm": 12.721549034118652, "learning_rate": 9.113706848465341e-06, "loss": 0.8944, "step": 7836 }, { "epoch": 1.3412630498031832, "grad_norm": 16.74374771118164, "learning_rate": 9.103111052987743e-06, "loss": 1.4153, "step": 7837 }, { "epoch": 1.3414341947629642, "grad_norm": 14.554159164428711, "learning_rate": 9.0925187366834e-06, "loss": 1.3382, "step": 7838 }, { "epoch": 1.3416053397227452, "grad_norm": 13.031160354614258, "learning_rate": 9.08192990580181e-06, "loss": 0.746, "step": 7839 }, { "epoch": 1.341776484682526, "grad_norm": 11.126832962036133, "learning_rate": 9.071344566590387e-06, "loss": 0.9381, "step": 7840 }, { "epoch": 1.341947629642307, "grad_norm": 17.594663619995117, "learning_rate": 9.060762725294501e-06, "loss": 1.4099, "step": 7841 }, { "epoch": 1.342118774602088, "grad_norm": 23.681806564331055, "learning_rate": 9.050184388157454e-06, "loss": 5.1249, "step": 7842 }, { "epoch": 1.342289919561869, "grad_norm": 11.81534194946289, "learning_rate": 9.039609561420477e-06, "loss": 0.738, "step": 7843 }, { "epoch": 1.34246106452165, "grad_norm": 0.513480544090271, "learning_rate": 9.029038251322738e-06, "loss": 0.1294, 
"step": 7844 }, { "epoch": 1.3426322094814307, "grad_norm": 19.058822631835938, "learning_rate": 9.018470464101325e-06, "loss": 1.3968, "step": 7845 }, { "epoch": 1.3428033544412117, "grad_norm": 2.775284767150879, "learning_rate": 9.007906205991247e-06, "loss": 0.2575, "step": 7846 }, { "epoch": 1.3429744994009927, "grad_norm": 8.478485107421875, "learning_rate": 8.997345483225433e-06, "loss": 1.0727, "step": 7847 }, { "epoch": 1.3431456443607734, "grad_norm": 13.795414924621582, "learning_rate": 8.986788302034724e-06, "loss": 1.3467, "step": 7848 }, { "epoch": 1.3433167893205544, "grad_norm": 21.440906524658203, "learning_rate": 8.976234668647871e-06, "loss": 1.8343, "step": 7849 }, { "epoch": 1.3434879342803354, "grad_norm": 11.401413917541504, "learning_rate": 8.965684589291537e-06, "loss": 0.8268, "step": 7850 }, { "epoch": 1.3436590792401164, "grad_norm": 11.775639533996582, "learning_rate": 8.955138070190284e-06, "loss": 1.1194, "step": 7851 }, { "epoch": 1.3438302241998974, "grad_norm": 9.425366401672363, "learning_rate": 8.944595117566574e-06, "loss": 0.8341, "step": 7852 }, { "epoch": 1.3440013691596784, "grad_norm": 1.781152606010437, "learning_rate": 8.934055737640765e-06, "loss": 0.1981, "step": 7853 }, { "epoch": 1.3441725141194591, "grad_norm": 13.886120796203613, "learning_rate": 8.923519936631102e-06, "loss": 1.2971, "step": 7854 }, { "epoch": 1.3443436590792401, "grad_norm": 12.310486793518066, "learning_rate": 8.912987720753735e-06, "loss": 0.7952, "step": 7855 }, { "epoch": 1.3445148040390211, "grad_norm": 11.246068000793457, "learning_rate": 8.902459096222673e-06, "loss": 1.0557, "step": 7856 }, { "epoch": 1.344685948998802, "grad_norm": 11.441505432128906, "learning_rate": 8.891934069249827e-06, "loss": 0.8978, "step": 7857 }, { "epoch": 1.3448570939585829, "grad_norm": 13.720816612243652, "learning_rate": 8.881412646044977e-06, "loss": 1.0826, "step": 7858 }, { "epoch": 1.3450282389183639, "grad_norm": 13.07275104522705, "learning_rate": 
8.870894832815776e-06, "loss": 0.9818, "step": 7859 }, { "epoch": 1.3451993838781449, "grad_norm": 13.763599395751953, "learning_rate": 8.860380635767758e-06, "loss": 1.3965, "step": 7860 }, { "epoch": 1.3453705288379258, "grad_norm": 21.626535415649414, "learning_rate": 8.849870061104309e-06, "loss": 1.7967, "step": 7861 }, { "epoch": 1.3455416737977066, "grad_norm": 20.506174087524414, "learning_rate": 8.83936311502668e-06, "loss": 2.3941, "step": 7862 }, { "epoch": 1.3457128187574876, "grad_norm": 12.698630332946777, "learning_rate": 8.828859803733994e-06, "loss": 0.8943, "step": 7863 }, { "epoch": 1.3458839637172686, "grad_norm": 10.888079643249512, "learning_rate": 8.818360133423214e-06, "loss": 0.6842, "step": 7864 }, { "epoch": 1.3460551086770494, "grad_norm": 0.5401256680488586, "learning_rate": 8.807864110289159e-06, "loss": 0.1295, "step": 7865 }, { "epoch": 1.3462262536368304, "grad_norm": 7.9375081062316895, "learning_rate": 8.797371740524508e-06, "loss": 0.5116, "step": 7866 }, { "epoch": 1.3463973985966113, "grad_norm": 5.087529182434082, "learning_rate": 8.786883030319765e-06, "loss": 0.4002, "step": 7867 }, { "epoch": 1.3465685435563923, "grad_norm": 12.330865859985352, "learning_rate": 8.776397985863289e-06, "loss": 1.0029, "step": 7868 }, { "epoch": 1.3467396885161733, "grad_norm": 17.346025466918945, "learning_rate": 8.765916613341272e-06, "loss": 1.667, "step": 7869 }, { "epoch": 1.346910833475954, "grad_norm": 14.988444328308105, "learning_rate": 8.75543891893774e-06, "loss": 1.1046, "step": 7870 }, { "epoch": 1.347081978435735, "grad_norm": 20.520917892456055, "learning_rate": 8.744964908834543e-06, "loss": 2.4082, "step": 7871 }, { "epoch": 1.347253123395516, "grad_norm": 11.348979949951172, "learning_rate": 8.734494589211371e-06, "loss": 1.0192, "step": 7872 }, { "epoch": 1.3474242683552968, "grad_norm": 15.412310600280762, "learning_rate": 8.724027966245718e-06, "loss": 1.3022, "step": 7873 }, { "epoch": 1.3475954133150778, "grad_norm": 
20.15825080871582, "learning_rate": 8.71356504611292e-06, "loss": 2.031, "step": 7874 }, { "epoch": 1.3477665582748588, "grad_norm": 5.575507640838623, "learning_rate": 8.7031058349861e-06, "loss": 0.3702, "step": 7875 }, { "epoch": 1.3479377032346398, "grad_norm": 6.61264705657959, "learning_rate": 8.692650339036217e-06, "loss": 0.4081, "step": 7876 }, { "epoch": 1.3481088481944208, "grad_norm": 21.451274871826172, "learning_rate": 8.682198564432035e-06, "loss": 2.5121, "step": 7877 }, { "epoch": 1.3482799931542016, "grad_norm": 14.139251708984375, "learning_rate": 8.671750517340103e-06, "loss": 1.4738, "step": 7878 }, { "epoch": 1.3484511381139825, "grad_norm": 15.205315589904785, "learning_rate": 8.661306203924797e-06, "loss": 1.1368, "step": 7879 }, { "epoch": 1.3486222830737635, "grad_norm": 1.4786579608917236, "learning_rate": 8.650865630348275e-06, "loss": 0.195, "step": 7880 }, { "epoch": 1.3487934280335443, "grad_norm": 10.230877876281738, "learning_rate": 8.640428802770474e-06, "loss": 0.7811, "step": 7881 }, { "epoch": 1.3489645729933253, "grad_norm": 19.68021583557129, "learning_rate": 8.629995727349164e-06, "loss": 2.2237, "step": 7882 }, { "epoch": 1.3491357179531063, "grad_norm": 10.938794136047363, "learning_rate": 8.619566410239862e-06, "loss": 0.9205, "step": 7883 }, { "epoch": 1.3493068629128873, "grad_norm": 1.4069541692733765, "learning_rate": 8.609140857595876e-06, "loss": 0.1698, "step": 7884 }, { "epoch": 1.3494780078726683, "grad_norm": 0.7581135034561157, "learning_rate": 8.598719075568308e-06, "loss": 0.1348, "step": 7885 }, { "epoch": 1.349649152832449, "grad_norm": 0.621238648891449, "learning_rate": 8.58830107030601e-06, "loss": 0.1342, "step": 7886 }, { "epoch": 1.34982029779223, "grad_norm": 7.031002521514893, "learning_rate": 8.57788684795564e-06, "loss": 0.582, "step": 7887 }, { "epoch": 1.349991442752011, "grad_norm": 0.5243487358093262, "learning_rate": 8.567476414661596e-06, "loss": 0.1336, "step": 7888 }, { "epoch": 
1.3501625877117918, "grad_norm": 18.68903350830078, "learning_rate": 8.557069776566044e-06, "loss": 1.4371, "step": 7889 }, { "epoch": 1.3503337326715728, "grad_norm": 1.3490283489227295, "learning_rate": 8.546666939808924e-06, "loss": 0.2222, "step": 7890 }, { "epoch": 1.3505048776313537, "grad_norm": 9.998093605041504, "learning_rate": 8.536267910527919e-06, "loss": 0.7217, "step": 7891 }, { "epoch": 1.3506760225911347, "grad_norm": 11.268606185913086, "learning_rate": 8.52587269485847e-06, "loss": 0.9479, "step": 7892 }, { "epoch": 1.3508471675509157, "grad_norm": 20.187633514404297, "learning_rate": 8.515481298933783e-06, "loss": 1.8128, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_nli-pairs_loss": 1.4432600736618042, "eval_nli-pairs_runtime": 4.3769, "eval_nli-pairs_samples_per_second": 45.694, "eval_nli-pairs_steps_per_second": 1.599, "eval_sts-test_pearson_cosine": 0.7748113572228759, "eval_sts-test_pearson_dot": 0.6390425293409608, "eval_sts-test_pearson_euclidean": 0.7620744050210577, "eval_sts-test_pearson_manhattan": 0.7657457138434305, "eval_sts-test_pearson_max": 0.7748113572228759, "eval_sts-test_spearman_cosine": 0.7729829193564915, "eval_sts-test_spearman_dot": 0.6192746726630098, "eval_sts-test_spearman_euclidean": 0.7504799466626302, "eval_sts-test_spearman_manhattan": 0.755559036954118, "eval_sts-test_spearman_max": 0.7729829193564915, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_vitaminc-pairs_loss": 0.7334175109863281, "eval_vitaminc-pairs_runtime": 2.7709, "eval_vitaminc-pairs_samples_per_second": 72.178, "eval_vitaminc-pairs_steps_per_second": 2.526, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_qnli-contrastive_loss": 1.5698559284210205, "eval_qnli-contrastive_runtime": 0.6423, "eval_qnli-contrastive_samples_per_second": 311.394, "eval_qnli-contrastive_steps_per_second": 10.899, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_scitail-pairs-qa_loss": 0.10769753158092499, "eval_scitail-pairs-qa_runtime": 1.6203, 
"eval_scitail-pairs-qa_samples_per_second": 123.431, "eval_scitail-pairs-qa_steps_per_second": 4.32, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_scitail-pairs-pos_loss": 0.6731968522071838, "eval_scitail-pairs-pos_runtime": 2.6601, "eval_scitail-pairs-pos_samples_per_second": 75.186, "eval_scitail-pairs-pos_steps_per_second": 2.631, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_xsum-pairs_loss": 0.7274036407470703, "eval_xsum-pairs_runtime": 2.6535, "eval_xsum-pairs_samples_per_second": 65.951, "eval_xsum-pairs_steps_per_second": 2.261, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_compression-pairs_loss": 0.24030046164989471, "eval_compression-pairs_runtime": 0.5226, "eval_compression-pairs_samples_per_second": 382.686, "eval_compression-pairs_steps_per_second": 13.394, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_sciq_pairs_loss": 0.43072450160980225, "eval_sciq_pairs_runtime": 9.2015, "eval_sciq_pairs_samples_per_second": 21.736, "eval_sciq_pairs_steps_per_second": 0.761, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_qasc_pairs_loss": 5.355893611907959, "eval_qasc_pairs_runtime": 2.7315, "eval_qasc_pairs_samples_per_second": 73.219, "eval_qasc_pairs_steps_per_second": 2.563, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_openbookqa_pairs_loss": 2.571211338043213, "eval_openbookqa_pairs_runtime": 0.659, "eval_openbookqa_pairs_samples_per_second": 104.704, "eval_openbookqa_pairs_steps_per_second": 4.552, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_msmarco_pairs_loss": 1.1058056354522705, "eval_msmarco_pairs_runtime": 4.0254, "eval_msmarco_pairs_samples_per_second": 49.684, "eval_msmarco_pairs_steps_per_second": 1.739, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_nq_pairs_loss": 1.2713885307312012, "eval_nq_pairs_runtime": 8.6454, "eval_nq_pairs_samples_per_second": 23.134, "eval_nq_pairs_steps_per_second": 0.81, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_trivia_pairs_loss": 1.5911108255386353, 
"eval_trivia_pairs_runtime": 12.8789, "eval_trivia_pairs_samples_per_second": 15.529, "eval_trivia_pairs_steps_per_second": 0.544, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_quora_pairs_loss": 0.21135039627552032, "eval_quora_pairs_runtime": 1.6142, "eval_quora_pairs_samples_per_second": 123.901, "eval_quora_pairs_steps_per_second": 4.337, "step": 7893 }, { "epoch": 1.3508471675509157, "eval_gooaq_pairs_loss": 0.8607009053230286, "eval_gooaq_pairs_runtime": 2.7341, "eval_gooaq_pairs_samples_per_second": 73.15, "eval_gooaq_pairs_steps_per_second": 2.56, "step": 7893 }, { "epoch": 1.3510183125106965, "grad_norm": 107.09418487548828, "learning_rate": 8.50509372888478e-06, "loss": 8.7501, "step": 7894 }, { "epoch": 1.3511894574704775, "grad_norm": 11.88301944732666, "learning_rate": 8.494709990840158e-06, "loss": 0.8261, "step": 7895 }, { "epoch": 1.3513606024302585, "grad_norm": 15.522592544555664, "learning_rate": 8.484330090926324e-06, "loss": 1.3149, "step": 7896 }, { "epoch": 1.3515317473900392, "grad_norm": 13.993993759155273, "learning_rate": 8.473954035267448e-06, "loss": 1.0682, "step": 7897 }, { "epoch": 1.3517028923498202, "grad_norm": 5.835932731628418, "learning_rate": 8.463581829985406e-06, "loss": 0.5868, "step": 7898 }, { "epoch": 1.3518740373096012, "grad_norm": 19.500629425048828, "learning_rate": 8.453213481199823e-06, "loss": 2.0912, "step": 7899 }, { "epoch": 1.3520451822693822, "grad_norm": 4.438239574432373, "learning_rate": 8.44284899502804e-06, "loss": 0.3397, "step": 7900 }, { "epoch": 1.3522163272291632, "grad_norm": 13.467206001281738, "learning_rate": 8.43248837758511e-06, "loss": 0.9853, "step": 7901 }, { "epoch": 1.3523874721889442, "grad_norm": 13.498573303222656, "learning_rate": 8.422131634983819e-06, "loss": 1.3403, "step": 7902 }, { "epoch": 1.352558617148725, "grad_norm": 197.8994140625, "learning_rate": 8.411778773334667e-06, "loss": 8.94, "step": 7903 }, { "epoch": 1.352729762108506, "grad_norm": 2.5428144931793213, 
"learning_rate": 8.401429798745847e-06, "loss": 0.2328, "step": 7904 }, { "epoch": 1.352900907068287, "grad_norm": 3.9354376792907715, "learning_rate": 8.391084717323278e-06, "loss": 0.2556, "step": 7905 }, { "epoch": 1.3530720520280677, "grad_norm": 17.734619140625, "learning_rate": 8.380743535170563e-06, "loss": 2.4227, "step": 7906 }, { "epoch": 1.3532431969878487, "grad_norm": 21.069412231445312, "learning_rate": 8.37040625838903e-06, "loss": 1.9247, "step": 7907 }, { "epoch": 1.3534143419476297, "grad_norm": 21.592254638671875, "learning_rate": 8.360072893077672e-06, "loss": 2.9089, "step": 7908 }, { "epoch": 1.3535854869074107, "grad_norm": 3.6049587726593018, "learning_rate": 8.349743445333196e-06, "loss": 0.3444, "step": 7909 }, { "epoch": 1.3537566318671916, "grad_norm": 16.893281936645508, "learning_rate": 8.339417921249998e-06, "loss": 1.6815, "step": 7910 }, { "epoch": 1.3539277768269724, "grad_norm": 0.4270938038825989, "learning_rate": 8.329096326920142e-06, "loss": 0.1226, "step": 7911 }, { "epoch": 1.3540989217867534, "grad_norm": 26.750118255615234, "learning_rate": 8.318778668433396e-06, "loss": 1.192, "step": 7912 }, { "epoch": 1.3542700667465344, "grad_norm": 11.184526443481445, "learning_rate": 8.308464951877181e-06, "loss": 1.0331, "step": 7913 }, { "epoch": 1.3544412117063152, "grad_norm": 25.97614860534668, "learning_rate": 8.298155183336617e-06, "loss": 5.0103, "step": 7914 }, { "epoch": 1.3546123566660961, "grad_norm": 16.017576217651367, "learning_rate": 8.287849368894476e-06, "loss": 1.6073, "step": 7915 }, { "epoch": 1.3547835016258771, "grad_norm": 20.626556396484375, "learning_rate": 8.277547514631201e-06, "loss": 2.6192, "step": 7916 }, { "epoch": 1.3549546465856581, "grad_norm": 0.4352658987045288, "learning_rate": 8.267249626624908e-06, "loss": 0.1248, "step": 7917 }, { "epoch": 1.355125791545439, "grad_norm": 12.039044380187988, "learning_rate": 8.256955710951354e-06, "loss": 0.9049, "step": 7918 }, { "epoch": 1.3552969365052199, 
"grad_norm": 7.01033353805542, "learning_rate": 8.246665773683985e-06, "loss": 0.8376, "step": 7919 }, { "epoch": 1.3554680814650009, "grad_norm": 15.398770332336426, "learning_rate": 8.236379820893868e-06, "loss": 1.4464, "step": 7920 }, { "epoch": 1.3556392264247819, "grad_norm": 24.395837783813477, "learning_rate": 8.226097858649725e-06, "loss": 1.5301, "step": 7921 }, { "epoch": 1.3558103713845626, "grad_norm": 3.7129740715026855, "learning_rate": 8.215819893017941e-06, "loss": 0.3776, "step": 7922 }, { "epoch": 1.3559815163443436, "grad_norm": 17.666425704956055, "learning_rate": 8.20554593006252e-06, "loss": 1.3419, "step": 7923 }, { "epoch": 1.3561526613041246, "grad_norm": 20.95212173461914, "learning_rate": 8.195275975845118e-06, "loss": 1.4918, "step": 7924 }, { "epoch": 1.3563238062639056, "grad_norm": 21.755355834960938, "learning_rate": 8.185010036425032e-06, "loss": 2.0847, "step": 7925 }, { "epoch": 1.3564949512236866, "grad_norm": 15.192948341369629, "learning_rate": 8.17474811785917e-06, "loss": 1.4842, "step": 7926 }, { "epoch": 1.3566660961834673, "grad_norm": 10.453164100646973, "learning_rate": 8.164490226202092e-06, "loss": 0.9547, "step": 7927 }, { "epoch": 1.3568372411432483, "grad_norm": 10.041457176208496, "learning_rate": 8.154236367505955e-06, "loss": 0.6116, "step": 7928 }, { "epoch": 1.3570083861030293, "grad_norm": 14.595532417297363, "learning_rate": 8.143986547820556e-06, "loss": 1.2874, "step": 7929 }, { "epoch": 1.35717953106281, "grad_norm": 9.68789291381836, "learning_rate": 8.133740773193313e-06, "loss": 0.6884, "step": 7930 }, { "epoch": 1.357350676022591, "grad_norm": 10.335504531860352, "learning_rate": 8.123499049669234e-06, "loss": 0.9279, "step": 7931 }, { "epoch": 1.357521820982372, "grad_norm": 15.070721626281738, "learning_rate": 8.113261383290964e-06, "loss": 1.2174, "step": 7932 }, { "epoch": 1.357692965942153, "grad_norm": 23.035741806030273, "learning_rate": 8.10302778009873e-06, "loss": 2.1438, "step": 7933 }, { 
"epoch": 1.357864110901934, "grad_norm": 15.676365852355957, "learning_rate": 8.092798246130377e-06, "loss": 1.3295, "step": 7934 }, { "epoch": 1.3580352558617148, "grad_norm": 14.068243980407715, "learning_rate": 8.082572787421357e-06, "loss": 0.8558, "step": 7935 }, { "epoch": 1.3582064008214958, "grad_norm": 2.1047537326812744, "learning_rate": 8.072351410004685e-06, "loss": 0.2433, "step": 7936 }, { "epoch": 1.3583775457812768, "grad_norm": 18.384124755859375, "learning_rate": 8.062134119911007e-06, "loss": 2.3037, "step": 7937 }, { "epoch": 1.3585486907410576, "grad_norm": 0.5251880288124084, "learning_rate": 8.051920923168527e-06, "loss": 0.1249, "step": 7938 }, { "epoch": 1.3587198357008385, "grad_norm": 5.348263263702393, "learning_rate": 8.041711825803055e-06, "loss": 0.3089, "step": 7939 }, { "epoch": 1.3588909806606195, "grad_norm": 12.722254753112793, "learning_rate": 8.03150683383797e-06, "loss": 0.9297, "step": 7940 }, { "epoch": 1.3590621256204005, "grad_norm": 16.143598556518555, "learning_rate": 8.02130595329423e-06, "loss": 0.8998, "step": 7941 }, { "epoch": 1.3592332705801815, "grad_norm": 12.335897445678711, "learning_rate": 8.011109190190374e-06, "loss": 0.9059, "step": 7942 }, { "epoch": 1.3594044155399623, "grad_norm": 0.3998869061470032, "learning_rate": 8.0009165505425e-06, "loss": 0.1258, "step": 7943 }, { "epoch": 1.3595755604997433, "grad_norm": 9.871674537658691, "learning_rate": 7.990728040364294e-06, "loss": 0.8648, "step": 7944 }, { "epoch": 1.3597467054595243, "grad_norm": 5.811002731323242, "learning_rate": 7.980543665666978e-06, "loss": 0.3648, "step": 7945 }, { "epoch": 1.359917850419305, "grad_norm": 3.4111969470977783, "learning_rate": 7.970363432459352e-06, "loss": 0.2808, "step": 7946 }, { "epoch": 1.360088995379086, "grad_norm": 12.346128463745117, "learning_rate": 7.96018734674778e-06, "loss": 0.9339, "step": 7947 }, { "epoch": 1.360260140338867, "grad_norm": 20.27092742919922, "learning_rate": 7.950015414536152e-06, 
"loss": 1.8533, "step": 7948 }, { "epoch": 1.360431285298648, "grad_norm": 2.390983819961548, "learning_rate": 7.939847641825934e-06, "loss": 0.2616, "step": 7949 }, { "epoch": 1.360602430258429, "grad_norm": 10.336169242858887, "learning_rate": 7.929684034616122e-06, "loss": 0.7228, "step": 7950 }, { "epoch": 1.3607735752182097, "grad_norm": 23.217824935913086, "learning_rate": 7.919524598903256e-06, "loss": 5.2922, "step": 7951 }, { "epoch": 1.3609447201779907, "grad_norm": 16.831451416015625, "learning_rate": 7.90936934068143e-06, "loss": 1.9508, "step": 7952 }, { "epoch": 1.3611158651377717, "grad_norm": 11.5362548828125, "learning_rate": 7.89921826594225e-06, "loss": 0.971, "step": 7953 }, { "epoch": 1.3612870100975527, "grad_norm": 121.41738891601562, "learning_rate": 7.889071380674873e-06, "loss": 8.451, "step": 7954 }, { "epoch": 1.3614581550573335, "grad_norm": 1.763120412826538, "learning_rate": 7.878928690865967e-06, "loss": 0.2545, "step": 7955 }, { "epoch": 1.3616293000171145, "grad_norm": 29.964916229248047, "learning_rate": 7.868790202499748e-06, "loss": 5.3622, "step": 7956 }, { "epoch": 1.3618004449768955, "grad_norm": 18.546653747558594, "learning_rate": 7.858655921557928e-06, "loss": 1.8526, "step": 7957 }, { "epoch": 1.3619715899366764, "grad_norm": 9.387438774108887, "learning_rate": 7.848525854019749e-06, "loss": 0.6086, "step": 7958 }, { "epoch": 1.3621427348964574, "grad_norm": 9.829107284545898, "learning_rate": 7.838400005861972e-06, "loss": 1.1134, "step": 7959 }, { "epoch": 1.3623138798562382, "grad_norm": 24.65566635131836, "learning_rate": 7.828278383058852e-06, "loss": 3.0146, "step": 7960 }, { "epoch": 1.3624850248160192, "grad_norm": 1.144333004951477, "learning_rate": 7.818160991582167e-06, "loss": 0.1983, "step": 7961 }, { "epoch": 1.3626561697758002, "grad_norm": 14.87730884552002, "learning_rate": 7.808047837401202e-06, "loss": 1.1744, "step": 7962 }, { "epoch": 1.362827314735581, "grad_norm": 18.721660614013672, 
"learning_rate": 7.79793892648272e-06, "loss": 1.5515, "step": 7963 }, { "epoch": 1.362998459695362, "grad_norm": 2.341946601867676, "learning_rate": 7.787834264791002e-06, "loss": 0.2613, "step": 7964 }, { "epoch": 1.363169604655143, "grad_norm": 10.254395484924316, "learning_rate": 7.777733858287805e-06, "loss": 0.7639, "step": 7965 }, { "epoch": 1.363340749614924, "grad_norm": 2.7340681552886963, "learning_rate": 7.767637712932395e-06, "loss": 0.2966, "step": 7966 }, { "epoch": 1.363511894574705, "grad_norm": 3.232351303100586, "learning_rate": 7.7575458346815e-06, "loss": 0.2851, "step": 7967 }, { "epoch": 1.3636830395344857, "grad_norm": 13.136812210083008, "learning_rate": 7.74745822948935e-06, "loss": 1.3821, "step": 7968 }, { "epoch": 1.3638541844942667, "grad_norm": 16.989933013916016, "learning_rate": 7.737374903307653e-06, "loss": 1.3374, "step": 7969 }, { "epoch": 1.3640253294540476, "grad_norm": 15.131267547607422, "learning_rate": 7.727295862085576e-06, "loss": 1.7686, "step": 7970 }, { "epoch": 1.3641964744138284, "grad_norm": 23.571996688842773, "learning_rate": 7.717221111769777e-06, "loss": 5.0176, "step": 7971 }, { "epoch": 1.3643676193736094, "grad_norm": 0.3861672580242157, "learning_rate": 7.707150658304364e-06, "loss": 0.1201, "step": 7972 }, { "epoch": 1.3645387643333904, "grad_norm": 11.791454315185547, "learning_rate": 7.697084507630925e-06, "loss": 0.9477, "step": 7973 }, { "epoch": 1.3647099092931714, "grad_norm": 25.322511672973633, "learning_rate": 7.68702266568851e-06, "loss": 5.0495, "step": 7974 }, { "epoch": 1.3648810542529524, "grad_norm": 15.774009704589844, "learning_rate": 7.67696513841361e-06, "loss": 1.4033, "step": 7975 }, { "epoch": 1.3650521992127331, "grad_norm": 13.291998863220215, "learning_rate": 7.66691193174019e-06, "loss": 1.2573, "step": 7976 }, { "epoch": 1.3652233441725141, "grad_norm": 14.601156234741211, "learning_rate": 7.656863051599646e-06, "loss": 1.1831, "step": 7977 }, { "epoch": 1.365394489132295, 
"grad_norm": 13.060968399047852, "learning_rate": 7.646818503920841e-06, "loss": 1.1125, "step": 7978 }, { "epoch": 1.3655656340920759, "grad_norm": 0.6287318468093872, "learning_rate": 7.636778294630076e-06, "loss": 0.1291, "step": 7979 }, { "epoch": 1.3657367790518569, "grad_norm": 2.176541805267334, "learning_rate": 7.6267424296510836e-06, "loss": 0.2442, "step": 7980 }, { "epoch": 1.3659079240116379, "grad_norm": 89.38330841064453, "learning_rate": 7.616710914905035e-06, "loss": 8.7055, "step": 7981 }, { "epoch": 1.3660790689714188, "grad_norm": 14.080567359924316, "learning_rate": 7.606683756310548e-06, "loss": 1.318, "step": 7982 }, { "epoch": 1.3662502139311998, "grad_norm": 15.819098472595215, "learning_rate": 7.596660959783651e-06, "loss": 1.3924, "step": 7983 }, { "epoch": 1.3664213588909806, "grad_norm": 14.727864265441895, "learning_rate": 7.586642531237823e-06, "loss": 1.0125, "step": 7984 }, { "epoch": 1.3665925038507616, "grad_norm": 19.342557907104492, "learning_rate": 7.576628476583937e-06, "loss": 1.4103, "step": 7985 }, { "epoch": 1.3667636488105426, "grad_norm": 5.45033597946167, "learning_rate": 7.56661880173031e-06, "loss": 0.5339, "step": 7986 }, { "epoch": 1.3669347937703233, "grad_norm": 6.75947904586792, "learning_rate": 7.556613512582664e-06, "loss": 0.4621, "step": 7987 }, { "epoch": 1.3671059387301043, "grad_norm": 10.857300758361816, "learning_rate": 7.54661261504412e-06, "loss": 0.9741, "step": 7988 }, { "epoch": 1.3672770836898853, "grad_norm": 17.877666473388672, "learning_rate": 7.536616115015246e-06, "loss": 1.5805, "step": 7989 }, { "epoch": 1.3674482286496663, "grad_norm": 14.81043529510498, "learning_rate": 7.526624018393975e-06, "loss": 1.2154, "step": 7990 }, { "epoch": 1.3676193736094473, "grad_norm": 6.165063858032227, "learning_rate": 7.5166363310756705e-06, "loss": 0.7289, "step": 7991 }, { "epoch": 1.367790518569228, "grad_norm": 18.871471405029297, "learning_rate": 7.506653058953077e-06, "loss": 1.4732, "step": 7992 }, 
{ "epoch": 1.367961663529009, "grad_norm": 16.091135025024414, "learning_rate": 7.496674207916326e-06, "loss": 1.8831, "step": 7993 }, { "epoch": 1.36813280848879, "grad_norm": 6.082664966583252, "learning_rate": 7.486699783852983e-06, "loss": 0.3378, "step": 7994 }, { "epoch": 1.3683039534485708, "grad_norm": 15.225743293762207, "learning_rate": 7.476729792647949e-06, "loss": 1.3947, "step": 7995 }, { "epoch": 1.3684750984083518, "grad_norm": 12.026875495910645, "learning_rate": 7.466764240183551e-06, "loss": 0.9757, "step": 7996 }, { "epoch": 1.3686462433681328, "grad_norm": 0.3766089081764221, "learning_rate": 7.456803132339472e-06, "loss": 0.1186, "step": 7997 }, { "epoch": 1.3688173883279138, "grad_norm": 15.264883041381836, "learning_rate": 7.446846474992774e-06, "loss": 1.2569, "step": 7998 }, { "epoch": 1.3689885332876948, "grad_norm": 10.888883590698242, "learning_rate": 7.4368942740179114e-06, "loss": 0.8297, "step": 7999 }, { "epoch": 1.3691596782474755, "grad_norm": 6.847597122192383, "learning_rate": 7.426946535286687e-06, "loss": 0.4545, "step": 8000 }, { "epoch": 1.3693308232072565, "grad_norm": 10.688901901245117, "learning_rate": 7.4170032646682915e-06, "loss": 0.9802, "step": 8001 }, { "epoch": 1.3695019681670375, "grad_norm": 16.83812141418457, "learning_rate": 7.407064468029259e-06, "loss": 1.4476, "step": 8002 }, { "epoch": 1.3696731131268185, "grad_norm": 18.952674865722656, "learning_rate": 7.3971301512335055e-06, "loss": 1.4857, "step": 8003 }, { "epoch": 1.3698442580865993, "grad_norm": 1.0890339612960815, "learning_rate": 7.387200320142282e-06, "loss": 0.209, "step": 8004 }, { "epoch": 1.3700154030463803, "grad_norm": 14.084479331970215, "learning_rate": 7.3772749806142056e-06, "loss": 1.0087, "step": 8005 }, { "epoch": 1.3701865480061612, "grad_norm": 10.847527503967285, "learning_rate": 7.367354138505252e-06, "loss": 0.7337, "step": 8006 }, { "epoch": 1.3703576929659422, "grad_norm": 13.33262825012207, "learning_rate": 
7.35743779966872e-06, "loss": 0.9783, "step": 8007 }, { "epoch": 1.3705288379257232, "grad_norm": 15.647591590881348, "learning_rate": 7.347525969955275e-06, "loss": 1.6981, "step": 8008 }, { "epoch": 1.370699982885504, "grad_norm": 17.50412940979004, "learning_rate": 7.337618655212906e-06, "loss": 1.4768, "step": 8009 }, { "epoch": 1.370871127845285, "grad_norm": 14.713836669921875, "learning_rate": 7.327715861286931e-06, "loss": 1.4001, "step": 8010 }, { "epoch": 1.371042272805066, "grad_norm": 18.337644577026367, "learning_rate": 7.317817594020038e-06, "loss": 1.6328, "step": 8011 }, { "epoch": 1.3712134177648467, "grad_norm": 12.138872146606445, "learning_rate": 7.307923859252206e-06, "loss": 1.1789, "step": 8012 }, { "epoch": 1.3713845627246277, "grad_norm": 15.563578605651855, "learning_rate": 7.298034662820748e-06, "loss": 1.4695, "step": 8013 }, { "epoch": 1.3715557076844087, "grad_norm": 17.70362091064453, "learning_rate": 7.288150010560317e-06, "loss": 1.4956, "step": 8014 }, { "epoch": 1.3717268526441897, "grad_norm": 4.129745006561279, "learning_rate": 7.278269908302854e-06, "loss": 0.3229, "step": 8015 }, { "epoch": 1.3718979976039707, "grad_norm": 0.6604208946228027, "learning_rate": 7.268394361877659e-06, "loss": 0.1311, "step": 8016 }, { "epoch": 1.3720691425637515, "grad_norm": 5.454418659210205, "learning_rate": 7.2585233771113065e-06, "loss": 0.3518, "step": 8017 }, { "epoch": 1.3722402875235324, "grad_norm": 17.043312072753906, "learning_rate": 7.248656959827685e-06, "loss": 1.6863, "step": 8018 }, { "epoch": 1.3724114324833134, "grad_norm": 21.178543090820312, "learning_rate": 7.23879511584801e-06, "loss": 2.4076, "step": 8019 }, { "epoch": 1.3725825774430942, "grad_norm": 15.764005661010742, "learning_rate": 7.22893785099077e-06, "loss": 1.4081, "step": 8020 }, { "epoch": 1.3727537224028752, "grad_norm": 22.064271926879883, "learning_rate": 7.219085171071771e-06, "loss": 0.9252, "step": 8021 }, { "epoch": 1.3729248673626562, "grad_norm": 
15.479656219482422, "learning_rate": 7.209237081904119e-06, "loss": 1.0212, "step": 8022 }, { "epoch": 1.3730960123224372, "grad_norm": 25.82483673095703, "learning_rate": 7.199393589298185e-06, "loss": 5.4379, "step": 8023 }, { "epoch": 1.3732671572822182, "grad_norm": 59.71751403808594, "learning_rate": 7.189554699061658e-06, "loss": 7.4834, "step": 8024 }, { "epoch": 1.373438302241999, "grad_norm": 15.977654457092285, "learning_rate": 7.179720416999488e-06, "loss": 1.209, "step": 8025 }, { "epoch": 1.37360944720178, "grad_norm": 16.143056869506836, "learning_rate": 7.169890748913924e-06, "loss": 1.948, "step": 8026 }, { "epoch": 1.373780592161561, "grad_norm": 27.495500564575195, "learning_rate": 7.160065700604475e-06, "loss": 5.4623, "step": 8027 }, { "epoch": 1.3739517371213417, "grad_norm": 6.934070110321045, "learning_rate": 7.150245277867945e-06, "loss": 0.5984, "step": 8028 }, { "epoch": 1.3741228820811227, "grad_norm": 22.05686378479004, "learning_rate": 7.140429486498394e-06, "loss": 5.0537, "step": 8029 }, { "epoch": 1.3742940270409036, "grad_norm": 19.047924041748047, "learning_rate": 7.130618332287147e-06, "loss": 1.7073, "step": 8030 }, { "epoch": 1.3744651720006846, "grad_norm": 9.164684295654297, "learning_rate": 7.120811821022805e-06, "loss": 1.0877, "step": 8031 }, { "epoch": 1.3746363169604656, "grad_norm": 19.64408302307129, "learning_rate": 7.11100995849123e-06, "loss": 2.3523, "step": 8032 }, { "epoch": 1.3748074619202464, "grad_norm": 24.614337921142578, "learning_rate": 7.101212750475524e-06, "loss": 5.1241, "step": 8033 }, { "epoch": 1.3749786068800274, "grad_norm": 14.775176048278809, "learning_rate": 7.091420202756066e-06, "loss": 1.2222, "step": 8034 }, { "epoch": 1.3751497518398084, "grad_norm": 14.342777252197266, "learning_rate": 7.0816323211104615e-06, "loss": 1.2048, "step": 8035 }, { "epoch": 1.3753208967995891, "grad_norm": 0.43484142422676086, "learning_rate": 7.0718491113135815e-06, "loss": 0.1246, "step": 8036 }, { "epoch": 
1.3754920417593701, "grad_norm": 9.955939292907715, "learning_rate": 7.062070579137541e-06, "loss": 1.0372, "step": 8037 }, { "epoch": 1.375663186719151, "grad_norm": 18.3414306640625, "learning_rate": 7.052296730351676e-06, "loss": 1.8833, "step": 8038 }, { "epoch": 1.375834331678932, "grad_norm": 13.475018501281738, "learning_rate": 7.042527570722584e-06, "loss": 1.1563, "step": 8039 }, { "epoch": 1.376005476638713, "grad_norm": 29.00771713256836, "learning_rate": 7.0327631060140705e-06, "loss": 5.4663, "step": 8040 }, { "epoch": 1.3761766215984939, "grad_norm": 15.355203628540039, "learning_rate": 7.023003341987198e-06, "loss": 1.4888, "step": 8041 }, { "epoch": 1.3763477665582748, "grad_norm": 19.718971252441406, "learning_rate": 7.01324828440023e-06, "loss": 1.6287, "step": 8042 }, { "epoch": 1.3765189115180558, "grad_norm": 12.399322509765625, "learning_rate": 7.0034979390086755e-06, "loss": 1.0184, "step": 8043 }, { "epoch": 1.3766900564778366, "grad_norm": 79.78743743896484, "learning_rate": 6.9937523115652464e-06, "loss": 7.7877, "step": 8044 }, { "epoch": 1.3768612014376176, "grad_norm": 8.104473114013672, "learning_rate": 6.9840114078198745e-06, "loss": 0.9961, "step": 8045 }, { "epoch": 1.3770323463973986, "grad_norm": 1.3494822978973389, "learning_rate": 6.974275233519717e-06, "loss": 0.2309, "step": 8046 }, { "epoch": 1.3772034913571796, "grad_norm": 21.887319564819336, "learning_rate": 6.964543794409114e-06, "loss": 2.6566, "step": 8047 }, { "epoch": 1.3773746363169606, "grad_norm": 20.275388717651367, "learning_rate": 6.954817096229651e-06, "loss": 1.7164, "step": 8048 }, { "epoch": 1.3775457812767413, "grad_norm": 0.4384884834289551, "learning_rate": 6.9450951447200855e-06, "loss": 0.1223, "step": 8049 }, { "epoch": 1.3777169262365223, "grad_norm": 19.98276138305664, "learning_rate": 6.935377945616375e-06, "loss": 1.4869, "step": 8050 }, { "epoch": 1.3778880711963033, "grad_norm": 16.372770309448242, "learning_rate": 6.925665504651695e-06, "loss": 
1.4808, "step": 8051 }, { "epoch": 1.378059216156084, "grad_norm": 18.803668975830078, "learning_rate": 6.915957827556389e-06, "loss": 2.2224, "step": 8052 }, { "epoch": 1.378230361115865, "grad_norm": 7.975682735443115, "learning_rate": 6.906254920058005e-06, "loss": 0.8613, "step": 8053 }, { "epoch": 1.378401506075646, "grad_norm": 18.069852828979492, "learning_rate": 6.896556787881279e-06, "loss": 1.431, "step": 8054 }, { "epoch": 1.378572651035427, "grad_norm": 3.733964443206787, "learning_rate": 6.8868634367481105e-06, "loss": 0.2958, "step": 8055 }, { "epoch": 1.378743795995208, "grad_norm": 4.760802268981934, "learning_rate": 6.877174872377608e-06, "loss": 0.3132, "step": 8056 }, { "epoch": 1.378914940954989, "grad_norm": 19.099594116210938, "learning_rate": 6.867491100486021e-06, "loss": 1.4491, "step": 8057 }, { "epoch": 1.3790860859147698, "grad_norm": 3.7092506885528564, "learning_rate": 6.857812126786798e-06, "loss": 0.4602, "step": 8058 }, { "epoch": 1.3792572308745508, "grad_norm": 15.550238609313965, "learning_rate": 6.848137956990553e-06, "loss": 1.0471, "step": 8059 }, { "epoch": 1.3794283758343318, "grad_norm": 16.970060348510742, "learning_rate": 6.8384685968050504e-06, "loss": 2.0567, "step": 8060 }, { "epoch": 1.3795995207941125, "grad_norm": 57.70185852050781, "learning_rate": 6.828804051935237e-06, "loss": 8.2924, "step": 8061 }, { "epoch": 1.3797706657538935, "grad_norm": 17.050525665283203, "learning_rate": 6.8191443280831985e-06, "loss": 1.5775, "step": 8062 }, { "epoch": 1.3799418107136745, "grad_norm": 0.41770151257514954, "learning_rate": 6.809489430948192e-06, "loss": 0.1234, "step": 8063 }, { "epoch": 1.3801129556734555, "grad_norm": 3.295809268951416, "learning_rate": 6.799839366226626e-06, "loss": 0.2752, "step": 8064 }, { "epoch": 1.3802841006332365, "grad_norm": 6.964579105377197, "learning_rate": 6.790194139612041e-06, "loss": 0.3692, "step": 8065 }, { "epoch": 1.3804552455930172, "grad_norm": 4.134066104888916, "learning_rate": 
6.780553756795148e-06, "loss": 0.3073, "step": 8066 }, { "epoch": 1.3806263905527982, "grad_norm": 12.902695655822754, "learning_rate": 6.770918223463776e-06, "loss": 1.4663, "step": 8067 }, { "epoch": 1.3807975355125792, "grad_norm": 20.563579559326172, "learning_rate": 6.761287545302915e-06, "loss": 2.075, "step": 8068 }, { "epoch": 1.38096868047236, "grad_norm": 17.05385398864746, "learning_rate": 6.751661727994672e-06, "loss": 2.182, "step": 8069 }, { "epoch": 1.381139825432141, "grad_norm": 15.212250709533691, "learning_rate": 6.7420407772182906e-06, "loss": 0.9807, "step": 8070 }, { "epoch": 1.381310970391922, "grad_norm": 17.51760482788086, "learning_rate": 6.732424698650156e-06, "loss": 2.3006, "step": 8071 }, { "epoch": 1.381482115351703, "grad_norm": 12.286190032958984, "learning_rate": 6.722813497963758e-06, "loss": 0.8639, "step": 8072 }, { "epoch": 1.381653260311484, "grad_norm": 6.781225681304932, "learning_rate": 6.713207180829729e-06, "loss": 0.5811, "step": 8073 }, { "epoch": 1.3818244052712647, "grad_norm": 8.380965232849121, "learning_rate": 6.703605752915802e-06, "loss": 0.9242, "step": 8074 }, { "epoch": 1.3819955502310457, "grad_norm": 18.39959716796875, "learning_rate": 6.694009219886838e-06, "loss": 2.1914, "step": 8075 }, { "epoch": 1.3821666951908267, "grad_norm": 17.51675796508789, "learning_rate": 6.68441758740481e-06, "loss": 1.4097, "step": 8076 }, { "epoch": 1.3823378401506075, "grad_norm": 5.3710198402404785, "learning_rate": 6.6748308611287855e-06, "loss": 0.3662, "step": 8077 }, { "epoch": 1.3825089851103884, "grad_norm": 10.967370986938477, "learning_rate": 6.66524904671496e-06, "loss": 0.7261, "step": 8078 }, { "epoch": 1.3826801300701694, "grad_norm": 15.520840644836426, "learning_rate": 6.655672149816605e-06, "loss": 1.0765, "step": 8079 }, { "epoch": 1.3828512750299504, "grad_norm": 62.75824737548828, "learning_rate": 6.646100176084111e-06, "loss": 7.4534, "step": 8080 }, { "epoch": 1.3830224199897314, "grad_norm": 
5.519872188568115, "learning_rate": 6.6365331311649604e-06, "loss": 0.4165, "step": 8081 }, { "epoch": 1.3831935649495122, "grad_norm": 5.536046981811523, "learning_rate": 6.626971020703714e-06, "loss": 0.3804, "step": 8082 }, { "epoch": 1.3833647099092932, "grad_norm": 15.942901611328125, "learning_rate": 6.617413850342042e-06, "loss": 1.2424, "step": 8083 }, { "epoch": 1.3835358548690742, "grad_norm": 14.011425971984863, "learning_rate": 6.607861625718684e-06, "loss": 1.0696, "step": 8084 }, { "epoch": 1.383706999828855, "grad_norm": 3.4530327320098877, "learning_rate": 6.598314352469461e-06, "loss": 0.2756, "step": 8085 }, { "epoch": 1.383878144788636, "grad_norm": 9.03433895111084, "learning_rate": 6.58877203622729e-06, "loss": 0.6991, "step": 8086 }, { "epoch": 1.384049289748417, "grad_norm": 7.679855823516846, "learning_rate": 6.579234682622139e-06, "loss": 0.8322, "step": 8087 }, { "epoch": 1.3842204347081979, "grad_norm": 0.48729658126831055, "learning_rate": 6.5697022972810745e-06, "loss": 0.1323, "step": 8088 }, { "epoch": 1.3843915796679789, "grad_norm": 8.509507179260254, "learning_rate": 6.5601748858282065e-06, "loss": 0.5483, "step": 8089 }, { "epoch": 1.3845627246277596, "grad_norm": 10.94564151763916, "learning_rate": 6.550652453884724e-06, "loss": 0.8701, "step": 8090 }, { "epoch": 1.3847338695875406, "grad_norm": 27.77153968811035, "learning_rate": 6.541135007068887e-06, "loss": 5.4978, "step": 8091 }, { "epoch": 1.3849050145473216, "grad_norm": 26.718948364257812, "learning_rate": 6.531622550995986e-06, "loss": 1.0582, "step": 8092 }, { "epoch": 1.3850761595071024, "grad_norm": 5.982920169830322, "learning_rate": 6.522115091278402e-06, "loss": 0.6942, "step": 8093 }, { "epoch": 1.3852473044668834, "grad_norm": 10.091400146484375, "learning_rate": 6.512612633525535e-06, "loss": 0.7003, "step": 8094 }, { "epoch": 1.3854184494266644, "grad_norm": 19.210908889770508, "learning_rate": 6.503115183343857e-06, "loss": 1.8411, "step": 8095 }, { "epoch": 
1.3855895943864454, "grad_norm": 0.4305846691131592, "learning_rate": 6.4936227463368795e-06, "loss": 0.1227, "step": 8096 }, { "epoch": 1.3857607393462263, "grad_norm": 0.4560820460319519, "learning_rate": 6.484135328105148e-06, "loss": 0.1265, "step": 8097 }, { "epoch": 1.3859318843060071, "grad_norm": 0.42901623249053955, "learning_rate": 6.474652934246262e-06, "loss": 0.1273, "step": 8098 }, { "epoch": 1.386103029265788, "grad_norm": 10.552484512329102, "learning_rate": 6.465175570354837e-06, "loss": 0.8805, "step": 8099 }, { "epoch": 1.386274174225569, "grad_norm": 9.38833236694336, "learning_rate": 6.455703242022543e-06, "loss": 0.7382, "step": 8100 }, { "epoch": 1.3864453191853499, "grad_norm": 5.672971725463867, "learning_rate": 6.446235954838058e-06, "loss": 0.3442, "step": 8101 }, { "epoch": 1.3866164641451308, "grad_norm": 10.641942024230957, "learning_rate": 6.4367737143871e-06, "loss": 0.9234, "step": 8102 }, { "epoch": 1.3867876091049118, "grad_norm": 9.72297191619873, "learning_rate": 6.42731652625241e-06, "loss": 0.9145, "step": 8103 }, { "epoch": 1.3869587540646928, "grad_norm": 2.3832085132598877, "learning_rate": 6.417864396013735e-06, "loss": 0.2521, "step": 8104 }, { "epoch": 1.3871298990244738, "grad_norm": 16.631643295288086, "learning_rate": 6.408417329247851e-06, "loss": 1.4814, "step": 8105 }, { "epoch": 1.3873010439842548, "grad_norm": 15.015097618103027, "learning_rate": 6.398975331528536e-06, "loss": 1.2678, "step": 8106 }, { "epoch": 1.3874721889440356, "grad_norm": 16.07328224182129, "learning_rate": 6.389538408426587e-06, "loss": 1.0467, "step": 8107 }, { "epoch": 1.3876433339038166, "grad_norm": 0.4330154061317444, "learning_rate": 6.380106565509806e-06, "loss": 0.1198, "step": 8108 }, { "epoch": 1.3878144788635975, "grad_norm": 7.085061550140381, "learning_rate": 6.370679808342991e-06, "loss": 0.8164, "step": 8109 }, { "epoch": 1.3879856238233783, "grad_norm": 18.764047622680664, "learning_rate": 6.361258142487936e-06, "loss": 
2.5517, "step": 8110 }, { "epoch": 1.3881567687831593, "grad_norm": 13.04540729522705, "learning_rate": 6.35184157350345e-06, "loss": 1.1178, "step": 8111 }, { "epoch": 1.3883279137429403, "grad_norm": 7.839341163635254, "learning_rate": 6.342430106945312e-06, "loss": 0.9889, "step": 8112 }, { "epoch": 1.3884990587027213, "grad_norm": 0.3881106972694397, "learning_rate": 6.333023748366311e-06, "loss": 0.1214, "step": 8113 }, { "epoch": 1.3886702036625023, "grad_norm": 6.755177974700928, "learning_rate": 6.323622503316201e-06, "loss": 0.352, "step": 8114 }, { "epoch": 1.388841348622283, "grad_norm": 14.873271942138672, "learning_rate": 6.314226377341743e-06, "loss": 1.1683, "step": 8115 }, { "epoch": 1.389012493582064, "grad_norm": 15.769723892211914, "learning_rate": 6.304835375986661e-06, "loss": 1.0529, "step": 8116 }, { "epoch": 1.389183638541845, "grad_norm": 94.52820587158203, "learning_rate": 6.2954495047916445e-06, "loss": 9.2915, "step": 8117 }, { "epoch": 1.3893547835016258, "grad_norm": 19.46604347229004, "learning_rate": 6.286068769294398e-06, "loss": 2.3134, "step": 8118 }, { "epoch": 1.3895259284614068, "grad_norm": 27.398771286010742, "learning_rate": 6.276693175029553e-06, "loss": 5.3409, "step": 8119 }, { "epoch": 1.3896970734211878, "grad_norm": 16.219106674194336, "learning_rate": 6.267322727528731e-06, "loss": 1.6904, "step": 8120 }, { "epoch": 1.3898682183809687, "grad_norm": 12.038756370544434, "learning_rate": 6.257957432320506e-06, "loss": 0.8749, "step": 8121 }, { "epoch": 1.3900393633407497, "grad_norm": 12.382250785827637, "learning_rate": 6.248597294930407e-06, "loss": 1.0359, "step": 8122 }, { "epoch": 1.3902105083005305, "grad_norm": 13.015780448913574, "learning_rate": 6.23924232088095e-06, "loss": 1.0681, "step": 8123 }, { "epoch": 1.3903816532603115, "grad_norm": 0.49243029952049255, "learning_rate": 6.229892515691566e-06, "loss": 0.1276, "step": 8124 }, { "epoch": 1.3905527982200925, "grad_norm": 20.274410247802734, "learning_rate": 
6.220547884878667e-06, "loss": 2.0958, "step": 8125 }, { "epoch": 1.3907239431798732, "grad_norm": 7.040147304534912, "learning_rate": 6.211208433955592e-06, "loss": 0.4615, "step": 8126 }, { "epoch": 1.3908950881396542, "grad_norm": 8.364036560058594, "learning_rate": 6.201874168432627e-06, "loss": 0.6492, "step": 8127 }, { "epoch": 1.3910662330994352, "grad_norm": 17.132287979125977, "learning_rate": 6.192545093817011e-06, "loss": 1.2063, "step": 8128 }, { "epoch": 1.3912373780592162, "grad_norm": 16.411943435668945, "learning_rate": 6.1832212156129045e-06, "loss": 1.2658, "step": 8129 }, { "epoch": 1.3914085230189972, "grad_norm": 5.050886631011963, "learning_rate": 6.173902539321417e-06, "loss": 0.314, "step": 8130 }, { "epoch": 1.391579667978778, "grad_norm": 14.13430404663086, "learning_rate": 6.164589070440572e-06, "loss": 0.8482, "step": 8131 }, { "epoch": 1.391750812938559, "grad_norm": 22.28693962097168, "learning_rate": 6.155280814465341e-06, "loss": 1.8001, "step": 8132 }, { "epoch": 1.39192195789834, "grad_norm": 16.029760360717773, "learning_rate": 6.145977776887599e-06, "loss": 1.2511, "step": 8133 }, { "epoch": 1.3920931028581207, "grad_norm": 12.385472297668457, "learning_rate": 6.136679963196155e-06, "loss": 0.9778, "step": 8134 }, { "epoch": 1.3922642478179017, "grad_norm": 17.029630661010742, "learning_rate": 6.127387378876741e-06, "loss": 1.3992, "step": 8135 }, { "epoch": 1.3924353927776827, "grad_norm": 122.83370208740234, "learning_rate": 6.118100029411982e-06, "loss": 10.1844, "step": 8136 }, { "epoch": 1.3926065377374637, "grad_norm": 25.112655639648438, "learning_rate": 6.108817920281441e-06, "loss": 3.64, "step": 8137 }, { "epoch": 1.3927776826972447, "grad_norm": 14.767163276672363, "learning_rate": 6.099541056961565e-06, "loss": 1.1958, "step": 8138 }, { "epoch": 1.3929488276570254, "grad_norm": 11.507041931152344, "learning_rate": 6.090269444925722e-06, "loss": 1.0309, "step": 8139 }, { "epoch": 1.3931199726168064, "grad_norm": 
14.378401756286621, "learning_rate": 6.081003089644182e-06, "loss": 1.3041, "step": 8140 }, { "epoch": 1.3932911175765874, "grad_norm": 9.112606048583984, "learning_rate": 6.071741996584104e-06, "loss": 0.9412, "step": 8141 }, { "epoch": 1.3934622625363682, "grad_norm": 2.832979440689087, "learning_rate": 6.062486171209541e-06, "loss": 0.2715, "step": 8142 }, { "epoch": 1.3936334074961492, "grad_norm": 12.30532455444336, "learning_rate": 6.053235618981454e-06, "loss": 1.1225, "step": 8143 }, { "epoch": 1.3938045524559302, "grad_norm": 13.39280891418457, "learning_rate": 6.0439903453576665e-06, "loss": 1.1622, "step": 8144 }, { "epoch": 1.3939756974157111, "grad_norm": 18.136594772338867, "learning_rate": 6.034750355792927e-06, "loss": 1.5756, "step": 8145 }, { "epoch": 1.3941468423754921, "grad_norm": 14.948494911193848, "learning_rate": 6.0255156557388295e-06, "loss": 1.3044, "step": 8146 }, { "epoch": 1.394317987335273, "grad_norm": 14.837949752807617, "learning_rate": 6.016286250643859e-06, "loss": 1.2203, "step": 8147 }, { "epoch": 1.394489132295054, "grad_norm": 0.6035500168800354, "learning_rate": 6.0070621459533846e-06, "loss": 0.1311, "step": 8148 }, { "epoch": 1.3946602772548349, "grad_norm": 12.36839771270752, "learning_rate": 5.997843347109634e-06, "loss": 0.9687, "step": 8149 }, { "epoch": 1.3948314222146156, "grad_norm": 21.35314178466797, "learning_rate": 5.988629859551719e-06, "loss": 1.5717, "step": 8150 }, { "epoch": 1.3950025671743966, "grad_norm": 15.543251991271973, "learning_rate": 5.979421688715612e-06, "loss": 1.3761, "step": 8151 }, { "epoch": 1.3951737121341776, "grad_norm": 13.013747215270996, "learning_rate": 5.9702188400341394e-06, "loss": 1.0351, "step": 8152 }, { "epoch": 1.3953448570939586, "grad_norm": 14.551894187927246, "learning_rate": 5.9610213189370054e-06, "loss": 1.0881, "step": 8153 }, { "epoch": 1.3955160020537396, "grad_norm": 19.03411102294922, "learning_rate": 5.951829130850753e-06, "loss": 1.3978, "step": 8154 }, { 
"epoch": 1.3956871470135206, "grad_norm": 18.278907775878906, "learning_rate": 5.9426422811987944e-06, "loss": 2.1687, "step": 8155 }, { "epoch": 1.3958582919733014, "grad_norm": 0.7327485680580139, "learning_rate": 5.933460775401376e-06, "loss": 0.1343, "step": 8156 }, { "epoch": 1.3960294369330823, "grad_norm": 8.675280570983887, "learning_rate": 5.9242846188756085e-06, "loss": 0.6823, "step": 8157 }, { "epoch": 1.3962005818928633, "grad_norm": 14.639605522155762, "learning_rate": 5.915113817035433e-06, "loss": 1.0151, "step": 8158 }, { "epoch": 1.396371726852644, "grad_norm": 5.388650894165039, "learning_rate": 5.905948375291635e-06, "loss": 0.3492, "step": 8159 }, { "epoch": 1.396542871812425, "grad_norm": 13.80362606048584, "learning_rate": 5.896788299051837e-06, "loss": 1.0125, "step": 8160 }, { "epoch": 1.396714016772206, "grad_norm": 14.141159057617188, "learning_rate": 5.887633593720509e-06, "loss": 1.1325, "step": 8161 }, { "epoch": 1.396885161731987, "grad_norm": 2.77217698097229, "learning_rate": 5.878484264698927e-06, "loss": 0.2681, "step": 8162 }, { "epoch": 1.397056306691768, "grad_norm": 13.108981132507324, "learning_rate": 5.869340317385221e-06, "loss": 1.1958, "step": 8163 }, { "epoch": 1.3972274516515488, "grad_norm": 17.22273826599121, "learning_rate": 5.860201757174322e-06, "loss": 1.1516, "step": 8164 }, { "epoch": 1.3973985966113298, "grad_norm": 15.933062553405762, "learning_rate": 5.851068589458e-06, "loss": 1.5206, "step": 8165 }, { "epoch": 1.3975697415711108, "grad_norm": 11.56320571899414, "learning_rate": 5.841940819624841e-06, "loss": 1.075, "step": 8166 }, { "epoch": 1.3977408865308916, "grad_norm": 14.757344245910645, "learning_rate": 5.832818453060236e-06, "loss": 1.2473, "step": 8167 }, { "epoch": 1.3979120314906726, "grad_norm": 14.71597671508789, "learning_rate": 5.823701495146401e-06, "loss": 1.3768, "step": 8168 }, { "epoch": 1.3980831764504535, "grad_norm": 14.82431411743164, "learning_rate": 5.814589951262346e-06, "loss": 
0.9769, "step": 8169 }, { "epoch": 1.3982543214102345, "grad_norm": 24.57257652282715, "learning_rate": 5.805483826783909e-06, "loss": 2.1528, "step": 8170 }, { "epoch": 1.3984254663700155, "grad_norm": 19.09847640991211, "learning_rate": 5.796383127083702e-06, "loss": 1.5526, "step": 8171 }, { "epoch": 1.3985966113297963, "grad_norm": 0.5062642693519592, "learning_rate": 5.787287857531164e-06, "loss": 0.1277, "step": 8172 }, { "epoch": 1.3987677562895773, "grad_norm": 14.45943832397461, "learning_rate": 5.778198023492512e-06, "loss": 1.0815, "step": 8173 }, { "epoch": 1.3989389012493583, "grad_norm": 9.705041885375977, "learning_rate": 5.769113630330755e-06, "loss": 1.0699, "step": 8174 }, { "epoch": 1.399110046209139, "grad_norm": 13.191262245178223, "learning_rate": 5.760034683405712e-06, "loss": 0.9696, "step": 8175 }, { "epoch": 1.39928119116892, "grad_norm": 9.288146018981934, "learning_rate": 5.750961188073959e-06, "loss": 0.9779, "step": 8176 }, { "epoch": 1.399452336128701, "grad_norm": 16.641170501708984, "learning_rate": 5.74189314968889e-06, "loss": 1.7428, "step": 8177 }, { "epoch": 1.399623481088482, "grad_norm": 11.01945686340332, "learning_rate": 5.732830573600652e-06, "loss": 1.127, "step": 8178 }, { "epoch": 1.399794626048263, "grad_norm": 17.64215660095215, "learning_rate": 5.723773465156174e-06, "loss": 1.4615, "step": 8179 }, { "epoch": 1.3999657710080438, "grad_norm": 19.89571189880371, "learning_rate": 5.714721829699173e-06, "loss": 1.4644, "step": 8180 }, { "epoch": 1.4001369159678247, "grad_norm": 24.79058265686035, "learning_rate": 5.705675672570117e-06, "loss": 5.4109, "step": 8181 }, { "epoch": 1.4003080609276057, "grad_norm": 19.724206924438477, "learning_rate": 5.696634999106258e-06, "loss": 1.9591, "step": 8182 }, { "epoch": 1.4004792058873865, "grad_norm": 16.37856101989746, "learning_rate": 5.687599814641612e-06, "loss": 1.5471, "step": 8183 }, { "epoch": 1.4006503508471675, "grad_norm": 9.3511381149292, "learning_rate": 
5.6785701245069405e-06, "loss": 1.1499, "step": 8184 }, { "epoch": 1.4008214958069485, "grad_norm": 3.557250499725342, "learning_rate": 5.669545934029785e-06, "loss": 0.265, "step": 8185 }, { "epoch": 1.4009926407667295, "grad_norm": 16.56192398071289, "learning_rate": 5.66052724853442e-06, "loss": 1.3035, "step": 8186 }, { "epoch": 1.4011637857265105, "grad_norm": 16.86762237548828, "learning_rate": 5.651514073341889e-06, "loss": 1.2929, "step": 8187 }, { "epoch": 1.4013349306862912, "grad_norm": 5.57222843170166, "learning_rate": 5.642506413769985e-06, "loss": 0.3981, "step": 8188 }, { "epoch": 1.4015060756460722, "grad_norm": 12.076986312866211, "learning_rate": 5.633504275133228e-06, "loss": 0.855, "step": 8189 }, { "epoch": 1.4016772206058532, "grad_norm": 9.291732788085938, "learning_rate": 5.624507662742907e-06, "loss": 0.9672, "step": 8190 }, { "epoch": 1.401848365565634, "grad_norm": 18.100425720214844, "learning_rate": 5.615516581907022e-06, "loss": 2.1209, "step": 8191 }, { "epoch": 1.402019510525415, "grad_norm": 20.10270118713379, "learning_rate": 5.606531037930333e-06, "loss": 2.322, "step": 8192 }, { "epoch": 1.402190655485196, "grad_norm": 12.470746040344238, "learning_rate": 5.597551036114328e-06, "loss": 0.8872, "step": 8193 }, { "epoch": 1.402361800444977, "grad_norm": 2.3300442695617676, "learning_rate": 5.58857658175721e-06, "loss": 0.2696, "step": 8194 }, { "epoch": 1.402532945404758, "grad_norm": 0.4978567361831665, "learning_rate": 5.579607680153932e-06, "loss": 0.1298, "step": 8195 }, { "epoch": 1.4027040903645387, "grad_norm": 17.7264461517334, "learning_rate": 5.57064433659615e-06, "loss": 2.0535, "step": 8196 }, { "epoch": 1.4028752353243197, "grad_norm": 18.33466911315918, "learning_rate": 5.561686556372258e-06, "loss": 1.4185, "step": 8197 }, { "epoch": 1.4030463802841007, "grad_norm": 12.567920684814453, "learning_rate": 5.552734344767356e-06, "loss": 0.9471, "step": 8198 }, { "epoch": 1.4032175252438814, "grad_norm": 
17.757282257080078, "learning_rate": 5.543787707063256e-06, "loss": 1.9904, "step": 8199 }, { "epoch": 1.4033886702036624, "grad_norm": 15.925894737243652, "learning_rate": 5.534846648538499e-06, "loss": 1.0907, "step": 8200 }, { "epoch": 1.4035598151634434, "grad_norm": 0.56212317943573, "learning_rate": 5.525911174468313e-06, "loss": 0.1412, "step": 8201 }, { "epoch": 1.4037309601232244, "grad_norm": 14.887899398803711, "learning_rate": 5.5169812901246515e-06, "loss": 0.9752, "step": 8202 }, { "epoch": 1.4039021050830054, "grad_norm": 6.524936199188232, "learning_rate": 5.508057000776145e-06, "loss": 0.486, "step": 8203 }, { "epoch": 1.4040732500427862, "grad_norm": 6.022520542144775, "learning_rate": 5.499138311688148e-06, "loss": 0.5675, "step": 8204 }, { "epoch": 1.4042443950025671, "grad_norm": 21.253719329833984, "learning_rate": 5.490225228122704e-06, "loss": 1.5702, "step": 8205 }, { "epoch": 1.4044155399623481, "grad_norm": 5.472929954528809, "learning_rate": 5.481317755338534e-06, "loss": 0.5731, "step": 8206 }, { "epoch": 1.4045866849221291, "grad_norm": 12.250944137573242, "learning_rate": 5.472415898591072e-06, "loss": 0.9764, "step": 8207 }, { "epoch": 1.40475782988191, "grad_norm": 17.810029983520508, "learning_rate": 5.463519663132413e-06, "loss": 1.3339, "step": 8208 }, { "epoch": 1.4049289748416909, "grad_norm": 14.014885902404785, "learning_rate": 5.45462905421136e-06, "loss": 1.3219, "step": 8209 }, { "epoch": 1.4051001198014719, "grad_norm": 13.23001480102539, "learning_rate": 5.445744077073386e-06, "loss": 1.1867, "step": 8210 }, { "epoch": 1.4052712647612529, "grad_norm": 19.602142333984375, "learning_rate": 5.4368647369606315e-06, "loss": 2.0208, "step": 8211 }, { "epoch": 1.4054424097210338, "grad_norm": 15.634562492370605, "learning_rate": 5.4279910391119335e-06, "loss": 1.2285, "step": 8212 }, { "epoch": 1.4056135546808146, "grad_norm": 5.58061408996582, "learning_rate": 5.419122988762777e-06, "loss": 0.559, "step": 8213 }, { "epoch": 
1.4057846996405956, "grad_norm": 21.571144104003906, "learning_rate": 5.410260591145324e-06, "loss": 2.1966, "step": 8214 }, { "epoch": 1.4059558446003766, "grad_norm": 28.33049201965332, "learning_rate": 5.40140385148841e-06, "loss": 5.4506, "step": 8215 }, { "epoch": 1.4061269895601574, "grad_norm": 23.992944717407227, "learning_rate": 5.392552775017515e-06, "loss": 5.2613, "step": 8216 }, { "epoch": 1.4062981345199383, "grad_norm": 9.046107292175293, "learning_rate": 5.383707366954799e-06, "loss": 0.758, "step": 8217 }, { "epoch": 1.4064692794797193, "grad_norm": 1.370169997215271, "learning_rate": 5.374867632519054e-06, "loss": 0.2339, "step": 8218 }, { "epoch": 1.4066404244395003, "grad_norm": 32.47026824951172, "learning_rate": 5.3660335769257416e-06, "loss": 5.6423, "step": 8219 }, { "epoch": 1.4068115693992813, "grad_norm": 7.3732194900512695, "learning_rate": 5.357205205386974e-06, "loss": 0.6134, "step": 8220 }, { "epoch": 1.406982714359062, "grad_norm": 19.994348526000977, "learning_rate": 5.348382523111492e-06, "loss": 2.0643, "step": 8221 }, { "epoch": 1.407153859318843, "grad_norm": 19.23360252380371, "learning_rate": 5.339565535304703e-06, "loss": 1.7582, "step": 8222 }, { "epoch": 1.407325004278624, "grad_norm": 13.627464294433594, "learning_rate": 5.330754247168631e-06, "loss": 1.0169, "step": 8223 }, { "epoch": 1.4074961492384048, "grad_norm": 26.512380599975586, "learning_rate": 5.321948663901956e-06, "loss": 5.258, "step": 8224 }, { "epoch": 1.4076672941981858, "grad_norm": 16.750289916992188, "learning_rate": 5.313148790699989e-06, "loss": 1.5109, "step": 8225 }, { "epoch": 1.4078384391579668, "grad_norm": 0.5825438499450684, "learning_rate": 5.304354632754657e-06, "loss": 0.1341, "step": 8226 }, { "epoch": 1.4080095841177478, "grad_norm": 9.757774353027344, "learning_rate": 5.295566195254541e-06, "loss": 0.6044, "step": 8227 }, { "epoch": 1.4081807290775288, "grad_norm": 8.211784362792969, "learning_rate": 5.2867834833848175e-06, "loss": 
0.8329, "step": 8228 }, { "epoch": 1.4083518740373095, "grad_norm": 21.741884231567383, "learning_rate": 5.278006502327305e-06, "loss": 2.9324, "step": 8229 }, { "epoch": 1.4085230189970905, "grad_norm": 17.97635269165039, "learning_rate": 5.269235257260444e-06, "loss": 1.7023, "step": 8230 }, { "epoch": 1.4086941639568715, "grad_norm": 13.902000427246094, "learning_rate": 5.260469753359268e-06, "loss": 1.0075, "step": 8231 }, { "epoch": 1.4088653089166523, "grad_norm": 128.4419403076172, "learning_rate": 5.25170999579545e-06, "loss": 10.1796, "step": 8232 }, { "epoch": 1.4090364538764333, "grad_norm": 8.146458625793457, "learning_rate": 5.242955989737255e-06, "loss": 0.7365, "step": 8233 }, { "epoch": 1.4092075988362143, "grad_norm": 25.456911087036133, "learning_rate": 5.234207740349552e-06, "loss": 5.0111, "step": 8234 }, { "epoch": 1.4093787437959953, "grad_norm": 15.250171661376953, "learning_rate": 5.22546525279383e-06, "loss": 1.2126, "step": 8235 }, { "epoch": 1.4095498887557762, "grad_norm": 7.476015567779541, "learning_rate": 5.216728532228166e-06, "loss": 0.4713, "step": 8236 }, { "epoch": 1.409721033715557, "grad_norm": 16.156631469726562, "learning_rate": 5.2079975838072454e-06, "loss": 1.3638, "step": 8237 }, { "epoch": 1.409892178675338, "grad_norm": 14.219999313354492, "learning_rate": 5.199272412682336e-06, "loss": 1.1954, "step": 8238 }, { "epoch": 1.410063323635119, "grad_norm": 12.787342071533203, "learning_rate": 5.190553024001294e-06, "loss": 1.0364, "step": 8239 }, { "epoch": 1.4102344685948998, "grad_norm": 16.65154266357422, "learning_rate": 5.181839422908585e-06, "loss": 1.4099, "step": 8240 }, { "epoch": 1.4104056135546807, "grad_norm": 7.621182441711426, "learning_rate": 5.173131614545234e-06, "loss": 0.7902, "step": 8241 }, { "epoch": 1.4105767585144617, "grad_norm": 22.970258712768555, "learning_rate": 5.164429604048872e-06, "loss": 2.7821, "step": 8242 }, { "epoch": 1.4107479034742427, "grad_norm": 11.445488929748535, "learning_rate": 
5.155733396553691e-06, "loss": 0.9885, "step": 8243 }, { "epoch": 1.4109190484340237, "grad_norm": 17.15314483642578, "learning_rate": 5.147042997190471e-06, "loss": 1.3129, "step": 8244 }, { "epoch": 1.4110901933938045, "grad_norm": 3.5751562118530273, "learning_rate": 5.13835841108656e-06, "loss": 0.3196, "step": 8245 }, { "epoch": 1.4112613383535855, "grad_norm": 10.422638893127441, "learning_rate": 5.129679643365864e-06, "loss": 0.8013, "step": 8246 }, { "epoch": 1.4114324833133665, "grad_norm": 16.59783935546875, "learning_rate": 5.121006699148889e-06, "loss": 1.3876, "step": 8247 }, { "epoch": 1.4116036282731472, "grad_norm": 10.978696823120117, "learning_rate": 5.112339583552672e-06, "loss": 0.9672, "step": 8248 }, { "epoch": 1.4117747732329282, "grad_norm": 12.953923225402832, "learning_rate": 5.103678301690833e-06, "loss": 1.3148, "step": 8249 }, { "epoch": 1.4119459181927092, "grad_norm": 14.40229606628418, "learning_rate": 5.095022858673536e-06, "loss": 1.1966, "step": 8250 }, { "epoch": 1.4121170631524902, "grad_norm": 16.806026458740234, "learning_rate": 5.086373259607495e-06, "loss": 1.971, "step": 8251 }, { "epoch": 1.4122882081122712, "grad_norm": 18.28573989868164, "learning_rate": 5.077729509596009e-06, "loss": 2.0556, "step": 8252 }, { "epoch": 1.412459353072052, "grad_norm": 13.333511352539062, "learning_rate": 5.069091613738883e-06, "loss": 0.9267, "step": 8253 }, { "epoch": 1.412630498031833, "grad_norm": 16.23652458190918, "learning_rate": 5.060459577132504e-06, "loss": 1.406, "step": 8254 }, { "epoch": 1.412801642991614, "grad_norm": 18.61722183227539, "learning_rate": 5.051833404869778e-06, "loss": 1.7441, "step": 8255 }, { "epoch": 1.4129727879513947, "grad_norm": 10.499286651611328, "learning_rate": 5.043213102040155e-06, "loss": 0.8584, "step": 8256 }, { "epoch": 1.4131439329111757, "grad_norm": 13.57168960571289, "learning_rate": 5.034598673729637e-06, "loss": 1.0984, "step": 8257 }, { "epoch": 1.4133150778709567, "grad_norm": 
20.259376525878906, "learning_rate": 5.02599012502074e-06, "loss": 2.1547, "step": 8258 }, { "epoch": 1.4134862228307377, "grad_norm": 10.417964935302734, "learning_rate": 5.017387460992531e-06, "loss": 0.7996, "step": 8259 }, { "epoch": 1.4136573677905186, "grad_norm": 46.52947235107422, "learning_rate": 5.0087906867205825e-06, "loss": 7.2844, "step": 8260 }, { "epoch": 1.4138285127502996, "grad_norm": 13.9273681640625, "learning_rate": 5.000199807277016e-06, "loss": 1.154, "step": 8261 }, { "epoch": 1.4139996577100804, "grad_norm": 0.46313220262527466, "learning_rate": 4.991614827730453e-06, "loss": 0.1258, "step": 8262 }, { "epoch": 1.4141708026698614, "grad_norm": 14.016765594482422, "learning_rate": 4.983035753146048e-06, "loss": 0.875, "step": 8263 }, { "epoch": 1.4143419476296424, "grad_norm": 3.0206847190856934, "learning_rate": 4.974462588585474e-06, "loss": 0.2286, "step": 8264 }, { "epoch": 1.4145130925894231, "grad_norm": 0.4846319854259491, "learning_rate": 4.965895339106904e-06, "loss": 0.1241, "step": 8265 }, { "epoch": 1.4146842375492041, "grad_norm": 18.099964141845703, "learning_rate": 4.957334009765025e-06, "loss": 1.001, "step": 8266 }, { "epoch": 1.4148553825089851, "grad_norm": 19.34149932861328, "learning_rate": 4.9487786056110396e-06, "loss": 2.1257, "step": 8267 }, { "epoch": 1.4150265274687661, "grad_norm": 11.995976448059082, "learning_rate": 4.940229131692646e-06, "loss": 0.7773, "step": 8268 }, { "epoch": 1.415197672428547, "grad_norm": 8.392796516418457, "learning_rate": 4.931685593054055e-06, "loss": 0.6936, "step": 8269 }, { "epoch": 1.4153688173883279, "grad_norm": 22.823087692260742, "learning_rate": 4.923147994735959e-06, "loss": 1.7489, "step": 8270 }, { "epoch": 1.4155399623481089, "grad_norm": 8.401246070861816, "learning_rate": 4.91461634177555e-06, "loss": 0.949, "step": 8271 }, { "epoch": 1.4157111073078898, "grad_norm": 21.816896438598633, "learning_rate": 4.906090639206523e-06, "loss": 5.0226, "step": 8272 }, { "epoch": 
1.4158822522676706, "grad_norm": 8.518943786621094, "learning_rate": 4.897570892059052e-06, "loss": 0.7506, "step": 8273 }, { "epoch": 1.4160533972274516, "grad_norm": 13.235052108764648, "learning_rate": 4.889057105359807e-06, "loss": 0.9746, "step": 8274 }, { "epoch": 1.4162245421872326, "grad_norm": 15.83570671081543, "learning_rate": 4.880549284131929e-06, "loss": 1.3973, "step": 8275 }, { "epoch": 1.4163956871470136, "grad_norm": 39.7057991027832, "learning_rate": 4.8720474333950415e-06, "loss": 5.954, "step": 8276 }, { "epoch": 1.4165668321067946, "grad_norm": 12.785017967224121, "learning_rate": 4.86355155816526e-06, "loss": 1.1006, "step": 8277 }, { "epoch": 1.4167379770665753, "grad_norm": 21.51802635192871, "learning_rate": 4.8550616634551505e-06, "loss": 1.5379, "step": 8278 }, { "epoch": 1.4169091220263563, "grad_norm": 18.137781143188477, "learning_rate": 4.8465777542737686e-06, "loss": 1.5255, "step": 8279 }, { "epoch": 1.4170802669861373, "grad_norm": 18.3780517578125, "learning_rate": 4.838099835626642e-06, "loss": 1.4327, "step": 8280 }, { "epoch": 1.417251411945918, "grad_norm": 17.565465927124023, "learning_rate": 4.829627912515742e-06, "loss": 1.7263, "step": 8281 }, { "epoch": 1.417422556905699, "grad_norm": 16.71804428100586, "learning_rate": 4.821161989939528e-06, "loss": 1.7956, "step": 8282 }, { "epoch": 1.41759370186548, "grad_norm": 16.671550750732422, "learning_rate": 4.812702072892895e-06, "loss": 1.1207, "step": 8283 }, { "epoch": 1.417764846825261, "grad_norm": 14.933640480041504, "learning_rate": 4.8042481663672185e-06, "loss": 1.0693, "step": 8284 }, { "epoch": 1.417935991785042, "grad_norm": 15.258363723754883, "learning_rate": 4.795800275350304e-06, "loss": 1.0977, "step": 8285 }, { "epoch": 1.4181071367448228, "grad_norm": 19.424457550048828, "learning_rate": 4.787358404826431e-06, "loss": 1.6517, "step": 8286 }, { "epoch": 1.4182782817046038, "grad_norm": 8.330910682678223, "learning_rate": 4.778922559776311e-06, "loss": 0.6448, 
"step": 8287 }, { "epoch": 1.4184494266643848, "grad_norm": 15.775337219238281, "learning_rate": 4.770492745177095e-06, "loss": 1.314, "step": 8288 }, { "epoch": 1.4186205716241655, "grad_norm": 8.920485496520996, "learning_rate": 4.762068966002404e-06, "loss": 0.7039, "step": 8289 }, { "epoch": 1.4187917165839465, "grad_norm": 19.12442970275879, "learning_rate": 4.753651227222274e-06, "loss": 1.4776, "step": 8290 }, { "epoch": 1.4189628615437275, "grad_norm": 10.677218437194824, "learning_rate": 4.745239533803176e-06, "loss": 0.9044, "step": 8291 }, { "epoch": 1.4191340065035085, "grad_norm": 22.69997215270996, "learning_rate": 4.7368338907080315e-06, "loss": 5.6359, "step": 8292 }, { "epoch": 1.4193051514632895, "grad_norm": 12.117168426513672, "learning_rate": 4.728434302896173e-06, "loss": 1.0645, "step": 8293 }, { "epoch": 1.4194762964230703, "grad_norm": 15.21337604522705, "learning_rate": 4.720040775323374e-06, "loss": 1.387, "step": 8294 }, { "epoch": 1.4196474413828513, "grad_norm": 17.30541229248047, "learning_rate": 4.711653312941836e-06, "loss": 1.3167, "step": 8295 }, { "epoch": 1.4198185863426322, "grad_norm": 0.8116150498390198, "learning_rate": 4.703271920700162e-06, "loss": 0.1334, "step": 8296 }, { "epoch": 1.419989731302413, "grad_norm": 11.490621566772461, "learning_rate": 4.694896603543396e-06, "loss": 0.8766, "step": 8297 }, { "epoch": 1.420160876262194, "grad_norm": 2.6469197273254395, "learning_rate": 4.686527366412978e-06, "loss": 0.2834, "step": 8298 }, { "epoch": 1.420332021221975, "grad_norm": 14.399421691894531, "learning_rate": 4.67816421424678e-06, "loss": 1.3508, "step": 8299 }, { "epoch": 1.420503166181756, "grad_norm": 15.663511276245117, "learning_rate": 4.669807151979065e-06, "loss": 1.4015, "step": 8300 }, { "epoch": 1.420674311141537, "grad_norm": 11.911434173583984, "learning_rate": 4.661456184540523e-06, "loss": 0.9201, "step": 8301 }, { "epoch": 1.4208454561013177, "grad_norm": 20.173824310302734, "learning_rate": 
4.6531113168582285e-06, "loss": 1.6779, "step": 8302 }, { "epoch": 1.4210166010610987, "grad_norm": 12.680366516113281, "learning_rate": 4.644772553855665e-06, "loss": 0.8848, "step": 8303 }, { "epoch": 1.4211877460208797, "grad_norm": 15.29770565032959, "learning_rate": 4.636439900452722e-06, "loss": 1.1408, "step": 8304 }, { "epoch": 1.4213588909806605, "grad_norm": 3.1856536865234375, "learning_rate": 4.628113361565664e-06, "loss": 0.2935, "step": 8305 }, { "epoch": 1.4215300359404415, "grad_norm": 7.9707350730896, "learning_rate": 4.619792942107183e-06, "loss": 0.6461, "step": 8306 }, { "epoch": 1.4217011809002225, "grad_norm": 17.432270050048828, "learning_rate": 4.611478646986326e-06, "loss": 1.5045, "step": 8307 }, { "epoch": 1.4218723258600034, "grad_norm": 13.844263076782227, "learning_rate": 4.603170481108535e-06, "loss": 1.0557, "step": 8308 }, { "epoch": 1.4220434708197844, "grad_norm": 6.255578994750977, "learning_rate": 4.5948684493756515e-06, "loss": 0.631, "step": 8309 }, { "epoch": 1.4222146157795654, "grad_norm": 33.30855178833008, "learning_rate": 4.586572556685876e-06, "loss": 5.827, "step": 8310 }, { "epoch": 1.4223857607393462, "grad_norm": 2.2865633964538574, "learning_rate": 4.578282807933802e-06, "loss": 0.267, "step": 8311 }, { "epoch": 1.4225569056991272, "grad_norm": 6.593497276306152, "learning_rate": 4.569999208010399e-06, "loss": 0.7334, "step": 8312 }, { "epoch": 1.4227280506589082, "grad_norm": 4.725667476654053, "learning_rate": 4.5617217618029935e-06, "loss": 0.4138, "step": 8313 }, { "epoch": 1.422899195618689, "grad_norm": 14.847844123840332, "learning_rate": 4.553450474195301e-06, "loss": 1.2268, "step": 8314 }, { "epoch": 1.42307034057847, "grad_norm": 56.064247131347656, "learning_rate": 4.545185350067384e-06, "loss": 7.2508, "step": 8315 }, { "epoch": 1.423241485538251, "grad_norm": 18.788999557495117, "learning_rate": 4.536926394295682e-06, "loss": 1.6801, "step": 8316 }, { "epoch": 1.423412630498032, "grad_norm": 
5.343076229095459, "learning_rate": 4.528673611752997e-06, "loss": 0.5371, "step": 8317 }, { "epoch": 1.423583775457813, "grad_norm": 0.4705977141857147, "learning_rate": 4.520427007308471e-06, "loss": 0.1254, "step": 8318 }, { "epoch": 1.4237549204175937, "grad_norm": 14.721872329711914, "learning_rate": 4.512186585827626e-06, "loss": 1.1522, "step": 8319 }, { "epoch": 1.4239260653773747, "grad_norm": 10.518433570861816, "learning_rate": 4.503952352172312e-06, "loss": 0.7963, "step": 8320 }, { "epoch": 1.4240972103371556, "grad_norm": 18.39733123779297, "learning_rate": 4.495724311200743e-06, "loss": 1.9861, "step": 8321 }, { "epoch": 1.4242683552969364, "grad_norm": 0.5959325432777405, "learning_rate": 4.487502467767481e-06, "loss": 0.1356, "step": 8322 }, { "epoch": 1.4244395002567174, "grad_norm": 15.7307710647583, "learning_rate": 4.479286826723415e-06, "loss": 1.1963, "step": 8323 }, { "epoch": 1.4246106452164984, "grad_norm": 17.11187744140625, "learning_rate": 4.471077392915798e-06, "loss": 1.3984, "step": 8324 }, { "epoch": 1.4247817901762794, "grad_norm": 3.4646151065826416, "learning_rate": 4.462874171188197e-06, "loss": 0.2938, "step": 8325 }, { "epoch": 1.4249529351360604, "grad_norm": 16.55135726928711, "learning_rate": 4.454677166380533e-06, "loss": 1.4855, "step": 8326 }, { "epoch": 1.4251240800958411, "grad_norm": 13.826451301574707, "learning_rate": 4.446486383329048e-06, "loss": 1.2132, "step": 8327 }, { "epoch": 1.4252952250556221, "grad_norm": 13.233954429626465, "learning_rate": 4.438301826866311e-06, "loss": 1.1412, "step": 8328 }, { "epoch": 1.425466370015403, "grad_norm": 5.975020885467529, "learning_rate": 4.430123501821233e-06, "loss": 0.5786, "step": 8329 }, { "epoch": 1.4256375149751839, "grad_norm": 12.811347007751465, "learning_rate": 4.421951413019028e-06, "loss": 0.9023, "step": 8330 }, { "epoch": 1.4258086599349649, "grad_norm": 11.870346069335938, "learning_rate": 4.413785565281244e-06, "loss": 1.0858, "step": 8331 }, { "epoch": 
1.4259798048947459, "grad_norm": 0.5878346562385559, "learning_rate": 4.405625963425748e-06, "loss": 0.1382, "step": 8332 }, { "epoch": 1.4261509498545268, "grad_norm": 22.074859619140625, "learning_rate": 4.3974726122667095e-06, "loss": 0.9755, "step": 8333 }, { "epoch": 1.4263220948143078, "grad_norm": 14.925138473510742, "learning_rate": 4.389325516614628e-06, "loss": 1.5146, "step": 8334 }, { "epoch": 1.4264932397740886, "grad_norm": 12.06701374053955, "learning_rate": 4.381184681276289e-06, "loss": 1.0256, "step": 8335 }, { "epoch": 1.4266643847338696, "grad_norm": 66.08161163330078, "learning_rate": 4.37305011105481e-06, "loss": 8.282, "step": 8336 }, { "epoch": 1.4268355296936506, "grad_norm": 4.224128723144531, "learning_rate": 4.36492181074959e-06, "loss": 0.2899, "step": 8337 }, { "epoch": 1.4270066746534313, "grad_norm": 16.93010902404785, "learning_rate": 4.356799785156346e-06, "loss": 1.2722, "step": 8338 }, { "epoch": 1.4271778196132123, "grad_norm": 14.85118579864502, "learning_rate": 4.3486840390670755e-06, "loss": 1.4349, "step": 8339 }, { "epoch": 1.4273489645729933, "grad_norm": 19.152435302734375, "learning_rate": 4.3405745772700875e-06, "loss": 2.5445, "step": 8340 }, { "epoch": 1.4275201095327743, "grad_norm": 0.5115156769752502, "learning_rate": 4.3324714045499815e-06, "loss": 0.1364, "step": 8341 }, { "epoch": 1.4276912544925553, "grad_norm": 81.3175048828125, "learning_rate": 4.324374525687635e-06, "loss": 8.4995, "step": 8342 }, { "epoch": 1.427862399452336, "grad_norm": 18.543806076049805, "learning_rate": 4.3162839454602135e-06, "loss": 1.5013, "step": 8343 }, { "epoch": 1.428033544412117, "grad_norm": 22.6982364654541, "learning_rate": 4.3081996686411825e-06, "loss": 2.4938, "step": 8344 }, { "epoch": 1.428204689371898, "grad_norm": 6.561403751373291, "learning_rate": 4.300121700000269e-06, "loss": 0.3636, "step": 8345 }, { "epoch": 1.4283758343316788, "grad_norm": 9.580253601074219, "learning_rate": 4.2920500443034915e-06, "loss": 
0.585, "step": 8346 }, { "epoch": 1.4285469792914598, "grad_norm": 16.961750030517578, "learning_rate": 4.283984706313135e-06, "loss": 2.085, "step": 8347 }, { "epoch": 1.4287181242512408, "grad_norm": 5.839288711547852, "learning_rate": 4.275925690787765e-06, "loss": 0.5665, "step": 8348 }, { "epoch": 1.4288892692110218, "grad_norm": 16.67837905883789, "learning_rate": 4.267873002482213e-06, "loss": 1.6816, "step": 8349 }, { "epoch": 1.4290604141708028, "grad_norm": 17.28608512878418, "learning_rate": 4.259826646147563e-06, "loss": 1.6976, "step": 8350 }, { "epoch": 1.4292315591305835, "grad_norm": 7.20570182800293, "learning_rate": 4.251786626531195e-06, "loss": 0.5685, "step": 8351 }, { "epoch": 1.4294027040903645, "grad_norm": 24.501218795776367, "learning_rate": 4.2437529483767305e-06, "loss": 1.9365, "step": 8352 }, { "epoch": 1.4295738490501455, "grad_norm": 5.079190254211426, "learning_rate": 4.235725616424041e-06, "loss": 0.3883, "step": 8353 }, { "epoch": 1.4297449940099263, "grad_norm": 13.976761817932129, "learning_rate": 4.227704635409279e-06, "loss": 0.9349, "step": 8354 }, { "epoch": 1.4299161389697073, "grad_norm": 26.27178382873535, "learning_rate": 4.219690010064819e-06, "loss": 5.4424, "step": 8355 }, { "epoch": 1.4300872839294883, "grad_norm": 10.558706283569336, "learning_rate": 4.2116817451193165e-06, "loss": 0.6608, "step": 8356 }, { "epoch": 1.4302584288892692, "grad_norm": 10.44459342956543, "learning_rate": 4.203679845297648e-06, "loss": 0.7136, "step": 8357 }, { "epoch": 1.4304295738490502, "grad_norm": 12.273661613464355, "learning_rate": 4.195684315320957e-06, "loss": 1.0603, "step": 8358 }, { "epoch": 1.4306007188088312, "grad_norm": 21.012882232666016, "learning_rate": 4.18769515990661e-06, "loss": 2.2086, "step": 8359 }, { "epoch": 1.430771863768612, "grad_norm": 16.736093521118164, "learning_rate": 4.179712383768221e-06, "loss": 1.3113, "step": 8360 }, { "epoch": 1.430943008728393, "grad_norm": 16.535369873046875, "learning_rate": 
4.171735991615636e-06, "loss": 1.0518, "step": 8361 }, { "epoch": 1.431114153688174, "grad_norm": 2.179553985595703, "learning_rate": 4.163765988154954e-06, "loss": 0.2467, "step": 8362 }, { "epoch": 1.4312852986479547, "grad_norm": 12.279807090759277, "learning_rate": 4.155802378088475e-06, "loss": 1.116, "step": 8363 }, { "epoch": 1.4314564436077357, "grad_norm": 16.3411808013916, "learning_rate": 4.14784516611475e-06, "loss": 0.9596, "step": 8364 }, { "epoch": 1.4316275885675167, "grad_norm": 0.39746716618537903, "learning_rate": 4.139894356928535e-06, "loss": 0.1217, "step": 8365 }, { "epoch": 1.4317987335272977, "grad_norm": 16.14784049987793, "learning_rate": 4.131949955220829e-06, "loss": 1.7755, "step": 8366 }, { "epoch": 1.4319698784870787, "grad_norm": 11.682435989379883, "learning_rate": 4.124011965678838e-06, "loss": 0.7324, "step": 8367 }, { "epoch": 1.4321410234468595, "grad_norm": 15.93487548828125, "learning_rate": 4.116080392985983e-06, "loss": 1.5287, "step": 8368 }, { "epoch": 1.4323121684066404, "grad_norm": 11.052955627441406, "learning_rate": 4.10815524182191e-06, "loss": 1.0045, "step": 8369 }, { "epoch": 1.4324833133664214, "grad_norm": 4.440634727478027, "learning_rate": 4.100236516862463e-06, "loss": 0.3072, "step": 8370 }, { "epoch": 1.4326544583262022, "grad_norm": 18.903459548950195, "learning_rate": 4.092324222779711e-06, "loss": 1.7344, "step": 8371 }, { "epoch": 1.4328256032859832, "grad_norm": 6.7151031494140625, "learning_rate": 4.0844183642419096e-06, "loss": 0.3896, "step": 8372 }, { "epoch": 1.4329967482457642, "grad_norm": 2.5013160705566406, "learning_rate": 4.076518945913532e-06, "loss": 0.271, "step": 8373 }, { "epoch": 1.4331678932055452, "grad_norm": 0.4836881458759308, "learning_rate": 4.068625972455251e-06, "loss": 0.1247, "step": 8374 }, { "epoch": 1.4333390381653262, "grad_norm": 14.715100288391113, "learning_rate": 4.060739448523921e-06, "loss": 1.1756, "step": 8375 }, { "epoch": 1.433510183125107, "grad_norm": 
16.3120059967041, "learning_rate": 4.052859378772617e-06, "loss": 1.486, "step": 8376 }, { "epoch": 1.433681328084888, "grad_norm": 11.071383476257324, "learning_rate": 4.04498576785058e-06, "loss": 0.9383, "step": 8377 }, { "epoch": 1.433852473044669, "grad_norm": 15.928082466125488, "learning_rate": 4.0371186204032614e-06, "loss": 1.5102, "step": 8378 }, { "epoch": 1.4340236180044497, "grad_norm": 16.743022918701172, "learning_rate": 4.029257941072286e-06, "loss": 1.4692, "step": 8379 }, { "epoch": 1.4341947629642307, "grad_norm": 15.075504302978516, "learning_rate": 4.0214037344954604e-06, "loss": 1.2071, "step": 8380 }, { "epoch": 1.4343659079240116, "grad_norm": 21.42952537536621, "learning_rate": 4.013556005306788e-06, "loss": 1.9503, "step": 8381 }, { "epoch": 1.4345370528837926, "grad_norm": 17.631309509277344, "learning_rate": 4.0057147581364324e-06, "loss": 1.4888, "step": 8382 }, { "epoch": 1.4347081978435736, "grad_norm": 93.76021575927734, "learning_rate": 3.997879997610745e-06, "loss": 8.4484, "step": 8383 }, { "epoch": 1.4348793428033544, "grad_norm": 11.445027351379395, "learning_rate": 3.990051728352252e-06, "loss": 0.9915, "step": 8384 }, { "epoch": 1.4350504877631354, "grad_norm": 6.506800651550293, "learning_rate": 3.982229954979631e-06, "loss": 0.7748, "step": 8385 }, { "epoch": 1.4352216327229164, "grad_norm": 3.774770736694336, "learning_rate": 3.9744146821077546e-06, "loss": 0.2667, "step": 8386 }, { "epoch": 1.4353927776826971, "grad_norm": 28.999431610107422, "learning_rate": 3.96660591434763e-06, "loss": 5.4131, "step": 8387 }, { "epoch": 1.4355639226424781, "grad_norm": 1.9287961721420288, "learning_rate": 3.958803656306456e-06, "loss": 0.211, "step": 8388 }, { "epoch": 1.435735067602259, "grad_norm": 21.30387306213379, "learning_rate": 3.951007912587566e-06, "loss": 2.5411, "step": 8389 }, { "epoch": 1.43590621256204, "grad_norm": 2.638003349304199, "learning_rate": 3.9432186877904684e-06, "loss": 0.3276, "step": 8390 }, { "epoch": 
1.436077357521821, "grad_norm": 0.4537413716316223, "learning_rate": 3.9354359865108154e-06, "loss": 0.1262, "step": 8391 }, { "epoch": 1.4362485024816019, "grad_norm": 13.961565971374512, "learning_rate": 3.927659813340403e-06, "loss": 1.0929, "step": 8392 }, { "epoch": 1.4364196474413828, "grad_norm": 13.700946807861328, "learning_rate": 3.919890172867191e-06, "loss": 0.9157, "step": 8393 }, { "epoch": 1.4365907924011638, "grad_norm": 17.101577758789062, "learning_rate": 3.912127069675288e-06, "loss": 1.072, "step": 8394 }, { "epoch": 1.4367619373609446, "grad_norm": 5.47079610824585, "learning_rate": 3.904370508344924e-06, "loss": 0.6731, "step": 8395 }, { "epoch": 1.4369330823207256, "grad_norm": 6.5655837059021, "learning_rate": 3.896620493452493e-06, "loss": 0.625, "step": 8396 }, { "epoch": 1.4371042272805066, "grad_norm": 11.239481925964355, "learning_rate": 3.888877029570503e-06, "loss": 1.0318, "step": 8397 }, { "epoch": 1.4372753722402876, "grad_norm": 21.253192901611328, "learning_rate": 3.881140121267619e-06, "loss": 1.5675, "step": 8398 }, { "epoch": 1.4374465172000686, "grad_norm": 17.1704044342041, "learning_rate": 3.873409773108625e-06, "loss": 2.0079, "step": 8399 }, { "epoch": 1.4376176621598493, "grad_norm": 87.91876983642578, "learning_rate": 3.865685989654433e-06, "loss": 8.8725, "step": 8400 }, { "epoch": 1.4377888071196303, "grad_norm": 13.40868854522705, "learning_rate": 3.857968775462096e-06, "loss": 1.0831, "step": 8401 }, { "epoch": 1.4379599520794113, "grad_norm": 17.707443237304688, "learning_rate": 3.8502581350847716e-06, "loss": 1.3818, "step": 8402 }, { "epoch": 1.438131097039192, "grad_norm": 2.461596727371216, "learning_rate": 3.84255407307176e-06, "loss": 0.2992, "step": 8403 }, { "epoch": 1.438302241998973, "grad_norm": 18.57525062561035, "learning_rate": 3.834856593968456e-06, "loss": 1.9091, "step": 8404 }, { "epoch": 1.438473386958754, "grad_norm": 20.6262264251709, "learning_rate": 3.827165702316395e-06, "loss": 1.6094, 
"step": 8405 }, { "epoch": 1.438644531918535, "grad_norm": 12.672618865966797, "learning_rate": 3.8194814026532146e-06, "loss": 0.858, "step": 8406 }, { "epoch": 1.438815676878316, "grad_norm": 14.475008964538574, "learning_rate": 3.81180369951266e-06, "loss": 1.2888, "step": 8407 }, { "epoch": 1.4389868218380968, "grad_norm": 16.547607421875, "learning_rate": 3.8041325974245826e-06, "loss": 1.5037, "step": 8408 }, { "epoch": 1.4391579667978778, "grad_norm": 19.653106689453125, "learning_rate": 3.7964681009149547e-06, "loss": 1.5261, "step": 8409 }, { "epoch": 1.4393291117576588, "grad_norm": 0.3853145241737366, "learning_rate": 3.788810214505829e-06, "loss": 0.1155, "step": 8410 }, { "epoch": 1.4395002567174398, "grad_norm": 11.497426986694336, "learning_rate": 3.7811589427153793e-06, "loss": 0.6922, "step": 8411 }, { "epoch": 1.4396714016772205, "grad_norm": 17.74052619934082, "learning_rate": 3.7735142900578578e-06, "loss": 1.2992, "step": 8412 }, { "epoch": 1.4398425466370015, "grad_norm": 0.7352682948112488, "learning_rate": 3.7658762610436336e-06, "loss": 0.1482, "step": 8413 }, { "epoch": 1.4400136915967825, "grad_norm": 1.819342017173767, "learning_rate": 3.758244860179142e-06, "loss": 0.218, "step": 8414 }, { "epoch": 1.4401848365565635, "grad_norm": 23.511795043945312, "learning_rate": 3.7506200919669278e-06, "loss": 5.0297, "step": 8415 }, { "epoch": 1.4403559815163445, "grad_norm": 7.822660446166992, "learning_rate": 3.7430019609056187e-06, "loss": 0.7485, "step": 8416 }, { "epoch": 1.4405271264761252, "grad_norm": 12.87449836730957, "learning_rate": 3.735390471489915e-06, "loss": 1.1747, "step": 8417 }, { "epoch": 1.4406982714359062, "grad_norm": 15.970128059387207, "learning_rate": 3.727785628210616e-06, "loss": 1.2442, "step": 8418 }, { "epoch": 1.4408694163956872, "grad_norm": 8.213173866271973, "learning_rate": 3.7201874355545874e-06, "loss": 0.4527, "step": 8419 }, { "epoch": 1.441040561355468, "grad_norm": 15.708293914794922, "learning_rate": 
3.7125958980047662e-06, "loss": 1.3131, "step": 8420 }, { "epoch": 1.441211706315249, "grad_norm": 18.762908935546875, "learning_rate": 3.7050110200401822e-06, "loss": 1.6741, "step": 8421 }, { "epoch": 1.44138285127503, "grad_norm": 0.9794007539749146, "learning_rate": 3.6974328061359146e-06, "loss": 0.2083, "step": 8422 }, { "epoch": 1.441553996234811, "grad_norm": 5.408003807067871, "learning_rate": 3.6898612607631327e-06, "loss": 0.7957, "step": 8423 }, { "epoch": 1.441725141194592, "grad_norm": 10.823113441467285, "learning_rate": 3.6822963883890476e-06, "loss": 0.9094, "step": 8424 }, { "epoch": 1.4418962861543727, "grad_norm": 1.8446986675262451, "learning_rate": 3.674738193476949e-06, "loss": 0.2384, "step": 8425 }, { "epoch": 1.4420674311141537, "grad_norm": 6.8515801429748535, "learning_rate": 3.6671866804861903e-06, "loss": 0.5296, "step": 8426 }, { "epoch": 1.4422385760739347, "grad_norm": 25.481624603271484, "learning_rate": 3.659641853872167e-06, "loss": 5.4417, "step": 8427 }, { "epoch": 1.4424097210337155, "grad_norm": 12.586084365844727, "learning_rate": 3.652103718086344e-06, "loss": 1.0079, "step": 8428 }, { "epoch": 1.4425808659934964, "grad_norm": 23.349151611328125, "learning_rate": 3.644572277576224e-06, "loss": 5.2822, "step": 8429 }, { "epoch": 1.4427520109532774, "grad_norm": 10.716401100158691, "learning_rate": 3.637047536785379e-06, "loss": 1.2512, "step": 8430 }, { "epoch": 1.4429231559130584, "grad_norm": 13.124372482299805, "learning_rate": 3.6295295001534133e-06, "loss": 1.4293, "step": 8431 }, { "epoch": 1.4430943008728394, "grad_norm": 0.47882649302482605, "learning_rate": 3.622018172115973e-06, "loss": 0.1324, "step": 8432 }, { "epoch": 1.4432654458326202, "grad_norm": 5.195764064788818, "learning_rate": 3.614513557104762e-06, "loss": 0.4471, "step": 8433 }, { "epoch": 1.4434365907924012, "grad_norm": 0.4021523594856262, "learning_rate": 3.607015659547506e-06, "loss": 0.126, "step": 8434 }, { "epoch": 1.4436077357521822, 
"grad_norm": 0.5373860597610474, "learning_rate": 3.5995244838679847e-06, "loss": 0.1333, "step": 8435 }, { "epoch": 1.443778880711963, "grad_norm": 13.348397254943848, "learning_rate": 3.5920400344859905e-06, "loss": 1.1046, "step": 8436 }, { "epoch": 1.443950025671744, "grad_norm": 2.4842634201049805, "learning_rate": 3.584562315817373e-06, "loss": 0.2417, "step": 8437 }, { "epoch": 1.444121170631525, "grad_norm": 10.33353328704834, "learning_rate": 3.5770913322739947e-06, "loss": 1.05, "step": 8438 }, { "epoch": 1.4442923155913059, "grad_norm": 58.4034309387207, "learning_rate": 3.5696270882637446e-06, "loss": 7.4923, "step": 8439 }, { "epoch": 1.4444634605510869, "grad_norm": 9.728248596191406, "learning_rate": 3.562169588190533e-06, "loss": 1.0112, "step": 8440 }, { "epoch": 1.4446346055108676, "grad_norm": 12.72889232635498, "learning_rate": 3.554718836454306e-06, "loss": 1.015, "step": 8441 }, { "epoch": 1.4448057504706486, "grad_norm": 13.545023918151855, "learning_rate": 3.5472748374510065e-06, "loss": 0.9717, "step": 8442 }, { "epoch": 1.4449768954304296, "grad_norm": 5.7686285972595215, "learning_rate": 3.539837595572617e-06, "loss": 0.517, "step": 8443 }, { "epoch": 1.4451480403902104, "grad_norm": 14.59744930267334, "learning_rate": 3.5324071152071118e-06, "loss": 1.037, "step": 8444 }, { "epoch": 1.4453191853499914, "grad_norm": 8.939913749694824, "learning_rate": 3.524983400738493e-06, "loss": 0.8705, "step": 8445 }, { "epoch": 1.4454903303097724, "grad_norm": 12.520122528076172, "learning_rate": 3.517566456546758e-06, "loss": 1.0416, "step": 8446 }, { "epoch": 1.4456614752695534, "grad_norm": 14.85897445678711, "learning_rate": 3.5101562870079085e-06, "loss": 1.1178, "step": 8447 }, { "epoch": 1.4458326202293343, "grad_norm": 1.4447020292282104, "learning_rate": 3.502752896493969e-06, "loss": 0.2198, "step": 8448 }, { "epoch": 1.446003765189115, "grad_norm": 13.082090377807617, "learning_rate": 3.495356289372948e-06, "loss": 0.9224, "step": 8449 }, 
{ "epoch": 1.446174910148896, "grad_norm": 14.805837631225586, "learning_rate": 3.487966470008847e-06, "loss": 1.2862, "step": 8450 }, { "epoch": 1.446346055108677, "grad_norm": 25.148759841918945, "learning_rate": 3.4805834427616817e-06, "loss": 1.9002, "step": 8451 }, { "epoch": 1.4465172000684579, "grad_norm": 0.42858776450157166, "learning_rate": 3.4732072119874376e-06, "loss": 0.1169, "step": 8452 }, { "epoch": 1.4466883450282388, "grad_norm": 13.413276672363281, "learning_rate": 3.4658377820381114e-06, "loss": 1.0795, "step": 8453 }, { "epoch": 1.4468594899880198, "grad_norm": 3.4085307121276855, "learning_rate": 3.4584751572616692e-06, "loss": 0.2776, "step": 8454 }, { "epoch": 1.4470306349478008, "grad_norm": 94.4915771484375, "learning_rate": 3.4511193420020786e-06, "loss": 7.4881, "step": 8455 }, { "epoch": 1.4472017799075818, "grad_norm": 8.371195793151855, "learning_rate": 3.4437703405992787e-06, "loss": 0.7268, "step": 8456 }, { "epoch": 1.4473729248673626, "grad_norm": 13.714017868041992, "learning_rate": 3.436428157389181e-06, "loss": 1.1406, "step": 8457 }, { "epoch": 1.4475440698271436, "grad_norm": 3.720073699951172, "learning_rate": 3.429092796703703e-06, "loss": 0.2816, "step": 8458 }, { "epoch": 1.4477152147869246, "grad_norm": 11.747239112854004, "learning_rate": 3.421764262870709e-06, "loss": 0.8131, "step": 8459 }, { "epoch": 1.4478863597467055, "grad_norm": 21.724252700805664, "learning_rate": 3.414442560214041e-06, "loss": 1.8312, "step": 8460 }, { "epoch": 1.4480575047064863, "grad_norm": 20.976316452026367, "learning_rate": 3.407127693053524e-06, "loss": 2.1807, "step": 8461 }, { "epoch": 1.4482286496662673, "grad_norm": 2.2490921020507812, "learning_rate": 3.39981966570493e-06, "loss": 0.2436, "step": 8462 }, { "epoch": 1.4483997946260483, "grad_norm": 15.60532283782959, "learning_rate": 3.392518482480016e-06, "loss": 1.0062, "step": 8463 }, { "epoch": 1.4485709395858293, "grad_norm": 10.781365394592285, "learning_rate": 
3.385224147686482e-06, "loss": 0.8134, "step": 8464 }, { "epoch": 1.4487420845456103, "grad_norm": 14.161565780639648, "learning_rate": 3.377936665628004e-06, "loss": 1.1182, "step": 8465 }, { "epoch": 1.448913229505391, "grad_norm": 10.106000900268555, "learning_rate": 3.3706560406041996e-06, "loss": 0.6549, "step": 8466 }, { "epoch": 1.449084374465172, "grad_norm": 77.07288360595703, "learning_rate": 3.3633822769106578e-06, "loss": 8.6634, "step": 8467 }, { "epoch": 1.449255519424953, "grad_norm": 6.6021037101745605, "learning_rate": 3.3561153788388998e-06, "loss": 0.4629, "step": 8468 }, { "epoch": 1.4494266643847338, "grad_norm": 10.353342056274414, "learning_rate": 3.348855350676412e-06, "loss": 0.8539, "step": 8469 }, { "epoch": 1.4495978093445148, "grad_norm": 16.80060577392578, "learning_rate": 3.3416021967066256e-06, "loss": 1.3293, "step": 8470 }, { "epoch": 1.4497689543042958, "grad_norm": 29.66864585876465, "learning_rate": 3.3343559212089083e-06, "loss": 5.7813, "step": 8471 }, { "epoch": 1.4499400992640767, "grad_norm": 12.348899841308594, "learning_rate": 3.3271165284585677e-06, "loss": 0.9477, "step": 8472 }, { "epoch": 1.4501112442238577, "grad_norm": 23.271472930908203, "learning_rate": 3.3198840227268657e-06, "loss": 1.9464, "step": 8473 }, { "epoch": 1.4502823891836385, "grad_norm": 15.827186584472656, "learning_rate": 3.312658408280984e-06, "loss": 1.828, "step": 8474 }, { "epoch": 1.4504535341434195, "grad_norm": 11.40153980255127, "learning_rate": 3.305439689384053e-06, "loss": 0.7975, "step": 8475 }, { "epoch": 1.4506246791032005, "grad_norm": 2.8747787475585938, "learning_rate": 3.2982278702951195e-06, "loss": 0.2592, "step": 8476 }, { "epoch": 1.4507958240629812, "grad_norm": 13.116789817810059, "learning_rate": 3.2910229552691763e-06, "loss": 1.1737, "step": 8477 }, { "epoch": 1.4509669690227622, "grad_norm": 24.355487823486328, "learning_rate": 3.283824948557129e-06, "loss": 3.3179, "step": 8478 }, { "epoch": 1.4511381139825432, 
"grad_norm": 5.922363758087158, "learning_rate": 3.276633854405805e-06, "loss": 0.4773, "step": 8479 }, { "epoch": 1.4513092589423242, "grad_norm": 18.42041778564453, "learning_rate": 3.2694496770579727e-06, "loss": 2.1234, "step": 8480 }, { "epoch": 1.4514804039021052, "grad_norm": 5.232955455780029, "learning_rate": 3.262272420752307e-06, "loss": 0.3187, "step": 8481 }, { "epoch": 1.451651548861886, "grad_norm": 2.305129289627075, "learning_rate": 3.2551020897233914e-06, "loss": 0.2394, "step": 8482 }, { "epoch": 1.451822693821667, "grad_norm": 2.379843235015869, "learning_rate": 3.247938688201742e-06, "loss": 0.2671, "step": 8483 }, { "epoch": 1.451993838781448, "grad_norm": 17.739065170288086, "learning_rate": 3.240782220413765e-06, "loss": 1.3484, "step": 8484 }, { "epoch": 1.4521649837412287, "grad_norm": 17.64982032775879, "learning_rate": 3.2336326905817978e-06, "loss": 1.6587, "step": 8485 }, { "epoch": 1.4523361287010097, "grad_norm": 13.729183197021484, "learning_rate": 3.226490102924064e-06, "loss": 1.1662, "step": 8486 }, { "epoch": 1.4525072736607907, "grad_norm": 3.2244491577148438, "learning_rate": 3.21935446165471e-06, "loss": 0.2813, "step": 8487 }, { "epoch": 1.4526784186205717, "grad_norm": 14.36533260345459, "learning_rate": 3.212225770983771e-06, "loss": 1.0181, "step": 8488 }, { "epoch": 1.4528495635803527, "grad_norm": 4.451156139373779, "learning_rate": 3.2051040351171793e-06, "loss": 0.2937, "step": 8489 }, { "epoch": 1.4530207085401334, "grad_norm": 10.659801483154297, "learning_rate": 3.197989258256773e-06, "loss": 0.8253, "step": 8490 }, { "epoch": 1.4531918534999144, "grad_norm": 67.64859008789062, "learning_rate": 3.190881444600289e-06, "loss": 7.7092, "step": 8491 }, { "epoch": 1.4533629984596954, "grad_norm": 17.89799690246582, "learning_rate": 3.1837805983413382e-06, "loss": 2.0522, "step": 8492 }, { "epoch": 1.4535341434194762, "grad_norm": 10.325241088867188, "learning_rate": 3.176686723669438e-06, "loss": 0.76, "step": 8493 }, { 
"epoch": 1.4537052883792572, "grad_norm": 20.618074417114258, "learning_rate": 3.1695998247699774e-06, "loss": 5.0703, "step": 8494 }, { "epoch": 1.4538764333390382, "grad_norm": 11.815807342529297, "learning_rate": 3.1625199058242455e-06, "loss": 0.9483, "step": 8495 }, { "epoch": 1.4540475782988191, "grad_norm": 0.6206079125404358, "learning_rate": 3.1554469710094e-06, "loss": 0.1304, "step": 8496 }, { "epoch": 1.4542187232586001, "grad_norm": 10.281078338623047, "learning_rate": 3.1483810244984806e-06, "loss": 0.7949, "step": 8497 }, { "epoch": 1.454389868218381, "grad_norm": 11.412726402282715, "learning_rate": 3.1413220704604133e-06, "loss": 0.9857, "step": 8498 }, { "epoch": 1.4545610131781619, "grad_norm": 21.183530807495117, "learning_rate": 3.1342701130599823e-06, "loss": 2.652, "step": 8499 }, { "epoch": 1.4547321581379429, "grad_norm": 13.151970863342285, "learning_rate": 3.1272251564578587e-06, "loss": 0.9153, "step": 8500 }, { "epoch": 1.4549033030977236, "grad_norm": 25.7855224609375, "learning_rate": 3.1201872048105813e-06, "loss": 5.2122, "step": 8501 }, { "epoch": 1.4550744480575046, "grad_norm": 16.713407516479492, "learning_rate": 3.1131562622705432e-06, "loss": 1.2021, "step": 8502 }, { "epoch": 1.4552455930172856, "grad_norm": 0.524751603603363, "learning_rate": 3.1061323329860185e-06, "loss": 0.1278, "step": 8503 }, { "epoch": 1.4554167379770666, "grad_norm": 11.288585662841797, "learning_rate": 3.0991154211011303e-06, "loss": 1.0099, "step": 8504 }, { "epoch": 1.4555878829368476, "grad_norm": 16.54046058654785, "learning_rate": 3.0921055307558714e-06, "loss": 1.4352, "step": 8505 }, { "epoch": 1.4557590278966284, "grad_norm": 15.869155883789062, "learning_rate": 3.0851026660860844e-06, "loss": 1.9399, "step": 8506 }, { "epoch": 1.4559301728564094, "grad_norm": 21.86159896850586, "learning_rate": 3.0781068312234732e-06, "loss": 2.5342, "step": 8507 }, { "epoch": 1.4561013178161903, "grad_norm": 13.476174354553223, "learning_rate": 
3.0711180302955888e-06, "loss": 1.2246, "step": 8508 }, { "epoch": 1.456272462775971, "grad_norm": 14.098896026611328, "learning_rate": 3.0641362674258305e-06, "loss": 1.0533, "step": 8509 }, { "epoch": 1.456443607735752, "grad_norm": 9.295037269592285, "learning_rate": 3.057161546733457e-06, "loss": 0.7267, "step": 8510 }, { "epoch": 1.456614752695533, "grad_norm": 20.52536392211914, "learning_rate": 3.050193872333554e-06, "loss": 2.0941, "step": 8511 }, { "epoch": 1.456785897655314, "grad_norm": 22.563631057739258, "learning_rate": 3.043233248337066e-06, "loss": 5.463, "step": 8512 }, { "epoch": 1.456957042615095, "grad_norm": 13.759818077087402, "learning_rate": 3.036279678850776e-06, "loss": 1.2918, "step": 8513 }, { "epoch": 1.457128187574876, "grad_norm": 11.931748390197754, "learning_rate": 3.0293331679772893e-06, "loss": 1.1969, "step": 8514 }, { "epoch": 1.4572993325346568, "grad_norm": 8.23759651184082, "learning_rate": 3.0223937198150675e-06, "loss": 0.6667, "step": 8515 }, { "epoch": 1.4574704774944378, "grad_norm": 21.343276977539062, "learning_rate": 3.015461338458386e-06, "loss": 1.5682, "step": 8516 }, { "epoch": 1.4576416224542188, "grad_norm": 4.6625142097473145, "learning_rate": 3.0085360279973707e-06, "loss": 0.2648, "step": 8517 }, { "epoch": 1.4578127674139996, "grad_norm": 0.3689694404602051, "learning_rate": 3.0016177925179555e-06, "loss": 0.1261, "step": 8518 }, { "epoch": 1.4579839123737806, "grad_norm": 5.428666114807129, "learning_rate": 2.994706636101918e-06, "loss": 0.578, "step": 8519 }, { "epoch": 1.4581550573335615, "grad_norm": 76.22441101074219, "learning_rate": 2.9878025628268467e-06, "loss": 8.0354, "step": 8520 }, { "epoch": 1.4583262022933425, "grad_norm": 20.39149284362793, "learning_rate": 2.98090557676615e-06, "loss": 1.7984, "step": 8521 }, { "epoch": 1.4584973472531235, "grad_norm": 15.403654098510742, "learning_rate": 2.974015681989063e-06, "loss": 0.8956, "step": 8522 }, { "epoch": 1.4586684922129043, "grad_norm": 
17.665454864501953, "learning_rate": 2.9671328825606414e-06, "loss": 1.8613, "step": 8523 }, { "epoch": 1.4588396371726853, "grad_norm": 16.507240295410156, "learning_rate": 2.9602571825417383e-06, "loss": 1.4073, "step": 8524 }, { "epoch": 1.4590107821324663, "grad_norm": 12.627859115600586, "learning_rate": 2.953388585989036e-06, "loss": 1.0848, "step": 8525 }, { "epoch": 1.459181927092247, "grad_norm": 10.567094802856445, "learning_rate": 2.946527096955005e-06, "loss": 0.7461, "step": 8526 }, { "epoch": 1.459353072052028, "grad_norm": 7.751945972442627, "learning_rate": 2.9396727194879446e-06, "loss": 0.7472, "step": 8527 }, { "epoch": 1.459524217011809, "grad_norm": 0.37635478377342224, "learning_rate": 2.932825457631943e-06, "loss": 0.118, "step": 8528 }, { "epoch": 1.45969536197159, "grad_norm": 12.689519882202148, "learning_rate": 2.92598531542689e-06, "loss": 0.8756, "step": 8529 }, { "epoch": 1.459866506931371, "grad_norm": 0.5429931879043579, "learning_rate": 2.9191522969084895e-06, "loss": 0.1225, "step": 8530 }, { "epoch": 1.4600376518911518, "grad_norm": 6.63957405090332, "learning_rate": 2.9123264061082245e-06, "loss": 0.5892, "step": 8531 }, { "epoch": 1.4602087968509327, "grad_norm": 18.1749267578125, "learning_rate": 2.9055076470533786e-06, "loss": 1.3724, "step": 8532 }, { "epoch": 1.4603799418107137, "grad_norm": 13.266453742980957, "learning_rate": 2.898696023767044e-06, "loss": 1.0603, "step": 8533 }, { "epoch": 1.4605510867704945, "grad_norm": 20.12327766418457, "learning_rate": 2.8918915402680758e-06, "loss": 1.895, "step": 8534 }, { "epoch": 1.4607222317302755, "grad_norm": 18.49260139465332, "learning_rate": 2.8850942005711372e-06, "loss": 1.4397, "step": 8535 }, { "epoch": 1.4608933766900565, "grad_norm": 10.08874797821045, "learning_rate": 2.8783040086866656e-06, "loss": 0.7415, "step": 8536 }, { "epoch": 1.4610645216498375, "grad_norm": 27.016815185546875, "learning_rate": 2.8715209686208783e-06, "loss": 2.752, "step": 8537 }, { "epoch": 
1.4612356666096185, "grad_norm": 15.482584953308105, "learning_rate": 2.86474508437579e-06, "loss": 0.9181, "step": 8538 }, { "epoch": 1.4614068115693992, "grad_norm": 3.8112120628356934, "learning_rate": 2.8579763599491715e-06, "loss": 0.2904, "step": 8539 }, { "epoch": 1.4615779565291802, "grad_norm": 13.542598724365234, "learning_rate": 2.8512147993345898e-06, "loss": 1.2935, "step": 8540 }, { "epoch": 1.4617491014889612, "grad_norm": 10.883723258972168, "learning_rate": 2.8444604065213693e-06, "loss": 0.6941, "step": 8541 }, { "epoch": 1.461920246448742, "grad_norm": 14.303625106811523, "learning_rate": 2.8377131854946162e-06, "loss": 1.1057, "step": 8542 }, { "epoch": 1.462091391408523, "grad_norm": 12.465928077697754, "learning_rate": 2.8309731402351957e-06, "loss": 0.9065, "step": 8543 }, { "epoch": 1.462262536368304, "grad_norm": 12.034969329833984, "learning_rate": 2.824240274719748e-06, "loss": 0.8766, "step": 8544 }, { "epoch": 1.462433681328085, "grad_norm": 13.294035911560059, "learning_rate": 2.8175145929206762e-06, "loss": 0.9318, "step": 8545 }, { "epoch": 1.462604826287866, "grad_norm": 17.143617630004883, "learning_rate": 2.8107960988061376e-06, "loss": 1.6987, "step": 8546 }, { "epoch": 1.4627759712476467, "grad_norm": 20.387392044067383, "learning_rate": 2.804084796340059e-06, "loss": 1.8917, "step": 8547 }, { "epoch": 1.4629471162074277, "grad_norm": 2.370588541030884, "learning_rate": 2.797380689482116e-06, "loss": 0.2348, "step": 8548 }, { "epoch": 1.4631182611672087, "grad_norm": 12.722402572631836, "learning_rate": 2.7906837821877373e-06, "loss": 0.9854, "step": 8549 }, { "epoch": 1.4632894061269894, "grad_norm": 17.461929321289062, "learning_rate": 2.783994078408118e-06, "loss": 1.605, "step": 8550 }, { "epoch": 1.4634605510867704, "grad_norm": 11.405351638793945, "learning_rate": 2.777311582090181e-06, "loss": 0.8213, "step": 8551 }, { "epoch": 1.4636316960465514, "grad_norm": 25.37442970275879, "learning_rate": 2.7706362971766212e-06, 
"loss": 5.4851, "step": 8552 }, { "epoch": 1.4638028410063324, "grad_norm": 15.426714897155762, "learning_rate": 2.7639682276058583e-06, "loss": 1.2276, "step": 8553 }, { "epoch": 1.4639739859661134, "grad_norm": 3.5394811630249023, "learning_rate": 2.7573073773120645e-06, "loss": 0.3064, "step": 8554 }, { "epoch": 1.4641451309258942, "grad_norm": 13.284134864807129, "learning_rate": 2.7506537502251582e-06, "loss": 0.9739, "step": 8555 }, { "epoch": 1.4643162758856751, "grad_norm": 12.07269287109375, "learning_rate": 2.7440073502707796e-06, "loss": 0.9605, "step": 8556 }, { "epoch": 1.4644874208454561, "grad_norm": 16.062152862548828, "learning_rate": 2.737368181370323e-06, "loss": 1.4946, "step": 8557 }, { "epoch": 1.464658565805237, "grad_norm": 15.084619522094727, "learning_rate": 2.730736247440901e-06, "loss": 1.3072, "step": 8558 }, { "epoch": 1.464829710765018, "grad_norm": 18.281511306762695, "learning_rate": 2.7241115523953707e-06, "loss": 2.1375, "step": 8559 }, { "epoch": 1.4650008557247989, "grad_norm": 11.279488563537598, "learning_rate": 2.7174941001423083e-06, "loss": 1.1234, "step": 8560 }, { "epoch": 1.4651720006845799, "grad_norm": 10.39518928527832, "learning_rate": 2.710883894586018e-06, "loss": 0.9096, "step": 8561 }, { "epoch": 1.4653431456443609, "grad_norm": 14.887674331665039, "learning_rate": 2.7042809396265377e-06, "loss": 1.049, "step": 8562 }, { "epoch": 1.4655142906041418, "grad_norm": 15.582239151000977, "learning_rate": 2.697685239159614e-06, "loss": 1.2409, "step": 8563 }, { "epoch": 1.4656854355639226, "grad_norm": 22.435829162597656, "learning_rate": 2.691096797076726e-06, "loss": 2.2822, "step": 8564 }, { "epoch": 1.4658565805237036, "grad_norm": 5.671708106994629, "learning_rate": 2.6845156172650536e-06, "loss": 0.3486, "step": 8565 }, { "epoch": 1.4660277254834846, "grad_norm": 0.4237624406814575, "learning_rate": 2.677941703607515e-06, "loss": 0.1282, "step": 8566 }, { "epoch": 1.4661988704432654, "grad_norm": 
12.79686164855957, "learning_rate": 2.6713750599827287e-06, "loss": 0.8523, "step": 8567 }, { "epoch": 1.4663700154030463, "grad_norm": 0.37470367550849915, "learning_rate": 2.664815690265019e-06, "loss": 0.1181, "step": 8568 }, { "epoch": 1.4665411603628273, "grad_norm": 12.265129089355469, "learning_rate": 2.6582635983244203e-06, "loss": 1.1186, "step": 8569 }, { "epoch": 1.4667123053226083, "grad_norm": 11.174092292785645, "learning_rate": 2.6517187880266853e-06, "loss": 1.097, "step": 8570 }, { "epoch": 1.4668834502823893, "grad_norm": 22.1987247467041, "learning_rate": 2.645181263233255e-06, "loss": 2.09, "step": 8571 }, { "epoch": 1.46705459524217, "grad_norm": 18.45955467224121, "learning_rate": 2.6386510278012844e-06, "loss": 1.7561, "step": 8572 }, { "epoch": 1.467225740201951, "grad_norm": 0.5314795970916748, "learning_rate": 2.632128085583616e-06, "loss": 0.1276, "step": 8573 }, { "epoch": 1.467396885161732, "grad_norm": 11.474678039550781, "learning_rate": 2.6256124404288017e-06, "loss": 0.8089, "step": 8574 }, { "epoch": 1.4675680301215128, "grad_norm": 13.592512130737305, "learning_rate": 2.6191040961810716e-06, "loss": 1.0905, "step": 8575 }, { "epoch": 1.4677391750812938, "grad_norm": 17.502023696899414, "learning_rate": 2.6126030566803714e-06, "loss": 1.3819, "step": 8576 }, { "epoch": 1.4679103200410748, "grad_norm": 5.090403079986572, "learning_rate": 2.606109325762316e-06, "loss": 0.3459, "step": 8577 }, { "epoch": 1.4680814650008558, "grad_norm": 12.40960693359375, "learning_rate": 2.599622907258223e-06, "loss": 1.0282, "step": 8578 }, { "epoch": 1.4682526099606368, "grad_norm": 18.674455642700195, "learning_rate": 2.5931438049950794e-06, "loss": 1.7005, "step": 8579 }, { "epoch": 1.4684237549204175, "grad_norm": 15.828418731689453, "learning_rate": 2.586672022795575e-06, "loss": 1.1007, "step": 8580 }, { "epoch": 1.4685948998801985, "grad_norm": 27.76209259033203, "learning_rate": 2.5802075644780626e-06, "loss": 5.3068, "step": 8581 }, { 
"epoch": 1.4687660448399795, "grad_norm": 8.822689056396484, "learning_rate": 2.5737504338565887e-06, "loss": 0.6865, "step": 8582 }, { "epoch": 1.4689371897997603, "grad_norm": 22.21077537536621, "learning_rate": 2.5673006347408658e-06, "loss": 5.1728, "step": 8583 }, { "epoch": 1.4691083347595413, "grad_norm": 3.085292339324951, "learning_rate": 2.5608581709362878e-06, "loss": 0.2355, "step": 8584 }, { "epoch": 1.4692794797193223, "grad_norm": 0.7077583074569702, "learning_rate": 2.5544230462439175e-06, "loss": 0.1373, "step": 8585 }, { "epoch": 1.4694506246791033, "grad_norm": 15.744388580322266, "learning_rate": 2.5479952644604786e-06, "loss": 1.1297, "step": 8586 }, { "epoch": 1.4696217696388842, "grad_norm": 15.804190635681152, "learning_rate": 2.5415748293783887e-06, "loss": 1.2382, "step": 8587 }, { "epoch": 1.469792914598665, "grad_norm": 0.848552942276001, "learning_rate": 2.5351617447857057e-06, "loss": 0.1375, "step": 8588 }, { "epoch": 1.469964059558446, "grad_norm": 11.672784805297852, "learning_rate": 2.5287560144661563e-06, "loss": 1.0634, "step": 8589 }, { "epoch": 1.470135204518227, "grad_norm": 12.981428146362305, "learning_rate": 2.5223576421991362e-06, "loss": 1.2401, "step": 8590 }, { "epoch": 1.4703063494780078, "grad_norm": 10.640856742858887, "learning_rate": 2.51596663175969e-06, "loss": 0.9131, "step": 8591 }, { "epoch": 1.4704774944377887, "grad_norm": 8.373154640197754, "learning_rate": 2.509582986918527e-06, "loss": 0.622, "step": 8592 }, { "epoch": 1.4706486393975697, "grad_norm": 59.0521354675293, "learning_rate": 2.503206711442004e-06, "loss": 7.9709, "step": 8593 }, { "epoch": 1.4708197843573507, "grad_norm": 0.8816530704498291, "learning_rate": 2.4968378090921375e-06, "loss": 0.1537, "step": 8594 }, { "epoch": 1.4709909293171317, "grad_norm": 15.040864944458008, "learning_rate": 2.4904762836265873e-06, "loss": 1.1132, "step": 8595 }, { "epoch": 1.4711620742769125, "grad_norm": 7.551300048828125, "learning_rate": 
2.4841221387986577e-06, "loss": 0.7954, "step": 8596 }, { "epoch": 1.4713332192366935, "grad_norm": 20.366872787475586, "learning_rate": 2.4777753783573078e-06, "loss": 4.8486, "step": 8597 }, { "epoch": 1.4715043641964745, "grad_norm": 17.65959930419922, "learning_rate": 2.4714360060471375e-06, "loss": 1.2182, "step": 8598 }, { "epoch": 1.4716755091562552, "grad_norm": 18.21508026123047, "learning_rate": 2.4651040256083857e-06, "loss": 1.6719, "step": 8599 }, { "epoch": 1.4718466541160362, "grad_norm": 22.4281063079834, "learning_rate": 2.4587794407769304e-06, "loss": 2.8263, "step": 8600 }, { "epoch": 1.4720177990758172, "grad_norm": 16.28119659423828, "learning_rate": 2.452462255284282e-06, "loss": 1.1152, "step": 8601 }, { "epoch": 1.4721889440355982, "grad_norm": 16.165203094482422, "learning_rate": 2.446152472857595e-06, "loss": 0.9293, "step": 8602 }, { "epoch": 1.4723600889953792, "grad_norm": 2.8993372917175293, "learning_rate": 2.4398500972196423e-06, "loss": 0.2628, "step": 8603 }, { "epoch": 1.47253123395516, "grad_norm": 18.692373275756836, "learning_rate": 2.433555132088846e-06, "loss": 1.4589, "step": 8604 }, { "epoch": 1.472702378914941, "grad_norm": 0.8061926960945129, "learning_rate": 2.4272675811792348e-06, "loss": 0.1395, "step": 8605 }, { "epoch": 1.472873523874722, "grad_norm": 12.81713581085205, "learning_rate": 2.42098744820048e-06, "loss": 1.1243, "step": 8606 }, { "epoch": 1.4730446688345027, "grad_norm": 13.554957389831543, "learning_rate": 2.414714736857868e-06, "loss": 0.753, "step": 8607 }, { "epoch": 1.4732158137942837, "grad_norm": 16.154888153076172, "learning_rate": 2.408449450852297e-06, "loss": 1.0956, "step": 8608 }, { "epoch": 1.4733869587540647, "grad_norm": 3.45200777053833, "learning_rate": 2.4021915938803094e-06, "loss": 0.2685, "step": 8609 }, { "epoch": 1.4735581037138457, "grad_norm": 14.280316352844238, "learning_rate": 2.3959411696340507e-06, "loss": 1.3286, "step": 8610 }, { "epoch": 1.4737292486736266, "grad_norm": 
16.223106384277344, "learning_rate": 2.3896981818012697e-06, "loss": 1.2277, "step": 8611 }, { "epoch": 1.4739003936334074, "grad_norm": 0.4728884696960449, "learning_rate": 2.3834626340653476e-06, "loss": 0.1268, "step": 8612 }, { "epoch": 1.4740715385931884, "grad_norm": 0.5109286308288574, "learning_rate": 2.3772345301052595e-06, "loss": 0.1254, "step": 8613 }, { "epoch": 1.4742426835529694, "grad_norm": 14.686467170715332, "learning_rate": 2.3710138735956044e-06, "loss": 1.4269, "step": 8614 }, { "epoch": 1.4744138285127504, "grad_norm": 10.182634353637695, "learning_rate": 2.36480066820657e-06, "loss": 0.9876, "step": 8615 }, { "epoch": 1.4745849734725311, "grad_norm": 0.5462074279785156, "learning_rate": 2.3585949176039652e-06, "loss": 0.1283, "step": 8616 }, { "epoch": 1.4747561184323121, "grad_norm": 34.710514068603516, "learning_rate": 2.3523966254491863e-06, "loss": 5.5256, "step": 8617 }, { "epoch": 1.4749272633920931, "grad_norm": 3.7688565254211426, "learning_rate": 2.34620579539923e-06, "loss": 0.3516, "step": 8618 }, { "epoch": 1.4750984083518741, "grad_norm": 4.640887260437012, "learning_rate": 2.340022431106706e-06, "loss": 0.483, "step": 8619 }, { "epoch": 1.475269553311655, "grad_norm": 20.615394592285156, "learning_rate": 2.3338465362198074e-06, "loss": 2.5548, "step": 8620 }, { "epoch": 1.4754406982714359, "grad_norm": 0.4951671361923218, "learning_rate": 2.327678114382315e-06, "loss": 0.1262, "step": 8621 }, { "epoch": 1.4756118432312169, "grad_norm": 5.877453327178955, "learning_rate": 2.321517169233614e-06, "loss": 0.5921, "step": 8622 }, { "epoch": 1.4757829881909978, "grad_norm": 19.228952407836914, "learning_rate": 2.3153637044086616e-06, "loss": 2.3989, "step": 8623 }, { "epoch": 1.4759541331507786, "grad_norm": 29.218852996826172, "learning_rate": 2.3092177235380225e-06, "loss": 5.3795, "step": 8624 }, { "epoch": 1.4761252781105596, "grad_norm": 13.688109397888184, "learning_rate": 2.303079230247827e-06, "loss": 1.3003, "step": 8625 }, 
{ "epoch": 1.4762964230703406, "grad_norm": 26.70513343811035, "learning_rate": 2.2969482281597953e-06, "loss": 5.4995, "step": 8626 }, { "epoch": 1.4764675680301216, "grad_norm": 0.4475199282169342, "learning_rate": 2.2908247208912337e-06, "loss": 0.1284, "step": 8627 }, { "epoch": 1.4766387129899026, "grad_norm": 13.632052421569824, "learning_rate": 2.284708712055012e-06, "loss": 1.2663, "step": 8628 }, { "epoch": 1.4768098579496833, "grad_norm": 1.8141242265701294, "learning_rate": 2.278600205259589e-06, "loss": 0.2333, "step": 8629 }, { "epoch": 1.4769810029094643, "grad_norm": 15.47907829284668, "learning_rate": 2.2724992041089965e-06, "loss": 1.2852, "step": 8630 }, { "epoch": 1.4771521478692453, "grad_norm": 21.15369987487793, "learning_rate": 2.266405712202827e-06, "loss": 1.6042, "step": 8631 }, { "epoch": 1.477323292829026, "grad_norm": 17.852474212646484, "learning_rate": 2.2603197331362564e-06, "loss": 1.5289, "step": 8632 }, { "epoch": 1.477494437788807, "grad_norm": 0.7824785709381104, "learning_rate": 2.2542412705000153e-06, "loss": 0.1304, "step": 8633 }, { "epoch": 1.477665582748588, "grad_norm": 9.270437240600586, "learning_rate": 2.248170327880414e-06, "loss": 1.1586, "step": 8634 }, { "epoch": 1.477836727708369, "grad_norm": 6.792728900909424, "learning_rate": 2.2421069088593083e-06, "loss": 0.5544, "step": 8635 }, { "epoch": 1.47800787266815, "grad_norm": 0.3804835081100464, "learning_rate": 2.2360510170141316e-06, "loss": 0.1178, "step": 8636 }, { "epoch": 1.4781790176279308, "grad_norm": 12.946402549743652, "learning_rate": 2.2300026559178667e-06, "loss": 1.2082, "step": 8637 }, { "epoch": 1.4783501625877118, "grad_norm": 14.585646629333496, "learning_rate": 2.223961829139051e-06, "loss": 0.9811, "step": 8638 }, { "epoch": 1.4785213075474928, "grad_norm": 5.403097629547119, "learning_rate": 2.21792854024179e-06, "loss": 0.2892, "step": 8639 }, { "epoch": 1.4786924525072735, "grad_norm": 12.016549110412598, "learning_rate": 
2.211902792785725e-06, "loss": 0.9699, "step": 8640 }, { "epoch": 1.4788635974670545, "grad_norm": 42.36448669433594, "learning_rate": 2.2058845903260595e-06, "loss": 7.0021, "step": 8641 }, { "epoch": 1.4790347424268355, "grad_norm": 18.061737060546875, "learning_rate": 2.1998739364135446e-06, "loss": 2.103, "step": 8642 }, { "epoch": 1.4792058873866165, "grad_norm": 11.131844520568848, "learning_rate": 2.1938708345944703e-06, "loss": 0.8494, "step": 8643 }, { "epoch": 1.4793770323463975, "grad_norm": 23.128252029418945, "learning_rate": 2.18787528841068e-06, "loss": 3.2058, "step": 8644 }, { "epoch": 1.4795481773061783, "grad_norm": 16.91937828063965, "learning_rate": 2.1818873013995495e-06, "loss": 1.1437, "step": 8645 }, { "epoch": 1.4797193222659593, "grad_norm": 10.451518058776855, "learning_rate": 2.175906877094007e-06, "loss": 0.6853, "step": 8646 }, { "epoch": 1.4798904672257402, "grad_norm": 4.0505146980285645, "learning_rate": 2.1699340190225057e-06, "loss": 0.3101, "step": 8647 }, { "epoch": 1.480061612185521, "grad_norm": 23.27374267578125, "learning_rate": 2.163968730709045e-06, "loss": 5.2235, "step": 8648 }, { "epoch": 1.480232757145302, "grad_norm": 10.298914909362793, "learning_rate": 2.1580110156731525e-06, "loss": 1.0408, "step": 8649 }, { "epoch": 1.480403902105083, "grad_norm": 9.977930068969727, "learning_rate": 2.1520608774298815e-06, "loss": 0.9702, "step": 8650 }, { "epoch": 1.480575047064864, "grad_norm": 17.494873046875, "learning_rate": 2.1461183194898325e-06, "loss": 1.6695, "step": 8651 }, { "epoch": 1.480746192024645, "grad_norm": 22.446674346923828, "learning_rate": 2.140183345359124e-06, "loss": 3.1251, "step": 8652 }, { "epoch": 1.4809173369844257, "grad_norm": 12.42232608795166, "learning_rate": 2.1342559585393933e-06, "loss": 0.8405, "step": 8653 }, { "epoch": 1.4810884819442067, "grad_norm": 1.9596792459487915, "learning_rate": 2.1283361625278113e-06, "loss": 0.2301, "step": 8654 }, { "epoch": 1.4812596269039877, "grad_norm": 
4.211131572723389, "learning_rate": 2.1224239608170644e-06, "loss": 0.3821, "step": 8655 }, { "epoch": 1.4814307718637685, "grad_norm": 0.6189964413642883, "learning_rate": 2.1165193568953633e-06, "loss": 0.1314, "step": 8656 }, { "epoch": 1.4816019168235495, "grad_norm": 16.088436126708984, "learning_rate": 2.1106223542464304e-06, "loss": 1.3685, "step": 8657 }, { "epoch": 1.4817730617833305, "grad_norm": 17.937074661254883, "learning_rate": 2.1047329563495036e-06, "loss": 1.8444, "step": 8658 }, { "epoch": 1.4819442067431114, "grad_norm": 11.515583038330078, "learning_rate": 2.098851166679344e-06, "loss": 1.2523, "step": 8659 }, { "epoch": 1.4821153517028924, "grad_norm": 19.75164031982422, "learning_rate": 2.0929769887062074e-06, "loss": 1.6112, "step": 8660 }, { "epoch": 1.4822864966626732, "grad_norm": 14.153520584106445, "learning_rate": 2.087110425895869e-06, "loss": 1.2514, "step": 8661 }, { "epoch": 1.4824576416224542, "grad_norm": 13.268850326538086, "learning_rate": 2.081251481709619e-06, "loss": 1.0761, "step": 8662 }, { "epoch": 1.4826287865822352, "grad_norm": 2.862231492996216, "learning_rate": 2.075400159604234e-06, "loss": 0.3298, "step": 8663 }, { "epoch": 1.4827999315420162, "grad_norm": 40.3896484375, "learning_rate": 2.0695564630320083e-06, "loss": 6.5952, "step": 8664 }, { "epoch": 1.482971076501797, "grad_norm": 5.492036819458008, "learning_rate": 2.06372039544073e-06, "loss": 0.5523, "step": 8665 }, { "epoch": 1.483142221461578, "grad_norm": 14.369952201843262, "learning_rate": 2.0578919602736813e-06, "loss": 1.6392, "step": 8666 }, { "epoch": 1.483313366421359, "grad_norm": 1.8615891933441162, "learning_rate": 2.0520711609696573e-06, "loss": 0.2698, "step": 8667 }, { "epoch": 1.48348451138114, "grad_norm": 24.372041702270508, "learning_rate": 2.04625800096293e-06, "loss": 5.1996, "step": 8668 }, { "epoch": 1.483655656340921, "grad_norm": 0.42217203974723816, "learning_rate": 2.040452483683279e-06, "loss": 0.124, "step": 8669 }, { "epoch": 
1.4838268013007017, "grad_norm": 18.064851760864258, "learning_rate": 2.0346546125559622e-06, "loss": 1.4861, "step": 8670 }, { "epoch": 1.4839979462604826, "grad_norm": 13.571256637573242, "learning_rate": 2.0288643910017405e-06, "loss": 1.5556, "step": 8671 }, { "epoch": 1.4841690912202636, "grad_norm": 4.089590072631836, "learning_rate": 2.023081822436843e-06, "loss": 0.3336, "step": 8672 }, { "epoch": 1.4843402361800444, "grad_norm": 11.174686431884766, "learning_rate": 2.0173069102729983e-06, "loss": 0.916, "step": 8673 }, { "epoch": 1.4845113811398254, "grad_norm": 19.2801570892334, "learning_rate": 2.0115396579174183e-06, "loss": 2.3274, "step": 8674 }, { "epoch": 1.4846825260996064, "grad_norm": 15.093779563903809, "learning_rate": 2.0057800687727825e-06, "loss": 1.1786, "step": 8675 }, { "epoch": 1.4848536710593874, "grad_norm": 9.493477821350098, "learning_rate": 2.000028146237264e-06, "loss": 0.855, "step": 8676 }, { "epoch": 1.4850248160191684, "grad_norm": 9.813469886779785, "learning_rate": 1.9942838937045015e-06, "loss": 0.7856, "step": 8677 }, { "epoch": 1.4851959609789491, "grad_norm": 8.897295951843262, "learning_rate": 1.98854731456361e-06, "loss": 0.864, "step": 8678 }, { "epoch": 1.4853671059387301, "grad_norm": 13.642171859741211, "learning_rate": 1.982818412199187e-06, "loss": 0.8152, "step": 8679 }, { "epoch": 1.485538250898511, "grad_norm": 15.76082706451416, "learning_rate": 1.9770971899912856e-06, "loss": 1.2928, "step": 8680 }, { "epoch": 1.4857093958582919, "grad_norm": 18.846452713012695, "learning_rate": 1.9713836513154427e-06, "loss": 1.6299, "step": 8681 }, { "epoch": 1.4858805408180729, "grad_norm": 52.782161712646484, "learning_rate": 1.965677799542647e-06, "loss": 7.6975, "step": 8682 }, { "epoch": 1.4860516857778538, "grad_norm": 17.99054527282715, "learning_rate": 1.9599796380393632e-06, "loss": 1.1316, "step": 8683 }, { "epoch": 1.4862228307376348, "grad_norm": 11.52419662475586, "learning_rate": 1.9542891701675206e-06, 
"loss": 1.085, "step": 8684 }, { "epoch": 1.4863939756974158, "grad_norm": 0.395636647939682, "learning_rate": 1.948606399284495e-06, "loss": 0.1223, "step": 8685 }, { "epoch": 1.4865651206571966, "grad_norm": 3.862820625305176, "learning_rate": 1.942931328743135e-06, "loss": 0.305, "step": 8686 }, { "epoch": 1.4867362656169776, "grad_norm": 17.18566131591797, "learning_rate": 1.9372639618917378e-06, "loss": 1.8984, "step": 8687 }, { "epoch": 1.4869074105767586, "grad_norm": 5.835579872131348, "learning_rate": 1.9316043020740616e-06, "loss": 0.5111, "step": 8688 }, { "epoch": 1.4870785555365393, "grad_norm": 13.971384048461914, "learning_rate": 1.9259523526293123e-06, "loss": 1.2442, "step": 8689 }, { "epoch": 1.4872497004963203, "grad_norm": 16.53280258178711, "learning_rate": 1.9203081168921454e-06, "loss": 1.0105, "step": 8690 }, { "epoch": 1.4874208454561013, "grad_norm": 10.08396053314209, "learning_rate": 1.9146715981926743e-06, "loss": 1.1688, "step": 8691 }, { "epoch": 1.4875919904158823, "grad_norm": 21.6230411529541, "learning_rate": 1.909042799856447e-06, "loss": 3.0133, "step": 8692 }, { "epoch": 1.4877631353756633, "grad_norm": 16.009479522705078, "learning_rate": 1.9034217252044617e-06, "loss": 1.4418, "step": 8693 }, { "epoch": 1.487934280335444, "grad_norm": 0.32677701115608215, "learning_rate": 1.897808377553174e-06, "loss": 0.1142, "step": 8694 }, { "epoch": 1.488105425295225, "grad_norm": 12.716803550720215, "learning_rate": 1.8922027602144532e-06, "loss": 1.1369, "step": 8695 }, { "epoch": 1.488276570255006, "grad_norm": 12.640409469604492, "learning_rate": 1.8866048764956318e-06, "loss": 0.9637, "step": 8696 }, { "epoch": 1.4884477152147868, "grad_norm": 21.609838485717773, "learning_rate": 1.8810147296994663e-06, "loss": 5.1968, "step": 8697 }, { "epoch": 1.4886188601745678, "grad_norm": 12.18921184539795, "learning_rate": 1.8754323231241466e-06, "loss": 0.9312, "step": 8698 }, { "epoch": 1.4887900051343488, "grad_norm": 18.10647964477539, 
"learning_rate": 1.8698576600633066e-06, "loss": 2.0963, "step": 8699 }, { "epoch": 1.4889611500941298, "grad_norm": 13.537436485290527, "learning_rate": 1.864290743806002e-06, "loss": 1.1168, "step": 8700 }, { "epoch": 1.4891322950539108, "grad_norm": 11.2808198928833, "learning_rate": 1.858731577636727e-06, "loss": 0.6953, "step": 8701 }, { "epoch": 1.4893034400136915, "grad_norm": 15.852746963500977, "learning_rate": 1.8531801648353913e-06, "loss": 1.2251, "step": 8702 }, { "epoch": 1.4894745849734725, "grad_norm": 19.878219604492188, "learning_rate": 1.8476365086773417e-06, "loss": 1.6645, "step": 8703 }, { "epoch": 1.4896457299332535, "grad_norm": 26.932106018066406, "learning_rate": 1.8421006124333368e-06, "loss": 5.3491, "step": 8704 }, { "epoch": 1.4898168748930343, "grad_norm": 20.04827880859375, "learning_rate": 1.8365724793695754e-06, "loss": 2.6135, "step": 8705 }, { "epoch": 1.4899880198528153, "grad_norm": 3.5480690002441406, "learning_rate": 1.8310521127476538e-06, "loss": 0.2846, "step": 8706 }, { "epoch": 1.4901591648125962, "grad_norm": 13.716347694396973, "learning_rate": 1.8255395158246046e-06, "loss": 0.9585, "step": 8707 }, { "epoch": 1.4903303097723772, "grad_norm": 9.723167419433594, "learning_rate": 1.8200346918528598e-06, "loss": 0.8903, "step": 8708 }, { "epoch": 1.4905014547321582, "grad_norm": 16.154159545898438, "learning_rate": 1.8145376440802813e-06, "loss": 1.6429, "step": 8709 }, { "epoch": 1.490672599691939, "grad_norm": 0.3765588700771332, "learning_rate": 1.8090483757501281e-06, "loss": 0.1177, "step": 8710 }, { "epoch": 1.49084374465172, "grad_norm": 14.236617088317871, "learning_rate": 1.8035668901010844e-06, "loss": 1.093, "step": 8711 }, { "epoch": 1.491014889611501, "grad_norm": 13.419577598571777, "learning_rate": 1.79809319036723e-06, "loss": 0.9498, "step": 8712 }, { "epoch": 1.4911860345712817, "grad_norm": 11.613882064819336, "learning_rate": 1.7926272797780546e-06, "loss": 0.954, "step": 8713 }, { "epoch": 
1.4913571795310627, "grad_norm": 15.349943161010742, "learning_rate": 1.7871691615584607e-06, "loss": 1.1268, "step": 8714 }, { "epoch": 1.4915283244908437, "grad_norm": 1.526613473892212, "learning_rate": 1.7817188389287337e-06, "loss": 0.2058, "step": 8715 }, { "epoch": 1.4916994694506247, "grad_norm": 12.078264236450195, "learning_rate": 1.7762763151045863e-06, "loss": 0.7651, "step": 8716 }, { "epoch": 1.4918706144104057, "grad_norm": 0.5502970814704895, "learning_rate": 1.7708415932971083e-06, "loss": 0.1337, "step": 8717 }, { "epoch": 1.4920417593701867, "grad_norm": 9.057726860046387, "learning_rate": 1.7654146767127915e-06, "loss": 0.8282, "step": 8718 }, { "epoch": 1.4922129043299674, "grad_norm": 25.093358993530273, "learning_rate": 1.759995568553533e-06, "loss": 5.2401, "step": 8719 }, { "epoch": 1.4923840492897484, "grad_norm": 3.6737241744995117, "learning_rate": 1.754584272016605e-06, "loss": 0.2781, "step": 8720 }, { "epoch": 1.4925551942495294, "grad_norm": 11.911739349365234, "learning_rate": 1.7491807902946871e-06, "loss": 0.9151, "step": 8721 }, { "epoch": 1.4927263392093102, "grad_norm": 27.066675186157227, "learning_rate": 1.7437851265758375e-06, "loss": 3.2981, "step": 8722 }, { "epoch": 1.4928974841690912, "grad_norm": 0.41131046414375305, "learning_rate": 1.7383972840435115e-06, "loss": 0.1204, "step": 8723 }, { "epoch": 1.4930686291288722, "grad_norm": 9.278553009033203, "learning_rate": 1.7330172658765391e-06, "loss": 0.7775, "step": 8724 }, { "epoch": 1.4932397740886532, "grad_norm": 0.3530387878417969, "learning_rate": 1.7276450752491352e-06, "loss": 0.117, "step": 8725 }, { "epoch": 1.4934109190484341, "grad_norm": 23.6711368560791, "learning_rate": 1.7222807153309123e-06, "loss": 3.1004, "step": 8726 }, { "epoch": 1.493582064008215, "grad_norm": 2.162137269973755, "learning_rate": 1.7169241892868403e-06, "loss": 0.2361, "step": 8727 }, { "epoch": 1.493753208967996, "grad_norm": 24.51152229309082, "learning_rate": 
1.7115755002772848e-06, "loss": 3.2392, "step": 8728 }, { "epoch": 1.493924353927777, "grad_norm": 8.81994342803955, "learning_rate": 1.7062346514579747e-06, "loss": 0.9084, "step": 8729 }, { "epoch": 1.4940954988875577, "grad_norm": 12.69940185546875, "learning_rate": 1.7009016459800193e-06, "loss": 0.7481, "step": 8730 }, { "epoch": 1.4942666438473386, "grad_norm": 18.715112686157227, "learning_rate": 1.695576486989905e-06, "loss": 2.2736, "step": 8731 }, { "epoch": 1.4944377888071196, "grad_norm": 17.015085220336914, "learning_rate": 1.690259177629475e-06, "loss": 1.2533, "step": 8732 }, { "epoch": 1.4946089337669006, "grad_norm": 21.677297592163086, "learning_rate": 1.6849497210359589e-06, "loss": 3.0261, "step": 8733 }, { "epoch": 1.4947800787266816, "grad_norm": 11.646499633789062, "learning_rate": 1.6796481203419367e-06, "loss": 0.8067, "step": 8734 }, { "epoch": 1.4949512236864624, "grad_norm": 7.48187255859375, "learning_rate": 1.674354378675365e-06, "loss": 0.6705, "step": 8735 }, { "epoch": 1.4951223686462434, "grad_norm": 14.336213111877441, "learning_rate": 1.669068499159554e-06, "loss": 1.146, "step": 8736 }, { "epoch": 1.4952935136060244, "grad_norm": 9.391974449157715, "learning_rate": 1.6637904849131886e-06, "loss": 0.875, "step": 8737 }, { "epoch": 1.4954646585658051, "grad_norm": 15.934311866760254, "learning_rate": 1.6585203390502996e-06, "loss": 1.2158, "step": 8738 }, { "epoch": 1.4956358035255861, "grad_norm": 13.133450508117676, "learning_rate": 1.6532580646802831e-06, "loss": 0.9799, "step": 8739 }, { "epoch": 1.495806948485367, "grad_norm": 13.175771713256836, "learning_rate": 1.6480036649078856e-06, "loss": 0.8717, "step": 8740 }, { "epoch": 1.495978093445148, "grad_norm": 20.734832763671875, "learning_rate": 1.6427571428332171e-06, "loss": 2.6231, "step": 8741 }, { "epoch": 1.496149238404929, "grad_norm": 4.2126970291137695, "learning_rate": 1.6375185015517252e-06, "loss": 0.2936, "step": 8742 }, { "epoch": 1.4963203833647098, 
"grad_norm": 8.44736385345459, "learning_rate": 1.632287744154224e-06, "loss": 0.7255, "step": 8743 }, { "epoch": 1.4964915283244908, "grad_norm": 7.583530902862549, "learning_rate": 1.6270648737268646e-06, "loss": 0.4966, "step": 8744 }, { "epoch": 1.4966626732842718, "grad_norm": 12.269996643066406, "learning_rate": 1.6218498933511438e-06, "loss": 1.023, "step": 8745 }, { "epoch": 1.4968338182440526, "grad_norm": 12.351924896240234, "learning_rate": 1.6166428061039174e-06, "loss": 1.0181, "step": 8746 }, { "epoch": 1.4970049632038336, "grad_norm": 12.050604820251465, "learning_rate": 1.6114436150573607e-06, "loss": 0.9488, "step": 8747 }, { "epoch": 1.4971761081636146, "grad_norm": 10.327962875366211, "learning_rate": 1.6062523232790172e-06, "loss": 0.7329, "step": 8748 }, { "epoch": 1.4973472531233956, "grad_norm": 0.43802082538604736, "learning_rate": 1.6010689338317564e-06, "loss": 0.1212, "step": 8749 }, { "epoch": 1.4975183980831765, "grad_norm": 10.928634643554688, "learning_rate": 1.595893449773777e-06, "loss": 0.8106, "step": 8750 }, { "epoch": 1.4976895430429573, "grad_norm": 11.663643836975098, "learning_rate": 1.5907258741586316e-06, "loss": 0.9545, "step": 8751 }, { "epoch": 1.4978606880027383, "grad_norm": 8.708173751831055, "learning_rate": 1.5855662100351897e-06, "loss": 0.9869, "step": 8752 }, { "epoch": 1.4980318329625193, "grad_norm": 18.2066707611084, "learning_rate": 1.580414460447666e-06, "loss": 0.5682, "step": 8753 }, { "epoch": 1.4982029779223, "grad_norm": 14.140893936157227, "learning_rate": 1.5752706284355993e-06, "loss": 1.1747, "step": 8754 }, { "epoch": 1.498374122882081, "grad_norm": 9.528616905212402, "learning_rate": 1.5701347170338553e-06, "loss": 0.6355, "step": 8755 }, { "epoch": 1.498545267841862, "grad_norm": 18.601449966430664, "learning_rate": 1.5650067292726332e-06, "loss": 2.5453, "step": 8756 }, { "epoch": 1.498716412801643, "grad_norm": 19.107868194580078, "learning_rate": 1.5598866681774481e-06, "loss": 2.7961, "step": 
8757 }, { "epoch": 1.498887557761424, "grad_norm": 4.660526275634766, "learning_rate": 1.5547745367691486e-06, "loss": 0.3109, "step": 8758 }, { "epoch": 1.4990587027212048, "grad_norm": 13.609661102294922, "learning_rate": 1.5496703380639016e-06, "loss": 1.2268, "step": 8759 }, { "epoch": 1.4992298476809858, "grad_norm": 23.278398513793945, "learning_rate": 1.5445740750731852e-06, "loss": 3.1897, "step": 8760 }, { "epoch": 1.4994009926407668, "grad_norm": 5.577650547027588, "learning_rate": 1.539485750803809e-06, "loss": 0.3686, "step": 8761 }, { "epoch": 1.4995721376005475, "grad_norm": 14.022244453430176, "learning_rate": 1.5344053682578869e-06, "loss": 1.1382, "step": 8762 }, { "epoch": 1.4997432825603285, "grad_norm": 20.950279235839844, "learning_rate": 1.5293329304328558e-06, "loss": 1.6109, "step": 8763 }, { "epoch": 1.4999144275201095, "grad_norm": 5.289403915405273, "learning_rate": 1.5242684403214569e-06, "loss": 0.6656, "step": 8764 }, { "epoch": 1.5000855724798905, "grad_norm": 3.436375856399536, "learning_rate": 1.5192119009117534e-06, "loss": 0.4689, "step": 8765 }, { "epoch": 1.5002567174396715, "grad_norm": 30.948457717895508, "learning_rate": 1.5141633151871054e-06, "loss": 5.2712, "step": 8766 }, { "epoch": 1.5004278623994525, "grad_norm": 14.89370059967041, "learning_rate": 1.509122686126187e-06, "loss": 1.4072, "step": 8767 }, { "epoch": 1.5005990073592332, "grad_norm": 13.529912948608398, "learning_rate": 1.5040900167029775e-06, "loss": 1.2001, "step": 8768 }, { "epoch": 1.5007701523190142, "grad_norm": 20.34235382080078, "learning_rate": 1.4990653098867635e-06, "loss": 2.4514, "step": 8769 }, { "epoch": 1.500941297278795, "grad_norm": 23.190624237060547, "learning_rate": 1.4940485686421217e-06, "loss": 5.2502, "step": 8770 }, { "epoch": 1.500941297278795, "eval_nli-pairs_loss": 1.423037052154541, "eval_nli-pairs_runtime": 4.8112, "eval_nli-pairs_samples_per_second": 41.569, "eval_nli-pairs_steps_per_second": 1.455, 
"eval_sts-test_pearson_cosine": 0.7778826660955432, "eval_sts-test_pearson_dot": 0.6377106165931048, "eval_sts-test_pearson_euclidean": 0.765419125388507, "eval_sts-test_pearson_manhattan": 0.7681524439687704, "eval_sts-test_pearson_max": 0.7778826660955432, "eval_sts-test_spearman_cosine": 0.7770063424249887, "eval_sts-test_spearman_dot": 0.6178407749856393, "eval_sts-test_spearman_euclidean": 0.7545799531290016, "eval_sts-test_spearman_manhattan": 0.7594245562960059, "eval_sts-test_spearman_max": 0.7770063424249887, "step": 8770 }, { "epoch": 1.500941297278795, "eval_vitaminc-pairs_loss": 0.703753650188446, "eval_vitaminc-pairs_runtime": 2.9579, "eval_vitaminc-pairs_samples_per_second": 67.617, "eval_vitaminc-pairs_steps_per_second": 2.367, "step": 8770 }, { "epoch": 1.500941297278795, "eval_qnli-contrastive_loss": 1.5162235498428345, "eval_qnli-contrastive_runtime": 0.7523, "eval_qnli-contrastive_samples_per_second": 265.844, "eval_qnli-contrastive_steps_per_second": 9.305, "step": 8770 }, { "epoch": 1.500941297278795, "eval_scitail-pairs-qa_loss": 0.10561136156320572, "eval_scitail-pairs-qa_runtime": 1.7055, "eval_scitail-pairs-qa_samples_per_second": 117.264, "eval_scitail-pairs-qa_steps_per_second": 4.104, "step": 8770 }, { "epoch": 1.500941297278795, "eval_scitail-pairs-pos_loss": 0.6796361207962036, "eval_scitail-pairs-pos_runtime": 2.8008, "eval_scitail-pairs-pos_samples_per_second": 71.408, "eval_scitail-pairs-pos_steps_per_second": 2.499, "step": 8770 }, { "epoch": 1.500941297278795, "eval_xsum-pairs_loss": 0.7107104659080505, "eval_xsum-pairs_runtime": 2.6891, "eval_xsum-pairs_samples_per_second": 65.077, "eval_xsum-pairs_steps_per_second": 2.231, "step": 8770 }, { "epoch": 1.500941297278795, "eval_compression-pairs_loss": 0.23499943315982819, "eval_compression-pairs_runtime": 0.5384, "eval_compression-pairs_samples_per_second": 371.505, "eval_compression-pairs_steps_per_second": 13.003, "step": 8770 }, { "epoch": 1.500941297278795, 
"eval_sciq_pairs_loss": 0.4169030487537384, "eval_sciq_pairs_runtime": 9.5981, "eval_sciq_pairs_samples_per_second": 20.837, "eval_sciq_pairs_steps_per_second": 0.729, "step": 8770 }, { "epoch": 1.500941297278795, "eval_qasc_pairs_loss": 5.317800521850586, "eval_qasc_pairs_runtime": 2.7447, "eval_qasc_pairs_samples_per_second": 72.867, "eval_qasc_pairs_steps_per_second": 2.55, "step": 8770 }, { "epoch": 1.500941297278795, "eval_openbookqa_pairs_loss": 2.4854586124420166, "eval_openbookqa_pairs_runtime": 0.6438, "eval_openbookqa_pairs_samples_per_second": 107.183, "eval_openbookqa_pairs_steps_per_second": 4.66, "step": 8770 }, { "epoch": 1.500941297278795, "eval_msmarco_pairs_loss": 1.041033148765564, "eval_msmarco_pairs_runtime": 4.1389, "eval_msmarco_pairs_samples_per_second": 48.322, "eval_msmarco_pairs_steps_per_second": 1.691, "step": 8770 }, { "epoch": 1.500941297278795, "eval_nq_pairs_loss": 1.2097156047821045, "eval_nq_pairs_runtime": 8.8496, "eval_nq_pairs_samples_per_second": 22.6, "eval_nq_pairs_steps_per_second": 0.791, "step": 8770 }, { "epoch": 1.500941297278795, "eval_trivia_pairs_loss": 1.5453585386276245, "eval_trivia_pairs_runtime": 12.9823, "eval_trivia_pairs_samples_per_second": 15.406, "eval_trivia_pairs_steps_per_second": 0.539, "step": 8770 }, { "epoch": 1.500941297278795, "eval_quora_pairs_loss": 0.2055002599954605, "eval_quora_pairs_runtime": 1.613, "eval_quora_pairs_samples_per_second": 123.99, "eval_quora_pairs_steps_per_second": 4.34, "step": 8770 }, { "epoch": 1.500941297278795, "eval_gooaq_pairs_loss": 0.8422526717185974, "eval_gooaq_pairs_runtime": 2.7002, "eval_gooaq_pairs_samples_per_second": 74.067, "eval_gooaq_pairs_steps_per_second": 2.592, "step": 8770 }, { "epoch": 1.501112442238576, "grad_norm": 4.013746738433838, "learning_rate": 1.489039795928943e-06, "loss": 0.3081, "step": 8771 }, { "epoch": 1.501283587198357, "grad_norm": 12.073554992675781, "learning_rate": 1.4840389947024085e-06, "loss": 0.7875, "step": 8772 }, { 
"epoch": 1.501454732158138, "grad_norm": 12.199190139770508, "learning_rate": 1.4790461679129997e-06, "loss": 0.8923, "step": 8773 }, { "epoch": 1.501625877117919, "grad_norm": 1.9509609937667847, "learning_rate": 1.4740613185064883e-06, "loss": 0.1551, "step": 8774 }, { "epoch": 1.5017970220777, "grad_norm": 21.677330017089844, "learning_rate": 1.4690844494239468e-06, "loss": 4.7849, "step": 8775 }, { "epoch": 1.5019681670374807, "grad_norm": 11.554576873779297, "learning_rate": 1.464115563601735e-06, "loss": 0.7825, "step": 8776 }, { "epoch": 1.5021393119972617, "grad_norm": 26.38978385925293, "learning_rate": 1.4591546639714993e-06, "loss": 5.1294, "step": 8777 }, { "epoch": 1.5023104569570425, "grad_norm": 15.504985809326172, "learning_rate": 1.4542017534601831e-06, "loss": 1.2319, "step": 8778 }, { "epoch": 1.5024816019168235, "grad_norm": 15.800821304321289, "learning_rate": 1.4492568349900025e-06, "loss": 1.3873, "step": 8779 }, { "epoch": 1.5026527468766044, "grad_norm": 5.078528881072998, "learning_rate": 1.4443199114784756e-06, "loss": 0.597, "step": 8780 }, { "epoch": 1.5028238918363854, "grad_norm": 16.248367309570312, "learning_rate": 1.4393909858383953e-06, "loss": 1.1847, "step": 8781 }, { "epoch": 1.5029950367961664, "grad_norm": 6.7768659591674805, "learning_rate": 1.4344700609778271e-06, "loss": 0.8068, "step": 8782 }, { "epoch": 1.5031661817559474, "grad_norm": 18.511356353759766, "learning_rate": 1.4295571398001305e-06, "loss": 2.0702, "step": 8783 }, { "epoch": 1.5033373267157282, "grad_norm": 9.188233375549316, "learning_rate": 1.4246522252039335e-06, "loss": 0.8233, "step": 8784 }, { "epoch": 1.5035084716755092, "grad_norm": 12.551141738891602, "learning_rate": 1.4197553200831442e-06, "loss": 0.8082, "step": 8785 }, { "epoch": 1.50367961663529, "grad_norm": 9.112393379211426, "learning_rate": 1.4148664273269436e-06, "loss": 0.6563, "step": 8786 }, { "epoch": 1.503850761595071, "grad_norm": 10.370498657226562, "learning_rate": 
1.409985549819784e-06, "loss": 0.7959, "step": 8787 }, { "epoch": 1.504021906554852, "grad_norm": 2.188581705093384, "learning_rate": 1.4051126904413935e-06, "loss": 0.2273, "step": 8788 }, { "epoch": 1.504193051514633, "grad_norm": 3.72436261177063, "learning_rate": 1.4002478520667628e-06, "loss": 0.2615, "step": 8789 }, { "epoch": 1.5043641964744139, "grad_norm": 20.707782745361328, "learning_rate": 1.3953910375661505e-06, "loss": 2.4371, "step": 8790 }, { "epoch": 1.5045353414341949, "grad_norm": 0.3801811635494232, "learning_rate": 1.390542249805098e-06, "loss": 0.1254, "step": 8791 }, { "epoch": 1.5047064863939759, "grad_norm": 15.350961685180664, "learning_rate": 1.385701491644385e-06, "loss": 1.4045, "step": 8792 }, { "epoch": 1.5048776313537566, "grad_norm": 15.098352432250977, "learning_rate": 1.380868765940072e-06, "loss": 1.1111, "step": 8793 }, { "epoch": 1.5050487763135376, "grad_norm": 15.766717910766602, "learning_rate": 1.3760440755434734e-06, "loss": 1.4953, "step": 8794 }, { "epoch": 1.5052199212733184, "grad_norm": 14.612103462219238, "learning_rate": 1.3712274233011585e-06, "loss": 1.443, "step": 8795 }, { "epoch": 1.5053910662330994, "grad_norm": 6.014898777008057, "learning_rate": 1.3664188120549659e-06, "loss": 0.7955, "step": 8796 }, { "epoch": 1.5055622111928804, "grad_norm": 11.059626579284668, "learning_rate": 1.3616182446419795e-06, "loss": 0.9545, "step": 8797 }, { "epoch": 1.5057333561526614, "grad_norm": 25.546361923217773, "learning_rate": 1.3568257238945453e-06, "loss": 1.4751, "step": 8798 }, { "epoch": 1.5059045011124423, "grad_norm": 12.902242660522461, "learning_rate": 1.3520412526402515e-06, "loss": 1.2331, "step": 8799 }, { "epoch": 1.5060756460722233, "grad_norm": 23.613662719726562, "learning_rate": 1.34726483370195e-06, "loss": 5.2051, "step": 8800 }, { "epoch": 1.506246791032004, "grad_norm": 2.1539666652679443, "learning_rate": 1.3424964698977281e-06, "loss": 0.2545, "step": 8801 }, { "epoch": 1.506417935991785, 
"grad_norm": 13.814474105834961, "learning_rate": 1.3377361640409325e-06, "loss": 1.0032, "step": 8802 }, { "epoch": 1.5065890809515659, "grad_norm": 20.652355194091797, "learning_rate": 1.332983918940151e-06, "loss": 1.368, "step": 8803 }, { "epoch": 1.5067602259113468, "grad_norm": 17.37527847290039, "learning_rate": 1.3282397373992129e-06, "loss": 0.7081, "step": 8804 }, { "epoch": 1.5069313708711278, "grad_norm": 9.630388259887695, "learning_rate": 1.3235036222171938e-06, "loss": 0.7732, "step": 8805 }, { "epoch": 1.5071025158309088, "grad_norm": 14.42004108428955, "learning_rate": 1.31877557618841e-06, "loss": 1.3624, "step": 8806 }, { "epoch": 1.5072736607906898, "grad_norm": 17.15968132019043, "learning_rate": 1.3140556021024092e-06, "loss": 2.2275, "step": 8807 }, { "epoch": 1.5074448057504708, "grad_norm": 30.557077407836914, "learning_rate": 1.3093437027439931e-06, "loss": 5.4318, "step": 8808 }, { "epoch": 1.5076159507102516, "grad_norm": 7.824394226074219, "learning_rate": 1.3046398808931797e-06, "loss": 0.5952, "step": 8809 }, { "epoch": 1.5077870956700326, "grad_norm": 11.921709060668945, "learning_rate": 1.2999441393252375e-06, "loss": 0.9276, "step": 8810 }, { "epoch": 1.5079582406298133, "grad_norm": 19.82746124267578, "learning_rate": 1.2952564808106571e-06, "loss": 1.6795, "step": 8811 }, { "epoch": 1.5081293855895943, "grad_norm": 4.480539798736572, "learning_rate": 1.2905769081151658e-06, "loss": 0.2813, "step": 8812 }, { "epoch": 1.5083005305493753, "grad_norm": 10.990690231323242, "learning_rate": 1.2859054239997203e-06, "loss": 1.1628, "step": 8813 }, { "epoch": 1.5084716755091563, "grad_norm": 8.219344139099121, "learning_rate": 1.2812420312204992e-06, "loss": 0.9422, "step": 8814 }, { "epoch": 1.5086428204689373, "grad_norm": 10.142589569091797, "learning_rate": 1.2765867325289148e-06, "loss": 1.0203, "step": 8815 }, { "epoch": 1.5088139654287183, "grad_norm": 11.971064567565918, "learning_rate": 1.271939530671597e-06, "loss": 0.951, 
"step": 8816 }, { "epoch": 1.508985110388499, "grad_norm": 3.5844132900238037, "learning_rate": 1.2673004283904055e-06, "loss": 0.2983, "step": 8817 }, { "epoch": 1.50915625534828, "grad_norm": 2.6141886711120605, "learning_rate": 1.262669428422416e-06, "loss": 0.2972, "step": 8818 }, { "epoch": 1.5093274003080608, "grad_norm": 8.112431526184082, "learning_rate": 1.258046533499923e-06, "loss": 0.6758, "step": 8819 }, { "epoch": 1.5094985452678418, "grad_norm": 18.731298446655273, "learning_rate": 1.2534317463504447e-06, "loss": 1.342, "step": 8820 }, { "epoch": 1.5096696902276228, "grad_norm": 2.863673686981201, "learning_rate": 1.2488250696967096e-06, "loss": 0.2412, "step": 8821 }, { "epoch": 1.5098408351874038, "grad_norm": 0.9755850434303284, "learning_rate": 1.244226506256662e-06, "loss": 0.2018, "step": 8822 }, { "epoch": 1.5100119801471847, "grad_norm": 58.96149444580078, "learning_rate": 1.2396360587434684e-06, "loss": 7.1236, "step": 8823 }, { "epoch": 1.5101831251069657, "grad_norm": 18.974409103393555, "learning_rate": 1.235053729865494e-06, "loss": 1.3713, "step": 8824 }, { "epoch": 1.5103542700667465, "grad_norm": 18.200756072998047, "learning_rate": 1.2304795223263243e-06, "loss": 1.6076, "step": 8825 }, { "epoch": 1.5105254150265275, "grad_norm": 16.95674705505371, "learning_rate": 1.2259134388247456e-06, "loss": 1.6479, "step": 8826 }, { "epoch": 1.5106965599863083, "grad_norm": 4.243163108825684, "learning_rate": 1.2213554820547513e-06, "loss": 0.3116, "step": 8827 }, { "epoch": 1.5108677049460892, "grad_norm": 15.39980697631836, "learning_rate": 1.2168056547055483e-06, "loss": 1.1657, "step": 8828 }, { "epoch": 1.5110388499058702, "grad_norm": 17.05693244934082, "learning_rate": 1.2122639594615347e-06, "loss": 1.4848, "step": 8829 }, { "epoch": 1.5112099948656512, "grad_norm": 10.453177452087402, "learning_rate": 1.2077303990023253e-06, "loss": 0.9655, "step": 8830 }, { "epoch": 1.5113811398254322, "grad_norm": 21.34493637084961, "learning_rate": 
1.2032049760027203e-06, "loss": 2.5356, "step": 8831 }, { "epoch": 1.5115522847852132, "grad_norm": 23.360994338989258, "learning_rate": 1.1986876931327307e-06, "loss": 5.0279, "step": 8832 }, { "epoch": 1.511723429744994, "grad_norm": 13.687023162841797, "learning_rate": 1.1941785530575544e-06, "loss": 1.0645, "step": 8833 }, { "epoch": 1.511894574704775, "grad_norm": 8.73773193359375, "learning_rate": 1.1896775584376002e-06, "loss": 0.6087, "step": 8834 }, { "epoch": 1.5120657196645557, "grad_norm": 0.7991341352462769, "learning_rate": 1.1851847119284536e-06, "loss": 0.1798, "step": 8835 }, { "epoch": 1.5122368646243367, "grad_norm": 9.919504165649414, "learning_rate": 1.1807000161809079e-06, "loss": 0.7322, "step": 8836 }, { "epoch": 1.5124080095841177, "grad_norm": 18.45490264892578, "learning_rate": 1.1762234738409311e-06, "loss": 1.912, "step": 8837 }, { "epoch": 1.5125791545438987, "grad_norm": 40.272918701171875, "learning_rate": 1.1717550875496992e-06, "loss": 6.5837, "step": 8838 }, { "epoch": 1.5127502995036797, "grad_norm": 14.659866333007812, "learning_rate": 1.167294859943558e-06, "loss": 0.9338, "step": 8839 }, { "epoch": 1.5129214444634607, "grad_norm": 12.439068794250488, "learning_rate": 1.1628427936540558e-06, "loss": 0.6551, "step": 8840 }, { "epoch": 1.5130925894232417, "grad_norm": 25.60451889038086, "learning_rate": 1.1583988913079152e-06, "loss": 4.8432, "step": 8841 }, { "epoch": 1.5132637343830224, "grad_norm": 2.7354772090911865, "learning_rate": 1.1539631555270418e-06, "loss": 0.2833, "step": 8842 }, { "epoch": 1.5134348793428034, "grad_norm": 3.1230759620666504, "learning_rate": 1.1495355889285307e-06, "loss": 0.2374, "step": 8843 }, { "epoch": 1.5136060243025842, "grad_norm": 13.000197410583496, "learning_rate": 1.1451161941246525e-06, "loss": 1.0858, "step": 8844 }, { "epoch": 1.5137771692623652, "grad_norm": 21.430810928344727, "learning_rate": 1.1407049737228576e-06, "loss": 2.8638, "step": 8845 }, { "epoch": 1.5139483142221462, 
"grad_norm": 17.495996475219727, "learning_rate": 1.1363019303257727e-06, "loss": 1.443, "step": 8846 }, { "epoch": 1.5141194591819271, "grad_norm": 9.130285263061523, "learning_rate": 1.1319070665311931e-06, "loss": 0.8287, "step": 8847 }, { "epoch": 1.5142906041417081, "grad_norm": 21.002784729003906, "learning_rate": 1.1275203849321047e-06, "loss": 2.6952, "step": 8848 }, { "epoch": 1.5144617491014891, "grad_norm": 10.095860481262207, "learning_rate": 1.123141888116649e-06, "loss": 0.6741, "step": 8849 }, { "epoch": 1.5146328940612699, "grad_norm": 0.36840200424194336, "learning_rate": 1.1187715786681508e-06, "loss": 0.1191, "step": 8850 }, { "epoch": 1.5148040390210509, "grad_norm": 8.24820327758789, "learning_rate": 1.1144094591650926e-06, "loss": 0.6703, "step": 8851 }, { "epoch": 1.5149751839808316, "grad_norm": 9.150468826293945, "learning_rate": 1.1100555321811395e-06, "loss": 0.7674, "step": 8852 }, { "epoch": 1.5151463289406126, "grad_norm": 16.106292724609375, "learning_rate": 1.1057098002851113e-06, "loss": 1.2652, "step": 8853 }, { "epoch": 1.5153174739003936, "grad_norm": 65.95269775390625, "learning_rate": 1.1013722660409902e-06, "loss": 7.701, "step": 8854 }, { "epoch": 1.5154886188601746, "grad_norm": 20.543350219726562, "learning_rate": 1.0970429320079394e-06, "loss": 2.399, "step": 8855 }, { "epoch": 1.5156597638199556, "grad_norm": 0.37132781744003296, "learning_rate": 1.0927218007402624e-06, "loss": 0.1186, "step": 8856 }, { "epoch": 1.5158309087797366, "grad_norm": 15.066244125366211, "learning_rate": 1.088408874787441e-06, "loss": 1.1444, "step": 8857 }, { "epoch": 1.5160020537395174, "grad_norm": 22.181188583374023, "learning_rate": 1.0841041566941051e-06, "loss": 2.1029, "step": 8858 }, { "epoch": 1.5161731986992983, "grad_norm": 14.736654281616211, "learning_rate": 1.0798076490000397e-06, "loss": 1.1775, "step": 8859 }, { "epoch": 1.516344343659079, "grad_norm": 20.32367706298828, "learning_rate": 1.0755193542401987e-06, "loss": 2.2673, 
"step": 8860 }, { "epoch": 1.51651548861886, "grad_norm": 12.068868637084961, "learning_rate": 1.0712392749446748e-06, "loss": 0.9325, "step": 8861 }, { "epoch": 1.516686633578641, "grad_norm": 7.799076557159424, "learning_rate": 1.066967413638728e-06, "loss": 0.7503, "step": 8862 }, { "epoch": 1.516857778538422, "grad_norm": 12.228155136108398, "learning_rate": 1.0627037728427592e-06, "loss": 1.1449, "step": 8863 }, { "epoch": 1.517028923498203, "grad_norm": 63.42900848388672, "learning_rate": 1.058448355072324e-06, "loss": 7.3264, "step": 8864 }, { "epoch": 1.517200068457984, "grad_norm": 7.229288578033447, "learning_rate": 1.0542011628381194e-06, "loss": 0.6527, "step": 8865 }, { "epoch": 1.5173712134177648, "grad_norm": 20.163602828979492, "learning_rate": 1.0499621986460072e-06, "loss": 2.3131, "step": 8866 }, { "epoch": 1.5175423583775458, "grad_norm": 13.934006690979004, "learning_rate": 1.0457314649969724e-06, "loss": 1.1895, "step": 8867 }, { "epoch": 1.5177135033373266, "grad_norm": 18.754222869873047, "learning_rate": 1.0415089643871595e-06, "loss": 1.6238, "step": 8868 }, { "epoch": 1.5178846482971076, "grad_norm": 7.147726535797119, "learning_rate": 1.0372946993078458e-06, "loss": 0.6675, "step": 8869 }, { "epoch": 1.5180557932568886, "grad_norm": 0.6331747174263, "learning_rate": 1.033088672245459e-06, "loss": 0.1353, "step": 8870 }, { "epoch": 1.5182269382166695, "grad_norm": 13.960134506225586, "learning_rate": 1.0288908856815577e-06, "loss": 1.2692, "step": 8871 }, { "epoch": 1.5183980831764505, "grad_norm": 9.651081085205078, "learning_rate": 1.0247013420928436e-06, "loss": 0.7943, "step": 8872 }, { "epoch": 1.5185692281362315, "grad_norm": 18.989585876464844, "learning_rate": 1.0205200439511547e-06, "loss": 1.7201, "step": 8873 }, { "epoch": 1.5187403730960123, "grad_norm": 16.395652770996094, "learning_rate": 1.0163469937234576e-06, "loss": 1.4478, "step": 8874 }, { "epoch": 1.5189115180557933, "grad_norm": 11.04360580444336, "learning_rate": 
1.0121821938718662e-06, "loss": 1.1428, "step": 8875 }, { "epoch": 1.519082663015574, "grad_norm": 3.3829946517944336, "learning_rate": 1.0080256468536097e-06, "loss": 0.3837, "step": 8876 }, { "epoch": 1.519253807975355, "grad_norm": 11.329069137573242, "learning_rate": 1.003877355121065e-06, "loss": 0.78, "step": 8877 }, { "epoch": 1.519424952935136, "grad_norm": 7.869561672210693, "learning_rate": 9.997373211217291e-07, "loss": 0.6345, "step": 8878 }, { "epoch": 1.519596097894917, "grad_norm": 15.935222625732422, "learning_rate": 9.95605547298225e-07, "loss": 1.3819, "step": 8879 }, { "epoch": 1.519767242854698, "grad_norm": 19.9326171875, "learning_rate": 9.914820360883076e-07, "loss": 1.6083, "step": 8880 }, { "epoch": 1.519938387814479, "grad_norm": 8.975380897521973, "learning_rate": 9.873667899248539e-07, "loss": 0.7083, "step": 8881 }, { "epoch": 1.5201095327742598, "grad_norm": 18.164505004882812, "learning_rate": 9.832598112358654e-07, "loss": 2.0501, "step": 8882 }, { "epoch": 1.5202806777340407, "grad_norm": 0.7483938932418823, "learning_rate": 9.791611024444668e-07, "loss": 0.1313, "step": 8883 }, { "epoch": 1.5204518226938215, "grad_norm": 15.07286262512207, "learning_rate": 9.75070665968899e-07, "loss": 1.2392, "step": 8884 }, { "epoch": 1.5206229676536025, "grad_norm": 19.54453468322754, "learning_rate": 9.709885042225297e-07, "loss": 1.7083, "step": 8885 }, { "epoch": 1.5207941126133835, "grad_norm": 17.705751419067383, "learning_rate": 9.669146196138374e-07, "loss": 1.4097, "step": 8886 }, { "epoch": 1.5209652575731645, "grad_norm": 6.917839050292969, "learning_rate": 9.628490145464208e-07, "loss": 0.5468, "step": 8887 }, { "epoch": 1.5211364025329455, "grad_norm": 15.654844284057617, "learning_rate": 9.587916914189949e-07, "loss": 1.3604, "step": 8888 }, { "epoch": 1.5213075474927265, "grad_norm": 12.90906810760498, "learning_rate": 9.547426526253827e-07, "loss": 1.1209, "step": 8889 }, { "epoch": 1.5214786924525072, "grad_norm": 
15.699536323547363, "learning_rate": 9.507019005545286e-07, "loss": 1.1469, "step": 8890 }, { "epoch": 1.5216498374122882, "grad_norm": 7.653834819793701, "learning_rate": 9.466694375904755e-07, "loss": 0.7176, "step": 8891 }, { "epoch": 1.5218209823720692, "grad_norm": 7.591158866882324, "learning_rate": 9.426452661123885e-07, "loss": 0.5431, "step": 8892 }, { "epoch": 1.52199212733185, "grad_norm": 17.280027389526367, "learning_rate": 9.386293884945335e-07, "loss": 1.2493, "step": 8893 }, { "epoch": 1.522163272291631, "grad_norm": 22.16407012939453, "learning_rate": 9.346218071062801e-07, "loss": 4.9454, "step": 8894 }, { "epoch": 1.522334417251412, "grad_norm": 13.23286247253418, "learning_rate": 9.306225243121135e-07, "loss": 1.311, "step": 8895 }, { "epoch": 1.522505562211193, "grad_norm": 9.87154483795166, "learning_rate": 9.266315424716099e-07, "loss": 0.7385, "step": 8896 }, { "epoch": 1.522676707170974, "grad_norm": 88.2677993774414, "learning_rate": 9.226488639394587e-07, "loss": 8.3312, "step": 8897 }, { "epoch": 1.522847852130755, "grad_norm": 11.71298599243164, "learning_rate": 9.18674491065447e-07, "loss": 1.2005, "step": 8898 }, { "epoch": 1.5230189970905357, "grad_norm": 6.461123943328857, "learning_rate": 9.147084261944561e-07, "loss": 0.5265, "step": 8899 }, { "epoch": 1.5231901420503167, "grad_norm": 19.373626708984375, "learning_rate": 9.107506716664771e-07, "loss": 1.5726, "step": 8900 }, { "epoch": 1.5233612870100974, "grad_norm": 14.996251106262207, "learning_rate": 9.06801229816584e-07, "loss": 1.1801, "step": 8901 }, { "epoch": 1.5235324319698784, "grad_norm": 12.366771697998047, "learning_rate": 9.028601029749595e-07, "loss": 0.9663, "step": 8902 }, { "epoch": 1.5237035769296594, "grad_norm": 4.607018947601318, "learning_rate": 8.989272934668686e-07, "loss": 0.3903, "step": 8903 }, { "epoch": 1.5238747218894404, "grad_norm": 4.648444175720215, "learning_rate": 8.9500280361268e-07, "loss": 0.3329, "step": 8904 }, { "epoch": 
1.5240458668492214, "grad_norm": 8.761842727661133, "learning_rate": 8.910866357278469e-07, "loss": 0.5836, "step": 8905 }, { "epoch": 1.5242170118090024, "grad_norm": 17.132099151611328, "learning_rate": 8.871787921229091e-07, "loss": 1.4432, "step": 8906 }, { "epoch": 1.5243881567687831, "grad_norm": 15.816734313964844, "learning_rate": 8.83279275103509e-07, "loss": 1.6316, "step": 8907 }, { "epoch": 1.5245593017285641, "grad_norm": 17.845752716064453, "learning_rate": 8.793880869703564e-07, "loss": 1.6238, "step": 8908 }, { "epoch": 1.524730446688345, "grad_norm": 18.31258201599121, "learning_rate": 8.755052300192678e-07, "loss": 1.6217, "step": 8909 }, { "epoch": 1.5249015916481259, "grad_norm": 0.4351605176925659, "learning_rate": 8.716307065411345e-07, "loss": 0.1239, "step": 8910 }, { "epoch": 1.5250727366079069, "grad_norm": 51.22026443481445, "learning_rate": 8.67764518821924e-07, "loss": 7.327, "step": 8911 }, { "epoch": 1.5252438815676879, "grad_norm": 17.76290512084961, "learning_rate": 8.639066691427005e-07, "loss": 1.7448, "step": 8912 }, { "epoch": 1.5254150265274689, "grad_norm": 11.000847816467285, "learning_rate": 8.600571597795931e-07, "loss": 0.6839, "step": 8913 }, { "epoch": 1.5255861714872498, "grad_norm": 19.25257682800293, "learning_rate": 8.562159930038266e-07, "loss": 2.2715, "step": 8914 }, { "epoch": 1.5257573164470306, "grad_norm": 18.515369415283203, "learning_rate": 8.523831710816898e-07, "loss": 2.0248, "step": 8915 }, { "epoch": 1.5259284614068116, "grad_norm": 40.11855697631836, "learning_rate": 8.485586962745523e-07, "loss": 7.0379, "step": 8916 }, { "epoch": 1.5260996063665924, "grad_norm": 18.255218505859375, "learning_rate": 8.447425708388639e-07, "loss": 2.2976, "step": 8917 }, { "epoch": 1.5262707513263734, "grad_norm": 12.95837688446045, "learning_rate": 8.409347970261405e-07, "loss": 0.9101, "step": 8918 }, { "epoch": 1.5264418962861543, "grad_norm": 16.136180877685547, "learning_rate": 8.371353770829748e-07, "loss": 
1.8904, "step": 8919 }, { "epoch": 1.5266130412459353, "grad_norm": 1.3963035345077515, "learning_rate": 8.333443132510354e-07, "loss": 0.1653, "step": 8920 }, { "epoch": 1.5267841862057163, "grad_norm": 9.98221492767334, "learning_rate": 8.295616077670498e-07, "loss": 0.7191, "step": 8921 }, { "epoch": 1.5269553311654973, "grad_norm": 11.032527923583984, "learning_rate": 8.257872628628227e-07, "loss": 0.8402, "step": 8922 }, { "epoch": 1.527126476125278, "grad_norm": 23.428810119628906, "learning_rate": 8.220212807652244e-07, "loss": 4.8811, "step": 8923 }, { "epoch": 1.527297621085059, "grad_norm": 20.017208099365234, "learning_rate": 8.182636636961843e-07, "loss": 2.611, "step": 8924 }, { "epoch": 1.5274687660448398, "grad_norm": 13.71910572052002, "learning_rate": 8.145144138727089e-07, "loss": 0.8683, "step": 8925 }, { "epoch": 1.5276399110046208, "grad_norm": 22.843658447265625, "learning_rate": 8.107735335068556e-07, "loss": 4.9961, "step": 8926 }, { "epoch": 1.5278110559644018, "grad_norm": 8.119129180908203, "learning_rate": 8.070410248057536e-07, "loss": 0.4878, "step": 8927 }, { "epoch": 1.5279822009241828, "grad_norm": 6.540073871612549, "learning_rate": 8.033168899715865e-07, "loss": 0.3637, "step": 8928 }, { "epoch": 1.5281533458839638, "grad_norm": 55.22416305541992, "learning_rate": 7.996011312015983e-07, "loss": 7.2079, "step": 8929 }, { "epoch": 1.5283244908437448, "grad_norm": 18.142370223999023, "learning_rate": 7.958937506880953e-07, "loss": 1.755, "step": 8930 }, { "epoch": 1.5284956358035255, "grad_norm": 10.273751258850098, "learning_rate": 7.92194750618433e-07, "loss": 0.8306, "step": 8931 }, { "epoch": 1.5286667807633065, "grad_norm": 0.4016951024532318, "learning_rate": 7.885041331750337e-07, "loss": 0.1218, "step": 8932 }, { "epoch": 1.5288379257230873, "grad_norm": 16.600425720214844, "learning_rate": 7.848219005353607e-07, "loss": 1.4517, "step": 8933 }, { "epoch": 1.5290090706828683, "grad_norm": 0.37767937779426575, "learning_rate": 
7.811480548719413e-07, "loss": 0.1239, "step": 8934 }, { "epoch": 1.5291802156426493, "grad_norm": 15.751119613647461, "learning_rate": 7.774825983523482e-07, "loss": 1.3208, "step": 8935 }, { "epoch": 1.5293513606024303, "grad_norm": 0.5373888611793518, "learning_rate": 7.73825533139203e-07, "loss": 0.125, "step": 8936 }, { "epoch": 1.5295225055622113, "grad_norm": 13.056131362915039, "learning_rate": 7.70176861390185e-07, "loss": 1.1174, "step": 8937 }, { "epoch": 1.5296936505219922, "grad_norm": 14.010622024536133, "learning_rate": 7.66536585258012e-07, "loss": 1.204, "step": 8938 }, { "epoch": 1.529864795481773, "grad_norm": 7.968710422515869, "learning_rate": 7.629047068904544e-07, "loss": 0.8895, "step": 8939 }, { "epoch": 1.530035940441554, "grad_norm": 22.329116821289062, "learning_rate": 7.592812284303247e-07, "loss": 5.1966, "step": 8940 }, { "epoch": 1.5302070854013348, "grad_norm": 11.743240356445312, "learning_rate": 7.556661520154812e-07, "loss": 0.9536, "step": 8941 }, { "epoch": 1.5303782303611158, "grad_norm": 33.72946548461914, "learning_rate": 7.520594797788261e-07, "loss": 5.5867, "step": 8942 }, { "epoch": 1.5305493753208967, "grad_norm": 19.174978256225586, "learning_rate": 7.484612138482972e-07, "loss": 1.4363, "step": 8943 }, { "epoch": 1.5307205202806777, "grad_norm": 3.9398553371429443, "learning_rate": 7.448713563468812e-07, "loss": 0.3047, "step": 8944 }, { "epoch": 1.5308916652404587, "grad_norm": 41.23690414428711, "learning_rate": 7.412899093925957e-07, "loss": 6.286, "step": 8945 }, { "epoch": 1.5310628102002397, "grad_norm": 10.646089553833008, "learning_rate": 7.377168750985036e-07, "loss": 0.876, "step": 8946 }, { "epoch": 1.5312339551600207, "grad_norm": 8.800580024719238, "learning_rate": 7.341522555726987e-07, "loss": 0.7881, "step": 8947 }, { "epoch": 1.5314051001198015, "grad_norm": 13.772637367248535, "learning_rate": 7.305960529183087e-07, "loss": 1.1644, "step": 8948 }, { "epoch": 1.5315762450795825, "grad_norm": 
7.512633323669434, "learning_rate": 7.270482692335034e-07, "loss": 0.6324, "step": 8949 }, { "epoch": 1.5317473900393632, "grad_norm": 9.42044448852539, "learning_rate": 7.23508906611477e-07, "loss": 0.8012, "step": 8950 }, { "epoch": 1.5319185349991442, "grad_norm": 0.4081376791000366, "learning_rate": 7.199779671404571e-07, "loss": 0.1221, "step": 8951 }, { "epoch": 1.5320896799589252, "grad_norm": 4.098542213439941, "learning_rate": 7.164554529037109e-07, "loss": 0.2911, "step": 8952 }, { "epoch": 1.5322608249187062, "grad_norm": 14.515700340270996, "learning_rate": 7.129413659795192e-07, "loss": 1.058, "step": 8953 }, { "epoch": 1.5324319698784872, "grad_norm": 8.778465270996094, "learning_rate": 7.094357084412034e-07, "loss": 0.9236, "step": 8954 }, { "epoch": 1.5326031148382682, "grad_norm": 10.07398509979248, "learning_rate": 7.059384823571057e-07, "loss": 0.8575, "step": 8955 }, { "epoch": 1.532774259798049, "grad_norm": 6.178288459777832, "learning_rate": 7.024496897905925e-07, "loss": 0.4496, "step": 8956 }, { "epoch": 1.53294540475783, "grad_norm": 11.240833282470703, "learning_rate": 6.989693328000584e-07, "loss": 0.9543, "step": 8957 }, { "epoch": 1.5331165497176107, "grad_norm": 3.4425065517425537, "learning_rate": 6.954974134389181e-07, "loss": 0.3073, "step": 8958 }, { "epoch": 1.5332876946773917, "grad_norm": 1.1511598825454712, "learning_rate": 6.920339337556108e-07, "loss": 0.201, "step": 8959 }, { "epoch": 1.5334588396371727, "grad_norm": 10.617854118347168, "learning_rate": 6.885788957935923e-07, "loss": 0.9172, "step": 8960 }, { "epoch": 1.5336299845969537, "grad_norm": 2.6734681129455566, "learning_rate": 6.851323015913386e-07, "loss": 0.2495, "step": 8961 }, { "epoch": 1.5338011295567346, "grad_norm": 2.7742600440979004, "learning_rate": 6.816941531823529e-07, "loss": 0.2389, "step": 8962 }, { "epoch": 1.5339722745165156, "grad_norm": 17.67864990234375, "learning_rate": 6.782644525951442e-07, "loss": 1.1561, "step": 8963 }, { "epoch": 
1.5341434194762964, "grad_norm": 18.713104248046875, "learning_rate": 6.748432018532375e-07, "loss": 2.414, "step": 8964 }, { "epoch": 1.5343145644360774, "grad_norm": 14.849449157714844, "learning_rate": 6.714304029751794e-07, "loss": 1.1648, "step": 8965 }, { "epoch": 1.5344857093958582, "grad_norm": 5.869083881378174, "learning_rate": 6.680260579745245e-07, "loss": 0.5141, "step": 8966 }, { "epoch": 1.5346568543556391, "grad_norm": 16.908506393432617, "learning_rate": 6.646301688598444e-07, "loss": 1.4851, "step": 8967 }, { "epoch": 1.5348279993154201, "grad_norm": 13.016549110412598, "learning_rate": 6.612427376347146e-07, "loss": 0.9686, "step": 8968 }, { "epoch": 1.5349991442752011, "grad_norm": 14.220627784729004, "learning_rate": 6.578637662977283e-07, "loss": 1.0861, "step": 8969 }, { "epoch": 1.535170289234982, "grad_norm": 7.680816650390625, "learning_rate": 6.544932568424822e-07, "loss": 0.8021, "step": 8970 }, { "epoch": 1.535341434194763, "grad_norm": 14.715595245361328, "learning_rate": 6.51131211257579e-07, "loss": 1.357, "step": 8971 }, { "epoch": 1.5355125791545439, "grad_norm": 0.3728656470775604, "learning_rate": 6.477776315266338e-07, "loss": 0.1193, "step": 8972 }, { "epoch": 1.5356837241143249, "grad_norm": 13.2388277053833, "learning_rate": 6.444325196282635e-07, "loss": 1.4974, "step": 8973 }, { "epoch": 1.5358548690741056, "grad_norm": 12.930038452148438, "learning_rate": 6.410958775360898e-07, "loss": 1.0451, "step": 8974 }, { "epoch": 1.5360260140338866, "grad_norm": 12.616342544555664, "learning_rate": 6.37767707218736e-07, "loss": 1.2537, "step": 8975 }, { "epoch": 1.5361971589936676, "grad_norm": 9.606904983520508, "learning_rate": 6.344480106398249e-07, "loss": 0.6369, "step": 8976 }, { "epoch": 1.5363683039534486, "grad_norm": 5.7922043800354, "learning_rate": 6.311367897579862e-07, "loss": 0.352, "step": 8977 }, { "epoch": 1.5365394489132296, "grad_norm": 8.059355735778809, "learning_rate": 6.278340465268406e-07, "loss": 0.6881, 
"step": 8978 }, { "epoch": 1.5367105938730106, "grad_norm": 26.86087989807129, "learning_rate": 6.245397828950178e-07, "loss": 2.7956, "step": 8979 }, { "epoch": 1.5368817388327913, "grad_norm": 0.9233415722846985, "learning_rate": 6.212540008061313e-07, "loss": 0.1393, "step": 8980 }, { "epoch": 1.5370528837925723, "grad_norm": 11.730569839477539, "learning_rate": 6.179767021988036e-07, "loss": 0.9871, "step": 8981 }, { "epoch": 1.537224028752353, "grad_norm": 4.181680679321289, "learning_rate": 6.147078890066432e-07, "loss": 0.3011, "step": 8982 }, { "epoch": 1.537395173712134, "grad_norm": 8.492830276489258, "learning_rate": 6.114475631582506e-07, "loss": 0.6721, "step": 8983 }, { "epoch": 1.537566318671915, "grad_norm": 2.071878671646118, "learning_rate": 6.081957265772303e-07, "loss": 0.2699, "step": 8984 }, { "epoch": 1.537737463631696, "grad_norm": 22.461790084838867, "learning_rate": 6.049523811821661e-07, "loss": 1.904, "step": 8985 }, { "epoch": 1.537908608591477, "grad_norm": 8.114803314208984, "learning_rate": 6.017175288866389e-07, "loss": 1.0224, "step": 8986 }, { "epoch": 1.538079753551258, "grad_norm": 11.756213188171387, "learning_rate": 5.98491171599217e-07, "loss": 1.0691, "step": 8987 }, { "epoch": 1.5382508985110388, "grad_norm": 3.1281285285949707, "learning_rate": 5.95273311223451e-07, "loss": 0.2794, "step": 8988 }, { "epoch": 1.5384220434708198, "grad_norm": 11.045248985290527, "learning_rate": 5.920639496578889e-07, "loss": 0.8006, "step": 8989 }, { "epoch": 1.5385931884306006, "grad_norm": 5.0460896492004395, "learning_rate": 5.888630887960544e-07, "loss": 0.366, "step": 8990 }, { "epoch": 1.5387643333903815, "grad_norm": 15.382129669189453, "learning_rate": 5.856707305264636e-07, "loss": 0.983, "step": 8991 }, { "epoch": 1.5389354783501625, "grad_norm": 0.474468469619751, "learning_rate": 5.824868767326114e-07, "loss": 0.1339, "step": 8992 }, { "epoch": 1.5391066233099435, "grad_norm": 13.889018058776855, "learning_rate": 
5.793115292929768e-07, "loss": 1.0596, "step": 8993 }, { "epoch": 1.5392777682697245, "grad_norm": 0.3914000689983368, "learning_rate": 5.761446900810147e-07, "loss": 0.1176, "step": 8994 }, { "epoch": 1.5394489132295055, "grad_norm": 14.392510414123535, "learning_rate": 5.729863609651736e-07, "loss": 1.1485, "step": 8995 }, { "epoch": 1.5396200581892865, "grad_norm": 9.651872634887695, "learning_rate": 5.698365438088665e-07, "loss": 0.7606, "step": 8996 }, { "epoch": 1.5397912031490673, "grad_norm": 20.689218521118164, "learning_rate": 5.666952404704933e-07, "loss": 2.1712, "step": 8997 }, { "epoch": 1.5399623481088482, "grad_norm": 12.345903396606445, "learning_rate": 5.63562452803425e-07, "loss": 0.9721, "step": 8998 }, { "epoch": 1.540133493068629, "grad_norm": 11.700541496276855, "learning_rate": 5.604381826560129e-07, "loss": 0.9917, "step": 8999 }, { "epoch": 1.54030463802841, "grad_norm": 10.294479370117188, "learning_rate": 5.573224318715775e-07, "loss": 0.8052, "step": 9000 }, { "epoch": 1.540475782988191, "grad_norm": 3.4827311038970947, "learning_rate": 5.542152022884217e-07, "loss": 0.2753, "step": 9001 }, { "epoch": 1.540646927947972, "grad_norm": 8.145967483520508, "learning_rate": 5.511164957398141e-07, "loss": 0.7581, "step": 9002 }, { "epoch": 1.540818072907753, "grad_norm": 13.117892265319824, "learning_rate": 5.48026314053992e-07, "loss": 0.991, "step": 9003 }, { "epoch": 1.540989217867534, "grad_norm": 2.6592934131622314, "learning_rate": 5.44944659054169e-07, "loss": 0.2137, "step": 9004 }, { "epoch": 1.5411603628273147, "grad_norm": 12.108229637145996, "learning_rate": 5.418715325585305e-07, "loss": 1.7312, "step": 9005 }, { "epoch": 1.5413315077870957, "grad_norm": 15.911478996276855, "learning_rate": 5.388069363802211e-07, "loss": 1.1496, "step": 9006 }, { "epoch": 1.5415026527468765, "grad_norm": 0.48203298449516296, "learning_rate": 5.357508723273596e-07, "loss": 0.1322, "step": 9007 }, { "epoch": 1.5416737977066575, "grad_norm": 
2.5452756881713867, "learning_rate": 5.32703342203027e-07, "loss": 0.2661, "step": 9008 }, { "epoch": 1.5418449426664385, "grad_norm": 11.785619735717773, "learning_rate": 5.296643478052715e-07, "loss": 0.9611, "step": 9009 }, { "epoch": 1.5420160876262194, "grad_norm": 17.014324188232422, "learning_rate": 5.266338909271023e-07, "loss": 1.6805, "step": 9010 }, { "epoch": 1.5421872325860004, "grad_norm": 1.646148681640625, "learning_rate": 5.236119733564976e-07, "loss": 0.2199, "step": 9011 }, { "epoch": 1.5423583775457814, "grad_norm": 63.23225021362305, "learning_rate": 5.205985968763893e-07, "loss": 6.8718, "step": 9012 }, { "epoch": 1.5425295225055622, "grad_norm": 10.841632843017578, "learning_rate": 5.175937632646738e-07, "loss": 0.9828, "step": 9013 }, { "epoch": 1.5427006674653432, "grad_norm": 12.15488052368164, "learning_rate": 5.145974742942111e-07, "loss": 0.9016, "step": 9014 }, { "epoch": 1.542871812425124, "grad_norm": 15.467876434326172, "learning_rate": 5.116097317328122e-07, "loss": 1.2079, "step": 9015 }, { "epoch": 1.543042957384905, "grad_norm": 8.519033432006836, "learning_rate": 5.086305373432521e-07, "loss": 0.7035, "step": 9016 }, { "epoch": 1.543214102344686, "grad_norm": 14.213433265686035, "learning_rate": 5.056598928832596e-07, "loss": 1.1491, "step": 9017 }, { "epoch": 1.543385247304467, "grad_norm": 28.79752540588379, "learning_rate": 5.026978001055177e-07, "loss": 5.7168, "step": 9018 }, { "epoch": 1.543556392264248, "grad_norm": 13.324151039123535, "learning_rate": 4.997442607576702e-07, "loss": 1.096, "step": 9019 }, { "epoch": 1.543727537224029, "grad_norm": 49.05936813354492, "learning_rate": 4.967992765823031e-07, "loss": 7.0687, "step": 9020 }, { "epoch": 1.5438986821838097, "grad_norm": 19.397933959960938, "learning_rate": 4.938628493169667e-07, "loss": 2.2921, "step": 9021 }, { "epoch": 1.5440698271435906, "grad_norm": 0.4835865795612335, "learning_rate": 4.909349806941549e-07, "loss": 0.1259, "step": 9022 }, { "epoch": 
1.5442409721033714, "grad_norm": 15.644149780273438, "learning_rate": 4.880156724413142e-07, "loss": 1.0687, "step": 9023 }, { "epoch": 1.5444121170631524, "grad_norm": 25.968355178833008, "learning_rate": 4.851049262808438e-07, "loss": 3.7607, "step": 9024 }, { "epoch": 1.5445832620229334, "grad_norm": 0.43970176577568054, "learning_rate": 4.822027439300846e-07, "loss": 0.1247, "step": 9025 }, { "epoch": 1.5447544069827144, "grad_norm": 0.43352463841438293, "learning_rate": 4.793091271013289e-07, "loss": 0.1245, "step": 9026 }, { "epoch": 1.5449255519424954, "grad_norm": 5.176083564758301, "learning_rate": 4.7642407750181916e-07, "loss": 0.4381, "step": 9027 }, { "epoch": 1.5450966969022764, "grad_norm": 51.364959716796875, "learning_rate": 4.735475968337338e-07, "loss": 6.9617, "step": 9028 }, { "epoch": 1.5452678418620571, "grad_norm": 15.569823265075684, "learning_rate": 4.7067968679420536e-07, "loss": 1.624, "step": 9029 }, { "epoch": 1.5454389868218381, "grad_norm": 10.124935150146484, "learning_rate": 4.6782034907530023e-07, "loss": 0.7103, "step": 9030 }, { "epoch": 1.5456101317816189, "grad_norm": 10.038396835327148, "learning_rate": 4.6496958536403564e-07, "loss": 0.7724, "step": 9031 }, { "epoch": 1.5457812767413999, "grad_norm": 3.342200756072998, "learning_rate": 4.621273973423629e-07, "loss": 0.2917, "step": 9032 }, { "epoch": 1.5459524217011809, "grad_norm": 0.328497976064682, "learning_rate": 4.5929378668718216e-07, "loss": 0.1122, "step": 9033 }, { "epoch": 1.5461235666609618, "grad_norm": 17.18323516845703, "learning_rate": 4.564687550703245e-07, "loss": 1.3571, "step": 9034 }, { "epoch": 1.5462947116207428, "grad_norm": 13.260428428649902, "learning_rate": 4.5365230415856164e-07, "loss": 0.9791, "step": 9035 }, { "epoch": 1.5464658565805238, "grad_norm": 11.061239242553711, "learning_rate": 4.508444356136077e-07, "loss": 0.6992, "step": 9036 }, { "epoch": 1.5466370015403046, "grad_norm": 11.573921203613281, "learning_rate": 4.480451510921024e-07, 
"loss": 1.0988, "step": 9037 }, { "epoch": 1.5468081465000856, "grad_norm": 5.063563346862793, "learning_rate": 4.4525445224563643e-07, "loss": 0.5707, "step": 9038 }, { "epoch": 1.5469792914598663, "grad_norm": 16.314910888671875, "learning_rate": 4.4247234072072264e-07, "loss": 1.3021, "step": 9039 }, { "epoch": 1.5471504364196473, "grad_norm": 15.41794204711914, "learning_rate": 4.39698818158808e-07, "loss": 1.7118, "step": 9040 }, { "epoch": 1.5473215813794283, "grad_norm": 19.005983352661133, "learning_rate": 4.3693388619628183e-07, "loss": 1.8426, "step": 9041 }, { "epoch": 1.5474927263392093, "grad_norm": 0.3438902795314789, "learning_rate": 4.3417754646445096e-07, "loss": 0.1147, "step": 9042 }, { "epoch": 1.5476638712989903, "grad_norm": 18.400978088378906, "learning_rate": 4.3142980058956284e-07, "loss": 2.0282, "step": 9043 }, { "epoch": 1.5478350162587713, "grad_norm": 17.359830856323242, "learning_rate": 4.2869065019279395e-07, "loss": 2.0501, "step": 9044 }, { "epoch": 1.5480061612185523, "grad_norm": 12.6216459274292, "learning_rate": 4.2596009689024165e-07, "loss": 1.1233, "step": 9045 }, { "epoch": 1.548177306178333, "grad_norm": 27.768497467041016, "learning_rate": 4.232381422929421e-07, "loss": 5.2883, "step": 9046 }, { "epoch": 1.548348451138114, "grad_norm": 12.217334747314453, "learning_rate": 4.205247880068475e-07, "loss": 0.807, "step": 9047 }, { "epoch": 1.5485195960978948, "grad_norm": 11.662314414978027, "learning_rate": 4.178200356328454e-07, "loss": 1.0757, "step": 9048 }, { "epoch": 1.5486907410576758, "grad_norm": 7.625820636749268, "learning_rate": 4.151238867667412e-07, "loss": 0.9424, "step": 9049 }, { "epoch": 1.5488618860174568, "grad_norm": 12.78079891204834, "learning_rate": 4.1243634299926724e-07, "loss": 0.9648, "step": 9050 }, { "epoch": 1.5490330309772378, "grad_norm": 1.5654774904251099, "learning_rate": 4.0975740591608026e-07, "loss": 0.2126, "step": 9051 }, { "epoch": 1.5492041759370188, "grad_norm": 9.263545989990234, 
"learning_rate": 4.070870770977558e-07, "loss": 0.9133, "step": 9052 }, { "epoch": 1.5493753208967997, "grad_norm": 9.332427978515625, "learning_rate": 4.0442535811978875e-07, "loss": 0.7466, "step": 9053 }, { "epoch": 1.5495464658565805, "grad_norm": 4.035463809967041, "learning_rate": 4.0177225055260116e-07, "loss": 0.4327, "step": 9054 }, { "epoch": 1.5497176108163615, "grad_norm": 11.780211448669434, "learning_rate": 3.9912775596152915e-07, "loss": 0.7604, "step": 9055 }, { "epoch": 1.5498887557761423, "grad_norm": 0.3255310654640198, "learning_rate": 3.9649187590682977e-07, "loss": 0.1193, "step": 9056 }, { "epoch": 1.5500599007359233, "grad_norm": 10.761378288269043, "learning_rate": 3.938646119436723e-07, "loss": 0.9545, "step": 9057 }, { "epoch": 1.5502310456957042, "grad_norm": 11.020126342773438, "learning_rate": 3.912459656221501e-07, "loss": 1.156, "step": 9058 }, { "epoch": 1.5504021906554852, "grad_norm": 55.7742805480957, "learning_rate": 3.886359384872673e-07, "loss": 7.0279, "step": 9059 }, { "epoch": 1.5505733356152662, "grad_norm": 12.48130989074707, "learning_rate": 3.860345320789438e-07, "loss": 1.145, "step": 9060 }, { "epoch": 1.5507444805750472, "grad_norm": 19.796613693237305, "learning_rate": 3.8344174793201516e-07, "loss": 2.2652, "step": 9061 }, { "epoch": 1.550915625534828, "grad_norm": 15.494932174682617, "learning_rate": 3.8085758757622266e-07, "loss": 1.7837, "step": 9062 }, { "epoch": 1.551086770494609, "grad_norm": 2.1847872734069824, "learning_rate": 3.7828205253623184e-07, "loss": 0.1644, "step": 9063 }, { "epoch": 1.5512579154543897, "grad_norm": 28.286035537719727, "learning_rate": 3.7571514433160704e-07, "loss": 4.9784, "step": 9064 }, { "epoch": 1.5514290604141707, "grad_norm": 21.072509765625, "learning_rate": 3.731568644768285e-07, "loss": 2.5734, "step": 9065 }, { "epoch": 1.5516002053739517, "grad_norm": 15.462085723876953, "learning_rate": 3.706072144812872e-07, "loss": 1.3656, "step": 9066 }, { "epoch": 
1.5517713503337327, "grad_norm": 22.0567626953125, "learning_rate": 3.6806619584927817e-07, "loss": 1.9571, "step": 9067 }, { "epoch": 1.5519424952935137, "grad_norm": 26.102827072143555, "learning_rate": 3.655338100800071e-07, "loss": 5.0647, "step": 9068 }, { "epoch": 1.5521136402532947, "grad_norm": 13.920963287353516, "learning_rate": 3.6301005866758554e-07, "loss": 0.8594, "step": 9069 }, { "epoch": 1.5522847852130754, "grad_norm": 9.579822540283203, "learning_rate": 3.604949431010307e-07, "loss": 0.7445, "step": 9070 }, { "epoch": 1.5524559301728564, "grad_norm": 10.131430625915527, "learning_rate": 3.579884648642656e-07, "loss": 0.5885, "step": 9071 }, { "epoch": 1.5526270751326372, "grad_norm": 10.107502937316895, "learning_rate": 3.5549062543611234e-07, "loss": 0.9457, "step": 9072 }, { "epoch": 1.5527982200924182, "grad_norm": 11.525973320007324, "learning_rate": 3.530014262903053e-07, "loss": 0.8199, "step": 9073 }, { "epoch": 1.5529693650521992, "grad_norm": 0.5257748365402222, "learning_rate": 3.505208688954731e-07, "loss": 0.1329, "step": 9074 }, { "epoch": 1.5531405100119802, "grad_norm": 11.91433048248291, "learning_rate": 3.4804895471514665e-07, "loss": 1.1937, "step": 9075 }, { "epoch": 1.5533116549717612, "grad_norm": 11.733511924743652, "learning_rate": 3.45585685207761e-07, "loss": 0.9042, "step": 9076 }, { "epoch": 1.5534827999315421, "grad_norm": 1.6790891885757446, "learning_rate": 3.43131061826647e-07, "loss": 0.2354, "step": 9077 }, { "epoch": 1.553653944891323, "grad_norm": 15.655911445617676, "learning_rate": 3.4068508602003776e-07, "loss": 1.428, "step": 9078 }, { "epoch": 1.553825089851104, "grad_norm": 10.057908058166504, "learning_rate": 3.3824775923106066e-07, "loss": 0.983, "step": 9079 }, { "epoch": 1.5539962348108847, "grad_norm": 10.651629447937012, "learning_rate": 3.3581908289774534e-07, "loss": 0.9995, "step": 9080 }, { "epoch": 1.5541673797706657, "grad_norm": 0.4460441768169403, "learning_rate": 3.333990584530139e-07, 
"loss": 0.1245, "step": 9081 }, { "epoch": 1.5543385247304466, "grad_norm": 15.128761291503906, "learning_rate": 3.3098768732468086e-07, "loss": 1.2171, "step": 9082 }, { "epoch": 1.5545096696902276, "grad_norm": 16.820087432861328, "learning_rate": 3.285849709354649e-07, "loss": 1.2381, "step": 9083 }, { "epoch": 1.5546808146500086, "grad_norm": 15.274602890014648, "learning_rate": 3.2619091070296705e-07, "loss": 1.4468, "step": 9084 }, { "epoch": 1.5548519596097896, "grad_norm": 23.205049514770508, "learning_rate": 3.2380550803968566e-07, "loss": 2.1682, "step": 9085 }, { "epoch": 1.5550231045695704, "grad_norm": 0.39580458402633667, "learning_rate": 3.214287643530167e-07, "loss": 0.1206, "step": 9086 }, { "epoch": 1.5551942495293514, "grad_norm": 4.077263355255127, "learning_rate": 3.190606810452384e-07, "loss": 0.2675, "step": 9087 }, { "epoch": 1.5553653944891321, "grad_norm": 42.22677993774414, "learning_rate": 3.1670125951352657e-07, "loss": 7.2555, "step": 9088 }, { "epoch": 1.5555365394489131, "grad_norm": 20.885204315185547, "learning_rate": 3.143505011499409e-07, "loss": 2.7388, "step": 9089 }, { "epoch": 1.5557076844086941, "grad_norm": 17.681625366210938, "learning_rate": 3.1200840734143034e-07, "loss": 1.6263, "step": 9090 }, { "epoch": 1.555878829368475, "grad_norm": 20.130407333374023, "learning_rate": 3.096749794698395e-07, "loss": 1.6893, "step": 9091 }, { "epoch": 1.556049974328256, "grad_norm": 26.360910415649414, "learning_rate": 3.0735021891189207e-07, "loss": 5.2415, "step": 9092 }, { "epoch": 1.556221119288037, "grad_norm": 16.412033081054688, "learning_rate": 3.050341270391993e-07, "loss": 1.4471, "step": 9093 }, { "epoch": 1.5563922642478178, "grad_norm": 16.338489532470703, "learning_rate": 3.027267052182597e-07, "loss": 2.1344, "step": 9094 }, { "epoch": 1.5565634092075988, "grad_norm": 22.774322509765625, "learning_rate": 3.004279548104527e-07, "loss": 3.2706, "step": 9095 }, { "epoch": 1.5567345541673798, "grad_norm": 
6.465914249420166, "learning_rate": 2.9813787717204835e-07, "loss": 0.4291, "step": 9096 }, { "epoch": 1.5569056991271606, "grad_norm": 9.604875564575195, "learning_rate": 2.958564736541941e-07, "loss": 0.7485, "step": 9097 }, { "epoch": 1.5570768440869416, "grad_norm": 17.393922805786133, "learning_rate": 2.935837456029217e-07, "loss": 1.34, "step": 9098 }, { "epoch": 1.5572479890467226, "grad_norm": 9.411694526672363, "learning_rate": 2.913196943591451e-07, "loss": 0.6584, "step": 9099 }, { "epoch": 1.5574191340065036, "grad_norm": 12.240625381469727, "learning_rate": 2.8906432125865414e-07, "loss": 0.7927, "step": 9100 }, { "epoch": 1.5575902789662845, "grad_norm": 3.7977750301361084, "learning_rate": 2.8681762763212607e-07, "loss": 0.2675, "step": 9101 }, { "epoch": 1.5577614239260655, "grad_norm": 0.38975921273231506, "learning_rate": 2.845796148051122e-07, "loss": 0.1216, "step": 9102 }, { "epoch": 1.5579325688858463, "grad_norm": 21.20785140991211, "learning_rate": 2.8235028409804465e-07, "loss": 1.7529, "step": 9103 }, { "epoch": 1.5581037138456273, "grad_norm": 46.74541091918945, "learning_rate": 2.8012963682623126e-07, "loss": 8.0632, "step": 9104 }, { "epoch": 1.558274858805408, "grad_norm": 2.6074249744415283, "learning_rate": 2.779176742998557e-07, "loss": 0.2467, "step": 9105 }, { "epoch": 1.558446003765189, "grad_norm": 11.43459701538086, "learning_rate": 2.7571439782398067e-07, "loss": 0.9211, "step": 9106 }, { "epoch": 1.55861714872497, "grad_norm": 13.257878303527832, "learning_rate": 2.735198086985413e-07, "loss": 0.9962, "step": 9107 }, { "epoch": 1.558788293684751, "grad_norm": 10.287236213684082, "learning_rate": 2.7133390821835194e-07, "loss": 0.7293, "step": 9108 }, { "epoch": 1.558959438644532, "grad_norm": 14.681013107299805, "learning_rate": 2.691566976730908e-07, "loss": 1.0184, "step": 9109 }, { "epoch": 1.559130583604313, "grad_norm": 12.030794143676758, "learning_rate": 2.6698817834732215e-07, "loss": 0.8029, "step": 9110 }, { 
"epoch": 1.5593017285640938, "grad_norm": 4.513367176055908, "learning_rate": 2.648283515204708e-07, "loss": 0.5593, "step": 9111 }, { "epoch": 1.5594728735238748, "grad_norm": 1.7358667850494385, "learning_rate": 2.626772184668391e-07, "loss": 0.1867, "step": 9112 }, { "epoch": 1.5596440184836555, "grad_norm": 1.6628427505493164, "learning_rate": 2.605347804556002e-07, "loss": 0.2171, "step": 9113 }, { "epoch": 1.5598151634434365, "grad_norm": 1.8830827474594116, "learning_rate": 2.584010387507929e-07, "loss": 0.2699, "step": 9114 }, { "epoch": 1.5599863084032175, "grad_norm": 17.277284622192383, "learning_rate": 2.562759946113319e-07, "loss": 2.1309, "step": 9115 }, { "epoch": 1.5601574533629985, "grad_norm": 41.75537872314453, "learning_rate": 2.5415964929099424e-07, "loss": 6.855, "step": 9116 }, { "epoch": 1.5603285983227795, "grad_norm": 2.476348876953125, "learning_rate": 2.520520040384261e-07, "loss": 0.258, "step": 9117 }, { "epoch": 1.5604997432825605, "grad_norm": 8.988431930541992, "learning_rate": 2.499530600971428e-07, "loss": 0.7388, "step": 9118 }, { "epoch": 1.5606708882423412, "grad_norm": 2.866429328918457, "learning_rate": 2.4786281870552207e-07, "loss": 0.3374, "step": 9119 }, { "epoch": 1.5608420332021222, "grad_norm": 0.4307229220867157, "learning_rate": 2.4578128109681243e-07, "loss": 0.1224, "step": 9120 }, { "epoch": 1.561013178161903, "grad_norm": 12.526897430419922, "learning_rate": 2.4370844849912145e-07, "loss": 1.0513, "step": 9121 }, { "epoch": 1.561184323121684, "grad_norm": 3.3244662284851074, "learning_rate": 2.416443221354242e-07, "loss": 0.3126, "step": 9122 }, { "epoch": 1.561355468081465, "grad_norm": 6.976736545562744, "learning_rate": 2.395889032235599e-07, "loss": 0.4756, "step": 9123 }, { "epoch": 1.561526613041246, "grad_norm": 1.6432287693023682, "learning_rate": 2.3754219297623004e-07, "loss": 0.2264, "step": 9124 }, { "epoch": 1.561697758001027, "grad_norm": 24.468223571777344, "learning_rate": 2.3550419260099044e-07, 
"loss": 5.1067, "step": 9125 }, { "epoch": 1.561868902960808, "grad_norm": 17.691425323486328, "learning_rate": 2.334749033002709e-07, "loss": 1.6366, "step": 9126 }, { "epoch": 1.5620400479205887, "grad_norm": 11.790969848632812, "learning_rate": 2.31454326271352e-07, "loss": 1.0218, "step": 9127 }, { "epoch": 1.5622111928803697, "grad_norm": 13.224206924438477, "learning_rate": 2.2944246270637847e-07, "loss": 1.0546, "step": 9128 }, { "epoch": 1.5623823378401505, "grad_norm": 14.788132667541504, "learning_rate": 2.2743931379235083e-07, "loss": 1.0561, "step": 9129 }, { "epoch": 1.5625534827999314, "grad_norm": 13.581404685974121, "learning_rate": 2.25444880711132e-07, "loss": 1.0214, "step": 9130 }, { "epoch": 1.5627246277597124, "grad_norm": 5.454602241516113, "learning_rate": 2.234591646394407e-07, "loss": 0.561, "step": 9131 }, { "epoch": 1.5628957727194934, "grad_norm": 13.103479385375977, "learning_rate": 2.2148216674884813e-07, "loss": 1.1556, "step": 9132 }, { "epoch": 1.5630669176792744, "grad_norm": 0.9748165607452393, "learning_rate": 2.1951388820578955e-07, "loss": 0.2129, "step": 9133 }, { "epoch": 1.5632380626390554, "grad_norm": 9.348187446594238, "learning_rate": 2.1755433017155434e-07, "loss": 0.8327, "step": 9134 }, { "epoch": 1.5634092075988362, "grad_norm": 15.741958618164062, "learning_rate": 2.1560349380228106e-07, "loss": 1.4761, "step": 9135 }, { "epoch": 1.5635803525586172, "grad_norm": 13.901618957519531, "learning_rate": 2.136613802489673e-07, "loss": 1.2376, "step": 9136 }, { "epoch": 1.563751497518398, "grad_norm": 12.563179016113281, "learning_rate": 2.1172799065746318e-07, "loss": 1.2446, "step": 9137 }, { "epoch": 1.563922642478179, "grad_norm": 18.04777717590332, "learning_rate": 2.098033261684712e-07, "loss": 2.3108, "step": 9138 }, { "epoch": 1.56409378743796, "grad_norm": 4.525385856628418, "learning_rate": 2.0788738791754636e-07, "loss": 0.2771, "step": 9139 }, { "epoch": 1.564264932397741, "grad_norm": 12.480391502380371, 
"learning_rate": 2.0598017703509276e-07, "loss": 1.1999, "step": 9140 }, { "epoch": 1.5644360773575219, "grad_norm": 17.527063369750977, "learning_rate": 2.0408169464637194e-07, "loss": 2.02, "step": 9141 }, { "epoch": 1.5646072223173029, "grad_norm": 12.687166213989258, "learning_rate": 2.0219194187148794e-07, "loss": 1.0058, "step": 9142 }, { "epoch": 1.5647783672770836, "grad_norm": 17.169246673583984, "learning_rate": 2.0031091982540052e-07, "loss": 1.3614, "step": 9143 }, { "epoch": 1.5649495122368646, "grad_norm": 22.947132110595703, "learning_rate": 1.984386296179136e-07, "loss": 2.0893, "step": 9144 }, { "epoch": 1.5651206571966454, "grad_norm": 11.637513160705566, "learning_rate": 1.9657507235368022e-07, "loss": 0.8511, "step": 9145 }, { "epoch": 1.5652918021564264, "grad_norm": 17.147573471069336, "learning_rate": 1.947202491322042e-07, "loss": 1.3839, "step": 9146 }, { "epoch": 1.5654629471162074, "grad_norm": 6.268093109130859, "learning_rate": 1.9287416104783183e-07, "loss": 0.4053, "step": 9147 }, { "epoch": 1.5656340920759884, "grad_norm": 5.430019378662109, "learning_rate": 1.9103680918976186e-07, "loss": 0.2991, "step": 9148 }, { "epoch": 1.5658052370357693, "grad_norm": 20.10877799987793, "learning_rate": 1.8920819464202876e-07, "loss": 2.8672, "step": 9149 }, { "epoch": 1.5659763819955503, "grad_norm": 12.960602760314941, "learning_rate": 1.8738831848352288e-07, "loss": 1.5353, "step": 9150 }, { "epoch": 1.5661475269553313, "grad_norm": 21.424409866333008, "learning_rate": 1.85577181787972e-07, "loss": 2.3259, "step": 9151 }, { "epoch": 1.566318671915112, "grad_norm": 18.28989601135254, "learning_rate": 1.83774785623948e-07, "loss": 1.5165, "step": 9152 }, { "epoch": 1.566489816874893, "grad_norm": 16.376277923583984, "learning_rate": 1.8198113105487023e-07, "loss": 2.0183, "step": 9153 }, { "epoch": 1.5666609618346738, "grad_norm": 12.143623352050781, "learning_rate": 1.8019621913899388e-07, "loss": 1.2175, "step": 9154 }, { "epoch": 
1.5668321067944548, "grad_norm": 48.04972457885742, "learning_rate": 1.7842005092942316e-07, "loss": 6.9832, "step": 9155 }, { "epoch": 1.5670032517542358, "grad_norm": 9.188867568969727, "learning_rate": 1.7665262747409817e-07, "loss": 0.8427, "step": 9156 }, { "epoch": 1.5671743967140168, "grad_norm": 15.921174049377441, "learning_rate": 1.7489394981580142e-07, "loss": 1.2193, "step": 9157 }, { "epoch": 1.5673455416737978, "grad_norm": 15.355498313903809, "learning_rate": 1.7314401899215626e-07, "loss": 1.0528, "step": 9158 }, { "epoch": 1.5675166866335788, "grad_norm": 10.517993927001953, "learning_rate": 1.7140283603562346e-07, "loss": 0.7444, "step": 9159 }, { "epoch": 1.5676878315933596, "grad_norm": 22.72956657409668, "learning_rate": 1.6967040197350625e-07, "loss": 2.5323, "step": 9160 }, { "epoch": 1.5678589765531405, "grad_norm": 7.676398754119873, "learning_rate": 1.6794671782793703e-07, "loss": 0.561, "step": 9161 }, { "epoch": 1.5680301215129213, "grad_norm": 10.607348442077637, "learning_rate": 1.6623178461589895e-07, "loss": 0.7477, "step": 9162 }, { "epoch": 1.5682012664727023, "grad_norm": 15.686529159545898, "learning_rate": 1.6452560334920264e-07, "loss": 1.8269, "step": 9163 }, { "epoch": 1.5683724114324833, "grad_norm": 0.43131089210510254, "learning_rate": 1.6282817503449455e-07, "loss": 0.1279, "step": 9164 }, { "epoch": 1.5685435563922643, "grad_norm": 0.8169675469398499, "learning_rate": 1.6113950067326354e-07, "loss": 0.1766, "step": 9165 }, { "epoch": 1.5687147013520453, "grad_norm": 14.726433753967285, "learning_rate": 1.5945958126182936e-07, "loss": 1.1996, "step": 9166 }, { "epoch": 1.5688858463118263, "grad_norm": 15.054525375366211, "learning_rate": 1.5778841779134579e-07, "loss": 1.2098, "step": 9167 }, { "epoch": 1.569056991271607, "grad_norm": 4.448679447174072, "learning_rate": 1.561260112478041e-07, "loss": 0.3657, "step": 9168 }, { "epoch": 1.569228136231388, "grad_norm": 13.440657615661621, "learning_rate": 
1.544723626120248e-07, "loss": 1.1184, "step": 9169 }, { "epoch": 1.5693992811911688, "grad_norm": 15.834989547729492, "learning_rate": 1.5282747285966402e-07, "loss": 1.1029, "step": 9170 }, { "epoch": 1.5695704261509498, "grad_norm": 0.8307080268859863, "learning_rate": 1.5119134296120718e-07, "loss": 0.1424, "step": 9171 }, { "epoch": 1.5697415711107308, "grad_norm": 6.559043884277344, "learning_rate": 1.495639738819754e-07, "loss": 0.754, "step": 9172 }, { "epoch": 1.5699127160705117, "grad_norm": 4.420048236846924, "learning_rate": 1.4794536658211733e-07, "loss": 0.5539, "step": 9173 }, { "epoch": 1.5700838610302927, "grad_norm": 15.232937812805176, "learning_rate": 1.463355220166157e-07, "loss": 1.9508, "step": 9174 }, { "epoch": 1.5702550059900737, "grad_norm": 20.91059684753418, "learning_rate": 1.4473444113527912e-07, "loss": 2.7112, "step": 9175 }, { "epoch": 1.5704261509498545, "grad_norm": 20.11099624633789, "learning_rate": 1.431421248827486e-07, "loss": 2.4943, "step": 9176 }, { "epoch": 1.5705972959096355, "grad_norm": 0.436890572309494, "learning_rate": 1.415585741984926e-07, "loss": 0.123, "step": 9177 }, { "epoch": 1.5707684408694162, "grad_norm": 10.478984832763672, "learning_rate": 1.399837900168105e-07, "loss": 0.8018, "step": 9178 }, { "epoch": 1.5709395858291972, "grad_norm": 10.540605545043945, "learning_rate": 1.3841777326682402e-07, "loss": 0.649, "step": 9179 }, { "epoch": 1.5711107307889782, "grad_norm": 2.26641845703125, "learning_rate": 1.3686052487248747e-07, "loss": 0.2539, "step": 9180 }, { "epoch": 1.5712818757487592, "grad_norm": 15.154268264770508, "learning_rate": 1.3531204575258082e-07, "loss": 1.422, "step": 9181 }, { "epoch": 1.5714530207085402, "grad_norm": 1.3577299118041992, "learning_rate": 1.33772336820705e-07, "loss": 0.1476, "step": 9182 }, { "epoch": 1.5716241656683212, "grad_norm": 10.19765567779541, "learning_rate": 1.3224139898529497e-07, "loss": 0.8337, "step": 9183 }, { "epoch": 1.571795310628102, "grad_norm": 
9.051892280578613, "learning_rate": 1.3071923314960488e-07, "loss": 0.5737, "step": 9184 }, { "epoch": 1.571966455587883, "grad_norm": 16.13500213623047, "learning_rate": 1.29205840211713e-07, "loss": 1.2261, "step": 9185 }, { "epoch": 1.5721376005476637, "grad_norm": 18.458324432373047, "learning_rate": 1.2770122106452508e-07, "loss": 1.6899, "step": 9186 }, { "epoch": 1.5723087455074447, "grad_norm": 6.380554676055908, "learning_rate": 1.2620537659576936e-07, "loss": 0.5684, "step": 9187 }, { "epoch": 1.5724798904672257, "grad_norm": 18.501176834106445, "learning_rate": 1.2471830768799486e-07, "loss": 1.9986, "step": 9188 }, { "epoch": 1.5726510354270067, "grad_norm": 12.987404823303223, "learning_rate": 1.2324001521857475e-07, "loss": 1.0716, "step": 9189 }, { "epoch": 1.5728221803867877, "grad_norm": 0.6026003360748291, "learning_rate": 1.2177050005970304e-07, "loss": 0.1293, "step": 9190 }, { "epoch": 1.5729933253465687, "grad_norm": 19.3843994140625, "learning_rate": 1.2030976307839947e-07, "loss": 1.9049, "step": 9191 }, { "epoch": 1.5731644703063494, "grad_norm": 1.1674511432647705, "learning_rate": 1.1885780513649635e-07, "loss": 0.1492, "step": 9192 }, { "epoch": 1.5733356152661304, "grad_norm": 63.58915710449219, "learning_rate": 1.1741462709065343e-07, "loss": 7.806, "step": 9193 }, { "epoch": 1.5735067602259112, "grad_norm": 13.459197044372559, "learning_rate": 1.159802297923479e-07, "loss": 0.9621, "step": 9194 }, { "epoch": 1.5736779051856922, "grad_norm": 10.171172142028809, "learning_rate": 1.1455461408787449e-07, "loss": 0.91, "step": 9195 }, { "epoch": 1.5738490501454732, "grad_norm": 18.478303909301758, "learning_rate": 1.1313778081835203e-07, "loss": 1.4514, "step": 9196 }, { "epoch": 1.5740201951052541, "grad_norm": 9.266839027404785, "learning_rate": 1.1172973081971016e-07, "loss": 0.7373, "step": 9197 }, { "epoch": 1.5741913400650351, "grad_norm": 6.405694961547852, "learning_rate": 1.1033046492270438e-07, "loss": 0.4903, "step": 9198 }, { 
"epoch": 1.5743624850248161, "grad_norm": 13.584548950195312, "learning_rate": 1.0893998395290095e-07, "loss": 1.0092, "step": 9199 }, { "epoch": 1.5745336299845971, "grad_norm": 12.762123107910156, "learning_rate": 1.0755828873068697e-07, "loss": 0.9868, "step": 9200 }, { "epoch": 1.5747047749443779, "grad_norm": 17.396408081054688, "learning_rate": 1.0618538007126533e-07, "loss": 2.0173, "step": 9201 }, { "epoch": 1.5748759199041589, "grad_norm": 17.993640899658203, "learning_rate": 1.0482125878465142e-07, "loss": 1.8555, "step": 9202 }, { "epoch": 1.5750470648639396, "grad_norm": 5.654393196105957, "learning_rate": 1.0346592567568313e-07, "loss": 0.5871, "step": 9203 }, { "epoch": 1.5752182098237206, "grad_norm": 0.38312870264053345, "learning_rate": 1.0211938154400413e-07, "loss": 0.1198, "step": 9204 }, { "epoch": 1.5753893547835016, "grad_norm": 2.9140939712524414, "learning_rate": 1.0078162718408057e-07, "loss": 0.2456, "step": 9205 }, { "epoch": 1.5755604997432826, "grad_norm": 29.00325584411621, "learning_rate": 9.945266338518778e-08, "loss": 5.2685, "step": 9206 }, { "epoch": 1.5757316447030636, "grad_norm": 79.38115692138672, "learning_rate": 9.813249093141851e-08, "loss": 8.2326, "step": 9207 }, { "epoch": 1.5759027896628446, "grad_norm": 11.39216136932373, "learning_rate": 9.682111060167476e-08, "loss": 0.828, "step": 9208 }, { "epoch": 1.5760739346226253, "grad_norm": 1.7181997299194336, "learning_rate": 9.551852316967258e-08, "loss": 0.157, "step": 9209 }, { "epoch": 1.5762450795824063, "grad_norm": 22.244922637939453, "learning_rate": 9.422472940394223e-08, "loss": 4.8658, "step": 9210 }, { "epoch": 1.576416224542187, "grad_norm": 9.702967643737793, "learning_rate": 9.293973006782308e-08, "loss": 0.7911, "step": 9211 }, { "epoch": 1.576587369501968, "grad_norm": 9.212658882141113, "learning_rate": 9.16635259194687e-08, "loss": 0.6724, "step": 9212 }, { "epoch": 1.576758514461749, "grad_norm": 12.398530006408691, "learning_rate": 
9.039611771183676e-08, "loss": 1.0109, "step": 9213 }, { "epoch": 1.57692965942153, "grad_norm": 15.223182678222656, "learning_rate": 8.913750619270411e-08, "loss": 1.1652, "step": 9214 }, { "epoch": 1.577100804381311, "grad_norm": 9.750580787658691, "learning_rate": 8.788769210465175e-08, "loss": 0.6868, "step": 9215 }, { "epoch": 1.577271949341092, "grad_norm": 13.471878051757812, "learning_rate": 8.664667618506983e-08, "loss": 1.0171, "step": 9216 }, { "epoch": 1.5774430943008728, "grad_norm": 9.185175895690918, "learning_rate": 8.54144591661643e-08, "loss": 0.7606, "step": 9217 }, { "epoch": 1.5776142392606538, "grad_norm": 1.1860450506210327, "learning_rate": 8.419104177494197e-08, "loss": 0.1466, "step": 9218 }, { "epoch": 1.5777853842204346, "grad_norm": 0.3975621461868286, "learning_rate": 8.297642473322043e-08, "loss": 0.1287, "step": 9219 }, { "epoch": 1.5779565291802156, "grad_norm": 10.542242050170898, "learning_rate": 8.17706087576281e-08, "loss": 0.8654, "step": 9220 }, { "epoch": 1.5781276741399966, "grad_norm": 17.912885665893555, "learning_rate": 8.05735945595959e-08, "loss": 2.2477, "step": 9221 }, { "epoch": 1.5782988190997775, "grad_norm": 10.726658821105957, "learning_rate": 7.93853828453639e-08, "loss": 0.9743, "step": 9222 }, { "epoch": 1.5784699640595585, "grad_norm": 0.39528200030326843, "learning_rate": 7.82059743159813e-08, "loss": 0.1158, "step": 9223 }, { "epoch": 1.5786411090193395, "grad_norm": 5.554496765136719, "learning_rate": 7.703536966729818e-08, "loss": 0.3752, "step": 9224 }, { "epoch": 1.5788122539791203, "grad_norm": 11.4022798538208, "learning_rate": 7.587356958997538e-08, "loss": 1.0386, "step": 9225 }, { "epoch": 1.5789833989389013, "grad_norm": 26.511320114135742, "learning_rate": 7.472057476947458e-08, "loss": 1.1933, "step": 9226 }, { "epoch": 1.579154543898682, "grad_norm": 2.7909345626831055, "learning_rate": 7.357638588606497e-08, "loss": 0.262, "step": 9227 }, { "epoch": 1.579325688858463, "grad_norm": 
18.85427474975586, "learning_rate": 7.244100361482153e-08, "loss": 2.0454, "step": 9228 }, { "epoch": 1.579496833818244, "grad_norm": 20.165956497192383, "learning_rate": 7.131442862561843e-08, "loss": 2.3284, "step": 9229 }, { "epoch": 1.579667978778025, "grad_norm": 16.313133239746094, "learning_rate": 7.019666158313898e-08, "loss": 1.2983, "step": 9230 }, { "epoch": 1.579839123737806, "grad_norm": 6.817485809326172, "learning_rate": 6.908770314686564e-08, "loss": 0.597, "step": 9231 }, { "epoch": 1.580010268697587, "grad_norm": 13.1300630569458, "learning_rate": 6.798755397108669e-08, "loss": 0.8611, "step": 9232 }, { "epoch": 1.5801814136573678, "grad_norm": 10.890022277832031, "learning_rate": 6.689621470489126e-08, "loss": 0.8722, "step": 9233 }, { "epoch": 1.5803525586171487, "grad_norm": 8.250706672668457, "learning_rate": 6.581368599217263e-08, "loss": 0.6064, "step": 9234 }, { "epoch": 1.5805237035769295, "grad_norm": 12.516151428222656, "learning_rate": 6.473996847162155e-08, "loss": 0.9802, "step": 9235 }, { "epoch": 1.5806948485367105, "grad_norm": 0.46432220935821533, "learning_rate": 6.367506277673629e-08, "loss": 0.1234, "step": 9236 }, { "epoch": 1.5808659934964915, "grad_norm": 3.919583320617676, "learning_rate": 6.261896953580925e-08, "loss": 0.3469, "step": 9237 }, { "epoch": 1.5810371384562725, "grad_norm": 2.809959888458252, "learning_rate": 6.157168937194036e-08, "loss": 0.2253, "step": 9238 }, { "epoch": 1.5812082834160535, "grad_norm": 1.967377781867981, "learning_rate": 6.053322290302365e-08, "loss": 0.2454, "step": 9239 }, { "epoch": 1.5813794283758345, "grad_norm": 17.481233596801758, "learning_rate": 5.950357074175738e-08, "loss": 1.9165, "step": 9240 }, { "epoch": 1.5815505733356152, "grad_norm": 13.040780067443848, "learning_rate": 5.8482733495638926e-08, "loss": 0.9406, "step": 9241 }, { "epoch": 1.5817217182953962, "grad_norm": 0.3521684408187866, "learning_rate": 5.74707117669615e-08, "loss": 0.1231, "step": 9242 }, { "epoch": 
1.581892863255177, "grad_norm": 1.2833713293075562, "learning_rate": 5.646750615282081e-08, "loss": 0.2104, "step": 9243 }, { "epoch": 1.582064008214958, "grad_norm": 0.39002057909965515, "learning_rate": 5.5473117245108396e-08, "loss": 0.1267, "step": 9244 }, { "epoch": 1.582235153174739, "grad_norm": 4.821841239929199, "learning_rate": 5.448754563051661e-08, "loss": 0.4826, "step": 9245 }, { "epoch": 1.58240629813452, "grad_norm": 11.261731147766113, "learning_rate": 5.351079189053365e-08, "loss": 0.85, "step": 9246 }, { "epoch": 1.582577443094301, "grad_norm": 16.655244827270508, "learning_rate": 5.2542856601446865e-08, "loss": 1.5897, "step": 9247 }, { "epoch": 1.582748588054082, "grad_norm": 8.489250183105469, "learning_rate": 5.1583740334336104e-08, "loss": 0.5922, "step": 9248 }, { "epoch": 1.582919733013863, "grad_norm": 15.553121566772461, "learning_rate": 5.0633443655085375e-08, "loss": 1.8856, "step": 9249 }, { "epoch": 1.5830908779736437, "grad_norm": 18.76203727722168, "learning_rate": 4.9691967124367854e-08, "loss": 1.4616, "step": 9250 }, { "epoch": 1.5832620229334247, "grad_norm": 13.547760963439941, "learning_rate": 4.875931129765754e-08, "loss": 0.8983, "step": 9251 }, { "epoch": 1.5834331678932054, "grad_norm": 2.6911144256591797, "learning_rate": 4.783547672522426e-08, "loss": 0.2377, "step": 9252 }, { "epoch": 1.5836043128529864, "grad_norm": 13.054913520812988, "learning_rate": 4.6920463952130345e-08, "loss": 1.11, "step": 9253 }, { "epoch": 1.5837754578127674, "grad_norm": 26.10577392578125, "learning_rate": 4.601427351823395e-08, "loss": 5.0177, "step": 9254 }, { "epoch": 1.5839466027725484, "grad_norm": 32.46567153930664, "learning_rate": 4.5116905958190734e-08, "loss": 5.8632, "step": 9255 }, { "epoch": 1.5841177477323294, "grad_norm": 18.95955467224121, "learning_rate": 4.422836180144552e-08, "loss": 2.134, "step": 9256 }, { "epoch": 1.5842888926921104, "grad_norm": 5.812067031860352, "learning_rate": 4.334864157224394e-08, "loss": 
0.3365, "step": 9257 }, { "epoch": 1.5844600376518911, "grad_norm": 10.714271545410156, "learning_rate": 4.247774578962249e-08, "loss": 0.8063, "step": 9258 }, { "epoch": 1.5846311826116721, "grad_norm": 17.015493392944336, "learning_rate": 4.161567496740848e-08, "loss": 1.4556, "step": 9259 }, { "epoch": 1.584802327571453, "grad_norm": 13.046643257141113, "learning_rate": 4.0762429614226715e-08, "loss": 1.1431, "step": 9260 }, { "epoch": 1.5849734725312339, "grad_norm": 5.430252552032471, "learning_rate": 3.9918010233491174e-08, "loss": 0.3494, "step": 9261 }, { "epoch": 1.5851446174910149, "grad_norm": 4.71012544631958, "learning_rate": 3.908241732341167e-08, "loss": 0.4533, "step": 9262 }, { "epoch": 1.5853157624507959, "grad_norm": 17.6203670501709, "learning_rate": 3.825565137699216e-08, "loss": 0.9965, "step": 9263 }, { "epoch": 1.5854869074105769, "grad_norm": 15.655959129333496, "learning_rate": 3.74377128820208e-08, "loss": 1.4464, "step": 9264 }, { "epoch": 1.5856580523703578, "grad_norm": 17.088407516479492, "learning_rate": 3.6628602321086555e-08, "loss": 1.3738, "step": 9265 }, { "epoch": 1.5858291973301386, "grad_norm": 3.8679826259613037, "learning_rate": 3.582832017156423e-08, "loss": 0.3438, "step": 9266 }, { "epoch": 1.5860003422899196, "grad_norm": 18.10364532470703, "learning_rate": 3.503686690562113e-08, "loss": 2.2398, "step": 9267 }, { "epoch": 1.5861714872497004, "grad_norm": 12.124114990234375, "learning_rate": 3.4254242990217066e-08, "loss": 0.9377, "step": 9268 }, { "epoch": 1.5863426322094814, "grad_norm": 14.635051727294922, "learning_rate": 3.3480448887099336e-08, "loss": 1.3146, "step": 9269 }, { "epoch": 1.5865137771692623, "grad_norm": 3.200739860534668, "learning_rate": 3.271548505280941e-08, "loss": 0.2624, "step": 9270 }, { "epoch": 1.5866849221290433, "grad_norm": 11.197732925415039, "learning_rate": 3.195935193867627e-08, "loss": 0.9291, "step": 9271 }, { "epoch": 1.5868560670888243, "grad_norm": 15.519195556640625, 
"learning_rate": 3.121204999081972e-08, "loss": 1.1368, "step": 9272 }, { "epoch": 1.5870272120486053, "grad_norm": 12.560527801513672, "learning_rate": 3.047357965014874e-08, "loss": 0.9431, "step": 9273 }, { "epoch": 1.587198357008386, "grad_norm": 12.919147491455078, "learning_rate": 2.9743941352363134e-08, "loss": 0.9089, "step": 9274 }, { "epoch": 1.587369501968167, "grad_norm": 12.011872291564941, "learning_rate": 2.9023135527948553e-08, "loss": 0.8026, "step": 9275 }, { "epoch": 1.5875406469279478, "grad_norm": 11.193467140197754, "learning_rate": 2.8311162602184802e-08, "loss": 1.0585, "step": 9276 }, { "epoch": 1.5877117918877288, "grad_norm": 12.129524230957031, "learning_rate": 2.7608022995132544e-08, "loss": 0.8947, "step": 9277 }, { "epoch": 1.5878829368475098, "grad_norm": 8.2146577835083, "learning_rate": 2.6913717121648252e-08, "loss": 0.9594, "step": 9278 }, { "epoch": 1.5880540818072908, "grad_norm": 22.12969207763672, "learning_rate": 2.6228245391372586e-08, "loss": 1.2051, "step": 9279 }, { "epoch": 1.5882252267670718, "grad_norm": 9.634805679321289, "learning_rate": 2.5551608208737033e-08, "loss": 0.7638, "step": 9280 }, { "epoch": 1.5883963717268528, "grad_norm": 0.40069231390953064, "learning_rate": 2.488380597295392e-08, "loss": 0.1199, "step": 9281 }, { "epoch": 1.5885675166866335, "grad_norm": 14.116358757019043, "learning_rate": 2.4224839078031413e-08, "loss": 1.4278, "step": 9282 }, { "epoch": 1.5887386616464145, "grad_norm": 23.130455017089844, "learning_rate": 2.357470791275851e-08, "loss": 3.3008, "step": 9283 }, { "epoch": 1.5889098066061953, "grad_norm": 10.669268608093262, "learning_rate": 2.293341286071504e-08, "loss": 0.7639, "step": 9284 }, { "epoch": 1.5890809515659763, "grad_norm": 21.576173782348633, "learning_rate": 2.2300954300266686e-08, "loss": 2.5258, "step": 9285 }, { "epoch": 1.5892520965257573, "grad_norm": 10.414642333984375, "learning_rate": 2.1677332604563284e-08, "loss": 1.1728, "step": 9286 }, { "epoch": 
1.5894232414855383, "grad_norm": 15.18482780456543, "learning_rate": 2.1062548141542183e-08, "loss": 1.5507, "step": 9287 }, { "epoch": 1.5895943864453193, "grad_norm": 25.90163803100586, "learning_rate": 2.0456601273929897e-08, "loss": 5.4722, "step": 9288 }, { "epoch": 1.5897655314051002, "grad_norm": 18.69552230834961, "learning_rate": 1.9859492359233787e-08, "loss": 1.4434, "step": 9289 }, { "epoch": 1.589936676364881, "grad_norm": 0.4567941427230835, "learning_rate": 1.9271221749748714e-08, "loss": 0.1234, "step": 9290 }, { "epoch": 1.590107821324662, "grad_norm": 10.646660804748535, "learning_rate": 1.8691789792557034e-08, "loss": 0.8304, "step": 9291 }, { "epoch": 1.5902789662844428, "grad_norm": 12.730156898498535, "learning_rate": 1.8121196829523622e-08, "loss": 0.9264, "step": 9292 }, { "epoch": 1.5904501112442238, "grad_norm": 9.675321578979492, "learning_rate": 1.755944319729752e-08, "loss": 0.7315, "step": 9293 }, { "epoch": 1.5906212562040047, "grad_norm": 6.210916519165039, "learning_rate": 1.7006529227316935e-08, "loss": 0.5048, "step": 9294 }, { "epoch": 1.5907924011637857, "grad_norm": 11.655627250671387, "learning_rate": 1.6462455245800923e-08, "loss": 0.8819, "step": 9295 }, { "epoch": 1.5909635461235667, "grad_norm": 14.97899055480957, "learning_rate": 1.5927221573752704e-08, "loss": 1.8224, "step": 9296 }, { "epoch": 1.5911346910833477, "grad_norm": 1.2307006120681763, "learning_rate": 1.5400828526963008e-08, "loss": 0.1449, "step": 9297 }, { "epoch": 1.5913058360431287, "grad_norm": 0.38661330938339233, "learning_rate": 1.4883276416001735e-08, "loss": 0.1233, "step": 9298 }, { "epoch": 1.5914769810029095, "grad_norm": 14.279571533203125, "learning_rate": 1.4374565546227958e-08, "loss": 1.0882, "step": 9299 }, { "epoch": 1.5916481259626905, "grad_norm": 12.971138000488281, "learning_rate": 1.387469621778159e-08, "loss": 1.0428, "step": 9300 }, { "epoch": 1.5918192709224712, "grad_norm": 11.934720039367676, "learning_rate": 
1.3383668725585052e-08, "loss": 0.9899, "step": 9301 }, { "epoch": 1.5919904158822522, "grad_norm": 5.005281925201416, "learning_rate": 1.29014833593466e-08, "loss": 0.3784, "step": 9302 }, { "epoch": 1.5921615608420332, "grad_norm": 9.84570026397705, "learning_rate": 1.2428140403555332e-08, "loss": 1.1646, "step": 9303 }, { "epoch": 1.5923327058018142, "grad_norm": 16.857385635375977, "learning_rate": 1.1963640137484521e-08, "loss": 1.7743, "step": 9304 }, { "epoch": 1.5925038507615952, "grad_norm": 9.964764595031738, "learning_rate": 1.1507982835189945e-08, "loss": 0.7526, "step": 9305 }, { "epoch": 1.5926749957213762, "grad_norm": 15.45128059387207, "learning_rate": 1.1061168765509889e-08, "loss": 1.6177, "step": 9306 }, { "epoch": 1.592846140681157, "grad_norm": 0.4428521394729614, "learning_rate": 1.0623198192066807e-08, "loss": 0.1195, "step": 9307 }, { "epoch": 1.593017285640938, "grad_norm": 2.8189878463745117, "learning_rate": 1.0194071373262338e-08, "loss": 0.2584, "step": 9308 }, { "epoch": 1.5931884306007187, "grad_norm": 13.739767074584961, "learning_rate": 9.773788562282282e-09, "loss": 1.213, "step": 9309 }, { "epoch": 1.5933595755604997, "grad_norm": 13.034765243530273, "learning_rate": 9.36235000709329e-09, "loss": 1.1833, "step": 9310 }, { "epoch": 1.5935307205202807, "grad_norm": 21.801532745361328, "learning_rate": 8.959755950447845e-09, "loss": 5.0516, "step": 9311 }, { "epoch": 1.5937018654800617, "grad_norm": 16.090665817260742, "learning_rate": 8.566006629874279e-09, "loss": 1.3722, "step": 9312 }, { "epoch": 1.5938730104398426, "grad_norm": 10.490473747253418, "learning_rate": 8.181102277685092e-09, "loss": 0.9025, "step": 9313 }, { "epoch": 1.5940441553996236, "grad_norm": 1.341450572013855, "learning_rate": 7.805043120975297e-09, "loss": 0.2095, "step": 9314 }, { "epoch": 1.5942153003594044, "grad_norm": 20.739734649658203, "learning_rate": 7.437829381620742e-09, "loss": 1.717, "step": 9315 }, { "epoch": 1.5943864453191854, "grad_norm": 
7.795355796813965, "learning_rate": 7.079461276278121e-09, "loss": 0.8523, "step": 9316 }, { "epoch": 1.5945575902789662, "grad_norm": 21.838714599609375, "learning_rate": 6.729939016384967e-09, "loss": 2.6063, "step": 9317 }, { "epoch": 1.5947287352387471, "grad_norm": 14.311563491821289, "learning_rate": 6.3892628081579914e-09, "loss": 0.9879, "step": 9318 }, { "epoch": 1.5948998801985281, "grad_norm": 13.169885635375977, "learning_rate": 6.0574328525997426e-09, "loss": 0.9655, "step": 9319 }, { "epoch": 1.5950710251583091, "grad_norm": 5.078220367431641, "learning_rate": 5.734449345488613e-09, "loss": 0.4992, "step": 9320 }, { "epoch": 1.59524217011809, "grad_norm": 10.047645568847656, "learning_rate": 5.420312477385503e-09, "loss": 0.8585, "step": 9321 }, { "epoch": 1.595413315077871, "grad_norm": 14.421669006347656, "learning_rate": 5.115022433632155e-09, "loss": 1.282, "step": 9322 }, { "epoch": 1.5955844600376519, "grad_norm": 0.4456331431865692, "learning_rate": 4.818579394349487e-09, "loss": 0.1258, "step": 9323 }, { "epoch": 1.5957556049974329, "grad_norm": 5.493954181671143, "learning_rate": 4.5309835344409245e-09, "loss": 0.4141, "step": 9324 }, { "epoch": 1.5959267499572136, "grad_norm": 5.5783538818359375, "learning_rate": 4.252235023585738e-09, "loss": 0.3693, "step": 9325 }, { "epoch": 1.5960978949169946, "grad_norm": 6.578329563140869, "learning_rate": 3.982334026247369e-09, "loss": 0.624, "step": 9326 }, { "epoch": 1.5962690398767756, "grad_norm": 15.530854225158691, "learning_rate": 3.721280701668439e-09, "loss": 0.9645, "step": 9327 }, { "epoch": 1.5964401848365566, "grad_norm": 9.04145336151123, "learning_rate": 3.469075203870742e-09, "loss": 1.0568, "step": 9328 }, { "epoch": 1.5966113297963376, "grad_norm": 15.799047470092773, "learning_rate": 3.225717681655249e-09, "loss": 1.6844, "step": 9329 }, { "epoch": 1.5967824747561186, "grad_norm": 7.691936492919922, "learning_rate": 2.9912082786037744e-09, "loss": 0.7141, "step": 9330 }, { "epoch": 
1.5969536197158993, "grad_norm": 24.61012840270996, "learning_rate": 2.765547133075641e-09, "loss": 5.0889, "step": 9331 }, { "epoch": 1.5971247646756803, "grad_norm": 12.034052848815918, "learning_rate": 2.5487343782126805e-09, "loss": 0.9847, "step": 9332 }, { "epoch": 1.597295909635461, "grad_norm": 17.825836181640625, "learning_rate": 2.3407701419358994e-09, "loss": 2.2122, "step": 9333 }, { "epoch": 1.597467054595242, "grad_norm": 10.553616523742676, "learning_rate": 2.1416545469421512e-09, "loss": 0.8736, "step": 9334 }, { "epoch": 1.597638199555023, "grad_norm": 14.769635200500488, "learning_rate": 1.951387710710795e-09, "loss": 1.1301, "step": 9335 }, { "epoch": 1.597809344514804, "grad_norm": 12.116341590881348, "learning_rate": 1.7699697455003661e-09, "loss": 0.9533, "step": 9336 }, { "epoch": 1.597980489474585, "grad_norm": 23.96532440185547, "learning_rate": 1.5974007583452466e-09, "loss": 2.8948, "step": 9337 }, { "epoch": 1.598151634434366, "grad_norm": 38.89512634277344, "learning_rate": 1.4336808510656551e-09, "loss": 5.5746, "step": 9338 }, { "epoch": 1.5983227793941468, "grad_norm": 9.253819465637207, "learning_rate": 1.2788101202526602e-09, "loss": 0.857, "step": 9339 }, { "epoch": 1.5984939243539278, "grad_norm": 16.9677791595459, "learning_rate": 1.1327886572815028e-09, "loss": 1.2746, "step": 9340 }, { "epoch": 1.5986650693137086, "grad_norm": 2.785156488418579, "learning_rate": 9.956165483049339e-10, "loss": 0.2266, "step": 9341 }, { "epoch": 1.5988362142734895, "grad_norm": 10.453171730041504, "learning_rate": 8.672938742565472e-10, "loss": 0.8191, "step": 9342 }, { "epoch": 1.5990073592332705, "grad_norm": 5.825189113616943, "learning_rate": 7.47820710844116e-10, "loss": 0.4584, "step": 9343 }, { "epoch": 1.5991785041930515, "grad_norm": 11.611711502075195, "learning_rate": 6.371971285595857e-10, "loss": 1.0154, "step": 9344 }, { "epoch": 1.5993496491528325, "grad_norm": 8.692205429077148, "learning_rate": 5.354231926690822e-10, "loss": 
0.6069, "step": 9345 }, { "epoch": 1.5995207941126135, "grad_norm": 13.467569351196289, "learning_rate": 4.4249896321957305e-10, "loss": 0.9317, "step": 9346 }, { "epoch": 1.5996919390723943, "grad_norm": 8.742585182189941, "learning_rate": 3.584244950372018e-10, "loss": 0.6633, "step": 9347 }, { "epoch": 1.5998630840321753, "grad_norm": 14.224180221557617, "learning_rate": 2.8319983772562283e-10, "loss": 0.9377, "step": 9348 }, { "epoch": 1.6000342289919562, "grad_norm": 10.616045951843262, "learning_rate": 2.168250356676671e-10, "loss": 0.7447, "step": 9349 }, { "epoch": 1.600205373951737, "grad_norm": 11.182915687561035, "learning_rate": 1.593001280236761e-10, "loss": 1.0107, "step": 9350 }, { "epoch": 1.600376518911518, "grad_norm": 15.891812324523926, "learning_rate": 1.106251487348331e-10, "loss": 1.311, "step": 9351 }, { "epoch": 1.600547663871299, "grad_norm": 14.399053573608398, "learning_rate": 7.080012651983214e-11, "loss": 1.3368, "step": 9352 }, { "epoch": 1.60071880883108, "grad_norm": 19.445974349975586, "learning_rate": 3.98250848732129e-11, "loss": 2.568, "step": 9353 }, { "epoch": 1.600889953790861, "grad_norm": 10.244200706481934, "learning_rate": 1.770004207368725e-11, "loss": 0.6786, "step": 9354 }, { "epoch": 1.601061098750642, "grad_norm": 12.236209869384766, "learning_rate": 4.4250111708166175e-12, "loss": 0.9355, "step": 9355 }, { "epoch": 1.6012322437104227, "grad_norm": 14.863571166992188, "learning_rate": 3e-05, "loss": 1.2279, "step": 9356 }, { "epoch": 1.6014033886702037, "grad_norm": 23.10441780090332, "learning_rate": 2.999999557498883e-05, "loss": 2.8826, "step": 9357 }, { "epoch": 1.6015745336299845, "grad_norm": 10.672457695007324, "learning_rate": 2.999998229995793e-05, "loss": 1.0313, "step": 9358 }, { "epoch": 1.6017456785897655, "grad_norm": 19.782087326049805, "learning_rate": 2.9999960174915127e-05, "loss": 2.2322, "step": 9359 }, { "epoch": 1.6019168235495465, "grad_norm": 13.38315200805664, "learning_rate": 
2.999992919987348e-05, "loss": 1.2971, "step": 9360 }, { "epoch": 1.6020879685093274, "grad_norm": 0.5265617966651917, "learning_rate": 2.9999889374851267e-05, "loss": 0.1317, "step": 9361 }, { "epoch": 1.6022591134691084, "grad_norm": 24.73682975769043, "learning_rate": 2.999984069987198e-05, "loss": 1.6627, "step": 9362 }, { "epoch": 1.6024302584288894, "grad_norm": 15.00418758392334, "learning_rate": 2.9999783174964336e-05, "loss": 1.4611, "step": 9363 }, { "epoch": 1.6026014033886702, "grad_norm": 12.409354209899902, "learning_rate": 2.9999716800162275e-05, "loss": 0.8284, "step": 9364 }, { "epoch": 1.6027725483484512, "grad_norm": 17.951671600341797, "learning_rate": 2.9999641575504964e-05, "loss": 1.3046, "step": 9365 }, { "epoch": 1.602943693308232, "grad_norm": 18.27266502380371, "learning_rate": 2.9999557501036782e-05, "loss": 2.0359, "step": 9366 }, { "epoch": 1.603114838268013, "grad_norm": 0.45058807730674744, "learning_rate": 2.999946457680733e-05, "loss": 0.1273, "step": 9367 }, { "epoch": 1.603285983227794, "grad_norm": 17.236331939697266, "learning_rate": 2.999936280287144e-05, "loss": 1.386, "step": 9368 }, { "epoch": 1.603457128187575, "grad_norm": 10.346175193786621, "learning_rate": 2.9999252179289158e-05, "loss": 0.795, "step": 9369 }, { "epoch": 1.603628273147356, "grad_norm": 59.070220947265625, "learning_rate": 2.9999132706125743e-05, "loss": 7.2703, "step": 9370 }, { "epoch": 1.6037994181071369, "grad_norm": 12.031073570251465, "learning_rate": 2.9999004383451696e-05, "loss": 0.8509, "step": 9371 }, { "epoch": 1.6039705630669177, "grad_norm": 12.239237785339355, "learning_rate": 2.9998867211342718e-05, "loss": 0.9575, "step": 9372 }, { "epoch": 1.6041417080266986, "grad_norm": 14.176753044128418, "learning_rate": 2.999872118987975e-05, "loss": 1.2423, "step": 9373 }, { "epoch": 1.6043128529864794, "grad_norm": 11.396978378295898, "learning_rate": 2.9998566319148938e-05, "loss": 0.9956, "step": 9374 }, { "epoch": 1.6044839979462604, 
"grad_norm": 16.875986099243164, "learning_rate": 2.9998402599241654e-05, "loss": 1.4363, "step": 9375 }, { "epoch": 1.6046551429060414, "grad_norm": 12.127874374389648, "learning_rate": 2.99982300302545e-05, "loss": 0.7974, "step": 9376 }, { "epoch": 1.6048262878658224, "grad_norm": 16.08709144592285, "learning_rate": 2.9998048612289287e-05, "loss": 1.5343, "step": 9377 }, { "epoch": 1.6049974328256034, "grad_norm": 20.294261932373047, "learning_rate": 2.9997858345453058e-05, "loss": 1.4582, "step": 9378 }, { "epoch": 1.6051685777853844, "grad_norm": 12.85136890411377, "learning_rate": 2.9997659229858064e-05, "loss": 0.9081, "step": 9379 }, { "epoch": 1.6053397227451651, "grad_norm": 16.263931274414062, "learning_rate": 2.999745126562179e-05, "loss": 1.3727, "step": 9380 }, { "epoch": 1.605510867704946, "grad_norm": 4.886488914489746, "learning_rate": 2.9997234452866925e-05, "loss": 0.3255, "step": 9381 }, { "epoch": 1.6056820126647269, "grad_norm": 2.572378635406494, "learning_rate": 2.9997008791721397e-05, "loss": 0.2475, "step": 9382 }, { "epoch": 1.6058531576245079, "grad_norm": 3.0046651363372803, "learning_rate": 2.9996774282318344e-05, "loss": 0.2655, "step": 9383 }, { "epoch": 1.6060243025842889, "grad_norm": 0.8560636639595032, "learning_rate": 2.9996530924796127e-05, "loss": 0.1441, "step": 9384 }, { "epoch": 1.6061954475440698, "grad_norm": 21.85727882385254, "learning_rate": 2.999627871929833e-05, "loss": 1.0124, "step": 9385 }, { "epoch": 1.6063665925038508, "grad_norm": 0.6194969415664673, "learning_rate": 2.999601766597375e-05, "loss": 0.1328, "step": 9386 }, { "epoch": 1.6065377374636318, "grad_norm": 15.175031661987305, "learning_rate": 2.9995747764976414e-05, "loss": 1.221, "step": 9387 }, { "epoch": 1.6067088824234126, "grad_norm": 3.248007297515869, "learning_rate": 2.999546901646556e-05, "loss": 0.2861, "step": 9388 }, { "epoch": 1.6068800273831936, "grad_norm": 15.640496253967285, "learning_rate": 2.9995181420605653e-05, "loss": 1.4736, 
"step": 9389 }, { "epoch": 1.6070511723429743, "grad_norm": 11.312459945678711, "learning_rate": 2.9994884977566372e-05, "loss": 0.8784, "step": 9390 }, { "epoch": 1.6072223173027553, "grad_norm": 16.614965438842773, "learning_rate": 2.9994579687522615e-05, "loss": 1.8827, "step": 9391 }, { "epoch": 1.6073934622625363, "grad_norm": 3.8147289752960205, "learning_rate": 2.9994265550654512e-05, "loss": 0.2525, "step": 9392 }, { "epoch": 1.6075646072223173, "grad_norm": 9.869797706604004, "learning_rate": 2.9993942567147402e-05, "loss": 0.8839, "step": 9393 }, { "epoch": 1.6077357521820983, "grad_norm": 13.11780834197998, "learning_rate": 2.999361073719184e-05, "loss": 1.0038, "step": 9394 }, { "epoch": 1.6079068971418793, "grad_norm": 10.546785354614258, "learning_rate": 2.999327006098362e-05, "loss": 0.6596, "step": 9395 }, { "epoch": 1.60807804210166, "grad_norm": 2.814810276031494, "learning_rate": 2.9992920538723722e-05, "loss": 0.2285, "step": 9396 }, { "epoch": 1.608249187061441, "grad_norm": 12.349743843078613, "learning_rate": 2.999256217061838e-05, "loss": 1.0061, "step": 9397 }, { "epoch": 1.6084203320212218, "grad_norm": 11.801037788391113, "learning_rate": 2.9992194956879027e-05, "loss": 0.9087, "step": 9398 }, { "epoch": 1.6085914769810028, "grad_norm": 19.1529483795166, "learning_rate": 2.9991818897722315e-05, "loss": 1.5131, "step": 9399 }, { "epoch": 1.6087626219407838, "grad_norm": 21.248210906982422, "learning_rate": 2.9991433993370126e-05, "loss": 2.4489, "step": 9400 }, { "epoch": 1.6089337669005648, "grad_norm": 12.489494323730469, "learning_rate": 2.9991040244049556e-05, "loss": 0.9028, "step": 9401 }, { "epoch": 1.6091049118603458, "grad_norm": 18.785676956176758, "learning_rate": 2.9990637649992906e-05, "loss": 2.1393, "step": 9402 }, { "epoch": 1.6092760568201268, "grad_norm": 2.0328285694122314, "learning_rate": 2.9990226211437717e-05, "loss": 0.2505, "step": 9403 }, { "epoch": 1.6094472017799077, "grad_norm": 2.2887496948242188, 
"learning_rate": 2.9989805928626736e-05, "loss": 0.225, "step": 9404 }, { "epoch": 1.6096183467396885, "grad_norm": 15.27294635772705, "learning_rate": 2.9989376801807933e-05, "loss": 1.2037, "step": 9405 }, { "epoch": 1.6097894916994695, "grad_norm": 11.583244323730469, "learning_rate": 2.998893883123449e-05, "loss": 0.8366, "step": 9406 }, { "epoch": 1.6099606366592503, "grad_norm": 0.7704922556877136, "learning_rate": 2.9988492017164812e-05, "loss": 0.1335, "step": 9407 }, { "epoch": 1.6101317816190313, "grad_norm": 34.04677963256836, "learning_rate": 2.9988036359862517e-05, "loss": 5.5288, "step": 9408 }, { "epoch": 1.6103029265788122, "grad_norm": 71.19355010986328, "learning_rate": 2.9987571859596446e-05, "loss": 7.2655, "step": 9409 }, { "epoch": 1.6104740715385932, "grad_norm": 22.243663787841797, "learning_rate": 2.9987098516640656e-05, "loss": 0.8516, "step": 9410 }, { "epoch": 1.6106452164983742, "grad_norm": 14.752939224243164, "learning_rate": 2.9986616331274415e-05, "loss": 1.7008, "step": 9411 }, { "epoch": 1.6108163614581552, "grad_norm": 4.229423522949219, "learning_rate": 2.998612530378222e-05, "loss": 0.3025, "step": 9412 }, { "epoch": 1.610987506417936, "grad_norm": 98.59330749511719, "learning_rate": 2.9985625434453774e-05, "loss": 8.1478, "step": 9413 }, { "epoch": 1.611158651377717, "grad_norm": 18.716083526611328, "learning_rate": 2.9985116723584e-05, "loss": 2.026, "step": 9414 }, { "epoch": 1.6113297963374977, "grad_norm": 19.626657485961914, "learning_rate": 2.998459917147304e-05, "loss": 2.4199, "step": 9415 }, { "epoch": 1.6115009412972787, "grad_norm": 19.14718246459961, "learning_rate": 2.9984072778426246e-05, "loss": 2.025, "step": 9416 }, { "epoch": 1.6116720862570597, "grad_norm": 10.98017692565918, "learning_rate": 2.99835375447542e-05, "loss": 0.9052, "step": 9417 }, { "epoch": 1.6118432312168407, "grad_norm": 21.417722702026367, "learning_rate": 2.9982993470772684e-05, "loss": 2.8591, "step": 9418 }, { "epoch": 
1.6120143761766217, "grad_norm": 14.140348434448242, "learning_rate": 2.99824405568027e-05, "loss": 1.24, "step": 9419 }, { "epoch": 1.6121855211364027, "grad_norm": 7.0900421142578125, "learning_rate": 2.9981878803170476e-05, "loss": 0.6738, "step": 9420 }, { "epoch": 1.6123566660961834, "grad_norm": 3.92787504196167, "learning_rate": 2.9981308210207444e-05, "loss": 0.3278, "step": 9421 }, { "epoch": 1.6125278110559644, "grad_norm": 2.027553081512451, "learning_rate": 2.998072877825025e-05, "loss": 0.2385, "step": 9422 }, { "epoch": 1.6126989560157452, "grad_norm": 13.30890941619873, "learning_rate": 2.9980140507640764e-05, "loss": 1.2327, "step": 9423 }, { "epoch": 1.6128701009755262, "grad_norm": 16.716304779052734, "learning_rate": 2.9979543398726073e-05, "loss": 1.2795, "step": 9424 }, { "epoch": 1.6130412459353072, "grad_norm": 4.967143535614014, "learning_rate": 2.9978937451858457e-05, "loss": 0.5127, "step": 9425 }, { "epoch": 1.6132123908950882, "grad_norm": 6.9292497634887695, "learning_rate": 2.997832266739544e-05, "loss": 0.5281, "step": 9426 }, { "epoch": 1.6133835358548692, "grad_norm": 30.22280502319336, "learning_rate": 2.9977699045699735e-05, "loss": 5.7263, "step": 9427 }, { "epoch": 1.6135546808146501, "grad_norm": 20.34552001953125, "learning_rate": 2.9977066587139287e-05, "loss": 2.0274, "step": 9428 }, { "epoch": 1.613725825774431, "grad_norm": 3.483440637588501, "learning_rate": 2.9976425292087245e-05, "loss": 0.2381, "step": 9429 }, { "epoch": 1.613896970734212, "grad_norm": 7.574001789093018, "learning_rate": 2.9975775160921972e-05, "loss": 0.4572, "step": 9430 }, { "epoch": 1.6140681156939927, "grad_norm": 15.381829261779785, "learning_rate": 2.9975116194027046e-05, "loss": 1.2496, "step": 9431 }, { "epoch": 1.6142392606537737, "grad_norm": 11.829957962036133, "learning_rate": 2.9974448391791268e-05, "loss": 0.8571, "step": 9432 }, { "epoch": 1.6144104056135546, "grad_norm": 16.823575973510742, "learning_rate": 2.9973771754608627e-05, 
"loss": 1.1022, "step": 9433 }, { "epoch": 1.6145815505733356, "grad_norm": 13.790877342224121, "learning_rate": 2.9973086282878353e-05, "loss": 1.3232, "step": 9434 }, { "epoch": 1.6147526955331166, "grad_norm": 12.955852508544922, "learning_rate": 2.9972391977004867e-05, "loss": 1.0083, "step": 9435 }, { "epoch": 1.6149238404928976, "grad_norm": 12.367205619812012, "learning_rate": 2.9971688837397816e-05, "loss": 0.9619, "step": 9436 }, { "epoch": 1.6150949854526784, "grad_norm": 15.672718048095703, "learning_rate": 2.997097686447205e-05, "loss": 1.2763, "step": 9437 }, { "epoch": 1.6152661304124594, "grad_norm": 12.648813247680664, "learning_rate": 2.9970256058647636e-05, "loss": 0.9131, "step": 9438 }, { "epoch": 1.6154372753722401, "grad_norm": 1.0086512565612793, "learning_rate": 2.996952642034985e-05, "loss": 0.2108, "step": 9439 }, { "epoch": 1.6156084203320211, "grad_norm": 10.792105674743652, "learning_rate": 2.996878795000918e-05, "loss": 1.0374, "step": 9440 }, { "epoch": 1.6157795652918021, "grad_norm": 22.78766441345215, "learning_rate": 2.9968040648061324e-05, "loss": 1.5161, "step": 9441 }, { "epoch": 1.615950710251583, "grad_norm": 21.82635498046875, "learning_rate": 2.9967284514947192e-05, "loss": 1.6153, "step": 9442 }, { "epoch": 1.616121855211364, "grad_norm": 12.727463722229004, "learning_rate": 2.9966519551112904e-05, "loss": 0.9033, "step": 9443 }, { "epoch": 1.616293000171145, "grad_norm": 12.504409790039062, "learning_rate": 2.9965745757009784e-05, "loss": 0.9684, "step": 9444 }, { "epoch": 1.6164641451309258, "grad_norm": 20.437881469726562, "learning_rate": 2.996496313309438e-05, "loss": 2.2406, "step": 9445 }, { "epoch": 1.6166352900907068, "grad_norm": 20.432300567626953, "learning_rate": 2.9964171679828438e-05, "loss": 0.8893, "step": 9446 }, { "epoch": 1.6168064350504876, "grad_norm": 19.25128936767578, "learning_rate": 2.9963371397678912e-05, "loss": 1.6807, "step": 9447 }, { "epoch": 1.6169775800102686, "grad_norm": 
12.064483642578125, "learning_rate": 2.9962562287117978e-05, "loss": 0.8975, "step": 9448 }, { "epoch": 1.6171487249700496, "grad_norm": 8.825759887695312, "learning_rate": 2.996174434862301e-05, "loss": 0.7174, "step": 9449 }, { "epoch": 1.6173198699298306, "grad_norm": 8.955503463745117, "learning_rate": 2.996091758267659e-05, "loss": 1.2239, "step": 9450 }, { "epoch": 1.6174910148896116, "grad_norm": 22.753482818603516, "learning_rate": 2.996008198976651e-05, "loss": 2.302, "step": 9451 }, { "epoch": 1.6176621598493925, "grad_norm": 4.034172058105469, "learning_rate": 2.9959237570385777e-05, "loss": 0.3601, "step": 9452 }, { "epoch": 1.6178333048091735, "grad_norm": 17.076047897338867, "learning_rate": 2.995838432503259e-05, "loss": 2.1736, "step": 9453 }, { "epoch": 1.6180044497689543, "grad_norm": 13.09814167022705, "learning_rate": 2.995752225421038e-05, "loss": 0.8286, "step": 9454 }, { "epoch": 1.6181755947287353, "grad_norm": 0.6300417184829712, "learning_rate": 2.9956651358427758e-05, "loss": 0.1278, "step": 9455 }, { "epoch": 1.618346739688516, "grad_norm": 8.995607376098633, "learning_rate": 2.9955771638198556e-05, "loss": 0.8246, "step": 9456 }, { "epoch": 1.618517884648297, "grad_norm": 18.096311569213867, "learning_rate": 2.9954883094041813e-05, "loss": 1.1985, "step": 9457 }, { "epoch": 1.618689029608078, "grad_norm": 5.634091854095459, "learning_rate": 2.9953985726481765e-05, "loss": 0.4309, "step": 9458 }, { "epoch": 1.618860174567859, "grad_norm": 29.755264282226562, "learning_rate": 2.995307953604787e-05, "loss": 5.5881, "step": 9459 }, { "epoch": 1.61903131952764, "grad_norm": 21.78309440612793, "learning_rate": 2.9952164523274775e-05, "loss": 2.1728, "step": 9460 }, { "epoch": 1.619202464487421, "grad_norm": 13.916932106018066, "learning_rate": 2.9951240688702342e-05, "loss": 1.2166, "step": 9461 }, { "epoch": 1.6193736094472018, "grad_norm": 17.53584861755371, "learning_rate": 2.9950308032875634e-05, "loss": 1.3813, "step": 9462 }, { "epoch": 
1.6195447544069828, "grad_norm": 12.141541481018066, "learning_rate": 2.9949366556344916e-05, "loss": 0.8495, "step": 9463 }, { "epoch": 1.6197158993667635, "grad_norm": 14.366056442260742, "learning_rate": 2.9948416259665665e-05, "loss": 0.972, "step": 9464 }, { "epoch": 1.6198870443265445, "grad_norm": 8.276599884033203, "learning_rate": 2.9947457143398554e-05, "loss": 0.6406, "step": 9465 }, { "epoch": 1.6200581892863255, "grad_norm": 14.827068328857422, "learning_rate": 2.9946489208109468e-05, "loss": 1.1714, "step": 9466 }, { "epoch": 1.6202293342461065, "grad_norm": 11.037328720092773, "learning_rate": 2.9945512454369485e-05, "loss": 0.7373, "step": 9467 }, { "epoch": 1.6204004792058875, "grad_norm": 13.822649955749512, "learning_rate": 2.9944526882754894e-05, "loss": 1.1724, "step": 9468 }, { "epoch": 1.6205716241656685, "grad_norm": 0.671640157699585, "learning_rate": 2.994353249384718e-05, "loss": 0.1286, "step": 9469 }, { "epoch": 1.6207427691254492, "grad_norm": 0.4439658522605896, "learning_rate": 2.994252928823304e-05, "loss": 0.1223, "step": 9470 }, { "epoch": 1.6209139140852302, "grad_norm": 15.52882194519043, "learning_rate": 2.9941517266504363e-05, "loss": 1.1754, "step": 9471 }, { "epoch": 1.621085059045011, "grad_norm": 11.884194374084473, "learning_rate": 2.994049642925824e-05, "loss": 0.9013, "step": 9472 }, { "epoch": 1.621256204004792, "grad_norm": 9.321181297302246, "learning_rate": 2.9939466777096975e-05, "loss": 0.6995, "step": 9473 }, { "epoch": 1.621427348964573, "grad_norm": 3.167691707611084, "learning_rate": 2.9938428310628057e-05, "loss": 0.2526, "step": 9474 }, { "epoch": 1.621598493924354, "grad_norm": 9.762499809265137, "learning_rate": 2.993738103046419e-05, "loss": 0.7329, "step": 9475 }, { "epoch": 1.621769638884135, "grad_norm": 16.59564781188965, "learning_rate": 2.9936324937223263e-05, "loss": 1.1964, "step": 9476 }, { "epoch": 1.621940783843916, "grad_norm": 11.760416984558105, "learning_rate": 2.9935260031528377e-05, 
"loss": 0.9212, "step": 9477 }, { "epoch": 1.6221119288036967, "grad_norm": 19.692575454711914, "learning_rate": 2.993418631400783e-05, "loss": 1.6552, "step": 9478 }, { "epoch": 1.6222830737634777, "grad_norm": 0.4915126860141754, "learning_rate": 2.993310378529511e-05, "loss": 0.1278, "step": 9479 }, { "epoch": 1.6224542187232585, "grad_norm": 20.402124404907227, "learning_rate": 2.9932012446028916e-05, "loss": 2.3188, "step": 9480 }, { "epoch": 1.6226253636830394, "grad_norm": 19.932315826416016, "learning_rate": 2.9930912296853136e-05, "loss": 0.8412, "step": 9481 }, { "epoch": 1.6227965086428204, "grad_norm": 10.41826057434082, "learning_rate": 2.9929803338416863e-05, "loss": 0.9179, "step": 9482 }, { "epoch": 1.6229676536026014, "grad_norm": 19.421825408935547, "learning_rate": 2.992868557137438e-05, "loss": 1.5672, "step": 9483 }, { "epoch": 1.6231387985623824, "grad_norm": 3.735640525817871, "learning_rate": 2.9927558996385178e-05, "loss": 0.344, "step": 9484 }, { "epoch": 1.6233099435221634, "grad_norm": 22.59716796875, "learning_rate": 2.9926423614113935e-05, "loss": 3.2795, "step": 9485 }, { "epoch": 1.6234810884819442, "grad_norm": 15.149606704711914, "learning_rate": 2.9925279425230525e-05, "loss": 1.9589, "step": 9486 }, { "epoch": 1.6236522334417252, "grad_norm": 11.310556411743164, "learning_rate": 2.9924126430410022e-05, "loss": 0.775, "step": 9487 }, { "epoch": 1.623823378401506, "grad_norm": 17.477874755859375, "learning_rate": 2.99229646303327e-05, "loss": 1.1952, "step": 9488 }, { "epoch": 1.623994523361287, "grad_norm": 14.588419914245605, "learning_rate": 2.992179402568402e-05, "loss": 1.097, "step": 9489 }, { "epoch": 1.624165668321068, "grad_norm": 19.209442138671875, "learning_rate": 2.9920614617154634e-05, "loss": 1.6344, "step": 9490 }, { "epoch": 1.624336813280849, "grad_norm": 13.452393531799316, "learning_rate": 2.9919426405440406e-05, "loss": 0.9059, "step": 9491 }, { "epoch": 1.6245079582406299, "grad_norm": 15.676600456237793, 
"learning_rate": 2.991822939124237e-05, "loss": 1.4546, "step": 9492 }, { "epoch": 1.6246791032004109, "grad_norm": 17.645156860351562, "learning_rate": 2.991702357526678e-05, "loss": 1.5075, "step": 9493 }, { "epoch": 1.6248502481601916, "grad_norm": 13.913947105407715, "learning_rate": 2.9915808958225057e-05, "loss": 1.0438, "step": 9494 }, { "epoch": 1.6250213931199726, "grad_norm": 16.458505630493164, "learning_rate": 2.9914585540833836e-05, "loss": 1.4372, "step": 9495 }, { "epoch": 1.6251925380797534, "grad_norm": 9.064806938171387, "learning_rate": 2.9913353323814928e-05, "loss": 0.6707, "step": 9496 }, { "epoch": 1.6253636830395344, "grad_norm": 10.729021072387695, "learning_rate": 2.9912112307895352e-05, "loss": 0.8591, "step": 9497 }, { "epoch": 1.6255348279993154, "grad_norm": 27.599136352539062, "learning_rate": 2.9910862493807297e-05, "loss": 5.5648, "step": 9498 }, { "epoch": 1.6257059729590964, "grad_norm": 19.244413375854492, "learning_rate": 2.9909603882288167e-05, "loss": 2.653, "step": 9499 }, { "epoch": 1.6258771179188773, "grad_norm": 10.857229232788086, "learning_rate": 2.9908336474080534e-05, "loss": 0.9722, "step": 9500 }, { "epoch": 1.6260482628786583, "grad_norm": 8.702522277832031, "learning_rate": 2.9907060269932176e-05, "loss": 1.0441, "step": 9501 }, { "epoch": 1.6262194078384393, "grad_norm": 16.79013442993164, "learning_rate": 2.9905775270596058e-05, "loss": 1.8106, "step": 9502 }, { "epoch": 1.62639055279822, "grad_norm": 21.37388038635254, "learning_rate": 2.990448147683033e-05, "loss": 1.5079, "step": 9503 }, { "epoch": 1.626561697758001, "grad_norm": 18.78592872619629, "learning_rate": 2.9903178889398325e-05, "loss": 1.5252, "step": 9504 }, { "epoch": 1.6267328427177818, "grad_norm": 10.561485290527344, "learning_rate": 2.9901867509068586e-05, "loss": 0.8131, "step": 9505 }, { "epoch": 1.6269039876775628, "grad_norm": 0.7188830971717834, "learning_rate": 2.9900547336614815e-05, "loss": 0.1344, "step": 9506 }, { "epoch": 
1.6270751326373438, "grad_norm": 12.477583885192871, "learning_rate": 2.9899218372815923e-05, "loss": 1.0391, "step": 9507 }, { "epoch": 1.6272462775971248, "grad_norm": 17.953500747680664, "learning_rate": 2.9897880618455996e-05, "loss": 1.421, "step": 9508 }, { "epoch": 1.6274174225569058, "grad_norm": 16.35556411743164, "learning_rate": 2.989653407432432e-05, "loss": 1.0877, "step": 9509 }, { "epoch": 1.6275885675166868, "grad_norm": 1.3530229330062866, "learning_rate": 2.989517874121535e-05, "loss": 0.2251, "step": 9510 }, { "epoch": 1.6277597124764676, "grad_norm": 9.559070587158203, "learning_rate": 2.9893814619928737e-05, "loss": 0.8739, "step": 9511 }, { "epoch": 1.6279308574362485, "grad_norm": 14.72940444946289, "learning_rate": 2.9892441711269315e-05, "loss": 1.2438, "step": 9512 }, { "epoch": 1.6281020023960293, "grad_norm": 18.61958122253418, "learning_rate": 2.9891060016047097e-05, "loss": 2.1884, "step": 9513 }, { "epoch": 1.6282731473558103, "grad_norm": 9.404314041137695, "learning_rate": 2.9889669535077297e-05, "loss": 0.5938, "step": 9514 }, { "epoch": 1.6284442923155913, "grad_norm": 16.75083351135254, "learning_rate": 2.988827026918029e-05, "loss": 1.2545, "step": 9515 }, { "epoch": 1.6286154372753723, "grad_norm": 16.408796310424805, "learning_rate": 2.9886862219181653e-05, "loss": 1.4055, "step": 9516 }, { "epoch": 1.6287865822351533, "grad_norm": 0.525993287563324, "learning_rate": 2.9885445385912124e-05, "loss": 0.1295, "step": 9517 }, { "epoch": 1.6289577271949343, "grad_norm": 11.990853309631348, "learning_rate": 2.9884019770207654e-05, "loss": 0.8407, "step": 9518 }, { "epoch": 1.629128872154715, "grad_norm": 2.095593214035034, "learning_rate": 2.9882585372909345e-05, "loss": 0.2506, "step": 9519 }, { "epoch": 1.629300017114496, "grad_norm": 14.216595649719238, "learning_rate": 2.9881142194863503e-05, "loss": 1.3091, "step": 9520 }, { "epoch": 1.6294711620742768, "grad_norm": 0.7387760281562805, "learning_rate": 2.98796902369216e-05, 
"loss": 0.1371, "step": 9521 }, { "epoch": 1.6296423070340578, "grad_norm": 37.17905807495117, "learning_rate": 2.9878229499940296e-05, "loss": 5.5673, "step": 9522 }, { "epoch": 1.6298134519938388, "grad_norm": 19.5545711517334, "learning_rate": 2.987675998478143e-05, "loss": 1.7863, "step": 9523 }, { "epoch": 1.6299845969536197, "grad_norm": 11.548747062683105, "learning_rate": 2.9875281692312005e-05, "loss": 0.8135, "step": 9524 }, { "epoch": 1.6301557419134007, "grad_norm": 13.296089172363281, "learning_rate": 2.987379462340423e-05, "loss": 1.1273, "step": 9525 }, { "epoch": 1.6303268868731817, "grad_norm": 19.24650764465332, "learning_rate": 2.9872298778935472e-05, "loss": 2.5649, "step": 9526 }, { "epoch": 1.6304980318329625, "grad_norm": 5.203998565673828, "learning_rate": 2.9870794159788285e-05, "loss": 0.4955, "step": 9527 }, { "epoch": 1.6306691767927435, "grad_norm": 11.983332633972168, "learning_rate": 2.9869280766850397e-05, "loss": 1.0445, "step": 9528 }, { "epoch": 1.6308403217525242, "grad_norm": 5.978402614593506, "learning_rate": 2.986775860101471e-05, "loss": 0.3701, "step": 9529 }, { "epoch": 1.6310114667123052, "grad_norm": 9.32921028137207, "learning_rate": 2.98662276631793e-05, "loss": 0.7305, "step": 9530 }, { "epoch": 1.6311826116720862, "grad_norm": 24.619340896606445, "learning_rate": 2.986468795424742e-05, "loss": 3.2465, "step": 9531 }, { "epoch": 1.6313537566318672, "grad_norm": 17.229135513305664, "learning_rate": 2.9863139475127515e-05, "loss": 1.3281, "step": 9532 }, { "epoch": 1.6315249015916482, "grad_norm": 5.460208415985107, "learning_rate": 2.9861582226733176e-05, "loss": 0.6157, "step": 9533 }, { "epoch": 1.6316960465514292, "grad_norm": 17.93170738220215, "learning_rate": 2.986001620998319e-05, "loss": 1.5872, "step": 9534 }, { "epoch": 1.63186719151121, "grad_norm": 2.8334455490112305, "learning_rate": 2.985844142580151e-05, "loss": 0.2335, "step": 9535 }, { "epoch": 1.632038336470991, "grad_norm": 16.463397979736328, 
"learning_rate": 2.9856857875117254e-05, "loss": 1.3999, "step": 9536 }, { "epoch": 1.6322094814307717, "grad_norm": 14.94178295135498, "learning_rate": 2.985526555886472e-05, "loss": 0.7161, "step": 9537 }, { "epoch": 1.6323806263905527, "grad_norm": 9.554885864257812, "learning_rate": 2.9853664477983386e-05, "loss": 0.6392, "step": 9538 }, { "epoch": 1.6325517713503337, "grad_norm": 15.145529747009277, "learning_rate": 2.9852054633417885e-05, "loss": 1.3183, "step": 9539 }, { "epoch": 1.6327229163101147, "grad_norm": 12.747987747192383, "learning_rate": 2.985043602611803e-05, "loss": 1.5222, "step": 9540 }, { "epoch": 1.6328940612698957, "grad_norm": 12.055132865905762, "learning_rate": 2.9848808657038795e-05, "loss": 1.0101, "step": 9541 }, { "epoch": 1.6330652062296767, "grad_norm": 12.70936107635498, "learning_rate": 2.9847172527140338e-05, "loss": 0.5939, "step": 9542 }, { "epoch": 1.6332363511894574, "grad_norm": 12.676793098449707, "learning_rate": 2.9845527637387974e-05, "loss": 0.9235, "step": 9543 }, { "epoch": 1.6334074961492384, "grad_norm": 3.724648952484131, "learning_rate": 2.9843873988752195e-05, "loss": 0.3046, "step": 9544 }, { "epoch": 1.6335786411090192, "grad_norm": 15.425995826721191, "learning_rate": 2.9842211582208656e-05, "loss": 1.2122, "step": 9545 }, { "epoch": 1.6337497860688002, "grad_norm": 9.182741165161133, "learning_rate": 2.984054041873817e-05, "loss": 0.6702, "step": 9546 }, { "epoch": 1.6339209310285812, "grad_norm": 17.435718536376953, "learning_rate": 2.983886049932674e-05, "loss": 1.6698, "step": 9547 }, { "epoch": 1.6340920759883621, "grad_norm": 3.9570345878601074, "learning_rate": 2.9837171824965506e-05, "loss": 0.3695, "step": 9548 }, { "epoch": 1.6342632209481431, "grad_norm": 11.785552978515625, "learning_rate": 2.9835474396650802e-05, "loss": 0.8587, "step": 9549 }, { "epoch": 1.6344343659079241, "grad_norm": 19.761457443237305, "learning_rate": 2.9833768215384105e-05, "loss": 2.1181, "step": 9550 }, { "epoch": 
1.634605510867705, "grad_norm": 23.16259002685547, "learning_rate": 2.9832053282172062e-05, "loss": 1.9273, "step": 9551 }, { "epoch": 1.6347766558274859, "grad_norm": 17.774866104125977, "learning_rate": 2.9830329598026498e-05, "loss": 1.4665, "step": 9552 }, { "epoch": 1.6349478007872669, "grad_norm": 11.788775444030762, "learning_rate": 2.982859716396438e-05, "loss": 0.7886, "step": 9553 }, { "epoch": 1.6351189457470476, "grad_norm": 18.363697052001953, "learning_rate": 2.9826855981007845e-05, "loss": 2.117, "step": 9554 }, { "epoch": 1.6352900907068286, "grad_norm": 9.547832489013672, "learning_rate": 2.98251060501842e-05, "loss": 0.6957, "step": 9555 }, { "epoch": 1.6354612356666096, "grad_norm": 3.216839551925659, "learning_rate": 2.9823347372525905e-05, "loss": 0.3288, "step": 9556 }, { "epoch": 1.6356323806263906, "grad_norm": 12.316461563110352, "learning_rate": 2.9821579949070577e-05, "loss": 0.7846, "step": 9557 }, { "epoch": 1.6358035255861716, "grad_norm": 0.7034462690353394, "learning_rate": 2.9819803780861006e-05, "loss": 0.1411, "step": 9558 }, { "epoch": 1.6359746705459526, "grad_norm": 7.870783805847168, "learning_rate": 2.9818018868945135e-05, "loss": 1.1442, "step": 9559 }, { "epoch": 1.6361458155057333, "grad_norm": 10.161672592163086, "learning_rate": 2.9816225214376052e-05, "loss": 0.6824, "step": 9560 }, { "epoch": 1.6363169604655143, "grad_norm": 9.250317573547363, "learning_rate": 2.9814422818212032e-05, "loss": 0.8015, "step": 9561 }, { "epoch": 1.636488105425295, "grad_norm": 17.515838623046875, "learning_rate": 2.9812611681516475e-05, "loss": 1.3975, "step": 9562 }, { "epoch": 1.636659250385076, "grad_norm": 0.5384781956672668, "learning_rate": 2.981079180535797e-05, "loss": 0.1353, "step": 9563 }, { "epoch": 1.636830395344857, "grad_norm": 9.282712936401367, "learning_rate": 2.980896319081024e-05, "loss": 0.7694, "step": 9564 }, { "epoch": 1.637001540304638, "grad_norm": 47.83257293701172, "learning_rate": 2.9807125838952168e-05, 
"loss": 6.69, "step": 9565 }, { "epoch": 1.637172685264419, "grad_norm": 11.236211776733398, "learning_rate": 2.9805279750867796e-05, "loss": 0.8595, "step": 9566 }, { "epoch": 1.6373438302242, "grad_norm": 13.43776798248291, "learning_rate": 2.980342492764632e-05, "loss": 1.0057, "step": 9567 }, { "epoch": 1.6375149751839808, "grad_norm": 20.628433227539062, "learning_rate": 2.980156137038209e-05, "loss": 1.64, "step": 9568 }, { "epoch": 1.6376861201437618, "grad_norm": 48.676055908203125, "learning_rate": 2.97996890801746e-05, "loss": 6.5109, "step": 9569 }, { "epoch": 1.6378572651035426, "grad_norm": 16.684858322143555, "learning_rate": 2.9797808058128513e-05, "loss": 1.9329, "step": 9570 }, { "epoch": 1.6380284100633236, "grad_norm": 0.5401609539985657, "learning_rate": 2.979591830535363e-05, "loss": 0.1209, "step": 9571 }, { "epoch": 1.6381995550231045, "grad_norm": 10.745735168457031, "learning_rate": 2.9794019822964908e-05, "loss": 0.9211, "step": 9572 }, { "epoch": 1.6383706999828855, "grad_norm": 9.435981750488281, "learning_rate": 2.9792112612082455e-05, "loss": 0.4922, "step": 9573 }, { "epoch": 1.6385418449426665, "grad_norm": 8.708836555480957, "learning_rate": 2.979019667383153e-05, "loss": 0.588, "step": 9574 }, { "epoch": 1.6387129899024475, "grad_norm": 16.03264617919922, "learning_rate": 2.9788272009342537e-05, "loss": 1.1887, "step": 9575 }, { "epoch": 1.6388841348622283, "grad_norm": 0.6870530843734741, "learning_rate": 2.978633861975103e-05, "loss": 0.1295, "step": 9576 }, { "epoch": 1.6390552798220093, "grad_norm": 73.51947021484375, "learning_rate": 2.978439650619772e-05, "loss": 7.3975, "step": 9577 }, { "epoch": 1.63922642478179, "grad_norm": 6.200953960418701, "learning_rate": 2.9782445669828445e-05, "loss": 0.6618, "step": 9578 }, { "epoch": 1.639397569741571, "grad_norm": 13.522269248962402, "learning_rate": 2.978048611179421e-05, "loss": 1.0582, "step": 9579 }, { "epoch": 1.639568714701352, "grad_norm": 8.464933395385742, 
"learning_rate": 2.977851783325115e-05, "loss": 1.2026, "step": 9580 }, { "epoch": 1.639739859661133, "grad_norm": 9.946455955505371, "learning_rate": 2.977654083536056e-05, "loss": 0.7277, "step": 9581 }, { "epoch": 1.639911004620914, "grad_norm": 12.292410850524902, "learning_rate": 2.9774555119288868e-05, "loss": 0.8695, "step": 9582 }, { "epoch": 1.640082149580695, "grad_norm": 16.406246185302734, "learning_rate": 2.9772560686207653e-05, "loss": 1.4411, "step": 9583 }, { "epoch": 1.6402532945404757, "grad_norm": 37.85560989379883, "learning_rate": 2.9770557537293624e-05, "loss": 5.9857, "step": 9584 }, { "epoch": 1.6404244395002567, "grad_norm": 7.946487903594971, "learning_rate": 2.976854567372865e-05, "loss": 0.4556, "step": 9585 }, { "epoch": 1.6405955844600375, "grad_norm": 9.776256561279297, "learning_rate": 2.976652509669973e-05, "loss": 0.9347, "step": 9586 }, { "epoch": 1.6407667294198185, "grad_norm": 6.086668968200684, "learning_rate": 2.976449580739901e-05, "loss": 0.4714, "step": 9587 }, { "epoch": 1.6409378743795995, "grad_norm": 13.835415840148926, "learning_rate": 2.976245780702377e-05, "loss": 1.1116, "step": 9588 }, { "epoch": 1.6411090193393805, "grad_norm": 15.061634063720703, "learning_rate": 2.9760411096776442e-05, "loss": 1.1018, "step": 9589 }, { "epoch": 1.6412801642991615, "grad_norm": 3.073387622833252, "learning_rate": 2.9758355677864574e-05, "loss": 0.2859, "step": 9590 }, { "epoch": 1.6414513092589424, "grad_norm": 5.486158847808838, "learning_rate": 2.9756291551500882e-05, "loss": 0.6109, "step": 9591 }, { "epoch": 1.6416224542187232, "grad_norm": 6.618556976318359, "learning_rate": 2.975421871890319e-05, "loss": 0.5525, "step": 9592 }, { "epoch": 1.6417935991785042, "grad_norm": 17.732152938842773, "learning_rate": 2.9752137181294477e-05, "loss": 2.089, "step": 9593 }, { "epoch": 1.641964744138285, "grad_norm": 4.2366509437561035, "learning_rate": 2.975004693990286e-05, "loss": 0.3538, "step": 9594 }, { "epoch": 1.642135889098066, 
"grad_norm": 0.9246152639389038, "learning_rate": 2.9747947995961572e-05, "loss": 0.1397, "step": 9595 }, { "epoch": 1.642307034057847, "grad_norm": 18.491466522216797, "learning_rate": 2.974584035070901e-05, "loss": 2.3268, "step": 9596 }, { "epoch": 1.642478179017628, "grad_norm": 14.477716445922852, "learning_rate": 2.974372400538867e-05, "loss": 1.1333, "step": 9597 }, { "epoch": 1.642649323977409, "grad_norm": 12.17567253112793, "learning_rate": 2.974159896124921e-05, "loss": 1.1588, "step": 9598 }, { "epoch": 1.64282046893719, "grad_norm": 147.13381958007812, "learning_rate": 2.97394652195444e-05, "loss": 8.4066, "step": 9599 }, { "epoch": 1.6429916138969707, "grad_norm": 13.222512245178223, "learning_rate": 2.973732278153316e-05, "loss": 0.9243, "step": 9600 }, { "epoch": 1.6431627588567517, "grad_norm": 15.897549629211426, "learning_rate": 2.9735171648479534e-05, "loss": 1.4258, "step": 9601 }, { "epoch": 1.6433339038165324, "grad_norm": 16.28536605834961, "learning_rate": 2.973301182165268e-05, "loss": 1.3999, "step": 9602 }, { "epoch": 1.6435050487763134, "grad_norm": 18.308670043945312, "learning_rate": 2.973084330232691e-05, "loss": 1.36, "step": 9603 }, { "epoch": 1.6436761937360944, "grad_norm": 16.17880630493164, "learning_rate": 2.972866609178165e-05, "loss": 1.3926, "step": 9604 }, { "epoch": 1.6438473386958754, "grad_norm": 3.715040922164917, "learning_rate": 2.972648019130146e-05, "loss": 0.4131, "step": 9605 }, { "epoch": 1.6440184836556564, "grad_norm": 13.48600959777832, "learning_rate": 2.972428560217602e-05, "loss": 1.1866, "step": 9606 }, { "epoch": 1.6441896286154374, "grad_norm": 14.225337982177734, "learning_rate": 2.9722082325700142e-05, "loss": 1.3193, "step": 9607 }, { "epoch": 1.6443607735752184, "grad_norm": 7.564727783203125, "learning_rate": 2.971987036317377e-05, "loss": 0.6653, "step": 9608 }, { "epoch": 1.6445319185349991, "grad_norm": 26.217195510864258, "learning_rate": 2.9717649715901953e-05, "loss": 5.5305, "step": 9609 }, 
{ "epoch": 1.6447030634947801, "grad_norm": 0.6173709630966187, "learning_rate": 2.971542038519489e-05, "loss": 0.1401, "step": 9610 }, { "epoch": 1.644874208454561, "grad_norm": 4.426665782928467, "learning_rate": 2.9713182372367874e-05, "loss": 0.2939, "step": 9611 }, { "epoch": 1.6450453534143419, "grad_norm": 20.80668067932129, "learning_rate": 2.9710935678741347e-05, "loss": 2.3449, "step": 9612 }, { "epoch": 1.6452164983741229, "grad_norm": 18.34193992614746, "learning_rate": 2.9708680305640856e-05, "loss": 1.3478, "step": 9613 }, { "epoch": 1.6453876433339039, "grad_norm": 15.609981536865234, "learning_rate": 2.9706416254397077e-05, "loss": 1.6547, "step": 9614 }, { "epoch": 1.6455587882936848, "grad_norm": 3.6447408199310303, "learning_rate": 2.970414352634581e-05, "loss": 0.2663, "step": 9615 }, { "epoch": 1.6457299332534658, "grad_norm": 2.320060968399048, "learning_rate": 2.9701862122827953e-05, "loss": 0.2238, "step": 9616 }, { "epoch": 1.6459010782132466, "grad_norm": 1.5366935729980469, "learning_rate": 2.9699572045189546e-05, "loss": 0.2297, "step": 9617 }, { "epoch": 1.6460722231730276, "grad_norm": 16.6513729095459, "learning_rate": 2.9697273294781744e-05, "loss": 1.3488, "step": 9618 }, { "epoch": 1.6462433681328084, "grad_norm": 9.224735260009766, "learning_rate": 2.96949658729608e-05, "loss": 0.869, "step": 9619 }, { "epoch": 1.6464145130925893, "grad_norm": 1.4602560997009277, "learning_rate": 2.969264978108811e-05, "loss": 0.238, "step": 9620 }, { "epoch": 1.6465856580523703, "grad_norm": 14.85318660736084, "learning_rate": 2.969032502053016e-05, "loss": 1.0903, "step": 9621 }, { "epoch": 1.6467568030121513, "grad_norm": 48.80546188354492, "learning_rate": 2.968799159265857e-05, "loss": 7.2548, "step": 9622 }, { "epoch": 1.6469279479719323, "grad_norm": 15.441446304321289, "learning_rate": 2.968564949885006e-05, "loss": 1.3527, "step": 9623 }, { "epoch": 1.6470990929317133, "grad_norm": 17.650920867919922, "learning_rate": 
2.9683298740486477e-05, "loss": 1.2546, "step": 9624 }, { "epoch": 1.647270237891494, "grad_norm": 0.6384724974632263, "learning_rate": 2.968093931895476e-05, "loss": 0.1246, "step": 9625 }, { "epoch": 1.647441382851275, "grad_norm": 3.891321897506714, "learning_rate": 2.9678571235646983e-05, "loss": 0.2711, "step": 9626 }, { "epoch": 1.6476125278110558, "grad_norm": 3.6239945888519287, "learning_rate": 2.9676194491960313e-05, "loss": 0.3014, "step": 9627 }, { "epoch": 1.6477836727708368, "grad_norm": 5.536699295043945, "learning_rate": 2.9673809089297037e-05, "loss": 0.5479, "step": 9628 }, { "epoch": 1.6479548177306178, "grad_norm": 16.389192581176758, "learning_rate": 2.967141502906454e-05, "loss": 1.324, "step": 9629 }, { "epoch": 1.6481259626903988, "grad_norm": 19.72251319885254, "learning_rate": 2.9669012312675317e-05, "loss": 2.0068, "step": 9630 }, { "epoch": 1.6482971076501798, "grad_norm": 20.543481826782227, "learning_rate": 2.966660094154699e-05, "loss": 1.8467, "step": 9631 }, { "epoch": 1.6484682526099608, "grad_norm": 27.54179573059082, "learning_rate": 2.966418091710226e-05, "loss": 5.3185, "step": 9632 }, { "epoch": 1.6486393975697415, "grad_norm": 20.532411575317383, "learning_rate": 2.966175224076894e-05, "loss": 1.8674, "step": 9633 }, { "epoch": 1.6488105425295225, "grad_norm": 12.591346740722656, "learning_rate": 2.9659314913979966e-05, "loss": 1.2143, "step": 9634 }, { "epoch": 1.6489816874893033, "grad_norm": 18.512563705444336, "learning_rate": 2.9656868938173357e-05, "loss": 2.0024, "step": 9635 }, { "epoch": 1.6491528324490843, "grad_norm": 19.9727783203125, "learning_rate": 2.9654414314792243e-05, "loss": 1.2803, "step": 9636 }, { "epoch": 1.6493239774088653, "grad_norm": 16.64049530029297, "learning_rate": 2.9651951045284857e-05, "loss": 1.533, "step": 9637 }, { "epoch": 1.6494951223686463, "grad_norm": 28.282651901245117, "learning_rate": 2.9649479131104533e-05, "loss": 5.5251, "step": 9638 }, { "epoch": 1.6496662673284272, 
"grad_norm": 3.84965181350708, "learning_rate": 2.9646998573709693e-05, "loss": 0.2989, "step": 9639 }, { "epoch": 1.6498374122882082, "grad_norm": 11.830977439880371, "learning_rate": 2.9644509374563887e-05, "loss": 0.9753, "step": 9640 }, { "epoch": 1.650008557247989, "grad_norm": 2.894202470779419, "learning_rate": 2.9642011535135736e-05, "loss": 0.3025, "step": 9641 }, { "epoch": 1.65017970220777, "grad_norm": 21.37085723876953, "learning_rate": 2.963950505689897e-05, "loss": 1.2203, "step": 9642 }, { "epoch": 1.6503508471675508, "grad_norm": 13.126275062561035, "learning_rate": 2.9636989941332415e-05, "loss": 0.9947, "step": 9643 }, { "epoch": 1.6505219921273317, "grad_norm": 22.479928970336914, "learning_rate": 2.9634466189919995e-05, "loss": 2.1909, "step": 9644 }, { "epoch": 1.6506931370871127, "grad_norm": 13.26056957244873, "learning_rate": 2.9631933804150726e-05, "loss": 0.9559, "step": 9645 }, { "epoch": 1.6508642820468937, "grad_norm": 11.89049243927002, "learning_rate": 2.9629392785518714e-05, "loss": 0.8871, "step": 9646 }, { "epoch": 1.6510354270066747, "grad_norm": 27.84319496154785, "learning_rate": 2.9626843135523174e-05, "loss": 5.4262, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_nli-pairs_loss": 1.4009040594100952, "eval_nli-pairs_runtime": 4.2978, "eval_nli-pairs_samples_per_second": 46.535, "eval_nli-pairs_steps_per_second": 1.629, "eval_sts-test_pearson_cosine": 0.7795249709519518, "eval_sts-test_pearson_dot": 0.6635853674090747, "eval_sts-test_pearson_euclidean": 0.7671070066184192, "eval_sts-test_pearson_manhattan": 0.7706389143893667, "eval_sts-test_pearson_max": 0.7795249709519518, "eval_sts-test_spearman_cosine": 0.7792564630797549, "eval_sts-test_spearman_dot": 0.6464107462169326, "eval_sts-test_spearman_euclidean": 0.7538156837129777, "eval_sts-test_spearman_manhattan": 0.7585753927448657, "eval_sts-test_spearman_max": 0.7792564630797549, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_vitaminc-pairs_loss": 
0.7653877139091492, "eval_vitaminc-pairs_runtime": 2.723, "eval_vitaminc-pairs_samples_per_second": 73.45, "eval_vitaminc-pairs_steps_per_second": 2.571, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_qnli-contrastive_loss": 1.5573922395706177, "eval_qnli-contrastive_runtime": 0.6395, "eval_qnli-contrastive_samples_per_second": 312.745, "eval_qnli-contrastive_steps_per_second": 10.946, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_scitail-pairs-qa_loss": 0.11369741708040237, "eval_scitail-pairs-qa_runtime": 1.6169, "eval_scitail-pairs-qa_samples_per_second": 123.696, "eval_scitail-pairs-qa_steps_per_second": 4.329, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_scitail-pairs-pos_loss": 0.6477851271629333, "eval_scitail-pairs-pos_runtime": 2.6441, "eval_scitail-pairs-pos_samples_per_second": 75.64, "eval_scitail-pairs-pos_steps_per_second": 2.647, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_xsum-pairs_loss": 0.7434301972389221, "eval_xsum-pairs_runtime": 2.6495, "eval_xsum-pairs_samples_per_second": 66.05, "eval_xsum-pairs_steps_per_second": 2.265, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_compression-pairs_loss": 0.24267204105854034, "eval_compression-pairs_runtime": 0.5185, "eval_compression-pairs_samples_per_second": 385.701, "eval_compression-pairs_steps_per_second": 13.5, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_sciq_pairs_loss": 0.44054877758026123, "eval_sciq_pairs_runtime": 9.2484, "eval_sciq_pairs_samples_per_second": 21.625, "eval_sciq_pairs_steps_per_second": 0.757, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_qasc_pairs_loss": 5.688777923583984, "eval_qasc_pairs_runtime": 2.6843, "eval_qasc_pairs_samples_per_second": 74.506, "eval_qasc_pairs_steps_per_second": 2.608, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_openbookqa_pairs_loss": 2.4761011600494385, "eval_openbookqa_pairs_runtime": 0.6537, "eval_openbookqa_pairs_samples_per_second": 105.546, "eval_openbookqa_pairs_steps_per_second": 
4.589, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_msmarco_pairs_loss": 1.0777266025543213, "eval_msmarco_pairs_runtime": 3.996, "eval_msmarco_pairs_samples_per_second": 50.05, "eval_msmarco_pairs_steps_per_second": 1.752, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_nq_pairs_loss": 1.3720529079437256, "eval_nq_pairs_runtime": 8.6533, "eval_nq_pairs_samples_per_second": 23.113, "eval_nq_pairs_steps_per_second": 0.809, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_trivia_pairs_loss": 1.6230666637420654, "eval_trivia_pairs_runtime": 12.832, "eval_trivia_pairs_samples_per_second": 15.586, "eval_trivia_pairs_steps_per_second": 0.546, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_quora_pairs_loss": 0.2109854817390442, "eval_quora_pairs_runtime": 1.618, "eval_quora_pairs_samples_per_second": 123.609, "eval_quora_pairs_steps_per_second": 4.326, "step": 9647 }, { "epoch": 1.6510354270066747, "eval_gooaq_pairs_loss": 0.8535041213035583, "eval_gooaq_pairs_runtime": 2.6858, "eval_gooaq_pairs_samples_per_second": 74.465, "eval_gooaq_pairs_steps_per_second": 2.606, "step": 9647 }, { "epoch": 1.6512065719664557, "grad_norm": 16.278053283691406, "learning_rate": 2.9624284855668394e-05, "loss": 1.0953, "step": 9648 }, { "epoch": 1.6513777169262365, "grad_norm": 8.280930519104004, "learning_rate": 2.9621717947463768e-05, "loss": 0.4636, "step": 9649 }, { "epoch": 1.6515488618860175, "grad_norm": 17.820796966552734, "learning_rate": 2.9619142412423775e-05, "loss": 1.5094, "step": 9650 }, { "epoch": 1.6517200068457982, "grad_norm": 11.769182205200195, "learning_rate": 2.9616558252067985e-05, "loss": 1.0501, "step": 9651 }, { "epoch": 1.6518911518055792, "grad_norm": 1.627022624015808, "learning_rate": 2.9613965467921053e-05, "loss": 0.2561, "step": 9652 }, { "epoch": 1.6520622967653602, "grad_norm": 14.669647216796875, "learning_rate": 2.9611364061512733e-05, "loss": 1.4038, "step": 9653 }, { "epoch": 1.6522334417251412, "grad_norm": 16.77896499633789, 
"learning_rate": 2.960875403437785e-05, "loss": 1.4466, "step": 9654 }, { "epoch": 1.6524045866849222, "grad_norm": 46.12146759033203, "learning_rate": 2.960613538805633e-05, "loss": 5.9943, "step": 9655 }, { "epoch": 1.6525757316447032, "grad_norm": 10.923789978027344, "learning_rate": 2.9603508124093173e-05, "loss": 0.8206, "step": 9656 }, { "epoch": 1.6527468766044842, "grad_norm": 9.892470359802246, "learning_rate": 2.9600872244038473e-05, "loss": 0.7631, "step": 9657 }, { "epoch": 1.652918021564265, "grad_norm": 8.722989082336426, "learning_rate": 2.95982277494474e-05, "loss": 0.8881, "step": 9658 }, { "epoch": 1.653089166524046, "grad_norm": 15.97629451751709, "learning_rate": 2.9595574641880213e-05, "loss": 1.8172, "step": 9659 }, { "epoch": 1.6532603114838267, "grad_norm": 2.1303043365478516, "learning_rate": 2.9592912922902246e-05, "loss": 0.2025, "step": 9660 }, { "epoch": 1.6534314564436077, "grad_norm": 14.248826026916504, "learning_rate": 2.959024259408392e-05, "loss": 1.1596, "step": 9661 }, { "epoch": 1.6536026014033887, "grad_norm": 10.95048713684082, "learning_rate": 2.958756365700073e-05, "loss": 1.1048, "step": 9662 }, { "epoch": 1.6537737463631696, "grad_norm": 20.732789993286133, "learning_rate": 2.958487611323326e-05, "loss": 2.2965, "step": 9663 }, { "epoch": 1.6539448913229506, "grad_norm": 5.646283149719238, "learning_rate": 2.9582179964367155e-05, "loss": 0.3326, "step": 9664 }, { "epoch": 1.6541160362827316, "grad_norm": 14.130982398986816, "learning_rate": 2.9579475211993155e-05, "loss": 1.2945, "step": 9665 }, { "epoch": 1.6542871812425124, "grad_norm": 20.27204132080078, "learning_rate": 2.9576761857707058e-05, "loss": 2.008, "step": 9666 }, { "epoch": 1.6544583262022934, "grad_norm": 14.206185340881348, "learning_rate": 2.9574039903109762e-05, "loss": 1.1003, "step": 9667 }, { "epoch": 1.6546294711620742, "grad_norm": 8.376847267150879, "learning_rate": 2.957130934980721e-05, "loss": 0.7358, "step": 9668 }, { "epoch": 
1.6548006161218551, "grad_norm": 11.66020679473877, "learning_rate": 2.9568570199410436e-05, "loss": 1.1096, "step": 9669 }, { "epoch": 1.6549717610816361, "grad_norm": 69.59512329101562, "learning_rate": 2.9565822453535553e-05, "loss": 8.0515, "step": 9670 }, { "epoch": 1.6551429060414171, "grad_norm": 10.807044982910156, "learning_rate": 2.956306611380372e-05, "loss": 0.7129, "step": 9671 }, { "epoch": 1.655314051001198, "grad_norm": 14.485694885253906, "learning_rate": 2.956030118184119e-05, "loss": 1.342, "step": 9672 }, { "epoch": 1.655485195960979, "grad_norm": 8.254524230957031, "learning_rate": 2.955752765927928e-05, "loss": 0.4922, "step": 9673 }, { "epoch": 1.6556563409207599, "grad_norm": 20.903818130493164, "learning_rate": 2.9554745547754364e-05, "loss": 1.6023, "step": 9674 }, { "epoch": 1.6558274858805409, "grad_norm": 22.715177536010742, "learning_rate": 2.9551954848907897e-05, "loss": 5.2262, "step": 9675 }, { "epoch": 1.6559986308403216, "grad_norm": 18.715639114379883, "learning_rate": 2.9549155564386396e-05, "loss": 1.7412, "step": 9676 }, { "epoch": 1.6561697758001026, "grad_norm": 12.56106948852539, "learning_rate": 2.9546347695841443e-05, "loss": 0.9122, "step": 9677 }, { "epoch": 1.6563409207598836, "grad_norm": 5.172019958496094, "learning_rate": 2.9543531244929677e-05, "loss": 0.3591, "step": 9678 }, { "epoch": 1.6565120657196646, "grad_norm": 4.0367536544799805, "learning_rate": 2.954070621331282e-05, "loss": 0.3267, "step": 9679 }, { "epoch": 1.6566832106794456, "grad_norm": 23.669639587402344, "learning_rate": 2.9537872602657637e-05, "loss": 5.0829, "step": 9680 }, { "epoch": 1.6568543556392266, "grad_norm": 15.211515426635742, "learning_rate": 2.953503041463597e-05, "loss": 1.5702, "step": 9681 }, { "epoch": 1.6570255005990073, "grad_norm": 8.860590934753418, "learning_rate": 2.9532179650924702e-05, "loss": 0.7207, "step": 9682 }, { "epoch": 1.6571966455587883, "grad_norm": 13.997873306274414, "learning_rate": 2.9529320313205797e-05, 
"loss": 1.1755, "step": 9683 }, { "epoch": 1.657367790518569, "grad_norm": 15.151674270629883, "learning_rate": 2.9526452403166268e-05, "loss": 1.2197, "step": 9684 }, { "epoch": 1.65753893547835, "grad_norm": 10.085159301757812, "learning_rate": 2.952357592249818e-05, "loss": 0.8809, "step": 9685 }, { "epoch": 1.657710080438131, "grad_norm": 2.0305681228637695, "learning_rate": 2.952069087289867e-05, "loss": 0.235, "step": 9686 }, { "epoch": 1.657881225397912, "grad_norm": 9.22899341583252, "learning_rate": 2.9517797256069917e-05, "loss": 0.7494, "step": 9687 }, { "epoch": 1.658052370357693, "grad_norm": 0.5515758395195007, "learning_rate": 2.951489507371916e-05, "loss": 0.1294, "step": 9688 }, { "epoch": 1.658223515317474, "grad_norm": 6.569195747375488, "learning_rate": 2.951198432755869e-05, "loss": 0.4616, "step": 9689 }, { "epoch": 1.6583946602772548, "grad_norm": 15.721166610717773, "learning_rate": 2.950906501930585e-05, "loss": 1.3582, "step": 9690 }, { "epoch": 1.6585658052370358, "grad_norm": 10.01101016998291, "learning_rate": 2.950613715068303e-05, "loss": 0.665, "step": 9691 }, { "epoch": 1.6587369501968166, "grad_norm": 15.434731483459473, "learning_rate": 2.9503200723417697e-05, "loss": 1.384, "step": 9692 }, { "epoch": 1.6589080951565975, "grad_norm": 1.8269180059432983, "learning_rate": 2.9500255739242333e-05, "loss": 0.2164, "step": 9693 }, { "epoch": 1.6590792401163785, "grad_norm": 17.16244888305664, "learning_rate": 2.9497302199894482e-05, "loss": 2.357, "step": 9694 }, { "epoch": 1.6592503850761595, "grad_norm": 11.236735343933105, "learning_rate": 2.949434010711674e-05, "loss": 0.8353, "step": 9695 }, { "epoch": 1.6594215300359405, "grad_norm": 14.129475593566895, "learning_rate": 2.949136946265675e-05, "loss": 1.0762, "step": 9696 }, { "epoch": 1.6595926749957215, "grad_norm": 5.514081001281738, "learning_rate": 2.9488390268267186e-05, "loss": 0.3871, "step": 9697 }, { "epoch": 1.6597638199555023, "grad_norm": 20.239360809326172, 
"learning_rate": 2.948540252570579e-05, "loss": 2.2277, "step": 9698 }, { "epoch": 1.6599349649152833, "grad_norm": 8.115821838378906, "learning_rate": 2.9482406236735328e-05, "loss": 0.8528, "step": 9699 }, { "epoch": 1.660106109875064, "grad_norm": 17.807376861572266, "learning_rate": 2.947940140312361e-05, "loss": 1.5262, "step": 9700 }, { "epoch": 1.660277254834845, "grad_norm": 11.524696350097656, "learning_rate": 2.9476388026643504e-05, "loss": 1.0236, "step": 9701 }, { "epoch": 1.660448399794626, "grad_norm": 5.5488057136535645, "learning_rate": 2.9473366109072895e-05, "loss": 0.5689, "step": 9702 }, { "epoch": 1.660619544754407, "grad_norm": 34.07258987426758, "learning_rate": 2.9470335652194726e-05, "loss": 5.7634, "step": 9703 }, { "epoch": 1.660790689714188, "grad_norm": 16.554664611816406, "learning_rate": 2.9467296657796975e-05, "loss": 1.6639, "step": 9704 }, { "epoch": 1.660961834673969, "grad_norm": 12.793015480041504, "learning_rate": 2.9464249127672638e-05, "loss": 1.2175, "step": 9705 }, { "epoch": 1.66113297963375, "grad_norm": 94.53076171875, "learning_rate": 2.9461193063619777e-05, "loss": 7.2682, "step": 9706 }, { "epoch": 1.6613041245935307, "grad_norm": 24.416723251342773, "learning_rate": 2.945812846744147e-05, "loss": 3.0739, "step": 9707 }, { "epoch": 1.6614752695533117, "grad_norm": 3.0911056995391846, "learning_rate": 2.9455055340945834e-05, "loss": 0.3222, "step": 9708 }, { "epoch": 1.6616464145130925, "grad_norm": 7.973036766052246, "learning_rate": 2.9451973685946008e-05, "loss": 0.6707, "step": 9709 }, { "epoch": 1.6618175594728735, "grad_norm": 26.24905776977539, "learning_rate": 2.9448883504260193e-05, "loss": 1.3262, "step": 9710 }, { "epoch": 1.6619887044326545, "grad_norm": 5.3457932472229, "learning_rate": 2.944578479771158e-05, "loss": 0.5675, "step": 9711 }, { "epoch": 1.6621598493924354, "grad_norm": 23.526723861694336, "learning_rate": 2.9442677568128422e-05, "loss": 5.4245, "step": 9712 }, { "epoch": 1.6623309943522164, 
"grad_norm": 1.8648773431777954, "learning_rate": 2.943956181734399e-05, "loss": 0.2466, "step": 9713 }, { "epoch": 1.6625021393119974, "grad_norm": 22.36537742614746, "learning_rate": 2.943643754719658e-05, "loss": 2.7939, "step": 9714 }, { "epoch": 1.6626732842717782, "grad_norm": 18.50316619873047, "learning_rate": 2.943330475952951e-05, "loss": 1.5021, "step": 9715 }, { "epoch": 1.6628444292315592, "grad_norm": 14.86069393157959, "learning_rate": 2.9430163456191132e-05, "loss": 1.0778, "step": 9716 }, { "epoch": 1.66301557419134, "grad_norm": 22.760568618774414, "learning_rate": 2.9427013639034825e-05, "loss": 2.4786, "step": 9717 }, { "epoch": 1.663186719151121, "grad_norm": 12.31818962097168, "learning_rate": 2.942385530991899e-05, "loss": 0.9401, "step": 9718 }, { "epoch": 1.663357864110902, "grad_norm": 16.771989822387695, "learning_rate": 2.942068847070703e-05, "loss": 1.3458, "step": 9719 }, { "epoch": 1.663529009070683, "grad_norm": 12.807435035705566, "learning_rate": 2.9417513123267393e-05, "loss": 1.1198, "step": 9720 }, { "epoch": 1.663700154030464, "grad_norm": 0.5760407447814941, "learning_rate": 2.941432926947354e-05, "loss": 0.1394, "step": 9721 }, { "epoch": 1.6638712989902449, "grad_norm": 7.7680344581604, "learning_rate": 2.941113691120395e-05, "loss": 0.7386, "step": 9722 }, { "epoch": 1.6640424439500257, "grad_norm": 4.307456970214844, "learning_rate": 2.9407936050342114e-05, "loss": 0.3588, "step": 9723 }, { "epoch": 1.6642135889098066, "grad_norm": 2.7927045822143555, "learning_rate": 2.940472668877655e-05, "loss": 0.3137, "step": 9724 }, { "epoch": 1.6643847338695874, "grad_norm": 7.942152500152588, "learning_rate": 2.940150882840079e-05, "loss": 0.7421, "step": 9725 }, { "epoch": 1.6645558788293684, "grad_norm": 15.185585975646973, "learning_rate": 2.939828247111336e-05, "loss": 1.2859, "step": 9726 }, { "epoch": 1.6647270237891494, "grad_norm": 43.534263610839844, "learning_rate": 2.9395047618817837e-05, "loss": 5.7853, "step": 9727 }, 
{ "epoch": 1.6648981687489304, "grad_norm": 12.240732192993164, "learning_rate": 2.939180427342277e-05, "loss": 0.8594, "step": 9728 }, { "epoch": 1.6650693137087114, "grad_norm": 18.222366333007812, "learning_rate": 2.938855243684175e-05, "loss": 2.2052, "step": 9729 }, { "epoch": 1.6652404586684924, "grad_norm": 18.32358741760254, "learning_rate": 2.9385292110993364e-05, "loss": 1.8302, "step": 9730 }, { "epoch": 1.6654116036282731, "grad_norm": 13.088190078735352, "learning_rate": 2.9382023297801196e-05, "loss": 1.2317, "step": 9731 }, { "epoch": 1.665582748588054, "grad_norm": 13.66419792175293, "learning_rate": 2.937874599919387e-05, "loss": 1.095, "step": 9732 }, { "epoch": 1.6657538935478349, "grad_norm": 16.1844539642334, "learning_rate": 2.9375460217104987e-05, "loss": 1.1741, "step": 9733 }, { "epoch": 1.6659250385076159, "grad_norm": 12.171327590942383, "learning_rate": 2.937216595347316e-05, "loss": 1.1299, "step": 9734 }, { "epoch": 1.6660961834673969, "grad_norm": 3.8920178413391113, "learning_rate": 2.9368863210242015e-05, "loss": 0.3289, "step": 9735 }, { "epoch": 1.6662673284271778, "grad_norm": 17.042314529418945, "learning_rate": 2.9365551989360173e-05, "loss": 2.2103, "step": 9736 }, { "epoch": 1.6664384733869588, "grad_norm": 18.59898567199707, "learning_rate": 2.9362232292781264e-05, "loss": 1.9277, "step": 9737 }, { "epoch": 1.6666096183467398, "grad_norm": 0.6641459465026855, "learning_rate": 2.9358904122463907e-05, "loss": 0.1313, "step": 9738 }, { "epoch": 1.6667807633065206, "grad_norm": 18.993438720703125, "learning_rate": 2.9355567480371734e-05, "loss": 2.1129, "step": 9739 }, { "epoch": 1.6669519082663016, "grad_norm": 7.076188087463379, "learning_rate": 2.9352222368473366e-05, "loss": 0.617, "step": 9740 }, { "epoch": 1.6671230532260823, "grad_norm": 15.822833061218262, "learning_rate": 2.934886878874242e-05, "loss": 0.8581, "step": 9741 }, { "epoch": 1.6672941981858633, "grad_norm": 3.408019542694092, "learning_rate": 
2.934550674315752e-05, "loss": 0.2846, "step": 9742 }, { "epoch": 1.6674653431456443, "grad_norm": 15.833969116210938, "learning_rate": 2.9342136233702272e-05, "loss": 1.1137, "step": 9743 }, { "epoch": 1.6676364881054253, "grad_norm": 9.454253196716309, "learning_rate": 2.933875726236528e-05, "loss": 0.5644, "step": 9744 }, { "epoch": 1.6678076330652063, "grad_norm": 8.413461685180664, "learning_rate": 2.9335369831140155e-05, "loss": 0.8676, "step": 9745 }, { "epoch": 1.6679787780249873, "grad_norm": 12.427563667297363, "learning_rate": 2.9331973942025472e-05, "loss": 1.0824, "step": 9746 }, { "epoch": 1.668149922984768, "grad_norm": 14.477243423461914, "learning_rate": 2.932856959702482e-05, "loss": 1.0226, "step": 9747 }, { "epoch": 1.668321067944549, "grad_norm": 1.3101242780685425, "learning_rate": 2.932515679814676e-05, "loss": 0.212, "step": 9748 }, { "epoch": 1.6684922129043298, "grad_norm": 16.019681930541992, "learning_rate": 2.932173554740486e-05, "loss": 2.2303, "step": 9749 }, { "epoch": 1.6686633578641108, "grad_norm": 22.306777954101562, "learning_rate": 2.9318305846817646e-05, "loss": 2.0858, "step": 9750 }, { "epoch": 1.6688345028238918, "grad_norm": 31.97169303894043, "learning_rate": 2.931486769840866e-05, "loss": 5.7405, "step": 9751 }, { "epoch": 1.6690056477836728, "grad_norm": 6.7964019775390625, "learning_rate": 2.9311421104206407e-05, "loss": 0.6555, "step": 9752 }, { "epoch": 1.6691767927434538, "grad_norm": 17.550384521484375, "learning_rate": 2.9307966066244395e-05, "loss": 1.2871, "step": 9753 }, { "epoch": 1.6693479377032348, "grad_norm": 15.242714881896973, "learning_rate": 2.9304502586561082e-05, "loss": 1.5112, "step": 9754 }, { "epoch": 1.6695190826630155, "grad_norm": 17.70952606201172, "learning_rate": 2.9301030667199943e-05, "loss": 1.6275, "step": 9755 }, { "epoch": 1.6696902276227965, "grad_norm": 6.127594470977783, "learning_rate": 2.929755031020941e-05, "loss": 0.3692, "step": 9756 }, { "epoch": 1.6698613725825775, 
"grad_norm": 16.62163734436035, "learning_rate": 2.92940615176429e-05, "loss": 1.4945, "step": 9757 }, { "epoch": 1.6700325175423583, "grad_norm": 28.354595184326172, "learning_rate": 2.92905642915588e-05, "loss": 5.4618, "step": 9758 }, { "epoch": 1.6702036625021393, "grad_norm": 16.324853897094727, "learning_rate": 2.9287058634020483e-05, "loss": 1.1898, "step": 9759 }, { "epoch": 1.6703748074619202, "grad_norm": 0.5821172595024109, "learning_rate": 2.9283544547096292e-05, "loss": 0.1332, "step": 9760 }, { "epoch": 1.6705459524217012, "grad_norm": 14.0308256149292, "learning_rate": 2.9280022032859543e-05, "loss": 1.0275, "step": 9761 }, { "epoch": 1.6707170973814822, "grad_norm": 17.138357162475586, "learning_rate": 2.927649109338853e-05, "loss": 1.6363, "step": 9762 }, { "epoch": 1.6708882423412632, "grad_norm": 24.23528289794922, "learning_rate": 2.92729517307665e-05, "loss": 5.3419, "step": 9763 }, { "epoch": 1.671059387301044, "grad_norm": 16.539188385009766, "learning_rate": 2.9269403947081697e-05, "loss": 1.3348, "step": 9764 }, { "epoch": 1.671230532260825, "grad_norm": 20.00057601928711, "learning_rate": 2.9265847744427305e-05, "loss": 2.0582, "step": 9765 }, { "epoch": 1.6714016772206057, "grad_norm": 14.60428524017334, "learning_rate": 2.92622831249015e-05, "loss": 1.5909, "step": 9766 }, { "epoch": 1.6715728221803867, "grad_norm": 10.645134925842285, "learning_rate": 2.9258710090607405e-05, "loss": 0.9259, "step": 9767 }, { "epoch": 1.6717439671401677, "grad_norm": 11.868138313293457, "learning_rate": 2.925512864365312e-05, "loss": 0.8773, "step": 9768 }, { "epoch": 1.6719151120999487, "grad_norm": 15.451043128967285, "learning_rate": 2.9251538786151702e-05, "loss": 0.9799, "step": 9769 }, { "epoch": 1.6720862570597297, "grad_norm": 20.56831169128418, "learning_rate": 2.9247940520221176e-05, "loss": 1.8285, "step": 9770 }, { "epoch": 1.6722574020195107, "grad_norm": 11.12653923034668, "learning_rate": 2.9244333847984522e-05, "loss": 0.8724, "step": 
9771 }, { "epoch": 1.6724285469792914, "grad_norm": 3.7660152912139893, "learning_rate": 2.9240718771569676e-05, "loss": 0.2828, "step": 9772 }, { "epoch": 1.6725996919390724, "grad_norm": 13.73226547241211, "learning_rate": 2.923709529310955e-05, "loss": 1.0137, "step": 9773 }, { "epoch": 1.6727708368988532, "grad_norm": 10.139899253845215, "learning_rate": 2.923346341474199e-05, "loss": 0.8313, "step": 9774 }, { "epoch": 1.6729419818586342, "grad_norm": 2.498119592666626, "learning_rate": 2.922982313860982e-05, "loss": 0.2681, "step": 9775 }, { "epoch": 1.6731131268184152, "grad_norm": 10.670347213745117, "learning_rate": 2.9226174466860797e-05, "loss": 0.806, "step": 9776 }, { "epoch": 1.6732842717781962, "grad_norm": 16.343782424926758, "learning_rate": 2.922251740164765e-05, "loss": 1.3409, "step": 9777 }, { "epoch": 1.6734554167379772, "grad_norm": 16.054641723632812, "learning_rate": 2.9218851945128058e-05, "loss": 1.34, "step": 9778 }, { "epoch": 1.6736265616977581, "grad_norm": 18.4311466217041, "learning_rate": 2.9215178099464637e-05, "loss": 1.9104, "step": 9779 }, { "epoch": 1.673797706657539, "grad_norm": 0.6089122295379639, "learning_rate": 2.9211495866824966e-05, "loss": 0.1379, "step": 9780 }, { "epoch": 1.67396885161732, "grad_norm": 11.641411781311035, "learning_rate": 2.9207805249381565e-05, "loss": 0.8673, "step": 9781 }, { "epoch": 1.6741399965771007, "grad_norm": 20.432126998901367, "learning_rate": 2.9204106249311904e-05, "loss": 2.1634, "step": 9782 }, { "epoch": 1.6743111415368817, "grad_norm": 32.062740325927734, "learning_rate": 2.92003988687984e-05, "loss": 6.0213, "step": 9783 }, { "epoch": 1.6744822864966626, "grad_norm": 16.315752029418945, "learning_rate": 2.9196683110028412e-05, "loss": 1.2846, "step": 9784 }, { "epoch": 1.6746534314564436, "grad_norm": 5.4877471923828125, "learning_rate": 2.9192958975194248e-05, "loss": 0.2957, "step": 9785 }, { "epoch": 1.6748245764162246, "grad_norm": 13.836814880371094, "learning_rate": 
2.9189226466493146e-05, "loss": 0.8971, "step": 9786 }, { "epoch": 1.6749957213760056, "grad_norm": 14.722482681274414, "learning_rate": 2.9185485586127293e-05, "loss": 1.0007, "step": 9787 }, { "epoch": 1.6751668663357864, "grad_norm": 13.545456886291504, "learning_rate": 2.9181736336303814e-05, "loss": 1.027, "step": 9788 }, { "epoch": 1.6753380112955674, "grad_norm": 12.413348197937012, "learning_rate": 2.9177978719234775e-05, "loss": 1.2273, "step": 9789 }, { "epoch": 1.6755091562553481, "grad_norm": 12.114067077636719, "learning_rate": 2.9174212737137177e-05, "loss": 0.9275, "step": 9790 }, { "epoch": 1.6756803012151291, "grad_norm": 12.126701354980469, "learning_rate": 2.9170438392232947e-05, "loss": 0.9921, "step": 9791 }, { "epoch": 1.67585144617491, "grad_norm": 18.026287078857422, "learning_rate": 2.9166655686748967e-05, "loss": 1.4076, "step": 9792 }, { "epoch": 1.676022591134691, "grad_norm": 18.89499855041504, "learning_rate": 2.916286462291702e-05, "loss": 2.1101, "step": 9793 }, { "epoch": 1.676193736094472, "grad_norm": 12.914731979370117, "learning_rate": 2.9159065202973862e-05, "loss": 1.016, "step": 9794 }, { "epoch": 1.676364881054253, "grad_norm": 0.5270738005638123, "learning_rate": 2.9155257429161136e-05, "loss": 0.1273, "step": 9795 }, { "epoch": 1.6765360260140338, "grad_norm": 0.6374592781066895, "learning_rate": 2.915144130372545e-05, "loss": 0.1385, "step": 9796 }, { "epoch": 1.6767071709738148, "grad_norm": 0.438812255859375, "learning_rate": 2.914761682891831e-05, "loss": 0.1261, "step": 9797 }, { "epoch": 1.6768783159335956, "grad_norm": 28.414331436157227, "learning_rate": 2.9143784006996174e-05, "loss": 5.64, "step": 9798 }, { "epoch": 1.6770494608933766, "grad_norm": 15.795819282531738, "learning_rate": 2.9139942840220407e-05, "loss": 1.18, "step": 9799 }, { "epoch": 1.6772206058531576, "grad_norm": 0.5815959572792053, "learning_rate": 2.91360933308573e-05, "loss": 0.1258, "step": 9800 }, { "epoch": 1.6773917508129386, "grad_norm": 
9.532992362976074, "learning_rate": 2.9132235481178077e-05, "loss": 0.7502, "step": 9801 }, { "epoch": 1.6775628957727196, "grad_norm": 18.019777297973633, "learning_rate": 2.912836929345887e-05, "loss": 1.5333, "step": 9802 }, { "epoch": 1.6777340407325005, "grad_norm": 4.460291862487793, "learning_rate": 2.9124494769980734e-05, "loss": 0.3497, "step": 9803 }, { "epoch": 1.6779051856922813, "grad_norm": 14.737872123718262, "learning_rate": 2.9120611913029645e-05, "loss": 1.1435, "step": 9804 }, { "epoch": 1.6780763306520623, "grad_norm": 26.760541915893555, "learning_rate": 2.9116720724896495e-05, "loss": 3.5447, "step": 9805 }, { "epoch": 1.678247475611843, "grad_norm": 12.057034492492676, "learning_rate": 2.9112821207877096e-05, "loss": 0.8764, "step": 9806 }, { "epoch": 1.678418620571624, "grad_norm": 20.99563980102539, "learning_rate": 2.910891336427216e-05, "loss": 1.8458, "step": 9807 }, { "epoch": 1.678589765531405, "grad_norm": 0.3795873820781708, "learning_rate": 2.9104997196387325e-05, "loss": 0.1169, "step": 9808 }, { "epoch": 1.678760910491186, "grad_norm": 5.172293186187744, "learning_rate": 2.9101072706533134e-05, "loss": 0.3442, "step": 9809 }, { "epoch": 1.678932055450967, "grad_norm": 11.776358604431152, "learning_rate": 2.909713989702504e-05, "loss": 1.0627, "step": 9810 }, { "epoch": 1.679103200410748, "grad_norm": 17.426382064819336, "learning_rate": 2.9093198770183416e-05, "loss": 1.1753, "step": 9811 }, { "epoch": 1.679274345370529, "grad_norm": 2.906306505203247, "learning_rate": 2.9089249328333528e-05, "loss": 0.2811, "step": 9812 }, { "epoch": 1.6794454903303098, "grad_norm": 10.755882263183594, "learning_rate": 2.9085291573805546e-05, "loss": 0.9195, "step": 9813 }, { "epoch": 1.6796166352900908, "grad_norm": 12.953862190246582, "learning_rate": 2.9081325508934556e-05, "loss": 1.0489, "step": 9814 }, { "epoch": 1.6797877802498715, "grad_norm": 5.689671516418457, "learning_rate": 2.9077351136060545e-05, "loss": 0.3681, "step": 9815 }, { 
"epoch": 1.6799589252096525, "grad_norm": 15.082462310791016, "learning_rate": 2.907336845752839e-05, "loss": 1.2996, "step": 9816 }, { "epoch": 1.6801300701694335, "grad_norm": 38.888031005859375, "learning_rate": 2.906937747568789e-05, "loss": 5.4799, "step": 9817 }, { "epoch": 1.6803012151292145, "grad_norm": 16.984092712402344, "learning_rate": 2.9065378192893723e-05, "loss": 1.3902, "step": 9818 }, { "epoch": 1.6804723600889955, "grad_norm": 7.181100368499756, "learning_rate": 2.906137061150547e-05, "loss": 0.7345, "step": 9819 }, { "epoch": 1.6806435050487765, "grad_norm": 8.076035499572754, "learning_rate": 2.905735473388761e-05, "loss": 0.6695, "step": 9820 }, { "epoch": 1.6808146500085572, "grad_norm": 9.857911109924316, "learning_rate": 2.9053330562409525e-05, "loss": 0.6642, "step": 9821 }, { "epoch": 1.6809857949683382, "grad_norm": 23.397382736206055, "learning_rate": 2.9049298099445474e-05, "loss": 1.9808, "step": 9822 }, { "epoch": 1.681156939928119, "grad_norm": 7.789228916168213, "learning_rate": 2.9045257347374616e-05, "loss": 0.6874, "step": 9823 }, { "epoch": 1.6813280848879, "grad_norm": 4.609889507293701, "learning_rate": 2.9041208308581005e-05, "loss": 0.3518, "step": 9824 }, { "epoch": 1.681499229847681, "grad_norm": 14.941460609436035, "learning_rate": 2.903715098545358e-05, "loss": 1.6402, "step": 9825 }, { "epoch": 1.681670374807462, "grad_norm": 1.7049038410186768, "learning_rate": 2.9033085380386163e-05, "loss": 0.2564, "step": 9826 }, { "epoch": 1.681841519767243, "grad_norm": 0.8809799551963806, "learning_rate": 2.902901149577747e-05, "loss": 0.1313, "step": 9827 }, { "epoch": 1.682012664727024, "grad_norm": 21.36051368713379, "learning_rate": 2.9024929334031102e-05, "loss": 1.9333, "step": 9828 }, { "epoch": 1.6821838096868047, "grad_norm": 4.450123310089111, "learning_rate": 2.9020838897555538e-05, "loss": 0.4634, "step": 9829 }, { "epoch": 1.6823549546465857, "grad_norm": 8.756388664245605, "learning_rate": 2.901674018876413e-05, 
"loss": 0.9206, "step": 9830 }, { "epoch": 1.6825260996063665, "grad_norm": 18.722309112548828, "learning_rate": 2.9012633210075146e-05, "loss": 1.4291, "step": 9831 }, { "epoch": 1.6826972445661474, "grad_norm": 14.32200813293457, "learning_rate": 2.900851796391169e-05, "loss": 1.2152, "step": 9832 }, { "epoch": 1.6828683895259284, "grad_norm": 10.229337692260742, "learning_rate": 2.9004394452701776e-05, "loss": 0.9154, "step": 9833 }, { "epoch": 1.6830395344857094, "grad_norm": 5.019619941711426, "learning_rate": 2.9000262678878266e-05, "loss": 0.3413, "step": 9834 }, { "epoch": 1.6832106794454904, "grad_norm": 11.813813209533691, "learning_rate": 2.8996122644878938e-05, "loss": 1.0278, "step": 9835 }, { "epoch": 1.6833818244052714, "grad_norm": 17.820865631103516, "learning_rate": 2.8991974353146388e-05, "loss": 1.5699, "step": 9836 }, { "epoch": 1.6835529693650522, "grad_norm": 17.85036849975586, "learning_rate": 2.898781780612814e-05, "loss": 1.3099, "step": 9837 }, { "epoch": 1.6837241143248332, "grad_norm": 12.714621543884277, "learning_rate": 2.8983653006276544e-05, "loss": 0.9641, "step": 9838 }, { "epoch": 1.683895259284614, "grad_norm": 9.561012268066406, "learning_rate": 2.897947995604885e-05, "loss": 0.6975, "step": 9839 }, { "epoch": 1.684066404244395, "grad_norm": 10.489484786987305, "learning_rate": 2.8975298657907158e-05, "loss": 0.8856, "step": 9840 }, { "epoch": 1.684237549204176, "grad_norm": 18.42349624633789, "learning_rate": 2.8971109114318442e-05, "loss": 1.9704, "step": 9841 }, { "epoch": 1.6844086941639569, "grad_norm": 0.5426035523414612, "learning_rate": 2.8966911327754543e-05, "loss": 0.1335, "step": 9842 }, { "epoch": 1.6845798391237379, "grad_norm": 0.9668150544166565, "learning_rate": 2.8962705300692156e-05, "loss": 0.1962, "step": 9843 }, { "epoch": 1.6847509840835189, "grad_norm": 5.569203853607178, "learning_rate": 2.8958491035612842e-05, "loss": 0.5411, "step": 9844 }, { "epoch": 1.6849221290432996, "grad_norm": 
19.17258071899414, "learning_rate": 2.895426853500303e-05, "loss": 1.0881, "step": 9845 }, { "epoch": 1.6850932740030806, "grad_norm": 17.384492874145508, "learning_rate": 2.8950037801353995e-05, "loss": 1.3613, "step": 9846 }, { "epoch": 1.6852644189628614, "grad_norm": 15.415716171264648, "learning_rate": 2.8945798837161884e-05, "loss": 1.3957, "step": 9847 }, { "epoch": 1.6854355639226424, "grad_norm": 12.663168907165527, "learning_rate": 2.894155164492768e-05, "loss": 1.0979, "step": 9848 }, { "epoch": 1.6856067088824234, "grad_norm": 6.5817766189575195, "learning_rate": 2.8937296227157246e-05, "loss": 0.7224, "step": 9849 }, { "epoch": 1.6857778538422044, "grad_norm": 16.315296173095703, "learning_rate": 2.8933032586361278e-05, "loss": 2.0086, "step": 9850 }, { "epoch": 1.6859489988019853, "grad_norm": 13.393712997436523, "learning_rate": 2.8928760725055328e-05, "loss": 1.1386, "step": 9851 }, { "epoch": 1.6861201437617663, "grad_norm": 30.97249984741211, "learning_rate": 2.8924480645759805e-05, "loss": 5.188, "step": 9852 }, { "epoch": 1.686291288721547, "grad_norm": 15.87905502319336, "learning_rate": 2.892019235099996e-05, "loss": 1.2392, "step": 9853 }, { "epoch": 1.686462433681328, "grad_norm": 8.818626403808594, "learning_rate": 2.8915895843305896e-05, "loss": 0.7025, "step": 9854 }, { "epoch": 1.6866335786411089, "grad_norm": 0.4936555027961731, "learning_rate": 2.891159112521256e-05, "loss": 0.1229, "step": 9855 }, { "epoch": 1.6868047236008898, "grad_norm": 11.993104934692383, "learning_rate": 2.8907278199259737e-05, "loss": 0.8046, "step": 9856 }, { "epoch": 1.6869758685606708, "grad_norm": 19.62760353088379, "learning_rate": 2.8902957067992063e-05, "loss": 2.4751, "step": 9857 }, { "epoch": 1.6871470135204518, "grad_norm": 5.026688575744629, "learning_rate": 2.8898627733959008e-05, "loss": 0.366, "step": 9858 }, { "epoch": 1.6873181584802328, "grad_norm": 5.392689228057861, "learning_rate": 2.8894290199714893e-05, "loss": 0.2688, "step": 9859 }, { 
"epoch": 1.6874893034400138, "grad_norm": 11.269207954406738, "learning_rate": 2.888994446781886e-05, "loss": 0.9158, "step": 9860 }, { "epoch": 1.6876604483997948, "grad_norm": 11.431229591369629, "learning_rate": 2.888559054083491e-05, "loss": 1.0166, "step": 9861 }, { "epoch": 1.6878315933595756, "grad_norm": 20.1605167388916, "learning_rate": 2.888122842133185e-05, "loss": 2.8746, "step": 9862 }, { "epoch": 1.6880027383193565, "grad_norm": 2.420452833175659, "learning_rate": 2.887685811188335e-05, "loss": 0.2476, "step": 9863 }, { "epoch": 1.6881738832791373, "grad_norm": 16.53036117553711, "learning_rate": 2.8872479615067897e-05, "loss": 1.5109, "step": 9864 }, { "epoch": 1.6883450282389183, "grad_norm": 14.703415870666504, "learning_rate": 2.8868092933468808e-05, "loss": 1.1672, "step": 9865 }, { "epoch": 1.6885161731986993, "grad_norm": 0.5735829472541809, "learning_rate": 2.8863698069674227e-05, "loss": 0.1384, "step": 9866 }, { "epoch": 1.6886873181584803, "grad_norm": 14.369513511657715, "learning_rate": 2.885929502627714e-05, "loss": 1.3884, "step": 9867 }, { "epoch": 1.6888584631182613, "grad_norm": 15.814953804016113, "learning_rate": 2.8854883805875346e-05, "loss": 1.3426, "step": 9868 }, { "epoch": 1.6890296080780423, "grad_norm": 13.552285194396973, "learning_rate": 2.8850464411071465e-05, "loss": 1.1599, "step": 9869 }, { "epoch": 1.689200753037823, "grad_norm": 5.1314215660095215, "learning_rate": 2.884603684447296e-05, "loss": 0.5197, "step": 9870 }, { "epoch": 1.689371897997604, "grad_norm": 7.617180824279785, "learning_rate": 2.8841601108692086e-05, "loss": 0.7705, "step": 9871 }, { "epoch": 1.6895430429573848, "grad_norm": 12.718794822692871, "learning_rate": 2.8837157206345945e-05, "loss": 1.1354, "step": 9872 }, { "epoch": 1.6897141879171658, "grad_norm": 17.702369689941406, "learning_rate": 2.8832705140056437e-05, "loss": 1.3154, "step": 9873 }, { "epoch": 1.6898853328769468, "grad_norm": 13.156858444213867, "learning_rate": 
2.8828244912450305e-05, "loss": 0.8987, "step": 9874 }, { "epoch": 1.6900564778367277, "grad_norm": 11.836379051208496, "learning_rate": 2.8823776526159063e-05, "loss": 0.9271, "step": 9875 }, { "epoch": 1.6902276227965087, "grad_norm": 26.937326431274414, "learning_rate": 2.8819299983819096e-05, "loss": 3.2997, "step": 9876 }, { "epoch": 1.6903987677562897, "grad_norm": 15.841500282287598, "learning_rate": 2.8814815288071547e-05, "loss": 1.1996, "step": 9877 }, { "epoch": 1.6905699127160705, "grad_norm": 21.622220993041992, "learning_rate": 2.8810322441562403e-05, "loss": 2.4283, "step": 9878 }, { "epoch": 1.6907410576758515, "grad_norm": 19.27534294128418, "learning_rate": 2.8805821446942442e-05, "loss": 2.1447, "step": 9879 }, { "epoch": 1.6909122026356322, "grad_norm": 9.359821319580078, "learning_rate": 2.8801312306867275e-05, "loss": 0.7215, "step": 9880 }, { "epoch": 1.6910833475954132, "grad_norm": 10.952072143554688, "learning_rate": 2.8796795023997282e-05, "loss": 0.8573, "step": 9881 }, { "epoch": 1.6912544925551942, "grad_norm": 12.66530990600586, "learning_rate": 2.879226960099768e-05, "loss": 0.9994, "step": 9882 }, { "epoch": 1.6914256375149752, "grad_norm": 7.919592380523682, "learning_rate": 2.8787736040538466e-05, "loss": 0.6902, "step": 9883 }, { "epoch": 1.6915967824747562, "grad_norm": 8.239189147949219, "learning_rate": 2.878319434529445e-05, "loss": 0.6065, "step": 9884 }, { "epoch": 1.6917679274345372, "grad_norm": 33.56480026245117, "learning_rate": 2.877864451794525e-05, "loss": 5.3664, "step": 9885 }, { "epoch": 1.691939072394318, "grad_norm": 0.8319238424301147, "learning_rate": 2.8774086561175256e-05, "loss": 0.1326, "step": 9886 }, { "epoch": 1.692110217354099, "grad_norm": 4.896884918212891, "learning_rate": 2.8769520477673678e-05, "loss": 0.2708, "step": 9887 }, { "epoch": 1.6922813623138797, "grad_norm": 21.196035385131836, "learning_rate": 2.8764946270134506e-05, "loss": 1.7286, "step": 9888 }, { "epoch": 1.6924525072736607, 
"grad_norm": 16.550209045410156, "learning_rate": 2.8760363941256532e-05, "loss": 1.3199, "step": 9889 }, { "epoch": 1.6926236522334417, "grad_norm": 16.431983947753906, "learning_rate": 2.875577349374334e-05, "loss": 1.0718, "step": 9890 }, { "epoch": 1.6927947971932227, "grad_norm": 25.806367874145508, "learning_rate": 2.8751174930303295e-05, "loss": 1.1958, "step": 9891 }, { "epoch": 1.6929659421530037, "grad_norm": 31.568248748779297, "learning_rate": 2.8746568253649562e-05, "loss": 5.3833, "step": 9892 }, { "epoch": 1.6931370871127847, "grad_norm": 10.286754608154297, "learning_rate": 2.8741953466500084e-05, "loss": 0.7985, "step": 9893 }, { "epoch": 1.6933082320725654, "grad_norm": 21.115070343017578, "learning_rate": 2.873733057157759e-05, "loss": 1.7544, "step": 9894 }, { "epoch": 1.6934793770323464, "grad_norm": 19.583215713500977, "learning_rate": 2.8732699571609594e-05, "loss": 2.3346, "step": 9895 }, { "epoch": 1.6936505219921272, "grad_norm": 4.669119358062744, "learning_rate": 2.8728060469328404e-05, "loss": 0.3439, "step": 9896 }, { "epoch": 1.6938216669519082, "grad_norm": 16.40795135498047, "learning_rate": 2.8723413267471086e-05, "loss": 1.2605, "step": 9897 }, { "epoch": 1.6939928119116892, "grad_norm": 17.20766830444336, "learning_rate": 2.8718757968779503e-05, "loss": 1.3757, "step": 9898 }, { "epoch": 1.6941639568714701, "grad_norm": 7.8763322830200195, "learning_rate": 2.871409457600028e-05, "loss": 1.1503, "step": 9899 }, { "epoch": 1.6943351018312511, "grad_norm": 13.040277481079102, "learning_rate": 2.8709423091884836e-05, "loss": 1.055, "step": 9900 }, { "epoch": 1.6945062467910321, "grad_norm": 1.5957226753234863, "learning_rate": 2.8704743519189347e-05, "loss": 0.2287, "step": 9901 }, { "epoch": 1.694677391750813, "grad_norm": 13.322244644165039, "learning_rate": 2.8700055860674765e-05, "loss": 1.0509, "step": 9902 }, { "epoch": 1.6948485367105939, "grad_norm": 13.666224479675293, "learning_rate": 2.8695360119106826e-05, "loss": 1.192, 
"step": 9903 }, { "epoch": 1.6950196816703746, "grad_norm": 0.48769161105155945, "learning_rate": 2.8690656297256014e-05, "loss": 0.1277, "step": 9904 }, { "epoch": 1.6951908266301556, "grad_norm": 13.460355758666992, "learning_rate": 2.868594439789759e-05, "loss": 1.0904, "step": 9905 }, { "epoch": 1.6953619715899366, "grad_norm": 10.510547637939453, "learning_rate": 2.868122442381159e-05, "loss": 0.7138, "step": 9906 }, { "epoch": 1.6955331165497176, "grad_norm": 2.7616004943847656, "learning_rate": 2.8676496377782805e-05, "loss": 0.3255, "step": 9907 }, { "epoch": 1.6957042615094986, "grad_norm": 9.53403091430664, "learning_rate": 2.867176026260079e-05, "loss": 0.8868, "step": 9908 }, { "epoch": 1.6958754064692796, "grad_norm": 20.327848434448242, "learning_rate": 2.866701608105985e-05, "loss": 2.5847, "step": 9909 }, { "epoch": 1.6960465514290606, "grad_norm": 0.41666966676712036, "learning_rate": 2.866226383595907e-05, "loss": 0.1262, "step": 9910 }, { "epoch": 1.6962176963888413, "grad_norm": 1.8370481729507446, "learning_rate": 2.865750353010227e-05, "loss": 0.2498, "step": 9911 }, { "epoch": 1.6963888413486223, "grad_norm": 10.102760314941406, "learning_rate": 2.8652735166298053e-05, "loss": 1.0128, "step": 9912 }, { "epoch": 1.696559986308403, "grad_norm": 15.321733474731445, "learning_rate": 2.864795874735975e-05, "loss": 1.1352, "step": 9913 }, { "epoch": 1.696731131268184, "grad_norm": 15.445337295532227, "learning_rate": 2.8643174276105456e-05, "loss": 1.5214, "step": 9914 }, { "epoch": 1.696902276227965, "grad_norm": 17.868240356445312, "learning_rate": 2.8638381755358024e-05, "loss": 1.1462, "step": 9915 }, { "epoch": 1.697073421187746, "grad_norm": 11.179777145385742, "learning_rate": 2.863358118794503e-05, "loss": 1.1148, "step": 9916 }, { "epoch": 1.697244566147527, "grad_norm": 12.652956008911133, "learning_rate": 2.862877257669884e-05, "loss": 1.0149, "step": 9917 }, { "epoch": 1.697415711107308, "grad_norm": 0.4308198094367981, "learning_rate": 
2.8623955924456525e-05, "loss": 0.1214, "step": 9918 }, { "epoch": 1.6975868560670888, "grad_norm": 3.7478044033050537, "learning_rate": 2.8619131234059933e-05, "loss": 0.2999, "step": 9919 }, { "epoch": 1.6977580010268698, "grad_norm": 98.03883361816406, "learning_rate": 2.861429850835561e-05, "loss": 6.8487, "step": 9920 }, { "epoch": 1.6979291459866506, "grad_norm": 0.5506452918052673, "learning_rate": 2.8609457750194903e-05, "loss": 0.1297, "step": 9921 }, { "epoch": 1.6981002909464316, "grad_norm": 2.451768159866333, "learning_rate": 2.8604608962433847e-05, "loss": 0.2305, "step": 9922 }, { "epoch": 1.6982714359062125, "grad_norm": 20.242000579833984, "learning_rate": 2.8599752147933246e-05, "loss": 0.5926, "step": 9923 }, { "epoch": 1.6984425808659935, "grad_norm": 4.213416576385498, "learning_rate": 2.859488730955861e-05, "loss": 0.3877, "step": 9924 }, { "epoch": 1.6986137258257745, "grad_norm": 9.181305885314941, "learning_rate": 2.859001445018022e-05, "loss": 0.701, "step": 9925 }, { "epoch": 1.6987848707855555, "grad_norm": 14.503806114196777, "learning_rate": 2.858513357267306e-05, "loss": 1.2494, "step": 9926 }, { "epoch": 1.6989560157453363, "grad_norm": 13.629976272583008, "learning_rate": 2.858024467991686e-05, "loss": 1.2104, "step": 9927 }, { "epoch": 1.6991271607051173, "grad_norm": 2.1116864681243896, "learning_rate": 2.8575347774796066e-05, "loss": 0.2704, "step": 9928 }, { "epoch": 1.699298305664898, "grad_norm": 1.7175486087799072, "learning_rate": 2.857044286019987e-05, "loss": 0.2202, "step": 9929 }, { "epoch": 1.699469450624679, "grad_norm": 13.384143829345703, "learning_rate": 2.8565529939022174e-05, "loss": 1.031, "step": 9930 }, { "epoch": 1.69964059558446, "grad_norm": 9.279143333435059, "learning_rate": 2.856060901416161e-05, "loss": 0.8115, "step": 9931 }, { "epoch": 1.699811740544241, "grad_norm": 17.966066360473633, "learning_rate": 2.8555680088521526e-05, "loss": 1.3173, "step": 9932 }, { "epoch": 1.699982885504022, "grad_norm": 
9.229269981384277, "learning_rate": 2.855074316501e-05, "loss": 0.7809, "step": 9933 }, { "epoch": 1.700154030463803, "grad_norm": 5.362792015075684, "learning_rate": 2.8545798246539824e-05, "loss": 0.3446, "step": 9934 }, { "epoch": 1.7003251754235837, "grad_norm": 0.44387826323509216, "learning_rate": 2.8540845336028503e-05, "loss": 0.1195, "step": 9935 }, { "epoch": 1.7004963203833647, "grad_norm": 2.5511434078216553, "learning_rate": 2.853588443639827e-05, "loss": 0.3339, "step": 9936 }, { "epoch": 1.7006674653431455, "grad_norm": 18.185916900634766, "learning_rate": 2.8530915550576057e-05, "loss": 1.9295, "step": 9937 }, { "epoch": 1.7008386103029265, "grad_norm": 0.8961790800094604, "learning_rate": 2.8525938681493512e-05, "loss": 0.16, "step": 9938 }, { "epoch": 1.7010097552627075, "grad_norm": 26.16118049621582, "learning_rate": 2.8520953832087005e-05, "loss": 5.277, "step": 9939 }, { "epoch": 1.7011809002224885, "grad_norm": 4.105901718139648, "learning_rate": 2.8515961005297594e-05, "loss": 0.3048, "step": 9940 }, { "epoch": 1.7013520451822695, "grad_norm": 9.890685081481934, "learning_rate": 2.851096020407106e-05, "loss": 0.7518, "step": 9941 }, { "epoch": 1.7015231901420504, "grad_norm": 0.4122461974620819, "learning_rate": 2.850595143135788e-05, "loss": 0.1221, "step": 9942 }, { "epoch": 1.7016943351018312, "grad_norm": 27.437044143676758, "learning_rate": 2.850093469011324e-05, "loss": 5.2539, "step": 9943 }, { "epoch": 1.7018654800616122, "grad_norm": 16.727657318115234, "learning_rate": 2.8495909983297022e-05, "loss": 1.6329, "step": 9944 }, { "epoch": 1.702036625021393, "grad_norm": 14.915423393249512, "learning_rate": 2.8490877313873814e-05, "loss": 1.5229, "step": 9945 }, { "epoch": 1.702207769981174, "grad_norm": 19.803382873535156, "learning_rate": 2.84858366848129e-05, "loss": 2.3938, "step": 9946 }, { "epoch": 1.702378914940955, "grad_norm": 98.7181396484375, "learning_rate": 2.848078809908825e-05, "loss": 8.6278, "step": 9947 }, { "epoch": 
1.702550059900736, "grad_norm": 3.8129982948303223, "learning_rate": 2.8475731559678542e-05, "loss": 0.2656, "step": 9948 }, { "epoch": 1.702721204860517, "grad_norm": 15.110697746276855, "learning_rate": 2.8470667069567143e-05, "loss": 0.9848, "step": 9949 }, { "epoch": 1.702892349820298, "grad_norm": 1.6576757431030273, "learning_rate": 2.8465594631742113e-05, "loss": 0.2098, "step": 9950 }, { "epoch": 1.7030634947800787, "grad_norm": 16.213367462158203, "learning_rate": 2.846051424919619e-05, "loss": 1.5889, "step": 9951 }, { "epoch": 1.7032346397398597, "grad_norm": 10.401459693908691, "learning_rate": 2.8455425924926812e-05, "loss": 0.9091, "step": 9952 }, { "epoch": 1.7034057846996404, "grad_norm": 20.338096618652344, "learning_rate": 2.8450329661936096e-05, "loss": 1.5887, "step": 9953 }, { "epoch": 1.7035769296594214, "grad_norm": 2.4694924354553223, "learning_rate": 2.8445225463230852e-05, "loss": 0.2423, "step": 9954 }, { "epoch": 1.7037480746192024, "grad_norm": 9.678728103637695, "learning_rate": 2.8440113331822553e-05, "loss": 0.6702, "step": 9955 }, { "epoch": 1.7039192195789834, "grad_norm": 15.161781311035156, "learning_rate": 2.843499327072737e-05, "loss": 1.2557, "step": 9956 }, { "epoch": 1.7040903645387644, "grad_norm": 22.399354934692383, "learning_rate": 2.8429865282966146e-05, "loss": 5.2376, "step": 9957 }, { "epoch": 1.7042615094985454, "grad_norm": 17.059072494506836, "learning_rate": 2.8424729371564404e-05, "loss": 1.9918, "step": 9958 }, { "epoch": 1.7044326544583264, "grad_norm": 11.05090045928955, "learning_rate": 2.8419585539552334e-05, "loss": 1.2246, "step": 9959 }, { "epoch": 1.7046037994181071, "grad_norm": 20.176177978515625, "learning_rate": 2.841443378996481e-05, "loss": 2.6545, "step": 9960 }, { "epoch": 1.7047749443778881, "grad_norm": 19.83648681640625, "learning_rate": 2.8409274125841366e-05, "loss": 1.4981, "step": 9961 }, { "epoch": 1.704946089337669, "grad_norm": 10.89116382598877, "learning_rate": 
2.8404106550226224e-05, "loss": 0.809, "step": 9962 }, { "epoch": 1.7051172342974499, "grad_norm": 10.138261795043945, "learning_rate": 2.839893106616824e-05, "loss": 0.8735, "step": 9963 }, { "epoch": 1.7052883792572309, "grad_norm": 8.025675773620605, "learning_rate": 2.8393747676720982e-05, "loss": 0.7385, "step": 9964 }, { "epoch": 1.7054595242170119, "grad_norm": 0.403274804353714, "learning_rate": 2.8388556384942638e-05, "loss": 0.1169, "step": 9965 }, { "epoch": 1.7056306691767928, "grad_norm": 9.022873878479004, "learning_rate": 2.838335719389609e-05, "loss": 0.5427, "step": 9966 }, { "epoch": 1.7058018141365738, "grad_norm": 6.677605628967285, "learning_rate": 2.8378150106648857e-05, "loss": 0.6303, "step": 9967 }, { "epoch": 1.7059729590963546, "grad_norm": 6.420729637145996, "learning_rate": 2.8372935126273144e-05, "loss": 0.5518, "step": 9968 }, { "epoch": 1.7061441040561356, "grad_norm": 9.240416526794434, "learning_rate": 2.8367712255845776e-05, "loss": 0.6377, "step": 9969 }, { "epoch": 1.7063152490159164, "grad_norm": 15.703892707824707, "learning_rate": 2.8362481498448274e-05, "loss": 1.1793, "step": 9970 }, { "epoch": 1.7064863939756973, "grad_norm": 12.604381561279297, "learning_rate": 2.8357242857166787e-05, "loss": 0.968, "step": 9971 }, { "epoch": 1.7066575389354783, "grad_norm": 26.917823791503906, "learning_rate": 2.8351996335092114e-05, "loss": 4.9579, "step": 9972 }, { "epoch": 1.7068286838952593, "grad_norm": 6.124308109283447, "learning_rate": 2.8346741935319716e-05, "loss": 0.342, "step": 9973 }, { "epoch": 1.7069998288550403, "grad_norm": 0.5692394375801086, "learning_rate": 2.8341479660949704e-05, "loss": 0.1261, "step": 9974 }, { "epoch": 1.7071709738148213, "grad_norm": 11.414334297180176, "learning_rate": 2.8336209515086813e-05, "loss": 1.0165, "step": 9975 }, { "epoch": 1.707342118774602, "grad_norm": 65.06340026855469, "learning_rate": 2.8330931500840446e-05, "loss": 6.6475, "step": 9976 }, { "epoch": 1.707513263734383, 
"grad_norm": 99.71956634521484, "learning_rate": 2.8325645621324642e-05, "loss": 7.8611, "step": 9977 }, { "epoch": 1.7076844086941638, "grad_norm": 6.38923978805542, "learning_rate": 2.832035187965807e-05, "loss": 0.7355, "step": 9978 }, { "epoch": 1.7078555536539448, "grad_norm": 8.449542999267578, "learning_rate": 2.831505027896405e-05, "loss": 0.8198, "step": 9979 }, { "epoch": 1.7080266986137258, "grad_norm": 17.727577209472656, "learning_rate": 2.8309740822370526e-05, "loss": 2.0581, "step": 9980 }, { "epoch": 1.7081978435735068, "grad_norm": 0.8798796534538269, "learning_rate": 2.8304423513010095e-05, "loss": 0.1353, "step": 9981 }, { "epoch": 1.7083689885332878, "grad_norm": 3.3156654834747314, "learning_rate": 2.8299098354019984e-05, "loss": 0.3228, "step": 9982 }, { "epoch": 1.7085401334930688, "grad_norm": 4.650891304016113, "learning_rate": 2.8293765348542028e-05, "loss": 0.3319, "step": 9983 }, { "epoch": 1.7087112784528495, "grad_norm": 0.7937259674072266, "learning_rate": 2.8288424499722717e-05, "loss": 0.1319, "step": 9984 }, { "epoch": 1.7088824234126305, "grad_norm": 27.978031158447266, "learning_rate": 2.828307581071316e-05, "loss": 4.9367, "step": 9985 }, { "epoch": 1.7090535683724113, "grad_norm": 17.560062408447266, "learning_rate": 2.827771928466909e-05, "loss": 1.3053, "step": 9986 }, { "epoch": 1.7092247133321923, "grad_norm": 2.0997729301452637, "learning_rate": 2.8272354924750868e-05, "loss": 0.2256, "step": 9987 }, { "epoch": 1.7093958582919733, "grad_norm": 11.626079559326172, "learning_rate": 2.8266982734123462e-05, "loss": 0.7937, "step": 9988 }, { "epoch": 1.7095670032517543, "grad_norm": 27.37969207763672, "learning_rate": 2.8261602715956492e-05, "loss": 5.3773, "step": 9989 }, { "epoch": 1.7097381482115352, "grad_norm": 13.20698356628418, "learning_rate": 2.8256214873424167e-05, "loss": 0.9584, "step": 9990 }, { "epoch": 1.7099092931713162, "grad_norm": 15.431042671203613, "learning_rate": 2.8250819209705313e-05, "loss": 1.3026, 
"step": 9991 }, { "epoch": 1.710080438131097, "grad_norm": 3.749469041824341, "learning_rate": 2.824541572798339e-05, "loss": 0.3736, "step": 9992 }, { "epoch": 1.710251583090878, "grad_norm": 13.427148818969727, "learning_rate": 2.824000443144647e-05, "loss": 0.9376, "step": 9993 }, { "epoch": 1.7104227280506588, "grad_norm": 14.535947799682617, "learning_rate": 2.823458532328721e-05, "loss": 1.2715, "step": 9994 }, { "epoch": 1.7105938730104397, "grad_norm": 18.60892105102539, "learning_rate": 2.822915840670289e-05, "loss": 1.5179, "step": 9995 }, { "epoch": 1.7107650179702207, "grad_norm": 13.396828651428223, "learning_rate": 2.8223723684895413e-05, "loss": 0.9459, "step": 9996 }, { "epoch": 1.7109361629300017, "grad_norm": 3.1966774463653564, "learning_rate": 2.8218281161071262e-05, "loss": 0.2944, "step": 9997 }, { "epoch": 1.7111073078897827, "grad_norm": 14.99667739868164, "learning_rate": 2.8212830838441544e-05, "loss": 1.2385, "step": 9998 }, { "epoch": 1.7112784528495637, "grad_norm": 0.4614241123199463, "learning_rate": 2.8207372720221944e-05, "loss": 0.1258, "step": 9999 }, { "epoch": 1.7114495978093445, "grad_norm": 17.445667266845703, "learning_rate": 2.820190680963277e-05, "loss": 1.1479, "step": 10000 }, { "epoch": 1.7116207427691255, "grad_norm": 13.52795696258545, "learning_rate": 2.8196433109898917e-05, "loss": 1.343, "step": 10001 }, { "epoch": 1.7117918877289062, "grad_norm": 2.96675181388855, "learning_rate": 2.8190951624249866e-05, "loss": 0.2561, "step": 10002 }, { "epoch": 1.7119630326886872, "grad_norm": 12.088296890258789, "learning_rate": 2.818546235591972e-05, "loss": 0.8192, "step": 10003 }, { "epoch": 1.7121341776484682, "grad_norm": 12.255960464477539, "learning_rate": 2.8179965308147136e-05, "loss": 1.0356, "step": 10004 }, { "epoch": 1.7123053226082492, "grad_norm": 13.746953964233398, "learning_rate": 2.81744604841754e-05, "loss": 1.3035, "step": 10005 }, { "epoch": 1.7124764675680302, "grad_norm": 15.755478858947754, 
"learning_rate": 2.816894788725234e-05, "loss": 1.7555, "step": 10006 }, { "epoch": 1.7126476125278112, "grad_norm": 12.268940925598145, "learning_rate": 2.8163427520630427e-05, "loss": 1.0701, "step": 10007 }, { "epoch": 1.712818757487592, "grad_norm": 11.39901351928711, "learning_rate": 2.8157899387566658e-05, "loss": 0.8335, "step": 10008 }, { "epoch": 1.712989902447373, "grad_norm": 22.460315704345703, "learning_rate": 2.8152363491322664e-05, "loss": 2.5556, "step": 10009 }, { "epoch": 1.713161047407154, "grad_norm": 2.0664615631103516, "learning_rate": 2.8146819835164608e-05, "loss": 0.2383, "step": 10010 }, { "epoch": 1.7133321923669347, "grad_norm": 6.789374828338623, "learning_rate": 2.8141268422363283e-05, "loss": 0.5894, "step": 10011 }, { "epoch": 1.7135033373267157, "grad_norm": 14.659259796142578, "learning_rate": 2.8135709256194e-05, "loss": 1.0485, "step": 10012 }, { "epoch": 1.7136744822864967, "grad_norm": 28.647369384765625, "learning_rate": 2.81301423399367e-05, "loss": 5.1767, "step": 10013 }, { "epoch": 1.7138456272462776, "grad_norm": 10.130996704101562, "learning_rate": 2.8124567676875854e-05, "loss": 1.0341, "step": 10014 }, { "epoch": 1.7140167722060586, "grad_norm": 1.7324222326278687, "learning_rate": 2.8118985270300538e-05, "loss": 0.2286, "step": 10015 }, { "epoch": 1.7141879171658396, "grad_norm": 22.141571044921875, "learning_rate": 2.811339512350437e-05, "loss": 1.8282, "step": 10016 }, { "epoch": 1.7143590621256204, "grad_norm": 10.924015045166016, "learning_rate": 2.8107797239785545e-05, "loss": 0.7232, "step": 10017 }, { "epoch": 1.7145302070854014, "grad_norm": 2.1513490676879883, "learning_rate": 2.8102191622446825e-05, "loss": 0.2035, "step": 10018 }, { "epoch": 1.7147013520451821, "grad_norm": null, "learning_rate": 2.8102191622446825e-05, "loss": 8.5684, "step": 10019 }, { "epoch": 1.7148724970049631, "grad_norm": 21.14261817932129, "learning_rate": 2.8096578274795538e-05, "loss": 4.9187, "step": 10020 }, { "epoch": 
1.7150436419647441, "grad_norm": 6.2298479080200195, "learning_rate": 2.809095720014356e-05, "loss": 0.6842, "step": 10021 }, { "epoch": 1.7152147869245251, "grad_norm": 12.99316120147705, "learning_rate": 2.8085328401807334e-05, "loss": 1.0333, "step": 10022 }, { "epoch": 1.715385931884306, "grad_norm": 0.6113194227218628, "learning_rate": 2.807969188310786e-05, "loss": 0.1294, "step": 10023 }, { "epoch": 1.715557076844087, "grad_norm": 55.20231628417969, "learning_rate": 2.8074047647370688e-05, "loss": 6.0993, "step": 10024 }, { "epoch": 1.7157282218038679, "grad_norm": 36.23455047607422, "learning_rate": 2.806839569792594e-05, "loss": 1.2726, "step": 10025 }, { "epoch": 1.7158993667636488, "grad_norm": 11.502626419067383, "learning_rate": 2.8062736038108263e-05, "loss": 0.9015, "step": 10026 }, { "epoch": 1.7160705117234296, "grad_norm": 14.857687950134277, "learning_rate": 2.8057068671256865e-05, "loss": 1.4534, "step": 10027 }, { "epoch": 1.7162416566832106, "grad_norm": 6.289870262145996, "learning_rate": 2.8051393600715507e-05, "loss": 0.5391, "step": 10028 }, { "epoch": 1.7164128016429916, "grad_norm": 5.0386738777160645, "learning_rate": 2.8045710829832482e-05, "loss": 0.4792, "step": 10029 }, { "epoch": 1.7165839466027726, "grad_norm": 16.920785903930664, "learning_rate": 2.804002036196064e-05, "loss": 0.9096, "step": 10030 }, { "epoch": 1.7167550915625536, "grad_norm": 16.40086555480957, "learning_rate": 2.8034322200457354e-05, "loss": 1.4544, "step": 10031 }, { "epoch": 1.7169262365223346, "grad_norm": 7.115223407745361, "learning_rate": 2.802861634868456e-05, "loss": 0.461, "step": 10032 }, { "epoch": 1.7170973814821153, "grad_norm": 16.795604705810547, "learning_rate": 2.8022902810008718e-05, "loss": 0.8633, "step": 10033 }, { "epoch": 1.7172685264418963, "grad_norm": 1.5699342489242554, "learning_rate": 2.801718158780082e-05, "loss": 0.2186, "step": 10034 }, { "epoch": 1.717439671401677, "grad_norm": 78.89163970947266, "learning_rate": 
2.801145268543639e-05, "loss": 7.0751, "step": 10035 }, { "epoch": 1.717610816361458, "grad_norm": 10.610182762145996, "learning_rate": 2.80057161062955e-05, "loss": 0.8238, "step": 10036 }, { "epoch": 1.717781961321239, "grad_norm": 5.401898384094238, "learning_rate": 2.7999971853762733e-05, "loss": 0.4171, "step": 10037 }, { "epoch": 1.71795310628102, "grad_norm": 13.071382522583008, "learning_rate": 2.7994219931227215e-05, "loss": 1.0448, "step": 10038 }, { "epoch": 1.718124251240801, "grad_norm": 17.777509689331055, "learning_rate": 2.7988460342082582e-05, "loss": 1.809, "step": 10039 }, { "epoch": 1.718295396200582, "grad_norm": 0.8122604489326477, "learning_rate": 2.7982693089727e-05, "loss": 0.1302, "step": 10040 }, { "epoch": 1.7184665411603628, "grad_norm": 9.412586212158203, "learning_rate": 2.7976918177563157e-05, "loss": 0.8541, "step": 10041 }, { "epoch": 1.7186376861201438, "grad_norm": 16.11511993408203, "learning_rate": 2.797113560899826e-05, "loss": 1.8378, "step": 10042 }, { "epoch": 1.7188088310799245, "grad_norm": 10.382328987121582, "learning_rate": 2.7965345387444035e-05, "loss": 0.9144, "step": 10043 }, { "epoch": 1.7189799760397055, "grad_norm": 6.094365119934082, "learning_rate": 2.7959547516316723e-05, "loss": 0.7101, "step": 10044 }, { "epoch": 1.7191511209994865, "grad_norm": 3.8232181072235107, "learning_rate": 2.7953741999037074e-05, "loss": 0.2367, "step": 10045 }, { "epoch": 1.7193222659592675, "grad_norm": 7.6633992195129395, "learning_rate": 2.794792883903034e-05, "loss": 0.6441, "step": 10046 }, { "epoch": 1.7194934109190485, "grad_norm": 16.32481575012207, "learning_rate": 2.794210803972632e-05, "loss": 1.0829, "step": 10047 }, { "epoch": 1.7196645558788295, "grad_norm": 5.887193202972412, "learning_rate": 2.7936279604559268e-05, "loss": 0.3926, "step": 10048 }, { "epoch": 1.7198357008386103, "grad_norm": 0.6948737502098083, "learning_rate": 2.7930443536967992e-05, "loss": 0.1275, "step": 10049 }, { "epoch": 1.7200068457983912, 
"grad_norm": 20.418519973754883, "learning_rate": 2.792459984039576e-05, "loss": 1.6352, "step": 10050 }, { "epoch": 1.720177990758172, "grad_norm": 11.67056655883789, "learning_rate": 2.7918748518290382e-05, "loss": 0.6437, "step": 10051 }, { "epoch": 1.720349135717953, "grad_norm": 6.831418037414551, "learning_rate": 2.7912889574104127e-05, "loss": 0.3907, "step": 10052 }, { "epoch": 1.720520280677734, "grad_norm": 16.56626319885254, "learning_rate": 2.7907023011293797e-05, "loss": 1.7217, "step": 10053 }, { "epoch": 1.720691425637515, "grad_norm": 23.400278091430664, "learning_rate": 2.790114883332066e-05, "loss": 5.097, "step": 10054 }, { "epoch": 1.720862570597296, "grad_norm": 16.403501510620117, "learning_rate": 2.78952670436505e-05, "loss": 1.1082, "step": 10055 }, { "epoch": 1.721033715557077, "grad_norm": 8.538691520690918, "learning_rate": 2.7889377645753573e-05, "loss": 0.8097, "step": 10056 }, { "epoch": 1.7212048605168577, "grad_norm": 15.87956428527832, "learning_rate": 2.7883480643104636e-05, "loss": 1.7339, "step": 10057 }, { "epoch": 1.7213760054766387, "grad_norm": 10.19704818725586, "learning_rate": 2.7877576039182934e-05, "loss": 0.6605, "step": 10058 }, { "epoch": 1.7215471504364195, "grad_norm": 18.006404876708984, "learning_rate": 2.7871663837472186e-05, "loss": 1.2495, "step": 10059 }, { "epoch": 1.7217182953962005, "grad_norm": 7.817597389221191, "learning_rate": 2.786574404146061e-05, "loss": 0.3632, "step": 10060 }, { "epoch": 1.7218894403559815, "grad_norm": 14.839675903320312, "learning_rate": 2.785981665464088e-05, "loss": 1.3071, "step": 10061 }, { "epoch": 1.7220605853157624, "grad_norm": 10.854766845703125, "learning_rate": 2.785388168051017e-05, "loss": 0.6804, "step": 10062 }, { "epoch": 1.7222317302755434, "grad_norm": 6.245128631591797, "learning_rate": 2.784793912257012e-05, "loss": 0.7187, "step": 10063 }, { "epoch": 1.7224028752353244, "grad_norm": 1.6270911693572998, "learning_rate": 2.7841988984326857e-05, "loss": 0.2219, 
"step": 10064 }, { "epoch": 1.7225740201951054, "grad_norm": 0.4840923249721527, "learning_rate": 2.783603126929096e-05, "loss": 0.1214, "step": 10065 }, { "epoch": 1.7227451651548862, "grad_norm": 7.147538185119629, "learning_rate": 2.78300659809775e-05, "loss": 0.7741, "step": 10066 }, { "epoch": 1.7229163101146672, "grad_norm": 13.561247825622559, "learning_rate": 2.7824093122905993e-05, "loss": 0.8269, "step": 10067 }, { "epoch": 1.723087455074448, "grad_norm": 4.580947399139404, "learning_rate": 2.7818112698600452e-05, "loss": 0.2786, "step": 10068 }, { "epoch": 1.723258600034229, "grad_norm": 18.476253509521484, "learning_rate": 2.7812124711589323e-05, "loss": 2.0024, "step": 10069 }, { "epoch": 1.72342974499401, "grad_norm": 10.515847206115723, "learning_rate": 2.7806129165405532e-05, "loss": 0.7248, "step": 10070 }, { "epoch": 1.723600889953791, "grad_norm": 4.211365699768066, "learning_rate": 2.780012606358646e-05, "loss": 0.2897, "step": 10071 }, { "epoch": 1.723772034913572, "grad_norm": 6.201477527618408, "learning_rate": 2.7794115409673942e-05, "loss": 0.3782, "step": 10072 }, { "epoch": 1.7239431798733529, "grad_norm": 2.289546012878418, "learning_rate": 2.778809720721428e-05, "loss": 0.2139, "step": 10073 }, { "epoch": 1.7241143248331336, "grad_norm": 1.5772837400436401, "learning_rate": 2.7782071459758215e-05, "loss": 0.2272, "step": 10074 }, { "epoch": 1.7242854697929146, "grad_norm": 6.730456352233887, "learning_rate": 2.7776038170860952e-05, "loss": 0.6098, "step": 10075 }, { "epoch": 1.7244566147526954, "grad_norm": 10.259713172912598, "learning_rate": 2.7769997344082142e-05, "loss": 0.6773, "step": 10076 }, { "epoch": 1.7246277597124764, "grad_norm": 5.2181010246276855, "learning_rate": 2.7763948982985874e-05, "loss": 0.279, "step": 10077 }, { "epoch": 1.7247989046722574, "grad_norm": 31.597909927368164, "learning_rate": 2.775789309114069e-05, "loss": 5.2896, "step": 10078 }, { "epoch": 1.7249700496320384, "grad_norm": 1.403862476348877, 
"learning_rate": 2.7751829672119587e-05, "loss": 0.2359, "step": 10079 }, { "epoch": 1.7251411945918194, "grad_norm": 4.681527137756348, "learning_rate": 2.774575872949998e-05, "loss": 0.351, "step": 10080 }, { "epoch": 1.7253123395516003, "grad_norm": 14.847713470458984, "learning_rate": 2.7739680266863744e-05, "loss": 1.1209, "step": 10081 }, { "epoch": 1.7254834845113811, "grad_norm": 9.112345695495605, "learning_rate": 2.773359428779717e-05, "loss": 0.7326, "step": 10082 }, { "epoch": 1.725654629471162, "grad_norm": 1.0872780084609985, "learning_rate": 2.7727500795891004e-05, "loss": 0.2115, "step": 10083 }, { "epoch": 1.7258257744309429, "grad_norm": 9.246759414672852, "learning_rate": 2.7721399794740412e-05, "loss": 0.5965, "step": 10084 }, { "epoch": 1.7259969193907239, "grad_norm": 0.4386415183544159, "learning_rate": 2.771529128794499e-05, "loss": 0.1202, "step": 10085 }, { "epoch": 1.7261680643505048, "grad_norm": 24.830413818359375, "learning_rate": 2.770917527910877e-05, "loss": 5.0208, "step": 10086 }, { "epoch": 1.7263392093102858, "grad_norm": 9.608736038208008, "learning_rate": 2.7703051771840205e-05, "loss": 0.8917, "step": 10087 }, { "epoch": 1.7265103542700668, "grad_norm": 13.751317024230957, "learning_rate": 2.7696920769752176e-05, "loss": 0.7339, "step": 10088 }, { "epoch": 1.7266814992298478, "grad_norm": 15.781951904296875, "learning_rate": 2.7690782276461976e-05, "loss": 0.5453, "step": 10089 }, { "epoch": 1.7268526441896286, "grad_norm": 0.6779071688652039, "learning_rate": 2.768463629559134e-05, "loss": 0.1246, "step": 10090 }, { "epoch": 1.7270237891494096, "grad_norm": 0.5319092869758606, "learning_rate": 2.7678482830766384e-05, "loss": 0.1285, "step": 10091 }, { "epoch": 1.7271949341091903, "grad_norm": 21.715641021728516, "learning_rate": 2.7672321885617683e-05, "loss": 2.2162, "step": 10092 }, { "epoch": 1.7273660790689713, "grad_norm": 11.764262199401855, "learning_rate": 2.766615346378019e-05, "loss": 0.934, "step": 10093 }, { 
"epoch": 1.7275372240287523, "grad_norm": 19.15314292907715, "learning_rate": 2.7659977568893294e-05, "loss": 1.7748, "step": 10094 }, { "epoch": 1.7277083689885333, "grad_norm": 20.921070098876953, "learning_rate": 2.7653794204600764e-05, "loss": 2.7942, "step": 10095 }, { "epoch": 1.7278795139483143, "grad_norm": 15.933064460754395, "learning_rate": 2.764760337455082e-05, "loss": 1.2859, "step": 10096 }, { "epoch": 1.7280506589080953, "grad_norm": 19.159208297729492, "learning_rate": 2.7641405082396038e-05, "loss": 1.8935, "step": 10097 }, { "epoch": 1.728221803867876, "grad_norm": 27.65618324279785, "learning_rate": 2.7635199331793437e-05, "loss": 4.9238, "step": 10098 }, { "epoch": 1.728392948827657, "grad_norm": 2.3060781955718994, "learning_rate": 2.7628986126404398e-05, "loss": 0.2416, "step": 10099 }, { "epoch": 1.7285640937874378, "grad_norm": 2.2011115550994873, "learning_rate": 2.762276546989474e-05, "loss": 0.225, "step": 10100 }, { "epoch": 1.7287352387472188, "grad_norm": 15.353120803833008, "learning_rate": 2.7616537365934652e-05, "loss": 1.2365, "step": 10101 }, { "epoch": 1.7289063837069998, "grad_norm": 12.401153564453125, "learning_rate": 2.761030181819873e-05, "loss": 0.9663, "step": 10102 }, { "epoch": 1.7290775286667808, "grad_norm": 0.9945599436759949, "learning_rate": 2.7604058830365952e-05, "loss": 0.1394, "step": 10103 }, { "epoch": 1.7292486736265618, "grad_norm": 15.091883659362793, "learning_rate": 2.759780840611969e-05, "loss": 1.087, "step": 10104 }, { "epoch": 1.7294198185863427, "grad_norm": 3.704066038131714, "learning_rate": 2.7591550549147704e-05, "loss": 0.2441, "step": 10105 }, { "epoch": 1.7295909635461235, "grad_norm": 11.864506721496582, "learning_rate": 2.7585285263142143e-05, "loss": 0.8822, "step": 10106 }, { "epoch": 1.7297621085059045, "grad_norm": 20.731000900268555, "learning_rate": 2.7579012551799526e-05, "loss": 2.6169, "step": 10107 }, { "epoch": 1.7299332534656853, "grad_norm": 21.770143508911133, "learning_rate": 
2.7572732418820776e-05, "loss": 1.6023, "step": 10108 }, { "epoch": 1.7301043984254663, "grad_norm": 13.117307662963867, "learning_rate": 2.7566444867911165e-05, "loss": 1.0931, "step": 10109 }, { "epoch": 1.7302755433852472, "grad_norm": 11.40585708618164, "learning_rate": 2.7560149902780358e-05, "loss": 1.0986, "step": 10110 }, { "epoch": 1.7304466883450282, "grad_norm": 16.017263412475586, "learning_rate": 2.7553847527142412e-05, "loss": 1.1343, "step": 10111 }, { "epoch": 1.7306178333048092, "grad_norm": 15.53979778289795, "learning_rate": 2.7547537744715722e-05, "loss": 1.7829, "step": 10112 }, { "epoch": 1.7307889782645902, "grad_norm": 0.5331031084060669, "learning_rate": 2.754122055922307e-05, "loss": 0.1188, "step": 10113 }, { "epoch": 1.7309601232243712, "grad_norm": 21.962480545043945, "learning_rate": 2.7534895974391614e-05, "loss": 4.8448, "step": 10114 }, { "epoch": 1.731131268184152, "grad_norm": 8.583685874938965, "learning_rate": 2.7528563993952863e-05, "loss": 0.7292, "step": 10115 }, { "epoch": 1.731302413143933, "grad_norm": 7.773699760437012, "learning_rate": 2.7522224621642692e-05, "loss": 0.4965, "step": 10116 }, { "epoch": 1.7314735581037137, "grad_norm": 0.4721170663833618, "learning_rate": 2.7515877861201348e-05, "loss": 0.1199, "step": 10117 }, { "epoch": 1.7316447030634947, "grad_norm": 13.667069435119629, "learning_rate": 2.750952371637342e-05, "loss": 0.7077, "step": 10118 }, { "epoch": 1.7318158480232757, "grad_norm": 10.211444854736328, "learning_rate": 2.7503162190907868e-05, "loss": 0.6854, "step": 10119 }, { "epoch": 1.7319869929830567, "grad_norm": 17.950668334960938, "learning_rate": 2.7496793288558e-05, "loss": 0.9565, "step": 10120 }, { "epoch": 1.7321581379428377, "grad_norm": 3.791140079498291, "learning_rate": 2.7490417013081472e-05, "loss": 0.4526, "step": 10121 }, { "epoch": 1.7323292829026187, "grad_norm": 12.861430168151855, "learning_rate": 2.748403336824031e-05, "loss": 1.1083, "step": 10122 }, { "epoch": 
1.7325004278623994, "grad_norm": 17.513408660888672, "learning_rate": 2.7477642357800863e-05, "loss": 1.8508, "step": 10123 }, { "epoch": 1.7326715728221804, "grad_norm": 11.23606014251709, "learning_rate": 2.7471243985533842e-05, "loss": 1.0919, "step": 10124 }, { "epoch": 1.7328427177819612, "grad_norm": 7.304426193237305, "learning_rate": 2.7464838255214296e-05, "loss": 0.5871, "step": 10125 }, { "epoch": 1.7330138627417422, "grad_norm": 7.261144638061523, "learning_rate": 2.745842517062161e-05, "loss": 0.6289, "step": 10126 }, { "epoch": 1.7331850077015232, "grad_norm": 14.603657722473145, "learning_rate": 2.745200473553952e-05, "loss": 1.1827, "step": 10127 }, { "epoch": 1.7333561526613042, "grad_norm": 9.474886894226074, "learning_rate": 2.7445576953756088e-05, "loss": 0.7185, "step": 10128 }, { "epoch": 1.7335272976210852, "grad_norm": 4.022946357727051, "learning_rate": 2.7439141829063718e-05, "loss": 0.2628, "step": 10129 }, { "epoch": 1.7336984425808661, "grad_norm": 2.4965269565582275, "learning_rate": 2.7432699365259136e-05, "loss": 0.2175, "step": 10130 }, { "epoch": 1.733869587540647, "grad_norm": 26.057817459106445, "learning_rate": 2.742624956614341e-05, "loss": 5.1216, "step": 10131 }, { "epoch": 1.734040732500428, "grad_norm": 18.28431510925293, "learning_rate": 2.7419792435521932e-05, "loss": 1.4029, "step": 10132 }, { "epoch": 1.7342118774602087, "grad_norm": 0.40122735500335693, "learning_rate": 2.7413327977204426e-05, "loss": 0.1123, "step": 10133 }, { "epoch": 1.7343830224199897, "grad_norm": 10.551216125488281, "learning_rate": 2.7406856195004914e-05, "loss": 0.9831, "step": 10134 }, { "epoch": 1.7345541673797706, "grad_norm": 22.480213165283203, "learning_rate": 2.740037709274178e-05, "loss": 2.8996, "step": 10135 }, { "epoch": 1.7347253123395516, "grad_norm": 6.143136501312256, "learning_rate": 2.739389067423768e-05, "loss": 0.5258, "step": 10136 }, { "epoch": 1.7348964572993326, "grad_norm": 1.379388451576233, "learning_rate": 
2.738739694331963e-05, "loss": 0.2072, "step": 10137 }, { "epoch": 1.7350676022591136, "grad_norm": 12.318464279174805, "learning_rate": 2.7380895903818927e-05, "loss": 1.2021, "step": 10138 }, { "epoch": 1.7352387472188944, "grad_norm": 11.983359336853027, "learning_rate": 2.7374387559571206e-05, "loss": 0.9473, "step": 10139 }, { "epoch": 1.7354098921786754, "grad_norm": 19.089994430541992, "learning_rate": 2.7367871914416383e-05, "loss": 2.2155, "step": 10140 }, { "epoch": 1.7355810371384561, "grad_norm": 0.4292986989021301, "learning_rate": 2.7361348972198724e-05, "loss": 0.1153, "step": 10141 }, { "epoch": 1.7357521820982371, "grad_norm": 12.31729507446289, "learning_rate": 2.7354818736766747e-05, "loss": 0.8885, "step": 10142 }, { "epoch": 1.735923327058018, "grad_norm": 9.585973739624023, "learning_rate": 2.7348281211973317e-05, "loss": 0.822, "step": 10143 }, { "epoch": 1.736094472017799, "grad_norm": 12.39317798614502, "learning_rate": 2.7341736401675578e-05, "loss": 0.8036, "step": 10144 }, { "epoch": 1.73626561697758, "grad_norm": 58.83088684082031, "learning_rate": 2.7335184309734987e-05, "loss": 6.3029, "step": 10145 }, { "epoch": 1.736436761937361, "grad_norm": 11.902667045593262, "learning_rate": 2.7328624940017273e-05, "loss": 0.9195, "step": 10146 }, { "epoch": 1.7366079068971418, "grad_norm": 17.135595321655273, "learning_rate": 2.7322058296392484e-05, "loss": 1.4782, "step": 10147 }, { "epoch": 1.7367790518569228, "grad_norm": 7.817293643951416, "learning_rate": 2.731548438273495e-05, "loss": 0.6313, "step": 10148 }, { "epoch": 1.7369501968167036, "grad_norm": 19.34001350402832, "learning_rate": 2.7308903202923284e-05, "loss": 1.3526, "step": 10149 }, { "epoch": 1.7371213417764846, "grad_norm": 4.881283760070801, "learning_rate": 2.7302314760840392e-05, "loss": 0.278, "step": 10150 }, { "epoch": 1.7372924867362656, "grad_norm": 2.7330501079559326, "learning_rate": 2.7295719060373468e-05, "loss": 0.2528, "step": 10151 }, { "epoch": 
1.7374636316960466, "grad_norm": 0.4408290386199951, "learning_rate": 2.728911610541399e-05, "loss": 0.1246, "step": 10152 }, { "epoch": 1.7376347766558276, "grad_norm": 13.156779289245605, "learning_rate": 2.728250589985769e-05, "loss": 0.9977, "step": 10153 }, { "epoch": 1.7378059216156085, "grad_norm": 18.428970336914062, "learning_rate": 2.7275888447604632e-05, "loss": 2.2405, "step": 10154 }, { "epoch": 1.7379770665753893, "grad_norm": 19.1968936920166, "learning_rate": 2.7269263752559102e-05, "loss": 2.2284, "step": 10155 }, { "epoch": 1.7381482115351703, "grad_norm": 13.401687622070312, "learning_rate": 2.7262631818629676e-05, "loss": 1.1257, "step": 10156 }, { "epoch": 1.738319356494951, "grad_norm": 0.7043326497077942, "learning_rate": 2.7255992649729222e-05, "loss": 0.1285, "step": 10157 }, { "epoch": 1.738490501454732, "grad_norm": 11.497830390930176, "learning_rate": 2.7249346249774843e-05, "loss": 0.7624, "step": 10158 }, { "epoch": 1.738661646414513, "grad_norm": 10.493492126464844, "learning_rate": 2.7242692622687934e-05, "loss": 0.7424, "step": 10159 }, { "epoch": 1.738832791374294, "grad_norm": 15.352989196777344, "learning_rate": 2.723603177239415e-05, "loss": 1.1748, "step": 10160 }, { "epoch": 1.739003936334075, "grad_norm": 17.102439880371094, "learning_rate": 2.7229363702823385e-05, "loss": 1.6002, "step": 10161 }, { "epoch": 1.739175081293856, "grad_norm": 31.396949768066406, "learning_rate": 2.722268841790982e-05, "loss": 5.445, "step": 10162 }, { "epoch": 1.739346226253637, "grad_norm": 6.2537407875061035, "learning_rate": 2.7216005921591886e-05, "loss": 0.6991, "step": 10163 }, { "epoch": 1.7395173712134178, "grad_norm": 5.512460708618164, "learning_rate": 2.720931621781226e-05, "loss": 0.5572, "step": 10164 }, { "epoch": 1.7396885161731988, "grad_norm": 23.22819709777832, "learning_rate": 2.7202619310517885e-05, "loss": 2.815, "step": 10165 }, { "epoch": 1.7398596611329795, "grad_norm": 5.185771465301514, "learning_rate": 
2.719591520365994e-05, "loss": 0.5125, "step": 10166 }, { "epoch": 1.7400308060927605, "grad_norm": 5.277246952056885, "learning_rate": 2.718920390119386e-05, "loss": 0.5579, "step": 10167 }, { "epoch": 1.7402019510525415, "grad_norm": 0.3999665379524231, "learning_rate": 2.7182485407079323e-05, "loss": 0.1183, "step": 10168 }, { "epoch": 1.7403730960123225, "grad_norm": 20.051895141601562, "learning_rate": 2.717575972528025e-05, "loss": 2.2798, "step": 10169 }, { "epoch": 1.7405442409721035, "grad_norm": 9.100671768188477, "learning_rate": 2.7169026859764806e-05, "loss": 0.7509, "step": 10170 }, { "epoch": 1.7407153859318845, "grad_norm": 6.618154048919678, "learning_rate": 2.7162286814505385e-05, "loss": 0.7364, "step": 10171 }, { "epoch": 1.7408865308916652, "grad_norm": 9.0869140625, "learning_rate": 2.7155539593478633e-05, "loss": 0.6368, "step": 10172 }, { "epoch": 1.7410576758514462, "grad_norm": 2.8390488624572754, "learning_rate": 2.714878520066541e-05, "loss": 0.303, "step": 10173 }, { "epoch": 1.741228820811227, "grad_norm": 12.351017951965332, "learning_rate": 2.714202364005083e-05, "loss": 0.7143, "step": 10174 }, { "epoch": 1.741399965771008, "grad_norm": 3.5667779445648193, "learning_rate": 2.7135254915624207e-05, "loss": 0.2367, "step": 10175 }, { "epoch": 1.741571110730789, "grad_norm": 3.0237159729003906, "learning_rate": 2.712847903137912e-05, "loss": 0.2151, "step": 10176 }, { "epoch": 1.74174225569057, "grad_norm": 27.097774505615234, "learning_rate": 2.7121695991313332e-05, "loss": 5.3406, "step": 10177 }, { "epoch": 1.741913400650351, "grad_norm": 26.50831413269043, "learning_rate": 2.7114905799428866e-05, "loss": 5.2133, "step": 10178 }, { "epoch": 1.742084545610132, "grad_norm": 3.4558353424072266, "learning_rate": 2.7108108459731917e-05, "loss": 0.3151, "step": 10179 }, { "epoch": 1.7422556905699127, "grad_norm": 15.095887184143066, "learning_rate": 2.710130397623296e-05, "loss": 1.0146, "step": 10180 }, { "epoch": 1.7424268355296937, 
"grad_norm": 6.311438083648682, "learning_rate": 2.7094492352946612e-05, "loss": 0.3742, "step": 10181 }, { "epoch": 1.7425979804894745, "grad_norm": 2.67081356048584, "learning_rate": 2.7087673593891782e-05, "loss": 0.3163, "step": 10182 }, { "epoch": 1.7427691254492554, "grad_norm": 7.84975528717041, "learning_rate": 2.708084770309151e-05, "loss": 0.6266, "step": 10183 }, { "epoch": 1.7429402704090364, "grad_norm": 20.54033851623535, "learning_rate": 2.7074014684573116e-05, "loss": 1.7484, "step": 10184 }, { "epoch": 1.7431114153688174, "grad_norm": 23.39609146118164, "learning_rate": 2.7067174542368064e-05, "loss": 4.8539, "step": 10185 }, { "epoch": 1.7432825603285984, "grad_norm": 10.89504623413086, "learning_rate": 2.7060327280512057e-05, "loss": 0.8126, "step": 10186 }, { "epoch": 1.7434537052883794, "grad_norm": 17.79994010925293, "learning_rate": 2.7053472903044994e-05, "loss": 1.5674, "step": 10187 }, { "epoch": 1.7436248502481602, "grad_norm": 3.35904598236084, "learning_rate": 2.7046611414010968e-05, "loss": 0.25, "step": 10188 }, { "epoch": 1.7437959952079412, "grad_norm": 13.228403091430664, "learning_rate": 2.7039742817458263e-05, "loss": 1.4147, "step": 10189 }, { "epoch": 1.743967140167722, "grad_norm": 5.680575847625732, "learning_rate": 2.703286711743936e-05, "loss": 0.4747, "step": 10190 }, { "epoch": 1.744138285127503, "grad_norm": 154.94168090820312, "learning_rate": 2.7025984318010942e-05, "loss": 8.1203, "step": 10191 }, { "epoch": 1.744309430087284, "grad_norm": 7.715725898742676, "learning_rate": 2.7019094423233856e-05, "loss": 0.6938, "step": 10192 }, { "epoch": 1.7444805750470649, "grad_norm": 11.057262420654297, "learning_rate": 2.7012197437173163e-05, "loss": 0.8815, "step": 10193 }, { "epoch": 1.7446517200068459, "grad_norm": 14.388287544250488, "learning_rate": 2.700529336389809e-05, "loss": 1.2882, "step": 10194 }, { "epoch": 1.7448228649666269, "grad_norm": 9.23729419708252, "learning_rate": 2.6998382207482048e-05, "loss": 0.648, 
"step": 10195 }, { "epoch": 1.7449940099264076, "grad_norm": 29.983173370361328, "learning_rate": 2.699146397200263e-05, "loss": 5.5166, "step": 10196 }, { "epoch": 1.7451651548861886, "grad_norm": 2.492482900619507, "learning_rate": 2.6984538661541615e-05, "loss": 0.2252, "step": 10197 }, { "epoch": 1.7453362998459694, "grad_norm": 21.238197326660156, "learning_rate": 2.6977606280184937e-05, "loss": 2.6589, "step": 10198 }, { "epoch": 1.7455074448057504, "grad_norm": 24.376617431640625, "learning_rate": 2.6970666832022713e-05, "loss": 5.0534, "step": 10199 }, { "epoch": 1.7456785897655314, "grad_norm": 10.86424446105957, "learning_rate": 2.696372032114923e-05, "loss": 0.9157, "step": 10200 }, { "epoch": 1.7458497347253124, "grad_norm": 8.663307189941406, "learning_rate": 2.6956766751662936e-05, "loss": 0.67, "step": 10201 }, { "epoch": 1.7460208796850933, "grad_norm": 7.895942211151123, "learning_rate": 2.694980612766645e-05, "loss": 0.7894, "step": 10202 }, { "epoch": 1.7461920246448743, "grad_norm": 20.085344314575195, "learning_rate": 2.6942838453266547e-05, "loss": 1.8032, "step": 10203 }, { "epoch": 1.746363169604655, "grad_norm": 15.651019096374512, "learning_rate": 2.6935863732574174e-05, "loss": 1.5438, "step": 10204 }, { "epoch": 1.746534314564436, "grad_norm": 10.772604942321777, "learning_rate": 2.6928881969704416e-05, "loss": 0.7978, "step": 10205 }, { "epoch": 1.7467054595242169, "grad_norm": 17.72235870361328, "learning_rate": 2.6921893168776534e-05, "loss": 1.4163, "step": 10206 }, { "epoch": 1.7468766044839978, "grad_norm": 9.42874526977539, "learning_rate": 2.6914897333913914e-05, "loss": 0.9302, "step": 10207 }, { "epoch": 1.7470477494437788, "grad_norm": 5.742290496826172, "learning_rate": 2.6907894469244127e-05, "loss": 0.62, "step": 10208 }, { "epoch": 1.7472188944035598, "grad_norm": 2.14652156829834, "learning_rate": 2.690088457889887e-05, "loss": 0.2582, "step": 10209 }, { "epoch": 1.7473900393633408, "grad_norm": 3.1807568073272705, 
"learning_rate": 2.689386766701398e-05, "loss": 0.3013, "step": 10210 }, { "epoch": 1.7475611843231218, "grad_norm": 28.904104232788086, "learning_rate": 2.6886843737729453e-05, "loss": 1.6705, "step": 10211 }, { "epoch": 1.7477323292829026, "grad_norm": 1.2151024341583252, "learning_rate": 2.6879812795189417e-05, "loss": 0.2068, "step": 10212 }, { "epoch": 1.7479034742426836, "grad_norm": 18.48325538635254, "learning_rate": 2.687277484354214e-05, "loss": 1.6742, "step": 10213 }, { "epoch": 1.7480746192024645, "grad_norm": 8.478416442871094, "learning_rate": 2.686572988694002e-05, "loss": 0.6492, "step": 10214 }, { "epoch": 1.7482457641622453, "grad_norm": 32.773887634277344, "learning_rate": 2.685867792953959e-05, "loss": 5.554, "step": 10215 }, { "epoch": 1.7484169091220263, "grad_norm": 13.97948169708252, "learning_rate": 2.685161897550152e-05, "loss": 0.9882, "step": 10216 }, { "epoch": 1.7485880540818073, "grad_norm": 8.478384971618652, "learning_rate": 2.6844553028990602e-05, "loss": 0.5849, "step": 10217 }, { "epoch": 1.7487591990415883, "grad_norm": 13.30849838256836, "learning_rate": 2.683748009417575e-05, "loss": 1.154, "step": 10218 }, { "epoch": 1.7489303440013693, "grad_norm": 18.877824783325195, "learning_rate": 2.6830400175230022e-05, "loss": 2.1017, "step": 10219 }, { "epoch": 1.7491014889611503, "grad_norm": 16.665237426757812, "learning_rate": 2.6823313276330557e-05, "loss": 1.3498, "step": 10220 }, { "epoch": 1.749272633920931, "grad_norm": 21.83778953552246, "learning_rate": 2.6816219401658664e-05, "loss": 2.9272, "step": 10221 }, { "epoch": 1.749443778880712, "grad_norm": 11.275954246520996, "learning_rate": 2.680911855539971e-05, "loss": 0.8909, "step": 10222 }, { "epoch": 1.7496149238404928, "grad_norm": 19.484846115112305, "learning_rate": 2.6802010741743227e-05, "loss": 2.0629, "step": 10223 }, { "epoch": 1.7497860688002738, "grad_norm": 14.734002113342285, "learning_rate": 2.679489596488282e-05, "loss": 0.9356, "step": 10224 }, { "epoch": 
1.7499572137600548, "grad_norm": 1.7300753593444824, "learning_rate": 2.678777422901624e-05, "loss": 0.2182, "step": 10225 }, { "epoch": 1.7501283587198357, "grad_norm": 8.711627006530762, "learning_rate": 2.678064553834529e-05, "loss": 0.6602, "step": 10226 }, { "epoch": 1.7502995036796167, "grad_norm": 14.179986953735352, "learning_rate": 2.677350989707594e-05, "loss": 1.0055, "step": 10227 }, { "epoch": 1.7504706486393977, "grad_norm": 2.0565381050109863, "learning_rate": 2.6766367309418206e-05, "loss": 0.2245, "step": 10228 }, { "epoch": 1.7506417935991785, "grad_norm": 10.38598918914795, "learning_rate": 2.6759217779586237e-05, "loss": 1.0019, "step": 10229 }, { "epoch": 1.7508129385589595, "grad_norm": 14.204425811767578, "learning_rate": 2.6752061311798263e-05, "loss": 1.1776, "step": 10230 }, { "epoch": 1.7509840835187402, "grad_norm": 1.0385279655456543, "learning_rate": 2.6744897910276612e-05, "loss": 0.1365, "step": 10231 }, { "epoch": 1.7511552284785212, "grad_norm": 14.590753555297852, "learning_rate": 2.6737727579247696e-05, "loss": 1.2097, "step": 10232 }, { "epoch": 1.7513263734383022, "grad_norm": 8.677552223205566, "learning_rate": 2.6730550322942032e-05, "loss": 0.7583, "step": 10233 }, { "epoch": 1.7514975183980832, "grad_norm": 12.209345817565918, "learning_rate": 2.67233661455942e-05, "loss": 1.2238, "step": 10234 }, { "epoch": 1.7516686633578642, "grad_norm": 15.755433082580566, "learning_rate": 2.671617505144288e-05, "loss": 1.6844, "step": 10235 }, { "epoch": 1.7518398083176452, "grad_norm": 7.497293949127197, "learning_rate": 2.6708977044730832e-05, "loss": 0.5228, "step": 10236 }, { "epoch": 1.752010953277426, "grad_norm": 19.907268524169922, "learning_rate": 2.6701772129704887e-05, "loss": 1.5764, "step": 10237 }, { "epoch": 1.752182098237207, "grad_norm": 9.310452461242676, "learning_rate": 2.6694560310615958e-05, "loss": 0.6604, "step": 10238 }, { "epoch": 1.7523532431969877, "grad_norm": 6.269540309906006, "learning_rate": 
2.6687341591719016e-05, "loss": 0.7911, "step": 10239 }, { "epoch": 1.7525243881567687, "grad_norm": 9.012858390808105, "learning_rate": 2.668011597727314e-05, "loss": 0.5564, "step": 10240 }, { "epoch": 1.7526955331165497, "grad_norm": 13.380846977233887, "learning_rate": 2.6672883471541436e-05, "loss": 1.1636, "step": 10241 }, { "epoch": 1.7528666780763307, "grad_norm": 0.46485528349876404, "learning_rate": 2.6665644078791098e-05, "loss": 0.1124, "step": 10242 }, { "epoch": 1.7530378230361117, "grad_norm": 17.56967544555664, "learning_rate": 2.6658397803293376e-05, "loss": 1.7598, "step": 10243 }, { "epoch": 1.7532089679958927, "grad_norm": 9.06078815460205, "learning_rate": 2.665114464932359e-05, "loss": 0.8302, "step": 10244 }, { "epoch": 1.7533801129556734, "grad_norm": 0.43702182173728943, "learning_rate": 2.6643884621161102e-05, "loss": 0.1154, "step": 10245 }, { "epoch": 1.7535512579154544, "grad_norm": 10.447807312011719, "learning_rate": 2.663661772308935e-05, "loss": 0.7763, "step": 10246 }, { "epoch": 1.7537224028752352, "grad_norm": 16.421716690063477, "learning_rate": 2.6629343959395805e-05, "loss": 1.0051, "step": 10247 }, { "epoch": 1.7538935478350162, "grad_norm": 2.2965593338012695, "learning_rate": 2.6622063334372e-05, "loss": 0.2575, "step": 10248 }, { "epoch": 1.7540646927947972, "grad_norm": 14.886336326599121, "learning_rate": 2.6614775852313523e-05, "loss": 1.1209, "step": 10249 }, { "epoch": 1.7542358377545781, "grad_norm": 13.00224781036377, "learning_rate": 2.6607481517519984e-05, "loss": 1.2123, "step": 10250 }, { "epoch": 1.7544069827143591, "grad_norm": 0.4165373146533966, "learning_rate": 2.6600180334295066e-05, "loss": 0.1156, "step": 10251 }, { "epoch": 1.7545781276741401, "grad_norm": 0.49036213755607605, "learning_rate": 2.6592872306946476e-05, "loss": 0.1276, "step": 10252 }, { "epoch": 1.7547492726339209, "grad_norm": 11.204863548278809, "learning_rate": 2.6585557439785955e-05, "loss": 0.7156, "step": 10253 }, { "epoch": 
1.7549204175937019, "grad_norm": 10.537985801696777, "learning_rate": 2.6578235737129292e-05, "loss": 0.9573, "step": 10254 }, { "epoch": 1.7550915625534826, "grad_norm": 4.228142261505127, "learning_rate": 2.65709072032963e-05, "loss": 0.2743, "step": 10255 }, { "epoch": 1.7552627075132636, "grad_norm": 12.881701469421387, "learning_rate": 2.6563571842610817e-05, "loss": 1.0617, "step": 10256 }, { "epoch": 1.7554338524730446, "grad_norm": 5.535711765289307, "learning_rate": 2.6556229659400724e-05, "loss": 0.3497, "step": 10257 }, { "epoch": 1.7556049974328256, "grad_norm": 10.0292329788208, "learning_rate": 2.6548880657997922e-05, "loss": 1.1755, "step": 10258 }, { "epoch": 1.7557761423926066, "grad_norm": 16.752626419067383, "learning_rate": 2.6541524842738333e-05, "loss": 2.0731, "step": 10259 }, { "epoch": 1.7559472873523876, "grad_norm": 14.894298553466797, "learning_rate": 2.6534162217961893e-05, "loss": 1.463, "step": 10260 }, { "epoch": 1.7561184323121684, "grad_norm": 13.606311798095703, "learning_rate": 2.6526792788012555e-05, "loss": 1.0134, "step": 10261 }, { "epoch": 1.7562895772719493, "grad_norm": 14.41943645477295, "learning_rate": 2.651941655723832e-05, "loss": 0.9996, "step": 10262 }, { "epoch": 1.75646072223173, "grad_norm": 16.46200180053711, "learning_rate": 2.6512033529991148e-05, "loss": 1.4287, "step": 10263 }, { "epoch": 1.756631867191511, "grad_norm": 18.014118194580078, "learning_rate": 2.6504643710627054e-05, "loss": 1.9574, "step": 10264 }, { "epoch": 1.756803012151292, "grad_norm": 11.047093391418457, "learning_rate": 2.6497247103506023e-05, "loss": 0.7509, "step": 10265 }, { "epoch": 1.756974157111073, "grad_norm": 20.963741302490234, "learning_rate": 2.6489843712992097e-05, "loss": 2.7246, "step": 10266 }, { "epoch": 1.757145302070854, "grad_norm": 15.897561073303223, "learning_rate": 2.6482433543453245e-05, "loss": 1.3867, "step": 10267 }, { "epoch": 1.757316447030635, "grad_norm": 13.643783569335938, "learning_rate": 
2.647501659926152e-05, "loss": 1.1002, "step": 10268 }, { "epoch": 1.757487591990416, "grad_norm": 1.9189598560333252, "learning_rate": 2.6467592884792892e-05, "loss": 0.2263, "step": 10269 }, { "epoch": 1.7576587369501968, "grad_norm": 5.104213714599609, "learning_rate": 2.646016240442739e-05, "loss": 0.4402, "step": 10270 }, { "epoch": 1.7578298819099778, "grad_norm": 12.099289894104004, "learning_rate": 2.6452725162548994e-05, "loss": 0.9438, "step": 10271 }, { "epoch": 1.7580010268697586, "grad_norm": 2.8792405128479004, "learning_rate": 2.6445281163545698e-05, "loss": 0.2253, "step": 10272 }, { "epoch": 1.7581721718295396, "grad_norm": 20.573467254638672, "learning_rate": 2.643783041180947e-05, "loss": 1.4729, "step": 10273 }, { "epoch": 1.7583433167893205, "grad_norm": 10.47651481628418, "learning_rate": 2.643037291173626e-05, "loss": 0.8733, "step": 10274 }, { "epoch": 1.7585144617491015, "grad_norm": 0.6114203333854675, "learning_rate": 2.642290866772601e-05, "loss": 0.1223, "step": 10275 }, { "epoch": 1.7586856067088825, "grad_norm": 0.39483264088630676, "learning_rate": 2.6415437684182626e-05, "loss": 0.1166, "step": 10276 }, { "epoch": 1.7588567516686635, "grad_norm": 15.776938438415527, "learning_rate": 2.640795996551401e-05, "loss": 1.4013, "step": 10277 }, { "epoch": 1.7590278966284443, "grad_norm": 14.356256484985352, "learning_rate": 2.6400475516132026e-05, "loss": 1.4613, "step": 10278 }, { "epoch": 1.7591990415882253, "grad_norm": 73.7004165649414, "learning_rate": 2.63929843404525e-05, "loss": 8.9708, "step": 10279 }, { "epoch": 1.759370186548006, "grad_norm": 10.204928398132324, "learning_rate": 2.638548644289525e-05, "loss": 0.7871, "step": 10280 }, { "epoch": 1.759541331507787, "grad_norm": 2.801974058151245, "learning_rate": 2.6377981827884035e-05, "loss": 0.2595, "step": 10281 }, { "epoch": 1.759712476467568, "grad_norm": 0.5604311227798462, "learning_rate": 2.637047049984659e-05, "loss": 0.1189, "step": 10282 }, { "epoch": 
1.759883621427349, "grad_norm": 5.561570644378662, "learning_rate": 2.636295246321462e-05, "loss": 0.5831, "step": 10283 }, { "epoch": 1.76005476638713, "grad_norm": 4.580888271331787, "learning_rate": 2.6355427722423774e-05, "loss": 0.4918, "step": 10284 }, { "epoch": 1.760225911346911, "grad_norm": 8.747218132019043, "learning_rate": 2.6347896281913657e-05, "loss": 0.7147, "step": 10285 }, { "epoch": 1.7603970563066917, "grad_norm": 13.402180671691895, "learning_rate": 2.6340358146127835e-05, "loss": 0.8645, "step": 10286 }, { "epoch": 1.7605682012664727, "grad_norm": 11.5165376663208, "learning_rate": 2.6332813319513813e-05, "loss": 0.8429, "step": 10287 }, { "epoch": 1.7607393462262535, "grad_norm": 17.46820068359375, "learning_rate": 2.6325261806523055e-05, "loss": 1.954, "step": 10288 }, { "epoch": 1.7609104911860345, "grad_norm": 11.869661331176758, "learning_rate": 2.6317703611610957e-05, "loss": 0.7961, "step": 10289 }, { "epoch": 1.7610816361458155, "grad_norm": 12.764878273010254, "learning_rate": 2.6310138739236873e-05, "loss": 1.2976, "step": 10290 }, { "epoch": 1.7612527811055965, "grad_norm": 24.02213478088379, "learning_rate": 2.630256719386409e-05, "loss": 1.1067, "step": 10291 }, { "epoch": 1.7614239260653775, "grad_norm": 11.394536972045898, "learning_rate": 2.6294988979959815e-05, "loss": 0.6122, "step": 10292 }, { "epoch": 1.7615950710251584, "grad_norm": 24.44853401184082, "learning_rate": 2.6287404101995228e-05, "loss": 5.284, "step": 10293 }, { "epoch": 1.7617662159849392, "grad_norm": 2.5412909984588623, "learning_rate": 2.6279812564445414e-05, "loss": 0.2211, "step": 10294 }, { "epoch": 1.7619373609447202, "grad_norm": 10.956635475158691, "learning_rate": 2.6272214371789382e-05, "loss": 0.6898, "step": 10295 }, { "epoch": 1.762108505904501, "grad_norm": 24.394775390625, "learning_rate": 2.626460952851008e-05, "loss": 5.0201, "step": 10296 }, { "epoch": 1.762279650864282, "grad_norm": 9.61738395690918, "learning_rate": 
2.6256998039094383e-05, "loss": 0.7263, "step": 10297 }, { "epoch": 1.762450795824063, "grad_norm": 1.2656222581863403, "learning_rate": 2.624937990803307e-05, "loss": 0.2001, "step": 10298 }, { "epoch": 1.762621940783844, "grad_norm": 3.850708246231079, "learning_rate": 2.6241755139820857e-05, "loss": 0.3232, "step": 10299 }, { "epoch": 1.762793085743625, "grad_norm": 27.74964141845703, "learning_rate": 2.623412373895637e-05, "loss": 4.9438, "step": 10300 }, { "epoch": 1.762964230703406, "grad_norm": 14.960726737976074, "learning_rate": 2.622648570994214e-05, "loss": 1.1665, "step": 10301 }, { "epoch": 1.7631353756631867, "grad_norm": 0.39376190304756165, "learning_rate": 2.6218841057284624e-05, "loss": 0.1122, "step": 10302 }, { "epoch": 1.7633065206229677, "grad_norm": 14.592577934265137, "learning_rate": 2.621118978549417e-05, "loss": 0.9921, "step": 10303 }, { "epoch": 1.7634776655827484, "grad_norm": 23.36099624633789, "learning_rate": 2.6203531899085042e-05, "loss": 2.3534, "step": 10304 }, { "epoch": 1.7636488105425294, "grad_norm": 18.485464096069336, "learning_rate": 2.619586740257542e-05, "loss": 1.5195, "step": 10305 }, { "epoch": 1.7638199555023104, "grad_norm": 16.18571662902832, "learning_rate": 2.618819630048734e-05, "loss": 1.7761, "step": 10306 }, { "epoch": 1.7639911004620914, "grad_norm": 16.630390167236328, "learning_rate": 2.6180518597346788e-05, "loss": 1.7684, "step": 10307 }, { "epoch": 1.7641622454218724, "grad_norm": 16.2050838470459, "learning_rate": 2.61728342976836e-05, "loss": 1.5221, "step": 10308 }, { "epoch": 1.7643333903816534, "grad_norm": 13.191843032836914, "learning_rate": 2.6165143406031547e-05, "loss": 0.9676, "step": 10309 }, { "epoch": 1.7645045353414341, "grad_norm": 5.008785724639893, "learning_rate": 2.615744592692824e-05, "loss": 0.3571, "step": 10310 }, { "epoch": 1.7646756803012151, "grad_norm": 5.067178726196289, "learning_rate": 2.6149741864915236e-05, "loss": 0.3257, "step": 10311 }, { "epoch": 1.764846825260996, 
"grad_norm": 0.4848376512527466, "learning_rate": 2.6142031224537907e-05, "loss": 0.1157, "step": 10312 }, { "epoch": 1.7650179702207769, "grad_norm": 0.47084203362464905, "learning_rate": 2.6134314010345575e-05, "loss": 0.1173, "step": 10313 }, { "epoch": 1.7651891151805579, "grad_norm": 0.37208569049835205, "learning_rate": 2.612659022689138e-05, "loss": 0.1121, "step": 10314 }, { "epoch": 1.7653602601403389, "grad_norm": 16.788341522216797, "learning_rate": 2.6118859878732382e-05, "loss": 1.1917, "step": 10315 }, { "epoch": 1.7655314051001199, "grad_norm": 0.41175344586372375, "learning_rate": 2.6111122970429495e-05, "loss": 0.1091, "step": 10316 }, { "epoch": 1.7657025500599008, "grad_norm": 16.212858200073242, "learning_rate": 2.6103379506547513e-05, "loss": 1.4031, "step": 10317 }, { "epoch": 1.7658736950196818, "grad_norm": 6.374989986419678, "learning_rate": 2.6095629491655078e-05, "loss": 0.6412, "step": 10318 }, { "epoch": 1.7660448399794626, "grad_norm": 10.924112319946289, "learning_rate": 2.6087872930324717e-05, "loss": 1.1679, "step": 10319 }, { "epoch": 1.7662159849392436, "grad_norm": 13.240218162536621, "learning_rate": 2.608010982713281e-05, "loss": 1.0596, "step": 10320 }, { "epoch": 1.7663871298990244, "grad_norm": 14.675152778625488, "learning_rate": 2.60723401866596e-05, "loss": 0.8537, "step": 10321 }, { "epoch": 1.7665582748588053, "grad_norm": 14.745795249938965, "learning_rate": 2.6064564013489195e-05, "loss": 1.3669, "step": 10322 }, { "epoch": 1.7667294198185863, "grad_norm": 18.18724822998047, "learning_rate": 2.605678131220954e-05, "loss": 1.3658, "step": 10323 }, { "epoch": 1.7669005647783673, "grad_norm": 29.999698638916016, "learning_rate": 2.6048992087412437e-05, "loss": 0.8949, "step": 10324 }, { "epoch": 1.7670717097381483, "grad_norm": 1.9175353050231934, "learning_rate": 2.604119634369355e-05, "loss": 0.2071, "step": 10325 }, { "epoch": 1.7672428546979293, "grad_norm": 11.485109329223633, "learning_rate": 2.603339408565237e-05, 
"loss": 0.9166, "step": 10326 }, { "epoch": 1.76741399965771, "grad_norm": 24.048248291015625, "learning_rate": 2.602558531789225e-05, "loss": 2.0134, "step": 10327 }, { "epoch": 1.767585144617491, "grad_norm": 0.47436535358428955, "learning_rate": 2.601777004502037e-05, "loss": 0.1184, "step": 10328 }, { "epoch": 1.7677562895772718, "grad_norm": 26.04397201538086, "learning_rate": 2.6009948271647753e-05, "loss": 1.1109, "step": 10329 }, { "epoch": 1.7679274345370528, "grad_norm": 13.924844741821289, "learning_rate": 2.6002120002389257e-05, "loss": 0.972, "step": 10330 }, { "epoch": 1.7680985794968338, "grad_norm": 6.72910213470459, "learning_rate": 2.5994285241863572e-05, "loss": 0.5722, "step": 10331 }, { "epoch": 1.7682697244566148, "grad_norm": 11.797749519348145, "learning_rate": 2.5986443994693216e-05, "loss": 0.9829, "step": 10332 }, { "epoch": 1.7684408694163958, "grad_norm": 13.208076477050781, "learning_rate": 2.5978596265504545e-05, "loss": 1.0163, "step": 10333 }, { "epoch": 1.7686120143761768, "grad_norm": 15.372968673706055, "learning_rate": 2.5970742058927723e-05, "loss": 1.3011, "step": 10334 }, { "epoch": 1.7687831593359575, "grad_norm": 13.39724349975586, "learning_rate": 2.5962881379596738e-05, "loss": 1.098, "step": 10335 }, { "epoch": 1.7689543042957385, "grad_norm": 9.688801765441895, "learning_rate": 2.5955014232149416e-05, "loss": 0.7406, "step": 10336 }, { "epoch": 1.7691254492555193, "grad_norm": 12.890881538391113, "learning_rate": 2.5947140621227384e-05, "loss": 1.0946, "step": 10337 }, { "epoch": 1.7692965942153003, "grad_norm": 15.380660057067871, "learning_rate": 2.5939260551476075e-05, "loss": 1.536, "step": 10338 }, { "epoch": 1.7694677391750813, "grad_norm": 0.4869334101676941, "learning_rate": 2.5931374027544752e-05, "loss": 0.1168, "step": 10339 }, { "epoch": 1.7696388841348623, "grad_norm": 10.648853302001953, "learning_rate": 2.5923481054086467e-05, "loss": 0.812, "step": 10340 }, { "epoch": 1.7698100290946432, "grad_norm": 
19.027511596679688, "learning_rate": 2.5915581635758086e-05, "loss": 1.9685, "step": 10341 }, { "epoch": 1.7699811740544242, "grad_norm": 21.071332931518555, "learning_rate": 2.590767577722029e-05, "loss": 2.9194, "step": 10342 }, { "epoch": 1.770152319014205, "grad_norm": 4.469263553619385, "learning_rate": 2.5899763483137538e-05, "loss": 0.3248, "step": 10343 }, { "epoch": 1.770323463973986, "grad_norm": 0.4350958466529846, "learning_rate": 2.5891844758178092e-05, "loss": 0.1073, "step": 10344 }, { "epoch": 1.7704946089337668, "grad_norm": 8.709773063659668, "learning_rate": 2.588391960701402e-05, "loss": 0.5835, "step": 10345 }, { "epoch": 1.7706657538935477, "grad_norm": 16.564308166503906, "learning_rate": 2.5875988034321167e-05, "loss": 1.9639, "step": 10346 }, { "epoch": 1.7708368988533287, "grad_norm": 9.579218864440918, "learning_rate": 2.5868050044779163e-05, "loss": 0.618, "step": 10347 }, { "epoch": 1.7710080438131097, "grad_norm": 7.619257926940918, "learning_rate": 2.5860105643071466e-05, "loss": 0.5715, "step": 10348 }, { "epoch": 1.7711791887728907, "grad_norm": 0.9791240096092224, "learning_rate": 2.5852154833885245e-05, "loss": 0.129, "step": 10349 }, { "epoch": 1.7713503337326717, "grad_norm": 9.30814266204834, "learning_rate": 2.5844197621911526e-05, "loss": 0.6327, "step": 10350 }, { "epoch": 1.7715214786924525, "grad_norm": 30.156314849853516, "learning_rate": 2.5836234011845042e-05, "loss": 5.0229, "step": 10351 }, { "epoch": 1.7716926236522335, "grad_norm": 4.488891124725342, "learning_rate": 2.5828264008384365e-05, "loss": 0.4632, "step": 10352 }, { "epoch": 1.7718637686120142, "grad_norm": 0.8750544190406799, "learning_rate": 2.582028761623178e-05, "loss": 0.176, "step": 10353 }, { "epoch": 1.7720349135717952, "grad_norm": 8.145329475402832, "learning_rate": 2.58123048400934e-05, "loss": 0.6359, "step": 10354 }, { "epoch": 1.7722060585315762, "grad_norm": 20.375080108642578, "learning_rate": 2.5804315684679046e-05, "loss": 4.7529, "step": 
10355 }, { "epoch": 1.7723772034913572, "grad_norm": 10.037342071533203, "learning_rate": 2.579632015470236e-05, "loss": 0.7681, "step": 10356 }, { "epoch": 1.7725483484511382, "grad_norm": 2.4784340858459473, "learning_rate": 2.578831825488069e-05, "loss": 0.2703, "step": 10357 }, { "epoch": 1.7727194934109192, "grad_norm": 21.732370376586914, "learning_rate": 2.578030998993518e-05, "loss": 1.7692, "step": 10358 }, { "epoch": 1.7728906383707, "grad_norm": 15.518199920654297, "learning_rate": 2.5772295364590726e-05, "loss": 0.8834, "step": 10359 }, { "epoch": 1.773061783330481, "grad_norm": 12.520959854125977, "learning_rate": 2.5764274383575962e-05, "loss": 0.9395, "step": 10360 }, { "epoch": 1.7732329282902617, "grad_norm": 8.39258098602295, "learning_rate": 2.5756247051623274e-05, "loss": 0.5656, "step": 10361 }, { "epoch": 1.7734040732500427, "grad_norm": 21.212387084960938, "learning_rate": 2.574821337346881e-05, "loss": 4.9729, "step": 10362 }, { "epoch": 1.7735752182098237, "grad_norm": 16.1068172454834, "learning_rate": 2.5740173353852443e-05, "loss": 1.7381, "step": 10363 }, { "epoch": 1.7737463631696047, "grad_norm": 18.294607162475586, "learning_rate": 2.5732126997517798e-05, "loss": 1.2684, "step": 10364 }, { "epoch": 1.7739175081293856, "grad_norm": 5.852933406829834, "learning_rate": 2.5724074309212246e-05, "loss": 0.5152, "step": 10365 }, { "epoch": 1.7740886530891666, "grad_norm": 10.431079864501953, "learning_rate": 2.5716015293686872e-05, "loss": 0.9369, "step": 10366 }, { "epoch": 1.7742597980489476, "grad_norm": 0.4428052306175232, "learning_rate": 2.5707949955696513e-05, "loss": 0.1189, "step": 10367 }, { "epoch": 1.7744309430087284, "grad_norm": 10.019430160522461, "learning_rate": 2.569987829999973e-05, "loss": 0.833, "step": 10368 }, { "epoch": 1.7746020879685094, "grad_norm": 0.6407434940338135, "learning_rate": 2.569180033135882e-05, "loss": 0.1258, "step": 10369 }, { "epoch": 1.7747732329282901, "grad_norm": 0.5379723310470581, 
"learning_rate": 2.5683716054539787e-05, "loss": 0.1168, "step": 10370 }, { "epoch": 1.7749443778880711, "grad_norm": 15.757037162780762, "learning_rate": 2.567562547431237e-05, "loss": 0.9937, "step": 10371 }, { "epoch": 1.7751155228478521, "grad_norm": 2.9075448513031006, "learning_rate": 2.5667528595450024e-05, "loss": 0.2467, "step": 10372 }, { "epoch": 1.7752866678076331, "grad_norm": 13.997809410095215, "learning_rate": 2.565942542272991e-05, "loss": 1.2107, "step": 10373 }, { "epoch": 1.775457812767414, "grad_norm": 12.780007362365723, "learning_rate": 2.5651315960932926e-05, "loss": 1.0496, "step": 10374 }, { "epoch": 1.775628957727195, "grad_norm": 17.68670082092285, "learning_rate": 2.5643200214843658e-05, "loss": 1.362, "step": 10375 }, { "epoch": 1.7758001026869759, "grad_norm": 1.3410274982452393, "learning_rate": 2.5635078189250418e-05, "loss": 0.1668, "step": 10376 }, { "epoch": 1.7759712476467568, "grad_norm": 12.347916603088379, "learning_rate": 2.5626949888945196e-05, "loss": 0.8229, "step": 10377 }, { "epoch": 1.7761423926065376, "grad_norm": 9.421760559082031, "learning_rate": 2.5618815318723706e-05, "loss": 0.8804, "step": 10378 }, { "epoch": 1.7763135375663186, "grad_norm": 14.991144180297852, "learning_rate": 2.5610674483385373e-05, "loss": 1.6508, "step": 10379 }, { "epoch": 1.7764846825260996, "grad_norm": 15.948822021484375, "learning_rate": 2.5602527387733287e-05, "loss": 1.1515, "step": 10380 }, { "epoch": 1.7766558274858806, "grad_norm": 0.36547958850860596, "learning_rate": 2.559437403657425e-05, "loss": 0.1148, "step": 10381 }, { "epoch": 1.7768269724456616, "grad_norm": 3.497053861618042, "learning_rate": 2.5586214434718756e-05, "loss": 0.2868, "step": 10382 }, { "epoch": 1.7769981174054426, "grad_norm": 0.3742232322692871, "learning_rate": 2.5578048586980974e-05, "loss": 0.1084, "step": 10383 }, { "epoch": 1.7771692623652233, "grad_norm": 7.912871360778809, "learning_rate": 2.5569876498178774e-05, "loss": 0.6082, "step": 10384 }, { 
"epoch": 1.7773404073250043, "grad_norm": 6.877047061920166, "learning_rate": 2.556169817313369e-05, "loss": 0.5437, "step": 10385 }, { "epoch": 1.777511552284785, "grad_norm": 22.603391647338867, "learning_rate": 2.5553513616670957e-05, "loss": 1.249, "step": 10386 }, { "epoch": 1.777682697244566, "grad_norm": 0.5017625689506531, "learning_rate": 2.554532283361947e-05, "loss": 0.1129, "step": 10387 }, { "epoch": 1.777853842204347, "grad_norm": 0.35598164796829224, "learning_rate": 2.5537125828811803e-05, "loss": 0.1097, "step": 10388 }, { "epoch": 1.778024987164128, "grad_norm": 3.7753570079803467, "learning_rate": 2.5528922607084203e-05, "loss": 0.2928, "step": 10389 }, { "epoch": 1.778196132123909, "grad_norm": 0.3458811640739441, "learning_rate": 2.5520713173276575e-05, "loss": 0.1072, "step": 10390 }, { "epoch": 1.77836727708369, "grad_norm": 6.145935535430908, "learning_rate": 2.551249753223252e-05, "loss": 0.3423, "step": 10391 }, { "epoch": 1.7785384220434708, "grad_norm": 16.863523483276367, "learning_rate": 2.550427568879925e-05, "loss": 1.3675, "step": 10392 }, { "epoch": 1.7787095670032518, "grad_norm": 8.658441543579102, "learning_rate": 2.5496047647827694e-05, "loss": 0.6902, "step": 10393 }, { "epoch": 1.7788807119630325, "grad_norm": 19.10110855102539, "learning_rate": 2.5487813414172374e-05, "loss": 2.0998, "step": 10394 }, { "epoch": 1.7790518569228135, "grad_norm": 12.609996795654297, "learning_rate": 2.5479572992691534e-05, "loss": 0.8618, "step": 10395 }, { "epoch": 1.7792230018825945, "grad_norm": 9.116121292114258, "learning_rate": 2.5471326388247003e-05, "loss": 0.6686, "step": 10396 }, { "epoch": 1.7793941468423755, "grad_norm": 10.63254165649414, "learning_rate": 2.5463073605704326e-05, "loss": 0.6955, "step": 10397 }, { "epoch": 1.7795652918021565, "grad_norm": 11.819313049316406, "learning_rate": 2.5454814649932616e-05, "loss": 0.9883, "step": 10398 }, { "epoch": 1.7797364367619375, "grad_norm": 22.25132942199707, "learning_rate": 
2.544654952580471e-05, "loss": 3.2376, "step": 10399 }, { "epoch": 1.7799075817217183, "grad_norm": 3.9720101356506348, "learning_rate": 2.5438278238197005e-05, "loss": 0.2619, "step": 10400 }, { "epoch": 1.7800787266814992, "grad_norm": 9.763450622558594, "learning_rate": 2.5430000791989604e-05, "loss": 0.6992, "step": 10401 }, { "epoch": 1.78024987164128, "grad_norm": 10.496820449829102, "learning_rate": 2.5421717192066202e-05, "loss": 1.062, "step": 10402 }, { "epoch": 1.780421016601061, "grad_norm": 9.838890075683594, "learning_rate": 2.541342744331413e-05, "loss": 0.9596, "step": 10403 }, { "epoch": 1.780592161560842, "grad_norm": 16.673309326171875, "learning_rate": 2.5405131550624355e-05, "loss": 1.3983, "step": 10404 }, { "epoch": 1.780763306520623, "grad_norm": 13.044134140014648, "learning_rate": 2.5396829518891472e-05, "loss": 0.9738, "step": 10405 }, { "epoch": 1.780934451480404, "grad_norm": 16.015647888183594, "learning_rate": 2.5388521353013683e-05, "loss": 1.0288, "step": 10406 }, { "epoch": 1.781105596440185, "grad_norm": 18.47735023498535, "learning_rate": 2.5380207057892822e-05, "loss": 1.996, "step": 10407 }, { "epoch": 1.7812767413999657, "grad_norm": 16.379287719726562, "learning_rate": 2.5371886638434335e-05, "loss": 1.2099, "step": 10408 }, { "epoch": 1.7814478863597467, "grad_norm": 14.682717323303223, "learning_rate": 2.5363560099547286e-05, "loss": 1.0835, "step": 10409 }, { "epoch": 1.7816190313195275, "grad_norm": 16.952077865600586, "learning_rate": 2.5355227446144337e-05, "loss": 1.7878, "step": 10410 }, { "epoch": 1.7817901762793085, "grad_norm": 19.056716918945312, "learning_rate": 2.5346888683141776e-05, "loss": 1.5765, "step": 10411 }, { "epoch": 1.7819613212390895, "grad_norm": 2.8447682857513428, "learning_rate": 2.5338543815459478e-05, "loss": 0.2574, "step": 10412 }, { "epoch": 1.7821324661988704, "grad_norm": 7.928298473358154, "learning_rate": 2.5330192848020935e-05, "loss": 0.7971, "step": 10413 }, { "epoch": 
1.7823036111586514, "grad_norm": 21.470142364501953, "learning_rate": 2.532183578575322e-05, "loss": 4.8949, "step": 10414 }, { "epoch": 1.7824747561184324, "grad_norm": 16.37494659423828, "learning_rate": 2.5313472633587025e-05, "loss": 1.3299, "step": 10415 }, { "epoch": 1.7826459010782134, "grad_norm": 25.45989418029785, "learning_rate": 2.5305103396456608e-05, "loss": 3.0794, "step": 10416 }, { "epoch": 1.7828170460379942, "grad_norm": 3.677121162414551, "learning_rate": 2.529672807929984e-05, "loss": 0.3113, "step": 10417 }, { "epoch": 1.7829881909977752, "grad_norm": 21.711658477783203, "learning_rate": 2.5288346687058167e-05, "loss": 2.0656, "step": 10418 }, { "epoch": 1.783159335957556, "grad_norm": 19.351655960083008, "learning_rate": 2.5279959224676627e-05, "loss": 1.7848, "step": 10419 }, { "epoch": 1.783330480917337, "grad_norm": 13.19867992401123, "learning_rate": 2.527156569710383e-05, "loss": 0.9991, "step": 10420 }, { "epoch": 1.783501625877118, "grad_norm": 9.921602249145508, "learning_rate": 2.526316610929197e-05, "loss": 0.7917, "step": 10421 }, { "epoch": 1.783672770836899, "grad_norm": 10.281218528747559, "learning_rate": 2.5254760466196822e-05, "loss": 0.8535, "step": 10422 }, { "epoch": 1.78384391579668, "grad_norm": 2.721592426300049, "learning_rate": 2.5246348772777726e-05, "loss": 0.2915, "step": 10423 }, { "epoch": 1.7840150607564609, "grad_norm": 19.938426971435547, "learning_rate": 2.5237931033997594e-05, "loss": 2.015, "step": 10424 }, { "epoch": 1.7841862057162416, "grad_norm": 12.277912139892578, "learning_rate": 2.5229507254822898e-05, "loss": 0.893, "step": 10425 }, { "epoch": 1.7843573506760226, "grad_norm": 13.061349868774414, "learning_rate": 2.5221077440223696e-05, "loss": 0.8648, "step": 10426 }, { "epoch": 1.7845284956358034, "grad_norm": 10.003819465637207, "learning_rate": 2.521264159517357e-05, "loss": 0.8588, "step": 10427 }, { "epoch": 1.7846996405955844, "grad_norm": 11.28300666809082, "learning_rate": 
2.5204199724649696e-05, "loss": 0.8586, "step": 10428 }, { "epoch": 1.7848707855553654, "grad_norm": 0.7198940515518188, "learning_rate": 2.5195751833632784e-05, "loss": 0.1227, "step": 10429 }, { "epoch": 1.7850419305151464, "grad_norm": 9.506392478942871, "learning_rate": 2.5187297927107106e-05, "loss": 0.791, "step": 10430 }, { "epoch": 1.7852130754749274, "grad_norm": 14.367445945739746, "learning_rate": 2.5178838010060475e-05, "loss": 1.2387, "step": 10431 }, { "epoch": 1.7853842204347083, "grad_norm": 7.0618743896484375, "learning_rate": 2.517037208748426e-05, "loss": 0.5011, "step": 10432 }, { "epoch": 1.7855553653944891, "grad_norm": 27.2537841796875, "learning_rate": 2.5161900164373354e-05, "loss": 5.0903, "step": 10433 }, { "epoch": 1.78572651035427, "grad_norm": 5.037086009979248, "learning_rate": 2.5153422245726232e-05, "loss": 0.3374, "step": 10434 }, { "epoch": 1.7858976553140509, "grad_norm": 0.429697722196579, "learning_rate": 2.5144938336544846e-05, "loss": 0.1111, "step": 10435 }, { "epoch": 1.7860688002738319, "grad_norm": 3.639899492263794, "learning_rate": 2.513644844183475e-05, "loss": 0.3102, "step": 10436 }, { "epoch": 1.7862399452336128, "grad_norm": 50.77555465698242, "learning_rate": 2.5127952566604953e-05, "loss": 7.0498, "step": 10437 }, { "epoch": 1.7864110901933938, "grad_norm": 158.03240966796875, "learning_rate": 2.5119450715868078e-05, "loss": 8.2365, "step": 10438 }, { "epoch": 1.7865822351531748, "grad_norm": 10.476318359375, "learning_rate": 2.5110942894640192e-05, "loss": 0.7327, "step": 10439 }, { "epoch": 1.7867533801129558, "grad_norm": 17.24152374267578, "learning_rate": 2.5102429107940954e-05, "loss": 1.8214, "step": 10440 }, { "epoch": 1.7869245250727366, "grad_norm": 26.077465057373047, "learning_rate": 2.5093909360793476e-05, "loss": 5.4135, "step": 10441 }, { "epoch": 1.7870956700325176, "grad_norm": 15.95528793334961, "learning_rate": 2.508538365822446e-05, "loss": 1.3166, "step": 10442 }, { "epoch": 
1.7872668149922983, "grad_norm": 10.180129051208496, "learning_rate": 2.5076852005264045e-05, "loss": 0.811, "step": 10443 }, { "epoch": 1.7874379599520793, "grad_norm": 2.9141955375671387, "learning_rate": 2.5068314406945948e-05, "loss": 0.3344, "step": 10444 }, { "epoch": 1.7876091049118603, "grad_norm": 6.407169342041016, "learning_rate": 2.5059770868307353e-05, "loss": 0.3563, "step": 10445 }, { "epoch": 1.7877802498716413, "grad_norm": 11.510123252868652, "learning_rate": 2.5051221394388965e-05, "loss": 0.7489, "step": 10446 }, { "epoch": 1.7879513948314223, "grad_norm": 11.884770393371582, "learning_rate": 2.504266599023498e-05, "loss": 0.8846, "step": 10447 }, { "epoch": 1.7881225397912033, "grad_norm": 1.5599706172943115, "learning_rate": 2.5034104660893106e-05, "loss": 0.2444, "step": 10448 }, { "epoch": 1.788293684750984, "grad_norm": 16.447084426879883, "learning_rate": 2.5025537411414532e-05, "loss": 1.0319, "step": 10449 }, { "epoch": 1.788464829710765, "grad_norm": 11.653722763061523, "learning_rate": 2.501696424685396e-05, "loss": 1.0039, "step": 10450 }, { "epoch": 1.7886359746705458, "grad_norm": 24.58965301513672, "learning_rate": 2.5008385172269556e-05, "loss": 4.8749, "step": 10451 }, { "epoch": 1.7888071196303268, "grad_norm": 2.518771171569824, "learning_rate": 2.4999800192722995e-05, "loss": 0.2409, "step": 10452 }, { "epoch": 1.7889782645901078, "grad_norm": 7.972427845001221, "learning_rate": 2.4991209313279415e-05, "loss": 0.7675, "step": 10453 }, { "epoch": 1.7891494095498888, "grad_norm": 19.604293823242188, "learning_rate": 2.498261253900747e-05, "loss": 2.0831, "step": 10454 }, { "epoch": 1.7893205545096698, "grad_norm": 21.403095245361328, "learning_rate": 2.497400987497926e-05, "loss": 2.3933, "step": 10455 }, { "epoch": 1.7894916994694507, "grad_norm": 18.018718719482422, "learning_rate": 2.4965401326270365e-05, "loss": 1.8096, "step": 10456 }, { "epoch": 1.7896628444292315, "grad_norm": 12.347350120544434, "learning_rate": 
2.4956786897959844e-05, "loss": 0.7967, "step": 10457 }, { "epoch": 1.7898339893890125, "grad_norm": 18.030380249023438, "learning_rate": 2.4948166595130227e-05, "loss": 1.7904, "step": 10458 }, { "epoch": 1.7900051343487933, "grad_norm": 1.8014403581619263, "learning_rate": 2.4939540422867497e-05, "loss": 0.2437, "step": 10459 }, { "epoch": 1.7901762793085743, "grad_norm": 8.256983757019043, "learning_rate": 2.493090838626112e-05, "loss": 0.7842, "step": 10460 }, { "epoch": 1.7903474242683552, "grad_norm": 11.641915321350098, "learning_rate": 2.4922270490403997e-05, "loss": 0.9683, "step": 10461 }, { "epoch": 1.7905185692281362, "grad_norm": 1.5542426109313965, "learning_rate": 2.4913626740392505e-05, "loss": 0.213, "step": 10462 }, { "epoch": 1.7906897141879172, "grad_norm": 14.501445770263672, "learning_rate": 2.490497714132647e-05, "loss": 1.3424, "step": 10463 }, { "epoch": 1.7908608591476982, "grad_norm": 12.392258644104004, "learning_rate": 2.4896321698309166e-05, "loss": 1.1328, "step": 10464 }, { "epoch": 1.791032004107479, "grad_norm": 0.575232982635498, "learning_rate": 2.4887660416447322e-05, "loss": 0.1355, "step": 10465 }, { "epoch": 1.79120314906726, "grad_norm": 0.7616779208183289, "learning_rate": 2.4878993300851108e-05, "loss": 0.129, "step": 10466 }, { "epoch": 1.791374294027041, "grad_norm": 6.738126277923584, "learning_rate": 2.487032035663413e-05, "loss": 0.7456, "step": 10467 }, { "epoch": 1.7915454389868217, "grad_norm": 7.624906539916992, "learning_rate": 2.4861641588913444e-05, "loss": 0.5458, "step": 10468 }, { "epoch": 1.7917165839466027, "grad_norm": 10.610275268554688, "learning_rate": 2.4852957002809534e-05, "loss": 0.9281, "step": 10469 }, { "epoch": 1.7918877289063837, "grad_norm": 9.12974739074707, "learning_rate": 2.4844266603446308e-05, "loss": 0.7653, "step": 10470 }, { "epoch": 1.7920588738661647, "grad_norm": 7.504889488220215, "learning_rate": 2.483557039595113e-05, "loss": 0.8737, "step": 10471 }, { "epoch": 
1.7922300188259457, "grad_norm": 10.571599006652832, "learning_rate": 2.4826868385454767e-05, "loss": 0.7331, "step": 10472 }, { "epoch": 1.7924011637857267, "grad_norm": 8.558573722839355, "learning_rate": 2.4818160577091417e-05, "loss": 0.7441, "step": 10473 }, { "epoch": 1.7925723087455074, "grad_norm": 15.16938304901123, "learning_rate": 2.4809446975998707e-05, "loss": 1.2562, "step": 10474 }, { "epoch": 1.7927434537052884, "grad_norm": 20.587244033813477, "learning_rate": 2.480072758731767e-05, "loss": 1.7994, "step": 10475 }, { "epoch": 1.7929145986650692, "grad_norm": 14.571403503417969, "learning_rate": 2.4792002416192747e-05, "loss": 0.991, "step": 10476 }, { "epoch": 1.7930857436248502, "grad_norm": 3.559631586074829, "learning_rate": 2.4783271467771835e-05, "loss": 0.2996, "step": 10477 }, { "epoch": 1.7932568885846312, "grad_norm": 8.960917472839355, "learning_rate": 2.4774534747206165e-05, "loss": 0.6315, "step": 10478 }, { "epoch": 1.7934280335444122, "grad_norm": 0.5084958076477051, "learning_rate": 2.4765792259650456e-05, "loss": 0.1121, "step": 10479 }, { "epoch": 1.7935991785041931, "grad_norm": 17.50481605529785, "learning_rate": 2.4757044010262747e-05, "loss": 1.1762, "step": 10480 }, { "epoch": 1.7937703234639741, "grad_norm": 0.4192851483821869, "learning_rate": 2.4748290004204557e-05, "loss": 0.113, "step": 10481 }, { "epoch": 1.793941468423755, "grad_norm": 5.151569366455078, "learning_rate": 2.473953024664073e-05, "loss": 0.3257, "step": 10482 }, { "epoch": 1.794112613383536, "grad_norm": 14.655202865600586, "learning_rate": 2.473076474273956e-05, "loss": 1.2677, "step": 10483 }, { "epoch": 1.7942837583433167, "grad_norm": 0.8716297149658203, "learning_rate": 2.4721993497672693e-05, "loss": 0.1728, "step": 10484 }, { "epoch": 1.7944549033030976, "grad_norm": 11.219311714172363, "learning_rate": 2.4713216516615182e-05, "loss": 0.7431, "step": 10485 }, { "epoch": 1.7946260482628786, "grad_norm": 7.502663612365723, "learning_rate": 
2.4704433804745465e-05, "loss": 0.5809, "step": 10486 }, { "epoch": 1.7947971932226596, "grad_norm": 16.03567123413086, "learning_rate": 2.469564536724534e-05, "loss": 1.2553, "step": 10487 }, { "epoch": 1.7949683381824406, "grad_norm": 13.756152153015137, "learning_rate": 2.4686851209300017e-05, "loss": 1.0214, "step": 10488 }, { "epoch": 1.7951394831422216, "grad_norm": 3.5514681339263916, "learning_rate": 2.4678051336098048e-05, "loss": 0.2898, "step": 10489 }, { "epoch": 1.7953106281020024, "grad_norm": 0.3688797056674957, "learning_rate": 2.4669245752831375e-05, "loss": 0.1039, "step": 10490 }, { "epoch": 1.7954817730617834, "grad_norm": 7.392839431762695, "learning_rate": 2.4660434464695304e-05, "loss": 1.0287, "step": 10491 }, { "epoch": 1.7956529180215641, "grad_norm": 32.428985595703125, "learning_rate": 2.465161747688851e-05, "loss": 5.5347, "step": 10492 }, { "epoch": 1.7958240629813451, "grad_norm": 9.745067596435547, "learning_rate": 2.4642794794613033e-05, "loss": 0.6966, "step": 10493 }, { "epoch": 1.795995207941126, "grad_norm": 0.40775883197784424, "learning_rate": 2.4633966423074267e-05, "loss": 0.1066, "step": 10494 }, { "epoch": 1.796166352900907, "grad_norm": 1.0456066131591797, "learning_rate": 2.4625132367480955e-05, "loss": 0.1674, "step": 10495 }, { "epoch": 1.796337497860688, "grad_norm": 11.191424369812012, "learning_rate": 2.4616292633045203e-05, "loss": 0.7014, "step": 10496 }, { "epoch": 1.796508642820469, "grad_norm": 29.256553649902344, "learning_rate": 2.4607447224982484e-05, "loss": 5.0834, "step": 10497 }, { "epoch": 1.7966797877802498, "grad_norm": 4.557840347290039, "learning_rate": 2.4598596148511592e-05, "loss": 0.2854, "step": 10498 }, { "epoch": 1.7968509327400308, "grad_norm": 15.044614791870117, "learning_rate": 2.4589739408854678e-05, "loss": 1.3189, "step": 10499 }, { "epoch": 1.7970220776998116, "grad_norm": 2.0372025966644287, "learning_rate": 2.4580877011237228e-05, "loss": 0.2429, "step": 10500 }, { "epoch": 
1.7971932226595926, "grad_norm": 11.341931343078613, "learning_rate": 2.457200896088807e-05, "loss": 0.9288, "step": 10501 }, { "epoch": 1.7973643676193736, "grad_norm": 12.332311630249023, "learning_rate": 2.4563135263039368e-05, "loss": 0.8983, "step": 10502 }, { "epoch": 1.7975355125791546, "grad_norm": 11.178953170776367, "learning_rate": 2.4554255922926618e-05, "loss": 0.8332, "step": 10503 }, { "epoch": 1.7977066575389355, "grad_norm": 4.5832014083862305, "learning_rate": 2.4545370945788642e-05, "loss": 0.3179, "step": 10504 }, { "epoch": 1.7978778024987165, "grad_norm": 3.8249104022979736, "learning_rate": 2.453648033686759e-05, "loss": 0.3765, "step": 10505 }, { "epoch": 1.7980489474584973, "grad_norm": 14.047713279724121, "learning_rate": 2.4527584101408938e-05, "loss": 1.3213, "step": 10506 }, { "epoch": 1.7982200924182783, "grad_norm": 1.9695353507995605, "learning_rate": 2.4518682244661462e-05, "loss": 0.2262, "step": 10507 }, { "epoch": 1.798391237378059, "grad_norm": 5.688499927520752, "learning_rate": 2.4509774771877292e-05, "loss": 0.5501, "step": 10508 }, { "epoch": 1.79856238233784, "grad_norm": 16.07000732421875, "learning_rate": 2.450086168831185e-05, "loss": 1.5337, "step": 10509 }, { "epoch": 1.798733527297621, "grad_norm": 16.7916259765625, "learning_rate": 2.4491942999223856e-05, "loss": 0.9924, "step": 10510 }, { "epoch": 1.798904672257402, "grad_norm": 4.900243282318115, "learning_rate": 2.448301870987535e-05, "loss": 0.308, "step": 10511 }, { "epoch": 1.799075817217183, "grad_norm": 25.748153686523438, "learning_rate": 2.4474088825531687e-05, "loss": 4.9777, "step": 10512 }, { "epoch": 1.799246962176964, "grad_norm": 16.483747482299805, "learning_rate": 2.4465153351461504e-05, "loss": 1.619, "step": 10513 }, { "epoch": 1.7994181071367448, "grad_norm": 12.384346961975098, "learning_rate": 2.4456212292936744e-05, "loss": 1.4058, "step": 10514 }, { "epoch": 1.7995892520965258, "grad_norm": 17.98931884765625, "learning_rate": 
2.444726565523265e-05, "loss": 1.4183, "step": 10515 }, { "epoch": 1.7997603970563065, "grad_norm": 20.718271255493164, "learning_rate": 2.4438313443627748e-05, "loss": 2.4409, "step": 10516 }, { "epoch": 1.7999315420160875, "grad_norm": 0.4016176760196686, "learning_rate": 2.442935566340385e-05, "loss": 0.1079, "step": 10517 }, { "epoch": 1.8001026869758685, "grad_norm": 14.254204750061035, "learning_rate": 2.442039231984607e-05, "loss": 1.1001, "step": 10518 }, { "epoch": 1.8002738319356495, "grad_norm": 10.214805603027344, "learning_rate": 2.4411423418242783e-05, "loss": 0.7182, "step": 10519 }, { "epoch": 1.8004449768954305, "grad_norm": 0.40824073553085327, "learning_rate": 2.4402448963885676e-05, "loss": 0.1165, "step": 10520 }, { "epoch": 1.8006161218552115, "grad_norm": 12.056835174560547, "learning_rate": 2.4393468962069663e-05, "loss": 0.8357, "step": 10521 }, { "epoch": 1.8007872668149925, "grad_norm": 8.641940116882324, "learning_rate": 2.4384483418092983e-05, "loss": 0.8742, "step": 10522 }, { "epoch": 1.8009584117747732, "grad_norm": 15.820038795471191, "learning_rate": 2.4375492337257093e-05, "loss": 1.1823, "step": 10523 }, { "epoch": 1.8011295567345542, "grad_norm": 13.849246978759766, "learning_rate": 2.436649572486678e-05, "loss": 1.0983, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_nli-pairs_loss": 1.31276535987854, "eval_nli-pairs_runtime": 4.6915, "eval_nli-pairs_samples_per_second": 42.63, "eval_nli-pairs_steps_per_second": 1.492, "eval_sts-test_pearson_cosine": 0.7694634701063054, "eval_sts-test_pearson_dot": 0.6399368427464057, "eval_sts-test_pearson_euclidean": 0.7602967870484356, "eval_sts-test_pearson_manhattan": 0.763736204622872, "eval_sts-test_pearson_max": 0.7694634701063054, "eval_sts-test_spearman_cosine": 0.7706477625418822, "eval_sts-test_spearman_dot": 0.6189211341756057, "eval_sts-test_spearman_euclidean": 0.7500665172642003, "eval_sts-test_spearman_manhattan": 0.7555098187950947, "eval_sts-test_spearman_max": 
0.7706477625418822, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_vitaminc-pairs_loss": 0.6740850806236267, "eval_vitaminc-pairs_runtime": 3.0068, "eval_vitaminc-pairs_samples_per_second": 66.515, "eval_vitaminc-pairs_steps_per_second": 2.328, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_qnli-contrastive_loss": 1.401273250579834, "eval_qnli-contrastive_runtime": 0.6665, "eval_qnli-contrastive_samples_per_second": 300.095, "eval_qnli-contrastive_steps_per_second": 10.503, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_scitail-pairs-qa_loss": 0.09272729605436325, "eval_scitail-pairs-qa_runtime": 1.8179, "eval_scitail-pairs-qa_samples_per_second": 110.018, "eval_scitail-pairs-qa_steps_per_second": 3.851, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_scitail-pairs-pos_loss": 0.6821168065071106, "eval_scitail-pairs-pos_runtime": 2.7735, "eval_scitail-pairs-pos_samples_per_second": 72.11, "eval_scitail-pairs-pos_steps_per_second": 2.524, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_xsum-pairs_loss": 0.6900661587715149, "eval_xsum-pairs_runtime": 2.6685, "eval_xsum-pairs_samples_per_second": 65.581, "eval_xsum-pairs_steps_per_second": 2.248, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_compression-pairs_loss": 0.20854850113391876, "eval_compression-pairs_runtime": 0.5199, "eval_compression-pairs_samples_per_second": 384.719, "eval_compression-pairs_steps_per_second": 13.465, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_sciq_pairs_loss": 0.5288339853286743, "eval_sciq_pairs_runtime": 9.4438, "eval_sciq_pairs_samples_per_second": 21.178, "eval_sciq_pairs_steps_per_second": 0.741, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_qasc_pairs_loss": 5.346717357635498, "eval_qasc_pairs_runtime": 2.9524, "eval_qasc_pairs_samples_per_second": 67.743, "eval_qasc_pairs_steps_per_second": 2.371, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_openbookqa_pairs_loss": 2.4358038902282715, 
"eval_openbookqa_pairs_runtime": 0.714, "eval_openbookqa_pairs_samples_per_second": 96.642, "eval_openbookqa_pairs_steps_per_second": 4.202, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_msmarco_pairs_loss": 0.9730263352394104, "eval_msmarco_pairs_runtime": 4.0426, "eval_msmarco_pairs_samples_per_second": 49.473, "eval_msmarco_pairs_steps_per_second": 1.732, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_nq_pairs_loss": 1.2295622825622559, "eval_nq_pairs_runtime": 8.7231, "eval_nq_pairs_samples_per_second": 22.928, "eval_nq_pairs_steps_per_second": 0.802, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_trivia_pairs_loss": 1.6438909769058228, "eval_trivia_pairs_runtime": 12.6883, "eval_trivia_pairs_samples_per_second": 15.763, "eval_trivia_pairs_steps_per_second": 0.552, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_quora_pairs_loss": 0.18803828954696655, "eval_quora_pairs_runtime": 1.5787, "eval_quora_pairs_samples_per_second": 126.684, "eval_quora_pairs_steps_per_second": 4.434, "step": 10524 }, { "epoch": 1.8011295567345542, "eval_gooaq_pairs_loss": 0.7770086526870728, "eval_gooaq_pairs_runtime": 2.6256, "eval_gooaq_pairs_samples_per_second": 76.174, "eval_gooaq_pairs_steps_per_second": 2.666, "step": 10524 }, { "epoch": 1.801300701694335, "grad_norm": 13.913226127624512, "learning_rate": 2.4357493586230015e-05, "loss": 1.178, "step": 10525 }, { "epoch": 1.801471846654116, "grad_norm": 16.965167999267578, "learning_rate": 2.4348485926658115e-05, "loss": 1.3099, "step": 10526 }, { "epoch": 1.801642991613897, "grad_norm": 13.210633277893066, "learning_rate": 2.433947275146558e-05, "loss": 1.0095, "step": 10527 }, { "epoch": 1.801814136573678, "grad_norm": 0.4790407419204712, "learning_rate": 2.433045406597022e-05, "loss": 0.1245, "step": 10528 }, { "epoch": 1.801985281533459, "grad_norm": 0.42059680819511414, "learning_rate": 2.432142987549306e-05, "loss": 0.1135, "step": 10529 }, { "epoch": 1.80215642649324, "grad_norm": 
17.914653778076172, "learning_rate": 2.4312400185358393e-05, "loss": 1.9481, "step": 10530 }, { "epoch": 1.8023275714530207, "grad_norm": 19.361345291137695, "learning_rate": 2.4303365000893744e-05, "loss": 2.1832, "step": 10531 }, { "epoch": 1.8024987164128017, "grad_norm": 5.194599151611328, "learning_rate": 2.4294324327429887e-05, "loss": 0.3857, "step": 10532 }, { "epoch": 1.8026698613725824, "grad_norm": 9.510862350463867, "learning_rate": 2.4285278170300835e-05, "loss": 0.4975, "step": 10533 }, { "epoch": 1.8028410063323634, "grad_norm": 0.3769650161266327, "learning_rate": 2.427622653484383e-05, "loss": 0.1078, "step": 10534 }, { "epoch": 1.8030121512921444, "grad_norm": 3.5494749546051025, "learning_rate": 2.4267169426399356e-05, "loss": 0.3379, "step": 10535 }, { "epoch": 1.8031832962519254, "grad_norm": 16.069507598876953, "learning_rate": 2.4258106850311118e-05, "loss": 1.5926, "step": 10536 }, { "epoch": 1.8033544412117064, "grad_norm": 12.284733772277832, "learning_rate": 2.424903881192605e-05, "loss": 0.9126, "step": 10537 }, { "epoch": 1.8035255861714874, "grad_norm": 11.091384887695312, "learning_rate": 2.42399653165943e-05, "loss": 0.8914, "step": 10538 }, { "epoch": 1.8036967311312682, "grad_norm": 16.73651123046875, "learning_rate": 2.4230886369669244e-05, "loss": 1.2514, "step": 10539 }, { "epoch": 1.8038678760910491, "grad_norm": 10.643180847167969, "learning_rate": 2.4221801976507495e-05, "loss": 0.7043, "step": 10540 }, { "epoch": 1.80403902105083, "grad_norm": 14.223689079284668, "learning_rate": 2.421271214246884e-05, "loss": 0.9864, "step": 10541 }, { "epoch": 1.804210166010611, "grad_norm": 10.764747619628906, "learning_rate": 2.42036168729163e-05, "loss": 0.9028, "step": 10542 }, { "epoch": 1.804381310970392, "grad_norm": 13.467955589294434, "learning_rate": 2.4194516173216097e-05, "loss": 0.8867, "step": 10543 }, { "epoch": 1.8045524559301729, "grad_norm": 5.153761863708496, "learning_rate": 2.4185410048737654e-05, "loss": 0.3129, 
"step": 10544 }, { "epoch": 1.8047236008899539, "grad_norm": 12.632559776306152, "learning_rate": 2.4176298504853604e-05, "loss": 0.8023, "step": 10545 }, { "epoch": 1.8048947458497349, "grad_norm": 12.970573425292969, "learning_rate": 2.4167181546939765e-05, "loss": 0.8488, "step": 10546 }, { "epoch": 1.8050658908095156, "grad_norm": 7.814690113067627, "learning_rate": 2.4158059180375164e-05, "loss": 0.6618, "step": 10547 }, { "epoch": 1.8052370357692966, "grad_norm": 2.9643869400024414, "learning_rate": 2.4148931410542002e-05, "loss": 0.4066, "step": 10548 }, { "epoch": 1.8054081807290774, "grad_norm": 13.034117698669434, "learning_rate": 2.413979824282568e-05, "loss": 0.9538, "step": 10549 }, { "epoch": 1.8055793256888584, "grad_norm": 14.05811595916748, "learning_rate": 2.413065968261478e-05, "loss": 1.0989, "step": 10550 }, { "epoch": 1.8057504706486394, "grad_norm": 16.089841842651367, "learning_rate": 2.412151573530107e-05, "loss": 1.5686, "step": 10551 }, { "epoch": 1.8059216156084203, "grad_norm": 18.324560165405273, "learning_rate": 2.4112366406279492e-05, "loss": 1.5343, "step": 10552 }, { "epoch": 1.8060927605682013, "grad_norm": 1.1804416179656982, "learning_rate": 2.4103211700948163e-05, "loss": 0.1895, "step": 10553 }, { "epoch": 1.8062639055279823, "grad_norm": 0.4205726683139801, "learning_rate": 2.4094051624708367e-05, "loss": 0.1073, "step": 10554 }, { "epoch": 1.806435050487763, "grad_norm": 17.26517677307129, "learning_rate": 2.408488618296457e-05, "loss": 1.4436, "step": 10555 }, { "epoch": 1.806606195447544, "grad_norm": 4.29479455947876, "learning_rate": 2.4075715381124394e-05, "loss": 0.4185, "step": 10556 }, { "epoch": 1.8067773404073248, "grad_norm": 1.8246326446533203, "learning_rate": 2.4066539224598623e-05, "loss": 0.225, "step": 10557 }, { "epoch": 1.8069484853671058, "grad_norm": 190.94227600097656, "learning_rate": 2.405735771880121e-05, "loss": 9.0874, "step": 10558 }, { "epoch": 1.8071196303268868, "grad_norm": 
0.36224856972694397, "learning_rate": 2.4048170869149248e-05, "loss": 0.1111, "step": 10559 }, { "epoch": 1.8072907752866678, "grad_norm": 5.2600626945495605, "learning_rate": 2.4038978681062995e-05, "loss": 0.5249, "step": 10560 }, { "epoch": 1.8074619202464488, "grad_norm": 20.3133544921875, "learning_rate": 2.402978115996586e-05, "loss": 2.472, "step": 10561 }, { "epoch": 1.8076330652062298, "grad_norm": 9.233213424682617, "learning_rate": 2.4020578311284383e-05, "loss": 0.7764, "step": 10562 }, { "epoch": 1.8078042101660106, "grad_norm": 10.254998207092285, "learning_rate": 2.4011370140448284e-05, "loss": 0.9778, "step": 10563 }, { "epoch": 1.8079753551257916, "grad_norm": 16.84878921508789, "learning_rate": 2.400215665289036e-05, "loss": 1.0587, "step": 10564 }, { "epoch": 1.8081465000855723, "grad_norm": 0.4186026453971863, "learning_rate": 2.3992937854046624e-05, "loss": 0.1108, "step": 10565 }, { "epoch": 1.8083176450453533, "grad_norm": 11.795625686645508, "learning_rate": 2.3983713749356138e-05, "loss": 0.978, "step": 10566 }, { "epoch": 1.8084887900051343, "grad_norm": 9.317899703979492, "learning_rate": 2.3974484344261178e-05, "loss": 0.6822, "step": 10567 }, { "epoch": 1.8086599349649153, "grad_norm": 9.422846794128418, "learning_rate": 2.3965249644207072e-05, "loss": 0.6532, "step": 10568 }, { "epoch": 1.8088310799246963, "grad_norm": 5.776649475097656, "learning_rate": 2.3956009654642336e-05, "loss": 0.3218, "step": 10569 }, { "epoch": 1.8090022248844773, "grad_norm": 15.626049995422363, "learning_rate": 2.3946764381018548e-05, "loss": 1.8483, "step": 10570 }, { "epoch": 1.8091733698442583, "grad_norm": 11.826764106750488, "learning_rate": 2.393751382879046e-05, "loss": 0.9655, "step": 10571 }, { "epoch": 1.809344514804039, "grad_norm": 13.415749549865723, "learning_rate": 2.3928258003415902e-05, "loss": 0.9834, "step": 10572 }, { "epoch": 1.80951565976382, "grad_norm": 62.958412170410156, "learning_rate": 2.391899691035582e-05, "loss": 7.5816, 
"step": 10573 }, { "epoch": 1.8096868047236008, "grad_norm": 14.074838638305664, "learning_rate": 2.390973055507428e-05, "loss": 1.2572, "step": 10574 }, { "epoch": 1.8098579496833818, "grad_norm": 3.93892240524292, "learning_rate": 2.3900458943038437e-05, "loss": 0.3304, "step": 10575 }, { "epoch": 1.8100290946431628, "grad_norm": 14.101479530334473, "learning_rate": 2.3891182079718563e-05, "loss": 0.7636, "step": 10576 }, { "epoch": 1.8102002396029437, "grad_norm": 11.888821601867676, "learning_rate": 2.388189997058802e-05, "loss": 0.9686, "step": 10577 }, { "epoch": 1.8103713845627247, "grad_norm": 17.45565414428711, "learning_rate": 2.3872612621123265e-05, "loss": 1.7602, "step": 10578 }, { "epoch": 1.8105425295225057, "grad_norm": 14.736584663391113, "learning_rate": 2.386332003680385e-05, "loss": 1.0891, "step": 10579 }, { "epoch": 1.8107136744822865, "grad_norm": 2.5605058670043945, "learning_rate": 2.3854022223112408e-05, "loss": 0.2319, "step": 10580 }, { "epoch": 1.8108848194420675, "grad_norm": 8.081610679626465, "learning_rate": 2.3844719185534667e-05, "loss": 0.5964, "step": 10581 }, { "epoch": 1.8110559644018482, "grad_norm": 2.8100156784057617, "learning_rate": 2.3835410929559428e-05, "loss": 0.2436, "step": 10582 }, { "epoch": 1.8112271093616292, "grad_norm": 12.817913055419922, "learning_rate": 2.3826097460678584e-05, "loss": 0.7147, "step": 10583 }, { "epoch": 1.8113982543214102, "grad_norm": 12.596198081970215, "learning_rate": 2.3816778784387094e-05, "loss": 0.7516, "step": 10584 }, { "epoch": 1.8115693992811912, "grad_norm": 16.32467269897461, "learning_rate": 2.3807454906182992e-05, "loss": 1.677, "step": 10585 }, { "epoch": 1.8117405442409722, "grad_norm": 9.071980476379395, "learning_rate": 2.3798125831567373e-05, "loss": 0.8723, "step": 10586 }, { "epoch": 1.8119116892007532, "grad_norm": 15.115654945373535, "learning_rate": 2.378879156604441e-05, "loss": 1.5593, "step": 10587 }, { "epoch": 1.812082834160534, "grad_norm": 4.975087642669678, 
"learning_rate": 2.3779452115121332e-05, "loss": 0.3107, "step": 10588 }, { "epoch": 1.812253979120315, "grad_norm": 10.732250213623047, "learning_rate": 2.3770107484308435e-05, "loss": 0.6253, "step": 10589 }, { "epoch": 1.8124251240800957, "grad_norm": 13.216413497924805, "learning_rate": 2.3760757679119054e-05, "loss": 1.0319, "step": 10590 }, { "epoch": 1.8125962690398767, "grad_norm": 13.91293716430664, "learning_rate": 2.3751402705069595e-05, "loss": 1.0176, "step": 10591 }, { "epoch": 1.8127674139996577, "grad_norm": 10.46422004699707, "learning_rate": 2.3742042567679505e-05, "loss": 0.6607, "step": 10592 }, { "epoch": 1.8129385589594387, "grad_norm": 7.227677345275879, "learning_rate": 2.373267727247127e-05, "loss": 0.679, "step": 10593 }, { "epoch": 1.8131097039192197, "grad_norm": 18.546497344970703, "learning_rate": 2.3723306824970443e-05, "loss": 1.5696, "step": 10594 }, { "epoch": 1.8132808488790007, "grad_norm": 1.5283751487731934, "learning_rate": 2.37139312307056e-05, "loss": 0.2094, "step": 10595 }, { "epoch": 1.8134519938387814, "grad_norm": 5.209463119506836, "learning_rate": 2.3704550495208353e-05, "loss": 0.4176, "step": 10596 }, { "epoch": 1.8136231387985624, "grad_norm": 14.793638229370117, "learning_rate": 2.3695164624013344e-05, "loss": 1.2144, "step": 10597 }, { "epoch": 1.8137942837583432, "grad_norm": 10.423752784729004, "learning_rate": 2.368577362265826e-05, "loss": 0.8261, "step": 10598 }, { "epoch": 1.8139654287181242, "grad_norm": 19.44396209716797, "learning_rate": 2.3676377496683796e-05, "loss": 1.5038, "step": 10599 }, { "epoch": 1.8141365736779052, "grad_norm": 32.127986907958984, "learning_rate": 2.366697625163369e-05, "loss": 5.4691, "step": 10600 }, { "epoch": 1.8143077186376861, "grad_norm": 12.171806335449219, "learning_rate": 2.365756989305469e-05, "loss": 0.888, "step": 10601 }, { "epoch": 1.8144788635974671, "grad_norm": 0.6755067706108093, "learning_rate": 2.3648158426496556e-05, "loss": 0.1143, "step": 10602 }, { 
"epoch": 1.8146500085572481, "grad_norm": 5.405158042907715, "learning_rate": 2.3638741857512056e-05, "loss": 0.4886, "step": 10603 }, { "epoch": 1.8148211535170289, "grad_norm": 76.35613250732422, "learning_rate": 2.3629320191657016e-05, "loss": 6.7704, "step": 10604 }, { "epoch": 1.8149922984768099, "grad_norm": 19.082910537719727, "learning_rate": 2.3619893434490187e-05, "loss": 1.0322, "step": 10605 }, { "epoch": 1.8151634434365906, "grad_norm": 15.955602645874023, "learning_rate": 2.361046159157341e-05, "loss": 1.5209, "step": 10606 }, { "epoch": 1.8153345883963716, "grad_norm": 12.689452171325684, "learning_rate": 2.360102466847146e-05, "loss": 1.0441, "step": 10607 }, { "epoch": 1.8155057333561526, "grad_norm": 2.0050296783447266, "learning_rate": 2.3591582670752154e-05, "loss": 0.2412, "step": 10608 }, { "epoch": 1.8156768783159336, "grad_norm": 14.409708976745605, "learning_rate": 2.358213560398626e-05, "loss": 1.2126, "step": 10609 }, { "epoch": 1.8158480232757146, "grad_norm": 47.97038650512695, "learning_rate": 2.3572683473747593e-05, "loss": 6.3358, "step": 10610 }, { "epoch": 1.8160191682354956, "grad_norm": 7.257107734680176, "learning_rate": 2.35632262856129e-05, "loss": 0.7416, "step": 10611 }, { "epoch": 1.8161903131952764, "grad_norm": 7.002153396606445, "learning_rate": 2.355376404516195e-05, "loss": 0.5038, "step": 10612 }, { "epoch": 1.8163614581550573, "grad_norm": 1.664169192314148, "learning_rate": 2.3544296757977458e-05, "loss": 0.212, "step": 10613 }, { "epoch": 1.816532603114838, "grad_norm": 13.75250244140625, "learning_rate": 2.3534824429645163e-05, "loss": 1.2093, "step": 10614 }, { "epoch": 1.816703748074619, "grad_norm": 12.412364959716797, "learning_rate": 2.352534706575374e-05, "loss": 1.0103, "step": 10615 }, { "epoch": 1.8168748930344, "grad_norm": 20.979095458984375, "learning_rate": 2.3515864671894853e-05, "loss": 2.3759, "step": 10616 }, { "epoch": 1.817046037994181, "grad_norm": 18.982507705688477, "learning_rate": 
2.3506377253663125e-05, "loss": 1.3329, "step": 10617 }, { "epoch": 1.817217182953962, "grad_norm": 16.52355194091797, "learning_rate": 2.3496884816656145e-05, "loss": 1.4345, "step": 10618 }, { "epoch": 1.817388327913743, "grad_norm": 11.46009635925293, "learning_rate": 2.348738736647447e-05, "loss": 0.8415, "step": 10619 }, { "epoch": 1.817559472873524, "grad_norm": 11.508846282958984, "learning_rate": 2.3477884908721605e-05, "loss": 0.9465, "step": 10620 }, { "epoch": 1.8177306178333048, "grad_norm": 0.8934975266456604, "learning_rate": 2.3468377449004018e-05, "loss": 0.1746, "step": 10621 }, { "epoch": 1.8179017627930858, "grad_norm": 0.45116984844207764, "learning_rate": 2.3458864992931125e-05, "loss": 0.1097, "step": 10622 }, { "epoch": 1.8180729077528666, "grad_norm": 5.693823337554932, "learning_rate": 2.344934754611528e-05, "loss": 0.4121, "step": 10623 }, { "epoch": 1.8182440527126476, "grad_norm": 0.5761781930923462, "learning_rate": 2.3439825114171805e-05, "loss": 0.1228, "step": 10624 }, { "epoch": 1.8184151976724285, "grad_norm": 17.807600021362305, "learning_rate": 2.343029770271893e-05, "loss": 1.0029, "step": 10625 }, { "epoch": 1.8185863426322095, "grad_norm": 14.049668312072754, "learning_rate": 2.3420765317377864e-05, "loss": 1.3839, "step": 10626 }, { "epoch": 1.8187574875919905, "grad_norm": 1.20028817653656, "learning_rate": 2.3411227963772713e-05, "loss": 0.1826, "step": 10627 }, { "epoch": 1.8189286325517715, "grad_norm": 7.2917304039001465, "learning_rate": 2.340168564753054e-05, "loss": 0.6195, "step": 10628 }, { "epoch": 1.8190997775115523, "grad_norm": 12.094229698181152, "learning_rate": 2.339213837428132e-05, "loss": 0.957, "step": 10629 }, { "epoch": 1.8192709224713333, "grad_norm": 18.31485939025879, "learning_rate": 2.338258614965796e-05, "loss": 1.9819, "step": 10630 }, { "epoch": 1.819442067431114, "grad_norm": 27.522642135620117, "learning_rate": 2.3373028979296286e-05, "loss": 5.0931, "step": 10631 }, { "epoch": 
1.819613212390895, "grad_norm": 12.018763542175293, "learning_rate": 2.3363466868835045e-05, "loss": 0.9674, "step": 10632 }, { "epoch": 1.819784357350676, "grad_norm": 10.738811492919922, "learning_rate": 2.335389982391589e-05, "loss": 0.8602, "step": 10633 }, { "epoch": 1.819955502310457, "grad_norm": 15.91081714630127, "learning_rate": 2.3344327850183398e-05, "loss": 1.3569, "step": 10634 }, { "epoch": 1.820126647270238, "grad_norm": 8.462516784667969, "learning_rate": 2.3334750953285047e-05, "loss": 0.7951, "step": 10635 }, { "epoch": 1.820297792230019, "grad_norm": 9.47549057006836, "learning_rate": 2.332516913887121e-05, "loss": 0.994, "step": 10636 }, { "epoch": 1.8204689371897997, "grad_norm": 15.810328483581543, "learning_rate": 2.3315582412595188e-05, "loss": 1.0658, "step": 10637 }, { "epoch": 1.8206400821495807, "grad_norm": 15.090182304382324, "learning_rate": 2.3305990780113156e-05, "loss": 1.1192, "step": 10638 }, { "epoch": 1.8208112271093615, "grad_norm": 13.68138599395752, "learning_rate": 2.32963942470842e-05, "loss": 0.9451, "step": 10639 }, { "epoch": 1.8209823720691425, "grad_norm": 9.710210800170898, "learning_rate": 2.3286792819170276e-05, "loss": 0.8535, "step": 10640 }, { "epoch": 1.8211535170289235, "grad_norm": 11.099365234375, "learning_rate": 2.327718650203624e-05, "loss": 0.8901, "step": 10641 }, { "epoch": 1.8213246619887045, "grad_norm": 9.152802467346191, "learning_rate": 2.326757530134985e-05, "loss": 0.7372, "step": 10642 }, { "epoch": 1.8214958069484855, "grad_norm": 9.394492149353027, "learning_rate": 2.3257959222781708e-05, "loss": 0.7694, "step": 10643 }, { "epoch": 1.8216669519082664, "grad_norm": 84.067138671875, "learning_rate": 2.3248338272005332e-05, "loss": 7.1887, "step": 10644 }, { "epoch": 1.8218380968680472, "grad_norm": 16.088125228881836, "learning_rate": 2.323871245469709e-05, "loss": 1.2665, "step": 10645 }, { "epoch": 1.8220092418278282, "grad_norm": 16.147735595703125, "learning_rate": 2.3229081776536217e-05, 
"loss": 1.6137, "step": 10646 }, { "epoch": 1.822180386787609, "grad_norm": 0.43942829966545105, "learning_rate": 2.3219446243204856e-05, "loss": 0.1141, "step": 10647 }, { "epoch": 1.82235153174739, "grad_norm": 16.777986526489258, "learning_rate": 2.320980586038795e-05, "loss": 1.2206, "step": 10648 }, { "epoch": 1.822522676707171, "grad_norm": 6.412774562835693, "learning_rate": 2.320016063377338e-05, "loss": 0.6006, "step": 10649 }, { "epoch": 1.822693821666952, "grad_norm": 13.338603019714355, "learning_rate": 2.3190510569051803e-05, "loss": 1.2322, "step": 10650 }, { "epoch": 1.822864966626733, "grad_norm": 0.5447220206260681, "learning_rate": 2.3180855671916807e-05, "loss": 0.1231, "step": 10651 }, { "epoch": 1.823036111586514, "grad_norm": 12.126182556152344, "learning_rate": 2.317119594806476e-05, "loss": 1.1241, "step": 10652 }, { "epoch": 1.8232072565462947, "grad_norm": 15.191003799438477, "learning_rate": 2.3161531403194957e-05, "loss": 1.1577, "step": 10653 }, { "epoch": 1.8233784015060757, "grad_norm": 1.0059387683868408, "learning_rate": 2.3151862043009443e-05, "loss": 0.1686, "step": 10654 }, { "epoch": 1.8235495464658564, "grad_norm": 7.575471878051758, "learning_rate": 2.314218787321321e-05, "loss": 0.6308, "step": 10655 }, { "epoch": 1.8237206914256374, "grad_norm": 17.214534759521484, "learning_rate": 2.313250889951398e-05, "loss": 1.9644, "step": 10656 }, { "epoch": 1.8238918363854184, "grad_norm": 9.227192878723145, "learning_rate": 2.3122825127622397e-05, "loss": 0.6782, "step": 10657 }, { "epoch": 1.8240629813451994, "grad_norm": 11.300491333007812, "learning_rate": 2.311313656325189e-05, "loss": 0.7853, "step": 10658 }, { "epoch": 1.8242341263049804, "grad_norm": 5.306514263153076, "learning_rate": 2.3103443212118728e-05, "loss": 0.4455, "step": 10659 }, { "epoch": 1.8244052712647614, "grad_norm": 10.548945426940918, "learning_rate": 2.3093745079942e-05, "loss": 0.7209, "step": 10660 }, { "epoch": 1.8245764162245421, "grad_norm": 
0.48353883624076843, "learning_rate": 2.3084042172443615e-05, "loss": 0.1086, "step": 10661 }, { "epoch": 1.8247475611843231, "grad_norm": 9.604976654052734, "learning_rate": 2.307433449534831e-05, "loss": 0.906, "step": 10662 }, { "epoch": 1.824918706144104, "grad_norm": 12.667705535888672, "learning_rate": 2.3064622054383628e-05, "loss": 0.981, "step": 10663 }, { "epoch": 1.8250898511038849, "grad_norm": 23.483829498291016, "learning_rate": 2.3054904855279924e-05, "loss": 3.2427, "step": 10664 }, { "epoch": 1.8252609960636659, "grad_norm": 3.277705430984497, "learning_rate": 2.3045182903770357e-05, "loss": 0.2871, "step": 10665 }, { "epoch": 1.8254321410234469, "grad_norm": 1.3834161758422852, "learning_rate": 2.303545620559089e-05, "loss": 0.1406, "step": 10666 }, { "epoch": 1.8256032859832279, "grad_norm": 15.603012084960938, "learning_rate": 2.3025724766480298e-05, "loss": 1.1761, "step": 10667 }, { "epoch": 1.8257744309430088, "grad_norm": 9.568479537963867, "learning_rate": 2.301598859218013e-05, "loss": 0.7798, "step": 10668 }, { "epoch": 1.8259455759027896, "grad_norm": 0.3612367808818817, "learning_rate": 2.3006247688434758e-05, "loss": 0.103, "step": 10669 }, { "epoch": 1.8261167208625706, "grad_norm": 13.102872848510742, "learning_rate": 2.2996502060991327e-05, "loss": 0.9425, "step": 10670 }, { "epoch": 1.8262878658223516, "grad_norm": 7.860074043273926, "learning_rate": 2.2986751715599767e-05, "loss": 0.5271, "step": 10671 }, { "epoch": 1.8264590107821324, "grad_norm": 7.906700611114502, "learning_rate": 2.2976996658012805e-05, "loss": 0.6228, "step": 10672 }, { "epoch": 1.8266301557419133, "grad_norm": 1.4205461740493774, "learning_rate": 2.296723689398593e-05, "loss": 0.1842, "step": 10673 }, { "epoch": 1.8268013007016943, "grad_norm": 12.818958282470703, "learning_rate": 2.295747242927742e-05, "loss": 0.9891, "step": 10674 }, { "epoch": 1.8269724456614753, "grad_norm": 17.719608306884766, "learning_rate": 2.294770326964833e-05, "loss": 1.4046, 
"step": 10675 }, { "epoch": 1.8271435906212563, "grad_norm": 16.1218318939209, "learning_rate": 2.2937929420862464e-05, "loss": 1.673, "step": 10676 }, { "epoch": 1.8273147355810373, "grad_norm": 5.908822536468506, "learning_rate": 2.2928150888686418e-05, "loss": 0.5234, "step": 10677 }, { "epoch": 1.827485880540818, "grad_norm": 4.266765117645264, "learning_rate": 2.2918367678889542e-05, "loss": 0.2725, "step": 10678 }, { "epoch": 1.827657025500599, "grad_norm": 3.0567710399627686, "learning_rate": 2.2908579797243936e-05, "loss": 0.2653, "step": 10679 }, { "epoch": 1.8278281704603798, "grad_norm": 2.0204389095306396, "learning_rate": 2.2898787249524474e-05, "loss": 0.1948, "step": 10680 }, { "epoch": 1.8279993154201608, "grad_norm": 11.196292877197266, "learning_rate": 2.2888990041508768e-05, "loss": 0.8565, "step": 10681 }, { "epoch": 1.8281704603799418, "grad_norm": 1.3900225162506104, "learning_rate": 2.2879188178977194e-05, "loss": 0.2037, "step": 10682 }, { "epoch": 1.8283416053397228, "grad_norm": 0.42360720038414, "learning_rate": 2.2869381667712853e-05, "loss": 0.1081, "step": 10683 }, { "epoch": 1.8285127502995038, "grad_norm": 5.662801742553711, "learning_rate": 2.285957051350161e-05, "loss": 0.3401, "step": 10684 }, { "epoch": 1.8286838952592848, "grad_norm": 18.79254150390625, "learning_rate": 2.2849754722132058e-05, "loss": 1.8463, "step": 10685 }, { "epoch": 1.8288550402190655, "grad_norm": 5.144514083862305, "learning_rate": 2.2839934299395526e-05, "loss": 0.2921, "step": 10686 }, { "epoch": 1.8290261851788465, "grad_norm": 18.2552490234375, "learning_rate": 2.283010925108608e-05, "loss": 2.1816, "step": 10687 }, { "epoch": 1.8291973301386273, "grad_norm": 10.40813159942627, "learning_rate": 2.2820279583000514e-05, "loss": 0.7952, "step": 10688 }, { "epoch": 1.8293684750984083, "grad_norm": 0.37699419260025024, "learning_rate": 2.2810445300938335e-05, "loss": 0.1125, "step": 10689 }, { "epoch": 1.8295396200581893, "grad_norm": 0.33640703558921814, 
"learning_rate": 2.2800606410701817e-05, "loss": 0.0997, "step": 10690 }, { "epoch": 1.8297107650179703, "grad_norm": 10.160209655761719, "learning_rate": 2.279076291809588e-05, "loss": 0.9097, "step": 10691 }, { "epoch": 1.8298819099777512, "grad_norm": 15.933124542236328, "learning_rate": 2.278091482892823e-05, "loss": 1.1795, "step": 10692 }, { "epoch": 1.8300530549375322, "grad_norm": 3.6139233112335205, "learning_rate": 2.277106214900923e-05, "loss": 0.3587, "step": 10693 }, { "epoch": 1.830224199897313, "grad_norm": 2.53991961479187, "learning_rate": 2.2761204884152e-05, "loss": 0.2216, "step": 10694 }, { "epoch": 1.830395344857094, "grad_norm": 1.1256651878356934, "learning_rate": 2.275134304017231e-05, "loss": 0.1157, "step": 10695 }, { "epoch": 1.8305664898168748, "grad_norm": 11.670063018798828, "learning_rate": 2.2741476622888704e-05, "loss": 0.7447, "step": 10696 }, { "epoch": 1.8307376347766557, "grad_norm": 7.091588020324707, "learning_rate": 2.273160563812234e-05, "loss": 0.6081, "step": 10697 }, { "epoch": 1.8309087797364367, "grad_norm": 0.3013341426849365, "learning_rate": 2.272173009169715e-05, "loss": 0.102, "step": 10698 }, { "epoch": 1.8310799246962177, "grad_norm": 1.8344836235046387, "learning_rate": 2.271184998943969e-05, "loss": 0.217, "step": 10699 }, { "epoch": 1.8312510696559987, "grad_norm": 8.233963966369629, "learning_rate": 2.2701965337179254e-05, "loss": 0.6062, "step": 10700 }, { "epoch": 1.8314222146157797, "grad_norm": 15.439166069030762, "learning_rate": 2.26920761407478e-05, "loss": 1.3537, "step": 10701 }, { "epoch": 1.8315933595755605, "grad_norm": 11.700836181640625, "learning_rate": 2.2682182405979963e-05, "loss": 1.0057, "step": 10702 }, { "epoch": 1.8317645045353415, "grad_norm": 15.735676765441895, "learning_rate": 2.2672284138713066e-05, "loss": 1.1717, "step": 10703 }, { "epoch": 1.8319356494951222, "grad_norm": 8.892084121704102, "learning_rate": 2.2662381344787106e-05, "loss": 0.674, "step": 10704 }, { "epoch": 
1.8321067944549032, "grad_norm": 1.718575119972229, "learning_rate": 2.265247403004473e-05, "loss": 0.232, "step": 10705 }, { "epoch": 1.8322779394146842, "grad_norm": 12.543073654174805, "learning_rate": 2.2642562200331283e-05, "loss": 1.1239, "step": 10706 }, { "epoch": 1.8324490843744652, "grad_norm": 17.805330276489258, "learning_rate": 2.2632645861494755e-05, "loss": 1.7145, "step": 10707 }, { "epoch": 1.8326202293342462, "grad_norm": 21.37221908569336, "learning_rate": 2.2622725019385797e-05, "loss": 1.7524, "step": 10708 }, { "epoch": 1.8327913742940272, "grad_norm": 17.15726089477539, "learning_rate": 2.2612799679857726e-05, "loss": 1.6768, "step": 10709 }, { "epoch": 1.832962519253808, "grad_norm": 7.450227737426758, "learning_rate": 2.2602869848766504e-05, "loss": 0.5338, "step": 10710 }, { "epoch": 1.833133664213589, "grad_norm": 1.2225533723831177, "learning_rate": 2.2592935531970742e-05, "loss": 0.1943, "step": 10711 }, { "epoch": 1.8333048091733697, "grad_norm": 8.618548393249512, "learning_rate": 2.258299673533171e-05, "loss": 0.6149, "step": 10712 }, { "epoch": 1.8334759541331507, "grad_norm": 5.420083045959473, "learning_rate": 2.2573053464713314e-05, "loss": 0.5281, "step": 10713 }, { "epoch": 1.8336470990929317, "grad_norm": 12.915985107421875, "learning_rate": 2.2563105725982094e-05, "loss": 1.0498, "step": 10714 }, { "epoch": 1.8338182440527127, "grad_norm": 6.939027786254883, "learning_rate": 2.2553153525007227e-05, "loss": 0.5104, "step": 10715 }, { "epoch": 1.8339893890124936, "grad_norm": 9.001492500305176, "learning_rate": 2.2543196867660534e-05, "loss": 0.6066, "step": 10716 }, { "epoch": 1.8341605339722746, "grad_norm": 0.4177084267139435, "learning_rate": 2.2533235759816454e-05, "loss": 0.1135, "step": 10717 }, { "epoch": 1.8343316789320554, "grad_norm": 11.667120933532715, "learning_rate": 2.2523270207352052e-05, "loss": 0.8969, "step": 10718 }, { "epoch": 1.8345028238918364, "grad_norm": 0.3338482975959778, "learning_rate": 
2.2513300216147023e-05, "loss": 0.1068, "step": 10719 }, { "epoch": 1.8346739688516172, "grad_norm": 8.736976623535156, "learning_rate": 2.250332579208367e-05, "loss": 0.7248, "step": 10720 }, { "epoch": 1.8348451138113981, "grad_norm": 16.588937759399414, "learning_rate": 2.2493346941046922e-05, "loss": 0.6916, "step": 10721 }, { "epoch": 1.8350162587711791, "grad_norm": 29.95481300354004, "learning_rate": 2.248336366892433e-05, "loss": 5.4476, "step": 10722 }, { "epoch": 1.8351874037309601, "grad_norm": 13.606313705444336, "learning_rate": 2.2473375981606024e-05, "loss": 1.2716, "step": 10723 }, { "epoch": 1.835358548690741, "grad_norm": 12.299969673156738, "learning_rate": 2.2463383884984752e-05, "loss": 1.0662, "step": 10724 }, { "epoch": 1.835529693650522, "grad_norm": 0.40356501936912537, "learning_rate": 2.2453387384955877e-05, "loss": 0.1096, "step": 10725 }, { "epoch": 1.835700838610303, "grad_norm": 10.544236183166504, "learning_rate": 2.2443386487417345e-05, "loss": 0.9266, "step": 10726 }, { "epoch": 1.8358719835700839, "grad_norm": 13.301421165466309, "learning_rate": 2.243338119826969e-05, "loss": 0.965, "step": 10727 }, { "epoch": 1.8360431285298648, "grad_norm": 0.3932670056819916, "learning_rate": 2.2423371523416065e-05, "loss": 0.1045, "step": 10728 }, { "epoch": 1.8362142734896456, "grad_norm": 79.77033996582031, "learning_rate": 2.2413357468762182e-05, "loss": 7.0694, "step": 10729 }, { "epoch": 1.8363854184494266, "grad_norm": 14.764008522033691, "learning_rate": 2.2403339040216348e-05, "loss": 1.4573, "step": 10730 }, { "epoch": 1.8365565634092076, "grad_norm": 6.939533710479736, "learning_rate": 2.239331624368946e-05, "loss": 0.6298, "step": 10731 }, { "epoch": 1.8367277083689886, "grad_norm": 13.607168197631836, "learning_rate": 2.238328908509496e-05, "loss": 1.4319, "step": 10732 }, { "epoch": 1.8368988533287696, "grad_norm": 0.8993772864341736, "learning_rate": 2.2373257570348924e-05, "loss": 0.1695, "step": 10733 }, { "epoch": 
1.8370699982885506, "grad_norm": 12.150834083557129, "learning_rate": 2.236322170536992e-05, "loss": 1.3032, "step": 10734 }, { "epoch": 1.8372411432483313, "grad_norm": 15.254476547241211, "learning_rate": 2.235318149607916e-05, "loss": 1.0559, "step": 10735 }, { "epoch": 1.8374122882081123, "grad_norm": 25.905391693115234, "learning_rate": 2.234313694840035e-05, "loss": 5.0835, "step": 10736 }, { "epoch": 1.837583433167893, "grad_norm": 8.142675399780273, "learning_rate": 2.2333088068259816e-05, "loss": 0.9263, "step": 10737 }, { "epoch": 1.837754578127674, "grad_norm": 60.67076110839844, "learning_rate": 2.2323034861586385e-05, "loss": 7.6488, "step": 10738 }, { "epoch": 1.837925723087455, "grad_norm": 28.15801429748535, "learning_rate": 2.23129773343115e-05, "loss": 4.9479, "step": 10739 }, { "epoch": 1.838096868047236, "grad_norm": 22.687803268432617, "learning_rate": 2.230291549236907e-05, "loss": 2.7711, "step": 10740 }, { "epoch": 1.838268013007017, "grad_norm": 10.584364891052246, "learning_rate": 2.2292849341695644e-05, "loss": 0.6533, "step": 10741 }, { "epoch": 1.838439157966798, "grad_norm": 10.927813529968262, "learning_rate": 2.2282778888230224e-05, "loss": 0.8783, "step": 10742 }, { "epoch": 1.8386103029265788, "grad_norm": 6.0828986167907715, "learning_rate": 2.227270413791442e-05, "loss": 0.4529, "step": 10743 }, { "epoch": 1.8387814478863598, "grad_norm": 9.146825790405273, "learning_rate": 2.226262509669235e-05, "loss": 0.8066, "step": 10744 }, { "epoch": 1.8389525928461405, "grad_norm": 18.116863250732422, "learning_rate": 2.225254177051065e-05, "loss": 1.9311, "step": 10745 }, { "epoch": 1.8391237378059215, "grad_norm": 15.593891143798828, "learning_rate": 2.2242454165318507e-05, "loss": 1.6272, "step": 10746 }, { "epoch": 1.8392948827657025, "grad_norm": 10.521073341369629, "learning_rate": 2.223236228706761e-05, "loss": 0.8065, "step": 10747 }, { "epoch": 1.8394660277254835, "grad_norm": 16.30931854248047, "learning_rate": 
2.22222661417122e-05, "loss": 1.3972, "step": 10748 }, { "epoch": 1.8396371726852645, "grad_norm": 13.134941101074219, "learning_rate": 2.2212165735209004e-05, "loss": 0.9875, "step": 10749 }, { "epoch": 1.8398083176450455, "grad_norm": 15.051519393920898, "learning_rate": 2.2202061073517288e-05, "loss": 1.1269, "step": 10750 }, { "epoch": 1.8399794626048263, "grad_norm": 24.25493049621582, "learning_rate": 2.219195216259881e-05, "loss": 5.1378, "step": 10751 }, { "epoch": 1.8401506075646072, "grad_norm": 17.18368148803711, "learning_rate": 2.2181839008417835e-05, "loss": 1.8185, "step": 10752 }, { "epoch": 1.840321752524388, "grad_norm": 6.578704833984375, "learning_rate": 2.2171721616941154e-05, "loss": 0.3513, "step": 10753 }, { "epoch": 1.840492897484169, "grad_norm": 1.1263909339904785, "learning_rate": 2.2161599994138032e-05, "loss": 0.1746, "step": 10754 }, { "epoch": 1.84066404244395, "grad_norm": 18.18806266784668, "learning_rate": 2.2151474145980255e-05, "loss": 1.4899, "step": 10755 }, { "epoch": 1.840835187403731, "grad_norm": 7.874475479125977, "learning_rate": 2.2141344078442076e-05, "loss": 0.6977, "step": 10756 }, { "epoch": 1.841006332363512, "grad_norm": 9.029332160949707, "learning_rate": 2.2131209797500253e-05, "loss": 0.6667, "step": 10757 }, { "epoch": 1.841177477323293, "grad_norm": 0.9133652448654175, "learning_rate": 2.2121071309134033e-05, "loss": 0.1765, "step": 10758 }, { "epoch": 1.8413486222830737, "grad_norm": 2.2835850715637207, "learning_rate": 2.211092861932513e-05, "loss": 0.1994, "step": 10759 }, { "epoch": 1.8415197672428547, "grad_norm": 11.585211753845215, "learning_rate": 2.210078173405775e-05, "loss": 0.8204, "step": 10760 }, { "epoch": 1.8416909122026355, "grad_norm": 4.341249465942383, "learning_rate": 2.209063065931857e-05, "loss": 0.2601, "step": 10761 }, { "epoch": 1.8418620571624165, "grad_norm": 12.828780174255371, "learning_rate": 2.2080475401096746e-05, "loss": 1.0181, "step": 10762 }, { "epoch": 1.8420332021221975, 
"grad_norm": 16.33918571472168, "learning_rate": 2.2070315965383886e-05, "loss": 1.1493, "step": 10763 }, { "epoch": 1.8422043470819784, "grad_norm": 1.3210057020187378, "learning_rate": 2.2060152358174063e-05, "loss": 0.2038, "step": 10764 }, { "epoch": 1.8423754920417594, "grad_norm": 7.309770107269287, "learning_rate": 2.2049984585463844e-05, "loss": 0.7551, "step": 10765 }, { "epoch": 1.8425466370015404, "grad_norm": 11.696361541748047, "learning_rate": 2.203981265325222e-05, "loss": 0.8308, "step": 10766 }, { "epoch": 1.8427177819613212, "grad_norm": 12.948139190673828, "learning_rate": 2.2029636567540642e-05, "loss": 1.0667, "step": 10767 }, { "epoch": 1.8428889269211022, "grad_norm": 14.713232040405273, "learning_rate": 2.2019456334333023e-05, "loss": 1.2956, "step": 10768 }, { "epoch": 1.843060071880883, "grad_norm": 9.00324535369873, "learning_rate": 2.2009271959635705e-05, "loss": 0.6042, "step": 10769 }, { "epoch": 1.843231216840664, "grad_norm": 10.209758758544922, "learning_rate": 2.19990834494575e-05, "loss": 0.7411, "step": 10770 }, { "epoch": 1.843402361800445, "grad_norm": 8.930167198181152, "learning_rate": 2.1988890809809632e-05, "loss": 0.6751, "step": 10771 }, { "epoch": 1.843573506760226, "grad_norm": 7.664986610412598, "learning_rate": 2.1978694046705773e-05, "loss": 0.774, "step": 10772 }, { "epoch": 1.843744651720007, "grad_norm": 16.12793731689453, "learning_rate": 2.1968493166162032e-05, "loss": 1.6914, "step": 10773 }, { "epoch": 1.8439157966797879, "grad_norm": 0.4523647129535675, "learning_rate": 2.1958288174196947e-05, "loss": 0.111, "step": 10774 }, { "epoch": 1.8440869416395689, "grad_norm": 14.644676208496094, "learning_rate": 2.1948079076831465e-05, "loss": 1.1938, "step": 10775 }, { "epoch": 1.8442580865993496, "grad_norm": 10.372179985046387, "learning_rate": 2.1937865880088994e-05, "loss": 0.8615, "step": 10776 }, { "epoch": 1.8444292315591306, "grad_norm": 4.100748538970947, "learning_rate": 2.1927648589995305e-05, "loss": 
0.2751, "step": 10777 }, { "epoch": 1.8446003765189114, "grad_norm": 13.971488952636719, "learning_rate": 2.191742721257865e-05, "loss": 1.1253, "step": 10778 }, { "epoch": 1.8447715214786924, "grad_norm": 13.470091819763184, "learning_rate": 2.1907201753869618e-05, "loss": 1.1792, "step": 10779 }, { "epoch": 1.8449426664384734, "grad_norm": 4.998398303985596, "learning_rate": 2.1896972219901277e-05, "loss": 0.4235, "step": 10780 }, { "epoch": 1.8451138113982544, "grad_norm": 4.809243202209473, "learning_rate": 2.1886738616709038e-05, "loss": 0.3071, "step": 10781 }, { "epoch": 1.8452849563580354, "grad_norm": 1.6509063243865967, "learning_rate": 2.187650095033077e-05, "loss": 0.1932, "step": 10782 }, { "epoch": 1.8454561013178163, "grad_norm": 55.93296813964844, "learning_rate": 2.1866259226806687e-05, "loss": 6.7501, "step": 10783 }, { "epoch": 1.8456272462775971, "grad_norm": 6.096895694732666, "learning_rate": 2.185601345217945e-05, "loss": 0.5613, "step": 10784 }, { "epoch": 1.845798391237378, "grad_norm": 18.828163146972656, "learning_rate": 2.1845763632494046e-05, "loss": 1.9223, "step": 10785 }, { "epoch": 1.8459695361971589, "grad_norm": 12.429137229919434, "learning_rate": 2.183550977379791e-05, "loss": 1.0899, "step": 10786 }, { "epoch": 1.8461406811569399, "grad_norm": 33.194793701171875, "learning_rate": 2.182525188214083e-05, "loss": 5.2709, "step": 10787 }, { "epoch": 1.8463118261167208, "grad_norm": 12.165261268615723, "learning_rate": 2.181498996357497e-05, "loss": 0.7739, "step": 10788 }, { "epoch": 1.8464829710765018, "grad_norm": 22.796131134033203, "learning_rate": 2.1804724024154883e-05, "loss": 2.6537, "step": 10789 }, { "epoch": 1.8466541160362828, "grad_norm": 5.031498908996582, "learning_rate": 2.1794454069937485e-05, "loss": 0.5269, "step": 10790 }, { "epoch": 1.8468252609960638, "grad_norm": 10.853936195373535, "learning_rate": 2.1784180106982066e-05, "loss": 0.8698, "step": 10791 }, { "epoch": 1.8469964059558446, "grad_norm": 
12.59391975402832, "learning_rate": 2.177390214135028e-05, "loss": 0.7118, "step": 10792 }, { "epoch": 1.8471675509156256, "grad_norm": 7.0883564949035645, "learning_rate": 2.1763620179106137e-05, "loss": 0.8072, "step": 10793 }, { "epoch": 1.8473386958754063, "grad_norm": 11.56019401550293, "learning_rate": 2.1753334226316023e-05, "loss": 0.8239, "step": 10794 }, { "epoch": 1.8475098408351873, "grad_norm": 7.500759124755859, "learning_rate": 2.1743044289048654e-05, "loss": 0.6449, "step": 10795 }, { "epoch": 1.8476809857949683, "grad_norm": 1.5768293142318726, "learning_rate": 2.1732750373375108e-05, "loss": 0.1889, "step": 10796 }, { "epoch": 1.8478521307547493, "grad_norm": 5.387068271636963, "learning_rate": 2.1722452485368804e-05, "loss": 0.5263, "step": 10797 }, { "epoch": 1.8480232757145303, "grad_norm": 18.951486587524414, "learning_rate": 2.1712150631105528e-05, "loss": 1.7336, "step": 10798 }, { "epoch": 1.8481944206743113, "grad_norm": 1.4376548528671265, "learning_rate": 2.1701844816663387e-05, "loss": 0.2012, "step": 10799 }, { "epoch": 1.848365565634092, "grad_norm": 18.728057861328125, "learning_rate": 2.1691535048122818e-05, "loss": 1.8572, "step": 10800 }, { "epoch": 1.848536710593873, "grad_norm": 16.776987075805664, "learning_rate": 2.1681221331566605e-05, "loss": 1.7367, "step": 10801 }, { "epoch": 1.8487078555536538, "grad_norm": 14.22872257232666, "learning_rate": 2.167090367307986e-05, "loss": 1.1277, "step": 10802 }, { "epoch": 1.8488790005134348, "grad_norm": 0.353922963142395, "learning_rate": 2.1660582078750006e-05, "loss": 0.1017, "step": 10803 }, { "epoch": 1.8490501454732158, "grad_norm": 25.504093170166016, "learning_rate": 2.1650256554666804e-05, "loss": 5.2835, "step": 10804 }, { "epoch": 1.8492212904329968, "grad_norm": 12.376045227050781, "learning_rate": 2.1639927106922334e-05, "loss": 0.9438, "step": 10805 }, { "epoch": 1.8493924353927778, "grad_norm": 9.680256843566895, "learning_rate": 2.162959374161098e-05, "loss": 0.7306, 
"step": 10806 }, { "epoch": 1.8495635803525587, "grad_norm": 59.66267776489258, "learning_rate": 2.1619256464829433e-05, "loss": 6.9786, "step": 10807 }, { "epoch": 1.8497347253123395, "grad_norm": 10.332234382629395, "learning_rate": 2.160891528267672e-05, "loss": 1.0796, "step": 10808 }, { "epoch": 1.8499058702721205, "grad_norm": 19.80345344543457, "learning_rate": 2.159857020125415e-05, "loss": 1.5882, "step": 10809 }, { "epoch": 1.8500770152319013, "grad_norm": 0.30056577920913696, "learning_rate": 2.158822122666533e-05, "loss": 0.1019, "step": 10810 }, { "epoch": 1.8502481601916823, "grad_norm": 14.505559921264648, "learning_rate": 2.157786836501618e-05, "loss": 1.0006, "step": 10811 }, { "epoch": 1.8504193051514632, "grad_norm": 3.341278076171875, "learning_rate": 2.156751162241489e-05, "loss": 0.2496, "step": 10812 }, { "epoch": 1.8505904501112442, "grad_norm": 0.8923941850662231, "learning_rate": 2.1557151004971965e-05, "loss": 0.1792, "step": 10813 }, { "epoch": 1.8507615950710252, "grad_norm": 0.4024316370487213, "learning_rate": 2.154678651880018e-05, "loss": 0.1063, "step": 10814 }, { "epoch": 1.8509327400308062, "grad_norm": 20.183387756347656, "learning_rate": 2.1536418170014595e-05, "loss": 2.3376, "step": 10815 }, { "epoch": 1.851103884990587, "grad_norm": 16.09478759765625, "learning_rate": 2.1526045964732556e-05, "loss": 1.4545, "step": 10816 }, { "epoch": 1.851275029950368, "grad_norm": 0.4229826033115387, "learning_rate": 2.1515669909073675e-05, "loss": 0.1115, "step": 10817 }, { "epoch": 1.8514461749101487, "grad_norm": 4.297288417816162, "learning_rate": 2.1505290009159836e-05, "loss": 0.2564, "step": 10818 }, { "epoch": 1.8516173198699297, "grad_norm": 13.234524726867676, "learning_rate": 2.1494906271115225e-05, "loss": 0.8752, "step": 10819 }, { "epoch": 1.8517884648297107, "grad_norm": 10.590407371520996, "learning_rate": 2.1484518701066216e-05, "loss": 0.7467, "step": 10820 }, { "epoch": 1.8519596097894917, "grad_norm": 
12.723146438598633, "learning_rate": 2.147412730514153e-05, "loss": 1.0623, "step": 10821 }, { "epoch": 1.8521307547492727, "grad_norm": 2.4097933769226074, "learning_rate": 2.146373208947208e-05, "loss": 0.1814, "step": 10822 }, { "epoch": 1.8523018997090537, "grad_norm": 10.440643310546875, "learning_rate": 2.1453333060191083e-05, "loss": 0.8701, "step": 10823 }, { "epoch": 1.8524730446688347, "grad_norm": 14.675512313842773, "learning_rate": 2.1442930223433952e-05, "loss": 1.1106, "step": 10824 }, { "epoch": 1.8526441896286154, "grad_norm": 4.249091148376465, "learning_rate": 2.143252358533841e-05, "loss": 0.303, "step": 10825 }, { "epoch": 1.8528153345883964, "grad_norm": 8.596614837646484, "learning_rate": 2.1422113152044354e-05, "loss": 0.5038, "step": 10826 }, { "epoch": 1.8529864795481772, "grad_norm": 7.743111610412598, "learning_rate": 2.1411698929693996e-05, "loss": 0.5755, "step": 10827 }, { "epoch": 1.8531576245079582, "grad_norm": 14.712414741516113, "learning_rate": 2.1401280924431694e-05, "loss": 1.0919, "step": 10828 }, { "epoch": 1.8533287694677392, "grad_norm": 6.543259620666504, "learning_rate": 2.1390859142404124e-05, "loss": 0.5625, "step": 10829 }, { "epoch": 1.8534999144275202, "grad_norm": 4.716846942901611, "learning_rate": 2.1380433589760144e-05, "loss": 0.4401, "step": 10830 }, { "epoch": 1.8536710593873011, "grad_norm": 0.7594923973083496, "learning_rate": 2.1370004272650837e-05, "loss": 0.1551, "step": 10831 }, { "epoch": 1.8538422043470821, "grad_norm": 10.25179672241211, "learning_rate": 2.1359571197229526e-05, "loss": 0.7173, "step": 10832 }, { "epoch": 1.854013349306863, "grad_norm": 15.10999584197998, "learning_rate": 2.1349134369651732e-05, "loss": 1.1694, "step": 10833 }, { "epoch": 1.854184494266644, "grad_norm": 0.3838544487953186, "learning_rate": 2.1338693796075205e-05, "loss": 0.107, "step": 10834 }, { "epoch": 1.8543556392264247, "grad_norm": 10.520954132080078, "learning_rate": 2.13282494826599e-05, "loss": 0.7538, 
"step": 10835 }, { "epoch": 1.8545267841862056, "grad_norm": 8.797869682312012, "learning_rate": 2.1317801435567974e-05, "loss": 1.0373, "step": 10836 }, { "epoch": 1.8546979291459866, "grad_norm": 1.6293644905090332, "learning_rate": 2.130734966096379e-05, "loss": 0.2308, "step": 10837 }, { "epoch": 1.8548690741057676, "grad_norm": 0.39286115765571594, "learning_rate": 2.1296894165013907e-05, "loss": 0.11, "step": 10838 }, { "epoch": 1.8550402190655486, "grad_norm": 18.098834991455078, "learning_rate": 2.128643495388708e-05, "loss": 1.1785, "step": 10839 }, { "epoch": 1.8552113640253296, "grad_norm": 12.845710754394531, "learning_rate": 2.1275972033754284e-05, "loss": 1.1849, "step": 10840 }, { "epoch": 1.8553825089851104, "grad_norm": 11.085079193115234, "learning_rate": 2.126550541078863e-05, "loss": 0.9651, "step": 10841 }, { "epoch": 1.8555536539448914, "grad_norm": 5.548030376434326, "learning_rate": 2.1255035091165456e-05, "loss": 0.5438, "step": 10842 }, { "epoch": 1.8557247989046721, "grad_norm": 1.3577691316604614, "learning_rate": 2.1244561081062262e-05, "loss": 0.1574, "step": 10843 }, { "epoch": 1.8558959438644531, "grad_norm": 11.23949146270752, "learning_rate": 2.123408338665873e-05, "loss": 0.9191, "step": 10844 }, { "epoch": 1.856067088824234, "grad_norm": 13.015701293945312, "learning_rate": 2.1223602014136712e-05, "loss": 0.7807, "step": 10845 }, { "epoch": 1.856238233784015, "grad_norm": 9.170563697814941, "learning_rate": 2.1213116969680237e-05, "loss": 0.6894, "step": 10846 }, { "epoch": 1.856409378743796, "grad_norm": 24.079917907714844, "learning_rate": 2.1202628259475498e-05, "loss": 3.4337, "step": 10847 }, { "epoch": 1.856580523703577, "grad_norm": 27.1368350982666, "learning_rate": 2.1192135889710844e-05, "loss": 5.3312, "step": 10848 }, { "epoch": 1.8567516686633578, "grad_norm": 9.313065528869629, "learning_rate": 2.118163986657679e-05, "loss": 0.5324, "step": 10849 }, { "epoch": 1.8569228136231388, "grad_norm": 14.801884651184082, 
"learning_rate": 2.1171140196266005e-05, "loss": 1.1781, "step": 10850 }, { "epoch": 1.8570939585829196, "grad_norm": 61.00002670288086, "learning_rate": 2.1160636884973315e-05, "loss": 7.985, "step": 10851 }, { "epoch": 1.8572651035427006, "grad_norm": 10.613734245300293, "learning_rate": 2.1150129938895692e-05, "loss": 0.8008, "step": 10852 }, { "epoch": 1.8574362485024816, "grad_norm": 11.31788444519043, "learning_rate": 2.1139619364232243e-05, "loss": 0.9352, "step": 10853 }, { "epoch": 1.8576073934622626, "grad_norm": 2.86678409576416, "learning_rate": 2.1129105167184223e-05, "loss": 0.2214, "step": 10854 }, { "epoch": 1.8577785384220435, "grad_norm": 12.258174896240234, "learning_rate": 2.1118587353955027e-05, "loss": 0.9906, "step": 10855 }, { "epoch": 1.8579496833818245, "grad_norm": 4.081489562988281, "learning_rate": 2.1108065930750177e-05, "loss": 0.2961, "step": 10856 }, { "epoch": 1.8581208283416053, "grad_norm": 2.702458381652832, "learning_rate": 2.109754090377733e-05, "loss": 0.3577, "step": 10857 }, { "epoch": 1.8582919733013863, "grad_norm": 15.138577461242676, "learning_rate": 2.108701227924627e-05, "loss": 1.23, "step": 10858 }, { "epoch": 1.858463118261167, "grad_norm": 8.741546630859375, "learning_rate": 2.10764800633689e-05, "loss": 0.7976, "step": 10859 }, { "epoch": 1.858634263220948, "grad_norm": 13.997965812683105, "learning_rate": 2.1065944262359238e-05, "loss": 1.2063, "step": 10860 }, { "epoch": 1.858805408180729, "grad_norm": 12.155943870544434, "learning_rate": 2.105540488243342e-05, "loss": 0.8342, "step": 10861 }, { "epoch": 1.85897655314051, "grad_norm": 13.503629684448242, "learning_rate": 2.1044861929809715e-05, "loss": 1.0784, "step": 10862 }, { "epoch": 1.859147698100291, "grad_norm": 12.979475021362305, "learning_rate": 2.1034315410708455e-05, "loss": 0.839, "step": 10863 }, { "epoch": 1.859318843060072, "grad_norm": 13.34347915649414, "learning_rate": 2.1023765331352134e-05, "loss": 0.8821, "step": 10864 }, { "epoch": 
1.8594899880198528, "grad_norm": 0.528994619846344, "learning_rate": 2.1013211697965273e-05, "loss": 0.1203, "step": 10865 }, { "epoch": 1.8596611329796338, "grad_norm": 7.5725297927856445, "learning_rate": 2.100265451677457e-05, "loss": 1.0058, "step": 10866 }, { "epoch": 1.8598322779394145, "grad_norm": 17.048770904541016, "learning_rate": 2.099209379400875e-05, "loss": 1.1325, "step": 10867 }, { "epoch": 1.8600034228991955, "grad_norm": 9.666221618652344, "learning_rate": 2.0981529535898686e-05, "loss": 0.6904, "step": 10868 }, { "epoch": 1.8601745678589765, "grad_norm": 0.9909911155700684, "learning_rate": 2.097096174867726e-05, "loss": 0.1645, "step": 10869 }, { "epoch": 1.8603457128187575, "grad_norm": 5.6123833656311035, "learning_rate": 2.096039043857953e-05, "loss": 0.4169, "step": 10870 }, { "epoch": 1.8605168577785385, "grad_norm": 15.576091766357422, "learning_rate": 2.094981561184255e-05, "loss": 1.4976, "step": 10871 }, { "epoch": 1.8606880027383195, "grad_norm": 15.352063179016113, "learning_rate": 2.09392372747055e-05, "loss": 1.7295, "step": 10872 }, { "epoch": 1.8608591476981002, "grad_norm": 7.716357707977295, "learning_rate": 2.0928655433409614e-05, "loss": 0.722, "step": 10873 }, { "epoch": 1.8610302926578812, "grad_norm": 9.084063529968262, "learning_rate": 2.0918070094198195e-05, "loss": 0.7495, "step": 10874 }, { "epoch": 1.8612014376176622, "grad_norm": 19.156829833984375, "learning_rate": 2.09074812633166e-05, "loss": 2.4875, "step": 10875 }, { "epoch": 1.861372582577443, "grad_norm": 16.93818473815918, "learning_rate": 2.0896888947012265e-05, "loss": 1.6597, "step": 10876 }, { "epoch": 1.861543727537224, "grad_norm": 15.14963436126709, "learning_rate": 2.0886293151534666e-05, "loss": 1.2295, "step": 10877 }, { "epoch": 1.861714872497005, "grad_norm": 5.938241958618164, "learning_rate": 2.087569388313534e-05, "loss": 0.4823, "step": 10878 }, { "epoch": 1.861886017456786, "grad_norm": 0.44269004464149475, "learning_rate": 
2.0865091148067874e-05, "loss": 0.1112, "step": 10879 }, { "epoch": 1.862057162416567, "grad_norm": 0.7413665056228638, "learning_rate": 2.085448495258789e-05, "loss": 0.1629, "step": 10880 }, { "epoch": 1.862228307376348, "grad_norm": 2.094348192214966, "learning_rate": 2.0843875302953067e-05, "loss": 0.2247, "step": 10881 }, { "epoch": 1.8623994523361287, "grad_norm": 3.51492977142334, "learning_rate": 2.08332622054231e-05, "loss": 0.2427, "step": 10882 }, { "epoch": 1.8625705972959097, "grad_norm": 27.432876586914062, "learning_rate": 2.0822645666259758e-05, "loss": 5.348, "step": 10883 }, { "epoch": 1.8627417422556904, "grad_norm": 5.467396259307861, "learning_rate": 2.0812025691726795e-05, "loss": 0.4927, "step": 10884 }, { "epoch": 1.8629128872154714, "grad_norm": 6.952663421630859, "learning_rate": 2.080140228809002e-05, "loss": 0.5584, "step": 10885 }, { "epoch": 1.8630840321752524, "grad_norm": 11.318577766418457, "learning_rate": 2.079077546161725e-05, "loss": 0.7655, "step": 10886 }, { "epoch": 1.8632551771350334, "grad_norm": 7.115924835205078, "learning_rate": 2.0780145218578337e-05, "loss": 0.8828, "step": 10887 }, { "epoch": 1.8634263220948144, "grad_norm": 7.851391792297363, "learning_rate": 2.076951156524513e-05, "loss": 0.7149, "step": 10888 }, { "epoch": 1.8635974670545954, "grad_norm": 10.235215187072754, "learning_rate": 2.0758874507891514e-05, "loss": 0.8558, "step": 10889 }, { "epoch": 1.8637686120143762, "grad_norm": 1.4381293058395386, "learning_rate": 2.0748234052793353e-05, "loss": 0.2012, "step": 10890 }, { "epoch": 1.8639397569741571, "grad_norm": 36.89596939086914, "learning_rate": 2.0737590206228547e-05, "loss": 5.7988, "step": 10891 }, { "epoch": 1.864110901933938, "grad_norm": 9.610917091369629, "learning_rate": 2.072694297447697e-05, "loss": 0.8569, "step": 10892 }, { "epoch": 1.864282046893719, "grad_norm": 0.40948453545570374, "learning_rate": 2.0716292363820497e-05, "loss": 0.1047, "step": 10893 }, { "epoch": 1.8644531918535, 
"grad_norm": 8.652310371398926, "learning_rate": 2.0705638380543027e-05, "loss": 0.7544, "step": 10894 }, { "epoch": 1.8646243368132809, "grad_norm": 11.809414863586426, "learning_rate": 2.069498103093041e-05, "loss": 0.7871, "step": 10895 }, { "epoch": 1.8647954817730619, "grad_norm": 9.175246238708496, "learning_rate": 2.0684320321270502e-05, "loss": 0.7865, "step": 10896 }, { "epoch": 1.8649666267328429, "grad_norm": 11.313461303710938, "learning_rate": 2.067365625785314e-05, "loss": 0.773, "step": 10897 }, { "epoch": 1.8651377716926236, "grad_norm": 3.796443223953247, "learning_rate": 2.0662988846970137e-05, "loss": 0.2601, "step": 10898 }, { "epoch": 1.8653089166524046, "grad_norm": 0.4691271185874939, "learning_rate": 2.065231809491528e-05, "loss": 0.1052, "step": 10899 }, { "epoch": 1.8654800616121854, "grad_norm": 12.024251937866211, "learning_rate": 2.064164400798433e-05, "loss": 1.0157, "step": 10900 }, { "epoch": 1.8656512065719664, "grad_norm": 8.918675422668457, "learning_rate": 2.0630966592475006e-05, "loss": 0.7565, "step": 10901 }, { "epoch": 1.8658223515317474, "grad_norm": 20.736446380615234, "learning_rate": 2.062028585468701e-05, "loss": 2.5025, "step": 10902 }, { "epoch": 1.8659934964915283, "grad_norm": 10.270184516906738, "learning_rate": 2.0609601800921984e-05, "loss": 0.806, "step": 10903 }, { "epoch": 1.8661646414513093, "grad_norm": 5.702853202819824, "learning_rate": 2.0598914437483534e-05, "loss": 0.5466, "step": 10904 }, { "epoch": 1.8663357864110903, "grad_norm": 37.74603271484375, "learning_rate": 2.0588223770677247e-05, "loss": 5.7011, "step": 10905 }, { "epoch": 1.866506931370871, "grad_norm": 12.338099479675293, "learning_rate": 2.057752980681059e-05, "loss": 0.8662, "step": 10906 }, { "epoch": 1.866678076330652, "grad_norm": 11.297871589660645, "learning_rate": 2.056683255219306e-05, "loss": 0.7641, "step": 10907 }, { "epoch": 1.8668492212904328, "grad_norm": 10.080599784851074, "learning_rate": 2.055613201313601e-05, "loss": 
0.7618, "step": 10908 }, { "epoch": 1.8670203662502138, "grad_norm": 15.190640449523926, "learning_rate": 2.054542819595282e-05, "loss": 1.5234, "step": 10909 }, { "epoch": 1.8671915112099948, "grad_norm": 6.907302379608154, "learning_rate": 2.0534721106958715e-05, "loss": 0.7357, "step": 10910 }, { "epoch": 1.8673626561697758, "grad_norm": 8.1217622756958, "learning_rate": 2.052401075247093e-05, "loss": 0.7367, "step": 10911 }, { "epoch": 1.8675338011295568, "grad_norm": 13.733673095703125, "learning_rate": 2.0513297138808555e-05, "loss": 0.9242, "step": 10912 }, { "epoch": 1.8677049460893378, "grad_norm": 22.06145668029785, "learning_rate": 2.0502580272292677e-05, "loss": 4.8898, "step": 10913 }, { "epoch": 1.8678760910491186, "grad_norm": 3.0712060928344727, "learning_rate": 2.0491860159246223e-05, "loss": 0.2254, "step": 10914 }, { "epoch": 1.8680472360088995, "grad_norm": 6.636814594268799, "learning_rate": 2.0481136805994104e-05, "loss": 0.4198, "step": 10915 }, { "epoch": 1.8682183809686803, "grad_norm": 9.061782836914062, "learning_rate": 2.0470410218863106e-05, "loss": 1.0116, "step": 10916 }, { "epoch": 1.8683895259284613, "grad_norm": 14.0357666015625, "learning_rate": 2.045968040418193e-05, "loss": 1.1322, "step": 10917 }, { "epoch": 1.8685606708882423, "grad_norm": 21.54892921447754, "learning_rate": 2.0448947368281183e-05, "loss": 1.9333, "step": 10918 }, { "epoch": 1.8687318158480233, "grad_norm": 17.386098861694336, "learning_rate": 2.0438211117493374e-05, "loss": 1.9079, "step": 10919 }, { "epoch": 1.8689029608078043, "grad_norm": 10.340031623840332, "learning_rate": 2.0427471658152902e-05, "loss": 0.8175, "step": 10920 }, { "epoch": 1.8690741057675853, "grad_norm": 3.9318366050720215, "learning_rate": 2.0416728996596073e-05, "loss": 0.4227, "step": 10921 }, { "epoch": 1.869245250727366, "grad_norm": 11.366128921508789, "learning_rate": 2.0405983139161067e-05, "loss": 0.6995, "step": 10922 }, { "epoch": 1.869416395687147, "grad_norm": 
55.6683235168457, "learning_rate": 2.0395234092187956e-05, "loss": 6.6016, "step": 10923 }, { "epoch": 1.869587540646928, "grad_norm": 26.88568687438965, "learning_rate": 2.0384481862018697e-05, "loss": 5.1706, "step": 10924 }, { "epoch": 1.8697586856067088, "grad_norm": 6.457910537719727, "learning_rate": 2.0373726454997106e-05, "loss": 0.4757, "step": 10925 }, { "epoch": 1.8699298305664898, "grad_norm": 5.67574405670166, "learning_rate": 2.0362967877468916e-05, "loss": 0.3994, "step": 10926 }, { "epoch": 1.8701009755262707, "grad_norm": 25.18231201171875, "learning_rate": 2.0352206135781683e-05, "loss": 4.8458, "step": 10927 }, { "epoch": 1.8702721204860517, "grad_norm": 4.942233562469482, "learning_rate": 2.0341441236284865e-05, "loss": 0.3523, "step": 10928 }, { "epoch": 1.8704432654458327, "grad_norm": 17.266733169555664, "learning_rate": 2.033067318532976e-05, "loss": 1.4119, "step": 10929 }, { "epoch": 1.8706144104056137, "grad_norm": 9.579458236694336, "learning_rate": 2.0319901989269536e-05, "loss": 0.6989, "step": 10930 }, { "epoch": 1.8707855553653945, "grad_norm": 4.9515275955200195, "learning_rate": 2.0309127654459216e-05, "loss": 0.3033, "step": 10931 }, { "epoch": 1.8709567003251755, "grad_norm": 21.85822105407715, "learning_rate": 2.029835018725567e-05, "loss": 5.0032, "step": 10932 }, { "epoch": 1.8711278452849562, "grad_norm": 10.328897476196289, "learning_rate": 2.028756959401762e-05, "loss": 0.8418, "step": 10933 }, { "epoch": 1.8712989902447372, "grad_norm": 2.095024585723877, "learning_rate": 2.0276785881105638e-05, "loss": 0.2213, "step": 10934 }, { "epoch": 1.8714701352045182, "grad_norm": 0.40235358476638794, "learning_rate": 2.0265999054882124e-05, "loss": 0.114, "step": 10935 }, { "epoch": 1.8716412801642992, "grad_norm": 8.539332389831543, "learning_rate": 2.0255209121711313e-05, "loss": 0.9111, "step": 10936 }, { "epoch": 1.8718124251240802, "grad_norm": 0.4323277175426483, "learning_rate": 2.0244416087959302e-05, "loss": 0.1163, 
"step": 10937 }, { "epoch": 1.8719835700838612, "grad_norm": 19.622129440307617, "learning_rate": 2.023361995999399e-05, "loss": 2.144, "step": 10938 }, { "epoch": 1.872154715043642, "grad_norm": 11.604806900024414, "learning_rate": 2.0222820744185106e-05, "loss": 1.0451, "step": 10939 }, { "epoch": 1.872325860003423, "grad_norm": 1.3092888593673706, "learning_rate": 2.021201844690421e-05, "loss": 0.1836, "step": 10940 }, { "epoch": 1.8724970049632037, "grad_norm": 8.795955657958984, "learning_rate": 2.020121307452466e-05, "loss": 0.8064, "step": 10941 }, { "epoch": 1.8726681499229847, "grad_norm": 7.463593006134033, "learning_rate": 2.019040463342165e-05, "loss": 0.7289, "step": 10942 }, { "epoch": 1.8728392948827657, "grad_norm": 0.5047159790992737, "learning_rate": 2.0179593129972178e-05, "loss": 0.109, "step": 10943 }, { "epoch": 1.8730104398425467, "grad_norm": 3.0091607570648193, "learning_rate": 2.016877857055504e-05, "loss": 0.2721, "step": 10944 }, { "epoch": 1.8731815848023277, "grad_norm": 14.303910255432129, "learning_rate": 2.015796096155085e-05, "loss": 1.0736, "step": 10945 }, { "epoch": 1.8733527297621086, "grad_norm": 14.880782127380371, "learning_rate": 2.0147140309342008e-05, "loss": 1.3198, "step": 10946 }, { "epoch": 1.8735238747218894, "grad_norm": 9.587919235229492, "learning_rate": 2.013631662031271e-05, "loss": 0.6118, "step": 10947 }, { "epoch": 1.8736950196816704, "grad_norm": 26.846824645996094, "learning_rate": 2.0125489900848974e-05, "loss": 5.6594, "step": 10948 }, { "epoch": 1.8738661646414512, "grad_norm": 117.05803680419922, "learning_rate": 2.0114660157338545e-05, "loss": 8.7741, "step": 10949 }, { "epoch": 1.8740373096012322, "grad_norm": 10.23105239868164, "learning_rate": 2.0103827396171017e-05, "loss": 0.7479, "step": 10950 }, { "epoch": 1.8742084545610131, "grad_norm": 6.994273662567139, "learning_rate": 2.0092991623737713e-05, "loss": 0.6129, "step": 10951 }, { "epoch": 1.8743795995207941, "grad_norm": 6.12116813659668, 
"learning_rate": 2.008215284643178e-05, "loss": 0.5343, "step": 10952 }, { "epoch": 1.8745507444805751, "grad_norm": 5.879600524902344, "learning_rate": 2.0071311070648076e-05, "loss": 0.4668, "step": 10953 }, { "epoch": 1.8747218894403561, "grad_norm": 16.475170135498047, "learning_rate": 2.006046630278331e-05, "loss": 1.3974, "step": 10954 }, { "epoch": 1.8748930344001369, "grad_norm": 13.854368209838867, "learning_rate": 2.004961854923587e-05, "loss": 1.2775, "step": 10955 }, { "epoch": 1.8750641793599179, "grad_norm": 9.903826713562012, "learning_rate": 2.003876781640598e-05, "loss": 0.7992, "step": 10956 }, { "epoch": 1.8752353243196986, "grad_norm": 72.08663177490234, "learning_rate": 2.0027914110695558e-05, "loss": 7.8587, "step": 10957 }, { "epoch": 1.8754064692794796, "grad_norm": 5.309252738952637, "learning_rate": 2.001705743850833e-05, "loss": 0.5551, "step": 10958 }, { "epoch": 1.8755776142392606, "grad_norm": 2.497652530670166, "learning_rate": 2.0006197806249737e-05, "loss": 0.2584, "step": 10959 }, { "epoch": 1.8757487591990416, "grad_norm": 8.305780410766602, "learning_rate": 1.9995335220326985e-05, "loss": 0.537, "step": 10960 }, { "epoch": 1.8759199041588226, "grad_norm": 16.306188583374023, "learning_rate": 1.998446968714901e-05, "loss": 1.6359, "step": 10961 }, { "epoch": 1.8760910491186036, "grad_norm": 23.607818603515625, "learning_rate": 1.99736012131265e-05, "loss": 5.1471, "step": 10962 }, { "epoch": 1.8762621940783843, "grad_norm": 9.052146911621094, "learning_rate": 1.9962729804671868e-05, "loss": 0.6284, "step": 10963 }, { "epoch": 1.8764333390381653, "grad_norm": 15.171002388000488, "learning_rate": 1.9951855468199255e-05, "loss": 1.168, "step": 10964 }, { "epoch": 1.876604483997946, "grad_norm": 5.359767913818359, "learning_rate": 1.9940978210124538e-05, "loss": 0.2921, "step": 10965 }, { "epoch": 1.876775628957727, "grad_norm": 2.0385851860046387, "learning_rate": 1.9930098036865322e-05, "loss": 0.2157, "step": 10966 }, { "epoch": 
1.876946773917508, "grad_norm": 17.289926528930664, "learning_rate": 1.9919214954840918e-05, "loss": 1.251, "step": 10967 }, { "epoch": 1.877117918877289, "grad_norm": 5.154585361480713, "learning_rate": 1.9908328970472357e-05, "loss": 0.4082, "step": 10968 }, { "epoch": 1.87728906383707, "grad_norm": 14.859457015991211, "learning_rate": 1.98974400901824e-05, "loss": 0.9993, "step": 10969 }, { "epoch": 1.877460208796851, "grad_norm": 21.52395248413086, "learning_rate": 1.9886548320395496e-05, "loss": 2.0328, "step": 10970 }, { "epoch": 1.8776313537566318, "grad_norm": 0.5694449543952942, "learning_rate": 1.9875653667537804e-05, "loss": 0.1123, "step": 10971 }, { "epoch": 1.8778024987164128, "grad_norm": 8.036969184875488, "learning_rate": 1.9864756138037188e-05, "loss": 0.5987, "step": 10972 }, { "epoch": 1.8779736436761936, "grad_norm": 15.053860664367676, "learning_rate": 1.9853855738323204e-05, "loss": 1.2788, "step": 10973 }, { "epoch": 1.8781447886359746, "grad_norm": 3.836268901824951, "learning_rate": 1.9842952474827102e-05, "loss": 0.2737, "step": 10974 }, { "epoch": 1.8783159335957555, "grad_norm": 16.48788833618164, "learning_rate": 1.983204635398183e-05, "loss": 0.7088, "step": 10975 }, { "epoch": 1.8784870785555365, "grad_norm": 15.461217880249023, "learning_rate": 1.9821137382222012e-05, "loss": 1.0075, "step": 10976 }, { "epoch": 1.8786582235153175, "grad_norm": 11.009471893310547, "learning_rate": 1.9810225565983953e-05, "loss": 0.9398, "step": 10977 }, { "epoch": 1.8788293684750985, "grad_norm": 18.990861892700195, "learning_rate": 1.9799310911705654e-05, "loss": 1.8358, "step": 10978 }, { "epoch": 1.8790005134348795, "grad_norm": 8.872893333435059, "learning_rate": 1.978839342582675e-05, "loss": 0.5334, "step": 10979 }, { "epoch": 1.8791716583946603, "grad_norm": 31.12604331970215, "learning_rate": 1.977747311478861e-05, "loss": 4.8944, "step": 10980 }, { "epoch": 1.8793428033544413, "grad_norm": 16.22289276123047, "learning_rate": 
1.976654998503421e-05, "loss": 1.1549, "step": 10981 }, { "epoch": 1.879513948314222, "grad_norm": 4.3253493309021, "learning_rate": 1.9755624043008223e-05, "loss": 0.5676, "step": 10982 }, { "epoch": 1.879685093274003, "grad_norm": 10.190035820007324, "learning_rate": 1.9744695295156966e-05, "loss": 0.7164, "step": 10983 }, { "epoch": 1.879856238233784, "grad_norm": 23.617250442504883, "learning_rate": 1.9733763747928415e-05, "loss": 4.9841, "step": 10984 }, { "epoch": 1.880027383193565, "grad_norm": 11.775053024291992, "learning_rate": 1.9722829407772208e-05, "loss": 1.0246, "step": 10985 }, { "epoch": 1.880198528153346, "grad_norm": 9.426666259765625, "learning_rate": 1.971189228113961e-05, "loss": 0.7519, "step": 10986 }, { "epoch": 1.880369673113127, "grad_norm": 15.487780570983887, "learning_rate": 1.970095237448355e-05, "loss": 1.5476, "step": 10987 }, { "epoch": 1.8805408180729077, "grad_norm": 6.212828159332275, "learning_rate": 1.9690009694258593e-05, "loss": 0.4695, "step": 10988 }, { "epoch": 1.8807119630326887, "grad_norm": 12.039491653442383, "learning_rate": 1.9679064246920923e-05, "loss": 0.8013, "step": 10989 }, { "epoch": 1.8808831079924695, "grad_norm": 13.665583610534668, "learning_rate": 1.9668116038928373e-05, "loss": 1.0688, "step": 10990 }, { "epoch": 1.8810542529522505, "grad_norm": 0.3609735071659088, "learning_rate": 1.9657165076740426e-05, "loss": 0.0986, "step": 10991 }, { "epoch": 1.8812253979120315, "grad_norm": 1.1990466117858887, "learning_rate": 1.9646211366818123e-05, "loss": 0.1739, "step": 10992 }, { "epoch": 1.8813965428718125, "grad_norm": 0.3588649332523346, "learning_rate": 1.9635254915624214e-05, "loss": 0.1065, "step": 10993 }, { "epoch": 1.8815676878315934, "grad_norm": 15.997937202453613, "learning_rate": 1.9624295729622984e-05, "loss": 1.9312, "step": 10994 }, { "epoch": 1.8817388327913744, "grad_norm": 0.5036999583244324, "learning_rate": 1.961333381528041e-05, "loss": 0.1178, "step": 10995 }, { "epoch": 
1.8819099777511552, "grad_norm": 0.4245542287826538, "learning_rate": 1.9602369179063987e-05, "loss": 0.108, "step": 10996 }, { "epoch": 1.8820811227109362, "grad_norm": 0.3913329243659973, "learning_rate": 1.9591401827442914e-05, "loss": 0.1115, "step": 10997 }, { "epoch": 1.882252267670717, "grad_norm": 7.960232734680176, "learning_rate": 1.9580431766887904e-05, "loss": 0.5283, "step": 10998 }, { "epoch": 1.882423412630498, "grad_norm": 2.926450729370117, "learning_rate": 1.9569459003871348e-05, "loss": 0.2357, "step": 10999 }, { "epoch": 1.882594557590279, "grad_norm": 26.785545349121094, "learning_rate": 1.955848354486716e-05, "loss": 4.9466, "step": 11000 }, { "epoch": 1.88276570255006, "grad_norm": 13.84740161895752, "learning_rate": 1.9547505396350893e-05, "loss": 1.0723, "step": 11001 }, { "epoch": 1.882936847509841, "grad_norm": 8.930294036865234, "learning_rate": 1.9536524564799673e-05, "loss": 0.6947, "step": 11002 }, { "epoch": 1.883107992469622, "grad_norm": 13.230756759643555, "learning_rate": 1.95255410566922e-05, "loss": 1.076, "step": 11003 }, { "epoch": 1.8832791374294027, "grad_norm": 24.585920333862305, "learning_rate": 1.951455487850877e-05, "loss": 4.8605, "step": 11004 }, { "epoch": 1.8834502823891837, "grad_norm": 11.91327953338623, "learning_rate": 1.950356603673123e-05, "loss": 1.0938, "step": 11005 }, { "epoch": 1.8836214273489644, "grad_norm": 3.097719430923462, "learning_rate": 1.9492574537843024e-05, "loss": 0.2453, "step": 11006 }, { "epoch": 1.8837925723087454, "grad_norm": 21.164554595947266, "learning_rate": 1.9481580388329148e-05, "loss": 2.6965, "step": 11007 }, { "epoch": 1.8839637172685264, "grad_norm": 21.902578353881836, "learning_rate": 1.9470583594676167e-05, "loss": 2.3096, "step": 11008 }, { "epoch": 1.8841348622283074, "grad_norm": 18.65900421142578, "learning_rate": 1.9459584163372203e-05, "loss": 2.1908, "step": 11009 }, { "epoch": 1.8843060071880884, "grad_norm": 7.1866350173950195, "learning_rate": 
1.9448582100906946e-05, "loss": 0.5824, "step": 11010 }, { "epoch": 1.8844771521478694, "grad_norm": 7.539106369018555, "learning_rate": 1.9437577413771616e-05, "loss": 0.5361, "step": 11011 }, { "epoch": 1.8846482971076501, "grad_norm": 14.392020225524902, "learning_rate": 1.9426570108459007e-05, "loss": 0.9287, "step": 11012 }, { "epoch": 1.8848194420674311, "grad_norm": 12.855627059936523, "learning_rate": 1.9415560191463444e-05, "loss": 0.7432, "step": 11013 }, { "epoch": 1.884990587027212, "grad_norm": 12.463791847229004, "learning_rate": 1.940454766928079e-05, "loss": 1.1663, "step": 11014 }, { "epoch": 1.8851617319869929, "grad_norm": 10.976508140563965, "learning_rate": 1.9393532548408447e-05, "loss": 0.9216, "step": 11015 }, { "epoch": 1.8853328769467739, "grad_norm": 8.354618072509766, "learning_rate": 1.938251483534536e-05, "loss": 0.6809, "step": 11016 }, { "epoch": 1.8855040219065549, "grad_norm": 10.187040328979492, "learning_rate": 1.937149453659199e-05, "loss": 0.7098, "step": 11017 }, { "epoch": 1.8856751668663359, "grad_norm": 12.203680038452148, "learning_rate": 1.9360471658650336e-05, "loss": 1.0308, "step": 11018 }, { "epoch": 1.8858463118261168, "grad_norm": 10.70466136932373, "learning_rate": 1.9349446208023903e-05, "loss": 0.8023, "step": 11019 }, { "epoch": 1.8860174567858976, "grad_norm": 7.011648178100586, "learning_rate": 1.9338418191217736e-05, "loss": 0.6093, "step": 11020 }, { "epoch": 1.8861886017456786, "grad_norm": 10.923543930053711, "learning_rate": 1.9327387614738375e-05, "loss": 0.9573, "step": 11021 }, { "epoch": 1.8863597467054594, "grad_norm": 10.400789260864258, "learning_rate": 1.931635448509386e-05, "loss": 0.7391, "step": 11022 }, { "epoch": 1.8865308916652404, "grad_norm": 9.70921802520752, "learning_rate": 1.930531880879378e-05, "loss": 0.8157, "step": 11023 }, { "epoch": 1.8867020366250213, "grad_norm": 20.018142700195312, "learning_rate": 1.9294280592349193e-05, "loss": 2.599, "step": 11024 }, { "epoch": 
1.8868731815848023, "grad_norm": 22.01080322265625, "learning_rate": 1.9283239842272658e-05, "loss": 4.7794, "step": 11025 }, { "epoch": 1.8870443265445833, "grad_norm": 1.018039584159851, "learning_rate": 1.9272196565078238e-05, "loss": 0.1807, "step": 11026 }, { "epoch": 1.8872154715043643, "grad_norm": 10.157571792602539, "learning_rate": 1.926115076728148e-05, "loss": 0.7124, "step": 11027 }, { "epoch": 1.8873866164641453, "grad_norm": 17.982688903808594, "learning_rate": 1.9250102455399427e-05, "loss": 1.9376, "step": 11028 }, { "epoch": 1.887557761423926, "grad_norm": 13.251009941101074, "learning_rate": 1.9239051635950588e-05, "loss": 1.438, "step": 11029 }, { "epoch": 1.887728906383707, "grad_norm": 1.4690003395080566, "learning_rate": 1.9227998315454976e-05, "loss": 0.1359, "step": 11030 }, { "epoch": 1.8879000513434878, "grad_norm": 6.90065860748291, "learning_rate": 1.9216942500434055e-05, "loss": 0.538, "step": 11031 }, { "epoch": 1.8880711963032688, "grad_norm": 10.520479202270508, "learning_rate": 1.920588419741078e-05, "loss": 0.7702, "step": 11032 }, { "epoch": 1.8882423412630498, "grad_norm": 2.779036521911621, "learning_rate": 1.919482341290956e-05, "loss": 0.2164, "step": 11033 }, { "epoch": 1.8884134862228308, "grad_norm": 11.28299331665039, "learning_rate": 1.9183760153456286e-05, "loss": 0.85, "step": 11034 }, { "epoch": 1.8885846311826118, "grad_norm": 10.533530235290527, "learning_rate": 1.917269442557828e-05, "loss": 0.727, "step": 11035 }, { "epoch": 1.8887557761423928, "grad_norm": 9.156173706054688, "learning_rate": 1.9161626235804368e-05, "loss": 0.7499, "step": 11036 }, { "epoch": 1.8889269211021735, "grad_norm": 10.982351303100586, "learning_rate": 1.9150555590664754e-05, "loss": 0.7712, "step": 11037 }, { "epoch": 1.8890980660619545, "grad_norm": 10.883585929870605, "learning_rate": 1.913948249669118e-05, "loss": 0.8736, "step": 11038 }, { "epoch": 1.8892692110217353, "grad_norm": 9.689952850341797, "learning_rate": 
1.9128406960416748e-05, "loss": 0.8077, "step": 11039 }, { "epoch": 1.8894403559815163, "grad_norm": 15.513401985168457, "learning_rate": 1.911732898837608e-05, "loss": 1.2321, "step": 11040 }, { "epoch": 1.8896115009412973, "grad_norm": 29.893096923828125, "learning_rate": 1.9106248587105154e-05, "loss": 1.2545, "step": 11041 }, { "epoch": 1.8897826459010783, "grad_norm": 16.580673217773438, "learning_rate": 1.909516576314147e-05, "loss": 1.2963, "step": 11042 }, { "epoch": 1.8899537908608592, "grad_norm": 2.048891544342041, "learning_rate": 1.9084080523023862e-05, "loss": 0.3144, "step": 11043 }, { "epoch": 1.8901249358206402, "grad_norm": 13.597352027893066, "learning_rate": 1.9072992873292676e-05, "loss": 1.3371, "step": 11044 }, { "epoch": 1.890296080780421, "grad_norm": 8.856637954711914, "learning_rate": 1.9061902820489628e-05, "loss": 0.6013, "step": 11045 }, { "epoch": 1.890467225740202, "grad_norm": 2.2777340412139893, "learning_rate": 1.9050810371157865e-05, "loss": 0.2153, "step": 11046 }, { "epoch": 1.8906383706999828, "grad_norm": 11.621878623962402, "learning_rate": 1.9039715531841946e-05, "loss": 0.9325, "step": 11047 }, { "epoch": 1.8908095156597637, "grad_norm": 8.110366821289062, "learning_rate": 1.902861830908785e-05, "loss": 0.5972, "step": 11048 }, { "epoch": 1.8909806606195447, "grad_norm": 13.743197441101074, "learning_rate": 1.901751870944295e-05, "loss": 1.2966, "step": 11049 }, { "epoch": 1.8911518055793257, "grad_norm": 0.4667554795742035, "learning_rate": 1.9006416739456028e-05, "loss": 0.1137, "step": 11050 }, { "epoch": 1.8913229505391067, "grad_norm": 11.86550521850586, "learning_rate": 1.8995312405677262e-05, "loss": 0.6825, "step": 11051 }, { "epoch": 1.8914940954988877, "grad_norm": 2.6213161945343018, "learning_rate": 1.8984205714658226e-05, "loss": 0.2455, "step": 11052 }, { "epoch": 1.8916652404586685, "grad_norm": 9.102559089660645, "learning_rate": 1.8973096672951887e-05, "loss": 0.6712, "step": 11053 }, { "epoch": 
1.8918363854184495, "grad_norm": 14.183549880981445, "learning_rate": 1.896198528711258e-05, "loss": 1.0724, "step": 11054 }, { "epoch": 1.8920075303782302, "grad_norm": 14.953872680664062, "learning_rate": 1.8950871563696058e-05, "loss": 1.1564, "step": 11055 }, { "epoch": 1.8921786753380112, "grad_norm": 12.92641830444336, "learning_rate": 1.893975550925943e-05, "loss": 0.9092, "step": 11056 }, { "epoch": 1.8923498202977922, "grad_norm": 0.4172672629356384, "learning_rate": 1.892863713036119e-05, "loss": 0.1184, "step": 11057 }, { "epoch": 1.8925209652575732, "grad_norm": 15.714910507202148, "learning_rate": 1.891751643356119e-05, "loss": 1.2615, "step": 11058 }, { "epoch": 1.8926921102173542, "grad_norm": 12.251708984375, "learning_rate": 1.8906393425420654e-05, "loss": 0.7931, "step": 11059 }, { "epoch": 1.8928632551771352, "grad_norm": 75.99552917480469, "learning_rate": 1.889526811250219e-05, "loss": 6.9769, "step": 11060 }, { "epoch": 1.893034400136916, "grad_norm": 10.3998441696167, "learning_rate": 1.8884140501369725e-05, "loss": 0.7291, "step": 11061 }, { "epoch": 1.893205545096697, "grad_norm": 19.825651168823242, "learning_rate": 1.8873010598588583e-05, "loss": 1.6145, "step": 11062 }, { "epoch": 1.8933766900564777, "grad_norm": 0.5082797408103943, "learning_rate": 1.886187841072542e-05, "loss": 0.1126, "step": 11063 }, { "epoch": 1.8935478350162587, "grad_norm": 13.398571968078613, "learning_rate": 1.8850743944348244e-05, "loss": 0.9987, "step": 11064 }, { "epoch": 1.8937189799760397, "grad_norm": 2.0857551097869873, "learning_rate": 1.8839607206026393e-05, "loss": 0.2257, "step": 11065 }, { "epoch": 1.8938901249358207, "grad_norm": 13.001497268676758, "learning_rate": 1.882846820233058e-05, "loss": 1.0808, "step": 11066 }, { "epoch": 1.8940612698956016, "grad_norm": 0.49432164430618286, "learning_rate": 1.8817326939832828e-05, "loss": 0.1112, "step": 11067 }, { "epoch": 1.8942324148553826, "grad_norm": 9.867591857910156, "learning_rate": 
1.8806183425106497e-05, "loss": 0.7748, "step": 11068 }, { "epoch": 1.8944035598151634, "grad_norm": 7.640634536743164, "learning_rate": 1.8795037664726276e-05, "loss": 0.5331, "step": 11069 }, { "epoch": 1.8945747047749444, "grad_norm": 13.66068172454834, "learning_rate": 1.878388966526818e-05, "loss": 1.0858, "step": 11070 }, { "epoch": 1.8947458497347252, "grad_norm": 0.38369643688201904, "learning_rate": 1.877273943330954e-05, "loss": 0.1069, "step": 11071 }, { "epoch": 1.8949169946945061, "grad_norm": 9.891222953796387, "learning_rate": 1.8761586975429022e-05, "loss": 0.6478, "step": 11072 }, { "epoch": 1.8950881396542871, "grad_norm": 6.3234453201293945, "learning_rate": 1.875043229820658e-05, "loss": 0.5472, "step": 11073 }, { "epoch": 1.8952592846140681, "grad_norm": 17.456462860107422, "learning_rate": 1.8739275408223497e-05, "loss": 1.913, "step": 11074 }, { "epoch": 1.895430429573849, "grad_norm": 0.3114881217479706, "learning_rate": 1.872811631206236e-05, "loss": 0.1028, "step": 11075 }, { "epoch": 1.89560157453363, "grad_norm": 10.522832870483398, "learning_rate": 1.871695501630703e-05, "loss": 0.7006, "step": 11076 }, { "epoch": 1.895772719493411, "grad_norm": 1.5503888130187988, "learning_rate": 1.870579152754273e-05, "loss": 0.1625, "step": 11077 }, { "epoch": 1.8959438644531919, "grad_norm": 9.525410652160645, "learning_rate": 1.869462585235588e-05, "loss": 0.8122, "step": 11078 }, { "epoch": 1.8961150094129728, "grad_norm": 5.633816719055176, "learning_rate": 1.86834579973343e-05, "loss": 0.4279, "step": 11079 }, { "epoch": 1.8962861543727536, "grad_norm": 13.81872844696045, "learning_rate": 1.8672287969066995e-05, "loss": 1.3355, "step": 11080 }, { "epoch": 1.8964572993325346, "grad_norm": 1.4858864545822144, "learning_rate": 1.8661115774144333e-05, "loss": 0.1859, "step": 11081 }, { "epoch": 1.8966284442923156, "grad_norm": 24.20428466796875, "learning_rate": 1.8649941419157897e-05, "loss": 2.8056, "step": 11082 }, { "epoch": 1.8967995892520966, 
"grad_norm": 11.029074668884277, "learning_rate": 1.8638764910700606e-05, "loss": 0.7509, "step": 11083 }, { "epoch": 1.8969707342118776, "grad_norm": 55.44279861450195, "learning_rate": 1.862758625536658e-05, "loss": 6.6988, "step": 11084 }, { "epoch": 1.8971418791716586, "grad_norm": 15.203019142150879, "learning_rate": 1.861640545975128e-05, "loss": 1.4733, "step": 11085 }, { "epoch": 1.8973130241314393, "grad_norm": 47.18984603881836, "learning_rate": 1.860522253045135e-05, "loss": 6.36, "step": 11086 }, { "epoch": 1.8974841690912203, "grad_norm": 9.319621086120605, "learning_rate": 1.8594037474064767e-05, "loss": 0.6479, "step": 11087 }, { "epoch": 1.897655314051001, "grad_norm": 3.714374542236328, "learning_rate": 1.858285029719072e-05, "loss": 0.3392, "step": 11088 }, { "epoch": 1.897826459010782, "grad_norm": 19.683815002441406, "learning_rate": 1.857166100642966e-05, "loss": 2.3137, "step": 11089 }, { "epoch": 1.897997603970563, "grad_norm": 6.602670192718506, "learning_rate": 1.8560469608383293e-05, "loss": 0.6688, "step": 11090 }, { "epoch": 1.898168748930344, "grad_norm": 7.918621063232422, "learning_rate": 1.854927610965455e-05, "loss": 0.7917, "step": 11091 }, { "epoch": 1.898339893890125, "grad_norm": 13.220263481140137, "learning_rate": 1.8538080516847615e-05, "loss": 1.0715, "step": 11092 }, { "epoch": 1.898511038849906, "grad_norm": 2.334465742111206, "learning_rate": 1.8526882836567907e-05, "loss": 0.3213, "step": 11093 }, { "epoch": 1.8986821838096868, "grad_norm": 55.58744812011719, "learning_rate": 1.8515683075422073e-05, "loss": 6.6071, "step": 11094 }, { "epoch": 1.8988533287694678, "grad_norm": 5.3943400382995605, "learning_rate": 1.8504481240017984e-05, "loss": 0.5099, "step": 11095 }, { "epoch": 1.8990244737292485, "grad_norm": 5.783933639526367, "learning_rate": 1.8493277336964745e-05, "loss": 0.3449, "step": 11096 }, { "epoch": 1.8991956186890295, "grad_norm": 73.77400970458984, "learning_rate": 1.8482071372872666e-05, "loss": 7.7981, 
"step": 11097 }, { "epoch": 1.8993667636488105, "grad_norm": 17.4135799407959, "learning_rate": 1.84708633543533e-05, "loss": 1.5133, "step": 11098 }, { "epoch": 1.8995379086085915, "grad_norm": 0.4633524417877197, "learning_rate": 1.8459653288019385e-05, "loss": 0.1082, "step": 11099 }, { "epoch": 1.8997090535683725, "grad_norm": 1.4940106868743896, "learning_rate": 1.8448441180484876e-05, "loss": 0.1987, "step": 11100 }, { "epoch": 1.8998801985281535, "grad_norm": 5.933655261993408, "learning_rate": 1.8437227038364935e-05, "loss": 0.3913, "step": 11101 }, { "epoch": 1.9000513434879343, "grad_norm": 19.30069923400879, "learning_rate": 1.842601086827592e-05, "loss": 1.7778, "step": 11102 }, { "epoch": 1.9002224884477152, "grad_norm": 17.890560150146484, "learning_rate": 1.8414792676835395e-05, "loss": 1.5311, "step": 11103 }, { "epoch": 1.900393633407496, "grad_norm": 16.31072998046875, "learning_rate": 1.84035724706621e-05, "loss": 0.784, "step": 11104 }, { "epoch": 1.900564778367277, "grad_norm": 0.4599243700504303, "learning_rate": 1.839235025637598e-05, "loss": 0.1104, "step": 11105 }, { "epoch": 1.900735923327058, "grad_norm": 7.634531021118164, "learning_rate": 1.8381126040598154e-05, "loss": 0.5867, "step": 11106 }, { "epoch": 1.900907068286839, "grad_norm": 28.447126388549805, "learning_rate": 1.8369899829950928e-05, "loss": 4.8961, "step": 11107 }, { "epoch": 1.90107821324662, "grad_norm": 13.7837495803833, "learning_rate": 1.8358671631057772e-05, "loss": 1.0896, "step": 11108 }, { "epoch": 1.901249358206401, "grad_norm": 25.254196166992188, "learning_rate": 1.834744145054337e-05, "loss": 4.8079, "step": 11109 }, { "epoch": 1.9014205031661817, "grad_norm": 19.63176727294922, "learning_rate": 1.8336209295033516e-05, "loss": 2.1203, "step": 11110 }, { "epoch": 1.9015916481259627, "grad_norm": 0.49353131651878357, "learning_rate": 1.8324975171155214e-05, "loss": 0.1096, "step": 11111 }, { "epoch": 1.9017627930857435, "grad_norm": 9.293465614318848, 
"learning_rate": 1.8313739085536606e-05, "loss": 0.6789, "step": 11112 }, { "epoch": 1.9019339380455245, "grad_norm": 10.174165725708008, "learning_rate": 1.8302501044807002e-05, "loss": 0.7031, "step": 11113 }, { "epoch": 1.9021050830053055, "grad_norm": 20.894193649291992, "learning_rate": 1.8291261055596863e-05, "loss": 4.9556, "step": 11114 }, { "epoch": 1.9022762279650864, "grad_norm": 4.404016494750977, "learning_rate": 1.82800191245378e-05, "loss": 0.6305, "step": 11115 }, { "epoch": 1.9024473729248674, "grad_norm": 20.146804809570312, "learning_rate": 1.8268775258262567e-05, "loss": 1.5296, "step": 11116 }, { "epoch": 1.9026185178846484, "grad_norm": 3.739093065261841, "learning_rate": 1.8257529463405063e-05, "loss": 0.4174, "step": 11117 }, { "epoch": 1.9027896628444292, "grad_norm": 11.310981750488281, "learning_rate": 1.8246281746600325e-05, "loss": 0.635, "step": 11118 }, { "epoch": 1.9029608078042102, "grad_norm": 13.064981460571289, "learning_rate": 1.8235032114484507e-05, "loss": 1.2464, "step": 11119 }, { "epoch": 1.903131952763991, "grad_norm": 16.47406768798828, "learning_rate": 1.822378057369495e-05, "loss": 1.9794, "step": 11120 }, { "epoch": 1.903303097723772, "grad_norm": 8.210929870605469, "learning_rate": 1.8212527130870025e-05, "loss": 0.6657, "step": 11121 }, { "epoch": 1.903474242683553, "grad_norm": 12.415950775146484, "learning_rate": 1.8201271792649334e-05, "loss": 1.1376, "step": 11122 }, { "epoch": 1.903645387643334, "grad_norm": 23.4403076171875, "learning_rate": 1.8190014565673493e-05, "loss": 4.8599, "step": 11123 }, { "epoch": 1.903816532603115, "grad_norm": 3.087543249130249, "learning_rate": 1.817875545658433e-05, "loss": 0.2853, "step": 11124 }, { "epoch": 1.9039876775628959, "grad_norm": 50.61466979980469, "learning_rate": 1.8167494472024694e-05, "loss": 6.6304, "step": 11125 }, { "epoch": 1.9041588225226767, "grad_norm": 16.231592178344727, "learning_rate": 1.8156231618638616e-05, "loss": 1.7553, "step": 11126 }, { "epoch": 
1.9043299674824576, "grad_norm": 11.319093704223633, "learning_rate": 1.814496690307117e-05, "loss": 0.7168, "step": 11127 }, { "epoch": 1.9045011124422386, "grad_norm": 20.789342880249023, "learning_rate": 1.813370033196858e-05, "loss": 4.8728, "step": 11128 }, { "epoch": 1.9046722574020194, "grad_norm": 5.458575248718262, "learning_rate": 1.812243191197811e-05, "loss": 0.4276, "step": 11129 }, { "epoch": 1.9048434023618004, "grad_norm": 12.27827262878418, "learning_rate": 1.811116164974817e-05, "loss": 0.9465, "step": 11130 }, { "epoch": 1.9050145473215814, "grad_norm": 4.549838542938232, "learning_rate": 1.809988955192822e-05, "loss": 0.5321, "step": 11131 }, { "epoch": 1.9051856922813624, "grad_norm": 5.2972540855407715, "learning_rate": 1.808861562516882e-05, "loss": 0.5663, "step": 11132 }, { "epoch": 1.9053568372411434, "grad_norm": 8.835708618164062, "learning_rate": 1.8077339876121604e-05, "loss": 0.5135, "step": 11133 }, { "epoch": 1.9055279822009243, "grad_norm": 3.539470672607422, "learning_rate": 1.8066062311439275e-05, "loss": 0.2414, "step": 11134 }, { "epoch": 1.905699127160705, "grad_norm": 12.222406387329102, "learning_rate": 1.8054782937775613e-05, "loss": 0.785, "step": 11135 }, { "epoch": 1.905870272120486, "grad_norm": 8.045392990112305, "learning_rate": 1.8043501761785468e-05, "loss": 0.6772, "step": 11136 }, { "epoch": 1.9060414170802669, "grad_norm": 12.934812545776367, "learning_rate": 1.803221879012475e-05, "loss": 0.5563, "step": 11137 }, { "epoch": 1.9062125620400479, "grad_norm": 58.079681396484375, "learning_rate": 1.8020934029450433e-05, "loss": 6.608, "step": 11138 }, { "epoch": 1.9063837069998288, "grad_norm": 2.84472393989563, "learning_rate": 1.800964748642054e-05, "loss": 0.2215, "step": 11139 }, { "epoch": 1.9065548519596098, "grad_norm": 13.69168472290039, "learning_rate": 1.7998359167694134e-05, "loss": 1.2424, "step": 11140 }, { "epoch": 1.9067259969193908, "grad_norm": 18.007862091064453, "learning_rate": 
1.7987069079931363e-05, "loss": 2.1051, "step": 11141 }, { "epoch": 1.9068971418791718, "grad_norm": 12.354101181030273, "learning_rate": 1.7975777229793386e-05, "loss": 0.9335, "step": 11142 }, { "epoch": 1.9070682868389526, "grad_norm": 13.575602531433105, "learning_rate": 1.7964483623942413e-05, "loss": 1.0962, "step": 11143 }, { "epoch": 1.9072394317987336, "grad_norm": 7.813348770141602, "learning_rate": 1.7953188269041686e-05, "loss": 0.7871, "step": 11144 }, { "epoch": 1.9074105767585143, "grad_norm": 11.26419734954834, "learning_rate": 1.794189117175548e-05, "loss": 0.6984, "step": 11145 }, { "epoch": 1.9075817217182953, "grad_norm": 0.3961126506328583, "learning_rate": 1.79305923387491e-05, "loss": 0.1116, "step": 11146 }, { "epoch": 1.9077528666780763, "grad_norm": 0.5036306977272034, "learning_rate": 1.7919291776688875e-05, "loss": 0.1153, "step": 11147 }, { "epoch": 1.9079240116378573, "grad_norm": 74.00426483154297, "learning_rate": 1.7907989492242157e-05, "loss": 7.4072, "step": 11148 }, { "epoch": 1.9080951565976383, "grad_norm": 11.895257949829102, "learning_rate": 1.7896685492077306e-05, "loss": 1.0936, "step": 11149 }, { "epoch": 1.9082663015574193, "grad_norm": 9.16962718963623, "learning_rate": 1.788537978286369e-05, "loss": 0.6951, "step": 11150 }, { "epoch": 1.9084374465172, "grad_norm": 7.669610500335693, "learning_rate": 1.7874072371271714e-05, "loss": 0.7483, "step": 11151 }, { "epoch": 1.908608591476981, "grad_norm": 18.07941246032715, "learning_rate": 1.7862763263972756e-05, "loss": 1.2213, "step": 11152 }, { "epoch": 1.9087797364367618, "grad_norm": 12.579314231872559, "learning_rate": 1.785145246763921e-05, "loss": 0.9866, "step": 11153 }, { "epoch": 1.9089508813965428, "grad_norm": 0.5396085977554321, "learning_rate": 1.7840139988944463e-05, "loss": 0.1186, "step": 11154 }, { "epoch": 1.9091220263563238, "grad_norm": 9.365839004516602, "learning_rate": 1.782882583456289e-05, "loss": 0.6488, "step": 11155 }, { "epoch": 
1.9092931713161048, "grad_norm": 9.069755554199219, "learning_rate": 1.7817510011169858e-05, "loss": 0.6712, "step": 11156 }, { "epoch": 1.9094643162758858, "grad_norm": 16.036863327026367, "learning_rate": 1.7806192525441734e-05, "loss": 1.1813, "step": 11157 }, { "epoch": 1.9096354612356667, "grad_norm": 91.83458709716797, "learning_rate": 1.7794873384055832e-05, "loss": 7.1507, "step": 11158 }, { "epoch": 1.9098066061954475, "grad_norm": 12.535982131958008, "learning_rate": 1.778355259369047e-05, "loss": 0.8159, "step": 11159 }, { "epoch": 1.9099777511552285, "grad_norm": 10.487643241882324, "learning_rate": 1.7772230161024935e-05, "loss": 0.7681, "step": 11160 }, { "epoch": 1.9101488961150093, "grad_norm": 7.372982501983643, "learning_rate": 1.776090609273947e-05, "loss": 0.452, "step": 11161 }, { "epoch": 1.9103200410747903, "grad_norm": 0.4139964282512665, "learning_rate": 1.7749580395515295e-05, "loss": 0.1029, "step": 11162 }, { "epoch": 1.9104911860345712, "grad_norm": 8.743830680847168, "learning_rate": 1.7738253076034608e-05, "loss": 0.6095, "step": 11163 }, { "epoch": 1.9106623309943522, "grad_norm": 3.124191999435425, "learning_rate": 1.7726924140980506e-05, "loss": 0.252, "step": 11164 }, { "epoch": 1.9108334759541332, "grad_norm": 4.782133102416992, "learning_rate": 1.7715593597037124e-05, "loss": 0.3033, "step": 11165 }, { "epoch": 1.9110046209139142, "grad_norm": 10.927142143249512, "learning_rate": 1.7704261450889454e-05, "loss": 0.8868, "step": 11166 }, { "epoch": 1.911175765873695, "grad_norm": 0.5223371982574463, "learning_rate": 1.7692927709223518e-05, "loss": 0.1069, "step": 11167 }, { "epoch": 1.911346910833476, "grad_norm": 21.862918853759766, "learning_rate": 1.76815923787262e-05, "loss": 1.6509, "step": 11168 }, { "epoch": 1.9115180557932567, "grad_norm": 1.2769150733947754, "learning_rate": 1.7670255466085408e-05, "loss": 0.2046, "step": 11169 }, { "epoch": 1.9116892007530377, "grad_norm": 88.98126983642578, "learning_rate": 
1.7658916977989894e-05, "loss": 7.3795, "step": 11170 }, { "epoch": 1.9118603457128187, "grad_norm": 4.103940486907959, "learning_rate": 1.7647576921129422e-05, "loss": 0.4036, "step": 11171 }, { "epoch": 1.9120314906725997, "grad_norm": 6.393118858337402, "learning_rate": 1.7636235302194598e-05, "loss": 0.747, "step": 11172 }, { "epoch": 1.9122026356323807, "grad_norm": 10.147290229797363, "learning_rate": 1.762489212787704e-05, "loss": 0.8161, "step": 11173 }, { "epoch": 1.9123737805921617, "grad_norm": 16.89185333251953, "learning_rate": 1.7613547404869208e-05, "loss": 1.3672, "step": 11174 }, { "epoch": 1.9125449255519424, "grad_norm": 0.3489692807197571, "learning_rate": 1.7602201139864518e-05, "loss": 0.099, "step": 11175 }, { "epoch": 1.9127160705117234, "grad_norm": 5.340574264526367, "learning_rate": 1.7590853339557276e-05, "loss": 0.4851, "step": 11176 }, { "epoch": 1.9128872154715042, "grad_norm": 69.83049011230469, "learning_rate": 1.757950401064271e-05, "loss": 7.5326, "step": 11177 }, { "epoch": 1.9130583604312852, "grad_norm": 10.132957458496094, "learning_rate": 1.7568153159816933e-05, "loss": 0.7316, "step": 11178 }, { "epoch": 1.9132295053910662, "grad_norm": 0.45942407846450806, "learning_rate": 1.7556800793776965e-05, "loss": 0.1038, "step": 11179 }, { "epoch": 1.9134006503508472, "grad_norm": 6.0494585037231445, "learning_rate": 1.7545446919220723e-05, "loss": 0.4143, "step": 11180 }, { "epoch": 1.9135717953106282, "grad_norm": 9.050570487976074, "learning_rate": 1.753409154284701e-05, "loss": 0.6966, "step": 11181 }, { "epoch": 1.9137429402704091, "grad_norm": 7.54727029800415, "learning_rate": 1.7522734671355506e-05, "loss": 0.563, "step": 11182 }, { "epoch": 1.9139140852301901, "grad_norm": 15.637146949768066, "learning_rate": 1.7511376311446782e-05, "loss": 1.5064, "step": 11183 }, { "epoch": 1.914085230189971, "grad_norm": 4.679746627807617, "learning_rate": 1.75000164698223e-05, "loss": 0.3589, "step": 11184 }, { "epoch": 
1.9142563751497519, "grad_norm": 3.7490456104278564, "learning_rate": 1.748865515318438e-05, "loss": 0.2454, "step": 11185 }, { "epoch": 1.9144275201095327, "grad_norm": 8.015555381774902, "learning_rate": 1.7477292368236214e-05, "loss": 0.6699, "step": 11186 }, { "epoch": 1.9145986650693136, "grad_norm": 1.9101818799972534, "learning_rate": 1.7465928121681858e-05, "loss": 0.1953, "step": 11187 }, { "epoch": 1.9147698100290946, "grad_norm": 11.852784156799316, "learning_rate": 1.7454562420226242e-05, "loss": 0.9285, "step": 11188 }, { "epoch": 1.9149409549888756, "grad_norm": 13.236194610595703, "learning_rate": 1.744319527057514e-05, "loss": 1.0344, "step": 11189 }, { "epoch": 1.9151120999486566, "grad_norm": 5.379486083984375, "learning_rate": 1.743182667943519e-05, "loss": 0.3577, "step": 11190 }, { "epoch": 1.9152832449084376, "grad_norm": 33.96356964111328, "learning_rate": 1.7420456653513874e-05, "loss": 5.2151, "step": 11191 }, { "epoch": 1.9154543898682184, "grad_norm": 8.72598934173584, "learning_rate": 1.7409085199519527e-05, "loss": 0.6371, "step": 11192 }, { "epoch": 1.9156255348279994, "grad_norm": 9.567851066589355, "learning_rate": 1.7397712324161315e-05, "loss": 0.6967, "step": 11193 }, { "epoch": 1.9157966797877801, "grad_norm": 13.066400527954102, "learning_rate": 1.738633803414927e-05, "loss": 1.0767, "step": 11194 }, { "epoch": 1.915967824747561, "grad_norm": 8.746240615844727, "learning_rate": 1.7374962336194233e-05, "loss": 0.6144, "step": 11195 }, { "epoch": 1.916138969707342, "grad_norm": 5.417791366577148, "learning_rate": 1.736358523700788e-05, "loss": 0.6221, "step": 11196 }, { "epoch": 1.916310114667123, "grad_norm": 17.85866355895996, "learning_rate": 1.7352206743302714e-05, "loss": 1.7941, "step": 11197 }, { "epoch": 1.916481259626904, "grad_norm": 16.749004364013672, "learning_rate": 1.734082686179207e-05, "loss": 1.5598, "step": 11198 }, { "epoch": 1.916652404586685, "grad_norm": 10.370431900024414, "learning_rate": 
1.7329445599190087e-05, "loss": 0.9207, "step": 11199 }, { "epoch": 1.9168235495464658, "grad_norm": 6.169619083404541, "learning_rate": 1.7318062962211734e-05, "loss": 0.4224, "step": 11200 }, { "epoch": 1.9169946945062468, "grad_norm": 4.205878257751465, "learning_rate": 1.7306678957572778e-05, "loss": 0.2841, "step": 11201 }, { "epoch": 1.9171658394660276, "grad_norm": 17.847095489501953, "learning_rate": 1.72952935919898e-05, "loss": 2.1506, "step": 11202 }, { "epoch": 1.9173369844258086, "grad_norm": 19.8399658203125, "learning_rate": 1.7283906872180185e-05, "loss": 1.9189, "step": 11203 }, { "epoch": 1.9175081293855896, "grad_norm": 13.28238582611084, "learning_rate": 1.7272518804862115e-05, "loss": 0.8756, "step": 11204 }, { "epoch": 1.9176792743453706, "grad_norm": 3.1359052658081055, "learning_rate": 1.7261129396754555e-05, "loss": 0.22, "step": 11205 }, { "epoch": 1.9178504193051515, "grad_norm": 3.2376694679260254, "learning_rate": 1.7249738654577306e-05, "loss": 0.2672, "step": 11206 }, { "epoch": 1.9180215642649325, "grad_norm": 6.327885150909424, "learning_rate": 1.7238346585050878e-05, "loss": 0.5013, "step": 11207 }, { "epoch": 1.9181927092247133, "grad_norm": 0.4649300277233124, "learning_rate": 1.7226953194896648e-05, "loss": 0.1113, "step": 11208 }, { "epoch": 1.9183638541844943, "grad_norm": 10.849967956542969, "learning_rate": 1.72155584908367e-05, "loss": 0.6442, "step": 11209 }, { "epoch": 1.918534999144275, "grad_norm": 24.597280502319336, "learning_rate": 1.720416247959396e-05, "loss": 4.6899, "step": 11210 }, { "epoch": 1.918706144104056, "grad_norm": 0.4895784258842468, "learning_rate": 1.7192765167892054e-05, "loss": 0.113, "step": 11211 }, { "epoch": 1.918877289063837, "grad_norm": 24.041486740112305, "learning_rate": 1.7181366562455456e-05, "loss": 0.8581, "step": 11212 }, { "epoch": 1.919048434023618, "grad_norm": 17.157032012939453, "learning_rate": 1.716996667000931e-05, "loss": 1.6043, "step": 11213 }, { "epoch": 1.919219578983399, 
"grad_norm": 20.2746639251709, "learning_rate": 1.7158565497279626e-05, "loss": 2.2021, "step": 11214 }, { "epoch": 1.91939072394318, "grad_norm": 12.727091789245605, "learning_rate": 1.7147163050993057e-05, "loss": 1.2322, "step": 11215 }, { "epoch": 1.9195618689029608, "grad_norm": 82.75260162353516, "learning_rate": 1.7135759337877103e-05, "loss": 7.6525, "step": 11216 }, { "epoch": 1.9197330138627418, "grad_norm": 15.870882987976074, "learning_rate": 1.7124354364659955e-05, "loss": 1.3119, "step": 11217 }, { "epoch": 1.9199041588225225, "grad_norm": 3.861424207687378, "learning_rate": 1.711294813807057e-05, "loss": 0.2517, "step": 11218 }, { "epoch": 1.9200753037823035, "grad_norm": 14.615012168884277, "learning_rate": 1.7101540664838635e-05, "loss": 1.1506, "step": 11219 }, { "epoch": 1.9202464487420845, "grad_norm": 2.1588850021362305, "learning_rate": 1.7090131951694577e-05, "loss": 0.2294, "step": 11220 }, { "epoch": 1.9204175937018655, "grad_norm": 9.757156372070312, "learning_rate": 1.707872200536956e-05, "loss": 0.7401, "step": 11221 }, { "epoch": 1.9205887386616465, "grad_norm": 10.981807708740234, "learning_rate": 1.706731083259546e-05, "loss": 0.7694, "step": 11222 }, { "epoch": 1.9207598836214275, "grad_norm": 4.825652122497559, "learning_rate": 1.7055898440104887e-05, "loss": 0.3262, "step": 11223 }, { "epoch": 1.9209310285812082, "grad_norm": 7.9354567527771, "learning_rate": 1.7044484834631174e-05, "loss": 0.6641, "step": 11224 }, { "epoch": 1.9211021735409892, "grad_norm": 14.91639232635498, "learning_rate": 1.7033070022908364e-05, "loss": 1.0599, "step": 11225 }, { "epoch": 1.92127331850077, "grad_norm": 5.600493431091309, "learning_rate": 1.70216540116712e-05, "loss": 0.498, "step": 11226 }, { "epoch": 1.921444463460551, "grad_norm": 9.734099388122559, "learning_rate": 1.7010236807655172e-05, "loss": 0.6425, "step": 11227 }, { "epoch": 1.921615608420332, "grad_norm": 5.237175464630127, "learning_rate": 1.699881841759643e-05, "loss": 0.4587, 
"step": 11228 }, { "epoch": 1.921786753380113, "grad_norm": 17.491621017456055, "learning_rate": 1.6987398848231845e-05, "loss": 1.275, "step": 11229 }, { "epoch": 1.921957898339894, "grad_norm": 2.8434088230133057, "learning_rate": 1.6975978106298984e-05, "loss": 0.2719, "step": 11230 }, { "epoch": 1.922129043299675, "grad_norm": 1.183643102645874, "learning_rate": 1.6964556198536093e-05, "loss": 0.1819, "step": 11231 }, { "epoch": 1.922300188259456, "grad_norm": 0.3297038972377777, "learning_rate": 1.6953133131682116e-05, "loss": 0.1057, "step": 11232 }, { "epoch": 1.9224713332192367, "grad_norm": 60.053218841552734, "learning_rate": 1.6941708912476687e-05, "loss": 7.619, "step": 11233 }, { "epoch": 1.9226424781790177, "grad_norm": 10.53298568725586, "learning_rate": 1.6930283547660106e-05, "loss": 0.6874, "step": 11234 }, { "epoch": 1.9228136231387984, "grad_norm": 14.702582359313965, "learning_rate": 1.6918857043973357e-05, "loss": 1.4489, "step": 11235 }, { "epoch": 1.9229847680985794, "grad_norm": 23.72079086303711, "learning_rate": 1.6907429408158082e-05, "loss": 5.0905, "step": 11236 }, { "epoch": 1.9231559130583604, "grad_norm": 1.0574589967727661, "learning_rate": 1.6896000646956625e-05, "loss": 0.1816, "step": 11237 }, { "epoch": 1.9233270580181414, "grad_norm": 11.51854133605957, "learning_rate": 1.688457076711197e-05, "loss": 1.0079, "step": 11238 }, { "epoch": 1.9234982029779224, "grad_norm": 11.91737174987793, "learning_rate": 1.687313977536775e-05, "loss": 0.7683, "step": 11239 }, { "epoch": 1.9236693479377034, "grad_norm": 18.16565704345703, "learning_rate": 1.6861707678468272e-05, "loss": 2.3488, "step": 11240 }, { "epoch": 1.9238404928974842, "grad_norm": 18.575462341308594, "learning_rate": 1.685027448315849e-05, "loss": 1.6815, "step": 11241 }, { "epoch": 1.9240116378572651, "grad_norm": 19.53261947631836, "learning_rate": 1.6838840196184003e-05, "loss": 1.989, "step": 11242 }, { "epoch": 1.924182782817046, "grad_norm": 3.4600558280944824, 
"learning_rate": 1.682740482429107e-05, "loss": 0.2584, "step": 11243 }, { "epoch": 1.924353927776827, "grad_norm": 15.724080085754395, "learning_rate": 1.6815968374226565e-05, "loss": 1.0928, "step": 11244 }, { "epoch": 1.924525072736608, "grad_norm": 7.386577606201172, "learning_rate": 1.6804530852738016e-05, "loss": 0.5536, "step": 11245 }, { "epoch": 1.9246962176963889, "grad_norm": 1.9802742004394531, "learning_rate": 1.6793092266573576e-05, "loss": 0.2088, "step": 11246 }, { "epoch": 1.9248673626561699, "grad_norm": 14.767595291137695, "learning_rate": 1.6781652622482024e-05, "loss": 0.9641, "step": 11247 }, { "epoch": 1.9250385076159509, "grad_norm": 7.598844051361084, "learning_rate": 1.6770211927212765e-05, "loss": 0.5938, "step": 11248 }, { "epoch": 1.9252096525757316, "grad_norm": 3.227468967437744, "learning_rate": 1.6758770187515846e-05, "loss": 0.203, "step": 11249 }, { "epoch": 1.9253807975355126, "grad_norm": 14.497116088867188, "learning_rate": 1.6747327410141883e-05, "loss": 1.6592, "step": 11250 }, { "epoch": 1.9255519424952934, "grad_norm": 3.2012853622436523, "learning_rate": 1.6735883601842164e-05, "loss": 0.2442, "step": 11251 }, { "epoch": 1.9257230874550744, "grad_norm": 13.1262845993042, "learning_rate": 1.6724438769368516e-05, "loss": 0.9185, "step": 11252 }, { "epoch": 1.9258942324148554, "grad_norm": 8.012948036193848, "learning_rate": 1.6712992919473447e-05, "loss": 0.7314, "step": 11253 }, { "epoch": 1.9260653773746363, "grad_norm": 1.1880744695663452, "learning_rate": 1.6701546058909978e-05, "loss": 0.1624, "step": 11254 }, { "epoch": 1.9262365223344173, "grad_norm": 1.298662781715393, "learning_rate": 1.6690098194431825e-05, "loss": 0.1737, "step": 11255 }, { "epoch": 1.9264076672941983, "grad_norm": 11.267284393310547, "learning_rate": 1.6678649332793198e-05, "loss": 0.8635, "step": 11256 }, { "epoch": 1.926578812253979, "grad_norm": 4.095193862915039, "learning_rate": 1.666719948074898e-05, "loss": 0.2475, "step": 11257 }, { 
"epoch": 1.92674995721376, "grad_norm": 17.15755271911621, "learning_rate": 1.665574864505457e-05, "loss": 1.1112, "step": 11258 }, { "epoch": 1.9269211021735408, "grad_norm": 19.683277130126953, "learning_rate": 1.6644296832466e-05, "loss": 2.0265, "step": 11259 }, { "epoch": 1.9270922471333218, "grad_norm": 15.45804214477539, "learning_rate": 1.6632844049739856e-05, "loss": 1.0287, "step": 11260 }, { "epoch": 1.9272633920931028, "grad_norm": 6.968849182128906, "learning_rate": 1.6621390303633287e-05, "loss": 0.6804, "step": 11261 }, { "epoch": 1.9274345370528838, "grad_norm": 9.922377586364746, "learning_rate": 1.6609935600904025e-05, "loss": 0.7079, "step": 11262 }, { "epoch": 1.9276056820126648, "grad_norm": 1.2813118696212769, "learning_rate": 1.659847994831036e-05, "loss": 0.1858, "step": 11263 }, { "epoch": 1.9277768269724458, "grad_norm": 8.561687469482422, "learning_rate": 1.6587023352611144e-05, "loss": 0.9768, "step": 11264 }, { "epoch": 1.9279479719322266, "grad_norm": 4.412155628204346, "learning_rate": 1.6575565820565785e-05, "loss": 0.5208, "step": 11265 }, { "epoch": 1.9281191168920075, "grad_norm": 6.477213382720947, "learning_rate": 1.6564107358934245e-05, "loss": 0.7358, "step": 11266 }, { "epoch": 1.9282902618517883, "grad_norm": 13.179936408996582, "learning_rate": 1.6552647974477033e-05, "loss": 1.1621, "step": 11267 }, { "epoch": 1.9284614068115693, "grad_norm": 4.469620704650879, "learning_rate": 1.6541187673955196e-05, "loss": 0.4564, "step": 11268 }, { "epoch": 1.9286325517713503, "grad_norm": 12.788333892822266, "learning_rate": 1.6529726464130348e-05, "loss": 0.778, "step": 11269 }, { "epoch": 1.9288036967311313, "grad_norm": 4.392690658569336, "learning_rate": 1.6518264351764606e-05, "loss": 0.318, "step": 11270 }, { "epoch": 1.9289748416909123, "grad_norm": 10.2575044631958, "learning_rate": 1.6506801343620635e-05, "loss": 0.8658, "step": 11271 }, { "epoch": 1.9291459866506933, "grad_norm": 14.89240837097168, "learning_rate": 
1.6495337446461623e-05, "loss": 1.1729, "step": 11272 }, { "epoch": 1.929317131610474, "grad_norm": 0.4402133524417877, "learning_rate": 1.648387266705129e-05, "loss": 0.107, "step": 11273 }, { "epoch": 1.929488276570255, "grad_norm": 11.809808731079102, "learning_rate": 1.6472407012153877e-05, "loss": 0.8246, "step": 11274 }, { "epoch": 1.9296594215300358, "grad_norm": 67.29407501220703, "learning_rate": 1.6460940488534133e-05, "loss": 7.1618, "step": 11275 }, { "epoch": 1.9298305664898168, "grad_norm": 1.1348905563354492, "learning_rate": 1.6449473102957327e-05, "loss": 0.1762, "step": 11276 }, { "epoch": 1.9300017114495978, "grad_norm": 15.595890045166016, "learning_rate": 1.643800486218923e-05, "loss": 1.6665, "step": 11277 }, { "epoch": 1.9301728564093787, "grad_norm": 7.036072731018066, "learning_rate": 1.6426535772996123e-05, "loss": 0.4985, "step": 11278 }, { "epoch": 1.9303440013691597, "grad_norm": 17.681447982788086, "learning_rate": 1.6415065842144773e-05, "loss": 1.4102, "step": 11279 }, { "epoch": 1.9305151463289407, "grad_norm": 0.36844831705093384, "learning_rate": 1.640359507640248e-05, "loss": 0.1067, "step": 11280 }, { "epoch": 1.9306862912887217, "grad_norm": 6.839493274688721, "learning_rate": 1.6392123482537e-05, "loss": 0.4582, "step": 11281 }, { "epoch": 1.9308574362485025, "grad_norm": 8.361574172973633, "learning_rate": 1.638065106731659e-05, "loss": 0.445, "step": 11282 }, { "epoch": 1.9310285812082835, "grad_norm": 64.76461791992188, "learning_rate": 1.636917783751e-05, "loss": 6.8482, "step": 11283 }, { "epoch": 1.9311997261680642, "grad_norm": 0.39011716842651367, "learning_rate": 1.6357703799886442e-05, "loss": 0.1076, "step": 11284 }, { "epoch": 1.9313708711278452, "grad_norm": 4.419445037841797, "learning_rate": 1.634622896121562e-05, "loss": 0.4029, "step": 11285 }, { "epoch": 1.9315420160876262, "grad_norm": 6.187053203582764, "learning_rate": 1.6334753328267706e-05, "loss": 0.7064, "step": 11286 }, { "epoch": 1.9317131610474072, 
"grad_norm": 9.447680473327637, "learning_rate": 1.632327690781334e-05, "loss": 0.8913, "step": 11287 }, { "epoch": 1.9318843060071882, "grad_norm": 4.361727714538574, "learning_rate": 1.631179970662363e-05, "loss": 0.3484, "step": 11288 }, { "epoch": 1.9320554509669692, "grad_norm": 21.93549156188965, "learning_rate": 1.6300321731470136e-05, "loss": 4.6222, "step": 11289 }, { "epoch": 1.93222659592675, "grad_norm": 7.830143451690674, "learning_rate": 1.6288842989124883e-05, "loss": 0.6458, "step": 11290 }, { "epoch": 1.932397740886531, "grad_norm": 0.29859113693237305, "learning_rate": 1.627736348636034e-05, "loss": 0.0995, "step": 11291 }, { "epoch": 1.9325688858463117, "grad_norm": 8.199200630187988, "learning_rate": 1.6265883229949455e-05, "loss": 0.5879, "step": 11292 }, { "epoch": 1.9327400308060927, "grad_norm": 9.72121810913086, "learning_rate": 1.625440222666556e-05, "loss": 0.945, "step": 11293 }, { "epoch": 1.9329111757658737, "grad_norm": 0.5942777991294861, "learning_rate": 1.6242920483282506e-05, "loss": 0.1088, "step": 11294 }, { "epoch": 1.9330823207256547, "grad_norm": 7.585069179534912, "learning_rate": 1.6231438006574493e-05, "loss": 1.0807, "step": 11295 }, { "epoch": 1.9332534656854357, "grad_norm": 10.174237251281738, "learning_rate": 1.621995480331624e-05, "loss": 0.7311, "step": 11296 }, { "epoch": 1.9334246106452166, "grad_norm": 9.04881477355957, "learning_rate": 1.6208470880282816e-05, "loss": 0.6281, "step": 11297 }, { "epoch": 1.9335957556049974, "grad_norm": 10.809017181396484, "learning_rate": 1.6196986244249793e-05, "loss": 0.9161, "step": 11298 }, { "epoch": 1.9337669005647784, "grad_norm": 16.34033966064453, "learning_rate": 1.6185500901993086e-05, "loss": 1.7214, "step": 11299 }, { "epoch": 1.9339380455245592, "grad_norm": 18.158540725708008, "learning_rate": 1.6174014860289097e-05, "loss": 2.1396, "step": 11300 }, { "epoch": 1.9341091904843402, "grad_norm": 3.5736474990844727, "learning_rate": 1.6162528125914575e-05, "loss": 
0.2823, "step": 11301 }, { "epoch": 1.9342803354441211, "grad_norm": 18.184226989746094, "learning_rate": 1.6151040705646737e-05, "loss": 1.3894, "step": 11302 }, { "epoch": 1.9344514804039021, "grad_norm": 0.34050172567367554, "learning_rate": 1.6139552606263167e-05, "loss": 0.1035, "step": 11303 }, { "epoch": 1.9346226253636831, "grad_norm": 11.194930076599121, "learning_rate": 1.6128063834541862e-05, "loss": 0.6397, "step": 11304 }, { "epoch": 1.9347937703234641, "grad_norm": 14.446388244628906, "learning_rate": 1.6116574397261217e-05, "loss": 1.7691, "step": 11305 }, { "epoch": 1.9349649152832449, "grad_norm": 10.41291618347168, "learning_rate": 1.610508430120001e-05, "loss": 0.7449, "step": 11306 }, { "epoch": 1.9351360602430259, "grad_norm": 20.625141143798828, "learning_rate": 1.609359355313742e-05, "loss": 2.2465, "step": 11307 }, { "epoch": 1.9353072052028066, "grad_norm": 2.8247673511505127, "learning_rate": 1.608210215985301e-05, "loss": 0.4027, "step": 11308 }, { "epoch": 1.9354783501625876, "grad_norm": 110.67079162597656, "learning_rate": 1.607061012812671e-05, "loss": 8.1491, "step": 11309 }, { "epoch": 1.9356494951223686, "grad_norm": 14.671128273010254, "learning_rate": 1.605911746473884e-05, "loss": 1.7593, "step": 11310 }, { "epoch": 1.9358206400821496, "grad_norm": 13.54640007019043, "learning_rate": 1.604762417647008e-05, "loss": 1.2471, "step": 11311 }, { "epoch": 1.9359917850419306, "grad_norm": 11.861666679382324, "learning_rate": 1.6036130270101503e-05, "loss": 0.8656, "step": 11312 }, { "epoch": 1.9361629300017116, "grad_norm": 10.604565620422363, "learning_rate": 1.6024635752414523e-05, "loss": 0.5764, "step": 11313 }, { "epoch": 1.9363340749614923, "grad_norm": 8.506979942321777, "learning_rate": 1.6013140630190924e-05, "loss": 0.6507, "step": 11314 }, { "epoch": 1.9365052199212733, "grad_norm": 6.508911609649658, "learning_rate": 1.6001644910212843e-05, "loss": 0.6515, "step": 11315 }, { "epoch": 1.936676364881054, "grad_norm": 
0.34357813000679016, "learning_rate": 1.5990148599262772e-05, "loss": 0.1029, "step": 11316 }, { "epoch": 1.936847509840835, "grad_norm": 24.1650390625, "learning_rate": 1.5978651704123557e-05, "loss": 5.0736, "step": 11317 }, { "epoch": 1.937018654800616, "grad_norm": 58.466941833496094, "learning_rate": 1.596715423157838e-05, "loss": 6.6852, "step": 11318 }, { "epoch": 1.937189799760397, "grad_norm": 4.330345153808594, "learning_rate": 1.5955656188410763e-05, "loss": 0.5945, "step": 11319 }, { "epoch": 1.937360944720178, "grad_norm": 8.09549331665039, "learning_rate": 1.5944157581404568e-05, "loss": 0.5816, "step": 11320 }, { "epoch": 1.937532089679959, "grad_norm": 24.434621810913086, "learning_rate": 1.5932658417343998e-05, "loss": 3.6472, "step": 11321 }, { "epoch": 1.9377032346397398, "grad_norm": 10.486015319824219, "learning_rate": 1.592115870301356e-05, "loss": 0.702, "step": 11322 }, { "epoch": 1.9378743795995208, "grad_norm": 14.575980186462402, "learning_rate": 1.5909658445198128e-05, "loss": 1.5462, "step": 11323 }, { "epoch": 1.9380455245593016, "grad_norm": 7.449291706085205, "learning_rate": 1.5898157650682855e-05, "loss": 0.4706, "step": 11324 }, { "epoch": 1.9382166695190826, "grad_norm": 6.596275806427002, "learning_rate": 1.5886656326253233e-05, "loss": 0.4791, "step": 11325 }, { "epoch": 1.9383878144788635, "grad_norm": 4.882842540740967, "learning_rate": 1.587515447869506e-05, "loss": 0.7004, "step": 11326 }, { "epoch": 1.9385589594386445, "grad_norm": 8.324540138244629, "learning_rate": 1.586365211479444e-05, "loss": 0.4908, "step": 11327 }, { "epoch": 1.9387301043984255, "grad_norm": 10.983134269714355, "learning_rate": 1.585214924133778e-05, "loss": 0.7886, "step": 11328 }, { "epoch": 1.9389012493582065, "grad_norm": 8.711219787597656, "learning_rate": 1.5840645865111804e-05, "loss": 0.7619, "step": 11329 }, { "epoch": 1.9390723943179873, "grad_norm": 0.6153932809829712, "learning_rate": 1.5829141992903513e-05, "loss": 0.1077, "step": 11330 
}, { "epoch": 1.9392435392777683, "grad_norm": 15.790937423706055, "learning_rate": 1.5817637631500213e-05, "loss": 1.1577, "step": 11331 }, { "epoch": 1.9394146842375493, "grad_norm": 5.956581115722656, "learning_rate": 1.5806132787689492e-05, "loss": 0.4723, "step": 11332 }, { "epoch": 1.93958582919733, "grad_norm": 10.454607009887695, "learning_rate": 1.5794627468259224e-05, "loss": 0.5998, "step": 11333 }, { "epoch": 1.939756974157111, "grad_norm": 12.792913436889648, "learning_rate": 1.5783121679997558e-05, "loss": 0.8568, "step": 11334 }, { "epoch": 1.939928119116892, "grad_norm": 12.209720611572266, "learning_rate": 1.5771615429692958e-05, "loss": 0.956, "step": 11335 }, { "epoch": 1.940099264076673, "grad_norm": 83.43050384521484, "learning_rate": 1.5760108724134078e-05, "loss": 6.9127, "step": 11336 }, { "epoch": 1.940270409036454, "grad_norm": 0.440584272146225, "learning_rate": 1.574860157010994e-05, "loss": 0.1097, "step": 11337 }, { "epoch": 1.940441553996235, "grad_norm": 1.7370363473892212, "learning_rate": 1.5737093974409745e-05, "loss": 0.1707, "step": 11338 }, { "epoch": 1.9406126989560157, "grad_norm": 1.7174816131591797, "learning_rate": 1.5725585943823022e-05, "loss": 0.2036, "step": 11339 }, { "epoch": 1.9407838439157967, "grad_norm": 63.12074279785156, "learning_rate": 1.5714077485139496e-05, "loss": 6.1581, "step": 11340 }, { "epoch": 1.9409549888755775, "grad_norm": 2.967423677444458, "learning_rate": 1.570256860514921e-05, "loss": 0.3161, "step": 11341 }, { "epoch": 1.9411261338353585, "grad_norm": 6.28383731842041, "learning_rate": 1.569105931064238e-05, "loss": 0.5562, "step": 11342 }, { "epoch": 1.9412972787951395, "grad_norm": 4.567384243011475, "learning_rate": 1.567954960840954e-05, "loss": 0.5041, "step": 11343 }, { "epoch": 1.9414684237549205, "grad_norm": 18.94024085998535, "learning_rate": 1.5668039505241407e-05, "loss": 0.759, "step": 11344 }, { "epoch": 1.9416395687147014, "grad_norm": 9.674101829528809, "learning_rate": 
1.565652900792898e-05, "loss": 0.7657, "step": 11345 }, { "epoch": 1.9418107136744824, "grad_norm": 17.56503677368164, "learning_rate": 1.564501812326346e-05, "loss": 1.7641, "step": 11346 }, { "epoch": 1.9419818586342632, "grad_norm": 11.445001602172852, "learning_rate": 1.5633506858036286e-05, "loss": 0.9993, "step": 11347 }, { "epoch": 1.9421530035940442, "grad_norm": 10.151552200317383, "learning_rate": 1.5621995219039122e-05, "loss": 0.715, "step": 11348 }, { "epoch": 1.942324148553825, "grad_norm": 6.462473392486572, "learning_rate": 1.561048321306385e-05, "loss": 0.5514, "step": 11349 }, { "epoch": 1.942495293513606, "grad_norm": 11.29268741607666, "learning_rate": 1.5598970846902578e-05, "loss": 0.9772, "step": 11350 }, { "epoch": 1.942666438473387, "grad_norm": 10.974055290222168, "learning_rate": 1.558745812734761e-05, "loss": 0.7638, "step": 11351 }, { "epoch": 1.942837583433168, "grad_norm": 2.067368745803833, "learning_rate": 1.5575945061191474e-05, "loss": 0.1826, "step": 11352 }, { "epoch": 1.943008728392949, "grad_norm": 12.557221412658691, "learning_rate": 1.5564431655226894e-05, "loss": 1.0898, "step": 11353 }, { "epoch": 1.94317987335273, "grad_norm": 18.654016494750977, "learning_rate": 1.5552917916246786e-05, "loss": 1.9611, "step": 11354 }, { "epoch": 1.9433510183125107, "grad_norm": 16.102670669555664, "learning_rate": 1.5541403851044294e-05, "loss": 1.0735, "step": 11355 }, { "epoch": 1.9435221632722917, "grad_norm": 0.32596296072006226, "learning_rate": 1.552988946641272e-05, "loss": 0.099, "step": 11356 }, { "epoch": 1.9436933082320724, "grad_norm": 15.397614479064941, "learning_rate": 1.5518374769145577e-05, "loss": 1.3073, "step": 11357 }, { "epoch": 1.9438644531918534, "grad_norm": 12.287120819091797, "learning_rate": 1.550685976603655e-05, "loss": 0.9969, "step": 11358 }, { "epoch": 1.9440355981516344, "grad_norm": 9.74353313446045, "learning_rate": 1.5495344463879502e-05, "loss": 0.6681, "step": 11359 }, { "epoch": 1.9442067431114154, 
"grad_norm": 7.125378131866455, "learning_rate": 1.548382886946848e-05, "loss": 0.631, "step": 11360 }, { "epoch": 1.9443778880711964, "grad_norm": 11.805506706237793, "learning_rate": 1.5472312989597707e-05, "loss": 0.694, "step": 11361 }, { "epoch": 1.9445490330309774, "grad_norm": 99.5108413696289, "learning_rate": 1.5460796831061567e-05, "loss": 7.408, "step": 11362 }, { "epoch": 1.9447201779907581, "grad_norm": 13.144158363342285, "learning_rate": 1.544928040065461e-05, "loss": 1.0358, "step": 11363 }, { "epoch": 1.9448913229505391, "grad_norm": 7.560821056365967, "learning_rate": 1.543776370517155e-05, "loss": 0.6744, "step": 11364 }, { "epoch": 1.94506246791032, "grad_norm": 0.4159019887447357, "learning_rate": 1.5426246751407238e-05, "loss": 0.1073, "step": 11365 }, { "epoch": 1.9452336128701009, "grad_norm": 4.792381286621094, "learning_rate": 1.5414729546156717e-05, "loss": 0.3403, "step": 11366 }, { "epoch": 1.9454047578298819, "grad_norm": 9.552940368652344, "learning_rate": 1.5403212096215155e-05, "loss": 0.6406, "step": 11367 }, { "epoch": 1.9455759027896629, "grad_norm": 10.684208869934082, "learning_rate": 1.5391694408377847e-05, "loss": 0.7906, "step": 11368 }, { "epoch": 1.9457470477494438, "grad_norm": 9.589296340942383, "learning_rate": 1.5380176489440255e-05, "loss": 0.6823, "step": 11369 }, { "epoch": 1.9459181927092248, "grad_norm": 9.931077003479004, "learning_rate": 1.536865834619797e-05, "loss": 0.8008, "step": 11370 }, { "epoch": 1.9460893376690056, "grad_norm": 10.533417701721191, "learning_rate": 1.5357139985446712e-05, "loss": 0.835, "step": 11371 }, { "epoch": 1.9462604826287866, "grad_norm": 1.2411259412765503, "learning_rate": 1.5345621413982327e-05, "loss": 0.1755, "step": 11372 }, { "epoch": 1.9464316275885674, "grad_norm": 16.325969696044922, "learning_rate": 1.5334102638600797e-05, "loss": 1.5283, "step": 11373 }, { "epoch": 1.9466027725483483, "grad_norm": 16.08247947692871, "learning_rate": 1.5322583666098214e-05, "loss": 
1.4663, "step": 11374 }, { "epoch": 1.9467739175081293, "grad_norm": 9.158523559570312, "learning_rate": 1.5311064503270783e-05, "loss": 0.7014, "step": 11375 }, { "epoch": 1.9469450624679103, "grad_norm": 16.340208053588867, "learning_rate": 1.5299545156914833e-05, "loss": 1.9631, "step": 11376 }, { "epoch": 1.9471162074276913, "grad_norm": 15.00509262084961, "learning_rate": 1.5288025633826787e-05, "loss": 1.5552, "step": 11377 }, { "epoch": 1.9472873523874723, "grad_norm": 11.726522445678711, "learning_rate": 1.5276505940803207e-05, "loss": 0.8961, "step": 11378 }, { "epoch": 1.947458497347253, "grad_norm": 13.861359596252441, "learning_rate": 1.5264986084640688e-05, "loss": 1.035, "step": 11379 }, { "epoch": 1.947629642307034, "grad_norm": 8.416667938232422, "learning_rate": 1.5253466072136005e-05, "loss": 0.7146, "step": 11380 }, { "epoch": 1.9478007872668148, "grad_norm": 0.5665331482887268, "learning_rate": 1.5241945910085943e-05, "loss": 0.1112, "step": 11381 }, { "epoch": 1.9479719322265958, "grad_norm": 7.4365668296813965, "learning_rate": 1.5230425605287455e-05, "loss": 0.6755, "step": 11382 }, { "epoch": 1.9481430771863768, "grad_norm": 18.226795196533203, "learning_rate": 1.5218905164537493e-05, "loss": 1.8541, "step": 11383 }, { "epoch": 1.9483142221461578, "grad_norm": 3.757307767868042, "learning_rate": 1.5207384594633181e-05, "loss": 0.2638, "step": 11384 }, { "epoch": 1.9484853671059388, "grad_norm": 9.092303276062012, "learning_rate": 1.5195863902371629e-05, "loss": 0.6229, "step": 11385 }, { "epoch": 1.9486565120657198, "grad_norm": 7.776149749755859, "learning_rate": 1.518434309455009e-05, "loss": 0.5769, "step": 11386 }, { "epoch": 1.9488276570255008, "grad_norm": 11.039986610412598, "learning_rate": 1.517282217796585e-05, "loss": 0.6608, "step": 11387 }, { "epoch": 1.9489988019852815, "grad_norm": 3.094700813293457, "learning_rate": 1.516130115941627e-05, "loss": 0.2832, "step": 11388 }, { "epoch": 1.9491699469450625, "grad_norm": 
65.62916564941406, "learning_rate": 1.5149780045698768e-05, "loss": 5.9456, "step": 11389 }, { "epoch": 1.9493410919048433, "grad_norm": 15.568326950073242, "learning_rate": 1.5138258843610814e-05, "loss": 1.0278, "step": 11390 }, { "epoch": 1.9495122368646243, "grad_norm": 14.610957145690918, "learning_rate": 1.5126737559949937e-05, "loss": 1.4152, "step": 11391 }, { "epoch": 1.9496833818244053, "grad_norm": 0.3632209897041321, "learning_rate": 1.511521620151371e-05, "loss": 0.103, "step": 11392 }, { "epoch": 1.9498545267841862, "grad_norm": 10.502372741699219, "learning_rate": 1.5103694775099751e-05, "loss": 1.0416, "step": 11393 }, { "epoch": 1.9500256717439672, "grad_norm": 69.8099136352539, "learning_rate": 1.5092173287505722e-05, "loss": 6.4658, "step": 11394 }, { "epoch": 1.9501968167037482, "grad_norm": 8.951812744140625, "learning_rate": 1.5080651745529313e-05, "loss": 0.7347, "step": 11395 }, { "epoch": 1.950367961663529, "grad_norm": 16.769899368286133, "learning_rate": 1.5069130155968259e-05, "loss": 1.1836, "step": 11396 }, { "epoch": 1.95053910662331, "grad_norm": 5.360583305358887, "learning_rate": 1.5057608525620298e-05, "loss": 0.6269, "step": 11397 }, { "epoch": 1.9507102515830907, "grad_norm": 20.55162239074707, "learning_rate": 1.5046086861283228e-05, "loss": 4.6779, "step": 11398 }, { "epoch": 1.9508813965428717, "grad_norm": 5.87799072265625, "learning_rate": 1.5034565169754846e-05, "loss": 0.5737, "step": 11399 }, { "epoch": 1.9510525415026527, "grad_norm": 8.899847984313965, "learning_rate": 1.502304345783296e-05, "loss": 0.7939, "step": 11400 }, { "epoch": 1.9512236864624337, "grad_norm": 7.604985237121582, "learning_rate": 1.50115217323154e-05, "loss": 0.5649, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_nli-pairs_loss": 1.22280752658844, "eval_nli-pairs_runtime": 4.2347, "eval_nli-pairs_samples_per_second": 47.229, "eval_nli-pairs_steps_per_second": 1.653, "eval_sts-test_pearson_cosine": 0.7699975737960316, 
"eval_sts-test_pearson_dot": 0.6197729076613322, "eval_sts-test_pearson_euclidean": 0.7570638862036466, "eval_sts-test_pearson_manhattan": 0.7599535827110853, "eval_sts-test_pearson_max": 0.7699975737960316, "eval_sts-test_spearman_cosine": 0.7757133424395433, "eval_sts-test_spearman_dot": 0.5942990403117303, "eval_sts-test_spearman_euclidean": 0.7458288407323578, "eval_sts-test_spearman_manhattan": 0.7512040918025963, "eval_sts-test_spearman_max": 0.7757133424395433, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_vitaminc-pairs_loss": 0.6488355994224548, "eval_vitaminc-pairs_runtime": 2.71, "eval_vitaminc-pairs_samples_per_second": 73.801, "eval_vitaminc-pairs_steps_per_second": 2.583, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_qnli-contrastive_loss": 1.3747411966323853, "eval_qnli-contrastive_runtime": 0.6236, "eval_qnli-contrastive_samples_per_second": 320.735, "eval_qnli-contrastive_steps_per_second": 11.226, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_scitail-pairs-qa_loss": 0.0833611935377121, "eval_scitail-pairs-qa_runtime": 1.5822, "eval_scitail-pairs-qa_samples_per_second": 126.404, "eval_scitail-pairs-qa_steps_per_second": 4.424, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_scitail-pairs-pos_loss": 0.6010007858276367, "eval_scitail-pairs-pos_runtime": 2.5788, "eval_scitail-pairs-pos_samples_per_second": 77.556, "eval_scitail-pairs-pos_steps_per_second": 2.714, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_xsum-pairs_loss": 0.639953076839447, "eval_xsum-pairs_runtime": 2.6369, "eval_xsum-pairs_samples_per_second": 66.366, "eval_xsum-pairs_steps_per_second": 2.275, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_compression-pairs_loss": 0.18661309778690338, "eval_compression-pairs_runtime": 0.5052, "eval_compression-pairs_samples_per_second": 395.903, "eval_compression-pairs_steps_per_second": 13.857, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_sciq_pairs_loss": 0.4085307717323303, 
"eval_sciq_pairs_runtime": 9.1017, "eval_sciq_pairs_samples_per_second": 21.974, "eval_sciq_pairs_steps_per_second": 0.769, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_qasc_pairs_loss": 5.166599273681641, "eval_qasc_pairs_runtime": 2.6309, "eval_qasc_pairs_samples_per_second": 76.02, "eval_qasc_pairs_steps_per_second": 2.661, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_openbookqa_pairs_loss": 2.410891532897949, "eval_openbookqa_pairs_runtime": 0.6421, "eval_openbookqa_pairs_samples_per_second": 107.456, "eval_openbookqa_pairs_steps_per_second": 4.672, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_msmarco_pairs_loss": 0.8655012249946594, "eval_msmarco_pairs_runtime": 3.9086, "eval_msmarco_pairs_samples_per_second": 51.169, "eval_msmarco_pairs_steps_per_second": 1.791, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_nq_pairs_loss": 1.0393075942993164, "eval_nq_pairs_runtime": 8.5406, "eval_nq_pairs_samples_per_second": 23.418, "eval_nq_pairs_steps_per_second": 0.82, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_trivia_pairs_loss": 1.4714889526367188, "eval_trivia_pairs_runtime": 12.7403, "eval_trivia_pairs_samples_per_second": 15.698, "eval_trivia_pairs_steps_per_second": 0.549, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_quora_pairs_loss": 0.17628449201583862, "eval_quora_pairs_runtime": 1.5751, "eval_quora_pairs_samples_per_second": 126.973, "eval_quora_pairs_steps_per_second": 4.444, "step": 11401 }, { "epoch": 1.9512236864624337, "eval_gooaq_pairs_loss": 0.7246831655502319, "eval_gooaq_pairs_runtime": 2.6396, "eval_gooaq_pairs_samples_per_second": 75.77, "eval_gooaq_pairs_steps_per_second": 2.652, "step": 11401 }, { "epoch": 1.9513948314222147, "grad_norm": 13.76504898071289, "learning_rate": 1.5e-05, "loss": 1.5212, "step": 11402 }, { "epoch": 1.9515659763819957, "grad_norm": 9.34089469909668, "learning_rate": 1.49884782676846e-05, "loss": 0.6495, "step": 11403 }, { "epoch": 1.9517371213417765, "grad_norm": 
12.933805465698242, "learning_rate": 1.497695654216704e-05, "loss": 1.5776, "step": 11404 }, { "epoch": 1.9519082663015574, "grad_norm": 111.4419937133789, "learning_rate": 1.4965434830245154e-05, "loss": 8.3416, "step": 11405 }, { "epoch": 1.9520794112613382, "grad_norm": 18.83938217163086, "learning_rate": 1.4953913138716775e-05, "loss": 1.994, "step": 11406 }, { "epoch": 1.9522505562211192, "grad_norm": 13.376595497131348, "learning_rate": 1.4942391474379706e-05, "loss": 0.9529, "step": 11407 }, { "epoch": 1.9524217011809002, "grad_norm": 8.468413352966309, "learning_rate": 1.4930869844031755e-05, "loss": 0.8211, "step": 11408 }, { "epoch": 1.9525928461406812, "grad_norm": 1.6203304529190063, "learning_rate": 1.4919348254470699e-05, "loss": 0.2416, "step": 11409 }, { "epoch": 1.9527639911004622, "grad_norm": 0.39726293087005615, "learning_rate": 1.490782671249429e-05, "loss": 0.1044, "step": 11410 }, { "epoch": 1.9529351360602432, "grad_norm": 11.79739761352539, "learning_rate": 1.4896305224900262e-05, "loss": 0.9868, "step": 11411 }, { "epoch": 1.953106281020024, "grad_norm": 7.333981990814209, "learning_rate": 1.4884783798486305e-05, "loss": 0.5493, "step": 11412 }, { "epoch": 1.953277425979805, "grad_norm": 3.2601091861724854, "learning_rate": 1.4873262440050065e-05, "loss": 0.2362, "step": 11413 }, { "epoch": 1.9534485709395857, "grad_norm": 7.405086040496826, "learning_rate": 1.4861741156389176e-05, "loss": 0.635, "step": 11414 }, { "epoch": 1.9536197158993667, "grad_norm": 15.983068466186523, "learning_rate": 1.4850219954301236e-05, "loss": 1.9486, "step": 11415 }, { "epoch": 1.9537908608591477, "grad_norm": 14.185470581054688, "learning_rate": 1.4838698840583722e-05, "loss": 1.0546, "step": 11416 }, { "epoch": 1.9539620058189286, "grad_norm": 16.015426635742188, "learning_rate": 1.4827177822034152e-05, "loss": 1.3527, "step": 11417 }, { "epoch": 1.9541331507787096, "grad_norm": 3.8789985179901123, "learning_rate": 1.4815656905449903e-05, "loss": 0.4741, 
"step": 11418 }, { "epoch": 1.9543042957384906, "grad_norm": 3.636272668838501, "learning_rate": 1.4804136097628375e-05, "loss": 0.2047, "step": 11419 }, { "epoch": 1.9544754406982714, "grad_norm": 2.8629751205444336, "learning_rate": 1.4792615405366823e-05, "loss": 0.2125, "step": 11420 }, { "epoch": 1.9546465856580524, "grad_norm": 10.631260871887207, "learning_rate": 1.478109483546251e-05, "loss": 0.7332, "step": 11421 }, { "epoch": 1.9548177306178331, "grad_norm": 52.27153778076172, "learning_rate": 1.4769574394712548e-05, "loss": 1.0386, "step": 11422 }, { "epoch": 1.9549888755776141, "grad_norm": 16.259428024291992, "learning_rate": 1.4758054089914061e-05, "loss": 1.6662, "step": 11423 }, { "epoch": 1.9551600205373951, "grad_norm": 9.222062110900879, "learning_rate": 1.4746533927863997e-05, "loss": 0.7401, "step": 11424 }, { "epoch": 1.9553311654971761, "grad_norm": null, "learning_rate": 1.4746533927863997e-05, "loss": 3.4197, "step": 11425 }, { "epoch": 1.955502310456957, "grad_norm": 12.260978698730469, "learning_rate": 1.4735013915359317e-05, "loss": 0.8092, "step": 11426 }, { "epoch": 1.955673455416738, "grad_norm": 6.977792263031006, "learning_rate": 1.4723494059196796e-05, "loss": 0.6547, "step": 11427 }, { "epoch": 1.9558446003765189, "grad_norm": 2.91377329826355, "learning_rate": 1.4711974366173215e-05, "loss": 0.3489, "step": 11428 }, { "epoch": 1.9560157453362998, "grad_norm": 13.654753684997559, "learning_rate": 1.4700454843085167e-05, "loss": 0.8931, "step": 11429 }, { "epoch": 1.9561868902960806, "grad_norm": 16.902057647705078, "learning_rate": 1.4688935496729228e-05, "loss": 1.6215, "step": 11430 }, { "epoch": 1.9563580352558616, "grad_norm": 29.850019454956055, "learning_rate": 1.4677416333901789e-05, "loss": 5.4715, "step": 11431 }, { "epoch": 1.9565291802156426, "grad_norm": 13.372759819030762, "learning_rate": 1.4665897361399216e-05, "loss": 1.4638, "step": 11432 }, { "epoch": 1.9567003251754236, "grad_norm": 19.199113845825195, 
"learning_rate": 1.4654378586017674e-05, "loss": 2.3407, "step": 11433 }, { "epoch": 1.9568714701352046, "grad_norm": 16.506311416625977, "learning_rate": 1.4642860014553302e-05, "loss": 1.6599, "step": 11434 }, { "epoch": 1.9570426150949856, "grad_norm": 9.993104934692383, "learning_rate": 1.4631341653802032e-05, "loss": 0.8454, "step": 11435 }, { "epoch": 1.9572137600547665, "grad_norm": 56.69374465942383, "learning_rate": 1.4619823510559737e-05, "loss": 7.6287, "step": 11436 }, { "epoch": 1.9573849050145473, "grad_norm": 2.2921462059020996, "learning_rate": 1.4608305591622147e-05, "loss": 0.1066, "step": 11437 }, { "epoch": 1.9575560499743283, "grad_norm": 25.4880313873291, "learning_rate": 1.459678790378484e-05, "loss": 4.7699, "step": 11438 }, { "epoch": 1.957727194934109, "grad_norm": 1.2839221954345703, "learning_rate": 1.458527045384327e-05, "loss": 0.2274, "step": 11439 }, { "epoch": 1.95789833989389, "grad_norm": 13.574799537658691, "learning_rate": 1.4573753248592756e-05, "loss": 1.0131, "step": 11440 }, { "epoch": 1.958069484853671, "grad_norm": 9.477666854858398, "learning_rate": 1.4562236294828455e-05, "loss": 0.6323, "step": 11441 }, { "epoch": 1.958240629813452, "grad_norm": 0.42693617939949036, "learning_rate": 1.4550719599345392e-05, "loss": 0.1007, "step": 11442 }, { "epoch": 1.958411774773233, "grad_norm": 5.813660144805908, "learning_rate": 1.4539203168938434e-05, "loss": 0.5022, "step": 11443 }, { "epoch": 1.958582919733014, "grad_norm": 1.5622849464416504, "learning_rate": 1.4527687010402294e-05, "loss": 0.2419, "step": 11444 }, { "epoch": 1.9587540646927948, "grad_norm": 1.7237932682037354, "learning_rate": 1.451617113053152e-05, "loss": 0.2506, "step": 11445 }, { "epoch": 1.9589252096525758, "grad_norm": 22.354421615600586, "learning_rate": 1.4504655536120502e-05, "loss": 4.5613, "step": 11446 }, { "epoch": 1.9590963546123565, "grad_norm": 10.504575729370117, "learning_rate": 1.4493140233963452e-05, "loss": 0.8333, "step": 11447 }, { 
"epoch": 1.9592674995721375, "grad_norm": 5.0905375480651855, "learning_rate": 1.4481625230854426e-05, "loss": 0.1971, "step": 11448 }, { "epoch": 1.9594386445319185, "grad_norm": 52.00832748413086, "learning_rate": 1.447011053358728e-05, "loss": 5.6469, "step": 11449 }, { "epoch": 1.9596097894916995, "grad_norm": 0.5835732221603394, "learning_rate": 1.4458596148955709e-05, "loss": 0.1067, "step": 11450 }, { "epoch": 1.9597809344514805, "grad_norm": 0.6231808662414551, "learning_rate": 1.4447082083753217e-05, "loss": 0.1469, "step": 11451 }, { "epoch": 1.9599520794112615, "grad_norm": 10.706957817077637, "learning_rate": 1.4435568344773118e-05, "loss": 1.2046, "step": 11452 }, { "epoch": 1.9601232243710422, "grad_norm": 7.006687641143799, "learning_rate": 1.4424054938808537e-05, "loss": 0.5382, "step": 11453 }, { "epoch": 1.9602943693308232, "grad_norm": 2.716679573059082, "learning_rate": 1.44125418726524e-05, "loss": 0.2342, "step": 11454 }, { "epoch": 1.960465514290604, "grad_norm": 2.7693655490875244, "learning_rate": 1.4401029153097437e-05, "loss": 0.2641, "step": 11455 }, { "epoch": 1.960636659250385, "grad_norm": 3.9419169425964355, "learning_rate": 1.4389516786936162e-05, "loss": 0.4563, "step": 11456 }, { "epoch": 1.960807804210166, "grad_norm": 9.503533363342285, "learning_rate": 1.4378004780960882e-05, "loss": 0.7965, "step": 11457 }, { "epoch": 1.960978949169947, "grad_norm": 0.6426921486854553, "learning_rate": 1.4366493141963708e-05, "loss": 0.147, "step": 11458 }, { "epoch": 1.961150094129728, "grad_norm": 10.962170600891113, "learning_rate": 1.4354981876736543e-05, "loss": 0.9281, "step": 11459 }, { "epoch": 1.961321239089509, "grad_norm": 11.467341423034668, "learning_rate": 1.4343470992071012e-05, "loss": 0.7275, "step": 11460 }, { "epoch": 1.9614923840492897, "grad_norm": 12.436102867126465, "learning_rate": 1.4331960494758594e-05, "loss": 1.2291, "step": 11461 }, { "epoch": 1.9616635290090707, "grad_norm": 0.9076922535896301, "learning_rate": 
1.4320450391590458e-05, "loss": 0.1904, "step": 11462 }, { "epoch": 1.9618346739688515, "grad_norm": 0.47052261233329773, "learning_rate": 1.4308940689357624e-05, "loss": 0.1077, "step": 11463 }, { "epoch": 1.9620058189286325, "grad_norm": 1.9374362230300903, "learning_rate": 1.4297431394850793e-05, "loss": 0.2078, "step": 11464 }, { "epoch": 1.9621769638884135, "grad_norm": 15.643912315368652, "learning_rate": 1.4285922514860508e-05, "loss": 1.9105, "step": 11465 }, { "epoch": 1.9623481088481944, "grad_norm": 10.599296569824219, "learning_rate": 1.4274414056176978e-05, "loss": 0.984, "step": 11466 }, { "epoch": 1.9625192538079754, "grad_norm": 10.101789474487305, "learning_rate": 1.4262906025590258e-05, "loss": 0.7062, "step": 11467 }, { "epoch": 1.9626903987677564, "grad_norm": 1.5755131244659424, "learning_rate": 1.425139842989006e-05, "loss": 0.1809, "step": 11468 }, { "epoch": 1.9628615437275372, "grad_norm": 18.65258026123047, "learning_rate": 1.4239891275865923e-05, "loss": 1.6689, "step": 11469 }, { "epoch": 1.9630326886873182, "grad_norm": 0.30211278796195984, "learning_rate": 1.4228384570307047e-05, "loss": 0.1017, "step": 11470 }, { "epoch": 1.963203833647099, "grad_norm": 18.631153106689453, "learning_rate": 1.4216878320002445e-05, "loss": 1.2364, "step": 11471 }, { "epoch": 1.96337497860688, "grad_norm": 10.039510726928711, "learning_rate": 1.4205372531740779e-05, "loss": 0.8694, "step": 11472 }, { "epoch": 1.963546123566661, "grad_norm": 137.0620574951172, "learning_rate": 1.4193867212310522e-05, "loss": 1.4276, "step": 11473 }, { "epoch": 1.963717268526442, "grad_norm": 12.923408508300781, "learning_rate": 1.418236236849979e-05, "loss": 1.0573, "step": 11474 }, { "epoch": 1.963888413486223, "grad_norm": 12.115560531616211, "learning_rate": 1.41708580070965e-05, "loss": 0.3295, "step": 11475 }, { "epoch": 1.9640595584460039, "grad_norm": 0.3599986433982849, "learning_rate": 1.4159354134888199e-05, "loss": 0.0974, "step": 11476 }, { "epoch": 
1.9642307034057847, "grad_norm": 11.545021057128906, "learning_rate": 1.414785075866223e-05, "loss": 0.7663, "step": 11477 }, { "epoch": 1.9644018483655656, "grad_norm": 8.929696083068848, "learning_rate": 1.4136347885205565e-05, "loss": 0.5052, "step": 11478 }, { "epoch": 1.9645729933253464, "grad_norm": 2.842339038848877, "learning_rate": 1.4124845521304932e-05, "loss": 0.2677, "step": 11479 }, { "epoch": 1.9647441382851274, "grad_norm": 17.621807098388672, "learning_rate": 1.4113343673746756e-05, "loss": 1.695, "step": 11480 }, { "epoch": 1.9649152832449084, "grad_norm": 10.730348587036133, "learning_rate": 1.4101842349317137e-05, "loss": 0.7905, "step": 11481 }, { "epoch": 1.9650864282046894, "grad_norm": 2.169240713119507, "learning_rate": 1.4090341554801866e-05, "loss": 0.1491, "step": 11482 }, { "epoch": 1.9652575731644704, "grad_norm": 87.90546417236328, "learning_rate": 1.4078841296986431e-05, "loss": 6.8993, "step": 11483 }, { "epoch": 1.9654287181242514, "grad_norm": 6.352499485015869, "learning_rate": 1.4067341582656006e-05, "loss": 0.4028, "step": 11484 }, { "epoch": 1.9655998630840323, "grad_norm": 19.394460678100586, "learning_rate": 1.4055842418595432e-05, "loss": 0.4328, "step": 11485 }, { "epoch": 1.965771008043813, "grad_norm": 11.65325927734375, "learning_rate": 1.4044343811589241e-05, "loss": 0.9974, "step": 11486 }, { "epoch": 1.965942153003594, "grad_norm": 1.1428706645965576, "learning_rate": 1.4032845768421624e-05, "loss": 0.18, "step": 11487 }, { "epoch": 1.9661132979633749, "grad_norm": 6.143984794616699, "learning_rate": 1.4021348295876447e-05, "loss": 0.7042, "step": 11488 }, { "epoch": 1.9662844429231559, "grad_norm": 40.40150833129883, "learning_rate": 1.4009851400737227e-05, "loss": 6.2267, "step": 11489 }, { "epoch": 1.9664555878829368, "grad_norm": 0.3291659951210022, "learning_rate": 1.399835508978716e-05, "loss": 0.1044, "step": 11490 }, { "epoch": 1.9666267328427178, "grad_norm": 58.467220306396484, "learning_rate": 
1.398685936980908e-05, "loss": 6.4263, "step": 11491 }, { "epoch": 1.9667978778024988, "grad_norm": 12.200599670410156, "learning_rate": 1.397536424758548e-05, "loss": 0.8335, "step": 11492 }, { "epoch": 1.9669690227622798, "grad_norm": 5.337753772735596, "learning_rate": 1.3963869729898501e-05, "loss": 0.3995, "step": 11493 }, { "epoch": 1.9671401677220606, "grad_norm": 23.909313201904297, "learning_rate": 1.3952375823529925e-05, "loss": 3.703, "step": 11494 }, { "epoch": 1.9673113126818416, "grad_norm": 1.4301708936691284, "learning_rate": 1.3940882535261173e-05, "loss": 0.173, "step": 11495 }, { "epoch": 1.9674824576416223, "grad_norm": 0.914540708065033, "learning_rate": 1.3929389871873306e-05, "loss": 0.1628, "step": 11496 }, { "epoch": 1.9676536026014033, "grad_norm": 0.3504679501056671, "learning_rate": 1.3917897840147006e-05, "loss": 0.0991, "step": 11497 }, { "epoch": 1.9678247475611843, "grad_norm": 15.854573249816895, "learning_rate": 1.3906406446862592e-05, "loss": 0.9748, "step": 11498 }, { "epoch": 1.9679958925209653, "grad_norm": 4.362349510192871, "learning_rate": 1.3894915698800004e-05, "loss": 0.281, "step": 11499 }, { "epoch": 1.9681670374807463, "grad_norm": 1.1514097452163696, "learning_rate": 1.3883425602738787e-05, "loss": 0.1827, "step": 11500 }, { "epoch": 1.9683381824405273, "grad_norm": 15.568102836608887, "learning_rate": 1.3871936165458129e-05, "loss": 1.804, "step": 11501 }, { "epoch": 1.968509327400308, "grad_norm": 4.195194244384766, "learning_rate": 1.3860447393736834e-05, "loss": 0.4532, "step": 11502 }, { "epoch": 1.968680472360089, "grad_norm": 15.161959648132324, "learning_rate": 1.3848959294353253e-05, "loss": 1.5057, "step": 11503 }, { "epoch": 1.9688516173198698, "grad_norm": 11.962011337280273, "learning_rate": 1.3837471874085428e-05, "loss": 0.7762, "step": 11504 }, { "epoch": 1.9690227622796508, "grad_norm": 1.292962908744812, "learning_rate": 1.3825985139710905e-05, "loss": 0.1937, "step": 11505 }, { "epoch": 
1.9691939072394318, "grad_norm": 58.722984313964844, "learning_rate": 1.3814499098006918e-05, "loss": 7.3964, "step": 11506 }, { "epoch": 1.9693650521992128, "grad_norm": 1.6295530796051025, "learning_rate": 1.3803013755750208e-05, "loss": 0.1636, "step": 11507 }, { "epoch": 1.9695361971589938, "grad_norm": 10.765946388244629, "learning_rate": 1.3791529119717186e-05, "loss": 0.704, "step": 11508 }, { "epoch": 1.9697073421187747, "grad_norm": 1.1291992664337158, "learning_rate": 1.3780045196683764e-05, "loss": 0.2011, "step": 11509 }, { "epoch": 1.9698784870785555, "grad_norm": 6.461856365203857, "learning_rate": 1.3768561993425512e-05, "loss": 0.2461, "step": 11510 }, { "epoch": 1.9700496320383365, "grad_norm": 10.1381196975708, "learning_rate": 1.3757079516717496e-05, "loss": 0.7894, "step": 11511 }, { "epoch": 1.9702207769981173, "grad_norm": 0.5924562215805054, "learning_rate": 1.3745597773334443e-05, "loss": 0.1098, "step": 11512 }, { "epoch": 1.9703919219578983, "grad_norm": 0.4203587472438812, "learning_rate": 1.3734116770050548e-05, "loss": 0.1023, "step": 11513 }, { "epoch": 1.9705630669176792, "grad_norm": 0.9006542563438416, "learning_rate": 1.3722636513639664e-05, "loss": 0.128, "step": 11514 }, { "epoch": 1.9707342118774602, "grad_norm": 11.079994201660156, "learning_rate": 1.371115701087512e-05, "loss": 0.7969, "step": 11515 }, { "epoch": 1.9709053568372412, "grad_norm": 26.592166900634766, "learning_rate": 1.3699678268529877e-05, "loss": 5.6968, "step": 11516 }, { "epoch": 1.9710765017970222, "grad_norm": 21.089893341064453, "learning_rate": 1.3688200293376372e-05, "loss": 1.9548, "step": 11517 }, { "epoch": 1.971247646756803, "grad_norm": 4.493779182434082, "learning_rate": 1.367672309218667e-05, "loss": 0.2545, "step": 11518 }, { "epoch": 1.971418791716584, "grad_norm": 6.074472904205322, "learning_rate": 1.3665246671732296e-05, "loss": 0.503, "step": 11519 }, { "epoch": 1.9715899366763647, "grad_norm": 13.482118606567383, "learning_rate": 
1.3653771038784396e-05, "loss": 1.4865, "step": 11520 }, { "epoch": 1.9717610816361457, "grad_norm": 0.33575913310050964, "learning_rate": 1.3642296200113554e-05, "loss": 0.0933, "step": 11521 }, { "epoch": 1.9719322265959267, "grad_norm": 0.2952156066894531, "learning_rate": 1.3630822162489995e-05, "loss": 0.0929, "step": 11522 }, { "epoch": 1.9721033715557077, "grad_norm": 8.773137092590332, "learning_rate": 1.36193489326834e-05, "loss": 0.7757, "step": 11523 }, { "epoch": 1.9722745165154887, "grad_norm": 2.717783212661743, "learning_rate": 1.3607876517462993e-05, "loss": 0.2154, "step": 11524 }, { "epoch": 1.9724456614752697, "grad_norm": 8.850228309631348, "learning_rate": 1.3596404923597514e-05, "loss": 0.6986, "step": 11525 }, { "epoch": 1.9726168064350504, "grad_norm": 9.164724349975586, "learning_rate": 1.358493415785522e-05, "loss": 0.7334, "step": 11526 }, { "epoch": 1.9727879513948314, "grad_norm": 0.8617563247680664, "learning_rate": 1.357346422700388e-05, "loss": 0.1828, "step": 11527 }, { "epoch": 1.9729590963546122, "grad_norm": 20.47298812866211, "learning_rate": 1.3561995137810771e-05, "loss": 2.0894, "step": 11528 }, { "epoch": 1.9731302413143932, "grad_norm": 2.090559959411621, "learning_rate": 1.3550526897042677e-05, "loss": 0.1928, "step": 11529 }, { "epoch": 1.9733013862741742, "grad_norm": 10.984954833984375, "learning_rate": 1.3539059511465868e-05, "loss": 0.9743, "step": 11530 }, { "epoch": 1.9734725312339552, "grad_norm": 38.830291748046875, "learning_rate": 1.3527592987846124e-05, "loss": 5.6217, "step": 11531 }, { "epoch": 1.9736436761937362, "grad_norm": 5.7370076179504395, "learning_rate": 1.3516127332948709e-05, "loss": 0.5231, "step": 11532 }, { "epoch": 1.9738148211535171, "grad_norm": 8.704179763793945, "learning_rate": 1.350466255353838e-05, "loss": 0.712, "step": 11533 }, { "epoch": 1.9739859661132981, "grad_norm": 6.740230083465576, "learning_rate": 1.349319865637937e-05, "loss": 0.8736, "step": 11534 }, { "epoch": 
1.974157111073079, "grad_norm": 17.918611526489258, "learning_rate": 1.3481735648235398e-05, "loss": 1.5271, "step": 11535 }, { "epoch": 1.9743282560328599, "grad_norm": 3.665757894515991, "learning_rate": 1.3470273535869658e-05, "loss": 0.3672, "step": 11536 }, { "epoch": 1.9744994009926407, "grad_norm": 14.196667671203613, "learning_rate": 1.3458812326044806e-05, "loss": 0.7985, "step": 11537 }, { "epoch": 1.9746705459524216, "grad_norm": 12.327401161193848, "learning_rate": 1.3447352025522978e-05, "loss": 0.8136, "step": 11538 }, { "epoch": 1.9748416909122026, "grad_norm": 5.919031143188477, "learning_rate": 1.3435892641065767e-05, "loss": 0.5644, "step": 11539 }, { "epoch": 1.9750128358719836, "grad_norm": 12.566990852355957, "learning_rate": 1.3424434179434227e-05, "loss": 1.0637, "step": 11540 }, { "epoch": 1.9751839808317646, "grad_norm": 0.9902839660644531, "learning_rate": 1.341297664738887e-05, "loss": 0.1624, "step": 11541 }, { "epoch": 1.9753551257915456, "grad_norm": 3.0829524993896484, "learning_rate": 1.3401520051689654e-05, "loss": 0.3694, "step": 11542 }, { "epoch": 1.9755262707513264, "grad_norm": 7.033138275146484, "learning_rate": 1.3390064399095977e-05, "loss": 0.3809, "step": 11543 }, { "epoch": 1.9756974157111074, "grad_norm": 19.863101959228516, "learning_rate": 1.3378609696366705e-05, "loss": 2.0019, "step": 11544 }, { "epoch": 1.9758685606708881, "grad_norm": 53.59492874145508, "learning_rate": 1.3367155950260148e-05, "loss": 6.1528, "step": 11545 }, { "epoch": 1.976039705630669, "grad_norm": 3.55564022064209, "learning_rate": 1.3355703167533989e-05, "loss": 0.2282, "step": 11546 }, { "epoch": 1.97621085059045, "grad_norm": 16.635129928588867, "learning_rate": 1.3344251354945433e-05, "loss": 1.7731, "step": 11547 }, { "epoch": 1.976381995550231, "grad_norm": 10.41441822052002, "learning_rate": 1.3332800519251021e-05, "loss": 0.817, "step": 11548 }, { "epoch": 1.976553140510012, "grad_norm": 1.4082058668136597, "learning_rate": 
1.3321350667206808e-05, "loss": 0.1766, "step": 11549 }, { "epoch": 1.976724285469793, "grad_norm": 23.87366485595703, "learning_rate": 1.330990180556818e-05, "loss": 2.1703, "step": 11550 }, { "epoch": 1.9768954304295738, "grad_norm": 15.4796781539917, "learning_rate": 1.3298453941090023e-05, "loss": 1.1762, "step": 11551 }, { "epoch": 1.9770665753893548, "grad_norm": 68.19232177734375, "learning_rate": 1.3287007080526557e-05, "loss": 0.6034, "step": 11552 }, { "epoch": 1.9772377203491356, "grad_norm": 5.2124433517456055, "learning_rate": 1.3275561230631488e-05, "loss": 0.1693, "step": 11553 }, { "epoch": 1.9774088653089166, "grad_norm": 19.899372100830078, "learning_rate": 1.326411639815784e-05, "loss": 1.8894, "step": 11554 }, { "epoch": 1.9775800102686976, "grad_norm": 1.0012872219085693, "learning_rate": 1.325267258985812e-05, "loss": 0.1526, "step": 11555 }, { "epoch": 1.9777511552284786, "grad_norm": 5.495520114898682, "learning_rate": 1.3241229812484153e-05, "loss": 0.4624, "step": 11556 }, { "epoch": 1.9779223001882595, "grad_norm": 7.681595802307129, "learning_rate": 1.322978807278724e-05, "loss": 0.6436, "step": 11557 }, { "epoch": 1.9780934451480405, "grad_norm": 8.36526870727539, "learning_rate": 1.3218347377517979e-05, "loss": 0.7442, "step": 11558 }, { "epoch": 1.9782645901078213, "grad_norm": 14.942583084106445, "learning_rate": 1.3206907733426442e-05, "loss": 1.5199, "step": 11559 }, { "epoch": 1.9784357350676023, "grad_norm": 16.872425079345703, "learning_rate": 1.3195469147261987e-05, "loss": 1.8179, "step": 11560 }, { "epoch": 1.978606880027383, "grad_norm": 7.6675286293029785, "learning_rate": 1.318403162577345e-05, "loss": 0.6611, "step": 11561 }, { "epoch": 1.978778024987164, "grad_norm": 9.60794734954834, "learning_rate": 1.3172595175708934e-05, "loss": 1.0265, "step": 11562 }, { "epoch": 1.978949169946945, "grad_norm": 7.175992488861084, "learning_rate": 1.3161159803816006e-05, "loss": 0.9576, "step": 11563 }, { "epoch": 1.979120314906726, 
"grad_norm": 14.204499244689941, "learning_rate": 1.3149725516841504e-05, "loss": 1.267, "step": 11564 }, { "epoch": 1.979291459866507, "grad_norm": 6.134832382202148, "learning_rate": 1.3138292321531724e-05, "loss": 0.3137, "step": 11565 }, { "epoch": 1.979462604826288, "grad_norm": 8.256912231445312, "learning_rate": 1.3126860224632245e-05, "loss": 0.5901, "step": 11566 }, { "epoch": 1.9796337497860688, "grad_norm": 1.058651328086853, "learning_rate": 1.3115429232888025e-05, "loss": 0.1966, "step": 11567 }, { "epoch": 1.9798048947458498, "grad_norm": 6.951485633850098, "learning_rate": 1.3103999353043362e-05, "loss": 0.4772, "step": 11568 }, { "epoch": 1.9799760397056305, "grad_norm": 1.1561604738235474, "learning_rate": 1.3092570591841909e-05, "loss": 0.1624, "step": 11569 }, { "epoch": 1.9801471846654115, "grad_norm": 2.619316339492798, "learning_rate": 1.3081142956026647e-05, "loss": 0.2686, "step": 11570 }, { "epoch": 1.9803183296251925, "grad_norm": 1.3069579601287842, "learning_rate": 1.3069716452339897e-05, "loss": 0.124, "step": 11571 }, { "epoch": 1.9804894745849735, "grad_norm": 15.08419132232666, "learning_rate": 1.3058291087523315e-05, "loss": 1.5691, "step": 11572 }, { "epoch": 1.9806606195447545, "grad_norm": 6.9841718673706055, "learning_rate": 1.3046866868317883e-05, "loss": 0.8101, "step": 11573 }, { "epoch": 1.9808317645045355, "grad_norm": 15.11148738861084, "learning_rate": 1.303544380146391e-05, "loss": 1.4234, "step": 11574 }, { "epoch": 1.9810029094643162, "grad_norm": 0.31231775879859924, "learning_rate": 1.3024021893701019e-05, "loss": 0.0993, "step": 11575 }, { "epoch": 1.9811740544240972, "grad_norm": 61.65127182006836, "learning_rate": 1.3012601151768157e-05, "loss": 6.5006, "step": 11576 }, { "epoch": 1.981345199383878, "grad_norm": 3.11611008644104, "learning_rate": 1.3001181582403573e-05, "loss": 0.2488, "step": 11577 }, { "epoch": 1.981516344343659, "grad_norm": 16.134965896606445, "learning_rate": 1.298976319234483e-05, "loss": 
1.1805, "step": 11578 }, { "epoch": 1.98168748930344, "grad_norm": 20.658838272094727, "learning_rate": 1.29783459883288e-05, "loss": 2.0056, "step": 11579 }, { "epoch": 1.981858634263221, "grad_norm": 3.786620616912842, "learning_rate": 1.296692997709165e-05, "loss": 0.5104, "step": 11580 }, { "epoch": 1.982029779223002, "grad_norm": 15.750129699707031, "learning_rate": 1.2955515165368839e-05, "loss": 1.1992, "step": 11581 }, { "epoch": 1.982200924182783, "grad_norm": 0.40533846616744995, "learning_rate": 1.2944101559895124e-05, "loss": 0.0961, "step": 11582 }, { "epoch": 1.9823720691425637, "grad_norm": 14.998530387878418, "learning_rate": 1.2932689167404553e-05, "loss": 1.18, "step": 11583 }, { "epoch": 1.9825432141023447, "grad_norm": 12.795882225036621, "learning_rate": 1.2921277994630456e-05, "loss": 1.0451, "step": 11584 }, { "epoch": 1.9827143590621257, "grad_norm": 4.228010654449463, "learning_rate": 1.2909868048305434e-05, "loss": 0.365, "step": 11585 }, { "epoch": 1.9828855040219064, "grad_norm": 8.487818717956543, "learning_rate": 1.2898459335161369e-05, "loss": 0.6717, "step": 11586 }, { "epoch": 1.9830566489816874, "grad_norm": 7.513432025909424, "learning_rate": 1.2887051861929424e-05, "loss": 1.0106, "step": 11587 }, { "epoch": 1.9832277939414684, "grad_norm": 7.560845851898193, "learning_rate": 1.287564563534005e-05, "loss": 0.5746, "step": 11588 }, { "epoch": 1.9833989389012494, "grad_norm": 20.59306526184082, "learning_rate": 1.2864240662122893e-05, "loss": 2.1207, "step": 11589 }, { "epoch": 1.9835700838610304, "grad_norm": 30.73232078552246, "learning_rate": 1.2852836949006946e-05, "loss": 0.75, "step": 11590 }, { "epoch": 1.9837412288208114, "grad_norm": 8.641944885253906, "learning_rate": 1.2841434502720378e-05, "loss": 0.6437, "step": 11591 }, { "epoch": 1.9839123737805922, "grad_norm": 26.409238815307617, "learning_rate": 1.2830033329990692e-05, "loss": 1.4392, "step": 11592 }, { "epoch": 1.9840835187403731, "grad_norm": 14.037877082824707, 
"learning_rate": 1.2818633437544549e-05, "loss": 1.262, "step": 11593 }, { "epoch": 1.984254663700154, "grad_norm": 5.6162638664245605, "learning_rate": 1.280723483210795e-05, "loss": 0.3071, "step": 11594 }, { "epoch": 1.984425808659935, "grad_norm": 18.443252563476562, "learning_rate": 1.2795837520406045e-05, "loss": 2.2217, "step": 11595 }, { "epoch": 1.9845969536197159, "grad_norm": 5.872466087341309, "learning_rate": 1.2784441509163303e-05, "loss": 0.5691, "step": 11596 }, { "epoch": 1.9847680985794969, "grad_norm": 13.880841255187988, "learning_rate": 1.2773046805103353e-05, "loss": 1.1013, "step": 11597 }, { "epoch": 1.9849392435392779, "grad_norm": 15.525230407714844, "learning_rate": 1.2761653414949125e-05, "loss": 1.4786, "step": 11598 }, { "epoch": 1.9851103884990589, "grad_norm": 9.494690895080566, "learning_rate": 1.2750261345422696e-05, "loss": 0.5867, "step": 11599 }, { "epoch": 1.9852815334588396, "grad_norm": 14.745691299438477, "learning_rate": 1.2738870603245446e-05, "loss": 1.3649, "step": 11600 }, { "epoch": 1.9854526784186206, "grad_norm": 8.521980285644531, "learning_rate": 1.2727481195137886e-05, "loss": 0.5255, "step": 11601 }, { "epoch": 1.9856238233784014, "grad_norm": 1.3374757766723633, "learning_rate": 1.2716093127819826e-05, "loss": 0.1822, "step": 11602 }, { "epoch": 1.9857949683381824, "grad_norm": 7.335762023925781, "learning_rate": 1.2704706408010203e-05, "loss": 0.4352, "step": 11603 }, { "epoch": 1.9859661132979634, "grad_norm": 7.295063495635986, "learning_rate": 1.2693321042427235e-05, "loss": 0.5217, "step": 11604 }, { "epoch": 1.9861372582577443, "grad_norm": 5.531447887420654, "learning_rate": 1.2681937037788272e-05, "loss": 0.6134, "step": 11605 }, { "epoch": 1.9863084032175253, "grad_norm": 21.078895568847656, "learning_rate": 1.2670554400809925e-05, "loss": 1.7649, "step": 11606 }, { "epoch": 1.9864795481773063, "grad_norm": 1.5643624067306519, "learning_rate": 1.2659173138207923e-05, "loss": 0.1805, "step": 11607 }, { 
"epoch": 1.986650693137087, "grad_norm": 1.5933406352996826, "learning_rate": 1.2647793256697277e-05, "loss": 0.1731, "step": 11608 }, { "epoch": 1.986821838096868, "grad_norm": 12.710850715637207, "learning_rate": 1.2636414762992115e-05, "loss": 0.7374, "step": 11609 }, { "epoch": 1.9869929830566488, "grad_norm": 1.8667336702346802, "learning_rate": 1.2625037663805761e-05, "loss": 0.3301, "step": 11610 }, { "epoch": 1.9871641280164298, "grad_norm": 19.817913055419922, "learning_rate": 1.2613661965850722e-05, "loss": 2.8407, "step": 11611 }, { "epoch": 1.9873352729762108, "grad_norm": 18.864486694335938, "learning_rate": 1.2602287675838679e-05, "loss": 2.1319, "step": 11612 }, { "epoch": 1.9875064179359918, "grad_norm": 14.335698127746582, "learning_rate": 1.2590914800480476e-05, "loss": 1.34, "step": 11613 }, { "epoch": 1.9876775628957728, "grad_norm": 5.743200302124023, "learning_rate": 1.2579543346486132e-05, "loss": 0.5998, "step": 11614 }, { "epoch": 1.9878487078555538, "grad_norm": 6.288778781890869, "learning_rate": 1.2568173320564815e-05, "loss": 0.5301, "step": 11615 }, { "epoch": 1.9880198528153346, "grad_norm": 2.5157384872436523, "learning_rate": 1.2556804729424863e-05, "loss": 0.5079, "step": 11616 }, { "epoch": 1.9881909977751155, "grad_norm": 1.6363633871078491, "learning_rate": 1.2545437579773762e-05, "loss": 0.1657, "step": 11617 }, { "epoch": 1.9883621427348963, "grad_norm": 9.711610794067383, "learning_rate": 1.2534071878318143e-05, "loss": 0.6944, "step": 11618 }, { "epoch": 1.9885332876946773, "grad_norm": 17.18990135192871, "learning_rate": 1.252270763176379e-05, "loss": 1.6218, "step": 11619 }, { "epoch": 1.9887044326544583, "grad_norm": 1.9120348691940308, "learning_rate": 1.2511344846815621e-05, "loss": 0.1776, "step": 11620 }, { "epoch": 1.9888755776142393, "grad_norm": 0.8377760052680969, "learning_rate": 1.24999835301777e-05, "loss": 0.1273, "step": 11621 }, { "epoch": 1.9890467225740203, "grad_norm": 7.121527194976807, "learning_rate": 
1.248862368855322e-05, "loss": 0.7134, "step": 11622 }, { "epoch": 1.9892178675338013, "grad_norm": 1.9628398418426514, "learning_rate": 1.2477265328644505e-05, "loss": 0.1895, "step": 11623 }, { "epoch": 1.989389012493582, "grad_norm": 2.5684893131256104, "learning_rate": 1.2465908457153006e-05, "loss": 0.2118, "step": 11624 }, { "epoch": 1.989560157453363, "grad_norm": 18.3275203704834, "learning_rate": 1.245455308077929e-05, "loss": 1.6228, "step": 11625 }, { "epoch": 1.9897313024131438, "grad_norm": 12.81849479675293, "learning_rate": 1.2443199206223046e-05, "loss": 0.8738, "step": 11626 }, { "epoch": 1.9899024473729248, "grad_norm": 2.7395644187927246, "learning_rate": 1.243184684018308e-05, "loss": 0.2161, "step": 11627 }, { "epoch": 1.9900735923327058, "grad_norm": 5.60601806640625, "learning_rate": 1.2420495989357304e-05, "loss": 0.5329, "step": 11628 }, { "epoch": 1.9902447372924867, "grad_norm": 6.362064361572266, "learning_rate": 1.2409146660442723e-05, "loss": 0.5135, "step": 11629 }, { "epoch": 1.9904158822522677, "grad_norm": 5.805427551269531, "learning_rate": 1.2397798860135474e-05, "loss": 0.5589, "step": 11630 }, { "epoch": 1.9905870272120487, "grad_norm": 0.758171796798706, "learning_rate": 1.2386452595130793e-05, "loss": 0.1172, "step": 11631 }, { "epoch": 1.9907581721718295, "grad_norm": 62.83370590209961, "learning_rate": 1.2375107872122955e-05, "loss": 7.5544, "step": 11632 }, { "epoch": 1.9909293171316105, "grad_norm": 13.032822608947754, "learning_rate": 1.2363764697805402e-05, "loss": 0.9325, "step": 11633 }, { "epoch": 1.9911004620913912, "grad_norm": 25.193408966064453, "learning_rate": 1.2352423078870584e-05, "loss": 5.196, "step": 11634 }, { "epoch": 1.9912716070511722, "grad_norm": 10.469444274902344, "learning_rate": 1.234108302201011e-05, "loss": 0.9742, "step": 11635 }, { "epoch": 1.9914427520109532, "grad_norm": 11.19771671295166, "learning_rate": 1.2329744533914596e-05, "loss": 0.8299, "step": 11636 }, { "epoch": 
1.9916138969707342, "grad_norm": 18.783218383789062, "learning_rate": 1.2318407621273801e-05, "loss": 2.5522, "step": 11637 }, { "epoch": 1.9917850419305152, "grad_norm": 14.585447311401367, "learning_rate": 1.2307072290776485e-05, "loss": 1.3024, "step": 11638 }, { "epoch": 1.9919561868902962, "grad_norm": 3.909926414489746, "learning_rate": 1.229573854911055e-05, "loss": 0.292, "step": 11639 }, { "epoch": 1.9921273318500772, "grad_norm": 16.434045791625977, "learning_rate": 1.2284406402962877e-05, "loss": 1.6737, "step": 11640 }, { "epoch": 1.992298476809858, "grad_norm": 1.8504114151000977, "learning_rate": 1.2273075859019495e-05, "loss": 0.1983, "step": 11641 }, { "epoch": 1.992469621769639, "grad_norm": 16.672513961791992, "learning_rate": 1.2261746923965395e-05, "loss": 1.8585, "step": 11642 }, { "epoch": 1.9926407667294197, "grad_norm": 4.540879726409912, "learning_rate": 1.2250419604484708e-05, "loss": 0.4141, "step": 11643 }, { "epoch": 1.9928119116892007, "grad_norm": 22.639528274536133, "learning_rate": 1.223909390726053e-05, "loss": 0.7548, "step": 11644 }, { "epoch": 1.9929830566489817, "grad_norm": 0.8706194758415222, "learning_rate": 1.2227769838975079e-05, "loss": 0.1677, "step": 11645 }, { "epoch": 1.9931542016087627, "grad_norm": 4.138914108276367, "learning_rate": 1.221644740630953e-05, "loss": 0.4252, "step": 11646 }, { "epoch": 1.9933253465685437, "grad_norm": 1.858681082725525, "learning_rate": 1.220512661594418e-05, "loss": 0.2048, "step": 11647 }, { "epoch": 1.9934964915283246, "grad_norm": 15.731382369995117, "learning_rate": 1.2193807474558268e-05, "loss": 1.2326, "step": 11648 }, { "epoch": 1.9936676364881054, "grad_norm": 12.55146598815918, "learning_rate": 1.2182489988830151e-05, "loss": 1.0814, "step": 11649 }, { "epoch": 1.9938387814478864, "grad_norm": 9.507720947265625, "learning_rate": 1.2171174165437104e-05, "loss": 0.7314, "step": 11650 }, { "epoch": 1.9940099264076672, "grad_norm": 2.034435510635376, "learning_rate": 
1.215986001105553e-05, "loss": 0.2096, "step": 11651 }, { "epoch": 1.9941810713674482, "grad_norm": 0.8830540180206299, "learning_rate": 1.2148547532360783e-05, "loss": 0.1372, "step": 11652 }, { "epoch": 1.9943522163272291, "grad_norm": 8.764945030212402, "learning_rate": 1.2137236736027236e-05, "loss": 0.6025, "step": 11653 }, { "epoch": 1.9945233612870101, "grad_norm": 1.085350751876831, "learning_rate": 1.2125927628728278e-05, "loss": 0.1797, "step": 11654 }, { "epoch": 1.9946945062467911, "grad_norm": 1.3240504264831543, "learning_rate": 1.2114620217136302e-05, "loss": 0.2117, "step": 11655 }, { "epoch": 1.994865651206572, "grad_norm": 22.144235610961914, "learning_rate": 1.2103314507922697e-05, "loss": 1.5402, "step": 11656 }, { "epoch": 1.9950367961663529, "grad_norm": 47.04197311401367, "learning_rate": 1.2092010507757849e-05, "loss": 5.9088, "step": 11657 }, { "epoch": 1.9952079411261339, "grad_norm": 0.9703665971755981, "learning_rate": 1.2080708223311127e-05, "loss": 0.1541, "step": 11658 }, { "epoch": 1.9953790860859146, "grad_norm": 3.0606601238250732, "learning_rate": 1.2069407661250903e-05, "loss": 0.3748, "step": 11659 }, { "epoch": 1.9955502310456956, "grad_norm": 1.6897790431976318, "learning_rate": 1.2058108828244524e-05, "loss": 0.1971, "step": 11660 }, { "epoch": 1.9957213760054766, "grad_norm": 47.40699005126953, "learning_rate": 1.204681173095832e-05, "loss": 1.0008, "step": 11661 }, { "epoch": 1.9958925209652576, "grad_norm": 23.223684310913086, "learning_rate": 1.2035516376057591e-05, "loss": 5.1575, "step": 11662 }, { "epoch": 1.9960636659250386, "grad_norm": 29.95488166809082, "learning_rate": 1.2024222770206614e-05, "loss": 5.1365, "step": 11663 }, { "epoch": 1.9962348108848196, "grad_norm": 10.282565116882324, "learning_rate": 1.2012930920068638e-05, "loss": 0.7617, "step": 11664 }, { "epoch": 1.9964059558446003, "grad_norm": 5.446146488189697, "learning_rate": 1.2001640832305872e-05, "loss": 0.3382, "step": 11665 }, { "epoch": 
1.9965771008043813, "grad_norm": 9.002534866333008, "learning_rate": 1.1990352513579476e-05, "loss": 0.684, "step": 11666 }, { "epoch": 1.996748245764162, "grad_norm": 16.225831985473633, "learning_rate": 1.197906597054958e-05, "loss": 1.2218, "step": 11667 }, { "epoch": 1.996919390723943, "grad_norm": 4.037108421325684, "learning_rate": 1.1967781209875261e-05, "loss": 0.1677, "step": 11668 }, { "epoch": 1.997090535683724, "grad_norm": 5.418506622314453, "learning_rate": 1.1956498238214543e-05, "loss": 0.4381, "step": 11669 }, { "epoch": 1.997261680643505, "grad_norm": 24.53140640258789, "learning_rate": 1.19452170622244e-05, "loss": 4.3996, "step": 11670 }, { "epoch": 1.997432825603286, "grad_norm": 0.47869163751602173, "learning_rate": 1.193393768856074e-05, "loss": 0.0981, "step": 11671 }, { "epoch": 1.997603970563067, "grad_norm": 11.55920124053955, "learning_rate": 1.19226601238784e-05, "loss": 0.8737, "step": 11672 }, { "epoch": 1.9977751155228478, "grad_norm": 8.690323829650879, "learning_rate": 1.191138437483117e-05, "loss": 0.4816, "step": 11673 }, { "epoch": 1.9979462604826288, "grad_norm": 2.1394011974334717, "learning_rate": 1.1900110448071781e-05, "loss": 0.1605, "step": 11674 }, { "epoch": 1.9981174054424096, "grad_norm": 6.5807671546936035, "learning_rate": 1.1888838350251825e-05, "loss": 0.4129, "step": 11675 }, { "epoch": 1.9982885504021906, "grad_norm": 7.3815178871154785, "learning_rate": 1.1877568088021896e-05, "loss": 0.6743, "step": 11676 }, { "epoch": 1.9984596953619715, "grad_norm": 54.914512634277344, "learning_rate": 1.1866299668031424e-05, "loss": 4.6585, "step": 11677 }, { "epoch": 1.9986308403217525, "grad_norm": 0.9881729483604431, "learning_rate": 1.1855033096928834e-05, "loss": 0.1345, "step": 11678 }, { "epoch": 1.9988019852815335, "grad_norm": 0.38421735167503357, "learning_rate": 1.1843768381361384e-05, "loss": 0.1014, "step": 11679 }, { "epoch": 1.9989731302413145, "grad_norm": 2.279719114303589, "learning_rate": 
1.1832505527975309e-05, "loss": 0.203, "step": 11680 }, { "epoch": 1.9991442752010953, "grad_norm": 13.591947555541992, "learning_rate": 1.1821244543415671e-05, "loss": 0.8735, "step": 11681 }, { "epoch": 1.9993154201608763, "grad_norm": 0.4051353633403778, "learning_rate": 1.1809985434326507e-05, "loss": 0.0972, "step": 11682 }, { "epoch": 1.999486565120657, "grad_norm": 132.51124572753906, "learning_rate": 1.179872820735067e-05, "loss": 5.1727, "step": 11683 }, { "epoch": 1.999657710080438, "grad_norm": 2.0249786376953125, "learning_rate": 1.1787472869129975e-05, "loss": 0.1688, "step": 11684 }, { "epoch": 1.999828855040219, "grad_norm": 2.4699254035949707, "learning_rate": 1.1776219426305055e-05, "loss": 0.2338, "step": 11685 }, { "epoch": 2.0, "grad_norm": 2.467048168182373, "learning_rate": 1.1764967885515494e-05, "loss": 0.157, "step": 11686 }, { "epoch": 2.000171144959781, "grad_norm": 9.27706241607666, "learning_rate": 1.1753718253399677e-05, "loss": 0.7216, "step": 11687 }, { "epoch": 2.000342289919562, "grad_norm": 11.445172309875488, "learning_rate": 1.1742470536594951e-05, "loss": 0.9084, "step": 11688 }, { "epoch": 2.000513434879343, "grad_norm": 15.222339630126953, "learning_rate": 1.1731224741737437e-05, "loss": 0.9669, "step": 11689 }, { "epoch": 2.000684579839124, "grad_norm": 6.603372573852539, "learning_rate": 1.1719980875462215e-05, "loss": 0.5134, "step": 11690 }, { "epoch": 2.0008557247989045, "grad_norm": 24.55394744873047, "learning_rate": 1.170873894440314e-05, "loss": 5.3807, "step": 11691 }, { "epoch": 2.0010268697586855, "grad_norm": 18.623994827270508, "learning_rate": 1.169749895519301e-05, "loss": 2.2217, "step": 11692 }, { "epoch": 2.0011980147184665, "grad_norm": 1.7960540056228638, "learning_rate": 1.1686260914463388e-05, "loss": 0.1972, "step": 11693 }, { "epoch": 2.0013691596782475, "grad_norm": 112.69210815429688, "learning_rate": 1.167502482884478e-05, "loss": 6.9911, "step": 11694 }, { "epoch": 2.0015403046380285, "grad_norm": 
8.671306610107422, "learning_rate": 1.1663790704966475e-05, "loss": 0.6478, "step": 11695 }, { "epoch": 2.0017114495978094, "grad_norm": 0.26874473690986633, "learning_rate": 1.1652558549456623e-05, "loss": 0.0926, "step": 11696 }, { "epoch": 2.0018825945575904, "grad_norm": 8.209640502929688, "learning_rate": 1.1641328368942215e-05, "loss": 0.7233, "step": 11697 }, { "epoch": 2.0020537395173714, "grad_norm": 1.6676769256591797, "learning_rate": 1.1630100170049073e-05, "loss": 0.176, "step": 11698 }, { "epoch": 2.002224884477152, "grad_norm": 9.504531860351562, "learning_rate": 1.1618873959401848e-05, "loss": 0.8362, "step": 11699 }, { "epoch": 2.002396029436933, "grad_norm": 12.602211952209473, "learning_rate": 1.1607649743624024e-05, "loss": 0.8255, "step": 11700 }, { "epoch": 2.002567174396714, "grad_norm": 5.88329553604126, "learning_rate": 1.15964275293379e-05, "loss": 0.5281, "step": 11701 }, { "epoch": 2.002738319356495, "grad_norm": 4.189892292022705, "learning_rate": 1.1585207323164607e-05, "loss": 0.3214, "step": 11702 }, { "epoch": 2.002909464316276, "grad_norm": 17.199134826660156, "learning_rate": 1.1573989131724079e-05, "loss": 1.7972, "step": 11703 }, { "epoch": 2.003080609276057, "grad_norm": 4.707541465759277, "learning_rate": 1.1562772961635064e-05, "loss": 0.4323, "step": 11704 }, { "epoch": 2.003251754235838, "grad_norm": 16.28215789794922, "learning_rate": 1.1551558819515127e-05, "loss": 1.2081, "step": 11705 }, { "epoch": 2.003422899195619, "grad_norm": 6.139052867889404, "learning_rate": 1.154034671198062e-05, "loss": 0.6451, "step": 11706 }, { "epoch": 2.0035940441553994, "grad_norm": 14.46859359741211, "learning_rate": 1.1529136645646705e-05, "loss": 1.2435, "step": 11707 }, { "epoch": 2.0037651891151804, "grad_norm": 20.49411392211914, "learning_rate": 1.1517928627127338e-05, "loss": 2.492, "step": 11708 }, { "epoch": 2.0039363340749614, "grad_norm": 3.111706256866455, "learning_rate": 1.1506722663035266e-05, "loss": 0.1906, "step": 11709 
}, { "epoch": 2.0041074790347424, "grad_norm": 9.387779235839844, "learning_rate": 1.1495518759982027e-05, "loss": 0.7572, "step": 11710 }, { "epoch": 2.0042786239945234, "grad_norm": 7.210862159729004, "learning_rate": 1.1484316924577941e-05, "loss": 0.7283, "step": 11711 }, { "epoch": 2.0044497689543044, "grad_norm": 8.932064056396484, "learning_rate": 1.1473117163432105e-05, "loss": 0.6798, "step": 11712 }, { "epoch": 2.0046209139140854, "grad_norm": 15.608705520629883, "learning_rate": 1.1461919483152397e-05, "loss": 1.6835, "step": 11713 }, { "epoch": 2.0047920588738664, "grad_norm": 32.524349212646484, "learning_rate": 1.1450723890345465e-05, "loss": 6.1015, "step": 11714 }, { "epoch": 2.004963203833647, "grad_norm": 10.95119571685791, "learning_rate": 1.1439530391616711e-05, "loss": 0.7055, "step": 11715 }, { "epoch": 2.005134348793428, "grad_norm": 4.271819591522217, "learning_rate": 1.142833899357033e-05, "loss": 0.4505, "step": 11716 }, { "epoch": 2.005305493753209, "grad_norm": 1.0302865505218506, "learning_rate": 1.1417149702809283e-05, "loss": 0.1676, "step": 11717 }, { "epoch": 2.00547663871299, "grad_norm": 9.645342826843262, "learning_rate": 1.1405962525935227e-05, "loss": 0.6654, "step": 11718 }, { "epoch": 2.005647783672771, "grad_norm": 1.9339349269866943, "learning_rate": 1.1394777469548654e-05, "loss": 0.193, "step": 11719 }, { "epoch": 2.005818928632552, "grad_norm": 8.588444709777832, "learning_rate": 1.1383594540248724e-05, "loss": 0.524, "step": 11720 }, { "epoch": 2.005990073592333, "grad_norm": 6.7710161209106445, "learning_rate": 1.1372413744633424e-05, "loss": 0.5131, "step": 11721 }, { "epoch": 2.006161218552114, "grad_norm": 18.046680450439453, "learning_rate": 1.1361235089299395e-05, "loss": 1.3145, "step": 11722 }, { "epoch": 2.0063323635118944, "grad_norm": 8.006476402282715, "learning_rate": 1.1350058580842105e-05, "loss": 0.7531, "step": 11723 }, { "epoch": 2.0065035084716754, "grad_norm": 4.6407060623168945, "learning_rate": 
1.1338884225855667e-05, "loss": 0.3001, "step": 11724 }, { "epoch": 2.0066746534314563, "grad_norm": 9.277440071105957, "learning_rate": 1.1327712030933009e-05, "loss": 0.9125, "step": 11725 }, { "epoch": 2.0068457983912373, "grad_norm": 8.176043510437012, "learning_rate": 1.1316542002665701e-05, "loss": 0.5891, "step": 11726 }, { "epoch": 2.0070169433510183, "grad_norm": 11.001191139221191, "learning_rate": 1.130537414764412e-05, "loss": 1.0758, "step": 11727 }, { "epoch": 2.0071880883107993, "grad_norm": 11.75613021850586, "learning_rate": 1.1294208472457276e-05, "loss": 0.8769, "step": 11728 }, { "epoch": 2.0073592332705803, "grad_norm": 19.39358139038086, "learning_rate": 1.1283044983692974e-05, "loss": 1.8808, "step": 11729 }, { "epoch": 2.0075303782303613, "grad_norm": 9.617931365966797, "learning_rate": 1.1271883687937645e-05, "loss": 0.9607, "step": 11730 }, { "epoch": 2.007701523190142, "grad_norm": 11.841121673583984, "learning_rate": 1.1260724591776512e-05, "loss": 0.9739, "step": 11731 }, { "epoch": 2.007872668149923, "grad_norm": 10.263586044311523, "learning_rate": 1.1249567701793422e-05, "loss": 0.7022, "step": 11732 }, { "epoch": 2.008043813109704, "grad_norm": 8.591386795043945, "learning_rate": 1.1238413024570992e-05, "loss": 0.6975, "step": 11733 }, { "epoch": 2.008214958069485, "grad_norm": 9.117953300476074, "learning_rate": 1.122726056669046e-05, "loss": 0.7195, "step": 11734 }, { "epoch": 2.008386103029266, "grad_norm": 10.898491859436035, "learning_rate": 1.1216110334731835e-05, "loss": 0.717, "step": 11735 }, { "epoch": 2.008557247989047, "grad_norm": 8.783679962158203, "learning_rate": 1.1204962335273718e-05, "loss": 0.5387, "step": 11736 }, { "epoch": 2.0087283929488278, "grad_norm": 11.453118324279785, "learning_rate": 1.1193816574893492e-05, "loss": 0.7051, "step": 11737 }, { "epoch": 2.0088995379086088, "grad_norm": 8.79861831665039, "learning_rate": 1.1182673060167164e-05, "loss": 0.6922, "step": 11738 }, { "epoch": 
2.0090706828683897, "grad_norm": 27.42424201965332, "learning_rate": 1.117153179766941e-05, "loss": 5.0848, "step": 11739 }, { "epoch": 2.0092418278281703, "grad_norm": 11.820676803588867, "learning_rate": 1.1160392793973601e-05, "loss": 0.8968, "step": 11740 }, { "epoch": 2.0094129727879513, "grad_norm": 4.422729969024658, "learning_rate": 1.114925605565176e-05, "loss": 0.4199, "step": 11741 }, { "epoch": 2.0095841177477323, "grad_norm": 5.189631938934326, "learning_rate": 1.113812158927458e-05, "loss": 0.4849, "step": 11742 }, { "epoch": 2.0097552627075133, "grad_norm": 4.323444366455078, "learning_rate": 1.1126989401411418e-05, "loss": 0.3817, "step": 11743 }, { "epoch": 2.0099264076672942, "grad_norm": 12.876853942871094, "learning_rate": 1.1115859498630277e-05, "loss": 0.98, "step": 11744 }, { "epoch": 2.0100975526270752, "grad_norm": 21.06958770751953, "learning_rate": 1.1104731887497817e-05, "loss": 0.8013, "step": 11745 }, { "epoch": 2.0102686975868562, "grad_norm": 2.954761028289795, "learning_rate": 1.1093606574579346e-05, "loss": 0.2326, "step": 11746 }, { "epoch": 2.010439842546637, "grad_norm": 29.024124145507812, "learning_rate": 1.1082483566438814e-05, "loss": 5.7958, "step": 11747 }, { "epoch": 2.0106109875064178, "grad_norm": 10.775449752807617, "learning_rate": 1.107136286963881e-05, "loss": 0.7734, "step": 11748 }, { "epoch": 2.0107821324661987, "grad_norm": 152.95953369140625, "learning_rate": 1.1060244490740567e-05, "loss": 7.6618, "step": 11749 }, { "epoch": 2.0109532774259797, "grad_norm": 7.560899257659912, "learning_rate": 1.1049128436303943e-05, "loss": 0.5418, "step": 11750 }, { "epoch": 2.0111244223857607, "grad_norm": 13.286551475524902, "learning_rate": 1.1038014712887425e-05, "loss": 0.8579, "step": 11751 }, { "epoch": 2.0112955673455417, "grad_norm": 11.943893432617188, "learning_rate": 1.1026903327048128e-05, "loss": 0.9758, "step": 11752 }, { "epoch": 2.0114667123053227, "grad_norm": 11.58649730682373, "learning_rate": 
1.1015794285341789e-05, "loss": 0.777, "step": 11753 }, { "epoch": 2.0116378572651037, "grad_norm": 0.9989314079284668, "learning_rate": 1.1004687594322752e-05, "loss": 0.1861, "step": 11754 }, { "epoch": 2.0118090022248847, "grad_norm": 5.666943073272705, "learning_rate": 1.0993583260543985e-05, "loss": 0.4236, "step": 11755 }, { "epoch": 2.0119801471846652, "grad_norm": 8.372326850891113, "learning_rate": 1.0982481290557063e-05, "loss": 0.6001, "step": 11756 }, { "epoch": 2.012151292144446, "grad_norm": 5.611756324768066, "learning_rate": 1.0971381690912145e-05, "loss": 0.3321, "step": 11757 }, { "epoch": 2.012322437104227, "grad_norm": 1.239336609840393, "learning_rate": 1.0960284468158055e-05, "loss": 0.1749, "step": 11758 }, { "epoch": 2.012493582064008, "grad_norm": 31.632980346679688, "learning_rate": 1.0949189628842129e-05, "loss": 5.3928, "step": 11759 }, { "epoch": 2.012664727023789, "grad_norm": 6.581335544586182, "learning_rate": 1.0938097179510376e-05, "loss": 0.6295, "step": 11760 }, { "epoch": 2.01283587198357, "grad_norm": 5.980733871459961, "learning_rate": 1.0927007126707316e-05, "loss": 0.5001, "step": 11761 }, { "epoch": 2.013007016943351, "grad_norm": 5.669210910797119, "learning_rate": 1.0915919476976142e-05, "loss": 0.3783, "step": 11762 }, { "epoch": 2.013178161903132, "grad_norm": 13.811125755310059, "learning_rate": 1.0904834236858536e-05, "loss": 1.0066, "step": 11763 }, { "epoch": 2.0133493068629127, "grad_norm": 12.24834156036377, "learning_rate": 1.0893751412894848e-05, "loss": 0.796, "step": 11764 }, { "epoch": 2.0135204518226937, "grad_norm": 16.85426139831543, "learning_rate": 1.0882671011623924e-05, "loss": 1.1641, "step": 11765 }, { "epoch": 2.0136915967824747, "grad_norm": 13.410322189331055, "learning_rate": 1.0871593039583256e-05, "loss": 0.9992, "step": 11766 }, { "epoch": 2.0138627417422557, "grad_norm": 6.641827583312988, "learning_rate": 1.0860517503308822e-05, "loss": 0.5091, "step": 11767 }, { "epoch": 2.0140338867020366, 
"grad_norm": 18.126930236816406, "learning_rate": 1.084944440933525e-05, "loss": 1.3358, "step": 11768 }, { "epoch": 2.0142050316618176, "grad_norm": 26.080705642700195, "learning_rate": 1.0838373764195636e-05, "loss": 3.284, "step": 11769 }, { "epoch": 2.0143761766215986, "grad_norm": 3.846012830734253, "learning_rate": 1.0827305574421723e-05, "loss": 0.341, "step": 11770 }, { "epoch": 2.0145473215813796, "grad_norm": 4.846517562866211, "learning_rate": 1.0816239846543714e-05, "loss": 0.3084, "step": 11771 }, { "epoch": 2.01471846654116, "grad_norm": 8.114249229431152, "learning_rate": 1.0805176587090446e-05, "loss": 0.6068, "step": 11772 }, { "epoch": 2.014889611500941, "grad_norm": 11.607542037963867, "learning_rate": 1.079411580258922e-05, "loss": 0.9212, "step": 11773 }, { "epoch": 2.015060756460722, "grad_norm": 21.473356246948242, "learning_rate": 1.0783057499565955e-05, "loss": 2.2081, "step": 11774 }, { "epoch": 2.015231901420503, "grad_norm": 5.1696648597717285, "learning_rate": 1.0772001684545027e-05, "loss": 0.4727, "step": 11775 }, { "epoch": 2.015403046380284, "grad_norm": 9.633221626281738, "learning_rate": 1.0760948364049423e-05, "loss": 0.6905, "step": 11776 }, { "epoch": 2.015574191340065, "grad_norm": 11.767236709594727, "learning_rate": 1.0749897544600576e-05, "loss": 0.9195, "step": 11777 }, { "epoch": 2.015745336299846, "grad_norm": 10.934749603271484, "learning_rate": 1.073884923271853e-05, "loss": 0.9196, "step": 11778 }, { "epoch": 2.015916481259627, "grad_norm": 3.0369269847869873, "learning_rate": 1.0727803434921754e-05, "loss": 0.2636, "step": 11779 }, { "epoch": 2.0160876262194076, "grad_norm": 22.558263778686523, "learning_rate": 1.0716760157727333e-05, "loss": 4.6886, "step": 11780 }, { "epoch": 2.0162587711791886, "grad_norm": 14.996073722839355, "learning_rate": 1.0705719407650801e-05, "loss": 1.1287, "step": 11781 }, { "epoch": 2.0164299161389696, "grad_norm": 10.492242813110352, "learning_rate": 1.0694681191206211e-05, "loss": 
0.7579, "step": 11782 }, { "epoch": 2.0166010610987506, "grad_norm": 6.188160419464111, "learning_rate": 1.0683645514906135e-05, "loss": 0.3482, "step": 11783 }, { "epoch": 2.0167722060585316, "grad_norm": 1.4606053829193115, "learning_rate": 1.0672612385261631e-05, "loss": 0.197, "step": 11784 }, { "epoch": 2.0169433510183126, "grad_norm": 1.7969367504119873, "learning_rate": 1.0661581808782264e-05, "loss": 0.2241, "step": 11785 }, { "epoch": 2.0171144959780936, "grad_norm": 8.081125259399414, "learning_rate": 1.0650553791976096e-05, "loss": 0.5809, "step": 11786 }, { "epoch": 2.0172856409378745, "grad_norm": 14.835763931274414, "learning_rate": 1.0639528341349668e-05, "loss": 1.0678, "step": 11787 }, { "epoch": 2.0174567858976555, "grad_norm": 1.2715688943862915, "learning_rate": 1.062850546340801e-05, "loss": 0.1704, "step": 11788 }, { "epoch": 2.017627930857436, "grad_norm": 9.18881607055664, "learning_rate": 1.0617485164654645e-05, "loss": 0.7513, "step": 11789 }, { "epoch": 2.017799075817217, "grad_norm": 7.736057281494141, "learning_rate": 1.0606467451591556e-05, "loss": 0.5137, "step": 11790 }, { "epoch": 2.017970220776998, "grad_norm": 9.122797012329102, "learning_rate": 1.0595452330719214e-05, "loss": 0.6815, "step": 11791 }, { "epoch": 2.018141365736779, "grad_norm": 16.6207218170166, "learning_rate": 1.058443980853656e-05, "loss": 1.6444, "step": 11792 }, { "epoch": 2.01831251069656, "grad_norm": 15.66812801361084, "learning_rate": 1.0573429891540995e-05, "loss": 1.1268, "step": 11793 }, { "epoch": 2.018483655656341, "grad_norm": 22.041797637939453, "learning_rate": 1.056242258622839e-05, "loss": 2.2751, "step": 11794 }, { "epoch": 2.018654800616122, "grad_norm": 3.802612781524658, "learning_rate": 1.0551417899093064e-05, "loss": 0.3865, "step": 11795 }, { "epoch": 2.018825945575903, "grad_norm": 8.309850692749023, "learning_rate": 1.0540415836627807e-05, "loss": 0.7762, "step": 11796 }, { "epoch": 2.0189970905356835, "grad_norm": 18.20221710205078, 
"learning_rate": 1.0529416405323846e-05, "loss": 1.1466, "step": 11797 }, { "epoch": 2.0191682354954645, "grad_norm": 7.259613990783691, "learning_rate": 1.0518419611670866e-05, "loss": 0.606, "step": 11798 }, { "epoch": 2.0193393804552455, "grad_norm": 12.507852554321289, "learning_rate": 1.0507425462156989e-05, "loss": 0.7949, "step": 11799 }, { "epoch": 2.0195105254150265, "grad_norm": 10.645299911499023, "learning_rate": 1.0496433963268763e-05, "loss": 0.8756, "step": 11800 }, { "epoch": 2.0196816703748075, "grad_norm": 7.09652853012085, "learning_rate": 1.0485445121491234e-05, "loss": 0.6707, "step": 11801 }, { "epoch": 2.0198528153345885, "grad_norm": 10.860014915466309, "learning_rate": 1.0474458943307789e-05, "loss": 0.7853, "step": 11802 }, { "epoch": 2.0200239602943695, "grad_norm": 6.626948356628418, "learning_rate": 1.0463475435200332e-05, "loss": 0.8323, "step": 11803 }, { "epoch": 2.0201951052541505, "grad_norm": 9.496315002441406, "learning_rate": 1.0452494603649101e-05, "loss": 0.6328, "step": 11804 }, { "epoch": 2.020366250213931, "grad_norm": 10.84108829498291, "learning_rate": 1.0441516455132846e-05, "loss": 0.7389, "step": 11805 }, { "epoch": 2.020537395173712, "grad_norm": 17.434284210205078, "learning_rate": 1.0430540996128653e-05, "loss": 1.0676, "step": 11806 }, { "epoch": 2.020708540133493, "grad_norm": 16.66282844543457, "learning_rate": 1.0419568233112098e-05, "loss": 1.1832, "step": 11807 }, { "epoch": 2.020879685093274, "grad_norm": 4.737600803375244, "learning_rate": 1.040859817255709e-05, "loss": 0.4277, "step": 11808 }, { "epoch": 2.021050830053055, "grad_norm": 17.645137786865234, "learning_rate": 1.0397630820936017e-05, "loss": 1.5208, "step": 11809 }, { "epoch": 2.021221975012836, "grad_norm": 8.478151321411133, "learning_rate": 1.0386666184719594e-05, "loss": 0.7116, "step": 11810 }, { "epoch": 2.021393119972617, "grad_norm": 11.887932777404785, "learning_rate": 1.0375704270377017e-05, "loss": 0.7217, "step": 11811 }, { "epoch": 
2.021564264932398, "grad_norm": 16.484882354736328, "learning_rate": 1.0364745084375787e-05, "loss": 1.5667, "step": 11812 }, { "epoch": 2.0217354098921785, "grad_norm": 1.2436736822128296, "learning_rate": 1.035378863318188e-05, "loss": 0.1762, "step": 11813 }, { "epoch": 2.0219065548519595, "grad_norm": 5.28762149810791, "learning_rate": 1.034283492325958e-05, "loss": 0.5177, "step": 11814 }, { "epoch": 2.0220776998117405, "grad_norm": 2.081716537475586, "learning_rate": 1.0331883961071631e-05, "loss": 0.2192, "step": 11815 }, { "epoch": 2.0222488447715214, "grad_norm": 11.639471054077148, "learning_rate": 1.0320935753079077e-05, "loss": 0.6343, "step": 11816 }, { "epoch": 2.0224199897313024, "grad_norm": 5.428324222564697, "learning_rate": 1.0309990305741423e-05, "loss": 0.529, "step": 11817 }, { "epoch": 2.0225911346910834, "grad_norm": 0.3826965093612671, "learning_rate": 1.0299047625516452e-05, "loss": 0.1005, "step": 11818 }, { "epoch": 2.0227622796508644, "grad_norm": 0.35101577639579773, "learning_rate": 1.0288107718860401e-05, "loss": 0.0999, "step": 11819 }, { "epoch": 2.0229334246106454, "grad_norm": 15.225916862487793, "learning_rate": 1.0277170592227796e-05, "loss": 1.7696, "step": 11820 }, { "epoch": 2.023104569570426, "grad_norm": 9.394540786743164, "learning_rate": 1.0266236252071594e-05, "loss": 0.6677, "step": 11821 }, { "epoch": 2.023275714530207, "grad_norm": 6.120732307434082, "learning_rate": 1.0255304704843026e-05, "loss": 0.4583, "step": 11822 }, { "epoch": 2.023446859489988, "grad_norm": 11.734889030456543, "learning_rate": 1.0244375956991769e-05, "loss": 0.9542, "step": 11823 }, { "epoch": 2.023618004449769, "grad_norm": 11.931617736816406, "learning_rate": 1.023345001496578e-05, "loss": 0.8963, "step": 11824 }, { "epoch": 2.02378914940955, "grad_norm": 20.416879653930664, "learning_rate": 1.0222526885211384e-05, "loss": 2.3912, "step": 11825 }, { "epoch": 2.023960294369331, "grad_norm": 21.58074188232422, "learning_rate": 
1.0211606574173239e-05, "loss": 2.7476, "step": 11826 }, { "epoch": 2.024131439329112, "grad_norm": 8.834613800048828, "learning_rate": 1.0200689088294352e-05, "loss": 0.6329, "step": 11827 }, { "epoch": 2.024302584288893, "grad_norm": 13.899346351623535, "learning_rate": 1.0189774434016048e-05, "loss": 1.4122, "step": 11828 }, { "epoch": 2.0244737292486734, "grad_norm": 13.606340408325195, "learning_rate": 1.017886261777799e-05, "loss": 0.8091, "step": 11829 }, { "epoch": 2.0246448742084544, "grad_norm": 7.768982410430908, "learning_rate": 1.0167953646018171e-05, "loss": 0.719, "step": 11830 }, { "epoch": 2.0248160191682354, "grad_norm": 11.211511611938477, "learning_rate": 1.0157047525172897e-05, "loss": 0.9234, "step": 11831 }, { "epoch": 2.0249871641280164, "grad_norm": 14.533052444458008, "learning_rate": 1.0146144261676798e-05, "loss": 1.0307, "step": 11832 }, { "epoch": 2.0251583090877974, "grad_norm": 7.1327433586120605, "learning_rate": 1.0135243861962813e-05, "loss": 0.7015, "step": 11833 }, { "epoch": 2.0253294540475784, "grad_norm": 21.100841522216797, "learning_rate": 1.0124346332462198e-05, "loss": 2.3477, "step": 11834 }, { "epoch": 2.0255005990073593, "grad_norm": 10.027507781982422, "learning_rate": 1.0113451679604507e-05, "loss": 0.7434, "step": 11835 }, { "epoch": 2.0256717439671403, "grad_norm": 10.144492149353027, "learning_rate": 1.0102559909817604e-05, "loss": 0.7746, "step": 11836 }, { "epoch": 2.0258428889269213, "grad_norm": 33.77898025512695, "learning_rate": 1.0091671029527644e-05, "loss": 5.2434, "step": 11837 }, { "epoch": 2.026014033886702, "grad_norm": 9.839831352233887, "learning_rate": 1.0080785045159091e-05, "loss": 0.6055, "step": 11838 }, { "epoch": 2.026185178846483, "grad_norm": 5.409238338470459, "learning_rate": 1.0069901963134689e-05, "loss": 0.5535, "step": 11839 }, { "epoch": 2.026356323806264, "grad_norm": 6.498706817626953, "learning_rate": 1.0059021789875473e-05, "loss": 0.4125, "step": 11840 }, { "epoch": 
2.026527468766045, "grad_norm": 13.350198745727539, "learning_rate": 1.0048144531800757e-05, "loss": 0.9123, "step": 11841 }, { "epoch": 2.026698613725826, "grad_norm": 19.89065933227539, "learning_rate": 1.0037270195328148e-05, "loss": 2.3747, "step": 11842 }, { "epoch": 2.026869758685607, "grad_norm": 1.1392699480056763, "learning_rate": 1.0026398786873488e-05, "loss": 0.1734, "step": 11843 }, { "epoch": 2.027040903645388, "grad_norm": 1.1173834800720215, "learning_rate": 1.0015530312850989e-05, "loss": 0.1626, "step": 11844 }, { "epoch": 2.027212048605169, "grad_norm": 19.53468132019043, "learning_rate": 1.0004664779673007e-05, "loss": 2.329, "step": 11845 }, { "epoch": 2.0273831935649493, "grad_norm": 21.438766479492188, "learning_rate": 9.993802193750263e-06, "loss": 2.4116, "step": 11846 }, { "epoch": 2.0275543385247303, "grad_norm": 13.992535591125488, "learning_rate": 9.982942561491663e-06, "loss": 1.0284, "step": 11847 }, { "epoch": 2.0277254834845113, "grad_norm": 8.403829574584961, "learning_rate": 9.972085889304445e-06, "loss": 0.7673, "step": 11848 }, { "epoch": 2.0278966284442923, "grad_norm": 16.157991409301758, "learning_rate": 9.96123218359402e-06, "loss": 1.1482, "step": 11849 }, { "epoch": 2.0280677734040733, "grad_norm": 6.766063690185547, "learning_rate": 9.950381450764136e-06, "loss": 0.5653, "step": 11850 }, { "epoch": 2.0282389183638543, "grad_norm": 8.006422996520996, "learning_rate": 9.939533697216692e-06, "loss": 0.6255, "step": 11851 }, { "epoch": 2.0284100633236353, "grad_norm": 2.805673837661743, "learning_rate": 9.928688929351926e-06, "loss": 0.4057, "step": 11852 }, { "epoch": 2.0285812082834163, "grad_norm": 1.0670318603515625, "learning_rate": 9.917847153568224e-06, "loss": 0.1629, "step": 11853 }, { "epoch": 2.028752353243197, "grad_norm": 13.251568794250488, "learning_rate": 9.907008376262291e-06, "loss": 1.494, "step": 11854 }, { "epoch": 2.028923498202978, "grad_norm": 0.9644579887390137, "learning_rate": 9.896172603828982e-06, 
"loss": 0.1803, "step": 11855 }, { "epoch": 2.029094643162759, "grad_norm": 1.5300921201705933, "learning_rate": 9.88533984266146e-06, "loss": 0.1473, "step": 11856 }, { "epoch": 2.0292657881225398, "grad_norm": 18.34384536743164, "learning_rate": 9.874510099151028e-06, "loss": 1.4486, "step": 11857 }, { "epoch": 2.0294369330823208, "grad_norm": 20.48468589782715, "learning_rate": 9.863683379687294e-06, "loss": 4.5778, "step": 11858 }, { "epoch": 2.0296080780421017, "grad_norm": 15.95300579071045, "learning_rate": 9.852859690657995e-06, "loss": 1.6137, "step": 11859 }, { "epoch": 2.0297792230018827, "grad_norm": 17.988435745239258, "learning_rate": 9.842039038449162e-06, "loss": 1.8747, "step": 11860 }, { "epoch": 2.0299503679616637, "grad_norm": 9.741762161254883, "learning_rate": 9.831221429444963e-06, "loss": 0.6963, "step": 11861 }, { "epoch": 2.0301215129214443, "grad_norm": 5.502725601196289, "learning_rate": 9.820406870027835e-06, "loss": 0.5162, "step": 11862 }, { "epoch": 2.0302926578812253, "grad_norm": 1.0133944749832153, "learning_rate": 9.809595366578351e-06, "loss": 0.1608, "step": 11863 }, { "epoch": 2.0304638028410062, "grad_norm": 0.33509692549705505, "learning_rate": 9.798786925475352e-06, "loss": 0.0968, "step": 11864 }, { "epoch": 2.0306349478007872, "grad_norm": 9.39943790435791, "learning_rate": 9.787981553095784e-06, "loss": 0.7695, "step": 11865 }, { "epoch": 2.0308060927605682, "grad_norm": 11.290970802307129, "learning_rate": 9.777179255814881e-06, "loss": 0.9132, "step": 11866 }, { "epoch": 2.030977237720349, "grad_norm": 1.7600926160812378, "learning_rate": 9.766380040006002e-06, "loss": 0.2031, "step": 11867 }, { "epoch": 2.03114838268013, "grad_norm": 15.89739990234375, "learning_rate": 9.755583912040688e-06, "loss": 2.0879, "step": 11868 }, { "epoch": 2.031319527639911, "grad_norm": 12.4143705368042, "learning_rate": 9.74479087828868e-06, "loss": 1.244, "step": 11869 }, { "epoch": 2.0314906725996917, "grad_norm": 16.46491813659668, 
"learning_rate": 9.73400094511788e-06, "loss": 1.3612, "step": 11870 }, { "epoch": 2.0316618175594727, "grad_norm": 6.553382873535156, "learning_rate": 9.723214118894366e-06, "loss": 0.5333, "step": 11871 }, { "epoch": 2.0318329625192537, "grad_norm": 0.39805933833122253, "learning_rate": 9.712430405982382e-06, "loss": 0.1017, "step": 11872 }, { "epoch": 2.0320041074790347, "grad_norm": 15.95334243774414, "learning_rate": 9.701649812744335e-06, "loss": 1.2972, "step": 11873 }, { "epoch": 2.0321752524388157, "grad_norm": 7.786844253540039, "learning_rate": 9.69087234554079e-06, "loss": 0.5605, "step": 11874 }, { "epoch": 2.0323463973985967, "grad_norm": 4.406012535095215, "learning_rate": 9.680098010730468e-06, "loss": 0.2521, "step": 11875 }, { "epoch": 2.0325175423583777, "grad_norm": 15.982555389404297, "learning_rate": 9.669326814670244e-06, "loss": 1.6077, "step": 11876 }, { "epoch": 2.0326886873181587, "grad_norm": 13.098200798034668, "learning_rate": 9.658558763715139e-06, "loss": 0.8198, "step": 11877 }, { "epoch": 2.032859832277939, "grad_norm": 8.308065414428711, "learning_rate": 9.647793864218318e-06, "loss": 0.6817, "step": 11878 }, { "epoch": 2.03303097723772, "grad_norm": 4.6567702293396, "learning_rate": 9.63703212253109e-06, "loss": 0.402, "step": 11879 }, { "epoch": 2.033202122197501, "grad_norm": 12.576587677001953, "learning_rate": 9.626273545002897e-06, "loss": 1.0562, "step": 11880 }, { "epoch": 2.033373267157282, "grad_norm": 2.4134254455566406, "learning_rate": 9.615518137981317e-06, "loss": 0.2094, "step": 11881 }, { "epoch": 2.033544412117063, "grad_norm": 2.3268203735351562, "learning_rate": 9.604765907812058e-06, "loss": 0.2003, "step": 11882 }, { "epoch": 2.033715557076844, "grad_norm": 4.821282386779785, "learning_rate": 9.594016860838946e-06, "loss": 0.4753, "step": 11883 }, { "epoch": 2.033886702036625, "grad_norm": 14.654294967651367, "learning_rate": 9.583271003403938e-06, "loss": 1.0813, "step": 11884 }, { "epoch": 
2.034057846996406, "grad_norm": 14.425127029418945, "learning_rate": 9.572528341847107e-06, "loss": 1.0595, "step": 11885 }, { "epoch": 2.0342289919561867, "grad_norm": 7.347255229949951, "learning_rate": 9.561788882506619e-06, "loss": 0.5042, "step": 11886 }, { "epoch": 2.0344001369159677, "grad_norm": 15.083477973937988, "learning_rate": 9.55105263171882e-06, "loss": 1.0545, "step": 11887 }, { "epoch": 2.0345712818757486, "grad_norm": 14.551390647888184, "learning_rate": 9.540319595818064e-06, "loss": 0.9603, "step": 11888 }, { "epoch": 2.0347424268355296, "grad_norm": 10.663408279418945, "learning_rate": 9.529589781136899e-06, "loss": 0.8898, "step": 11889 }, { "epoch": 2.0349135717953106, "grad_norm": 9.25648021697998, "learning_rate": 9.518863194005888e-06, "loss": 0.6702, "step": 11890 }, { "epoch": 2.0350847167550916, "grad_norm": 24.819244384765625, "learning_rate": 9.508139840753782e-06, "loss": 5.083, "step": 11891 }, { "epoch": 2.0352558617148726, "grad_norm": 8.87114143371582, "learning_rate": 9.497419727707327e-06, "loss": 0.5988, "step": 11892 }, { "epoch": 2.0354270066746536, "grad_norm": 13.749015808105469, "learning_rate": 9.486702861191446e-06, "loss": 1.4283, "step": 11893 }, { "epoch": 2.0355981516344346, "grad_norm": 10.862671852111816, "learning_rate": 9.475989247529072e-06, "loss": 0.5416, "step": 11894 }, { "epoch": 2.035769296594215, "grad_norm": 2.410940647125244, "learning_rate": 9.465278893041289e-06, "loss": 0.2622, "step": 11895 }, { "epoch": 2.035940441553996, "grad_norm": 25.568391799926758, "learning_rate": 9.454571804047182e-06, "loss": 5.0286, "step": 11896 }, { "epoch": 2.036111586513777, "grad_norm": 6.245731353759766, "learning_rate": 9.44386798686399e-06, "loss": 0.6794, "step": 11897 }, { "epoch": 2.036282731473558, "grad_norm": 10.772602081298828, "learning_rate": 9.433167447806942e-06, "loss": 0.6648, "step": 11898 }, { "epoch": 2.036453876433339, "grad_norm": 1.3819735050201416, "learning_rate": 9.422470193189416e-06, 
"loss": 0.1876, "step": 11899 }, { "epoch": 2.03662502139312, "grad_norm": 10.68278694152832, "learning_rate": 9.411776229322759e-06, "loss": 0.7467, "step": 11900 }, { "epoch": 2.036796166352901, "grad_norm": 3.9970545768737793, "learning_rate": 9.401085562516469e-06, "loss": 0.3338, "step": 11901 }, { "epoch": 2.036967311312682, "grad_norm": 14.894466400146484, "learning_rate": 9.390398199078018e-06, "loss": 1.168, "step": 11902 }, { "epoch": 2.0371384562724626, "grad_norm": 13.540384292602539, "learning_rate": 9.379714145313003e-06, "loss": 1.0395, "step": 11903 }, { "epoch": 2.0373096012322436, "grad_norm": 6.194725513458252, "learning_rate": 9.369033407524996e-06, "loss": 0.5251, "step": 11904 }, { "epoch": 2.0374807461920246, "grad_norm": 8.772257804870605, "learning_rate": 9.358355992015686e-06, "loss": 0.6919, "step": 11905 }, { "epoch": 2.0376518911518056, "grad_norm": 3.056460380554199, "learning_rate": 9.34768190508472e-06, "loss": 0.2237, "step": 11906 }, { "epoch": 2.0378230361115865, "grad_norm": 3.9268977642059326, "learning_rate": 9.337011153029874e-06, "loss": 0.2976, "step": 11907 }, { "epoch": 2.0379941810713675, "grad_norm": 12.569440841674805, "learning_rate": 9.326343742146851e-06, "loss": 0.8627, "step": 11908 }, { "epoch": 2.0381653260311485, "grad_norm": 56.91066360473633, "learning_rate": 9.315679678729488e-06, "loss": 6.3147, "step": 11909 }, { "epoch": 2.0383364709909295, "grad_norm": 8.111218452453613, "learning_rate": 9.305018969069582e-06, "loss": 0.7815, "step": 11910 }, { "epoch": 2.03850761595071, "grad_norm": 1.022740364074707, "learning_rate": 9.294361619456968e-06, "loss": 0.1623, "step": 11911 }, { "epoch": 2.038678760910491, "grad_norm": 1.9786064624786377, "learning_rate": 9.283707636179497e-06, "loss": 0.1961, "step": 11912 }, { "epoch": 2.038849905870272, "grad_norm": 6.420360088348389, "learning_rate": 9.273057025523034e-06, "loss": 0.4892, "step": 11913 }, { "epoch": 2.039021050830053, "grad_norm": 6.501669406890869, 
"learning_rate": 9.262409793771455e-06, "loss": 0.5237, "step": 11914 }, { "epoch": 2.039192195789834, "grad_norm": 14.225391387939453, "learning_rate": 9.251765947206648e-06, "loss": 1.1213, "step": 11915 }, { "epoch": 2.039363340749615, "grad_norm": 7.966073513031006, "learning_rate": 9.24112549210849e-06, "loss": 0.6116, "step": 11916 }, { "epoch": 2.039534485709396, "grad_norm": 51.80902099609375, "learning_rate": 9.230488434754869e-06, "loss": 6.7733, "step": 11917 }, { "epoch": 2.039705630669177, "grad_norm": 2.137594223022461, "learning_rate": 9.219854781421665e-06, "loss": 0.1973, "step": 11918 }, { "epoch": 2.0398767756289575, "grad_norm": 7.4564208984375, "learning_rate": 9.209224538382751e-06, "loss": 0.6855, "step": 11919 }, { "epoch": 2.0400479205887385, "grad_norm": 7.235912799835205, "learning_rate": 9.198597711909983e-06, "loss": 0.5597, "step": 11920 }, { "epoch": 2.0402190655485195, "grad_norm": 3.020845890045166, "learning_rate": 9.187974308273206e-06, "loss": 0.2184, "step": 11921 }, { "epoch": 2.0403902105083005, "grad_norm": 10.317389488220215, "learning_rate": 9.177354333740248e-06, "loss": 0.758, "step": 11922 }, { "epoch": 2.0405613554680815, "grad_norm": 19.480255126953125, "learning_rate": 9.166737794576901e-06, "loss": 2.0139, "step": 11923 }, { "epoch": 2.0407325004278625, "grad_norm": 4.532049655914307, "learning_rate": 9.156124697046946e-06, "loss": 0.3113, "step": 11924 }, { "epoch": 2.0409036453876435, "grad_norm": 56.90672302246094, "learning_rate": 9.145515047412122e-06, "loss": 6.0342, "step": 11925 }, { "epoch": 2.0410747903474245, "grad_norm": 9.121246337890625, "learning_rate": 9.13490885193214e-06, "loss": 0.5515, "step": 11926 }, { "epoch": 2.041245935307205, "grad_norm": 10.970090866088867, "learning_rate": 9.124306116864671e-06, "loss": 0.7421, "step": 11927 }, { "epoch": 2.041417080266986, "grad_norm": 7.338834285736084, "learning_rate": 9.113706848465345e-06, "loss": 0.4871, "step": 11928 }, { "epoch": 2.041588225226767, 
"grad_norm": 11.335954666137695, "learning_rate": 9.10311105298773e-06, "loss": 0.7165, "step": 11929 }, { "epoch": 2.041759370186548, "grad_norm": 17.9515323638916, "learning_rate": 9.0925187366834e-06, "loss": 1.2206, "step": 11930 }, { "epoch": 2.041930515146329, "grad_norm": 12.171897888183594, "learning_rate": 9.081929905801799e-06, "loss": 1.0112, "step": 11931 }, { "epoch": 2.04210166010611, "grad_norm": 9.260212898254395, "learning_rate": 9.071344566590387e-06, "loss": 0.8493, "step": 11932 }, { "epoch": 2.042272805065891, "grad_norm": 15.020618438720703, "learning_rate": 9.060762725294493e-06, "loss": 1.1217, "step": 11933 }, { "epoch": 2.042443950025672, "grad_norm": 0.4814656674861908, "learning_rate": 9.050184388157454e-06, "loss": 0.1155, "step": 11934 }, { "epoch": 2.0426150949854525, "grad_norm": 16.86335563659668, "learning_rate": 9.03960956142047e-06, "loss": 1.4919, "step": 11935 }, { "epoch": 2.0427862399452335, "grad_norm": 6.340185642242432, "learning_rate": 9.029038251322741e-06, "loss": 0.8307, "step": 11936 }, { "epoch": 2.0429573849050144, "grad_norm": 12.193196296691895, "learning_rate": 9.01847046410132e-06, "loss": 0.9213, "step": 11937 }, { "epoch": 2.0431285298647954, "grad_norm": 7.088001251220703, "learning_rate": 9.007906205991252e-06, "loss": 0.5649, "step": 11938 }, { "epoch": 2.0432996748245764, "grad_norm": 0.3370470106601715, "learning_rate": 8.99734548322543e-06, "loss": 0.1029, "step": 11939 }, { "epoch": 2.0434708197843574, "grad_norm": 1.8187381029129028, "learning_rate": 8.98678830203473e-06, "loss": 0.1762, "step": 11940 }, { "epoch": 2.0436419647441384, "grad_norm": 16.899436950683594, "learning_rate": 8.976234668647871e-06, "loss": 2.1062, "step": 11941 }, { "epoch": 2.0438131097039194, "grad_norm": 26.64852523803711, "learning_rate": 8.965684589291547e-06, "loss": 4.7089, "step": 11942 }, { "epoch": 2.0439842546637, "grad_norm": 1.91254460811615, "learning_rate": 8.955138070190284e-06, "loss": 0.1695, "step": 11943 }, 
{ "epoch": 2.044155399623481, "grad_norm": 12.300413131713867, "learning_rate": 8.944595117566584e-06, "loss": 0.7917, "step": 11944 }, { "epoch": 2.044326544583262, "grad_norm": 16.593351364135742, "learning_rate": 8.934055737640765e-06, "loss": 1.867, "step": 11945 }, { "epoch": 2.044497689543043, "grad_norm": 12.263434410095215, "learning_rate": 8.923519936631115e-06, "loss": 0.7278, "step": 11946 }, { "epoch": 2.044668834502824, "grad_norm": 11.947538375854492, "learning_rate": 8.912987720753735e-06, "loss": 1.1132, "step": 11947 }, { "epoch": 2.044839979462605, "grad_norm": 21.168092727661133, "learning_rate": 8.90245909622268e-06, "loss": 4.6404, "step": 11948 }, { "epoch": 2.045011124422386, "grad_norm": 21.596288681030273, "learning_rate": 8.891934069249827e-06, "loss": 2.279, "step": 11949 }, { "epoch": 2.045182269382167, "grad_norm": 6.329024791717529, "learning_rate": 8.881412646044986e-06, "loss": 0.5062, "step": 11950 }, { "epoch": 2.045353414341948, "grad_norm": 17.447141647338867, "learning_rate": 8.87089483281577e-06, "loss": 1.3201, "step": 11951 }, { "epoch": 2.0455245593017284, "grad_norm": 16.544572830200195, "learning_rate": 8.860380635767752e-06, "loss": 1.8656, "step": 11952 }, { "epoch": 2.0456957042615094, "grad_norm": 14.725135803222656, "learning_rate": 8.849870061104302e-06, "loss": 1.0449, "step": 11953 }, { "epoch": 2.0458668492212904, "grad_norm": 18.921649932861328, "learning_rate": 8.839363115026677e-06, "loss": 2.1984, "step": 11954 }, { "epoch": 2.0460379941810714, "grad_norm": 9.62386703491211, "learning_rate": 8.82885980373399e-06, "loss": 0.6107, "step": 11955 }, { "epoch": 2.0462091391408523, "grad_norm": 0.37506747245788574, "learning_rate": 8.818360133423211e-06, "loss": 0.0997, "step": 11956 }, { "epoch": 2.0463802841006333, "grad_norm": 3.907926321029663, "learning_rate": 8.807864110289159e-06, "loss": 0.3722, "step": 11957 }, { "epoch": 2.0465514290604143, "grad_norm": 7.3398308753967285, "learning_rate": 
8.797371740524508e-06, "loss": 0.6073, "step": 11958 }, { "epoch": 2.0467225740201953, "grad_norm": 8.504714012145996, "learning_rate": 8.786883030319765e-06, "loss": 0.7321, "step": 11959 }, { "epoch": 2.046893718979976, "grad_norm": 8.988840103149414, "learning_rate": 8.776397985863289e-06, "loss": 0.6231, "step": 11960 }, { "epoch": 2.047064863939757, "grad_norm": 10.813318252563477, "learning_rate": 8.765916613341272e-06, "loss": 0.8349, "step": 11961 }, { "epoch": 2.047236008899538, "grad_norm": 11.73931884765625, "learning_rate": 8.75543891893774e-06, "loss": 0.8376, "step": 11962 }, { "epoch": 2.047407153859319, "grad_norm": 8.261616706848145, "learning_rate": 8.744964908834543e-06, "loss": 0.6908, "step": 11963 }, { "epoch": 2.0475782988191, "grad_norm": 13.029111862182617, "learning_rate": 8.734494589211371e-06, "loss": 0.903, "step": 11964 }, { "epoch": 2.047749443778881, "grad_norm": 18.04186248779297, "learning_rate": 8.724027966245718e-06, "loss": 1.7882, "step": 11965 }, { "epoch": 2.047920588738662, "grad_norm": 7.08510160446167, "learning_rate": 8.71356504611292e-06, "loss": 0.5896, "step": 11966 }, { "epoch": 2.0480917336984428, "grad_norm": 6.447249889373779, "learning_rate": 8.703105834986106e-06, "loss": 0.4979, "step": 11967 }, { "epoch": 2.0482628786582233, "grad_norm": 0.31540340185165405, "learning_rate": 8.692650339036222e-06, "loss": 0.0973, "step": 11968 }, { "epoch": 2.0484340236180043, "grad_norm": 10.661703109741211, "learning_rate": 8.68219856443204e-06, "loss": 0.8368, "step": 11969 }, { "epoch": 2.0486051685777853, "grad_norm": 1.0013847351074219, "learning_rate": 8.67175051734011e-06, "loss": 0.1637, "step": 11970 }, { "epoch": 2.0487763135375663, "grad_norm": 13.735188484191895, "learning_rate": 8.661306203924804e-06, "loss": 1.1198, "step": 11971 }, { "epoch": 2.0489474584973473, "grad_norm": 20.656496047973633, "learning_rate": 8.650865630348258e-06, "loss": 2.2865, "step": 11972 }, { "epoch": 2.0491186034571283, "grad_norm": 
12.35753059387207, "learning_rate": 8.640428802770474e-06, "loss": 1.0165, "step": 11973 }, { "epoch": 2.0492897484169093, "grad_norm": 10.251640319824219, "learning_rate": 8.629995727349152e-06, "loss": 0.6946, "step": 11974 }, { "epoch": 2.0494608933766902, "grad_norm": 22.148473739624023, "learning_rate": 8.619566410239862e-06, "loss": 4.7663, "step": 11975 }, { "epoch": 2.049632038336471, "grad_norm": 2.664194345474243, "learning_rate": 8.60914085759587e-06, "loss": 0.2056, "step": 11976 }, { "epoch": 2.0498031832962518, "grad_norm": 10.310507774353027, "learning_rate": 8.598719075568308e-06, "loss": 0.7746, "step": 11977 }, { "epoch": 2.0499743282560328, "grad_norm": 7.585750579833984, "learning_rate": 8.58830107030601e-06, "loss": 0.5491, "step": 11978 }, { "epoch": 2.0501454732158138, "grad_norm": 1.231826663017273, "learning_rate": 8.577886847955647e-06, "loss": 0.1571, "step": 11979 }, { "epoch": 2.0503166181755947, "grad_norm": 15.5543851852417, "learning_rate": 8.567476414661591e-06, "loss": 1.7639, "step": 11980 }, { "epoch": 2.0504877631353757, "grad_norm": 0.3049885928630829, "learning_rate": 8.55706977656605e-06, "loss": 0.0956, "step": 11981 }, { "epoch": 2.0506589080951567, "grad_norm": 108.16285705566406, "learning_rate": 8.546666939808918e-06, "loss": 6.733, "step": 11982 }, { "epoch": 2.0508300530549377, "grad_norm": 0.5632036924362183, "learning_rate": 8.536267910527924e-06, "loss": 0.1057, "step": 11983 }, { "epoch": 2.0510011980147183, "grad_norm": 5.060983180999756, "learning_rate": 8.52587269485847e-06, "loss": 0.331, "step": 11984 }, { "epoch": 2.0511723429744992, "grad_norm": 18.311870574951172, "learning_rate": 8.51548129893379e-06, "loss": 1.9103, "step": 11985 }, { "epoch": 2.0513434879342802, "grad_norm": 19.06296157836914, "learning_rate": 8.50509372888478e-06, "loss": 1.9013, "step": 11986 }, { "epoch": 2.051514632894061, "grad_norm": 20.983905792236328, "learning_rate": 8.494709990840165e-06, "loss": 4.958, "step": 11987 }, { 
"epoch": 2.051685777853842, "grad_norm": 13.0452299118042, "learning_rate": 8.484330090926324e-06, "loss": 1.0111, "step": 11988 }, { "epoch": 2.051856922813623, "grad_norm": 2.6100704669952393, "learning_rate": 8.473954035267453e-06, "loss": 0.2089, "step": 11989 }, { "epoch": 2.052028067773404, "grad_norm": 17.106739044189453, "learning_rate": 8.463581829985406e-06, "loss": 1.7617, "step": 11990 }, { "epoch": 2.052199212733185, "grad_norm": 7.796577453613281, "learning_rate": 8.453213481199837e-06, "loss": 0.574, "step": 11991 }, { "epoch": 2.0523703576929657, "grad_norm": 9.661202430725098, "learning_rate": 8.44284899502804e-06, "loss": 0.6447, "step": 11992 }, { "epoch": 2.0525415026527467, "grad_norm": 7.536768913269043, "learning_rate": 8.432488377585104e-06, "loss": 0.501, "step": 11993 }, { "epoch": 2.0527126476125277, "grad_norm": 3.810490608215332, "learning_rate": 8.422131634983814e-06, "loss": 0.4212, "step": 11994 }, { "epoch": 2.0528837925723087, "grad_norm": 11.295687675476074, "learning_rate": 8.411778773334662e-06, "loss": 0.6297, "step": 11995 }, { "epoch": 2.0530549375320897, "grad_norm": 0.34953275322914124, "learning_rate": 8.401429798745842e-06, "loss": 0.1004, "step": 11996 }, { "epoch": 2.0532260824918707, "grad_norm": 7.498044490814209, "learning_rate": 8.391084717323271e-06, "loss": 0.4742, "step": 11997 }, { "epoch": 2.0533972274516517, "grad_norm": 2.501674175262451, "learning_rate": 8.380743535170558e-06, "loss": 0.2124, "step": 11998 }, { "epoch": 2.0535683724114326, "grad_norm": 9.349990844726562, "learning_rate": 8.370406258389024e-06, "loss": 0.6813, "step": 11999 }, { "epoch": 2.0537395173712136, "grad_norm": 5.395793437957764, "learning_rate": 8.360072893077672e-06, "loss": 0.4153, "step": 12000 }, { "epoch": 2.053910662330994, "grad_norm": 10.118413925170898, "learning_rate": 8.349743445333196e-06, "loss": 0.8985, "step": 12001 }, { "epoch": 2.054081807290775, "grad_norm": 7.633265972137451, "learning_rate": 
8.339417921249998e-06, "loss": 0.7414, "step": 12002 }, { "epoch": 2.054252952250556, "grad_norm": 2.6745824813842773, "learning_rate": 8.329096326920142e-06, "loss": 0.2411, "step": 12003 }, { "epoch": 2.054424097210337, "grad_norm": 7.342970848083496, "learning_rate": 8.318778668433396e-06, "loss": 0.713, "step": 12004 }, { "epoch": 2.054595242170118, "grad_norm": 0.3082271218299866, "learning_rate": 8.308464951877181e-06, "loss": 0.1007, "step": 12005 }, { "epoch": 2.054766387129899, "grad_norm": 12.544508934020996, "learning_rate": 8.298155183336617e-06, "loss": 0.885, "step": 12006 }, { "epoch": 2.05493753208968, "grad_norm": 7.243256568908691, "learning_rate": 8.287849368894476e-06, "loss": 0.5105, "step": 12007 }, { "epoch": 2.055108677049461, "grad_norm": 118.71031951904297, "learning_rate": 8.277547514631201e-06, "loss": 7.7632, "step": 12008 }, { "epoch": 2.0552798220092416, "grad_norm": 12.098856925964355, "learning_rate": 8.267249626624908e-06, "loss": 0.7554, "step": 12009 }, { "epoch": 2.0554509669690226, "grad_norm": 16.177705764770508, "learning_rate": 8.256955710951359e-06, "loss": 1.2698, "step": 12010 }, { "epoch": 2.0556221119288036, "grad_norm": 7.198063850402832, "learning_rate": 8.246665773683991e-06, "loss": 0.5804, "step": 12011 }, { "epoch": 2.0557932568885846, "grad_norm": 1.1358643770217896, "learning_rate": 8.236379820893868e-06, "loss": 0.1501, "step": 12012 }, { "epoch": 2.0559644018483656, "grad_norm": 19.300708770751953, "learning_rate": 8.22609785864973e-06, "loss": 2.051, "step": 12013 }, { "epoch": 2.0561355468081466, "grad_norm": 3.80964732170105, "learning_rate": 8.215819893017941e-06, "loss": 0.2686, "step": 12014 }, { "epoch": 2.0563066917679276, "grad_norm": 12.524219512939453, "learning_rate": 8.205545930062508e-06, "loss": 0.9689, "step": 12015 }, { "epoch": 2.0564778367277086, "grad_norm": 14.7266206741333, "learning_rate": 8.195275975845118e-06, "loss": 1.3043, "step": 12016 }, { "epoch": 2.056648981687489, "grad_norm": 
18.319597244262695, "learning_rate": 8.185010036425022e-06, "loss": 1.7, "step": 12017 }, { "epoch": 2.05682012664727, "grad_norm": 11.62499713897705, "learning_rate": 8.17474811785917e-06, "loss": 0.9709, "step": 12018 }, { "epoch": 2.056991271607051, "grad_norm": 13.032102584838867, "learning_rate": 8.164490226202079e-06, "loss": 1.0622, "step": 12019 }, { "epoch": 2.057162416566832, "grad_norm": 15.581334114074707, "learning_rate": 8.154236367505955e-06, "loss": 0.9826, "step": 12020 }, { "epoch": 2.057333561526613, "grad_norm": 8.071442604064941, "learning_rate": 8.143986547820551e-06, "loss": 0.5927, "step": 12021 }, { "epoch": 2.057504706486394, "grad_norm": 7.1524505615234375, "learning_rate": 8.133740773193313e-06, "loss": 0.8573, "step": 12022 }, { "epoch": 2.057675851446175, "grad_norm": 9.042652130126953, "learning_rate": 8.123499049669228e-06, "loss": 0.5731, "step": 12023 }, { "epoch": 2.057846996405956, "grad_norm": 9.338467597961426, "learning_rate": 8.113261383290971e-06, "loss": 0.6727, "step": 12024 }, { "epoch": 2.0580181413657366, "grad_norm": 52.94066619873047, "learning_rate": 8.10302778009873e-06, "loss": 5.8416, "step": 12025 }, { "epoch": 2.0581892863255176, "grad_norm": 2.991269826889038, "learning_rate": 8.092798246130391e-06, "loss": 0.2138, "step": 12026 }, { "epoch": 2.0583604312852986, "grad_norm": 14.370052337646484, "learning_rate": 8.082572787421357e-06, "loss": 1.1909, "step": 12027 }, { "epoch": 2.0585315762450795, "grad_norm": 1.0152217149734497, "learning_rate": 8.072351410004696e-06, "loss": 0.1815, "step": 12028 }, { "epoch": 2.0587027212048605, "grad_norm": 11.966301918029785, "learning_rate": 8.062134119911007e-06, "loss": 0.5782, "step": 12029 }, { "epoch": 2.0588738661646415, "grad_norm": 12.83458423614502, "learning_rate": 8.051920923168539e-06, "loss": 0.8823, "step": 12030 }, { "epoch": 2.0590450111244225, "grad_norm": 14.110050201416016, "learning_rate": 8.041711825803055e-06, "loss": 1.0712, "step": 12031 }, { 
"epoch": 2.0592161560842035, "grad_norm": 1.6581625938415527, "learning_rate": 8.031506833837977e-06, "loss": 0.1874, "step": 12032 }, { "epoch": 2.059387301043984, "grad_norm": 7.732429504394531, "learning_rate": 8.02130595329423e-06, "loss": 0.5699, "step": 12033 }, { "epoch": 2.059558446003765, "grad_norm": 58.31907653808594, "learning_rate": 8.01110919019038e-06, "loss": 4.8628, "step": 12034 }, { "epoch": 2.059729590963546, "grad_norm": 0.6284747123718262, "learning_rate": 8.0009165505425e-06, "loss": 0.1452, "step": 12035 }, { "epoch": 2.059900735923327, "grad_norm": 9.404206275939941, "learning_rate": 7.990728040364282e-06, "loss": 0.8343, "step": 12036 }, { "epoch": 2.060071880883108, "grad_norm": 13.370593070983887, "learning_rate": 7.980543665666973e-06, "loss": 1.0792, "step": 12037 }, { "epoch": 2.060243025842889, "grad_norm": 15.009908676147461, "learning_rate": 7.970363432459347e-06, "loss": 1.1034, "step": 12038 }, { "epoch": 2.06041417080267, "grad_norm": 12.847310066223145, "learning_rate": 7.960187346747775e-06, "loss": 0.9665, "step": 12039 }, { "epoch": 2.060585315762451, "grad_norm": 9.181509971618652, "learning_rate": 7.950015414536152e-06, "loss": 0.7375, "step": 12040 }, { "epoch": 2.0607564607222315, "grad_norm": 61.34807586669922, "learning_rate": 7.93984764182593e-06, "loss": 5.9498, "step": 12041 }, { "epoch": 2.0609276056820125, "grad_norm": 5.871700763702393, "learning_rate": 7.929684034616122e-06, "loss": 0.5876, "step": 12042 }, { "epoch": 2.0610987506417935, "grad_norm": 10.55725383758545, "learning_rate": 7.919524598903256e-06, "loss": 0.7522, "step": 12043 }, { "epoch": 2.0612698956015745, "grad_norm": 3.886810302734375, "learning_rate": 7.90936934068143e-06, "loss": 0.2356, "step": 12044 }, { "epoch": 2.0614410405613555, "grad_norm": 6.406006336212158, "learning_rate": 7.89921826594225e-06, "loss": 0.7425, "step": 12045 }, { "epoch": 2.0616121855211365, "grad_norm": 8.286447525024414, "learning_rate": 7.889071380674873e-06, 
"loss": 0.6397, "step": 12046 }, { "epoch": 2.0617833304809174, "grad_norm": 5.529025077819824, "learning_rate": 7.878928690865967e-06, "loss": 0.5268, "step": 12047 }, { "epoch": 2.0619544754406984, "grad_norm": 12.164166450500488, "learning_rate": 7.868790202499748e-06, "loss": 0.9916, "step": 12048 }, { "epoch": 2.0621256204004794, "grad_norm": 13.664443969726562, "learning_rate": 7.858655921557928e-06, "loss": 1.4452, "step": 12049 }, { "epoch": 2.06229676536026, "grad_norm": 20.972003936767578, "learning_rate": 7.848525854019749e-06, "loss": 2.6095, "step": 12050 }, { "epoch": 2.062467910320041, "grad_norm": 19.633014678955078, "learning_rate": 7.838400005861972e-06, "loss": 1.7025, "step": 12051 }, { "epoch": 2.062639055279822, "grad_norm": 10.406224250793457, "learning_rate": 7.828278383058852e-06, "loss": 0.7055, "step": 12052 }, { "epoch": 2.062810200239603, "grad_norm": 6.273819446563721, "learning_rate": 7.818160991582174e-06, "loss": 0.4054, "step": 12053 }, { "epoch": 2.062981345199384, "grad_norm": 9.866125106811523, "learning_rate": 7.808047837401202e-06, "loss": 0.6593, "step": 12054 }, { "epoch": 2.063152490159165, "grad_norm": 6.995495796203613, "learning_rate": 7.797938926482724e-06, "loss": 0.6497, "step": 12055 }, { "epoch": 2.063323635118946, "grad_norm": 10.044342994689941, "learning_rate": 7.787834264791008e-06, "loss": 0.6712, "step": 12056 }, { "epoch": 2.063494780078727, "grad_norm": 19.529191970825195, "learning_rate": 7.777733858287812e-06, "loss": 1.7619, "step": 12057 }, { "epoch": 2.0636659250385074, "grad_norm": 8.460591316223145, "learning_rate": 7.767637712932383e-06, "loss": 0.6503, "step": 12058 }, { "epoch": 2.0638370699982884, "grad_norm": 11.768653869628906, "learning_rate": 7.7575458346815e-06, "loss": 0.9316, "step": 12059 }, { "epoch": 2.0640082149580694, "grad_norm": 12.821147918701172, "learning_rate": 7.747458229489345e-06, "loss": 0.9593, "step": 12060 }, { "epoch": 2.0641793599178504, "grad_norm": 11.793830871582031, 
"learning_rate": 7.737374903307653e-06, "loss": 0.662, "step": 12061 }, { "epoch": 2.0643505048776314, "grad_norm": 10.993307113647461, "learning_rate": 7.727295862085569e-06, "loss": 0.6661, "step": 12062 }, { "epoch": 2.0645216498374124, "grad_norm": 18.010366439819336, "learning_rate": 7.717221111769777e-06, "loss": 2.0671, "step": 12063 }, { "epoch": 2.0646927947971934, "grad_norm": 0.2998868525028229, "learning_rate": 7.70715065830436e-06, "loss": 0.0953, "step": 12064 }, { "epoch": 2.0648639397569744, "grad_norm": 14.099712371826172, "learning_rate": 7.697084507630932e-06, "loss": 1.024, "step": 12065 }, { "epoch": 2.065035084716755, "grad_norm": 0.666097104549408, "learning_rate": 7.687022665688505e-06, "loss": 0.1569, "step": 12066 }, { "epoch": 2.065206229676536, "grad_norm": 14.61345386505127, "learning_rate": 7.676965138413617e-06, "loss": 1.1617, "step": 12067 }, { "epoch": 2.065377374636317, "grad_norm": 14.953215599060059, "learning_rate": 7.666911931740184e-06, "loss": 1.4047, "step": 12068 }, { "epoch": 2.065548519596098, "grad_norm": 0.3579903841018677, "learning_rate": 7.656863051599653e-06, "loss": 0.101, "step": 12069 }, { "epoch": 2.065719664555879, "grad_norm": 75.73260498046875, "learning_rate": 7.646818503920841e-06, "loss": 7.1987, "step": 12070 }, { "epoch": 2.06589080951566, "grad_norm": 0.4419393539428711, "learning_rate": 7.636778294630086e-06, "loss": 0.1033, "step": 12071 }, { "epoch": 2.066061954475441, "grad_norm": 2.4691989421844482, "learning_rate": 7.6267424296510836e-06, "loss": 0.2526, "step": 12072 }, { "epoch": 2.066233099435222, "grad_norm": 8.887480735778809, "learning_rate": 7.616710914905045e-06, "loss": 0.6822, "step": 12073 }, { "epoch": 2.0664042443950024, "grad_norm": 1.5953325033187866, "learning_rate": 7.606683756310548e-06, "loss": 0.2204, "step": 12074 }, { "epoch": 2.0665753893547834, "grad_norm": 18.213680267333984, "learning_rate": 7.596660959783665e-06, "loss": 2.1227, "step": 12075 }, { "epoch": 
2.0667465343145643, "grad_norm": 0.6412040591239929, "learning_rate": 7.586642531237823e-06, "loss": 0.1013, "step": 12076 }, { "epoch": 2.0669176792743453, "grad_norm": 0.783392071723938, "learning_rate": 7.576628476583947e-06, "loss": 0.1125, "step": 12077 }, { "epoch": 2.0670888242341263, "grad_norm": 14.314652442932129, "learning_rate": 7.56661880173031e-06, "loss": 1.0123, "step": 12078 }, { "epoch": 2.0672599691939073, "grad_norm": 53.17918014526367, "learning_rate": 7.5566135125826525e-06, "loss": 5.8608, "step": 12079 }, { "epoch": 2.0674311141536883, "grad_norm": 14.923975944519043, "learning_rate": 7.546612615044115e-06, "loss": 1.6835, "step": 12080 }, { "epoch": 2.0676022591134693, "grad_norm": 5.52974271774292, "learning_rate": 7.536616115015239e-06, "loss": 0.3979, "step": 12081 }, { "epoch": 2.06777340407325, "grad_norm": 7.308667182922363, "learning_rate": 7.526624018393975e-06, "loss": 0.5386, "step": 12082 }, { "epoch": 2.067944549033031, "grad_norm": 17.006975173950195, "learning_rate": 7.516636331075664e-06, "loss": 1.7344, "step": 12083 }, { "epoch": 2.068115693992812, "grad_norm": 13.940009117126465, "learning_rate": 7.5066530589530715e-06, "loss": 0.9107, "step": 12084 }, { "epoch": 2.068286838952593, "grad_norm": 3.1415724754333496, "learning_rate": 7.496674207916326e-06, "loss": 0.2024, "step": 12085 }, { "epoch": 2.068457983912374, "grad_norm": 8.864142417907715, "learning_rate": 7.486699783852983e-06, "loss": 0.6689, "step": 12086 }, { "epoch": 2.0686291288721548, "grad_norm": 13.856688499450684, "learning_rate": 7.476729792647949e-06, "loss": 1.4268, "step": 12087 }, { "epoch": 2.0688002738319358, "grad_norm": 14.482255935668945, "learning_rate": 7.466764240183551e-06, "loss": 1.2648, "step": 12088 }, { "epoch": 2.0689714187917168, "grad_norm": 10.345146179199219, "learning_rate": 7.456803132339472e-06, "loss": 0.7661, "step": 12089 }, { "epoch": 2.0691425637514973, "grad_norm": 14.924386024475098, "learning_rate": 7.446846474992774e-06, 
"loss": 1.0614, "step": 12090 }, { "epoch": 2.0693137087112783, "grad_norm": 4.855666160583496, "learning_rate": 7.4368942740179114e-06, "loss": 0.4409, "step": 12091 }, { "epoch": 2.0694848536710593, "grad_norm": 8.111489295959473, "learning_rate": 7.426946535286687e-06, "loss": 0.624, "step": 12092 }, { "epoch": 2.0696559986308403, "grad_norm": 0.8348966240882874, "learning_rate": 7.4170032646682915e-06, "loss": 0.1679, "step": 12093 }, { "epoch": 2.0698271435906213, "grad_norm": 14.6089506149292, "learning_rate": 7.407064468029259e-06, "loss": 1.1211, "step": 12094 }, { "epoch": 2.0699982885504022, "grad_norm": 7.881638050079346, "learning_rate": 7.3971301512335055e-06, "loss": 0.6188, "step": 12095 }, { "epoch": 2.0701694335101832, "grad_norm": 13.101614952087402, "learning_rate": 7.387200320142287e-06, "loss": 0.7638, "step": 12096 }, { "epoch": 2.070340578469964, "grad_norm": 11.043891906738281, "learning_rate": 7.377274980614212e-06, "loss": 0.6251, "step": 12097 }, { "epoch": 2.070511723429745, "grad_norm": 14.50870418548584, "learning_rate": 7.367354138505259e-06, "loss": 1.0163, "step": 12098 }, { "epoch": 2.0706828683895258, "grad_norm": 6.624751091003418, "learning_rate": 7.357437799668727e-06, "loss": 0.8749, "step": 12099 }, { "epoch": 2.0708540133493067, "grad_norm": 1.1789734363555908, "learning_rate": 7.347525969955281e-06, "loss": 0.1753, "step": 12100 }, { "epoch": 2.0710251583090877, "grad_norm": 6.192749977111816, "learning_rate": 7.3376186552128885e-06, "loss": 0.6629, "step": 12101 }, { "epoch": 2.0711963032688687, "grad_norm": 6.576925277709961, "learning_rate": 7.327715861286931e-06, "loss": 0.7269, "step": 12102 }, { "epoch": 2.0713674482286497, "grad_norm": 9.999547958374023, "learning_rate": 7.317817594020026e-06, "loss": 0.8994, "step": 12103 }, { "epoch": 2.0715385931884307, "grad_norm": 19.876232147216797, "learning_rate": 7.307923859252206e-06, "loss": 2.137, "step": 12104 }, { "epoch": 2.0717097381482117, "grad_norm": 
9.537254333496094, "learning_rate": 7.298034662820743e-06, "loss": 0.6595, "step": 12105 }, { "epoch": 2.0718808831079927, "grad_norm": 1.8034896850585938, "learning_rate": 7.288150010560317e-06, "loss": 0.1985, "step": 12106 }, { "epoch": 2.072052028067773, "grad_norm": 0.36084455251693726, "learning_rate": 7.278269908302854e-06, "loss": 0.096, "step": 12107 }, { "epoch": 2.072223173027554, "grad_norm": 13.04633903503418, "learning_rate": 7.268394361877664e-06, "loss": 0.9708, "step": 12108 }, { "epoch": 2.072394317987335, "grad_norm": 10.780892372131348, "learning_rate": 7.258523377111301e-06, "loss": 0.7595, "step": 12109 }, { "epoch": 2.072565462947116, "grad_norm": 15.826729774475098, "learning_rate": 7.248656959827692e-06, "loss": 0.9642, "step": 12110 }, { "epoch": 2.072736607906897, "grad_norm": 7.85272741317749, "learning_rate": 7.238795115848003e-06, "loss": 0.6435, "step": 12111 }, { "epoch": 2.072907752866678, "grad_norm": 0.32071030139923096, "learning_rate": 7.228937850990775e-06, "loss": 0.0945, "step": 12112 }, { "epoch": 2.073078897826459, "grad_norm": 14.882099151611328, "learning_rate": 7.219085171071771e-06, "loss": 1.4799, "step": 12113 }, { "epoch": 2.07325004278624, "grad_norm": 0.8807559609413147, "learning_rate": 7.209237081904124e-06, "loss": 0.1125, "step": 12114 }, { "epoch": 2.0734211877460207, "grad_norm": 3.0653038024902344, "learning_rate": 7.199393589298185e-06, "loss": 0.232, "step": 12115 }, { "epoch": 2.0735923327058017, "grad_norm": 3.3550987243652344, "learning_rate": 7.189554699061665e-06, "loss": 0.2642, "step": 12116 }, { "epoch": 2.0737634776655827, "grad_norm": 0.3961687386035919, "learning_rate": 7.179720416999488e-06, "loss": 0.1032, "step": 12117 }, { "epoch": 2.0739346226253637, "grad_norm": 10.76473617553711, "learning_rate": 7.169890748913929e-06, "loss": 0.7011, "step": 12118 }, { "epoch": 2.0741057675851446, "grad_norm": 23.70259666442871, "learning_rate": 7.160065700604475e-06, "loss": 0.7263, "step": 12119 }, { 
"epoch": 2.0742769125449256, "grad_norm": 2.9498953819274902, "learning_rate": 7.150245277867957e-06, "loss": 0.2301, "step": 12120 }, { "epoch": 2.0744480575047066, "grad_norm": 12.222895622253418, "learning_rate": 7.140429486498394e-06, "loss": 0.9473, "step": 12121 }, { "epoch": 2.0746192024644876, "grad_norm": 19.773466110229492, "learning_rate": 7.130618332287142e-06, "loss": 2.282, "step": 12122 }, { "epoch": 2.074790347424268, "grad_norm": 8.537799835205078, "learning_rate": 7.1208118210228e-06, "loss": 0.8369, "step": 12123 }, { "epoch": 2.074961492384049, "grad_norm": 15.170889854431152, "learning_rate": 7.111009958491225e-06, "loss": 0.9639, "step": 12124 }, { "epoch": 2.07513263734383, "grad_norm": 8.921980857849121, "learning_rate": 7.101212750475517e-06, "loss": 0.6637, "step": 12125 }, { "epoch": 2.075303782303611, "grad_norm": 10.858015060424805, "learning_rate": 7.09142020275606e-06, "loss": 0.8171, "step": 12126 }, { "epoch": 2.075474927263392, "grad_norm": 1.0369586944580078, "learning_rate": 7.081632321110455e-06, "loss": 0.1787, "step": 12127 }, { "epoch": 2.075646072223173, "grad_norm": 1.0272165536880493, "learning_rate": 7.0718491113135815e-06, "loss": 0.1791, "step": 12128 }, { "epoch": 2.075817217182954, "grad_norm": 8.102487564086914, "learning_rate": 7.062070579137541e-06, "loss": 0.7433, "step": 12129 }, { "epoch": 2.075988362142735, "grad_norm": 11.80177116394043, "learning_rate": 7.052296730351676e-06, "loss": 0.8734, "step": 12130 }, { "epoch": 2.0761595071025156, "grad_norm": 1.875447154045105, "learning_rate": 7.042527570722584e-06, "loss": 0.2361, "step": 12131 }, { "epoch": 2.0763306520622966, "grad_norm": 26.69366455078125, "learning_rate": 7.0327631060140705e-06, "loss": 5.4323, "step": 12132 }, { "epoch": 2.0765017970220776, "grad_norm": 13.793013572692871, "learning_rate": 7.023003341987198e-06, "loss": 0.9908, "step": 12133 }, { "epoch": 2.0766729419818586, "grad_norm": 7.597231864929199, "learning_rate": 
7.01324828440023e-06, "loss": 0.5493, "step": 12134 }, { "epoch": 2.0768440869416396, "grad_norm": 18.652191162109375, "learning_rate": 7.0034979390086755e-06, "loss": 2.1144, "step": 12135 }, { "epoch": 2.0770152319014206, "grad_norm": 5.6538262367248535, "learning_rate": 6.9937523115652464e-06, "loss": 0.4524, "step": 12136 }, { "epoch": 2.0771863768612016, "grad_norm": 17.920682907104492, "learning_rate": 6.9840114078198745e-06, "loss": 2.0412, "step": 12137 }, { "epoch": 2.0773575218209825, "grad_norm": 2.917215585708618, "learning_rate": 6.974275233519717e-06, "loss": 0.3882, "step": 12138 }, { "epoch": 2.077528666780763, "grad_norm": 2.818291187286377, "learning_rate": 6.964543794409118e-06, "loss": 0.3293, "step": 12139 }, { "epoch": 2.077699811740544, "grad_norm": 5.576604843139648, "learning_rate": 6.954817096229658e-06, "loss": 0.4654, "step": 12140 }, { "epoch": 2.077870956700325, "grad_norm": 5.780550956726074, "learning_rate": 6.9450951447200855e-06, "loss": 0.508, "step": 12141 }, { "epoch": 2.078042101660106, "grad_norm": 9.944047927856445, "learning_rate": 6.9353779456163804e-06, "loss": 0.6269, "step": 12142 }, { "epoch": 2.078213246619887, "grad_norm": 6.8254265785217285, "learning_rate": 6.925665504651702e-06, "loss": 0.571, "step": 12143 }, { "epoch": 2.078384391579668, "grad_norm": 14.667645454406738, "learning_rate": 6.915957827556377e-06, "loss": 1.8661, "step": 12144 }, { "epoch": 2.078555536539449, "grad_norm": 10.591224670410156, "learning_rate": 6.906254920058005e-06, "loss": 0.6342, "step": 12145 }, { "epoch": 2.07872668149923, "grad_norm": 16.009662628173828, "learning_rate": 6.896556787881268e-06, "loss": 1.5951, "step": 12146 }, { "epoch": 2.078897826459011, "grad_norm": 6.49214506149292, "learning_rate": 6.8868634367481105e-06, "loss": 0.5759, "step": 12147 }, { "epoch": 2.0790689714187915, "grad_norm": 18.11467170715332, "learning_rate": 6.877174872377596e-06, "loss": 1.6491, "step": 12148 }, { "epoch": 2.0792401163785725, 
"grad_norm": 8.221776008605957, "learning_rate": 6.867491100486021e-06, "loss": 0.5319, "step": 12149 }, { "epoch": 2.0794112613383535, "grad_norm": 5.171175479888916, "learning_rate": 6.857812126786793e-06, "loss": 0.4537, "step": 12150 }, { "epoch": 2.0795824062981345, "grad_norm": 13.825112342834473, "learning_rate": 6.848137956990553e-06, "loss": 0.9258, "step": 12151 }, { "epoch": 2.0797535512579155, "grad_norm": 8.968208312988281, "learning_rate": 6.838468596805045e-06, "loss": 0.8795, "step": 12152 }, { "epoch": 2.0799246962176965, "grad_norm": 3.204644203186035, "learning_rate": 6.828804051935243e-06, "loss": 0.3832, "step": 12153 }, { "epoch": 2.0800958411774775, "grad_norm": 14.705578804016113, "learning_rate": 6.8191443280831985e-06, "loss": 1.1189, "step": 12154 }, { "epoch": 2.0802669861372585, "grad_norm": 16.179912567138672, "learning_rate": 6.809489430948202e-06, "loss": 1.1157, "step": 12155 }, { "epoch": 2.080438131097039, "grad_norm": 8.70103931427002, "learning_rate": 6.799839366226626e-06, "loss": 0.6381, "step": 12156 }, { "epoch": 2.08060927605682, "grad_norm": 12.236988067626953, "learning_rate": 6.790194139612053e-06, "loss": 0.7737, "step": 12157 }, { "epoch": 2.080780421016601, "grad_norm": 11.918542861938477, "learning_rate": 6.780553756795148e-06, "loss": 0.7417, "step": 12158 }, { "epoch": 2.080951565976382, "grad_norm": 15.374375343322754, "learning_rate": 6.770918223463788e-06, "loss": 0.8578, "step": 12159 }, { "epoch": 2.081122710936163, "grad_norm": 2.522052764892578, "learning_rate": 6.761287545302915e-06, "loss": 0.2609, "step": 12160 }, { "epoch": 2.081293855895944, "grad_norm": 13.692422866821289, "learning_rate": 6.751661727994677e-06, "loss": 1.6109, "step": 12161 }, { "epoch": 2.081465000855725, "grad_norm": 6.656675815582275, "learning_rate": 6.7420407772182906e-06, "loss": 0.4577, "step": 12162 }, { "epoch": 2.081636145815506, "grad_norm": 0.3403339684009552, "learning_rate": 6.732424698650161e-06, "loss": 0.1041, "step": 
12163 }, { "epoch": 2.0818072907752865, "grad_norm": 12.80004596710205, "learning_rate": 6.722813497963758e-06, "loss": 0.8304, "step": 12164 }, { "epoch": 2.0819784357350675, "grad_norm": 25.59823226928711, "learning_rate": 6.713207180829718e-06, "loss": 4.9471, "step": 12165 }, { "epoch": 2.0821495806948485, "grad_norm": 7.218756198883057, "learning_rate": 6.703605752915796e-06, "loss": 0.6152, "step": 12166 }, { "epoch": 2.0823207256546294, "grad_norm": 13.28813362121582, "learning_rate": 6.694009219886831e-06, "loss": 0.9352, "step": 12167 }, { "epoch": 2.0824918706144104, "grad_norm": 54.30827331542969, "learning_rate": 6.684417587404803e-06, "loss": 5.8207, "step": 12168 }, { "epoch": 2.0826630155741914, "grad_norm": 0.4861740171909332, "learning_rate": 6.6748308611287855e-06, "loss": 0.1035, "step": 12169 }, { "epoch": 2.0828341605339724, "grad_norm": 17.787147521972656, "learning_rate": 6.665249046714955e-06, "loss": 1.2305, "step": 12170 }, { "epoch": 2.0830053054937534, "grad_norm": 21.971818923950195, "learning_rate": 6.655672149816605e-06, "loss": 5.2417, "step": 12171 }, { "epoch": 2.083176450453534, "grad_norm": 5.7993483543396, "learning_rate": 6.646100176084111e-06, "loss": 0.3068, "step": 12172 }, { "epoch": 2.083347595413315, "grad_norm": 0.3392581641674042, "learning_rate": 6.6365331311649604e-06, "loss": 0.0952, "step": 12173 }, { "epoch": 2.083518740373096, "grad_norm": 10.269072532653809, "learning_rate": 6.626971020703714e-06, "loss": 0.6997, "step": 12174 }, { "epoch": 2.083689885332877, "grad_norm": 8.237215042114258, "learning_rate": 6.617413850342042e-06, "loss": 0.6259, "step": 12175 }, { "epoch": 2.083861030292658, "grad_norm": 12.255851745605469, "learning_rate": 6.607861625718684e-06, "loss": 1.0948, "step": 12176 }, { "epoch": 2.084032175252439, "grad_norm": 0.3219527304172516, "learning_rate": 6.598314352469461e-06, "loss": 0.0974, "step": 12177 }, { "epoch": 2.08420332021222, "grad_norm": 16.824554443359375, "learning_rate": 
6.58877203622729e-06, "loss": 1.5412, "step": 12178 }, { "epoch": 2.084374465172001, "grad_norm": 10.882758140563965, "learning_rate": 6.579234682622139e-06, "loss": 0.7642, "step": 12179 }, { "epoch": 2.0845456101317814, "grad_norm": 3.1497888565063477, "learning_rate": 6.5697022972810745e-06, "loss": 0.2763, "step": 12180 }, { "epoch": 2.0847167550915624, "grad_norm": 0.34292444586753845, "learning_rate": 6.5601748858282065e-06, "loss": 0.0978, "step": 12181 }, { "epoch": 2.0848879000513434, "grad_norm": 5.213808059692383, "learning_rate": 6.550652453884729e-06, "loss": 0.2609, "step": 12182 }, { "epoch": 2.0850590450111244, "grad_norm": 0.558381974697113, "learning_rate": 6.541135007068887e-06, "loss": 0.1434, "step": 12183 }, { "epoch": 2.0852301899709054, "grad_norm": 11.533126831054688, "learning_rate": 6.531622550995993e-06, "loss": 1.0335, "step": 12184 }, { "epoch": 2.0854013349306864, "grad_norm": 7.610280990600586, "learning_rate": 6.5221150912784074e-06, "loss": 0.5106, "step": 12185 }, { "epoch": 2.0855724798904673, "grad_norm": 13.149750709533691, "learning_rate": 6.512612633525535e-06, "loss": 1.0309, "step": 12186 }, { "epoch": 2.0857436248502483, "grad_norm": 17.768402099609375, "learning_rate": 6.50311518334385e-06, "loss": 1.7948, "step": 12187 }, { "epoch": 2.085914769810029, "grad_norm": 21.48052215576172, "learning_rate": 6.4936227463368795e-06, "loss": 2.2838, "step": 12188 }, { "epoch": 2.08608591476981, "grad_norm": 26.855064392089844, "learning_rate": 6.484135328105143e-06, "loss": 5.1908, "step": 12189 }, { "epoch": 2.086257059729591, "grad_norm": 8.480819702148438, "learning_rate": 6.474652934246262e-06, "loss": 0.4656, "step": 12190 }, { "epoch": 2.086428204689372, "grad_norm": 4.383869647979736, "learning_rate": 6.465175570354832e-06, "loss": 0.3024, "step": 12191 }, { "epoch": 2.086599349649153, "grad_norm": 23.337060928344727, "learning_rate": 6.455703242022543e-06, "loss": 5.1359, "step": 12192 }, { "epoch": 2.086770494608934, 
"grad_norm": 8.683834075927734, "learning_rate": 6.446235954838053e-06, "loss": 0.6771, "step": 12193 }, { "epoch": 2.086941639568715, "grad_norm": 5.487773418426514, "learning_rate": 6.436773714387106e-06, "loss": 0.614, "step": 12194 }, { "epoch": 2.087112784528496, "grad_norm": 7.932293891906738, "learning_rate": 6.427316526252405e-06, "loss": 0.5762, "step": 12195 }, { "epoch": 2.087283929488277, "grad_norm": 9.785226821899414, "learning_rate": 6.41786439601374e-06, "loss": 0.6629, "step": 12196 }, { "epoch": 2.0874550744480573, "grad_norm": 9.893636703491211, "learning_rate": 6.408417329247847e-06, "loss": 0.8088, "step": 12197 }, { "epoch": 2.0876262194078383, "grad_norm": 86.0622787475586, "learning_rate": 6.3989753315285406e-06, "loss": 5.8329, "step": 12198 }, { "epoch": 2.0877973643676193, "grad_norm": 0.6352384686470032, "learning_rate": 6.389538408426587e-06, "loss": 0.1413, "step": 12199 }, { "epoch": 2.0879685093274003, "grad_norm": 19.251192092895508, "learning_rate": 6.3801065655098165e-06, "loss": 2.2262, "step": 12200 }, { "epoch": 2.0881396542871813, "grad_norm": 16.50313377380371, "learning_rate": 6.370679808342991e-06, "loss": 1.7644, "step": 12201 }, { "epoch": 2.0883107992469623, "grad_norm": 5.311058521270752, "learning_rate": 6.361258142487948e-06, "loss": 0.4058, "step": 12202 }, { "epoch": 2.0884819442067433, "grad_norm": 11.930335998535156, "learning_rate": 6.35184157350345e-06, "loss": 1.0426, "step": 12203 }, { "epoch": 2.0886530891665243, "grad_norm": 10.157562255859375, "learning_rate": 6.342430106945322e-06, "loss": 0.716, "step": 12204 }, { "epoch": 2.088824234126305, "grad_norm": 4.22382116317749, "learning_rate": 6.333023748366311e-06, "loss": 0.2879, "step": 12205 }, { "epoch": 2.088995379086086, "grad_norm": 22.903278350830078, "learning_rate": 6.323622503316211e-06, "loss": 4.8989, "step": 12206 }, { "epoch": 2.089166524045867, "grad_norm": 15.26009464263916, "learning_rate": 6.314226377341743e-06, "loss": 1.3637, "step": 
12207 }, { "epoch": 2.0893376690056478, "grad_norm": 19.294307708740234, "learning_rate": 6.304835375986649e-06, "loss": 2.0071, "step": 12208 }, { "epoch": 2.0895088139654288, "grad_norm": 9.782569885253906, "learning_rate": 6.2954495047916445e-06, "loss": 0.7599, "step": 12209 }, { "epoch": 2.0896799589252097, "grad_norm": 1.3692255020141602, "learning_rate": 6.286068769294393e-06, "loss": 0.1727, "step": 12210 }, { "epoch": 2.0898511038849907, "grad_norm": 8.501511573791504, "learning_rate": 6.276693175029553e-06, "loss": 0.7113, "step": 12211 }, { "epoch": 2.0900222488447717, "grad_norm": 17.74875831604004, "learning_rate": 6.267322727528726e-06, "loss": 1.4958, "step": 12212 }, { "epoch": 2.0901933938045523, "grad_norm": 12.815526008605957, "learning_rate": 6.257957432320501e-06, "loss": 1.086, "step": 12213 }, { "epoch": 2.0903645387643333, "grad_norm": 6.476343631744385, "learning_rate": 6.248597294930407e-06, "loss": 0.4795, "step": 12214 }, { "epoch": 2.0905356837241142, "grad_norm": 0.5211002826690674, "learning_rate": 6.23924232088095e-06, "loss": 0.0996, "step": 12215 }, { "epoch": 2.0907068286838952, "grad_norm": 0.3298466205596924, "learning_rate": 6.229892515691566e-06, "loss": 0.0991, "step": 12216 }, { "epoch": 2.0908779736436762, "grad_norm": 12.241260528564453, "learning_rate": 6.220547884878667e-06, "loss": 1.0223, "step": 12217 }, { "epoch": 2.091049118603457, "grad_norm": 9.237482070922852, "learning_rate": 6.211208433955592e-06, "loss": 0.5836, "step": 12218 }, { "epoch": 2.091220263563238, "grad_norm": 5.703768253326416, "learning_rate": 6.201874168432627e-06, "loss": 0.5118, "step": 12219 }, { "epoch": 2.091391408523019, "grad_norm": 23.705324172973633, "learning_rate": 6.192545093817011e-06, "loss": 5.2883, "step": 12220 }, { "epoch": 2.0915625534827997, "grad_norm": 8.598515510559082, "learning_rate": 6.1832212156129045e-06, "loss": 0.7711, "step": 12221 }, { "epoch": 2.0917336984425807, "grad_norm": 0.49912920594215393, "learning_rate": 
6.173902539321417e-06, "loss": 0.1018, "step": 12222 }, { "epoch": 2.0919048434023617, "grad_norm": 128.41470336914062, "learning_rate": 6.164589070440572e-06, "loss": 8.1274, "step": 12223 }, { "epoch": 2.0920759883621427, "grad_norm": 10.98254680633545, "learning_rate": 6.155280814465341e-06, "loss": 0.8118, "step": 12224 }, { "epoch": 2.0922471333219237, "grad_norm": 7.3365373611450195, "learning_rate": 6.145977776887604e-06, "loss": 0.6601, "step": 12225 }, { "epoch": 2.0924182782817047, "grad_norm": 7.6063408851623535, "learning_rate": 6.1366799631961604e-06, "loss": 0.6467, "step": 12226 }, { "epoch": 2.0925894232414857, "grad_norm": 12.426116943359375, "learning_rate": 6.127387378876746e-06, "loss": 0.8158, "step": 12227 }, { "epoch": 2.0927605682012667, "grad_norm": 6.0945515632629395, "learning_rate": 6.1181000294119874e-06, "loss": 0.5732, "step": 12228 }, { "epoch": 2.092931713161047, "grad_norm": 17.69382667541504, "learning_rate": 6.108817920281436e-06, "loss": 1.7922, "step": 12229 }, { "epoch": 2.093102858120828, "grad_norm": 6.411680698394775, "learning_rate": 6.099541056961553e-06, "loss": 0.7342, "step": 12230 }, { "epoch": 2.093274003080609, "grad_norm": 146.19932556152344, "learning_rate": 6.090269444925722e-06, "loss": 7.0016, "step": 12231 }, { "epoch": 2.09344514804039, "grad_norm": 7.426905155181885, "learning_rate": 6.081003089644171e-06, "loss": 0.7969, "step": 12232 }, { "epoch": 2.093616293000171, "grad_norm": 6.8522257804870605, "learning_rate": 6.071741996584104e-06, "loss": 0.4889, "step": 12233 }, { "epoch": 2.093787437959952, "grad_norm": 21.34181022644043, "learning_rate": 6.0624861712095355e-06, "loss": 4.901, "step": 12234 }, { "epoch": 2.093958582919733, "grad_norm": 16.318679809570312, "learning_rate": 6.053235618981454e-06, "loss": 1.4968, "step": 12235 }, { "epoch": 2.094129727879514, "grad_norm": 13.211913108825684, "learning_rate": 6.0439903453576665e-06, "loss": 1.0677, "step": 12236 }, { "epoch": 2.0943008728392947, 
"grad_norm": 14.884153366088867, "learning_rate": 6.034750355792932e-06, "loss": 1.7141, "step": 12237 }, { "epoch": 2.0944720177990757, "grad_norm": 15.627666473388672, "learning_rate": 6.025515655738824e-06, "loss": 1.2868, "step": 12238 }, { "epoch": 2.0946431627588566, "grad_norm": 17.632801055908203, "learning_rate": 6.016286250643864e-06, "loss": 1.3114, "step": 12239 }, { "epoch": 2.0948143077186376, "grad_norm": 10.28708267211914, "learning_rate": 6.0070621459533795e-06, "loss": 0.7839, "step": 12240 }, { "epoch": 2.0949854526784186, "grad_norm": 14.263157844543457, "learning_rate": 5.997843347109639e-06, "loss": 1.3731, "step": 12241 }, { "epoch": 2.0951565976381996, "grad_norm": 10.61111068725586, "learning_rate": 5.988629859551719e-06, "loss": 0.8161, "step": 12242 }, { "epoch": 2.0953277425979806, "grad_norm": 13.280919075012207, "learning_rate": 5.979421688715617e-06, "loss": 0.8464, "step": 12243 }, { "epoch": 2.0954988875577616, "grad_norm": 15.816527366638184, "learning_rate": 5.9702188400341394e-06, "loss": 1.7619, "step": 12244 }, { "epoch": 2.0956700325175426, "grad_norm": 14.596202850341797, "learning_rate": 5.9610213189370105e-06, "loss": 1.1625, "step": 12245 }, { "epoch": 2.095841177477323, "grad_norm": 8.234358787536621, "learning_rate": 5.951829130850753e-06, "loss": 0.5193, "step": 12246 }, { "epoch": 2.096012322437104, "grad_norm": 23.48274040222168, "learning_rate": 5.9426422811987995e-06, "loss": 5.0675, "step": 12247 }, { "epoch": 2.096183467396885, "grad_norm": 20.020809173583984, "learning_rate": 5.933460775401376e-06, "loss": 2.1134, "step": 12248 }, { "epoch": 2.096354612356666, "grad_norm": 10.504180908203125, "learning_rate": 5.924284618875619e-06, "loss": 0.6388, "step": 12249 }, { "epoch": 2.096525757316447, "grad_norm": 11.281728744506836, "learning_rate": 5.915113817035433e-06, "loss": 0.9457, "step": 12250 }, { "epoch": 2.096696902276228, "grad_norm": 0.9959191679954529, "learning_rate": 5.905948375291628e-06, "loss": 
0.1732, "step": 12251 }, { "epoch": 2.096868047236009, "grad_norm": 5.866310119628906, "learning_rate": 5.896788299051832e-06, "loss": 0.5993, "step": 12252 }, { "epoch": 2.09703919219579, "grad_norm": 0.34504595398902893, "learning_rate": 5.8876335937205035e-06, "loss": 0.0955, "step": 12253 }, { "epoch": 2.0972103371555706, "grad_norm": 8.674176216125488, "learning_rate": 5.878484264698922e-06, "loss": 0.6405, "step": 12254 }, { "epoch": 2.0973814821153516, "grad_norm": 24.563919067382812, "learning_rate": 5.869340317385216e-06, "loss": 4.8516, "step": 12255 }, { "epoch": 2.0975526270751326, "grad_norm": 3.0636067390441895, "learning_rate": 5.8602017571743166e-06, "loss": 0.2175, "step": 12256 }, { "epoch": 2.0977237720349136, "grad_norm": 12.210594177246094, "learning_rate": 5.851068589458e-06, "loss": 0.968, "step": 12257 }, { "epoch": 2.0978949169946945, "grad_norm": 12.487648963928223, "learning_rate": 5.841940819624841e-06, "loss": 0.9434, "step": 12258 }, { "epoch": 2.0980660619544755, "grad_norm": 5.007801532745361, "learning_rate": 5.832818453060236e-06, "loss": 0.4823, "step": 12259 }, { "epoch": 2.0982372069142565, "grad_norm": 22.37860107421875, "learning_rate": 5.823701495146401e-06, "loss": 4.4246, "step": 12260 }, { "epoch": 2.0984083518740375, "grad_norm": 18.377269744873047, "learning_rate": 5.814589951262346e-06, "loss": 1.1927, "step": 12261 }, { "epoch": 2.098579496833818, "grad_norm": 10.410011291503906, "learning_rate": 5.805483826783909e-06, "loss": 0.7231, "step": 12262 }, { "epoch": 2.098750641793599, "grad_norm": 15.64672565460205, "learning_rate": 5.796383127083702e-06, "loss": 2.0417, "step": 12263 }, { "epoch": 2.09892178675338, "grad_norm": 17.89153480529785, "learning_rate": 5.787287857531164e-06, "loss": 1.6618, "step": 12264 }, { "epoch": 2.099092931713161, "grad_norm": 20.804140090942383, "learning_rate": 5.778198023492512e-06, "loss": 2.6203, "step": 12265 }, { "epoch": 2.099264076672942, "grad_norm": 21.22849464416504, 
"learning_rate": 5.769113630330755e-06, "loss": 2.397, "step": 12266 }, { "epoch": 2.099435221632723, "grad_norm": 2.559537410736084, "learning_rate": 5.760034683405712e-06, "loss": 0.2419, "step": 12267 }, { "epoch": 2.099606366592504, "grad_norm": 15.019838333129883, "learning_rate": 5.750961188073963e-06, "loss": 1.5579, "step": 12268 }, { "epoch": 2.099777511552285, "grad_norm": 0.32152462005615234, "learning_rate": 5.741893149688895e-06, "loss": 0.0982, "step": 12269 }, { "epoch": 2.0999486565120655, "grad_norm": 1.1599749326705933, "learning_rate": 5.732830573600652e-06, "loss": 0.1701, "step": 12270 }, { "epoch": 2.1001198014718465, "grad_norm": 11.784408569335938, "learning_rate": 5.723773465156179e-06, "loss": 0.8494, "step": 12271 }, { "epoch": 2.1002909464316275, "grad_norm": 5.574843883514404, "learning_rate": 5.714721829699168e-06, "loss": 0.6175, "step": 12272 }, { "epoch": 2.1004620913914085, "grad_norm": 16.805585861206055, "learning_rate": 5.705675672570107e-06, "loss": 1.6765, "step": 12273 }, { "epoch": 2.1006332363511895, "grad_norm": 7.845545768737793, "learning_rate": 5.696634999106258e-06, "loss": 0.7041, "step": 12274 }, { "epoch": 2.1008043813109705, "grad_norm": 10.239072799682617, "learning_rate": 5.687599814641602e-06, "loss": 0.7406, "step": 12275 }, { "epoch": 2.1009755262707515, "grad_norm": 3.576368570327759, "learning_rate": 5.6785701245069405e-06, "loss": 0.4328, "step": 12276 }, { "epoch": 2.1011466712305324, "grad_norm": 11.931581497192383, "learning_rate": 5.669545934029775e-06, "loss": 0.9275, "step": 12277 }, { "epoch": 2.101317816190313, "grad_norm": 17.57024383544922, "learning_rate": 5.66052724853442e-06, "loss": 1.5889, "step": 12278 }, { "epoch": 2.101317816190313, "eval_nli-pairs_loss": 1.2058579921722412, "eval_nli-pairs_runtime": 4.6694, "eval_nli-pairs_samples_per_second": 42.832, "eval_nli-pairs_steps_per_second": 1.499, "eval_sts-test_pearson_cosine": 0.7778089716749725, "eval_sts-test_pearson_dot": 
0.6365128073667151, "eval_sts-test_pearson_euclidean": 0.7626862642088545, "eval_sts-test_pearson_manhattan": 0.7655837247082306, "eval_sts-test_pearson_max": 0.7778089716749725, "eval_sts-test_spearman_cosine": 0.7826369883149322, "eval_sts-test_spearman_dot": 0.6133237678861742, "eval_sts-test_spearman_euclidean": 0.7527692065832519, "eval_sts-test_spearman_manhattan": 0.7579239551021633, "eval_sts-test_spearman_max": 0.7826369883149322, "step": 12278 }, { "epoch": 2.101317816190313, "eval_vitaminc-pairs_loss": 0.5853947401046753, "eval_vitaminc-pairs_runtime": 2.8285, "eval_vitaminc-pairs_samples_per_second": 70.708, "eval_vitaminc-pairs_steps_per_second": 2.475, "step": 12278 }, { "epoch": 2.101317816190313, "eval_qnli-contrastive_loss": 1.2590630054473877, "eval_qnli-contrastive_runtime": 0.652, "eval_qnli-contrastive_samples_per_second": 306.763, "eval_qnli-contrastive_steps_per_second": 10.737, "step": 12278 }, { "epoch": 2.101317816190313, "eval_scitail-pairs-qa_loss": 0.08177236467599869, "eval_scitail-pairs-qa_runtime": 1.8903, "eval_scitail-pairs-qa_samples_per_second": 105.801, "eval_scitail-pairs-qa_steps_per_second": 3.703, "step": 12278 }, { "epoch": 2.101317816190313, "eval_scitail-pairs-pos_loss": 0.603409469127655, "eval_scitail-pairs-pos_runtime": 2.6998, "eval_scitail-pairs-pos_samples_per_second": 74.079, "eval_scitail-pairs-pos_steps_per_second": 2.593, "step": 12278 }, { "epoch": 2.101317816190313, "eval_xsum-pairs_loss": 0.6208459734916687, "eval_xsum-pairs_runtime": 2.7013, "eval_xsum-pairs_samples_per_second": 64.783, "eval_xsum-pairs_steps_per_second": 2.221, "step": 12278 }, { "epoch": 2.101317816190313, "eval_compression-pairs_loss": 0.175527423620224, "eval_compression-pairs_runtime": 0.5298, "eval_compression-pairs_samples_per_second": 377.533, "eval_compression-pairs_steps_per_second": 13.214, "step": 12278 }, { "epoch": 2.101317816190313, "eval_sciq_pairs_loss": 0.32900524139404297, "eval_sciq_pairs_runtime": 9.452, 
"eval_sciq_pairs_samples_per_second": 21.16, "eval_sciq_pairs_steps_per_second": 0.741, "step": 12278 }, { "epoch": 2.101317816190313, "eval_qasc_pairs_loss": 5.0988287925720215, "eval_qasc_pairs_runtime": 2.829, "eval_qasc_pairs_samples_per_second": 70.697, "eval_qasc_pairs_steps_per_second": 2.474, "step": 12278 }, { "epoch": 2.101317816190313, "eval_openbookqa_pairs_loss": 2.27594256401062, "eval_openbookqa_pairs_runtime": 0.7069, "eval_openbookqa_pairs_samples_per_second": 97.614, "eval_openbookqa_pairs_steps_per_second": 4.244, "step": 12278 }, { "epoch": 2.101317816190313, "eval_msmarco_pairs_loss": 0.85939621925354, "eval_msmarco_pairs_runtime": 4.0748, "eval_msmarco_pairs_samples_per_second": 49.083, "eval_msmarco_pairs_steps_per_second": 1.718, "step": 12278 }, { "epoch": 2.101317816190313, "eval_nq_pairs_loss": 1.0224627256393433, "eval_nq_pairs_runtime": 8.8095, "eval_nq_pairs_samples_per_second": 22.703, "eval_nq_pairs_steps_per_second": 0.795, "step": 12278 }, { "epoch": 2.101317816190313, "eval_trivia_pairs_loss": 1.3771618604660034, "eval_trivia_pairs_runtime": 12.8919, "eval_trivia_pairs_samples_per_second": 15.514, "eval_trivia_pairs_steps_per_second": 0.543, "step": 12278 }, { "epoch": 2.101317816190313, "eval_quora_pairs_loss": 0.1604246199131012, "eval_quora_pairs_runtime": 1.5888, "eval_quora_pairs_samples_per_second": 125.878, "eval_quora_pairs_steps_per_second": 4.406, "step": 12278 }, { "epoch": 2.101317816190313, "eval_gooaq_pairs_loss": 0.6805934309959412, "eval_gooaq_pairs_runtime": 2.6448, "eval_gooaq_pairs_samples_per_second": 75.621, "eval_gooaq_pairs_steps_per_second": 2.647, "step": 12278 }, { "epoch": 2.101488961150094, "grad_norm": 2.468231678009033, "learning_rate": 5.651514073341884e-06, "loss": 0.1967, "step": 12279 }, { "epoch": 2.101660106109875, "grad_norm": 3.6627144813537598, "learning_rate": 5.642506413769985e-06, "loss": 0.215, "step": 12280 }, { "epoch": 2.101831251069656, "grad_norm": 7.181698799133301, "learning_rate": 
5.633504275133223e-06, "loss": 0.5828, "step": 12281 }, { "epoch": 2.102002396029437, "grad_norm": 1.949537992477417, "learning_rate": 5.6245076627429124e-06, "loss": 0.1849, "step": 12282 }, { "epoch": 2.102173540989218, "grad_norm": 0.3266027569770813, "learning_rate": 5.615516581907022e-06, "loss": 0.0964, "step": 12283 }, { "epoch": 2.102344685948999, "grad_norm": 10.785505294799805, "learning_rate": 5.606531037930343e-06, "loss": 0.6898, "step": 12284 }, { "epoch": 2.10251583090878, "grad_norm": 18.772653579711914, "learning_rate": 5.597551036114328e-06, "loss": 1.701, "step": 12285 }, { "epoch": 2.1026869758685605, "grad_norm": 6.859883785247803, "learning_rate": 5.58857658175722e-06, "loss": 0.464, "step": 12286 }, { "epoch": 2.1028581208283414, "grad_norm": 14.721001625061035, "learning_rate": 5.579607680153932e-06, "loss": 1.0006, "step": 12287 }, { "epoch": 2.1030292657881224, "grad_norm": 1.0394103527069092, "learning_rate": 5.57064433659616e-06, "loss": 0.1699, "step": 12288 }, { "epoch": 2.1032004107479034, "grad_norm": 1.2281742095947266, "learning_rate": 5.561686556372258e-06, "loss": 0.1916, "step": 12289 }, { "epoch": 2.1033715557076844, "grad_norm": 5.1613993644714355, "learning_rate": 5.552734344767361e-06, "loss": 0.3993, "step": 12290 }, { "epoch": 2.1035427006674654, "grad_norm": 3.736942768096924, "learning_rate": 5.543787707063256e-06, "loss": 0.2372, "step": 12291 }, { "epoch": 2.1037138456272464, "grad_norm": 0.2604852318763733, "learning_rate": 5.534846648538506e-06, "loss": 0.0984, "step": 12292 }, { "epoch": 2.1038849905870274, "grad_norm": 6.2854766845703125, "learning_rate": 5.525911174468313e-06, "loss": 0.2556, "step": 12293 }, { "epoch": 2.1040561355468084, "grad_norm": 12.933722496032715, "learning_rate": 5.51698129012464e-06, "loss": 0.7388, "step": 12294 }, { "epoch": 2.104227280506589, "grad_norm": 1.675539255142212, "learning_rate": 5.50805700077614e-06, "loss": 0.1874, "step": 12295 }, { "epoch": 2.10439842546637, 
"grad_norm": 23.54041290283203, "learning_rate": 5.499138311688143e-06, "loss": 1.5584, "step": 12296 }, { "epoch": 2.104569570426151, "grad_norm": 4.646311283111572, "learning_rate": 5.490225228122699e-06, "loss": 0.4786, "step": 12297 }, { "epoch": 2.104740715385932, "grad_norm": 0.33248868584632874, "learning_rate": 5.481317755338534e-06, "loss": 0.0995, "step": 12298 }, { "epoch": 2.104911860345713, "grad_norm": 8.811219215393066, "learning_rate": 5.4724158985910666e-06, "loss": 0.6131, "step": 12299 }, { "epoch": 2.105083005305494, "grad_norm": 5.760908603668213, "learning_rate": 5.463519663132413e-06, "loss": 0.5409, "step": 12300 }, { "epoch": 2.105254150265275, "grad_norm": 0.3891908824443817, "learning_rate": 5.45462905421136e-06, "loss": 0.1026, "step": 12301 }, { "epoch": 2.1054252952250554, "grad_norm": 0.42044273018836975, "learning_rate": 5.445744077073386e-06, "loss": 0.1026, "step": 12302 }, { "epoch": 2.1055964401848364, "grad_norm": 16.32809066772461, "learning_rate": 5.4368647369606315e-06, "loss": 1.3798, "step": 12303 }, { "epoch": 2.1057675851446174, "grad_norm": 3.0390267372131348, "learning_rate": 5.4279910391119335e-06, "loss": 0.2341, "step": 12304 }, { "epoch": 2.1059387301043984, "grad_norm": 87.11619567871094, "learning_rate": 5.419122988762777e-06, "loss": 7.0618, "step": 12305 }, { "epoch": 2.1061098750641793, "grad_norm": 6.803827285766602, "learning_rate": 5.410260591145324e-06, "loss": 0.615, "step": 12306 }, { "epoch": 2.1062810200239603, "grad_norm": 21.950124740600586, "learning_rate": 5.40140385148841e-06, "loss": 0.9957, "step": 12307 }, { "epoch": 2.1064521649837413, "grad_norm": 5.025242805480957, "learning_rate": 5.392552775017515e-06, "loss": 0.3145, "step": 12308 }, { "epoch": 2.1066233099435223, "grad_norm": 11.411694526672363, "learning_rate": 5.383707366954799e-06, "loss": 0.885, "step": 12309 }, { "epoch": 2.1067944549033033, "grad_norm": 4.159953594207764, "learning_rate": 5.374867632519054e-06, "loss": 0.571, 
"step": 12310 }, { "epoch": 2.106965599863084, "grad_norm": 0.9912519454956055, "learning_rate": 5.366033576925747e-06, "loss": 0.1668, "step": 12311 }, { "epoch": 2.107136744822865, "grad_norm": 9.306662559509277, "learning_rate": 5.357205205386974e-06, "loss": 0.722, "step": 12312 }, { "epoch": 2.107307889782646, "grad_norm": 14.439556121826172, "learning_rate": 5.348382523111497e-06, "loss": 1.3842, "step": 12313 }, { "epoch": 2.107479034742427, "grad_norm": 6.540797710418701, "learning_rate": 5.339565535304708e-06, "loss": 0.5768, "step": 12314 }, { "epoch": 2.107650179702208, "grad_norm": 6.275105953216553, "learning_rate": 5.330754247168631e-06, "loss": 0.6235, "step": 12315 }, { "epoch": 2.107821324661989, "grad_norm": 10.712638854980469, "learning_rate": 5.321948663901951e-06, "loss": 0.7588, "step": 12316 }, { "epoch": 2.10799246962177, "grad_norm": 7.408370018005371, "learning_rate": 5.313148790699989e-06, "loss": 0.4871, "step": 12317 }, { "epoch": 2.1081636145815508, "grad_norm": 16.50934600830078, "learning_rate": 5.304354632754652e-06, "loss": 1.154, "step": 12318 }, { "epoch": 2.1083347595413313, "grad_norm": 139.87306213378906, "learning_rate": 5.295566195254541e-06, "loss": 9.0456, "step": 12319 }, { "epoch": 2.1085059045011123, "grad_norm": 10.269725799560547, "learning_rate": 5.286783483384812e-06, "loss": 0.6885, "step": 12320 }, { "epoch": 2.1086770494608933, "grad_norm": 10.606011390686035, "learning_rate": 5.27800650232731e-06, "loss": 0.9097, "step": 12321 }, { "epoch": 2.1088481944206743, "grad_norm": 13.953461647033691, "learning_rate": 5.269235257260439e-06, "loss": 1.1386, "step": 12322 }, { "epoch": 2.1090193393804553, "grad_norm": 8.227935791015625, "learning_rate": 5.260469753359273e-06, "loss": 0.4148, "step": 12323 }, { "epoch": 2.1091904843402363, "grad_norm": 11.149458885192871, "learning_rate": 5.251709995795445e-06, "loss": 0.8086, "step": 12324 }, { "epoch": 2.1093616293000172, "grad_norm": 2.6202380657196045, "learning_rate": 
5.242955989737255e-06, "loss": 0.2421, "step": 12325 }, { "epoch": 2.1095327742597982, "grad_norm": 8.621912956237793, "learning_rate": 5.234207740349547e-06, "loss": 0.5638, "step": 12326 }, { "epoch": 2.109703919219579, "grad_norm": 7.426543235778809, "learning_rate": 5.225465252793835e-06, "loss": 0.4896, "step": 12327 }, { "epoch": 2.1098750641793598, "grad_norm": 18.94154930114746, "learning_rate": 5.216728532228166e-06, "loss": 1.9491, "step": 12328 }, { "epoch": 2.1100462091391408, "grad_norm": 9.982481002807617, "learning_rate": 5.207997583807256e-06, "loss": 0.6949, "step": 12329 }, { "epoch": 2.1102173540989217, "grad_norm": 20.56634521484375, "learning_rate": 5.199272412682336e-06, "loss": 2.2828, "step": 12330 }, { "epoch": 2.1103884990587027, "grad_norm": 2.546142101287842, "learning_rate": 5.190553024001304e-06, "loss": 0.2118, "step": 12331 }, { "epoch": 2.1105596440184837, "grad_norm": 2.0170505046844482, "learning_rate": 5.181839422908585e-06, "loss": 0.239, "step": 12332 }, { "epoch": 2.1107307889782647, "grad_norm": 4.592349052429199, "learning_rate": 5.173131614545244e-06, "loss": 0.5393, "step": 12333 }, { "epoch": 2.1109019339380457, "grad_norm": 12.688459396362305, "learning_rate": 5.164429604048872e-06, "loss": 1.0374, "step": 12334 }, { "epoch": 2.1110730788978262, "grad_norm": 7.936845302581787, "learning_rate": 5.1557333965537e-06, "loss": 0.491, "step": 12335 }, { "epoch": 2.1112442238576072, "grad_norm": 8.862156867980957, "learning_rate": 5.147042997190471e-06, "loss": 0.5936, "step": 12336 }, { "epoch": 2.1114153688173882, "grad_norm": 3.149007558822632, "learning_rate": 5.13835841108655e-06, "loss": 0.2478, "step": 12337 }, { "epoch": 2.111586513777169, "grad_norm": 10.951807975769043, "learning_rate": 5.129679643365864e-06, "loss": 0.8327, "step": 12338 }, { "epoch": 2.11175765873695, "grad_norm": 19.91534423828125, "learning_rate": 5.121006699148884e-06, "loss": 5.136, "step": 12339 }, { "epoch": 2.111928803696731, "grad_norm": 
11.950395584106445, "learning_rate": 5.112339583552672e-06, "loss": 0.7769, "step": 12340 }, { "epoch": 2.112099948656512, "grad_norm": 7.5248613357543945, "learning_rate": 5.103678301690828e-06, "loss": 0.5764, "step": 12341 }, { "epoch": 2.112271093616293, "grad_norm": 17.80353546142578, "learning_rate": 5.095022858673531e-06, "loss": 2.1129, "step": 12342 }, { "epoch": 2.112442238576074, "grad_norm": 22.260961532592773, "learning_rate": 5.086373259607495e-06, "loss": 4.8007, "step": 12343 }, { "epoch": 2.1126133835358547, "grad_norm": 14.172402381896973, "learning_rate": 5.077729509596009e-06, "loss": 0.9863, "step": 12344 }, { "epoch": 2.1127845284956357, "grad_norm": 7.373724460601807, "learning_rate": 5.069091613738883e-06, "loss": 0.6108, "step": 12345 }, { "epoch": 2.1129556734554167, "grad_norm": 1.3255455493927002, "learning_rate": 5.060459577132504e-06, "loss": 0.1622, "step": 12346 }, { "epoch": 2.1131268184151977, "grad_norm": 3.0318076610565186, "learning_rate": 5.051833404869778e-06, "loss": 0.2426, "step": 12347 }, { "epoch": 2.1132979633749787, "grad_norm": 7.101712226867676, "learning_rate": 5.043213102040155e-06, "loss": 0.5893, "step": 12348 }, { "epoch": 2.1134691083347596, "grad_norm": 22.4307804107666, "learning_rate": 5.034598673729637e-06, "loss": 4.7618, "step": 12349 }, { "epoch": 2.1136402532945406, "grad_norm": 15.979168891906738, "learning_rate": 5.02599012502074e-06, "loss": 0.9711, "step": 12350 }, { "epoch": 2.113811398254321, "grad_norm": 14.170233726501465, "learning_rate": 5.017387460992531e-06, "loss": 1.0187, "step": 12351 }, { "epoch": 2.113982543214102, "grad_norm": 7.2313551902771, "learning_rate": 5.0087906867205825e-06, "loss": 0.6065, "step": 12352 }, { "epoch": 2.114153688173883, "grad_norm": 10.064969062805176, "learning_rate": 5.000199807277016e-06, "loss": 0.6797, "step": 12353 }, { "epoch": 2.114324833133664, "grad_norm": 15.745534896850586, "learning_rate": 4.991614827730458e-06, "loss": 1.4842, "step": 12354 }, { 
"epoch": 2.114495978093445, "grad_norm": 8.604680061340332, "learning_rate": 4.983035753146053e-06, "loss": 0.7823, "step": 12355 }, { "epoch": 2.114667123053226, "grad_norm": 0.6079570651054382, "learning_rate": 4.974462588585479e-06, "loss": 0.1102, "step": 12356 }, { "epoch": 2.114838268013007, "grad_norm": 11.123053550720215, "learning_rate": 4.965895339106904e-06, "loss": 0.8079, "step": 12357 }, { "epoch": 2.115009412972788, "grad_norm": 7.959338188171387, "learning_rate": 4.95733400976502e-06, "loss": 0.6013, "step": 12358 }, { "epoch": 2.115180557932569, "grad_norm": 17.095109939575195, "learning_rate": 4.948778605611029e-06, "loss": 1.0582, "step": 12359 }, { "epoch": 2.1153517028923496, "grad_norm": 10.109193801879883, "learning_rate": 4.940229131692646e-06, "loss": 0.8249, "step": 12360 }, { "epoch": 2.1155228478521306, "grad_norm": 23.358226776123047, "learning_rate": 4.931685593054045e-06, "loss": 4.6396, "step": 12361 }, { "epoch": 2.1156939928119116, "grad_norm": 7.454588413238525, "learning_rate": 4.923147994735959e-06, "loss": 0.616, "step": 12362 }, { "epoch": 2.1158651377716926, "grad_norm": 8.279791831970215, "learning_rate": 4.914616341775545e-06, "loss": 0.7517, "step": 12363 }, { "epoch": 2.1160362827314736, "grad_norm": 13.654438972473145, "learning_rate": 4.906090639206528e-06, "loss": 1.0321, "step": 12364 }, { "epoch": 2.1162074276912546, "grad_norm": 0.3023872375488281, "learning_rate": 4.897570892059052e-06, "loss": 0.0978, "step": 12365 }, { "epoch": 2.1163785726510356, "grad_norm": 3.257613182067871, "learning_rate": 4.889057105359812e-06, "loss": 0.2887, "step": 12366 }, { "epoch": 2.1165497176108166, "grad_norm": 11.743584632873535, "learning_rate": 4.880549284131924e-06, "loss": 0.9245, "step": 12367 }, { "epoch": 2.116720862570597, "grad_norm": 0.45920777320861816, "learning_rate": 4.8720474333950465e-06, "loss": 0.1002, "step": 12368 }, { "epoch": 2.116892007530378, "grad_norm": 5.3993940353393555, "learning_rate": 
4.863551558165255e-06, "loss": 0.5991, "step": 12369 }, { "epoch": 2.117063152490159, "grad_norm": 9.421904563903809, "learning_rate": 4.8550616634551555e-06, "loss": 0.4831, "step": 12370 }, { "epoch": 2.11723429744994, "grad_norm": 8.427457809448242, "learning_rate": 4.8465777542737686e-06, "loss": 0.5733, "step": 12371 }, { "epoch": 2.117405442409721, "grad_norm": 21.551969528198242, "learning_rate": 4.8380998356266475e-06, "loss": 2.7298, "step": 12372 }, { "epoch": 2.117576587369502, "grad_norm": 6.320952415466309, "learning_rate": 4.829627912515742e-06, "loss": 0.5445, "step": 12373 }, { "epoch": 2.117747732329283, "grad_norm": 7.48321008682251, "learning_rate": 4.821161989939533e-06, "loss": 0.5825, "step": 12374 }, { "epoch": 2.117918877289064, "grad_norm": 5.991507053375244, "learning_rate": 4.812702072892895e-06, "loss": 0.3993, "step": 12375 }, { "epoch": 2.1180900222488446, "grad_norm": 13.947715759277344, "learning_rate": 4.804248166367224e-06, "loss": 1.003, "step": 12376 }, { "epoch": 2.1182611672086256, "grad_norm": 3.3935422897338867, "learning_rate": 4.795800275350304e-06, "loss": 0.3776, "step": 12377 }, { "epoch": 2.1184323121684066, "grad_norm": 10.852747917175293, "learning_rate": 4.787358404826441e-06, "loss": 0.7934, "step": 12378 }, { "epoch": 2.1186034571281875, "grad_norm": 16.74319839477539, "learning_rate": 4.778922559776311e-06, "loss": 1.7589, "step": 12379 }, { "epoch": 2.1187746020879685, "grad_norm": 4.899751663208008, "learning_rate": 4.770492745177095e-06, "loss": 0.4367, "step": 12380 }, { "epoch": 2.1189457470477495, "grad_norm": 7.31135368347168, "learning_rate": 4.762068966002399e-06, "loss": 0.7135, "step": 12381 }, { "epoch": 2.1191168920075305, "grad_norm": 5.472841739654541, "learning_rate": 4.753651227222269e-06, "loss": 0.4766, "step": 12382 }, { "epoch": 2.1192880369673115, "grad_norm": 1.5402108430862427, "learning_rate": 4.745239533803171e-06, "loss": 0.1965, "step": 12383 }, { "epoch": 2.119459181927092, 
"grad_norm": 18.52389144897461, "learning_rate": 4.736833890708026e-06, "loss": 2.005, "step": 12384 }, { "epoch": 2.119630326886873, "grad_norm": 4.437579154968262, "learning_rate": 4.728434302896168e-06, "loss": 0.4707, "step": 12385 }, { "epoch": 2.119801471846654, "grad_norm": 11.236756324768066, "learning_rate": 4.720040775323374e-06, "loss": 1.0121, "step": 12386 }, { "epoch": 2.119972616806435, "grad_norm": 8.964266777038574, "learning_rate": 4.711653312941836e-06, "loss": 0.6918, "step": 12387 }, { "epoch": 2.120143761766216, "grad_norm": 5.10436487197876, "learning_rate": 4.703271920700162e-06, "loss": 0.3661, "step": 12388 }, { "epoch": 2.120314906725997, "grad_norm": 2.804274082183838, "learning_rate": 4.694896603543396e-06, "loss": 0.3398, "step": 12389 }, { "epoch": 2.120486051685778, "grad_norm": 2.765272617340088, "learning_rate": 4.686527366412978e-06, "loss": 0.343, "step": 12390 }, { "epoch": 2.120657196645559, "grad_norm": 1.1607071161270142, "learning_rate": 4.67816421424678e-06, "loss": 0.1663, "step": 12391 }, { "epoch": 2.12082834160534, "grad_norm": 1.3772364854812622, "learning_rate": 4.669807151979065e-06, "loss": 0.1625, "step": 12392 }, { "epoch": 2.1209994865651205, "grad_norm": 9.962944984436035, "learning_rate": 4.661456184540523e-06, "loss": 0.6466, "step": 12393 }, { "epoch": 2.1211706315249015, "grad_norm": 3.9277775287628174, "learning_rate": 4.6531113168582285e-06, "loss": 0.395, "step": 12394 }, { "epoch": 2.1213417764846825, "grad_norm": 124.07168579101562, "learning_rate": 4.644772553855665e-06, "loss": 8.9092, "step": 12395 }, { "epoch": 2.1215129214444635, "grad_norm": 19.440576553344727, "learning_rate": 4.636439900452722e-06, "loss": 2.2187, "step": 12396 }, { "epoch": 2.1216840664042445, "grad_norm": 16.701778411865234, "learning_rate": 4.628113361565669e-06, "loss": 1.7123, "step": 12397 }, { "epoch": 2.1218552113640254, "grad_norm": 17.302919387817383, "learning_rate": 4.619792942107188e-06, "loss": 1.7174, "step": 
12398 }, { "epoch": 2.1220263563238064, "grad_norm": 20.516460418701172, "learning_rate": 4.611478646986326e-06, "loss": 2.2071, "step": 12399 }, { "epoch": 2.122197501283587, "grad_norm": 15.776718139648438, "learning_rate": 4.60317048110854e-06, "loss": 1.4449, "step": 12400 }, { "epoch": 2.122368646243368, "grad_norm": 10.126150131225586, "learning_rate": 4.5948684493756465e-06, "loss": 0.6643, "step": 12401 }, { "epoch": 2.122539791203149, "grad_norm": 0.3476387858390808, "learning_rate": 4.586572556685866e-06, "loss": 0.1005, "step": 12402 }, { "epoch": 2.12271093616293, "grad_norm": 11.189613342285156, "learning_rate": 4.578282807933802e-06, "loss": 0.8496, "step": 12403 }, { "epoch": 2.122882081122711, "grad_norm": 24.52934455871582, "learning_rate": 4.569999208010389e-06, "loss": 3.1335, "step": 12404 }, { "epoch": 2.123053226082492, "grad_norm": 19.71367835998535, "learning_rate": 4.5617217618029935e-06, "loss": 1.2134, "step": 12405 }, { "epoch": 2.123224371042273, "grad_norm": 11.26441764831543, "learning_rate": 4.553450474195291e-06, "loss": 0.6905, "step": 12406 }, { "epoch": 2.123395516002054, "grad_norm": 1.539315104484558, "learning_rate": 4.545185350067384e-06, "loss": 0.1997, "step": 12407 }, { "epoch": 2.123566660961835, "grad_norm": 3.420771598815918, "learning_rate": 4.536926394295677e-06, "loss": 0.3555, "step": 12408 }, { "epoch": 2.1237378059216154, "grad_norm": 7.8353071212768555, "learning_rate": 4.528673611752997e-06, "loss": 0.6151, "step": 12409 }, { "epoch": 2.1239089508813964, "grad_norm": 7.502870082855225, "learning_rate": 4.520427007308466e-06, "loss": 0.5038, "step": 12410 }, { "epoch": 2.1240800958411774, "grad_norm": 20.872066497802734, "learning_rate": 4.512186585827631e-06, "loss": 2.4158, "step": 12411 }, { "epoch": 2.1242512408009584, "grad_norm": 2.0421457290649414, "learning_rate": 4.503952352172312e-06, "loss": 0.2133, "step": 12412 }, { "epoch": 2.1244223857607394, "grad_norm": 2.0319366455078125, "learning_rate": 
4.495724311200752e-06, "loss": 0.3417, "step": 12413 }, { "epoch": 2.1245935307205204, "grad_norm": 19.550594329833984, "learning_rate": 4.487502467767481e-06, "loss": 2.1197, "step": 12414 }, { "epoch": 2.1247646756803014, "grad_norm": 12.376076698303223, "learning_rate": 4.479286826723425e-06, "loss": 0.7368, "step": 12415 }, { "epoch": 2.1249358206400824, "grad_norm": 2.7443225383758545, "learning_rate": 4.471077392915798e-06, "loss": 0.2353, "step": 12416 }, { "epoch": 2.125106965599863, "grad_norm": 15.757041931152344, "learning_rate": 4.462874171188206e-06, "loss": 1.0594, "step": 12417 }, { "epoch": 2.125278110559644, "grad_norm": 13.59072208404541, "learning_rate": 4.454677166380533e-06, "loss": 1.4164, "step": 12418 }, { "epoch": 2.125449255519425, "grad_norm": 11.37222671508789, "learning_rate": 4.446486383329053e-06, "loss": 0.7262, "step": 12419 }, { "epoch": 2.125620400479206, "grad_norm": 18.9943904876709, "learning_rate": 4.438301826866311e-06, "loss": 2.0357, "step": 12420 }, { "epoch": 2.125791545438987, "grad_norm": 1.2464213371276855, "learning_rate": 4.430123501821238e-06, "loss": 0.1934, "step": 12421 }, { "epoch": 2.125962690398768, "grad_norm": 3.8480594158172607, "learning_rate": 4.421951413019023e-06, "loss": 0.4744, "step": 12422 }, { "epoch": 2.126133835358549, "grad_norm": 9.73202896118164, "learning_rate": 4.413785565281239e-06, "loss": 0.616, "step": 12423 }, { "epoch": 2.12630498031833, "grad_norm": 2.7773597240448, "learning_rate": 4.4056259634257425e-06, "loss": 0.2426, "step": 12424 }, { "epoch": 2.1264761252781104, "grad_norm": 12.293949127197266, "learning_rate": 4.397472612266706e-06, "loss": 0.866, "step": 12425 }, { "epoch": 2.1266472702378914, "grad_norm": 6.90822696685791, "learning_rate": 4.389325516614623e-06, "loss": 0.7684, "step": 12426 }, { "epoch": 2.1268184151976723, "grad_norm": 18.3587646484375, "learning_rate": 4.381184681276289e-06, "loss": 1.7757, "step": 12427 }, { "epoch": 2.1269895601574533, "grad_norm": 
0.9008998274803162, "learning_rate": 4.373050111054805e-06, "loss": 0.1565, "step": 12428 }, { "epoch": 2.1271607051172343, "grad_norm": 18.117237091064453, "learning_rate": 4.36492181074959e-06, "loss": 1.8904, "step": 12429 }, { "epoch": 2.1273318500770153, "grad_norm": 7.620315074920654, "learning_rate": 4.356799785156341e-06, "loss": 0.5731, "step": 12430 }, { "epoch": 2.1275029950367963, "grad_norm": 19.78044319152832, "learning_rate": 4.3486840390670755e-06, "loss": 1.7396, "step": 12431 }, { "epoch": 2.1276741399965773, "grad_norm": 7.385937690734863, "learning_rate": 4.3405745772700875e-06, "loss": 0.5585, "step": 12432 }, { "epoch": 2.127845284956358, "grad_norm": 5.186272144317627, "learning_rate": 4.3324714045499815e-06, "loss": 0.3795, "step": 12433 }, { "epoch": 2.128016429916139, "grad_norm": 18.985313415527344, "learning_rate": 4.324374525687635e-06, "loss": 1.6351, "step": 12434 }, { "epoch": 2.12818757487592, "grad_norm": 1.5599828958511353, "learning_rate": 4.3162839454602135e-06, "loss": 0.183, "step": 12435 }, { "epoch": 2.128358719835701, "grad_norm": 19.52484893798828, "learning_rate": 4.3081996686411825e-06, "loss": 2.2567, "step": 12436 }, { "epoch": 2.128529864795482, "grad_norm": 7.850955963134766, "learning_rate": 4.300121700000269e-06, "loss": 0.6745, "step": 12437 }, { "epoch": 2.1287010097552628, "grad_norm": 3.7401528358459473, "learning_rate": 4.2920500443034915e-06, "loss": 0.1995, "step": 12438 }, { "epoch": 2.1288721547150438, "grad_norm": 13.880343437194824, "learning_rate": 4.283984706313135e-06, "loss": 1.1533, "step": 12439 }, { "epoch": 2.1290432996748248, "grad_norm": 15.962678909301758, "learning_rate": 4.275925690787765e-06, "loss": 1.1588, "step": 12440 }, { "epoch": 2.1292144446346057, "grad_norm": 3.478412389755249, "learning_rate": 4.267873002482213e-06, "loss": 0.2784, "step": 12441 }, { "epoch": 2.1293855895943863, "grad_norm": 5.1060261726379395, "learning_rate": 4.259826646147567e-06, "loss": 0.5361, "step": 12442 
}, { "epoch": 2.1295567345541673, "grad_norm": 0.5605698227882385, "learning_rate": 4.251786626531202e-06, "loss": 0.1378, "step": 12443 }, { "epoch": 2.1297278795139483, "grad_norm": 2.920491933822632, "learning_rate": 4.2437529483767305e-06, "loss": 0.2697, "step": 12444 }, { "epoch": 2.1298990244737293, "grad_norm": 3.329118251800537, "learning_rate": 4.235725616424036e-06, "loss": 0.4167, "step": 12445 }, { "epoch": 2.1300701694335102, "grad_norm": 1.6195085048675537, "learning_rate": 4.227704635409279e-06, "loss": 0.18, "step": 12446 }, { "epoch": 2.1302413143932912, "grad_norm": 2.59922194480896, "learning_rate": 4.219690010064815e-06, "loss": 0.2136, "step": 12447 }, { "epoch": 2.130412459353072, "grad_norm": 7.035032272338867, "learning_rate": 4.2116817451193165e-06, "loss": 0.5977, "step": 12448 }, { "epoch": 2.1305836043128528, "grad_norm": 35.40533447265625, "learning_rate": 4.203679845297643e-06, "loss": 5.6519, "step": 12449 }, { "epoch": 2.1307547492726338, "grad_norm": 14.62741756439209, "learning_rate": 4.195684315320957e-06, "loss": 1.4211, "step": 12450 }, { "epoch": 2.1309258942324147, "grad_norm": 25.971837997436523, "learning_rate": 4.187695159906602e-06, "loss": 4.8233, "step": 12451 }, { "epoch": 2.1310970391921957, "grad_norm": 7.523331165313721, "learning_rate": 4.179712383768221e-06, "loss": 0.6934, "step": 12452 }, { "epoch": 2.1312681841519767, "grad_norm": 22.85276985168457, "learning_rate": 4.171735991615636e-06, "loss": 1.1773, "step": 12453 }, { "epoch": 2.1314393291117577, "grad_norm": 8.329923629760742, "learning_rate": 4.163765988154959e-06, "loss": 0.7595, "step": 12454 }, { "epoch": 2.1316104740715387, "grad_norm": 2.769582748413086, "learning_rate": 4.155802378088475e-06, "loss": 0.2328, "step": 12455 }, { "epoch": 2.1317816190313197, "grad_norm": 5.693929672241211, "learning_rate": 4.1478451661147535e-06, "loss": 0.3834, "step": 12456 }, { "epoch": 2.1319527639911007, "grad_norm": 0.44276392459869385, "learning_rate": 
4.139894356928535e-06, "loss": 0.1019, "step": 12457 }, { "epoch": 2.132123908950881, "grad_norm": 17.487232208251953, "learning_rate": 4.131949955220839e-06, "loss": 1.3225, "step": 12458 }, { "epoch": 2.132295053910662, "grad_norm": 4.400411128997803, "learning_rate": 4.124011965678838e-06, "loss": 0.3374, "step": 12459 }, { "epoch": 2.132466198870443, "grad_norm": 6.325746059417725, "learning_rate": 4.1160803929859915e-06, "loss": 0.4258, "step": 12460 }, { "epoch": 2.132637343830224, "grad_norm": 18.417999267578125, "learning_rate": 4.10815524182191e-06, "loss": 1.6426, "step": 12461 }, { "epoch": 2.132808488790005, "grad_norm": 10.510699272155762, "learning_rate": 4.1002365168624736e-06, "loss": 0.6642, "step": 12462 }, { "epoch": 2.132979633749786, "grad_norm": 11.085676193237305, "learning_rate": 4.092324222779711e-06, "loss": 0.6723, "step": 12463 }, { "epoch": 2.133150778709567, "grad_norm": 12.40665054321289, "learning_rate": 4.08441836424192e-06, "loss": 1.0471, "step": 12464 }, { "epoch": 2.133321923669348, "grad_norm": 17.599895477294922, "learning_rate": 4.076518945913527e-06, "loss": 1.7409, "step": 12465 }, { "epoch": 2.1334930686291287, "grad_norm": 10.822998046875, "learning_rate": 4.0686259724552456e-06, "loss": 0.8325, "step": 12466 }, { "epoch": 2.1336642135889097, "grad_norm": 1.4773058891296387, "learning_rate": 4.060739448523921e-06, "loss": 0.1764, "step": 12467 }, { "epoch": 2.1338353585486907, "grad_norm": 4.279238224029541, "learning_rate": 4.052859378772612e-06, "loss": 0.4344, "step": 12468 }, { "epoch": 2.1340065035084717, "grad_norm": 3.0132389068603516, "learning_rate": 4.04498576785058e-06, "loss": 0.2602, "step": 12469 }, { "epoch": 2.1341776484682526, "grad_norm": 9.039013862609863, "learning_rate": 4.037118620403256e-06, "loss": 0.6464, "step": 12470 }, { "epoch": 2.1343487934280336, "grad_norm": 50.75867462158203, "learning_rate": 4.029257941072281e-06, "loss": 5.9967, "step": 12471 }, { "epoch": 2.1345199383878146, 
"grad_norm": 11.904842376708984, "learning_rate": 4.021403734495455e-06, "loss": 0.8956, "step": 12472 }, { "epoch": 2.1346910833475956, "grad_norm": 2.690201997756958, "learning_rate": 4.013556005306785e-06, "loss": 0.3057, "step": 12473 }, { "epoch": 2.134862228307376, "grad_norm": 8.702156066894531, "learning_rate": 4.0057147581364324e-06, "loss": 0.8506, "step": 12474 }, { "epoch": 2.135033373267157, "grad_norm": 12.741438865661621, "learning_rate": 3.997879997610745e-06, "loss": 0.7628, "step": 12475 }, { "epoch": 2.135204518226938, "grad_norm": 12.95738410949707, "learning_rate": 3.990051728352252e-06, "loss": 0.8195, "step": 12476 }, { "epoch": 2.135375663186719, "grad_norm": 19.897781372070312, "learning_rate": 3.982229954979631e-06, "loss": 4.7806, "step": 12477 }, { "epoch": 2.1355468081465, "grad_norm": 8.843745231628418, "learning_rate": 3.9744146821077546e-06, "loss": 0.4804, "step": 12478 }, { "epoch": 2.135717953106281, "grad_norm": 16.472064971923828, "learning_rate": 3.96660591434763e-06, "loss": 1.0823, "step": 12479 }, { "epoch": 2.135889098066062, "grad_norm": 9.142592430114746, "learning_rate": 3.958803656306456e-06, "loss": 0.7105, "step": 12480 }, { "epoch": 2.136060243025843, "grad_norm": 11.56128215789795, "learning_rate": 3.951007912587566e-06, "loss": 1.073, "step": 12481 }, { "epoch": 2.1362313879856236, "grad_norm": 18.758136749267578, "learning_rate": 3.9432186877904684e-06, "loss": 1.5481, "step": 12482 }, { "epoch": 2.1364025329454046, "grad_norm": 1.0498899221420288, "learning_rate": 3.9354359865108154e-06, "loss": 0.1595, "step": 12483 }, { "epoch": 2.1365736779051856, "grad_norm": 11.298416137695312, "learning_rate": 3.927659813340403e-06, "loss": 0.6436, "step": 12484 }, { "epoch": 2.1367448228649666, "grad_norm": 16.441858291625977, "learning_rate": 3.9198901728672e-06, "loss": 0.7204, "step": 12485 }, { "epoch": 2.1369159678247476, "grad_norm": 11.617677688598633, "learning_rate": 3.912127069675293e-06, "loss": 0.9333, "step": 
12486 }, { "epoch": 2.1370871127845286, "grad_norm": 9.478771209716797, "learning_rate": 3.904370508344924e-06, "loss": 0.6541, "step": 12487 }, { "epoch": 2.1372582577443096, "grad_norm": 12.2900972366333, "learning_rate": 3.896620493452483e-06, "loss": 0.9924, "step": 12488 }, { "epoch": 2.1374294027040905, "grad_norm": 7.887842655181885, "learning_rate": 3.888877029570503e-06, "loss": 0.4028, "step": 12489 }, { "epoch": 2.1376005476638715, "grad_norm": 116.24420166015625, "learning_rate": 3.881140121267609e-06, "loss": 7.601, "step": 12490 }, { "epoch": 2.137771692623652, "grad_norm": 11.636894226074219, "learning_rate": 3.873409773108625e-06, "loss": 0.7542, "step": 12491 }, { "epoch": 2.137942837583433, "grad_norm": 12.683733940124512, "learning_rate": 3.865685989654429e-06, "loss": 0.9661, "step": 12492 }, { "epoch": 2.138113982543214, "grad_norm": 17.177330017089844, "learning_rate": 3.857968775462096e-06, "loss": 1.2784, "step": 12493 }, { "epoch": 2.138285127502995, "grad_norm": 6.17103385925293, "learning_rate": 3.850258135084768e-06, "loss": 0.4712, "step": 12494 }, { "epoch": 2.138456272462776, "grad_norm": 32.85896682739258, "learning_rate": 3.84255407307176e-06, "loss": 5.5987, "step": 12495 }, { "epoch": 2.138627417422557, "grad_norm": 1.6723809242248535, "learning_rate": 3.834856593968456e-06, "loss": 0.1787, "step": 12496 }, { "epoch": 2.138798562382338, "grad_norm": 3.366302967071533, "learning_rate": 3.827165702316403e-06, "loss": 0.1744, "step": 12497 }, { "epoch": 2.1389697073421186, "grad_norm": 0.3117910623550415, "learning_rate": 3.8194814026532146e-06, "loss": 0.0965, "step": 12498 }, { "epoch": 2.1391408523018995, "grad_norm": 12.651409149169922, "learning_rate": 3.8118036995126646e-06, "loss": 0.8363, "step": 12499 }, { "epoch": 2.1393119972616805, "grad_norm": 21.555559158325195, "learning_rate": 3.8041325974245826e-06, "loss": 2.7474, "step": 12500 }, { "epoch": 2.1394831422214615, "grad_norm": 12.68983268737793, "learning_rate": 
3.796468100914958e-06, "loss": 0.877, "step": 12501 }, { "epoch": 2.1396542871812425, "grad_norm": 7.7101335525512695, "learning_rate": 3.788810214505829e-06, "loss": 0.5346, "step": 12502 }, { "epoch": 2.1398254321410235, "grad_norm": 5.914426803588867, "learning_rate": 3.7811589427153827e-06, "loss": 0.4358, "step": 12503 }, { "epoch": 2.1399965771008045, "grad_norm": 7.763607978820801, "learning_rate": 3.7735142900578578e-06, "loss": 0.6026, "step": 12504 }, { "epoch": 2.1401677220605855, "grad_norm": 14.509535789489746, "learning_rate": 3.765876261043642e-06, "loss": 0.8993, "step": 12505 }, { "epoch": 2.1403388670203665, "grad_norm": 18.210468292236328, "learning_rate": 3.758244860179147e-06, "loss": 1.8382, "step": 12506 }, { "epoch": 2.140510011980147, "grad_norm": 1.7662572860717773, "learning_rate": 3.750620091966941e-06, "loss": 0.1778, "step": 12507 }, { "epoch": 2.140681156939928, "grad_norm": 1.3584481477737427, "learning_rate": 3.7430019609056136e-06, "loss": 0.1455, "step": 12508 }, { "epoch": 2.140852301899709, "grad_norm": 15.75182056427002, "learning_rate": 3.735390471489915e-06, "loss": 1.1179, "step": 12509 }, { "epoch": 2.14102344685949, "grad_norm": 17.733699798583984, "learning_rate": 3.7277856282106127e-06, "loss": 1.4943, "step": 12510 }, { "epoch": 2.141194591819271, "grad_norm": 7.663135051727295, "learning_rate": 3.7201874355545824e-06, "loss": 0.9636, "step": 12511 }, { "epoch": 2.141365736779052, "grad_norm": 12.226543426513672, "learning_rate": 3.7125958980047616e-06, "loss": 0.8526, "step": 12512 }, { "epoch": 2.141536881738833, "grad_norm": 4.951159477233887, "learning_rate": 3.705011020040177e-06, "loss": 0.47, "step": 12513 }, { "epoch": 2.141708026698614, "grad_norm": 21.248043060302734, "learning_rate": 3.6974328061359113e-06, "loss": 2.2694, "step": 12514 }, { "epoch": 2.1418791716583945, "grad_norm": 15.436549186706543, "learning_rate": 3.6898612607631277e-06, "loss": 1.4249, "step": 12515 }, { "epoch": 2.1420503166181755, 
"grad_norm": 9.83232307434082, "learning_rate": 3.6822963883890476e-06, "loss": 0.7638, "step": 12516 }, { "epoch": 2.1422214615779565, "grad_norm": 9.91031551361084, "learning_rate": 3.674738193476949e-06, "loss": 0.8175, "step": 12517 }, { "epoch": 2.1423926065377374, "grad_norm": 11.306273460388184, "learning_rate": 3.6671866804861903e-06, "loss": 0.8304, "step": 12518 }, { "epoch": 2.1425637514975184, "grad_norm": 3.574805736541748, "learning_rate": 3.659641853872167e-06, "loss": 0.2345, "step": 12519 }, { "epoch": 2.1427348964572994, "grad_norm": 10.269393920898438, "learning_rate": 3.652103718086344e-06, "loss": 0.7337, "step": 12520 }, { "epoch": 2.1429060414170804, "grad_norm": 12.782596588134766, "learning_rate": 3.644572277576224e-06, "loss": 0.9254, "step": 12521 }, { "epoch": 2.1430771863768614, "grad_norm": 15.425829887390137, "learning_rate": 3.637047536785379e-06, "loss": 1.5075, "step": 12522 }, { "epoch": 2.143248331336642, "grad_norm": 1.2998089790344238, "learning_rate": 3.6295295001534133e-06, "loss": 0.1815, "step": 12523 }, { "epoch": 2.143419476296423, "grad_norm": 3.6391587257385254, "learning_rate": 3.622018172115973e-06, "loss": 0.2243, "step": 12524 }, { "epoch": 2.143590621256204, "grad_norm": 0.8835386633872986, "learning_rate": 3.614513557104762e-06, "loss": 0.1161, "step": 12525 }, { "epoch": 2.143761766215985, "grad_norm": 17.756797790527344, "learning_rate": 3.607015659547506e-06, "loss": 2.0377, "step": 12526 }, { "epoch": 2.143932911175766, "grad_norm": 21.9942684173584, "learning_rate": 3.5995244838679847e-06, "loss": 5.0157, "step": 12527 }, { "epoch": 2.144104056135547, "grad_norm": 1.9417732954025269, "learning_rate": 3.5920400344859956e-06, "loss": 0.2052, "step": 12528 }, { "epoch": 2.144275201095328, "grad_norm": 0.3366601765155792, "learning_rate": 3.5845623158173814e-06, "loss": 0.0959, "step": 12529 }, { "epoch": 2.144446346055109, "grad_norm": 56.09529495239258, "learning_rate": 3.5770913322739947e-06, "loss": 6.4952, 
"step": 12530 }, { "epoch": 2.1446174910148894, "grad_norm": 8.444234848022461, "learning_rate": 3.569627088263736e-06, "loss": 1.1156, "step": 12531 }, { "epoch": 2.1447886359746704, "grad_norm": 7.06710147857666, "learning_rate": 3.562169588190533e-06, "loss": 0.4663, "step": 12532 }, { "epoch": 2.1449597809344514, "grad_norm": 11.284481048583984, "learning_rate": 3.554718836454298e-06, "loss": 1.0219, "step": 12533 }, { "epoch": 2.1451309258942324, "grad_norm": 1.901021122932434, "learning_rate": 3.5472748374510065e-06, "loss": 0.2131, "step": 12534 }, { "epoch": 2.1453020708540134, "grad_norm": 61.34193801879883, "learning_rate": 3.5398375955726088e-06, "loss": 6.4206, "step": 12535 }, { "epoch": 2.1454732158137944, "grad_norm": 3.0484061241149902, "learning_rate": 3.5324071152071118e-06, "loss": 0.2485, "step": 12536 }, { "epoch": 2.1456443607735753, "grad_norm": 13.050834655761719, "learning_rate": 3.5249834007384828e-06, "loss": 1.0253, "step": 12537 }, { "epoch": 2.1458155057333563, "grad_norm": 177.67279052734375, "learning_rate": 3.517566456546758e-06, "loss": 9.204, "step": 12538 }, { "epoch": 2.1459866506931373, "grad_norm": 14.755181312561035, "learning_rate": 3.5101562870079085e-06, "loss": 1.1965, "step": 12539 }, { "epoch": 2.146157795652918, "grad_norm": 14.586289405822754, "learning_rate": 3.502752896493977e-06, "loss": 1.4349, "step": 12540 }, { "epoch": 2.146328940612699, "grad_norm": 2.3725063800811768, "learning_rate": 3.495356289372948e-06, "loss": 0.2083, "step": 12541 }, { "epoch": 2.14650008557248, "grad_norm": 105.0072021484375, "learning_rate": 3.4879664700088553e-06, "loss": 7.8507, "step": 12542 }, { "epoch": 2.146671230532261, "grad_norm": 27.089704513549805, "learning_rate": 3.4805834427616817e-06, "loss": 5.1584, "step": 12543 }, { "epoch": 2.146842375492042, "grad_norm": 8.798052787780762, "learning_rate": 3.4732072119874457e-06, "loss": 0.8362, "step": 12544 }, { "epoch": 2.147013520451823, "grad_norm": 14.8900728225708, 
"learning_rate": 3.4658377820381114e-06, "loss": 0.6364, "step": 12545 }, { "epoch": 2.147184665411604, "grad_norm": 4.091823101043701, "learning_rate": 3.4584751572616773e-06, "loss": 0.271, "step": 12546 }, { "epoch": 2.1473558103713843, "grad_norm": 12.392218589782715, "learning_rate": 3.4511193420020786e-06, "loss": 0.9719, "step": 12547 }, { "epoch": 2.1475269553311653, "grad_norm": 15.248124122619629, "learning_rate": 3.443770340599282e-06, "loss": 1.2812, "step": 12548 }, { "epoch": 2.1476981002909463, "grad_norm": 2.7879281044006348, "learning_rate": 3.436428157389186e-06, "loss": 0.1958, "step": 12549 }, { "epoch": 2.1478692452507273, "grad_norm": 10.797467231750488, "learning_rate": 3.4290927967037116e-06, "loss": 0.8552, "step": 12550 }, { "epoch": 2.1480403902105083, "grad_norm": 9.3255033493042, "learning_rate": 3.4217642628707056e-06, "loss": 0.5464, "step": 12551 }, { "epoch": 2.1482115351702893, "grad_norm": 9.236723899841309, "learning_rate": 3.4144425602140377e-06, "loss": 0.565, "step": 12552 }, { "epoch": 2.1483826801300703, "grad_norm": 7.940511226654053, "learning_rate": 3.407127693053519e-06, "loss": 0.7821, "step": 12553 }, { "epoch": 2.1485538250898513, "grad_norm": 1.33772873878479, "learning_rate": 3.399819665704925e-06, "loss": 0.1647, "step": 12554 }, { "epoch": 2.1487249700496323, "grad_norm": 4.268560409545898, "learning_rate": 3.392518482480011e-06, "loss": 0.282, "step": 12555 }, { "epoch": 2.148896115009413, "grad_norm": 12.968268394470215, "learning_rate": 3.385224147686482e-06, "loss": 0.8938, "step": 12556 }, { "epoch": 2.149067259969194, "grad_norm": 6.071305274963379, "learning_rate": 3.3779366656279993e-06, "loss": 0.526, "step": 12557 }, { "epoch": 2.1492384049289748, "grad_norm": 14.006571769714355, "learning_rate": 3.3706560406041996e-06, "loss": 0.8173, "step": 12558 }, { "epoch": 2.1494095498887558, "grad_norm": 13.55515193939209, "learning_rate": 3.3633822769106544e-06, "loss": 1.0517, "step": 12559 }, { "epoch": 
2.1495806948485368, "grad_norm": 11.77253532409668, "learning_rate": 3.3561153788388998e-06, "loss": 0.8961, "step": 12560 }, { "epoch": 2.1497518398083177, "grad_norm": 15.493350982666016, "learning_rate": 3.348855350676412e-06, "loss": 1.1871, "step": 12561 }, { "epoch": 2.1499229847680987, "grad_norm": 23.477890014648438, "learning_rate": 3.3416021967066256e-06, "loss": 4.7464, "step": 12562 }, { "epoch": 2.1500941297278797, "grad_norm": 6.419351100921631, "learning_rate": 3.3343559212089083e-06, "loss": 0.4164, "step": 12563 }, { "epoch": 2.1502652746876603, "grad_norm": 10.13318920135498, "learning_rate": 3.3271165284585677e-06, "loss": 0.8949, "step": 12564 }, { "epoch": 2.1504364196474413, "grad_norm": 7.23954963684082, "learning_rate": 3.3198840227268657e-06, "loss": 0.7182, "step": 12565 }, { "epoch": 2.1506075646072222, "grad_norm": 2.2838330268859863, "learning_rate": 3.312658408280984e-06, "loss": 0.4508, "step": 12566 }, { "epoch": 2.1507787095670032, "grad_norm": 5.489869594573975, "learning_rate": 3.305439689384053e-06, "loss": 0.4082, "step": 12567 }, { "epoch": 2.150949854526784, "grad_norm": 17.5528621673584, "learning_rate": 3.2982278702951195e-06, "loss": 1.4469, "step": 12568 }, { "epoch": 2.151120999486565, "grad_norm": 9.994990348815918, "learning_rate": 3.2910229552691763e-06, "loss": 0.6077, "step": 12569 }, { "epoch": 2.151292144446346, "grad_norm": 22.341999053955078, "learning_rate": 3.283824948557129e-06, "loss": 5.3375, "step": 12570 }, { "epoch": 2.151463289406127, "grad_norm": 0.2729330062866211, "learning_rate": 3.27663385440581e-06, "loss": 0.0958, "step": 12571 }, { "epoch": 2.1516344343659077, "grad_norm": 10.165854454040527, "learning_rate": 3.269449677057981e-06, "loss": 1.0899, "step": 12572 }, { "epoch": 2.1518055793256887, "grad_norm": 6.232246398925781, "learning_rate": 3.262272420752307e-06, "loss": 0.4228, "step": 12573 }, { "epoch": 2.1519767242854697, "grad_norm": 3.0261495113372803, "learning_rate": 
3.255102089723388e-06, "loss": 0.3124, "step": 12574 }, { "epoch": 2.1521478692452507, "grad_norm": 2.7473268508911133, "learning_rate": 3.247938688201742e-06, "loss": 0.2284, "step": 12575 }, { "epoch": 2.1523190142050317, "grad_norm": 5.519251823425293, "learning_rate": 3.2407822204137615e-06, "loss": 0.2948, "step": 12576 }, { "epoch": 2.1524901591648127, "grad_norm": 18.153696060180664, "learning_rate": 3.2336326905817978e-06, "loss": 2.266, "step": 12577 }, { "epoch": 2.1526613041245937, "grad_norm": 5.66696310043335, "learning_rate": 3.2264901029240607e-06, "loss": 0.4961, "step": 12578 }, { "epoch": 2.1528324490843747, "grad_norm": 8.242439270019531, "learning_rate": 3.21935446165471e-06, "loss": 0.6227, "step": 12579 }, { "epoch": 2.153003594044155, "grad_norm": 0.28609517216682434, "learning_rate": 3.2122257709837627e-06, "loss": 0.097, "step": 12580 }, { "epoch": 2.153174739003936, "grad_norm": 1.737850308418274, "learning_rate": 3.2051040351171793e-06, "loss": 0.2101, "step": 12581 }, { "epoch": 2.153345883963717, "grad_norm": 12.363580703735352, "learning_rate": 3.197989258256773e-06, "loss": 0.934, "step": 12582 }, { "epoch": 2.153517028923498, "grad_norm": 0.3658955693244934, "learning_rate": 3.1908814446002925e-06, "loss": 0.0958, "step": 12583 }, { "epoch": 2.153688173883279, "grad_norm": 19.801855087280273, "learning_rate": 3.1837805983413382e-06, "loss": 1.9196, "step": 12584 }, { "epoch": 2.15385931884306, "grad_norm": 2.326270818710327, "learning_rate": 3.176686723669443e-06, "loss": 0.185, "step": 12585 }, { "epoch": 2.154030463802841, "grad_norm": 2.5991008281707764, "learning_rate": 3.1695998247699774e-06, "loss": 0.2142, "step": 12586 }, { "epoch": 2.154201608762622, "grad_norm": 6.987499713897705, "learning_rate": 3.1625199058242536e-06, "loss": 0.6358, "step": 12587 }, { "epoch": 2.154372753722403, "grad_norm": 25.251195907592773, "learning_rate": 3.1554469710094e-06, "loss": 5.2151, "step": 12588 }, { "epoch": 2.1545438986821837, 
"grad_norm": 4.79584264755249, "learning_rate": 3.148381024498489e-06, "loss": 0.2926, "step": 12589 }, { "epoch": 2.1547150436419646, "grad_norm": 9.120394706726074, "learning_rate": 3.1413220704604133e-06, "loss": 0.5426, "step": 12590 }, { "epoch": 2.1548861886017456, "grad_norm": 9.001848220825195, "learning_rate": 3.1342701130599904e-06, "loss": 0.7856, "step": 12591 }, { "epoch": 2.1550573335615266, "grad_norm": 12.618179321289062, "learning_rate": 3.1272251564578637e-06, "loss": 1.027, "step": 12592 }, { "epoch": 2.1552284785213076, "grad_norm": 7.247100353240967, "learning_rate": 3.1201872048105893e-06, "loss": 0.4471, "step": 12593 }, { "epoch": 2.1553996234810886, "grad_norm": 8.797866821289062, "learning_rate": 3.1131562622705386e-06, "loss": 0.5461, "step": 12594 }, { "epoch": 2.1555707684408696, "grad_norm": 4.6465582847595215, "learning_rate": 3.106132332986015e-06, "loss": 0.4004, "step": 12595 }, { "epoch": 2.15574191340065, "grad_norm": 1.0581963062286377, "learning_rate": 3.0991154211011303e-06, "loss": 0.176, "step": 12596 }, { "epoch": 2.155913058360431, "grad_norm": 6.387577056884766, "learning_rate": 3.092105530755868e-06, "loss": 0.717, "step": 12597 }, { "epoch": 2.156084203320212, "grad_norm": 11.099308967590332, "learning_rate": 3.0851026660860844e-06, "loss": 0.838, "step": 12598 }, { "epoch": 2.156255348279993, "grad_norm": 0.2741212844848633, "learning_rate": 3.078106831223468e-06, "loss": 0.0964, "step": 12599 }, { "epoch": 2.156426493239774, "grad_norm": 11.075316429138184, "learning_rate": 3.0711180302955854e-06, "loss": 0.8797, "step": 12600 }, { "epoch": 2.156597638199555, "grad_norm": 0.7430543899536133, "learning_rate": 3.064136267425827e-06, "loss": 0.1587, "step": 12601 }, { "epoch": 2.156768783159336, "grad_norm": 16.309938430786133, "learning_rate": 3.0571615467334536e-06, "loss": 1.3397, "step": 12602 }, { "epoch": 2.156939928119117, "grad_norm": 15.686036109924316, "learning_rate": 3.050193872333554e-06, "loss": 1.2544, 
"step": 12603 }, { "epoch": 2.157111073078898, "grad_norm": 6.141115665435791, "learning_rate": 3.043233248337066e-06, "loss": 0.5723, "step": 12604 }, { "epoch": 2.1572822180386786, "grad_norm": 7.712655067443848, "learning_rate": 3.036279678850776e-06, "loss": 0.7782, "step": 12605 }, { "epoch": 2.1574533629984596, "grad_norm": 17.317977905273438, "learning_rate": 3.0293331679772893e-06, "loss": 1.713, "step": 12606 }, { "epoch": 2.1576245079582406, "grad_norm": 1.9004970788955688, "learning_rate": 3.0223937198150675e-06, "loss": 0.2128, "step": 12607 }, { "epoch": 2.1577956529180216, "grad_norm": 9.0211820602417, "learning_rate": 3.015461338458386e-06, "loss": 0.5702, "step": 12608 }, { "epoch": 2.1579667978778025, "grad_norm": 5.3882527351379395, "learning_rate": 3.0085360279973707e-06, "loss": 0.5736, "step": 12609 }, { "epoch": 2.1581379428375835, "grad_norm": 12.501594543457031, "learning_rate": 3.0016177925179555e-06, "loss": 1.1664, "step": 12610 }, { "epoch": 2.1583090877973645, "grad_norm": 17.01532554626465, "learning_rate": 2.994706636101918e-06, "loss": 1.7872, "step": 12611 }, { "epoch": 2.158480232757145, "grad_norm": 9.063935279846191, "learning_rate": 2.9878025628268467e-06, "loss": 0.6823, "step": 12612 }, { "epoch": 2.158651377716926, "grad_norm": 12.867799758911133, "learning_rate": 2.98090557676615e-06, "loss": 0.9162, "step": 12613 }, { "epoch": 2.158822522676707, "grad_norm": 5.801804065704346, "learning_rate": 2.9740156819890696e-06, "loss": 0.4779, "step": 12614 }, { "epoch": 2.158993667636488, "grad_norm": 12.502382278442383, "learning_rate": 2.967132882560633e-06, "loss": 0.958, "step": 12615 }, { "epoch": 2.159164812596269, "grad_norm": 13.144845962524414, "learning_rate": 2.9602571825417383e-06, "loss": 0.7271, "step": 12616 }, { "epoch": 2.15933595755605, "grad_norm": 9.043917655944824, "learning_rate": 2.9533885859890274e-06, "loss": 0.765, "step": 12617 }, { "epoch": 2.159507102515831, "grad_norm": 13.030762672424316, 
"learning_rate": 2.946527096955005e-06, "loss": 0.895, "step": 12618 }, { "epoch": 2.159678247475612, "grad_norm": 9.800630569458008, "learning_rate": 2.9396727194879365e-06, "loss": 0.6909, "step": 12619 }, { "epoch": 2.159849392435393, "grad_norm": 15.80376148223877, "learning_rate": 2.932825457631943e-06, "loss": 1.8813, "step": 12620 }, { "epoch": 2.1600205373951735, "grad_norm": 7.1760053634643555, "learning_rate": 2.9259853154268866e-06, "loss": 0.7754, "step": 12621 }, { "epoch": 2.1601916823549545, "grad_norm": 0.3060266971588135, "learning_rate": 2.9191522969084895e-06, "loss": 0.0975, "step": 12622 }, { "epoch": 2.1603628273147355, "grad_norm": 18.946067810058594, "learning_rate": 2.9123264061082194e-06, "loss": 1.8707, "step": 12623 }, { "epoch": 2.1605339722745165, "grad_norm": 8.23867130279541, "learning_rate": 2.905507647053387e-06, "loss": 0.453, "step": 12624 }, { "epoch": 2.1607051172342975, "grad_norm": 4.174916744232178, "learning_rate": 2.898696023767044e-06, "loss": 0.3968, "step": 12625 }, { "epoch": 2.1608762621940785, "grad_norm": 25.94167709350586, "learning_rate": 2.891891540268084e-06, "loss": 3.2963, "step": 12626 }, { "epoch": 2.1610474071538595, "grad_norm": 2.212507963180542, "learning_rate": 2.8850942005711372e-06, "loss": 0.1594, "step": 12627 }, { "epoch": 2.1612185521136404, "grad_norm": 9.891725540161133, "learning_rate": 2.878304008686669e-06, "loss": 0.7644, "step": 12628 }, { "epoch": 2.161389697073421, "grad_norm": 13.003744125366211, "learning_rate": 2.8715209686208783e-06, "loss": 0.9148, "step": 12629 }, { "epoch": 2.161560842033202, "grad_norm": 17.78162384033203, "learning_rate": 2.8647450843757933e-06, "loss": 1.1899, "step": 12630 }, { "epoch": 2.161731986992983, "grad_norm": 12.192255020141602, "learning_rate": 2.8579763599491715e-06, "loss": 0.8096, "step": 12631 }, { "epoch": 2.161903131952764, "grad_norm": 0.8739219307899475, "learning_rate": 2.851214799334595e-06, "loss": 0.1552, "step": 12632 }, { "epoch": 
2.162074276912545, "grad_norm": 6.126412868499756, "learning_rate": 2.8444604065213693e-06, "loss": 0.6141, "step": 12633 }, { "epoch": 2.162245421872326, "grad_norm": 7.0388665199279785, "learning_rate": 2.8377131854946247e-06, "loss": 0.577, "step": 12634 }, { "epoch": 2.162416566832107, "grad_norm": 9.99653434753418, "learning_rate": 2.830973140235199e-06, "loss": 0.8544, "step": 12635 }, { "epoch": 2.162587711791888, "grad_norm": 7.5981550216674805, "learning_rate": 2.8242402747197592e-06, "loss": 0.714, "step": 12636 }, { "epoch": 2.1627588567516685, "grad_norm": 7.861494541168213, "learning_rate": 2.817514592920673e-06, "loss": 0.6883, "step": 12637 }, { "epoch": 2.1629300017114494, "grad_norm": 16.006898880004883, "learning_rate": 2.8107960988061376e-06, "loss": 1.4264, "step": 12638 }, { "epoch": 2.1631011466712304, "grad_norm": 22.158977508544922, "learning_rate": 2.8040847963400557e-06, "loss": 4.9872, "step": 12639 }, { "epoch": 2.1632722916310114, "grad_norm": 13.174286842346191, "learning_rate": 2.797380689482113e-06, "loss": 0.8858, "step": 12640 }, { "epoch": 2.1634434365907924, "grad_norm": 18.80160140991211, "learning_rate": 2.790683782187734e-06, "loss": 1.2191, "step": 12641 }, { "epoch": 2.1636145815505734, "grad_norm": 11.325397491455078, "learning_rate": 2.783994078408113e-06, "loss": 0.8256, "step": 12642 }, { "epoch": 2.1637857265103544, "grad_norm": 12.784300804138184, "learning_rate": 2.777311582090178e-06, "loss": 0.952, "step": 12643 }, { "epoch": 2.1639568714701354, "grad_norm": 6.859395980834961, "learning_rate": 2.770636297176618e-06, "loss": 0.6075, "step": 12644 }, { "epoch": 2.164128016429916, "grad_norm": 13.260940551757812, "learning_rate": 2.7639682276058583e-06, "loss": 1.0872, "step": 12645 }, { "epoch": 2.164299161389697, "grad_norm": 15.883008003234863, "learning_rate": 2.7573073773120645e-06, "loss": 0.5635, "step": 12646 }, { "epoch": 2.164470306349478, "grad_norm": 8.982694625854492, "learning_rate": 
2.7506537502251582e-06, "loss": 0.6875, "step": 12647 }, { "epoch": 2.164641451309259, "grad_norm": 9.766581535339355, "learning_rate": 2.7440073502707796e-06, "loss": 0.7523, "step": 12648 }, { "epoch": 2.16481259626904, "grad_norm": 9.956009864807129, "learning_rate": 2.737368181370323e-06, "loss": 0.7992, "step": 12649 }, { "epoch": 2.164983741228821, "grad_norm": 10.58755111694336, "learning_rate": 2.730736247440901e-06, "loss": 0.8123, "step": 12650 }, { "epoch": 2.165154886188602, "grad_norm": 6.560888290405273, "learning_rate": 2.7241115523953707e-06, "loss": 0.5591, "step": 12651 }, { "epoch": 2.165326031148383, "grad_norm": 4.343291282653809, "learning_rate": 2.7174941001423083e-06, "loss": 0.4285, "step": 12652 }, { "epoch": 2.165497176108164, "grad_norm": 16.31952667236328, "learning_rate": 2.710883894586018e-06, "loss": 1.4642, "step": 12653 }, { "epoch": 2.1656683210679444, "grad_norm": 1.1782512664794922, "learning_rate": 2.7042809396265377e-06, "loss": 0.1669, "step": 12654 }, { "epoch": 2.1658394660277254, "grad_norm": 12.693705558776855, "learning_rate": 2.697685239159614e-06, "loss": 1.0318, "step": 12655 }, { "epoch": 2.1660106109875064, "grad_norm": 11.188959121704102, "learning_rate": 2.691096797076726e-06, "loss": 0.798, "step": 12656 }, { "epoch": 2.1661817559472873, "grad_norm": 15.394315719604492, "learning_rate": 2.6845156172650587e-06, "loss": 1.511, "step": 12657 }, { "epoch": 2.1663529009070683, "grad_norm": 8.980091094970703, "learning_rate": 2.6779417036075115e-06, "loss": 0.5251, "step": 12658 }, { "epoch": 2.1665240458668493, "grad_norm": 11.188470840454102, "learning_rate": 2.6713750599827287e-06, "loss": 0.6199, "step": 12659 }, { "epoch": 2.1666951908266303, "grad_norm": 16.472131729125977, "learning_rate": 2.6648156902650106e-06, "loss": 1.7616, "step": 12660 }, { "epoch": 2.166866335786411, "grad_norm": 11.199334144592285, "learning_rate": 2.6582635983244203e-06, "loss": 1.0034, "step": 12661 }, { "epoch": 2.167037480746192, 
"grad_norm": 12.611692428588867, "learning_rate": 2.6517187880266785e-06, "loss": 0.9623, "step": 12662 }, { "epoch": 2.167208625705973, "grad_norm": 4.17472505569458, "learning_rate": 2.645181263233255e-06, "loss": 0.2367, "step": 12663 }, { "epoch": 2.167379770665754, "grad_norm": 24.9051570892334, "learning_rate": 2.6386510278012777e-06, "loss": 4.9184, "step": 12664 }, { "epoch": 2.167550915625535, "grad_norm": 4.516915321350098, "learning_rate": 2.632128085583616e-06, "loss": 0.3948, "step": 12665 }, { "epoch": 2.167722060585316, "grad_norm": 3.2252132892608643, "learning_rate": 2.6256124404287936e-06, "loss": 0.2415, "step": 12666 }, { "epoch": 2.167893205545097, "grad_norm": 2.702890157699585, "learning_rate": 2.619104096181078e-06, "loss": 0.2052, "step": 12667 }, { "epoch": 2.168064350504878, "grad_norm": 9.606281280517578, "learning_rate": 2.6126030566803714e-06, "loss": 0.7097, "step": 12668 }, { "epoch": 2.1682354954646588, "grad_norm": 9.416097640991211, "learning_rate": 2.6061093257623243e-06, "loss": 0.7221, "step": 12669 }, { "epoch": 2.1684066404244393, "grad_norm": 7.6353678703308105, "learning_rate": 2.599622907258223e-06, "loss": 0.5318, "step": 12670 }, { "epoch": 2.1685777853842203, "grad_norm": 11.834870338439941, "learning_rate": 2.593143804995088e-06, "loss": 0.8098, "step": 12671 }, { "epoch": 2.1687489303440013, "grad_norm": 18.902528762817383, "learning_rate": 2.586672022795575e-06, "loss": 2.2055, "step": 12672 }, { "epoch": 2.1689200753037823, "grad_norm": 15.063268661499023, "learning_rate": 2.5802075644780694e-06, "loss": 1.1936, "step": 12673 }, { "epoch": 2.1690912202635633, "grad_norm": 19.27220344543457, "learning_rate": 2.5737504338565887e-06, "loss": 5.0163, "step": 12674 }, { "epoch": 2.1692623652233443, "grad_norm": 16.254451751708984, "learning_rate": 2.567300634740872e-06, "loss": 1.5437, "step": 12675 }, { "epoch": 2.1694335101831252, "grad_norm": 17.067506790161133, "learning_rate": 2.5608581709362878e-06, "loss": 1.1635, 
"step": 12676 }, { "epoch": 2.1696046551429062, "grad_norm": 4.304327487945557, "learning_rate": 2.554423046243921e-06, "loss": 0.2836, "step": 12677 }, { "epoch": 2.169775800102687, "grad_norm": 0.32198211550712585, "learning_rate": 2.547995264460482e-06, "loss": 0.0986, "step": 12678 }, { "epoch": 2.1699469450624678, "grad_norm": 6.998874664306641, "learning_rate": 2.5415748293783967e-06, "loss": 0.6052, "step": 12679 }, { "epoch": 2.1701180900222488, "grad_norm": 7.617682456970215, "learning_rate": 2.5351617447857023e-06, "loss": 0.5351, "step": 12680 }, { "epoch": 2.1702892349820297, "grad_norm": 11.262653350830078, "learning_rate": 2.5287560144661533e-06, "loss": 0.7256, "step": 12681 }, { "epoch": 2.1704603799418107, "grad_norm": 5.046858310699463, "learning_rate": 2.522357642199133e-06, "loss": 0.4207, "step": 12682 }, { "epoch": 2.1706315249015917, "grad_norm": 20.260496139526367, "learning_rate": 2.515966631759685e-06, "loss": 1.8949, "step": 12683 }, { "epoch": 2.1708026698613727, "grad_norm": 7.616825103759766, "learning_rate": 2.5095829869185238e-06, "loss": 0.7665, "step": 12684 }, { "epoch": 2.1709738148211537, "grad_norm": 18.023616790771484, "learning_rate": 2.503206711442004e-06, "loss": 2.0722, "step": 12685 }, { "epoch": 2.1711449597809342, "grad_norm": 5.723252773284912, "learning_rate": 2.496837809092134e-06, "loss": 0.5298, "step": 12686 }, { "epoch": 2.1713161047407152, "grad_norm": 6.707979679107666, "learning_rate": 2.490476283626584e-06, "loss": 0.5003, "step": 12687 }, { "epoch": 2.1714872497004962, "grad_norm": 11.471222877502441, "learning_rate": 2.484122138798653e-06, "loss": 0.7862, "step": 12688 }, { "epoch": 2.171658394660277, "grad_norm": 15.179308891296387, "learning_rate": 2.4777753783573078e-06, "loss": 1.5507, "step": 12689 }, { "epoch": 2.171829539620058, "grad_norm": 2.2913591861724854, "learning_rate": 2.4714360060471375e-06, "loss": 0.2164, "step": 12690 }, { "epoch": 2.172000684579839, "grad_norm": 30.852548599243164, 
"learning_rate": 2.4651040256083857e-06, "loss": 5.3271, "step": 12691 }, { "epoch": 2.17217182953962, "grad_norm": 15.323151588439941, "learning_rate": 2.4587794407769304e-06, "loss": 1.124, "step": 12692 }, { "epoch": 2.172342974499401, "grad_norm": 14.117292404174805, "learning_rate": 2.452462255284282e-06, "loss": 1.2967, "step": 12693 }, { "epoch": 2.1725141194591817, "grad_norm": 5.012913703918457, "learning_rate": 2.446152472857595e-06, "loss": 0.4493, "step": 12694 }, { "epoch": 2.1726852644189627, "grad_norm": 17.487817764282227, "learning_rate": 2.4398500972196423e-06, "loss": 1.7518, "step": 12695 }, { "epoch": 2.1728564093787437, "grad_norm": 14.199211120605469, "learning_rate": 2.433555132088846e-06, "loss": 1.107, "step": 12696 }, { "epoch": 2.1730275543385247, "grad_norm": 11.847704887390137, "learning_rate": 2.4272675811792348e-06, "loss": 0.868, "step": 12697 }, { "epoch": 2.1731986992983057, "grad_norm": 10.319013595581055, "learning_rate": 2.42098744820048e-06, "loss": 0.9438, "step": 12698 }, { "epoch": 2.1733698442580867, "grad_norm": 6.211767196655273, "learning_rate": 2.414714736857868e-06, "loss": 0.6211, "step": 12699 }, { "epoch": 2.1735409892178676, "grad_norm": 25.471328735351562, "learning_rate": 2.4084494508523003e-06, "loss": 5.3338, "step": 12700 }, { "epoch": 2.1737121341776486, "grad_norm": 13.583085060119629, "learning_rate": 2.4021915938803065e-06, "loss": 1.073, "step": 12701 }, { "epoch": 2.1738832791374296, "grad_norm": 8.872940063476562, "learning_rate": 2.3959411696340507e-06, "loss": 0.6038, "step": 12702 }, { "epoch": 2.17405442409721, "grad_norm": 16.717931747436523, "learning_rate": 2.3896981818012663e-06, "loss": 1.5353, "step": 12703 }, { "epoch": 2.174225569056991, "grad_norm": 3.887176036834717, "learning_rate": 2.3834626340653476e-06, "loss": 0.2791, "step": 12704 }, { "epoch": 2.174396714016772, "grad_norm": 6.346592426300049, "learning_rate": 2.377234530105256e-06, "loss": 0.4846, "step": 12705 }, { "epoch": 
2.174567858976553, "grad_norm": 8.172645568847656, "learning_rate": 2.3710138735956044e-06, "loss": 0.5277, "step": 12706 }, { "epoch": 2.174739003936334, "grad_norm": 0.357258141040802, "learning_rate": 2.3648006682065664e-06, "loss": 0.1025, "step": 12707 }, { "epoch": 2.174910148896115, "grad_norm": 13.903961181640625, "learning_rate": 2.3585949176039652e-06, "loss": 0.9651, "step": 12708 }, { "epoch": 2.175081293855896, "grad_norm": 9.958335876464844, "learning_rate": 2.3523966254491795e-06, "loss": 0.656, "step": 12709 }, { "epoch": 2.1752524388156766, "grad_norm": 6.147921085357666, "learning_rate": 2.346205795399235e-06, "loss": 0.5783, "step": 12710 }, { "epoch": 2.1754235837754576, "grad_norm": 16.985084533691406, "learning_rate": 2.340022431106706e-06, "loss": 1.4235, "step": 12711 }, { "epoch": 2.1755947287352386, "grad_norm": 11.594037055969238, "learning_rate": 2.333846536219811e-06, "loss": 0.7842, "step": 12712 }, { "epoch": 2.1757658736950196, "grad_norm": 6.811429500579834, "learning_rate": 2.327678114382315e-06, "loss": 0.5074, "step": 12713 }, { "epoch": 2.1759370186548006, "grad_norm": 10.079765319824219, "learning_rate": 2.3215171692336172e-06, "loss": 0.6716, "step": 12714 }, { "epoch": 2.1761081636145816, "grad_norm": 10.274985313415527, "learning_rate": 2.3153637044086616e-06, "loss": 0.6935, "step": 12715 }, { "epoch": 2.1762793085743626, "grad_norm": 5.8747992515563965, "learning_rate": 2.3092177235380293e-06, "loss": 0.5628, "step": 12716 }, { "epoch": 2.1764504535341436, "grad_norm": 5.173713684082031, "learning_rate": 2.303079230247827e-06, "loss": 0.4758, "step": 12717 }, { "epoch": 2.1766215984939246, "grad_norm": 0.3327542841434479, "learning_rate": 2.296948228159802e-06, "loss": 0.093, "step": 12718 }, { "epoch": 2.176792743453705, "grad_norm": 18.54994773864746, "learning_rate": 2.2908247208912337e-06, "loss": 1.4531, "step": 12719 }, { "epoch": 2.176963888413486, "grad_norm": 0.44089287519454956, "learning_rate": 
2.2847087120550188e-06, "loss": 0.1015, "step": 12720 }, { "epoch": 2.177135033373267, "grad_norm": 6.889634609222412, "learning_rate": 2.278600205259592e-06, "loss": 0.489, "step": 12721 }, { "epoch": 2.177306178333048, "grad_norm": 116.26907348632812, "learning_rate": 2.2724992041090033e-06, "loss": 8.3151, "step": 12722 }, { "epoch": 2.177477323292829, "grad_norm": 1.643180251121521, "learning_rate": 2.2664057122028237e-06, "loss": 0.1948, "step": 12723 }, { "epoch": 2.17764846825261, "grad_norm": 13.18047046661377, "learning_rate": 2.260319733136253e-06, "loss": 1.1258, "step": 12724 }, { "epoch": 2.177819613212391, "grad_norm": 9.38187026977539, "learning_rate": 2.2542412705000153e-06, "loss": 0.6231, "step": 12725 }, { "epoch": 2.177990758172172, "grad_norm": 10.394333839416504, "learning_rate": 2.2481703278804107e-06, "loss": 0.7149, "step": 12726 }, { "epoch": 2.1781619031319526, "grad_norm": 6.620190620422363, "learning_rate": 2.2421069088593083e-06, "loss": 0.5837, "step": 12727 }, { "epoch": 2.1783330480917336, "grad_norm": 12.332088470458984, "learning_rate": 2.2360510170141287e-06, "loss": 0.7703, "step": 12728 }, { "epoch": 2.1785041930515145, "grad_norm": 11.209345817565918, "learning_rate": 2.2300026559178634e-06, "loss": 1.1139, "step": 12729 }, { "epoch": 2.1786753380112955, "grad_norm": 14.972101211547852, "learning_rate": 2.223961829139048e-06, "loss": 1.2122, "step": 12730 }, { "epoch": 2.1788464829710765, "grad_norm": 15.517940521240234, "learning_rate": 2.2179285402417866e-06, "loss": 1.1965, "step": 12731 }, { "epoch": 2.1790176279308575, "grad_norm": 1.114290714263916, "learning_rate": 2.211902792785725e-06, "loss": 0.1612, "step": 12732 }, { "epoch": 2.1791887728906385, "grad_norm": 19.715087890625, "learning_rate": 2.2058845903260595e-06, "loss": 1.6599, "step": 12733 }, { "epoch": 2.1793599178504195, "grad_norm": 0.3031153380870819, "learning_rate": 2.1998739364135446e-06, "loss": 0.098, "step": 12734 }, { "epoch": 2.1795310628102, 
"grad_norm": 60.14946365356445, "learning_rate": 2.1938708345944703e-06, "loss": 6.9813, "step": 12735 }, { "epoch": 2.179702207769981, "grad_norm": 19.599987030029297, "learning_rate": 2.18787528841068e-06, "loss": 1.9836, "step": 12736 }, { "epoch": 2.179873352729762, "grad_norm": 6.552761554718018, "learning_rate": 2.1818873013995495e-06, "loss": 0.5498, "step": 12737 }, { "epoch": 2.180044497689543, "grad_norm": 9.733787536621094, "learning_rate": 2.175906877094007e-06, "loss": 0.6816, "step": 12738 }, { "epoch": 2.180215642649324, "grad_norm": 23.294403076171875, "learning_rate": 2.1699340190225057e-06, "loss": 4.9627, "step": 12739 }, { "epoch": 2.180386787609105, "grad_norm": 14.008401870727539, "learning_rate": 2.163968730709045e-06, "loss": 0.7963, "step": 12740 }, { "epoch": 2.180557932568886, "grad_norm": 22.220521926879883, "learning_rate": 2.1580110156731525e-06, "loss": 4.9413, "step": 12741 }, { "epoch": 2.180729077528667, "grad_norm": 0.3076349198818207, "learning_rate": 2.152060877429885e-06, "loss": 0.0886, "step": 12742 }, { "epoch": 2.1809002224884475, "grad_norm": 11.264089584350586, "learning_rate": 2.1461183194898393e-06, "loss": 0.7948, "step": 12743 }, { "epoch": 2.1810713674482285, "grad_norm": 17.408729553222656, "learning_rate": 2.1401833453591173e-06, "loss": 1.9773, "step": 12744 }, { "epoch": 2.1812425124080095, "grad_norm": 14.526143074035645, "learning_rate": 2.1342559585393933e-06, "loss": 0.9752, "step": 12745 }, { "epoch": 2.1814136573677905, "grad_norm": 13.483246803283691, "learning_rate": 2.1283361625278046e-06, "loss": 0.8714, "step": 12746 }, { "epoch": 2.1815848023275715, "grad_norm": 11.354848861694336, "learning_rate": 2.1224239608170644e-06, "loss": 0.9339, "step": 12747 }, { "epoch": 2.1817559472873524, "grad_norm": 5.727118492126465, "learning_rate": 2.1165193568953552e-06, "loss": 0.6386, "step": 12748 }, { "epoch": 2.1819270922471334, "grad_norm": 0.5182165503501892, "learning_rate": 2.1106223542464304e-06, "loss": 
0.1022, "step": 12749 }, { "epoch": 2.1820982372069144, "grad_norm": 2.8027727603912354, "learning_rate": 2.1047329563495007e-06, "loss": 0.2147, "step": 12750 }, { "epoch": 2.1822693821666954, "grad_norm": 0.40114375948905945, "learning_rate": 2.098851166679344e-06, "loss": 0.0997, "step": 12751 }, { "epoch": 2.182440527126476, "grad_norm": 5.483966827392578, "learning_rate": 2.092976988706204e-06, "loss": 0.4542, "step": 12752 }, { "epoch": 2.182611672086257, "grad_norm": 18.49843406677246, "learning_rate": 2.0871104258958757e-06, "loss": 1.9353, "step": 12753 }, { "epoch": 2.182782817046038, "grad_norm": 6.484086990356445, "learning_rate": 2.081251481709619e-06, "loss": 0.4438, "step": 12754 }, { "epoch": 2.182953962005819, "grad_norm": 21.89228057861328, "learning_rate": 2.0754001596042406e-06, "loss": 2.1026, "step": 12755 }, { "epoch": 2.1831251069656, "grad_norm": 22.45412826538086, "learning_rate": 2.0695564630320083e-06, "loss": 5.0305, "step": 12756 }, { "epoch": 2.183296251925381, "grad_norm": 0.5025789737701416, "learning_rate": 2.0637203954407335e-06, "loss": 0.106, "step": 12757 }, { "epoch": 2.183467396885162, "grad_norm": 9.57309341430664, "learning_rate": 2.0578919602736813e-06, "loss": 0.8086, "step": 12758 }, { "epoch": 2.1836385418449424, "grad_norm": 10.482095718383789, "learning_rate": 2.0520711609696607e-06, "loss": 0.7833, "step": 12759 }, { "epoch": 2.1838096868047234, "grad_norm": 3.095557689666748, "learning_rate": 2.04625800096293e-06, "loss": 0.251, "step": 12760 }, { "epoch": 2.1839808317645044, "grad_norm": 10.498327255249023, "learning_rate": 2.0404524836832823e-06, "loss": 0.6605, "step": 12761 }, { "epoch": 2.1841519767242854, "grad_norm": 2.9403512477874756, "learning_rate": 2.0346546125559622e-06, "loss": 0.2554, "step": 12762 }, { "epoch": 2.1843231216840664, "grad_norm": 3.4951839447021484, "learning_rate": 2.0288643910017473e-06, "loss": 0.2585, "step": 12763 }, { "epoch": 2.1844942666438474, "grad_norm": 0.9477766156196594, 
"learning_rate": 2.0230818224368463e-06, "loss": 0.1658, "step": 12764 }, { "epoch": 2.1846654116036284, "grad_norm": 15.923053741455078, "learning_rate": 2.0173069102730085e-06, "loss": 1.3495, "step": 12765 }, { "epoch": 2.1848365565634094, "grad_norm": 9.324854850769043, "learning_rate": 2.011539657917415e-06, "loss": 0.6799, "step": 12766 }, { "epoch": 2.1850077015231903, "grad_norm": 14.38515567779541, "learning_rate": 2.0057800687727825e-06, "loss": 0.9648, "step": 12767 }, { "epoch": 2.185178846482971, "grad_norm": 5.100430488586426, "learning_rate": 2.0000281462372604e-06, "loss": 0.4983, "step": 12768 }, { "epoch": 2.185349991442752, "grad_norm": 14.506260871887207, "learning_rate": 1.994283893704498e-06, "loss": 0.9512, "step": 12769 }, { "epoch": 2.185521136402533, "grad_norm": 12.590641975402832, "learning_rate": 1.9885473145636064e-06, "loss": 1.1413, "step": 12770 }, { "epoch": 2.185692281362314, "grad_norm": 8.531139373779297, "learning_rate": 1.982818412199184e-06, "loss": 0.8418, "step": 12771 }, { "epoch": 2.185863426322095, "grad_norm": 0.40358129143714905, "learning_rate": 1.977097189991282e-06, "loss": 0.0985, "step": 12772 }, { "epoch": 2.186034571281876, "grad_norm": 23.76403045654297, "learning_rate": 1.9713836513154393e-06, "loss": 3.261, "step": 12773 }, { "epoch": 2.186205716241657, "grad_norm": 0.5205931663513184, "learning_rate": 1.965677799542647e-06, "loss": 0.138, "step": 12774 }, { "epoch": 2.186376861201438, "grad_norm": 5.038591384887695, "learning_rate": 1.9599796380393632e-06, "loss": 0.4994, "step": 12775 }, { "epoch": 2.1865480061612184, "grad_norm": 2.0771820545196533, "learning_rate": 1.9542891701675206e-06, "loss": 0.237, "step": 12776 }, { "epoch": 2.1867191511209993, "grad_norm": 12.628132820129395, "learning_rate": 1.948606399284495e-06, "loss": 1.0623, "step": 12777 }, { "epoch": 2.1868902960807803, "grad_norm": 13.482566833496094, "learning_rate": 1.942931328743135e-06, "loss": 1.3844, "step": 12778 }, { "epoch": 
2.1870614410405613, "grad_norm": 15.681252479553223, "learning_rate": 1.9372639618917378e-06, "loss": 1.0056, "step": 12779 }, { "epoch": 2.1872325860003423, "grad_norm": 11.374387741088867, "learning_rate": 1.9316043020740616e-06, "loss": 0.798, "step": 12780 }, { "epoch": 2.1874037309601233, "grad_norm": 9.494460105895996, "learning_rate": 1.9259523526293123e-06, "loss": 0.8412, "step": 12781 }, { "epoch": 2.1875748759199043, "grad_norm": 15.757201194763184, "learning_rate": 1.9203081168921454e-06, "loss": 1.1376, "step": 12782 }, { "epoch": 2.1877460208796853, "grad_norm": 20.652910232543945, "learning_rate": 1.9146715981926743e-06, "loss": 2.1744, "step": 12783 }, { "epoch": 2.187917165839466, "grad_norm": 11.144262313842773, "learning_rate": 1.909042799856447e-06, "loss": 0.771, "step": 12784 }, { "epoch": 2.188088310799247, "grad_norm": 6.267108917236328, "learning_rate": 1.9034217252044682e-06, "loss": 0.5006, "step": 12785 }, { "epoch": 2.188259455759028, "grad_norm": 2.8420214653015137, "learning_rate": 1.8978083775531774e-06, "loss": 0.226, "step": 12786 }, { "epoch": 2.188430600718809, "grad_norm": 1.3811126947402954, "learning_rate": 1.8922027602144498e-06, "loss": 0.1654, "step": 12787 }, { "epoch": 2.18860174567859, "grad_norm": 78.43463134765625, "learning_rate": 1.8866048764956318e-06, "loss": 5.3651, "step": 12788 }, { "epoch": 2.1887728906383708, "grad_norm": 9.718424797058105, "learning_rate": 1.8810147296994595e-06, "loss": 0.6564, "step": 12789 }, { "epoch": 2.1889440355981518, "grad_norm": 1.165261149406433, "learning_rate": 1.8754323231241466e-06, "loss": 0.1754, "step": 12790 }, { "epoch": 2.1891151805579327, "grad_norm": 8.18349552154541, "learning_rate": 1.8698576600632999e-06, "loss": 0.5733, "step": 12791 }, { "epoch": 2.1892863255177133, "grad_norm": 0.3329847753047943, "learning_rate": 1.864290743806002e-06, "loss": 0.0955, "step": 12792 }, { "epoch": 2.1894574704774943, "grad_norm": 8.19064712524414, "learning_rate": 
1.8587315776367204e-06, "loss": 0.6941, "step": 12793 }, { "epoch": 2.1896286154372753, "grad_norm": 9.488381385803223, "learning_rate": 1.8531801648353913e-06, "loss": 0.836, "step": 12794 }, { "epoch": 2.1897997603970563, "grad_norm": 11.382331848144531, "learning_rate": 1.8476365086773367e-06, "loss": 0.6242, "step": 12795 }, { "epoch": 2.1899709053568372, "grad_norm": 7.608189105987549, "learning_rate": 1.8421006124333433e-06, "loss": 0.5235, "step": 12796 }, { "epoch": 2.1901420503166182, "grad_norm": 19.370058059692383, "learning_rate": 1.8365724793695754e-06, "loss": 1.7966, "step": 12797 }, { "epoch": 2.1903131952763992, "grad_norm": 15.536296844482422, "learning_rate": 1.8310521127476604e-06, "loss": 1.5564, "step": 12798 }, { "epoch": 2.19048434023618, "grad_norm": 16.73215675354004, "learning_rate": 1.8255395158246046e-06, "loss": 1.3776, "step": 12799 }, { "epoch": 2.190655485195961, "grad_norm": 12.460187911987305, "learning_rate": 1.8200346918528666e-06, "loss": 1.1546, "step": 12800 }, { "epoch": 2.1908266301557417, "grad_norm": 17.21982192993164, "learning_rate": 1.8145376440802813e-06, "loss": 1.5093, "step": 12801 }, { "epoch": 2.1909977751155227, "grad_norm": 1.7292855978012085, "learning_rate": 1.8090483757501347e-06, "loss": 0.2004, "step": 12802 }, { "epoch": 2.1911689200753037, "grad_norm": 0.40288296341896057, "learning_rate": 1.8035668901010844e-06, "loss": 0.099, "step": 12803 }, { "epoch": 2.1913400650350847, "grad_norm": 17.917552947998047, "learning_rate": 1.798093190367233e-06, "loss": 1.9237, "step": 12804 }, { "epoch": 2.1915112099948657, "grad_norm": 21.208417892456055, "learning_rate": 1.7926272797780546e-06, "loss": 1.5942, "step": 12805 }, { "epoch": 2.1916823549546467, "grad_norm": 15.759162902832031, "learning_rate": 1.787169161558464e-06, "loss": 1.716, "step": 12806 }, { "epoch": 2.1918534999144277, "grad_norm": 11.347615242004395, "learning_rate": 1.781718838928737e-06, "loss": 0.8276, "step": 12807 }, { "epoch": 
2.1920246448742082, "grad_norm": 9.27266788482666, "learning_rate": 1.7762763151045914e-06, "loss": 0.6898, "step": 12808 }, { "epoch": 2.192195789833989, "grad_norm": 4.554643630981445, "learning_rate": 1.7708415932971051e-06, "loss": 0.2682, "step": 12809 }, { "epoch": 2.19236693479377, "grad_norm": 8.731319427490234, "learning_rate": 1.7654146767127898e-06, "loss": 0.646, "step": 12810 }, { "epoch": 2.192538079753551, "grad_norm": 6.206872463226318, "learning_rate": 1.7599955685535296e-06, "loss": 0.5229, "step": 12811 }, { "epoch": 2.192709224713332, "grad_norm": 12.733479499816895, "learning_rate": 1.7545842720166017e-06, "loss": 0.9728, "step": 12812 }, { "epoch": 2.192880369673113, "grad_norm": 11.883296966552734, "learning_rate": 1.7491807902946837e-06, "loss": 0.8116, "step": 12813 }, { "epoch": 2.193051514632894, "grad_norm": 22.871891021728516, "learning_rate": 1.7437851265758375e-06, "loss": 4.5201, "step": 12814 }, { "epoch": 2.193222659592675, "grad_norm": 8.025163650512695, "learning_rate": 1.7383972840435081e-06, "loss": 0.6279, "step": 12815 }, { "epoch": 2.193393804552456, "grad_norm": 3.107952117919922, "learning_rate": 1.7330172658765375e-06, "loss": 0.2244, "step": 12816 }, { "epoch": 2.1935649495122367, "grad_norm": 2.8887197971343994, "learning_rate": 1.7276450752491352e-06, "loss": 0.277, "step": 12817 }, { "epoch": 2.1937360944720177, "grad_norm": 11.705055236816406, "learning_rate": 1.7222807153309123e-06, "loss": 0.822, "step": 12818 }, { "epoch": 2.1939072394317987, "grad_norm": 21.631450653076172, "learning_rate": 1.7169241892868403e-06, "loss": 2.6241, "step": 12819 }, { "epoch": 2.1940783843915797, "grad_norm": 12.743314743041992, "learning_rate": 1.7115755002772848e-06, "loss": 0.8278, "step": 12820 }, { "epoch": 2.1942495293513606, "grad_norm": 9.924036979675293, "learning_rate": 1.7062346514579747e-06, "loss": 0.7543, "step": 12821 }, { "epoch": 2.1944206743111416, "grad_norm": 18.2040958404541, "learning_rate": 
1.7009016459800193e-06, "loss": 1.8255, "step": 12822 }, { "epoch": 2.1945918192709226, "grad_norm": 12.610291481018066, "learning_rate": 1.695576486989905e-06, "loss": 0.9839, "step": 12823 }, { "epoch": 2.1947629642307036, "grad_norm": 3.4950778484344482, "learning_rate": 1.690259177629475e-06, "loss": 0.2842, "step": 12824 }, { "epoch": 2.194934109190484, "grad_norm": 23.43720245361328, "learning_rate": 1.6849497210359589e-06, "loss": 1.8927, "step": 12825 }, { "epoch": 2.195105254150265, "grad_norm": 15.96544075012207, "learning_rate": 1.6796481203419367e-06, "loss": 1.7521, "step": 12826 }, { "epoch": 2.195276399110046, "grad_norm": 12.51791000366211, "learning_rate": 1.674354378675365e-06, "loss": 1.0077, "step": 12827 }, { "epoch": 2.195447544069827, "grad_norm": 9.372371673583984, "learning_rate": 1.669068499159559e-06, "loss": 0.704, "step": 12828 }, { "epoch": 2.195618689029608, "grad_norm": 10.962714195251465, "learning_rate": 1.6637904849131918e-06, "loss": 0.791, "step": 12829 }, { "epoch": 2.195789833989389, "grad_norm": 2.145580530166626, "learning_rate": 1.6585203390502963e-06, "loss": 0.3766, "step": 12830 }, { "epoch": 2.19596097894917, "grad_norm": 8.839455604553223, "learning_rate": 1.6532580646802831e-06, "loss": 0.6047, "step": 12831 }, { "epoch": 2.196132123908951, "grad_norm": 67.61552429199219, "learning_rate": 1.6480036649078839e-06, "loss": 6.1077, "step": 12832 }, { "epoch": 2.1963032688687316, "grad_norm": 0.2952430546283722, "learning_rate": 1.6427571428332171e-06, "loss": 0.094, "step": 12833 }, { "epoch": 2.1964744138285126, "grad_norm": 16.531780242919922, "learning_rate": 1.637518501551722e-06, "loss": 2.0661, "step": 12834 }, { "epoch": 2.1966455587882936, "grad_norm": 3.3139665126800537, "learning_rate": 1.632287744154224e-06, "loss": 0.2254, "step": 12835 }, { "epoch": 2.1968167037480746, "grad_norm": 12.821074485778809, "learning_rate": 1.6270648737268578e-06, "loss": 0.817, "step": 12836 }, { "epoch": 2.1969878487078556, 
"grad_norm": 2.7656197547912598, "learning_rate": 1.6218498933511438e-06, "loss": 0.3965, "step": 12837 }, { "epoch": 2.1971589936676366, "grad_norm": 16.01896858215332, "learning_rate": 1.6166428061039108e-06, "loss": 1.6353, "step": 12838 }, { "epoch": 2.1973301386274176, "grad_norm": 13.113499641418457, "learning_rate": 1.6114436150573641e-06, "loss": 0.9595, "step": 12839 }, { "epoch": 2.1975012835871985, "grad_norm": 10.02807903289795, "learning_rate": 1.6062523232790172e-06, "loss": 0.8543, "step": 12840 }, { "epoch": 2.197672428546979, "grad_norm": 12.452269554138184, "learning_rate": 1.6010689338317597e-06, "loss": 0.6703, "step": 12841 }, { "epoch": 2.19784357350676, "grad_norm": 0.29012298583984375, "learning_rate": 1.595893449773777e-06, "loss": 0.0953, "step": 12842 }, { "epoch": 2.198014718466541, "grad_norm": 23.577735900878906, "learning_rate": 1.590725874158635e-06, "loss": 5.104, "step": 12843 }, { "epoch": 2.198185863426322, "grad_norm": 1.5666701793670654, "learning_rate": 1.5855662100351897e-06, "loss": 0.1654, "step": 12844 }, { "epoch": 2.198357008386103, "grad_norm": 3.022829532623291, "learning_rate": 1.5804144604476729e-06, "loss": 0.2068, "step": 12845 }, { "epoch": 2.198528153345884, "grad_norm": 9.160985946655273, "learning_rate": 1.5752706284355993e-06, "loss": 0.8053, "step": 12846 }, { "epoch": 2.198699298305665, "grad_norm": 4.251462936401367, "learning_rate": 1.5701347170338604e-06, "loss": 0.4799, "step": 12847 }, { "epoch": 2.198870443265446, "grad_norm": 0.3423362970352173, "learning_rate": 1.5650067292726332e-06, "loss": 0.1, "step": 12848 }, { "epoch": 2.199041588225227, "grad_norm": 3.429561138153076, "learning_rate": 1.559886668177455e-06, "loss": 0.2778, "step": 12849 }, { "epoch": 2.1992127331850075, "grad_norm": 11.785917282104492, "learning_rate": 1.5547745367691517e-06, "loss": 0.9768, "step": 12850 }, { "epoch": 2.1993838781447885, "grad_norm": 12.844002723693848, "learning_rate": 1.5496703380638982e-06, "loss": 0.565, 
"step": 12851 }, { "epoch": 2.1995550231045695, "grad_norm": 10.958035469055176, "learning_rate": 1.5445740750731835e-06, "loss": 0.8111, "step": 12852 }, { "epoch": 2.1997261680643505, "grad_norm": 0.2833699584007263, "learning_rate": 1.5394857508038074e-06, "loss": 0.092, "step": 12853 }, { "epoch": 2.1998973130241315, "grad_norm": 10.951325416564941, "learning_rate": 1.5344053682578869e-06, "loss": 0.9063, "step": 12854 }, { "epoch": 2.2000684579839125, "grad_norm": 20.89097023010254, "learning_rate": 1.5293329304328524e-06, "loss": 4.6232, "step": 12855 }, { "epoch": 2.2002396029436935, "grad_norm": 3.284724473953247, "learning_rate": 1.5242684403214569e-06, "loss": 0.2461, "step": 12856 }, { "epoch": 2.200410747903474, "grad_norm": 10.944758415222168, "learning_rate": 1.51921190091175e-06, "loss": 0.6887, "step": 12857 }, { "epoch": 2.200581892863255, "grad_norm": 2.1685824394226074, "learning_rate": 1.5141633151871037e-06, "loss": 0.3043, "step": 12858 }, { "epoch": 2.200753037823036, "grad_norm": 9.362020492553711, "learning_rate": 1.5091226861261836e-06, "loss": 0.9898, "step": 12859 }, { "epoch": 2.200924182782817, "grad_norm": 10.947218894958496, "learning_rate": 1.5040900167029775e-06, "loss": 0.7472, "step": 12860 }, { "epoch": 2.201095327742598, "grad_norm": 4.095358848571777, "learning_rate": 1.4990653098867635e-06, "loss": 0.2855, "step": 12861 }, { "epoch": 2.201266472702379, "grad_norm": 16.83753204345703, "learning_rate": 1.4940485686421217e-06, "loss": 1.6443, "step": 12862 }, { "epoch": 2.20143761766216, "grad_norm": 17.795305252075195, "learning_rate": 1.489039795928943e-06, "loss": 1.4362, "step": 12863 }, { "epoch": 2.201608762621941, "grad_norm": 21.37703514099121, "learning_rate": 1.4840389947024085e-06, "loss": 2.7537, "step": 12864 }, { "epoch": 2.201779907581722, "grad_norm": 19.15669059753418, "learning_rate": 1.4790461679129997e-06, "loss": 1.2611, "step": 12865 }, { "epoch": 2.2019510525415025, "grad_norm": 3.0843324661254883, 
"learning_rate": 1.4740613185064883e-06, "loss": 0.3566, "step": 12866 }, { "epoch": 2.2021221975012835, "grad_norm": 9.879291534423828, "learning_rate": 1.4690844494239468e-06, "loss": 0.7469, "step": 12867 }, { "epoch": 2.2022933424610645, "grad_norm": 13.45974349975586, "learning_rate": 1.464115563601735e-06, "loss": 1.3359, "step": 12868 }, { "epoch": 2.2024644874208454, "grad_norm": 6.336744785308838, "learning_rate": 1.4591546639714993e-06, "loss": 0.7479, "step": 12869 }, { "epoch": 2.2026356323806264, "grad_norm": 8.72073745727539, "learning_rate": 1.4542017534601831e-06, "loss": 0.7788, "step": 12870 }, { "epoch": 2.2028067773404074, "grad_norm": 17.544092178344727, "learning_rate": 1.449256834990006e-06, "loss": 1.3071, "step": 12871 }, { "epoch": 2.2029779223001884, "grad_norm": 10.573201179504395, "learning_rate": 1.4443199114784821e-06, "loss": 0.8095, "step": 12872 }, { "epoch": 2.2031490672599694, "grad_norm": 1.3696458339691162, "learning_rate": 1.4393909858383885e-06, "loss": 0.1677, "step": 12873 }, { "epoch": 2.20332021221975, "grad_norm": 22.88322639465332, "learning_rate": 1.4344700609778271e-06, "loss": 4.3488, "step": 12874 }, { "epoch": 2.203491357179531, "grad_norm": 11.740894317626953, "learning_rate": 1.4295571398001256e-06, "loss": 0.8155, "step": 12875 }, { "epoch": 2.203662502139312, "grad_norm": 118.69126892089844, "learning_rate": 1.4246522252039335e-06, "loss": 6.7908, "step": 12876 }, { "epoch": 2.203833647099093, "grad_norm": 1.092010259628296, "learning_rate": 1.4197553200831393e-06, "loss": 0.1615, "step": 12877 }, { "epoch": 2.204004792058874, "grad_norm": 12.387702941894531, "learning_rate": 1.4148664273269436e-06, "loss": 0.9831, "step": 12878 }, { "epoch": 2.204175937018655, "grad_norm": 12.539447784423828, "learning_rate": 1.4099855498197806e-06, "loss": 1.3345, "step": 12879 }, { "epoch": 2.204347081978436, "grad_norm": 1.2089227437973022, "learning_rate": 1.4051126904413935e-06, "loss": 0.1861, "step": 12880 }, { "epoch": 
2.204518226938217, "grad_norm": 16.14443588256836, "learning_rate": 1.4002478520667594e-06, "loss": 1.0036, "step": 12881 }, { "epoch": 2.2046893718979974, "grad_norm": 4.041808128356934, "learning_rate": 1.3953910375661573e-06, "loss": 0.3101, "step": 12882 }, { "epoch": 2.2048605168577784, "grad_norm": 3.158625841140747, "learning_rate": 1.390542249805098e-06, "loss": 0.2336, "step": 12883 }, { "epoch": 2.2050316618175594, "grad_norm": 8.790532112121582, "learning_rate": 1.38570149164439e-06, "loss": 0.5621, "step": 12884 }, { "epoch": 2.2052028067773404, "grad_norm": 10.18777847290039, "learning_rate": 1.380868765940072e-06, "loss": 0.7298, "step": 12885 }, { "epoch": 2.2053739517371214, "grad_norm": 11.528207778930664, "learning_rate": 1.376044075543475e-06, "loss": 0.7752, "step": 12886 }, { "epoch": 2.2055450966969024, "grad_norm": 7.260785102844238, "learning_rate": 1.3712274233011585e-06, "loss": 0.5522, "step": 12887 }, { "epoch": 2.2057162416566833, "grad_norm": 16.415040969848633, "learning_rate": 1.3664188120549693e-06, "loss": 1.2222, "step": 12888 }, { "epoch": 2.2058873866164643, "grad_norm": 13.674951553344727, "learning_rate": 1.3616182446419795e-06, "loss": 1.1048, "step": 12889 }, { "epoch": 2.206058531576245, "grad_norm": 5.737102508544922, "learning_rate": 1.356825723894547e-06, "loss": 0.6262, "step": 12890 }, { "epoch": 2.206229676536026, "grad_norm": 11.906977653503418, "learning_rate": 1.3520412526402515e-06, "loss": 0.8245, "step": 12891 }, { "epoch": 2.206400821495807, "grad_norm": 9.771538734436035, "learning_rate": 1.347264833701955e-06, "loss": 0.5495, "step": 12892 }, { "epoch": 2.206571966455588, "grad_norm": 14.073369979858398, "learning_rate": 1.3424964698977315e-06, "loss": 0.9388, "step": 12893 }, { "epoch": 2.206743111415369, "grad_norm": 9.042618751525879, "learning_rate": 1.3377361640409325e-06, "loss": 0.6501, "step": 12894 }, { "epoch": 2.20691425637515, "grad_norm": 15.885311126708984, "learning_rate": 
1.3329839189401493e-06, "loss": 1.1792, "step": 12895 }, { "epoch": 2.207085401334931, "grad_norm": 15.236607551574707, "learning_rate": 1.3282397373992129e-06, "loss": 1.6575, "step": 12896 }, { "epoch": 2.207256546294712, "grad_norm": 24.41050910949707, "learning_rate": 1.323503622217192e-06, "loss": 4.7386, "step": 12897 }, { "epoch": 2.207427691254493, "grad_norm": 7.946477890014648, "learning_rate": 1.318775576188407e-06, "loss": 0.5506, "step": 12898 }, { "epoch": 2.2075988362142733, "grad_norm": 16.67565155029297, "learning_rate": 1.3140556021024075e-06, "loss": 1.1635, "step": 12899 }, { "epoch": 2.2077699811740543, "grad_norm": 16.999099731445312, "learning_rate": 1.3093437027439897e-06, "loss": 1.285, "step": 12900 }, { "epoch": 2.2079411261338353, "grad_norm": 1.9476802349090576, "learning_rate": 1.3046398808931797e-06, "loss": 0.2031, "step": 12901 }, { "epoch": 2.2081122710936163, "grad_norm": 10.156214714050293, "learning_rate": 1.2999441393252344e-06, "loss": 0.6196, "step": 12902 }, { "epoch": 2.2082834160533973, "grad_norm": 0.280158668756485, "learning_rate": 1.2952564808106571e-06, "loss": 0.0972, "step": 12903 }, { "epoch": 2.2084545610131783, "grad_norm": 7.326751708984375, "learning_rate": 1.2905769081151658e-06, "loss": 0.5568, "step": 12904 }, { "epoch": 2.2086257059729593, "grad_norm": 13.2022705078125, "learning_rate": 1.2859054239997203e-06, "loss": 0.8329, "step": 12905 }, { "epoch": 2.20879685093274, "grad_norm": 10.262593269348145, "learning_rate": 1.2812420312204992e-06, "loss": 0.7165, "step": 12906 }, { "epoch": 2.208967995892521, "grad_norm": 8.892220497131348, "learning_rate": 1.2765867325289148e-06, "loss": 0.5634, "step": 12907 }, { "epoch": 2.209139140852302, "grad_norm": 8.55375862121582, "learning_rate": 1.271939530671597e-06, "loss": 0.7138, "step": 12908 }, { "epoch": 2.2093102858120828, "grad_norm": 12.159561157226562, "learning_rate": 1.2673004283904055e-06, "loss": 0.779, "step": 12909 }, { "epoch": 2.2094814307718638, 
"grad_norm": 3.645000457763672, "learning_rate": 1.262669428422416e-06, "loss": 0.239, "step": 12910 }, { "epoch": 2.2096525757316448, "grad_norm": 0.7756264805793762, "learning_rate": 1.258046533499923e-06, "loss": 0.1558, "step": 12911 }, { "epoch": 2.2098237206914257, "grad_norm": 14.178545951843262, "learning_rate": 1.2534317463504447e-06, "loss": 0.9665, "step": 12912 }, { "epoch": 2.2099948656512067, "grad_norm": 9.43134593963623, "learning_rate": 1.2488250696967096e-06, "loss": 0.5965, "step": 12913 }, { "epoch": 2.2101660106109877, "grad_norm": 14.6178560256958, "learning_rate": 1.2442265062566672e-06, "loss": 1.5429, "step": 12914 }, { "epoch": 2.2103371555707683, "grad_norm": 0.9433627724647522, "learning_rate": 1.2396360587434718e-06, "loss": 0.1697, "step": 12915 }, { "epoch": 2.2105083005305493, "grad_norm": 10.292922973632812, "learning_rate": 1.2350537298654923e-06, "loss": 0.7251, "step": 12916 }, { "epoch": 2.2106794454903302, "grad_norm": 5.983931541442871, "learning_rate": 1.2304795223263243e-06, "loss": 0.5628, "step": 12917 }, { "epoch": 2.2108505904501112, "grad_norm": 8.24634838104248, "learning_rate": 1.2259134388247405e-06, "loss": 0.8358, "step": 12918 }, { "epoch": 2.211021735409892, "grad_norm": 8.891365051269531, "learning_rate": 1.2213554820547513e-06, "loss": 0.7444, "step": 12919 }, { "epoch": 2.211192880369673, "grad_norm": 7.61710786819458, "learning_rate": 1.2168056547055417e-06, "loss": 0.7064, "step": 12920 }, { "epoch": 2.211364025329454, "grad_norm": 8.137414932250977, "learning_rate": 1.2122639594615347e-06, "loss": 0.6364, "step": 12921 }, { "epoch": 2.211535170289235, "grad_norm": 8.638619422912598, "learning_rate": 1.2077303990023204e-06, "loss": 0.5295, "step": 12922 }, { "epoch": 2.2117063152490157, "grad_norm": 50.07402420043945, "learning_rate": 1.2032049760027203e-06, "loss": 7.1853, "step": 12923 }, { "epoch": 2.2118774602087967, "grad_norm": 17.49907684326172, "learning_rate": 1.1986876931327256e-06, "loss": 1.7193, 
"step": 12924 }, { "epoch": 2.2120486051685777, "grad_norm": 11.92387866973877, "learning_rate": 1.1941785530575594e-06, "loss": 0.8109, "step": 12925 }, { "epoch": 2.2122197501283587, "grad_norm": 2.8895344734191895, "learning_rate": 1.1896775584376002e-06, "loss": 0.2298, "step": 12926 }, { "epoch": 2.2123908950881397, "grad_norm": 3.406383991241455, "learning_rate": 1.1851847119284604e-06, "loss": 0.2748, "step": 12927 }, { "epoch": 2.2125620400479207, "grad_norm": 13.005111694335938, "learning_rate": 1.1807000161809079e-06, "loss": 1.1281, "step": 12928 }, { "epoch": 2.2127331850077017, "grad_norm": 5.310043811798096, "learning_rate": 1.1762234738409362e-06, "loss": 0.4982, "step": 12929 }, { "epoch": 2.2129043299674827, "grad_norm": 9.53742790222168, "learning_rate": 1.1717550875496992e-06, "loss": 0.7591, "step": 12930 }, { "epoch": 2.213075474927263, "grad_norm": 19.31202507019043, "learning_rate": 1.1672948599435628e-06, "loss": 1.7302, "step": 12931 }, { "epoch": 2.213246619887044, "grad_norm": 7.793004989624023, "learning_rate": 1.1628427936540558e-06, "loss": 0.609, "step": 12932 }, { "epoch": 2.213417764846825, "grad_norm": 13.441190719604492, "learning_rate": 1.158398891307917e-06, "loss": 0.9553, "step": 12933 }, { "epoch": 2.213588909806606, "grad_norm": 0.3105558753013611, "learning_rate": 1.1539631555270418e-06, "loss": 0.0961, "step": 12934 }, { "epoch": 2.213760054766387, "grad_norm": 2.6791045665740967, "learning_rate": 1.1495355889285358e-06, "loss": 0.3855, "step": 12935 }, { "epoch": 2.213931199726168, "grad_norm": 11.297131538391113, "learning_rate": 1.1451161941246557e-06, "loss": 0.6256, "step": 12936 }, { "epoch": 2.214102344685949, "grad_norm": 22.585163116455078, "learning_rate": 1.1407049737228559e-06, "loss": 2.7017, "step": 12937 }, { "epoch": 2.21427348964573, "grad_norm": 8.574952125549316, "learning_rate": 1.1363019303257693e-06, "loss": 0.6957, "step": 12938 }, { "epoch": 2.2144446346055107, "grad_norm": 10.168221473693848, 
"learning_rate": 1.1319070665311914e-06, "loss": 0.7339, "step": 12939 }, { "epoch": 2.2146157795652917, "grad_norm": 3.852912425994873, "learning_rate": 1.127520384932103e-06, "loss": 0.3331, "step": 12940 }, { "epoch": 2.2147869245250726, "grad_norm": 16.314733505249023, "learning_rate": 1.1231418881166455e-06, "loss": 1.3213, "step": 12941 }, { "epoch": 2.2149580694848536, "grad_norm": 8.520816802978516, "learning_rate": 1.1187715786681474e-06, "loss": 0.6585, "step": 12942 }, { "epoch": 2.2151292144446346, "grad_norm": 8.956523895263672, "learning_rate": 1.1144094591650926e-06, "loss": 0.6561, "step": 12943 }, { "epoch": 2.2153003594044156, "grad_norm": 25.49339485168457, "learning_rate": 1.1100555321811378e-06, "loss": 5.2112, "step": 12944 }, { "epoch": 2.2154715043641966, "grad_norm": 15.114352226257324, "learning_rate": 1.1057098002851096e-06, "loss": 1.2443, "step": 12945 }, { "epoch": 2.2156426493239776, "grad_norm": 11.622343063354492, "learning_rate": 1.1013722660409902e-06, "loss": 0.7607, "step": 12946 }, { "epoch": 2.2158137942837586, "grad_norm": 17.059030532836914, "learning_rate": 1.0970429320079394e-06, "loss": 1.1357, "step": 12947 }, { "epoch": 2.215984939243539, "grad_norm": 7.06776762008667, "learning_rate": 1.0927218007402624e-06, "loss": 0.5412, "step": 12948 }, { "epoch": 2.21615608420332, "grad_norm": 0.3299717903137207, "learning_rate": 1.088408874787441e-06, "loss": 0.1022, "step": 12949 }, { "epoch": 2.216327229163101, "grad_norm": 15.467559814453125, "learning_rate": 1.0841041566941051e-06, "loss": 0.9876, "step": 12950 }, { "epoch": 2.216498374122882, "grad_norm": 72.37061309814453, "learning_rate": 1.0798076490000397e-06, "loss": 7.0917, "step": 12951 }, { "epoch": 2.216669519082663, "grad_norm": 16.380287170410156, "learning_rate": 1.0755193542401987e-06, "loss": 1.3054, "step": 12952 }, { "epoch": 2.216840664042444, "grad_norm": 10.158836364746094, "learning_rate": 1.0712392749446748e-06, "loss": 0.503, "step": 12953 }, { "epoch": 
2.217011809002225, "grad_norm": 10.299524307250977, "learning_rate": 1.066967413638728e-06, "loss": 0.7199, "step": 12954 }, { "epoch": 2.2171829539620056, "grad_norm": 0.3613644242286682, "learning_rate": 1.0627037728427592e-06, "loss": 0.0999, "step": 12955 }, { "epoch": 2.2173540989217866, "grad_norm": 18.809602737426758, "learning_rate": 1.058448355072324e-06, "loss": 2.1256, "step": 12956 }, { "epoch": 2.2175252438815676, "grad_norm": 1.5499141216278076, "learning_rate": 1.0542011628381243e-06, "loss": 0.1935, "step": 12957 }, { "epoch": 2.2176963888413486, "grad_norm": 12.048866271972656, "learning_rate": 1.049962198646009e-06, "loss": 0.7834, "step": 12958 }, { "epoch": 2.2178675338011296, "grad_norm": 12.04792594909668, "learning_rate": 1.045731464996969e-06, "loss": 0.6873, "step": 12959 }, { "epoch": 2.2180386787609105, "grad_norm": 10.955022811889648, "learning_rate": 1.0415089643871595e-06, "loss": 0.9173, "step": 12960 }, { "epoch": 2.2182098237206915, "grad_norm": 9.003745079040527, "learning_rate": 1.0372946993078441e-06, "loss": 0.7866, "step": 12961 }, { "epoch": 2.2183809686804725, "grad_norm": 7.995996952056885, "learning_rate": 1.033088672245459e-06, "loss": 0.5629, "step": 12962 }, { "epoch": 2.2185521136402535, "grad_norm": 16.564273834228516, "learning_rate": 1.0288908856815543e-06, "loss": 1.3793, "step": 12963 }, { "epoch": 2.218723258600034, "grad_norm": 11.603302001953125, "learning_rate": 1.0247013420928436e-06, "loss": 0.9341, "step": 12964 }, { "epoch": 2.218894403559815, "grad_norm": 13.717855453491211, "learning_rate": 1.0205200439511498e-06, "loss": 1.0089, "step": 12965 }, { "epoch": 2.219065548519596, "grad_norm": 0.971432089805603, "learning_rate": 1.0163469937234576e-06, "loss": 0.1677, "step": 12966 }, { "epoch": 2.219236693479377, "grad_norm": 5.145203113555908, "learning_rate": 1.012182193871861e-06, "loss": 0.3545, "step": 12967 }, { "epoch": 2.219407838439158, "grad_norm": 19.127588272094727, "learning_rate": 
1.0080256468536131e-06, "loss": 1.9624, "step": 12968 }, { "epoch": 2.219578983398939, "grad_norm": 8.375225067138672, "learning_rate": 1.003877355121065e-06, "loss": 0.5987, "step": 12969 }, { "epoch": 2.21975012835872, "grad_norm": 6.070406436920166, "learning_rate": 9.997373211217308e-07, "loss": 0.5131, "step": 12970 }, { "epoch": 2.219921273318501, "grad_norm": 0.29485654830932617, "learning_rate": 9.95605547298225e-07, "loss": 0.0926, "step": 12971 }, { "epoch": 2.2200924182782815, "grad_norm": 10.690065383911133, "learning_rate": 9.914820360883108e-07, "loss": 0.7763, "step": 12972 }, { "epoch": 2.2202635632380625, "grad_norm": 2.388949155807495, "learning_rate": 9.873667899248539e-07, "loss": 0.1671, "step": 12973 }, { "epoch": 2.2204347081978435, "grad_norm": 15.509180068969727, "learning_rate": 9.832598112358705e-07, "loss": 1.8963, "step": 12974 }, { "epoch": 2.2206058531576245, "grad_norm": 6.19705057144165, "learning_rate": 9.791611024444668e-07, "loss": 0.6193, "step": 12975 }, { "epoch": 2.2207769981174055, "grad_norm": 8.535962104797363, "learning_rate": 9.75070665968904e-07, "loss": 1.0878, "step": 12976 }, { "epoch": 2.2209481430771865, "grad_norm": 12.4988374710083, "learning_rate": 9.709885042225297e-07, "loss": 0.9252, "step": 12977 }, { "epoch": 2.2211192880369675, "grad_norm": 4.247642517089844, "learning_rate": 9.669146196138422e-07, "loss": 0.2885, "step": 12978 }, { "epoch": 2.2212904329967484, "grad_norm": 10.524779319763184, "learning_rate": 9.628490145464225e-07, "loss": 0.7382, "step": 12979 }, { "epoch": 2.221461577956529, "grad_norm": 21.601436614990234, "learning_rate": 9.587916914189932e-07, "loss": 2.8091, "step": 12980 }, { "epoch": 2.22163272291631, "grad_norm": 11.700606346130371, "learning_rate": 9.54742652625381e-07, "loss": 0.9635, "step": 12981 }, { "epoch": 2.221803867876091, "grad_norm": 10.361104011535645, "learning_rate": 9.507019005545253e-07, "loss": 0.7211, "step": 12982 }, { "epoch": 2.221975012835872, "grad_norm": 
4.951571941375732, "learning_rate": 9.466694375904755e-07, "loss": 0.6156, "step": 12983 }, { "epoch": 2.222146157795653, "grad_norm": 16.597631454467773, "learning_rate": 9.426452661123868e-07, "loss": 1.5371, "step": 12984 }, { "epoch": 2.222317302755434, "grad_norm": 13.571664810180664, "learning_rate": 9.386293884945318e-07, "loss": 1.1237, "step": 12985 }, { "epoch": 2.222488447715215, "grad_norm": 10.94641399383545, "learning_rate": 9.346218071062784e-07, "loss": 0.8125, "step": 12986 }, { "epoch": 2.222659592674996, "grad_norm": 14.476384162902832, "learning_rate": 9.306225243121103e-07, "loss": 1.1479, "step": 12987 }, { "epoch": 2.2228307376347765, "grad_norm": 83.57835388183594, "learning_rate": 9.266315424716066e-07, "loss": 7.6865, "step": 12988 }, { "epoch": 2.2230018825945574, "grad_norm": 6.8391571044921875, "learning_rate": 9.226488639394587e-07, "loss": 0.4503, "step": 12989 }, { "epoch": 2.2231730275543384, "grad_norm": 13.575326919555664, "learning_rate": 9.18674491065447e-07, "loss": 1.0559, "step": 12990 }, { "epoch": 2.2233441725141194, "grad_norm": 20.80021095275879, "learning_rate": 9.147084261944561e-07, "loss": 2.5632, "step": 12991 }, { "epoch": 2.2235153174739004, "grad_norm": 15.797767639160156, "learning_rate": 9.107506716664771e-07, "loss": 1.282, "step": 12992 }, { "epoch": 2.2236864624336814, "grad_norm": 12.71924877166748, "learning_rate": 9.06801229816584e-07, "loss": 0.8732, "step": 12993 }, { "epoch": 2.2238576073934624, "grad_norm": 0.8061949014663696, "learning_rate": 9.028601029749595e-07, "loss": 0.1522, "step": 12994 }, { "epoch": 2.2240287523532434, "grad_norm": 1.777208924293518, "learning_rate": 8.989272934668686e-07, "loss": 0.1817, "step": 12995 }, { "epoch": 2.2241998973130244, "grad_norm": 3.2328524589538574, "learning_rate": 8.9500280361268e-07, "loss": 0.2224, "step": 12996 }, { "epoch": 2.224371042272805, "grad_norm": 9.421292304992676, "learning_rate": 8.910866357278469e-07, "loss": 0.7272, "step": 12997 }, { 
"epoch": 2.224542187232586, "grad_norm": 10.223075866699219, "learning_rate": 8.871787921229091e-07, "loss": 0.6321, "step": 12998 }, { "epoch": 2.224713332192367, "grad_norm": 2.0578982830047607, "learning_rate": 8.83279275103509e-07, "loss": 0.1952, "step": 12999 }, { "epoch": 2.224884477152148, "grad_norm": 11.626383781433105, "learning_rate": 8.793880869703597e-07, "loss": 0.4482, "step": 13000 }, { "epoch": 2.225055622111929, "grad_norm": 0.7555655837059021, "learning_rate": 8.755052300192729e-07, "loss": 0.1496, "step": 13001 }, { "epoch": 2.22522676707171, "grad_norm": 3.0204126834869385, "learning_rate": 8.716307065411294e-07, "loss": 0.2231, "step": 13002 }, { "epoch": 2.225397912031491, "grad_norm": 12.830486297607422, "learning_rate": 8.67764518821924e-07, "loss": 0.8858, "step": 13003 }, { "epoch": 2.2255690569912714, "grad_norm": 2.4990925788879395, "learning_rate": 8.639066691426956e-07, "loss": 0.2172, "step": 13004 }, { "epoch": 2.2257402019510524, "grad_norm": 5.251211643218994, "learning_rate": 8.600571597795931e-07, "loss": 0.529, "step": 13005 }, { "epoch": 2.2259113469108334, "grad_norm": 8.000630378723145, "learning_rate": 8.562159930038215e-07, "loss": 0.5792, "step": 13006 }, { "epoch": 2.2260824918706144, "grad_norm": 8.150513648986816, "learning_rate": 8.523831710816898e-07, "loss": 0.5802, "step": 13007 }, { "epoch": 2.2262536368303953, "grad_norm": 10.863048553466797, "learning_rate": 8.485586962745506e-07, "loss": 0.7219, "step": 13008 }, { "epoch": 2.2264247817901763, "grad_norm": 1.791520118713379, "learning_rate": 8.447425708388639e-07, "loss": 0.1862, "step": 13009 }, { "epoch": 2.2265959267499573, "grad_norm": 13.488287925720215, "learning_rate": 8.409347970261389e-07, "loss": 0.8333, "step": 13010 }, { "epoch": 2.2267670717097383, "grad_norm": 0.3185636103153229, "learning_rate": 8.371353770829798e-07, "loss": 0.0974, "step": 13011 }, { "epoch": 2.2269382166695193, "grad_norm": 9.865966796875, "learning_rate": 
8.333443132510354e-07, "loss": 0.5861, "step": 13012 }, { "epoch": 2.2271093616293, "grad_norm": 14.190336227416992, "learning_rate": 8.295616077670532e-07, "loss": 1.1061, "step": 13013 }, { "epoch": 2.227280506589081, "grad_norm": 8.584248542785645, "learning_rate": 8.257872628628227e-07, "loss": 0.7214, "step": 13014 }, { "epoch": 2.227451651548862, "grad_norm": 5.325438976287842, "learning_rate": 8.220212807652261e-07, "loss": 0.6428, "step": 13015 }, { "epoch": 2.227622796508643, "grad_norm": 9.269179344177246, "learning_rate": 8.182636636961843e-07, "loss": 0.5413, "step": 13016 }, { "epoch": 2.227793941468424, "grad_norm": 4.590892791748047, "learning_rate": 8.145144138727106e-07, "loss": 0.4582, "step": 13017 }, { "epoch": 2.227965086428205, "grad_norm": 8.500200271606445, "learning_rate": 8.107735335068556e-07, "loss": 0.6432, "step": 13018 }, { "epoch": 2.2281362313879858, "grad_norm": 15.026817321777344, "learning_rate": 8.070410248057569e-07, "loss": 1.1249, "step": 13019 }, { "epoch": 2.2283073763477668, "grad_norm": 8.9320650100708, "learning_rate": 8.033168899715865e-07, "loss": 0.5696, "step": 13020 }, { "epoch": 2.2284785213075473, "grad_norm": 1.5287339687347412, "learning_rate": 7.996011312016033e-07, "loss": 0.1838, "step": 13021 }, { "epoch": 2.2286496662673283, "grad_norm": 5.719951152801514, "learning_rate": 7.95893750688097e-07, "loss": 0.7262, "step": 13022 }, { "epoch": 2.2288208112271093, "grad_norm": 5.635246753692627, "learning_rate": 7.92194750618433e-07, "loss": 0.4261, "step": 13023 }, { "epoch": 2.2289919561868903, "grad_norm": 7.941003799438477, "learning_rate": 7.88504133175032e-07, "loss": 0.6051, "step": 13024 }, { "epoch": 2.2291631011466713, "grad_norm": 25.065336227416992, "learning_rate": 7.848219005353607e-07, "loss": 5.3669, "step": 13025 }, { "epoch": 2.2293342461064523, "grad_norm": 18.63829803466797, "learning_rate": 7.81148054871938e-07, "loss": 1.6273, "step": 13026 }, { "epoch": 2.2295053910662332, "grad_norm": 
11.445050239562988, "learning_rate": 7.774825983523448e-07, "loss": 0.7743, "step": 13027 }, { "epoch": 2.2296765360260142, "grad_norm": 15.75743579864502, "learning_rate": 7.738255331392014e-07, "loss": 1.6174, "step": 13028 }, { "epoch": 2.2298476809857948, "grad_norm": 0.470559686422348, "learning_rate": 7.701768613901833e-07, "loss": 0.1007, "step": 13029 }, { "epoch": 2.2300188259455758, "grad_norm": 4.385748386383057, "learning_rate": 7.66536585258012e-07, "loss": 0.197, "step": 13030 }, { "epoch": 2.2301899709053568, "grad_norm": 0.32931485772132874, "learning_rate": 7.629047068904527e-07, "loss": 0.098, "step": 13031 }, { "epoch": 2.2303611158651377, "grad_norm": 15.651450157165527, "learning_rate": 7.592812284303247e-07, "loss": 1.3043, "step": 13032 }, { "epoch": 2.2305322608249187, "grad_norm": 0.30546700954437256, "learning_rate": 7.556661520154812e-07, "loss": 0.0988, "step": 13033 }, { "epoch": 2.2307034057846997, "grad_norm": 10.008369445800781, "learning_rate": 7.520594797788261e-07, "loss": 0.6852, "step": 13034 }, { "epoch": 2.2308745507444807, "grad_norm": 8.587058067321777, "learning_rate": 7.484612138482972e-07, "loss": 0.8186, "step": 13035 }, { "epoch": 2.2310456957042617, "grad_norm": 8.257025718688965, "learning_rate": 7.448713563468812e-07, "loss": 1.0311, "step": 13036 }, { "epoch": 2.2312168406640422, "grad_norm": 16.386600494384766, "learning_rate": 7.412899093925957e-07, "loss": 1.7356, "step": 13037 }, { "epoch": 2.2313879856238232, "grad_norm": 9.190754890441895, "learning_rate": 7.377168750985036e-07, "loss": 0.5956, "step": 13038 }, { "epoch": 2.2315591305836042, "grad_norm": 82.77942657470703, "learning_rate": 7.341522555726987e-07, "loss": 7.0148, "step": 13039 }, { "epoch": 2.231730275543385, "grad_norm": 0.5631898641586304, "learning_rate": 7.305960529183087e-07, "loss": 0.1069, "step": 13040 }, { "epoch": 2.231901420503166, "grad_norm": 13.585488319396973, "learning_rate": 7.270482692335034e-07, "loss": 0.886, "step": 13041 }, 
{ "epoch": 2.232072565462947, "grad_norm": 3.212167501449585, "learning_rate": 7.23508906611477e-07, "loss": 0.3449, "step": 13042 }, { "epoch": 2.232243710422728, "grad_norm": 19.560922622680664, "learning_rate": 7.199779671404605e-07, "loss": 2.2957, "step": 13043 }, { "epoch": 2.232414855382509, "grad_norm": 0.6872985363006592, "learning_rate": 7.164554529037126e-07, "loss": 0.1113, "step": 13044 }, { "epoch": 2.2325860003422897, "grad_norm": 6.629322528839111, "learning_rate": 7.129413659795175e-07, "loss": 0.4891, "step": 13045 }, { "epoch": 2.2327571453020707, "grad_norm": 13.890636444091797, "learning_rate": 7.094357084412034e-07, "loss": 0.9167, "step": 13046 }, { "epoch": 2.2329282902618517, "grad_norm": 1.4705158472061157, "learning_rate": 7.059384823571025e-07, "loss": 0.169, "step": 13047 }, { "epoch": 2.2330994352216327, "grad_norm": 3.8810665607452393, "learning_rate": 7.024496897905925e-07, "loss": 0.227, "step": 13048 }, { "epoch": 2.2332705801814137, "grad_norm": 12.006173133850098, "learning_rate": 6.989693328000535e-07, "loss": 0.8901, "step": 13049 }, { "epoch": 2.2334417251411947, "grad_norm": 16.267919540405273, "learning_rate": 6.954974134389181e-07, "loss": 1.675, "step": 13050 }, { "epoch": 2.2336128701009756, "grad_norm": 4.272639274597168, "learning_rate": 6.920339337556059e-07, "loss": 0.2301, "step": 13051 }, { "epoch": 2.2337840150607566, "grad_norm": 9.494405746459961, "learning_rate": 6.885788957935923e-07, "loss": 0.6608, "step": 13052 }, { "epoch": 2.233955160020537, "grad_norm": 10.67234992980957, "learning_rate": 6.851323015913386e-07, "loss": 0.6812, "step": 13053 }, { "epoch": 2.234126304980318, "grad_norm": 11.500781059265137, "learning_rate": 6.816941531823579e-07, "loss": 0.6928, "step": 13054 }, { "epoch": 2.234297449940099, "grad_norm": 12.830184936523438, "learning_rate": 6.782644525951442e-07, "loss": 0.7614, "step": 13055 }, { "epoch": 2.23446859489988, "grad_norm": 17.00982093811035, "learning_rate": 
6.748432018532407e-07, "loss": 1.5012, "step": 13056 }, { "epoch": 2.234639739859661, "grad_norm": 21.861440658569336, "learning_rate": 6.714304029751794e-07, "loss": 2.4919, "step": 13057 }, { "epoch": 2.234810884819442, "grad_norm": 59.978675842285156, "learning_rate": 6.680260579745296e-07, "loss": 6.6839, "step": 13058 }, { "epoch": 2.234982029779223, "grad_norm": 0.42582395672798157, "learning_rate": 6.646301688598444e-07, "loss": 0.0972, "step": 13059 }, { "epoch": 2.235153174739004, "grad_norm": 24.015987396240234, "learning_rate": 6.61242737634718e-07, "loss": 5.1624, "step": 13060 }, { "epoch": 2.235324319698785, "grad_norm": 0.3473697602748871, "learning_rate": 6.578637662977283e-07, "loss": 0.0991, "step": 13061 }, { "epoch": 2.2354954646585656, "grad_norm": 8.682121276855469, "learning_rate": 6.544932568424839e-07, "loss": 0.6332, "step": 13062 }, { "epoch": 2.2356666096183466, "grad_norm": 0.8249577283859253, "learning_rate": 6.51131211257579e-07, "loss": 0.1519, "step": 13063 }, { "epoch": 2.2358377545781276, "grad_norm": 19.418609619140625, "learning_rate": 6.477776315266387e-07, "loss": 1.4514, "step": 13064 }, { "epoch": 2.2360088995379086, "grad_norm": 5.546756744384766, "learning_rate": 6.444325196282652e-07, "loss": 0.5343, "step": 13065 }, { "epoch": 2.2361800444976896, "grad_norm": 1.0450749397277832, "learning_rate": 6.410958775360881e-07, "loss": 0.1663, "step": 13066 }, { "epoch": 2.2363511894574706, "grad_norm": 23.829912185668945, "learning_rate": 6.377677072187343e-07, "loss": 2.4785, "step": 13067 }, { "epoch": 2.2365223344172516, "grad_norm": 9.122870445251465, "learning_rate": 6.344480106398232e-07, "loss": 0.7826, "step": 13068 }, { "epoch": 2.236693479377032, "grad_norm": 1.4363149404525757, "learning_rate": 6.311367897579845e-07, "loss": 0.1892, "step": 13069 }, { "epoch": 2.236864624336813, "grad_norm": 9.11483097076416, "learning_rate": 6.278340465268389e-07, "loss": 0.68, "step": 13070 }, { "epoch": 2.237035769296594, 
"grad_norm": 4.091996192932129, "learning_rate": 6.245397828950161e-07, "loss": 0.3376, "step": 13071 }, { "epoch": 2.237206914256375, "grad_norm": 5.2624688148498535, "learning_rate": 6.212540008061313e-07, "loss": 0.5535, "step": 13072 }, { "epoch": 2.237378059216156, "grad_norm": 14.547506332397461, "learning_rate": 6.17976702198802e-07, "loss": 1.4101, "step": 13073 }, { "epoch": 2.237549204175937, "grad_norm": 10.177671432495117, "learning_rate": 6.147078890066415e-07, "loss": 0.8328, "step": 13074 }, { "epoch": 2.237720349135718, "grad_norm": 12.56728458404541, "learning_rate": 6.114475631582506e-07, "loss": 0.9, "step": 13075 }, { "epoch": 2.237891494095499, "grad_norm": 8.571402549743652, "learning_rate": 6.081957265772303e-07, "loss": 0.632, "step": 13076 }, { "epoch": 2.23806263905528, "grad_norm": 16.196224212646484, "learning_rate": 6.049523811821661e-07, "loss": 0.9291, "step": 13077 }, { "epoch": 2.2382337840150606, "grad_norm": 0.3193437159061432, "learning_rate": 6.017175288866389e-07, "loss": 0.0974, "step": 13078 }, { "epoch": 2.2384049289748416, "grad_norm": 26.976716995239258, "learning_rate": 5.98491171599217e-07, "loss": 5.5091, "step": 13079 }, { "epoch": 2.2385760739346225, "grad_norm": 0.29868242144584656, "learning_rate": 5.95273311223451e-07, "loss": 0.0958, "step": 13080 }, { "epoch": 2.2387472188944035, "grad_norm": 4.527113914489746, "learning_rate": 5.920639496578889e-07, "loss": 0.5268, "step": 13081 }, { "epoch": 2.2389183638541845, "grad_norm": 15.720495223999023, "learning_rate": 5.888630887960544e-07, "loss": 0.9755, "step": 13082 }, { "epoch": 2.2390895088139655, "grad_norm": 13.965887069702148, "learning_rate": 5.856707305264636e-07, "loss": 1.3635, "step": 13083 }, { "epoch": 2.2392606537737465, "grad_norm": 22.086772918701172, "learning_rate": 5.824868767326114e-07, "loss": 2.3201, "step": 13084 }, { "epoch": 2.2394317987335275, "grad_norm": 1.2461740970611572, "learning_rate": 5.793115292929768e-07, "loss": 0.1328, "step": 
13085 }, { "epoch": 2.239602943693308, "grad_norm": 8.845321655273438, "learning_rate": 5.761446900810196e-07, "loss": 1.0047, "step": 13086 }, { "epoch": 2.239774088653089, "grad_norm": 11.162590026855469, "learning_rate": 5.729863609651736e-07, "loss": 0.6957, "step": 13087 }, { "epoch": 2.23994523361287, "grad_norm": 0.9283007383346558, "learning_rate": 5.698365438088648e-07, "loss": 0.1697, "step": 13088 }, { "epoch": 2.240116378572651, "grad_norm": 8.38036823272705, "learning_rate": 5.666952404704933e-07, "loss": 0.5786, "step": 13089 }, { "epoch": 2.240287523532432, "grad_norm": 12.467147827148438, "learning_rate": 5.635624528034217e-07, "loss": 0.8975, "step": 13090 }, { "epoch": 2.240458668492213, "grad_norm": 24.2241268157959, "learning_rate": 5.604381826560129e-07, "loss": 3.034, "step": 13091 }, { "epoch": 2.240629813451994, "grad_norm": 9.754952430725098, "learning_rate": 5.573224318715758e-07, "loss": 0.5591, "step": 13092 }, { "epoch": 2.240800958411775, "grad_norm": 8.983259201049805, "learning_rate": 5.542152022884217e-07, "loss": 0.6339, "step": 13093 }, { "epoch": 2.2409721033715555, "grad_norm": 17.05608558654785, "learning_rate": 5.511164957398107e-07, "loss": 1.7978, "step": 13094 }, { "epoch": 2.2411432483313365, "grad_norm": 10.81096076965332, "learning_rate": 5.48026314053992e-07, "loss": 0.8524, "step": 13095 }, { "epoch": 2.2413143932911175, "grad_norm": 5.296098232269287, "learning_rate": 5.44944659054169e-07, "loss": 0.4655, "step": 13096 }, { "epoch": 2.2414855382508985, "grad_norm": 7.806074619293213, "learning_rate": 5.418715325585322e-07, "loss": 0.5301, "step": 13097 }, { "epoch": 2.2416566832106795, "grad_norm": 8.544699668884277, "learning_rate": 5.388069363802211e-07, "loss": 0.3742, "step": 13098 }, { "epoch": 2.2418278281704604, "grad_norm": 9.619799613952637, "learning_rate": 5.357508723273613e-07, "loss": 0.6033, "step": 13099 }, { "epoch": 2.2419989731302414, "grad_norm": 0.9181874394416809, "learning_rate": 
5.32703342203027e-07, "loss": 0.1597, "step": 13100 }, { "epoch": 2.2421701180900224, "grad_norm": 13.6786470413208, "learning_rate": 5.296643478052748e-07, "loss": 0.8173, "step": 13101 }, { "epoch": 2.242341263049803, "grad_norm": 1.3959088325500488, "learning_rate": 5.266338909271023e-07, "loss": 0.1957, "step": 13102 }, { "epoch": 2.242512408009584, "grad_norm": 6.243566513061523, "learning_rate": 5.236119733565009e-07, "loss": 0.3056, "step": 13103 }, { "epoch": 2.242683552969365, "grad_norm": 4.356396198272705, "learning_rate": 5.205985968763893e-07, "loss": 0.3626, "step": 13104 }, { "epoch": 2.242854697929146, "grad_norm": 0.4175911545753479, "learning_rate": 5.175937632646771e-07, "loss": 0.0957, "step": 13105 }, { "epoch": 2.243025842888927, "grad_norm": 20.403255462646484, "learning_rate": 5.145974742942111e-07, "loss": 2.2927, "step": 13106 }, { "epoch": 2.243196987848708, "grad_norm": 16.19629669189453, "learning_rate": 5.116097317328156e-07, "loss": 1.1689, "step": 13107 }, { "epoch": 2.243368132808489, "grad_norm": 17.915990829467773, "learning_rate": 5.086305373432538e-07, "loss": 1.2502, "step": 13108 }, { "epoch": 2.24353927776827, "grad_norm": 22.759078979492188, "learning_rate": 5.056598928832579e-07, "loss": 4.9447, "step": 13109 }, { "epoch": 2.243710422728051, "grad_norm": 11.86913013458252, "learning_rate": 5.02697800105516e-07, "loss": 0.7681, "step": 13110 }, { "epoch": 2.2438815676878314, "grad_norm": 9.63438892364502, "learning_rate": 4.997442607576685e-07, "loss": 0.6216, "step": 13111 }, { "epoch": 2.2440527126476124, "grad_norm": 0.27162039279937744, "learning_rate": 4.967992765823031e-07, "loss": 0.094, "step": 13112 }, { "epoch": 2.2442238576073934, "grad_norm": 10.762030601501465, "learning_rate": 4.93862849316965e-07, "loss": 0.9868, "step": 13113 }, { "epoch": 2.2443950025671744, "grad_norm": 17.595993041992188, "learning_rate": 4.909349806941532e-07, "loss": 1.9436, "step": 13114 }, { "epoch": 2.2445661475269554, "grad_norm": 
30.00774383544922, "learning_rate": 4.880156724413127e-07, "loss": 5.5446, "step": 13115 }, { "epoch": 2.2447372924867364, "grad_norm": 6.7139763832092285, "learning_rate": 4.851049262808421e-07, "loss": 0.5064, "step": 13116 }, { "epoch": 2.2449084374465174, "grad_norm": 11.079005241394043, "learning_rate": 4.822027439300831e-07, "loss": 0.9564, "step": 13117 }, { "epoch": 2.245079582406298, "grad_norm": 0.2677161395549774, "learning_rate": 4.793091271013289e-07, "loss": 0.0935, "step": 13118 }, { "epoch": 2.245250727366079, "grad_norm": 8.821941375732422, "learning_rate": 4.7642407750181916e-07, "loss": 0.7288, "step": 13119 }, { "epoch": 2.24542187232586, "grad_norm": 15.374306678771973, "learning_rate": 4.735475968337338e-07, "loss": 0.8736, "step": 13120 }, { "epoch": 2.245593017285641, "grad_norm": 2.3548104763031006, "learning_rate": 4.7067968679420536e-07, "loss": 0.1985, "step": 13121 }, { "epoch": 2.245764162245422, "grad_norm": 6.569535732269287, "learning_rate": 4.6782034907530023e-07, "loss": 0.3416, "step": 13122 }, { "epoch": 2.245935307205203, "grad_norm": 25.076589584350586, "learning_rate": 4.6496958536403564e-07, "loss": 5.1363, "step": 13123 }, { "epoch": 2.246106452164984, "grad_norm": 16.385753631591797, "learning_rate": 4.621273973423629e-07, "loss": 1.3395, "step": 13124 }, { "epoch": 2.246277597124765, "grad_norm": 0.2801778316497803, "learning_rate": 4.5929378668718216e-07, "loss": 0.0937, "step": 13125 }, { "epoch": 2.246448742084546, "grad_norm": 65.72528076171875, "learning_rate": 4.564687550703245e-07, "loss": 5.7337, "step": 13126 }, { "epoch": 2.2466198870443264, "grad_norm": 5.389486789703369, "learning_rate": 4.5365230415856164e-07, "loss": 0.377, "step": 13127 }, { "epoch": 2.2467910320041073, "grad_norm": 1.4720032215118408, "learning_rate": 4.508444356136077e-07, "loss": 0.1626, "step": 13128 }, { "epoch": 2.2469621769638883, "grad_norm": 10.943506240844727, "learning_rate": 4.4804515109210406e-07, "loss": 0.8853, "step": 13129 
}, { "epoch": 2.2471333219236693, "grad_norm": 7.070681571960449, "learning_rate": 4.4525445224563643e-07, "loss": 0.5662, "step": 13130 }, { "epoch": 2.2473044668834503, "grad_norm": 16.568607330322266, "learning_rate": 4.424723407207193e-07, "loss": 1.4149, "step": 13131 }, { "epoch": 2.2474756118432313, "grad_norm": 9.080849647521973, "learning_rate": 4.39698818158808e-07, "loss": 0.7372, "step": 13132 }, { "epoch": 2.2476467568030123, "grad_norm": 10.662124633789062, "learning_rate": 4.369338861962785e-07, "loss": 0.7036, "step": 13133 }, { "epoch": 2.2478179017627933, "grad_norm": 7.165326118469238, "learning_rate": 4.3417754646445096e-07, "loss": 0.3821, "step": 13134 }, { "epoch": 2.247989046722574, "grad_norm": 16.219419479370117, "learning_rate": 4.3142980058956114e-07, "loss": 1.3958, "step": 13135 }, { "epoch": 2.248160191682355, "grad_norm": 8.2588472366333, "learning_rate": 4.2869065019279395e-07, "loss": 0.5977, "step": 13136 }, { "epoch": 2.248331336642136, "grad_norm": 1.148255705833435, "learning_rate": 4.2596009689024165e-07, "loss": 0.1591, "step": 13137 }, { "epoch": 2.248502481601917, "grad_norm": 7.867048263549805, "learning_rate": 4.232381422929421e-07, "loss": 0.56, "step": 13138 }, { "epoch": 2.248673626561698, "grad_norm": 13.986119270324707, "learning_rate": 4.205247880068475e-07, "loss": 1.17, "step": 13139 }, { "epoch": 2.2488447715214788, "grad_norm": 6.803550720214844, "learning_rate": 4.178200356328471e-07, "loss": 0.4545, "step": 13140 }, { "epoch": 2.2490159164812598, "grad_norm": 15.860236167907715, "learning_rate": 4.151238867667412e-07, "loss": 1.7329, "step": 13141 }, { "epoch": 2.2491870614410407, "grad_norm": 0.4019230306148529, "learning_rate": 4.124363429992706e-07, "loss": 0.0955, "step": 13142 }, { "epoch": 2.2493582064008213, "grad_norm": 7.294815540313721, "learning_rate": 4.0975740591608026e-07, "loss": 0.6552, "step": 13143 }, { "epoch": 2.2495293513606023, "grad_norm": 54.85688018798828, "learning_rate": 
4.070870770977575e-07, "loss": 6.2224, "step": 13144 }, { "epoch": 2.2497004963203833, "grad_norm": 51.04165267944336, "learning_rate": 4.0442535811978875e-07, "loss": 6.4761, "step": 13145 }, { "epoch": 2.2498716412801643, "grad_norm": 24.872379302978516, "learning_rate": 4.017722505526028e-07, "loss": 5.202, "step": 13146 }, { "epoch": 2.2500427862399452, "grad_norm": 16.272708892822266, "learning_rate": 3.9912775596152915e-07, "loss": 1.1797, "step": 13147 }, { "epoch": 2.2502139311997262, "grad_norm": 19.196151733398438, "learning_rate": 3.9649187590682977e-07, "loss": 1.569, "step": 13148 }, { "epoch": 2.2503850761595072, "grad_norm": 17.46942138671875, "learning_rate": 3.938646119436723e-07, "loss": 2.0938, "step": 13149 }, { "epoch": 2.250556221119288, "grad_norm": 0.29984575510025024, "learning_rate": 3.912459656221534e-07, "loss": 0.096, "step": 13150 }, { "epoch": 2.2507273660790688, "grad_norm": 5.469048023223877, "learning_rate": 3.88635938487269e-07, "loss": 0.4247, "step": 13151 }, { "epoch": 2.2508985110388497, "grad_norm": 13.191169738769531, "learning_rate": 3.860345320789438e-07, "loss": 0.9042, "step": 13152 }, { "epoch": 2.2510696559986307, "grad_norm": 5.983879566192627, "learning_rate": 3.8344174793201347e-07, "loss": 0.4279, "step": 13153 }, { "epoch": 2.2512408009584117, "grad_norm": 14.107702255249023, "learning_rate": 3.8085758757622266e-07, "loss": 1.1188, "step": 13154 }, { "epoch": 2.2514119459181927, "grad_norm": 10.44036865234375, "learning_rate": 3.7828205253623014e-07, "loss": 0.7672, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_nli-pairs_loss": 1.1813894510269165, "eval_nli-pairs_runtime": 4.2674, "eval_nli-pairs_samples_per_second": 46.867, "eval_nli-pairs_steps_per_second": 1.64, "eval_sts-test_pearson_cosine": 0.7820971228196386, "eval_sts-test_pearson_dot": 0.6395154522950764, "eval_sts-test_pearson_euclidean": 0.765358580331564, "eval_sts-test_pearson_manhattan": 0.768305487374388, "eval_sts-test_pearson_max": 
0.7820971228196386, "eval_sts-test_spearman_cosine": 0.7846797414148841, "eval_sts-test_spearman_dot": 0.6181636315506001, "eval_sts-test_spearman_euclidean": 0.7546899054655223, "eval_sts-test_spearman_manhattan": 0.7596287420387944, "eval_sts-test_spearman_max": 0.7846797414148841, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_vitaminc-pairs_loss": 0.5848109722137451, "eval_vitaminc-pairs_runtime": 2.7051, "eval_vitaminc-pairs_samples_per_second": 73.934, "eval_vitaminc-pairs_steps_per_second": 2.588, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_qnli-contrastive_loss": 1.2583311796188354, "eval_qnli-contrastive_runtime": 0.6264, "eval_qnli-contrastive_samples_per_second": 319.309, "eval_qnli-contrastive_steps_per_second": 11.176, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_scitail-pairs-qa_loss": 0.08096228539943695, "eval_scitail-pairs-qa_runtime": 1.5757, "eval_scitail-pairs-qa_samples_per_second": 126.93, "eval_scitail-pairs-qa_steps_per_second": 4.443, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_scitail-pairs-pos_loss": 0.5762280821800232, "eval_scitail-pairs-pos_runtime": 2.5917, "eval_scitail-pairs-pos_samples_per_second": 77.169, "eval_scitail-pairs-pos_steps_per_second": 2.701, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_xsum-pairs_loss": 0.597815752029419, "eval_xsum-pairs_runtime": 2.6457, "eval_xsum-pairs_samples_per_second": 66.144, "eval_xsum-pairs_steps_per_second": 2.268, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_compression-pairs_loss": 0.1732936054468155, "eval_compression-pairs_runtime": 0.5095, "eval_compression-pairs_samples_per_second": 392.526, "eval_compression-pairs_steps_per_second": 13.738, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_sciq_pairs_loss": 0.3168482780456543, "eval_sciq_pairs_runtime": 9.1741, "eval_sciq_pairs_samples_per_second": 21.8, "eval_sciq_pairs_steps_per_second": 0.763, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_qasc_pairs_loss": 
5.082820892333984, "eval_qasc_pairs_runtime": 2.6338, "eval_qasc_pairs_samples_per_second": 75.937, "eval_qasc_pairs_steps_per_second": 2.658, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_openbookqa_pairs_loss": 2.2364211082458496, "eval_openbookqa_pairs_runtime": 0.6349, "eval_openbookqa_pairs_samples_per_second": 108.679, "eval_openbookqa_pairs_steps_per_second": 4.725, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_msmarco_pairs_loss": 0.8298202753067017, "eval_msmarco_pairs_runtime": 3.9129, "eval_msmarco_pairs_samples_per_second": 51.113, "eval_msmarco_pairs_steps_per_second": 1.789, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_nq_pairs_loss": 0.9941509962081909, "eval_nq_pairs_runtime": 8.5556, "eval_nq_pairs_samples_per_second": 23.377, "eval_nq_pairs_steps_per_second": 0.818, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_trivia_pairs_loss": 1.3513805866241455, "eval_trivia_pairs_runtime": 12.7983, "eval_trivia_pairs_samples_per_second": 15.627, "eval_trivia_pairs_steps_per_second": 0.547, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_quora_pairs_loss": 0.15714246034622192, "eval_quora_pairs_runtime": 1.5886, "eval_quora_pairs_samples_per_second": 125.896, "eval_quora_pairs_steps_per_second": 4.406, "step": 13155 }, { "epoch": 2.2514119459181927, "eval_gooaq_pairs_loss": 0.6573526263237, "eval_gooaq_pairs_runtime": 2.6295, "eval_gooaq_pairs_samples_per_second": 76.06, "eval_gooaq_pairs_steps_per_second": 2.662, "step": 13155 }, { "epoch": 2.2515830908779737, "grad_norm": 10.500983238220215, "learning_rate": 3.7571514433160534e-07, "loss": 0.7022, "step": 13156 }, { "epoch": 2.2517542358377547, "grad_norm": 66.47660064697266, "learning_rate": 3.7315686447682687e-07, "loss": 7.6724, "step": 13157 }, { "epoch": 2.2519253807975357, "grad_norm": 8.798561096191406, "learning_rate": 3.7060721448128556e-07, "loss": 0.5625, "step": 13158 }, { "epoch": 2.2520965257573167, "grad_norm": 8.41103458404541, "learning_rate": 
3.6806619584927817e-07, "loss": 0.6681, "step": 13159 }, { "epoch": 2.252267670717097, "grad_norm": 14.65064811706543, "learning_rate": 3.655338100800054e-07, "loss": 1.0391, "step": 13160 }, { "epoch": 2.252438815676878, "grad_norm": 18.625627517700195, "learning_rate": 3.6301005866758554e-07, "loss": 0.8083, "step": 13161 }, { "epoch": 2.252609960636659, "grad_norm": 0.6180931925773621, "learning_rate": 3.604949431010307e-07, "loss": 0.0997, "step": 13162 }, { "epoch": 2.25278110559644, "grad_norm": 14.793701171875, "learning_rate": 3.579884648642656e-07, "loss": 0.9738, "step": 13163 }, { "epoch": 2.252952250556221, "grad_norm": 11.088869094848633, "learning_rate": 3.5549062543611234e-07, "loss": 0.6812, "step": 13164 }, { "epoch": 2.253123395516002, "grad_norm": 8.060577392578125, "learning_rate": 3.530014262903053e-07, "loss": 0.7012, "step": 13165 }, { "epoch": 2.253294540475783, "grad_norm": 7.510422706604004, "learning_rate": 3.505208688954731e-07, "loss": 0.6353, "step": 13166 }, { "epoch": 2.2534656854355637, "grad_norm": 2.0423285961151123, "learning_rate": 3.4804895471514665e-07, "loss": 0.2004, "step": 13167 }, { "epoch": 2.2536368303953447, "grad_norm": 18.561023712158203, "learning_rate": 3.45585685207761e-07, "loss": 1.4546, "step": 13168 }, { "epoch": 2.2538079753551257, "grad_norm": 20.123624801635742, "learning_rate": 3.43131061826647e-07, "loss": 1.4412, "step": 13169 }, { "epoch": 2.2539791203149067, "grad_norm": 0.6947031617164612, "learning_rate": 3.4068508602003776e-07, "loss": 0.1592, "step": 13170 }, { "epoch": 2.2541502652746876, "grad_norm": 5.514309883117676, "learning_rate": 3.382477592310623e-07, "loss": 0.3798, "step": 13171 }, { "epoch": 2.2543214102344686, "grad_norm": 18.965892791748047, "learning_rate": 3.358190828977487e-07, "loss": 2.2618, "step": 13172 }, { "epoch": 2.2544925551942496, "grad_norm": 21.744670867919922, "learning_rate": 3.333990584530139e-07, "loss": 2.6481, "step": 13173 }, { "epoch": 2.2546637001540306, 
"grad_norm": 10.571277618408203, "learning_rate": 3.3098768732468086e-07, "loss": 0.766, "step": 13174 }, { "epoch": 2.2548348451138116, "grad_norm": 17.898815155029297, "learning_rate": 3.285849709354649e-07, "loss": 1.4914, "step": 13175 }, { "epoch": 2.255005990073592, "grad_norm": 11.839081764221191, "learning_rate": 3.261909107029637e-07, "loss": 0.8294, "step": 13176 }, { "epoch": 2.255177135033373, "grad_norm": 12.411638259887695, "learning_rate": 3.2380550803968566e-07, "loss": 0.7692, "step": 13177 }, { "epoch": 2.255348279993154, "grad_norm": 15.551597595214844, "learning_rate": 3.2142876435301507e-07, "loss": 1.6064, "step": 13178 }, { "epoch": 2.255519424952935, "grad_norm": 22.395097732543945, "learning_rate": 3.190606810452384e-07, "loss": 2.6918, "step": 13179 }, { "epoch": 2.255690569912716, "grad_norm": 4.864678382873535, "learning_rate": 3.1670125951352324e-07, "loss": 0.298, "step": 13180 }, { "epoch": 2.255861714872497, "grad_norm": 8.338438987731934, "learning_rate": 3.143505011499409e-07, "loss": 0.8323, "step": 13181 }, { "epoch": 2.256032859832278, "grad_norm": 16.13868522644043, "learning_rate": 3.1200840734143034e-07, "loss": 1.1523, "step": 13182 }, { "epoch": 2.2562040047920586, "grad_norm": 2.9353833198547363, "learning_rate": 3.096749794698428e-07, "loss": 0.3546, "step": 13183 }, { "epoch": 2.2563751497518396, "grad_norm": 16.54633903503418, "learning_rate": 3.0735021891189207e-07, "loss": 1.3474, "step": 13184 }, { "epoch": 2.2565462947116206, "grad_norm": 3.976797580718994, "learning_rate": 3.0503412703920096e-07, "loss": 0.4583, "step": 13185 }, { "epoch": 2.2567174396714016, "grad_norm": 11.761466026306152, "learning_rate": 3.027267052182597e-07, "loss": 0.7383, "step": 13186 }, { "epoch": 2.2568885846311826, "grad_norm": 3.04795503616333, "learning_rate": 3.0042795481045604e-07, "loss": 0.3892, "step": 13187 }, { "epoch": 2.2570597295909636, "grad_norm": 20.88356590270996, "learning_rate": 2.9813787717204835e-07, "loss": 2.5549, 
"step": 13188 }, { "epoch": 2.2572308745507446, "grad_norm": 6.123494625091553, "learning_rate": 2.9585647365419575e-07, "loss": 0.4624, "step": 13189 }, { "epoch": 2.2574020195105255, "grad_norm": 24.89466094970703, "learning_rate": 2.935837456029217e-07, "loss": 3.9574, "step": 13190 }, { "epoch": 2.2575731644703065, "grad_norm": 9.490939140319824, "learning_rate": 2.9131969435914673e-07, "loss": 0.7518, "step": 13191 }, { "epoch": 2.2577443094300875, "grad_norm": 6.475113868713379, "learning_rate": 2.8906432125865414e-07, "loss": 0.4944, "step": 13192 }, { "epoch": 2.257915454389868, "grad_norm": 5.616264820098877, "learning_rate": 2.8681762763212935e-07, "loss": 0.2715, "step": 13193 }, { "epoch": 2.258086599349649, "grad_norm": 12.840689659118652, "learning_rate": 2.8457961480511387e-07, "loss": 0.9139, "step": 13194 }, { "epoch": 2.25825774430943, "grad_norm": 10.92798900604248, "learning_rate": 2.8235028409804465e-07, "loss": 0.6789, "step": 13195 }, { "epoch": 2.258428889269211, "grad_norm": 2.63999605178833, "learning_rate": 2.8012963682623126e-07, "loss": 0.2212, "step": 13196 }, { "epoch": 2.258600034228992, "grad_norm": 7.181967735290527, "learning_rate": 2.7791767429985403e-07, "loss": 0.6307, "step": 13197 }, { "epoch": 2.258771179188773, "grad_norm": 0.3574516773223877, "learning_rate": 2.7571439782398067e-07, "loss": 0.1075, "step": 13198 }, { "epoch": 2.258942324148554, "grad_norm": 14.440571784973145, "learning_rate": 2.735198086985413e-07, "loss": 1.4473, "step": 13199 }, { "epoch": 2.2591134691083345, "grad_norm": 14.29772663116455, "learning_rate": 2.7133390821835024e-07, "loss": 0.7728, "step": 13200 }, { "epoch": 2.2592846140681155, "grad_norm": 5.625275135040283, "learning_rate": 2.691566976730908e-07, "loss": 0.3512, "step": 13201 }, { "epoch": 2.2594557590278965, "grad_norm": 1.5007134675979614, "learning_rate": 2.6698817834732045e-07, "loss": 0.1687, "step": 13202 }, { "epoch": 2.2596269039876775, "grad_norm": 1.9240413904190063, 
"learning_rate": 2.648283515204708e-07, "loss": 0.1409, "step": 13203 }, { "epoch": 2.2597980489474585, "grad_norm": 9.401556968688965, "learning_rate": 2.626772184668391e-07, "loss": 0.5873, "step": 13204 }, { "epoch": 2.2599691939072395, "grad_norm": 5.54839563369751, "learning_rate": 2.605347804556002e-07, "loss": 0.5275, "step": 13205 }, { "epoch": 2.2601403388670205, "grad_norm": 15.161388397216797, "learning_rate": 2.584010387507929e-07, "loss": 1.2867, "step": 13206 }, { "epoch": 2.2603114838268015, "grad_norm": 13.004766464233398, "learning_rate": 2.562759946113319e-07, "loss": 1.104, "step": 13207 }, { "epoch": 2.2604826287865825, "grad_norm": 0.3045908510684967, "learning_rate": 2.5415964929099424e-07, "loss": 0.0949, "step": 13208 }, { "epoch": 2.260653773746363, "grad_norm": 13.692811012268066, "learning_rate": 2.520520040384261e-07, "loss": 1.0222, "step": 13209 }, { "epoch": 2.260824918706144, "grad_norm": 3.2852301597595215, "learning_rate": 2.499530600971428e-07, "loss": 0.2899, "step": 13210 }, { "epoch": 2.260996063665925, "grad_norm": 14.465264320373535, "learning_rate": 2.4786281870552207e-07, "loss": 1.0571, "step": 13211 }, { "epoch": 2.261167208625706, "grad_norm": 8.99935531616211, "learning_rate": 2.4578128109681243e-07, "loss": 0.67, "step": 13212 }, { "epoch": 2.261338353585487, "grad_norm": 15.659977912902832, "learning_rate": 2.4370844849912145e-07, "loss": 1.6749, "step": 13213 }, { "epoch": 2.261509498545268, "grad_norm": 8.221485137939453, "learning_rate": 2.4164432213542584e-07, "loss": 0.8963, "step": 13214 }, { "epoch": 2.261680643505049, "grad_norm": 19.416683197021484, "learning_rate": 2.395889032235632e-07, "loss": 1.7404, "step": 13215 }, { "epoch": 2.2618517884648295, "grad_norm": 5.296562194824219, "learning_rate": 2.3754219297623004e-07, "loss": 0.4549, "step": 13216 }, { "epoch": 2.2620229334246105, "grad_norm": 14.134732246398926, "learning_rate": 2.3550419260099044e-07, "loss": 1.0922, "step": 13217 }, { "epoch": 
2.2621940783843915, "grad_norm": 15.534061431884766, "learning_rate": 2.334749033002709e-07, "loss": 1.2715, "step": 13218 }, { "epoch": 2.2623652233441724, "grad_norm": 0.31707122921943665, "learning_rate": 2.3145432627135033e-07, "loss": 0.0914, "step": 13219 }, { "epoch": 2.2625363683039534, "grad_norm": 12.832395553588867, "learning_rate": 2.2944246270637847e-07, "loss": 0.8609, "step": 13220 }, { "epoch": 2.2627075132637344, "grad_norm": 24.639606475830078, "learning_rate": 2.2743931379234916e-07, "loss": 5.1988, "step": 13221 }, { "epoch": 2.2628786582235154, "grad_norm": 11.205442428588867, "learning_rate": 2.25444880711132e-07, "loss": 0.9502, "step": 13222 }, { "epoch": 2.2630498031832964, "grad_norm": 17.025489807128906, "learning_rate": 2.2345916463943738e-07, "loss": 0.7083, "step": 13223 }, { "epoch": 2.2632209481430774, "grad_norm": 7.902225017547607, "learning_rate": 2.2148216674884813e-07, "loss": 0.5083, "step": 13224 }, { "epoch": 2.263392093102858, "grad_norm": 23.122800827026367, "learning_rate": 2.1951388820578955e-07, "loss": 5.0072, "step": 13225 }, { "epoch": 2.263563238062639, "grad_norm": 14.570465087890625, "learning_rate": 2.17554330171556e-07, "loss": 1.2823, "step": 13226 }, { "epoch": 2.26373438302242, "grad_norm": 2.1054322719573975, "learning_rate": 2.1560349380228106e-07, "loss": 0.2237, "step": 13227 }, { "epoch": 2.263905527982201, "grad_norm": 11.018634796142578, "learning_rate": 2.1366138024896898e-07, "loss": 0.8184, "step": 13228 }, { "epoch": 2.264076672941982, "grad_norm": 0.3439165949821472, "learning_rate": 2.1172799065746318e-07, "loss": 0.096, "step": 13229 }, { "epoch": 2.264247817901763, "grad_norm": 8.499422073364258, "learning_rate": 2.0980332616847288e-07, "loss": 0.4341, "step": 13230 }, { "epoch": 2.264418962861544, "grad_norm": 0.28027254343032837, "learning_rate": 2.0788738791754636e-07, "loss": 0.0909, "step": 13231 }, { "epoch": 2.2645901078213244, "grad_norm": 9.076695442199707, "learning_rate": 
2.0598017703509607e-07, "loss": 0.6854, "step": 13232 }, { "epoch": 2.2647612527811054, "grad_norm": 23.91187858581543, "learning_rate": 2.0408169464637194e-07, "loss": 5.1152, "step": 13233 }, { "epoch": 2.2649323977408864, "grad_norm": 6.29807710647583, "learning_rate": 2.0219194187149125e-07, "loss": 0.4934, "step": 13234 }, { "epoch": 2.2651035427006674, "grad_norm": 15.981432914733887, "learning_rate": 2.0031091982540052e-07, "loss": 1.0467, "step": 13235 }, { "epoch": 2.2652746876604484, "grad_norm": 1.7458370923995972, "learning_rate": 1.9843862961791525e-07, "loss": 0.1881, "step": 13236 }, { "epoch": 2.2654458326202294, "grad_norm": 5.296671390533447, "learning_rate": 1.9657507235368188e-07, "loss": 0.3031, "step": 13237 }, { "epoch": 2.2656169775800103, "grad_norm": 55.28773880004883, "learning_rate": 1.9472024913220254e-07, "loss": 7.1637, "step": 13238 }, { "epoch": 2.2657881225397913, "grad_norm": 14.490683555603027, "learning_rate": 1.9287416104783183e-07, "loss": 1.3175, "step": 13239 }, { "epoch": 2.2659592674995723, "grad_norm": 17.42160987854004, "learning_rate": 1.910368091897602e-07, "loss": 1.3141, "step": 13240 }, { "epoch": 2.2661304124593533, "grad_norm": 11.07043743133545, "learning_rate": 1.8920819464202876e-07, "loss": 0.8108, "step": 13241 }, { "epoch": 2.266301557419134, "grad_norm": 7.017482757568359, "learning_rate": 1.8738831848352288e-07, "loss": 0.5861, "step": 13242 }, { "epoch": 2.266472702378915, "grad_norm": 13.373224258422852, "learning_rate": 1.85577181787972e-07, "loss": 1.1012, "step": 13243 }, { "epoch": 2.266643847338696, "grad_norm": 8.8458890914917, "learning_rate": 1.83774785623948e-07, "loss": 0.7136, "step": 13244 }, { "epoch": 2.266814992298477, "grad_norm": 15.785042762756348, "learning_rate": 1.819811310548686e-07, "loss": 1.6039, "step": 13245 }, { "epoch": 2.266986137258258, "grad_norm": 18.898574829101562, "learning_rate": 1.8019621913899388e-07, "loss": 1.436, "step": 13246 }, { "epoch": 2.267157282218039, 
"grad_norm": 7.952098846435547, "learning_rate": 1.7842005092942316e-07, "loss": 0.6087, "step": 13247 }, { "epoch": 2.26732842717782, "grad_norm": 11.375345230102539, "learning_rate": 1.7665262747409817e-07, "loss": 0.8574, "step": 13248 }, { "epoch": 2.2674995721376003, "grad_norm": 0.972829282283783, "learning_rate": 1.7489394981580142e-07, "loss": 0.1576, "step": 13249 }, { "epoch": 2.2676707170973813, "grad_norm": 5.318256378173828, "learning_rate": 1.7314401899215626e-07, "loss": 0.6675, "step": 13250 }, { "epoch": 2.2678418620571623, "grad_norm": 16.574729919433594, "learning_rate": 1.7140283603562346e-07, "loss": 1.8317, "step": 13251 }, { "epoch": 2.2680130070169433, "grad_norm": 164.29112243652344, "learning_rate": 1.6967040197350625e-07, "loss": 8.6781, "step": 13252 }, { "epoch": 2.2681841519767243, "grad_norm": 1.5062873363494873, "learning_rate": 1.6794671782793703e-07, "loss": 0.1763, "step": 13253 }, { "epoch": 2.2683552969365053, "grad_norm": 12.467024803161621, "learning_rate": 1.6623178461589895e-07, "loss": 0.8848, "step": 13254 }, { "epoch": 2.2685264418962863, "grad_norm": 15.118840217590332, "learning_rate": 1.6452560334920264e-07, "loss": 0.9279, "step": 13255 }, { "epoch": 2.2686975868560673, "grad_norm": 17.481006622314453, "learning_rate": 1.6282817503449455e-07, "loss": 1.541, "step": 13256 }, { "epoch": 2.2688687318158482, "grad_norm": 1.7160124778747559, "learning_rate": 1.611395006732652e-07, "loss": 0.1806, "step": 13257 }, { "epoch": 2.269039876775629, "grad_norm": 70.15164184570312, "learning_rate": 1.5945958126183103e-07, "loss": 7.3075, "step": 13258 }, { "epoch": 2.26921102173541, "grad_norm": 8.267501831054688, "learning_rate": 1.5778841779134579e-07, "loss": 0.6714, "step": 13259 }, { "epoch": 2.2693821666951908, "grad_norm": 9.440468788146973, "learning_rate": 1.5612601124780247e-07, "loss": 0.604, "step": 13260 }, { "epoch": 2.2695533116549718, "grad_norm": 11.46180248260498, "learning_rate": 1.544723626120248e-07, "loss": 
0.8058, "step": 13261 }, { "epoch": 2.2697244566147528, "grad_norm": 18.058996200561523, "learning_rate": 1.5282747285966235e-07, "loss": 4.4297, "step": 13262 }, { "epoch": 2.2698956015745337, "grad_norm": 7.531244277954102, "learning_rate": 1.5119134296120718e-07, "loss": 0.9479, "step": 13263 }, { "epoch": 2.2700667465343147, "grad_norm": 9.171891212463379, "learning_rate": 1.4956397388197373e-07, "loss": 0.7156, "step": 13264 }, { "epoch": 2.2702378914940953, "grad_norm": 3.0762486457824707, "learning_rate": 1.4794536658211733e-07, "loss": 0.2259, "step": 13265 }, { "epoch": 2.2704090364538763, "grad_norm": 12.653220176696777, "learning_rate": 1.4633552201661406e-07, "loss": 0.989, "step": 13266 }, { "epoch": 2.2705801814136573, "grad_norm": 12.26001262664795, "learning_rate": 1.4473444113527912e-07, "loss": 0.8672, "step": 13267 }, { "epoch": 2.2707513263734382, "grad_norm": 8.995638847351074, "learning_rate": 1.431421248827486e-07, "loss": 0.6812, "step": 13268 }, { "epoch": 2.2709224713332192, "grad_norm": 0.34802842140197754, "learning_rate": 1.4155857419849428e-07, "loss": 0.0911, "step": 13269 }, { "epoch": 2.271093616293, "grad_norm": 0.4962272346019745, "learning_rate": 1.399837900168105e-07, "loss": 0.1017, "step": 13270 }, { "epoch": 2.271264761252781, "grad_norm": 7.099430084228516, "learning_rate": 1.384177732668257e-07, "loss": 0.4986, "step": 13271 }, { "epoch": 2.271435906212562, "grad_norm": 14.091185569763184, "learning_rate": 1.3686052487248747e-07, "loss": 1.0178, "step": 13272 }, { "epoch": 2.271607051172343, "grad_norm": 10.157958984375, "learning_rate": 1.3531204575258082e-07, "loss": 0.6977, "step": 13273 }, { "epoch": 2.2717781961321237, "grad_norm": 9.568290710449219, "learning_rate": 1.33772336820705e-07, "loss": 0.6964, "step": 13274 }, { "epoch": 2.2719493410919047, "grad_norm": 16.770429611206055, "learning_rate": 1.3224139898529664e-07, "loss": 1.6272, "step": 13275 }, { "epoch": 2.2721204860516857, "grad_norm": 0.2961198389530182, 
"learning_rate": 1.3071923314960488e-07, "loss": 0.0976, "step": 13276 }, { "epoch": 2.2722916310114667, "grad_norm": 15.955753326416016, "learning_rate": 1.2920584021171467e-07, "loss": 1.1921, "step": 13277 }, { "epoch": 2.2724627759712477, "grad_norm": 7.560957908630371, "learning_rate": 1.2770122106452508e-07, "loss": 0.6627, "step": 13278 }, { "epoch": 2.2726339209310287, "grad_norm": 0.33919811248779297, "learning_rate": 1.2620537659577102e-07, "loss": 0.101, "step": 13279 }, { "epoch": 2.2728050658908097, "grad_norm": 6.2482805252075195, "learning_rate": 1.247183076879932e-07, "loss": 0.3346, "step": 13280 }, { "epoch": 2.27297621085059, "grad_norm": 0.3379679024219513, "learning_rate": 1.2324001521857475e-07, "loss": 0.101, "step": 13281 }, { "epoch": 2.273147355810371, "grad_norm": 17.936552047729492, "learning_rate": 1.2177050005970304e-07, "loss": 1.4021, "step": 13282 }, { "epoch": 2.273318500770152, "grad_norm": 0.29490023851394653, "learning_rate": 1.203097630783978e-07, "loss": 0.0919, "step": 13283 }, { "epoch": 2.273489645729933, "grad_norm": 8.713892936706543, "learning_rate": 1.1885780513649469e-07, "loss": 0.5713, "step": 13284 }, { "epoch": 2.273660790689714, "grad_norm": 8.749739646911621, "learning_rate": 1.1741462709065176e-07, "loss": 0.5479, "step": 13285 }, { "epoch": 2.273831935649495, "grad_norm": 36.74420928955078, "learning_rate": 1.1598022979234623e-07, "loss": 5.7973, "step": 13286 }, { "epoch": 2.274003080609276, "grad_norm": 13.641304016113281, "learning_rate": 1.1455461408787449e-07, "loss": 0.9716, "step": 13287 }, { "epoch": 2.274174225569057, "grad_norm": 10.894933700561523, "learning_rate": 1.1313778081835203e-07, "loss": 0.873, "step": 13288 }, { "epoch": 2.274345370528838, "grad_norm": 3.4444196224212646, "learning_rate": 1.1172973081971016e-07, "loss": 0.5575, "step": 13289 }, { "epoch": 2.274516515488619, "grad_norm": 0.3058640658855438, "learning_rate": 1.1033046492270438e-07, "loss": 0.1009, "step": 13290 }, { "epoch": 
2.2746876604483997, "grad_norm": 0.4371137022972107, "learning_rate": 1.0893998395290095e-07, "loss": 0.1023, "step": 13291 }, { "epoch": 2.2748588054081806, "grad_norm": 16.61471939086914, "learning_rate": 1.0755828873068697e-07, "loss": 1.0584, "step": 13292 }, { "epoch": 2.2750299503679616, "grad_norm": 6.814513683319092, "learning_rate": 1.0618538007126533e-07, "loss": 0.6268, "step": 13293 }, { "epoch": 2.2752010953277426, "grad_norm": 7.656591415405273, "learning_rate": 1.0482125878465142e-07, "loss": 0.4792, "step": 13294 }, { "epoch": 2.2753722402875236, "grad_norm": 18.113788604736328, "learning_rate": 1.0346592567568313e-07, "loss": 2.2185, "step": 13295 }, { "epoch": 2.2755433852473046, "grad_norm": 12.689842224121094, "learning_rate": 1.0211938154400413e-07, "loss": 0.523, "step": 13296 }, { "epoch": 2.2757145302070856, "grad_norm": 8.632298469543457, "learning_rate": 1.0078162718408057e-07, "loss": 0.55, "step": 13297 }, { "epoch": 2.275885675166866, "grad_norm": 0.2996344566345215, "learning_rate": 9.945266338518778e-08, "loss": 0.096, "step": 13298 }, { "epoch": 2.276056820126647, "grad_norm": 15.424636840820312, "learning_rate": 9.813249093141851e-08, "loss": 1.57, "step": 13299 }, { "epoch": 2.276227965086428, "grad_norm": 12.334173202514648, "learning_rate": 9.682111060167476e-08, "loss": 0.9347, "step": 13300 }, { "epoch": 2.276399110046209, "grad_norm": 7.829089641571045, "learning_rate": 9.551852316967424e-08, "loss": 0.5279, "step": 13301 }, { "epoch": 2.27657025500599, "grad_norm": 45.49351119995117, "learning_rate": 9.422472940394223e-08, "loss": 6.526, "step": 13302 }, { "epoch": 2.276741399965771, "grad_norm": 5.574182987213135, "learning_rate": 9.293973006782308e-08, "loss": 0.4206, "step": 13303 }, { "epoch": 2.276912544925552, "grad_norm": 1.660355806350708, "learning_rate": 9.16635259194687e-08, "loss": 0.1459, "step": 13304 }, { "epoch": 2.277083689885333, "grad_norm": 9.990009307861328, "learning_rate": 9.039611771183509e-08, "loss": 
0.6551, "step": 13305 }, { "epoch": 2.277254834845114, "grad_norm": 17.50528907775879, "learning_rate": 8.913750619270411e-08, "loss": 1.2055, "step": 13306 }, { "epoch": 2.2774259798048946, "grad_norm": 9.227025032043457, "learning_rate": 8.788769210465008e-08, "loss": 0.8801, "step": 13307 }, { "epoch": 2.2775971247646756, "grad_norm": 1.5545998811721802, "learning_rate": 8.664667618506983e-08, "loss": 0.1725, "step": 13308 }, { "epoch": 2.2777682697244566, "grad_norm": 6.395511627197266, "learning_rate": 8.541445916616264e-08, "loss": 0.7962, "step": 13309 }, { "epoch": 2.2779394146842376, "grad_norm": 1.7616088390350342, "learning_rate": 8.419104177494197e-08, "loss": 0.1839, "step": 13310 }, { "epoch": 2.2781105596440185, "grad_norm": 19.01666259765625, "learning_rate": 8.297642473322043e-08, "loss": 4.8335, "step": 13311 }, { "epoch": 2.2782817046037995, "grad_norm": 7.277900218963623, "learning_rate": 8.17706087576281e-08, "loss": 0.576, "step": 13312 }, { "epoch": 2.2784528495635805, "grad_norm": 2.4718832969665527, "learning_rate": 8.05735945595959e-08, "loss": 0.1908, "step": 13313 }, { "epoch": 2.278623994523361, "grad_norm": 10.499750137329102, "learning_rate": 7.938538284536556e-08, "loss": 0.7051, "step": 13314 }, { "epoch": 2.278795139483142, "grad_norm": 2.671109676361084, "learning_rate": 7.82059743159813e-08, "loss": 0.1988, "step": 13315 }, { "epoch": 2.278966284442923, "grad_norm": 11.05088996887207, "learning_rate": 7.703536966729985e-08, "loss": 0.7427, "step": 13316 }, { "epoch": 2.279137429402704, "grad_norm": 12.960077285766602, "learning_rate": 7.587356958997538e-08, "loss": 0.7107, "step": 13317 }, { "epoch": 2.279308574362485, "grad_norm": 1.4326839447021484, "learning_rate": 7.472057476947624e-08, "loss": 0.17, "step": 13318 }, { "epoch": 2.279479719322266, "grad_norm": 12.42382526397705, "learning_rate": 7.357638588606497e-08, "loss": 0.8285, "step": 13319 }, { "epoch": 2.279650864282047, "grad_norm": 0.31781527400016785, 
"learning_rate": 7.244100361482153e-08, "loss": 0.0961, "step": 13320 }, { "epoch": 2.279822009241828, "grad_norm": 3.662384271621704, "learning_rate": 7.131442862561843e-08, "loss": 0.2525, "step": 13321 }, { "epoch": 2.279993154201609, "grad_norm": 3.4872047901153564, "learning_rate": 7.019666158313898e-08, "loss": 0.2494, "step": 13322 }, { "epoch": 2.2801642991613895, "grad_norm": 17.448572158813477, "learning_rate": 6.908770314686564e-08, "loss": 1.8576, "step": 13323 }, { "epoch": 2.2803354441211705, "grad_norm": 13.973257064819336, "learning_rate": 6.798755397108669e-08, "loss": 0.9747, "step": 13324 }, { "epoch": 2.2805065890809515, "grad_norm": 17.785375595092773, "learning_rate": 6.689621470489126e-08, "loss": 1.9825, "step": 13325 }, { "epoch": 2.2806777340407325, "grad_norm": 9.327975273132324, "learning_rate": 6.581368599217096e-08, "loss": 0.7294, "step": 13326 }, { "epoch": 2.2808488790005135, "grad_norm": 0.28813302516937256, "learning_rate": 6.473996847162155e-08, "loss": 0.0924, "step": 13327 }, { "epoch": 2.2810200239602945, "grad_norm": 27.710386276245117, "learning_rate": 6.367506277673629e-08, "loss": 5.0382, "step": 13328 }, { "epoch": 2.2811911689200755, "grad_norm": 81.62084197998047, "learning_rate": 6.261896953580925e-08, "loss": 7.5802, "step": 13329 }, { "epoch": 2.281362313879856, "grad_norm": 4.759239673614502, "learning_rate": 6.157168937194036e-08, "loss": 0.4431, "step": 13330 }, { "epoch": 2.281533458839637, "grad_norm": 2.0918779373168945, "learning_rate": 6.053322290302365e-08, "loss": 0.1982, "step": 13331 }, { "epoch": 2.281704603799418, "grad_norm": 18.036943435668945, "learning_rate": 5.950357074175738e-08, "loss": 1.4876, "step": 13332 }, { "epoch": 2.281875748759199, "grad_norm": 3.6457552909851074, "learning_rate": 5.8482733495638926e-08, "loss": 0.2611, "step": 13333 }, { "epoch": 2.28204689371898, "grad_norm": 4.328112602233887, "learning_rate": 5.74707117669615e-08, "loss": 0.2481, "step": 13334 }, { "epoch": 
2.282218038678761, "grad_norm": 15.353179931640625, "learning_rate": 5.646750615282081e-08, "loss": 1.041, "step": 13335 }, { "epoch": 2.282389183638542, "grad_norm": 9.155828475952148, "learning_rate": 5.5473117245108396e-08, "loss": 0.8095, "step": 13336 }, { "epoch": 2.282560328598323, "grad_norm": 0.3245168924331665, "learning_rate": 5.448754563051661e-08, "loss": 0.0907, "step": 13337 }, { "epoch": 2.282731473558104, "grad_norm": 13.996110916137695, "learning_rate": 5.351079189053365e-08, "loss": 1.1492, "step": 13338 }, { "epoch": 2.282902618517885, "grad_norm": 0.43840014934539795, "learning_rate": 5.2542856601446865e-08, "loss": 0.1018, "step": 13339 }, { "epoch": 2.2830737634776654, "grad_norm": 15.92821979522705, "learning_rate": 5.1583740334336104e-08, "loss": 1.0523, "step": 13340 }, { "epoch": 2.2832449084374464, "grad_norm": 11.447189331054688, "learning_rate": 5.0633443655085375e-08, "loss": 0.8373, "step": 13341 }, { "epoch": 2.2834160533972274, "grad_norm": 4.097468376159668, "learning_rate": 4.9691967124367854e-08, "loss": 0.2618, "step": 13342 }, { "epoch": 2.2835871983570084, "grad_norm": 13.971988677978516, "learning_rate": 4.8759311297659206e-08, "loss": 1.109, "step": 13343 }, { "epoch": 2.2837583433167894, "grad_norm": 5.060430526733398, "learning_rate": 4.783547672522592e-08, "loss": 0.5617, "step": 13344 }, { "epoch": 2.2839294882765704, "grad_norm": 10.96203899383545, "learning_rate": 4.6920463952130345e-08, "loss": 0.9547, "step": 13345 }, { "epoch": 2.2841006332363514, "grad_norm": 26.1273193359375, "learning_rate": 4.601427351823395e-08, "loss": 5.024, "step": 13346 }, { "epoch": 2.284271778196132, "grad_norm": 8.722874641418457, "learning_rate": 4.5116905958190734e-08, "loss": 0.5918, "step": 13347 }, { "epoch": 2.284442923155913, "grad_norm": 3.7547712326049805, "learning_rate": 4.422836180144552e-08, "loss": 0.4114, "step": 13348 }, { "epoch": 2.284614068115694, "grad_norm": 17.174095153808594, "learning_rate": 
4.334864157224394e-08, "loss": 1.1451, "step": 13349 }, { "epoch": 2.284785213075475, "grad_norm": 18.728464126586914, "learning_rate": 4.247774578962082e-08, "loss": 1.8542, "step": 13350 }, { "epoch": 2.284956358035256, "grad_norm": 1.2999043464660645, "learning_rate": 4.161567496740848e-08, "loss": 0.1642, "step": 13351 }, { "epoch": 2.285127502995037, "grad_norm": 9.868552207946777, "learning_rate": 4.076242961422505e-08, "loss": 0.8876, "step": 13352 }, { "epoch": 2.285298647954818, "grad_norm": 9.937418937683105, "learning_rate": 3.9918010233491174e-08, "loss": 0.6653, "step": 13353 }, { "epoch": 2.285469792914599, "grad_norm": 8.514995574951172, "learning_rate": 3.908241732341167e-08, "loss": 0.5623, "step": 13354 }, { "epoch": 2.28564093787438, "grad_norm": 1.1148995161056519, "learning_rate": 3.825565137699216e-08, "loss": 0.1659, "step": 13355 }, { "epoch": 2.2858120828341604, "grad_norm": 0.2704388499259949, "learning_rate": 3.74377128820208e-08, "loss": 0.0929, "step": 13356 }, { "epoch": 2.2859832277939414, "grad_norm": 17.162927627563477, "learning_rate": 3.6628602321086555e-08, "loss": 1.2173, "step": 13357 }, { "epoch": 2.2861543727537224, "grad_norm": 9.613378524780273, "learning_rate": 3.582832017156423e-08, "loss": 0.8755, "step": 13358 }, { "epoch": 2.2863255177135033, "grad_norm": 16.495384216308594, "learning_rate": 3.50368669056228e-08, "loss": 1.299, "step": 13359 }, { "epoch": 2.2864966626732843, "grad_norm": 11.122078895568848, "learning_rate": 3.4254242990217066e-08, "loss": 0.9909, "step": 13360 }, { "epoch": 2.2866678076330653, "grad_norm": 13.876315116882324, "learning_rate": 3.3480448887099336e-08, "loss": 1.1555, "step": 13361 }, { "epoch": 2.2868389525928463, "grad_norm": 12.319376945495605, "learning_rate": 3.271548505280941e-08, "loss": 0.7051, "step": 13362 }, { "epoch": 2.287010097552627, "grad_norm": 1.2982587814331055, "learning_rate": 3.195935193867627e-08, "loss": 0.1438, "step": 13363 }, { "epoch": 2.287181242512408, 
"grad_norm": 7.783725261688232, "learning_rate": 3.121204999081972e-08, "loss": 0.5523, "step": 13364 }, { "epoch": 2.287352387472189, "grad_norm": 15.796246528625488, "learning_rate": 3.047357965015041e-08, "loss": 1.6279, "step": 13365 }, { "epoch": 2.28752353243197, "grad_norm": 13.842591285705566, "learning_rate": 2.9743941352363134e-08, "loss": 0.9706, "step": 13366 }, { "epoch": 2.287694677391751, "grad_norm": 7.9366254806518555, "learning_rate": 2.9023135527948553e-08, "loss": 0.8489, "step": 13367 }, { "epoch": 2.287865822351532, "grad_norm": 11.664368629455566, "learning_rate": 2.8311162602183138e-08, "loss": 0.837, "step": 13368 }, { "epoch": 2.288036967311313, "grad_norm": 5.4600138664245605, "learning_rate": 2.7608022995132544e-08, "loss": 0.6748, "step": 13369 }, { "epoch": 2.2882081122710938, "grad_norm": 0.3636683523654938, "learning_rate": 2.6913717121648252e-08, "loss": 0.0942, "step": 13370 }, { "epoch": 2.2883792572308748, "grad_norm": 24.35812759399414, "learning_rate": 2.6228245391372586e-08, "loss": 4.9678, "step": 13371 }, { "epoch": 2.2885504021906553, "grad_norm": 12.927050590515137, "learning_rate": 2.5551608208735365e-08, "loss": 1.2075, "step": 13372 }, { "epoch": 2.2887215471504363, "grad_norm": 86.80014038085938, "learning_rate": 2.488380597295392e-08, "loss": 7.2535, "step": 13373 }, { "epoch": 2.2888926921102173, "grad_norm": 14.493406295776367, "learning_rate": 2.422483907802975e-08, "loss": 0.9238, "step": 13374 }, { "epoch": 2.2890638370699983, "grad_norm": 7.195258617401123, "learning_rate": 2.357470791275851e-08, "loss": 0.6654, "step": 13375 }, { "epoch": 2.2892349820297793, "grad_norm": 18.607358932495117, "learning_rate": 2.293341286071504e-08, "loss": 1.5241, "step": 13376 }, { "epoch": 2.2894061269895603, "grad_norm": 18.97603988647461, "learning_rate": 2.2300954300266686e-08, "loss": 1.3828, "step": 13377 }, { "epoch": 2.2895772719493412, "grad_norm": 15.835659980773926, "learning_rate": 2.1677332604563284e-08, "loss": 
1.0625, "step": 13378 }, { "epoch": 2.289748416909122, "grad_norm": 2.2908005714416504, "learning_rate": 2.1062548141542183e-08, "loss": 0.1922, "step": 13379 }, { "epoch": 2.2899195618689028, "grad_norm": 15.969532012939453, "learning_rate": 2.0456601273929897e-08, "loss": 1.3277, "step": 13380 }, { "epoch": 2.2900907068286838, "grad_norm": 10.891643524169922, "learning_rate": 1.9859492359233787e-08, "loss": 0.7309, "step": 13381 }, { "epoch": 2.2902618517884648, "grad_norm": 5.392662525177002, "learning_rate": 1.9271221749748714e-08, "loss": 0.6799, "step": 13382 }, { "epoch": 2.2904329967482457, "grad_norm": 12.27989673614502, "learning_rate": 1.8691789792557034e-08, "loss": 0.8952, "step": 13383 }, { "epoch": 2.2906041417080267, "grad_norm": 13.827444076538086, "learning_rate": 1.8121196829523622e-08, "loss": 0.9352, "step": 13384 }, { "epoch": 2.2907752866678077, "grad_norm": 1.0919454097747803, "learning_rate": 1.755944319729752e-08, "loss": 0.1575, "step": 13385 }, { "epoch": 2.2909464316275887, "grad_norm": 0.27913573384284973, "learning_rate": 1.7006529227316935e-08, "loss": 0.0971, "step": 13386 }, { "epoch": 2.2911175765873697, "grad_norm": 9.595022201538086, "learning_rate": 1.6462455245800923e-08, "loss": 0.4814, "step": 13387 }, { "epoch": 2.2912887215471502, "grad_norm": 24.242630004882812, "learning_rate": 1.5927221573752704e-08, "loss": 5.1253, "step": 13388 }, { "epoch": 2.2914598665069312, "grad_norm": 2.902010202407837, "learning_rate": 1.540082852696134e-08, "loss": 0.3292, "step": 13389 }, { "epoch": 2.291631011466712, "grad_norm": 25.132957458496094, "learning_rate": 1.4883276416001735e-08, "loss": 4.9021, "step": 13390 }, { "epoch": 2.291802156426493, "grad_norm": 21.702404022216797, "learning_rate": 1.4374565546227958e-08, "loss": 4.9698, "step": 13391 }, { "epoch": 2.291973301386274, "grad_norm": 7.968842506408691, "learning_rate": 1.387469621778159e-08, "loss": 0.7881, "step": 13392 }, { "epoch": 2.292144446346055, "grad_norm": 
10.696248054504395, "learning_rate": 1.3383668725585052e-08, "loss": 0.7173, "step": 13393 }, { "epoch": 2.292315591305836, "grad_norm": 14.667497634887695, "learning_rate": 1.29014833593466e-08, "loss": 0.9132, "step": 13394 }, { "epoch": 2.292486736265617, "grad_norm": 17.918424606323242, "learning_rate": 1.2428140403555332e-08, "loss": 1.2784, "step": 13395 }, { "epoch": 2.2926578812253977, "grad_norm": 6.9997239112854, "learning_rate": 1.1963640137484521e-08, "loss": 0.5963, "step": 13396 }, { "epoch": 2.2928290261851787, "grad_norm": 15.309932708740234, "learning_rate": 1.1507982835189945e-08, "loss": 1.1244, "step": 13397 }, { "epoch": 2.2930001711449597, "grad_norm": 1.0400322675704956, "learning_rate": 1.1061168765509889e-08, "loss": 0.1616, "step": 13398 }, { "epoch": 2.2931713161047407, "grad_norm": 113.78826904296875, "learning_rate": 1.0623198192066807e-08, "loss": 7.3263, "step": 13399 }, { "epoch": 2.2933424610645217, "grad_norm": 8.077736854553223, "learning_rate": 1.0194071373262338e-08, "loss": 0.7887, "step": 13400 }, { "epoch": 2.2935136060243027, "grad_norm": 9.030354499816895, "learning_rate": 9.773788562282282e-09, "loss": 0.5907, "step": 13401 }, { "epoch": 2.2936847509840836, "grad_norm": 11.498586654663086, "learning_rate": 9.362350007094955e-09, "loss": 0.952, "step": 13402 }, { "epoch": 2.2938558959438646, "grad_norm": 11.436877250671387, "learning_rate": 8.959755950447845e-09, "loss": 0.7101, "step": 13403 }, { "epoch": 2.2940270409036456, "grad_norm": 7.019799709320068, "learning_rate": 8.566006629874279e-09, "loss": 0.4128, "step": 13404 }, { "epoch": 2.294198185863426, "grad_norm": 3.1404175758361816, "learning_rate": 8.181102277685092e-09, "loss": 0.2375, "step": 13405 }, { "epoch": 2.294369330823207, "grad_norm": 10.63111400604248, "learning_rate": 7.805043120975297e-09, "loss": 0.8307, "step": 13406 }, { "epoch": 2.294540475782988, "grad_norm": 12.55020809173584, "learning_rate": 7.437829381620742e-09, "loss": 1.0248, "step": 13407 
}, { "epoch": 2.294711620742769, "grad_norm": 5.037360668182373, "learning_rate": 7.079461276278121e-09, "loss": 0.6275, "step": 13408 }, { "epoch": 2.29488276570255, "grad_norm": 21.29867935180664, "learning_rate": 6.729939016383302e-09, "loss": 5.1639, "step": 13409 }, { "epoch": 2.295053910662331, "grad_norm": 17.329364776611328, "learning_rate": 6.3892628081579914e-09, "loss": 1.3819, "step": 13410 }, { "epoch": 2.295225055622112, "grad_norm": 10.289579391479492, "learning_rate": 6.0574328525997426e-09, "loss": 0.8076, "step": 13411 }, { "epoch": 2.2953962005818926, "grad_norm": 5.650691032409668, "learning_rate": 5.734449345488613e-09, "loss": 0.5877, "step": 13412 }, { "epoch": 2.2955673455416736, "grad_norm": 12.48499870300293, "learning_rate": 5.420312477385503e-09, "loss": 0.9821, "step": 13413 }, { "epoch": 2.2957384905014546, "grad_norm": 0.37296387553215027, "learning_rate": 5.115022433632155e-09, "loss": 0.1021, "step": 13414 }, { "epoch": 2.2959096354612356, "grad_norm": 6.366091728210449, "learning_rate": 4.818579394349487e-09, "loss": 0.3888, "step": 13415 }, { "epoch": 2.2960807804210166, "grad_norm": 18.334564208984375, "learning_rate": 4.5309835344409245e-09, "loss": 1.6024, "step": 13416 }, { "epoch": 2.2962519253807976, "grad_norm": 14.420116424560547, "learning_rate": 4.252235023585738e-09, "loss": 1.0073, "step": 13417 }, { "epoch": 2.2964230703405786, "grad_norm": 8.695976257324219, "learning_rate": 3.982334026247369e-09, "loss": 0.5519, "step": 13418 }, { "epoch": 2.2965942153003596, "grad_norm": 9.467698097229004, "learning_rate": 3.721280701668439e-09, "loss": 0.5773, "step": 13419 }, { "epoch": 2.2967653602601406, "grad_norm": 149.52406311035156, "learning_rate": 3.469075203870742e-09, "loss": 9.2566, "step": 13420 }, { "epoch": 2.296936505219921, "grad_norm": 2.097064733505249, "learning_rate": 3.225717681655249e-09, "loss": 0.2976, "step": 13421 }, { "epoch": 2.297107650179702, "grad_norm": 19.09964942932129, "learning_rate": 
2.9912082786037744e-09, "loss": 1.5158, "step": 13422 }, { "epoch": 2.297278795139483, "grad_norm": 6.907938003540039, "learning_rate": 2.765547133075641e-09, "loss": 0.5951, "step": 13423 }, { "epoch": 2.297449940099264, "grad_norm": 1.8947793245315552, "learning_rate": 2.5487343782126805e-09, "loss": 0.2085, "step": 13424 }, { "epoch": 2.297621085059045, "grad_norm": 8.273024559020996, "learning_rate": 2.3407701419358994e-09, "loss": 0.5878, "step": 13425 }, { "epoch": 2.297792230018826, "grad_norm": 13.063462257385254, "learning_rate": 2.1416545469421512e-09, "loss": 0.9339, "step": 13426 }, { "epoch": 2.297963374978607, "grad_norm": 23.88918113708496, "learning_rate": 1.951387710710795e-09, "loss": 5.1607, "step": 13427 }, { "epoch": 2.2981345199383876, "grad_norm": 15.747206687927246, "learning_rate": 1.7699697455003661e-09, "loss": 0.9793, "step": 13428 }, { "epoch": 2.2983056648981686, "grad_norm": 1.9992352724075317, "learning_rate": 1.5974007583452466e-09, "loss": 0.2064, "step": 13429 }, { "epoch": 2.2984768098579496, "grad_norm": 8.43206787109375, "learning_rate": 1.4336808510656551e-09, "loss": 0.702, "step": 13430 }, { "epoch": 2.2986479548177305, "grad_norm": 16.719728469848633, "learning_rate": 1.2788101202526602e-09, "loss": 1.5841, "step": 13431 }, { "epoch": 2.2988190997775115, "grad_norm": 0.3122705817222595, "learning_rate": 1.1327886572815028e-09, "loss": 0.0974, "step": 13432 }, { "epoch": 2.2989902447372925, "grad_norm": 5.210422039031982, "learning_rate": 9.956165483049339e-10, "loss": 0.318, "step": 13433 }, { "epoch": 2.2991613896970735, "grad_norm": 12.220215797424316, "learning_rate": 8.672938742565472e-10, "loss": 0.9978, "step": 13434 }, { "epoch": 2.2993325346568545, "grad_norm": 0.3831678032875061, "learning_rate": 7.47820710844116e-10, "loss": 0.0983, "step": 13435 }, { "epoch": 2.2995036796166355, "grad_norm": 9.44739055633545, "learning_rate": 6.371971285595857e-10, "loss": 0.7006, "step": 13436 }, { "epoch": 2.299674824576416, 
"grad_norm": 6.978682518005371, "learning_rate": 5.354231926690822e-10, "loss": 0.517, "step": 13437 }, { "epoch": 2.299845969536197, "grad_norm": 5.299063682556152, "learning_rate": 4.4249896321957305e-10, "loss": 0.5727, "step": 13438 }, { "epoch": 2.300017114495978, "grad_norm": 6.491596221923828, "learning_rate": 3.584244950372018e-10, "loss": 0.3858, "step": 13439 }, { "epoch": 2.300188259455759, "grad_norm": 17.977203369140625, "learning_rate": 2.8319983772562283e-10, "loss": 1.1719, "step": 13440 }, { "epoch": 2.30035940441554, "grad_norm": 7.596632480621338, "learning_rate": 2.168250356676671e-10, "loss": 0.7272, "step": 13441 }, { "epoch": 2.300530549375321, "grad_norm": 7.419238090515137, "learning_rate": 1.593001280236761e-10, "loss": 0.5072, "step": 13442 }, { "epoch": 2.300701694335102, "grad_norm": 12.013089179992676, "learning_rate": 1.106251487348331e-10, "loss": 1.0031, "step": 13443 }, { "epoch": 2.300872839294883, "grad_norm": 9.756840705871582, "learning_rate": 7.080012651983214e-11, "loss": 0.7039, "step": 13444 }, { "epoch": 2.3010439842546635, "grad_norm": 12.588701248168945, "learning_rate": 3.98250848732129e-11, "loss": 0.6942, "step": 13445 }, { "epoch": 2.3012151292144445, "grad_norm": 12.238914489746094, "learning_rate": 1.770004207368725e-11, "loss": 1.0216, "step": 13446 }, { "epoch": 2.3013862741742255, "grad_norm": 11.569670677185059, "learning_rate": 4.4250111708166175e-12, "loss": 0.6971, "step": 13447 }, { "epoch": 2.3015574191340065, "grad_norm": 0.28670382499694824, "learning_rate": 3e-05, "loss": 0.0922, "step": 13448 }, { "epoch": 2.3017285640937875, "grad_norm": 29.105388641357422, "learning_rate": 2.999999557498883e-05, "loss": 3.8052, "step": 13449 }, { "epoch": 2.3018997090535684, "grad_norm": 3.0582339763641357, "learning_rate": 2.999998229995793e-05, "loss": 0.2472, "step": 13450 }, { "epoch": 2.3020708540133494, "grad_norm": 3.0564467906951904, "learning_rate": 2.9999960174915127e-05, "loss": 0.332, "step": 13451 }, { 
"epoch": 2.3022419989731304, "grad_norm": 1.8194841146469116, "learning_rate": 2.999992919987348e-05, "loss": 0.1869, "step": 13452 }, { "epoch": 2.3024131439329114, "grad_norm": 14.897225379943848, "learning_rate": 2.9999889374851267e-05, "loss": 0.9727, "step": 13453 }, { "epoch": 2.302584288892692, "grad_norm": 14.004070281982422, "learning_rate": 2.999984069987198e-05, "loss": 0.8975, "step": 13454 }, { "epoch": 2.302755433852473, "grad_norm": 12.177933692932129, "learning_rate": 2.9999783174964336e-05, "loss": 0.9124, "step": 13455 }, { "epoch": 2.302926578812254, "grad_norm": 2.322572946548462, "learning_rate": 2.9999716800162275e-05, "loss": 0.2406, "step": 13456 }, { "epoch": 2.303097723772035, "grad_norm": 1.3871514797210693, "learning_rate": 2.9999641575504964e-05, "loss": 0.1652, "step": 13457 }, { "epoch": 2.303268868731816, "grad_norm": 17.613101959228516, "learning_rate": 2.9999557501036782e-05, "loss": 1.7859, "step": 13458 }, { "epoch": 2.303440013691597, "grad_norm": 3.3124642372131348, "learning_rate": 2.999946457680733e-05, "loss": 0.2291, "step": 13459 }, { "epoch": 2.303611158651378, "grad_norm": 6.650140285491943, "learning_rate": 2.999936280287144e-05, "loss": 0.3138, "step": 13460 }, { "epoch": 2.3037823036111584, "grad_norm": 10.667584419250488, "learning_rate": 2.9999252179289158e-05, "loss": 0.6826, "step": 13461 }, { "epoch": 2.3039534485709394, "grad_norm": 11.456108093261719, "learning_rate": 2.9999132706125743e-05, "loss": 0.7398, "step": 13462 }, { "epoch": 2.3041245935307204, "grad_norm": 23.460020065307617, "learning_rate": 2.9999004383451696e-05, "loss": 5.0367, "step": 13463 }, { "epoch": 2.3042957384905014, "grad_norm": 20.930160522460938, "learning_rate": 2.9998867211342718e-05, "loss": 2.629, "step": 13464 }, { "epoch": 2.3044668834502824, "grad_norm": 0.39711448550224304, "learning_rate": 2.999872118987975e-05, "loss": 0.1068, "step": 13465 }, { "epoch": 2.3046380284100634, "grad_norm": 15.692044258117676, "learning_rate": 
2.9998566319148938e-05, "loss": 1.1965, "step": 13466 }, { "epoch": 2.3048091733698444, "grad_norm": 0.5872135162353516, "learning_rate": 2.9998402599241654e-05, "loss": 0.1108, "step": 13467 }, { "epoch": 2.3049803183296254, "grad_norm": 19.26517105102539, "learning_rate": 2.99982300302545e-05, "loss": 1.1434, "step": 13468 }, { "epoch": 2.3051514632894063, "grad_norm": 6.772439956665039, "learning_rate": 2.9998048612289287e-05, "loss": 0.5266, "step": 13469 }, { "epoch": 2.305322608249187, "grad_norm": 10.76187515258789, "learning_rate": 2.9997858345453058e-05, "loss": 0.8844, "step": 13470 }, { "epoch": 2.305493753208968, "grad_norm": 6.101515769958496, "learning_rate": 2.9997659229858064e-05, "loss": 0.4726, "step": 13471 }, { "epoch": 2.305664898168749, "grad_norm": 9.352967262268066, "learning_rate": 2.999745126562179e-05, "loss": 0.5655, "step": 13472 }, { "epoch": 2.30583604312853, "grad_norm": 10.767040252685547, "learning_rate": 2.9997234452866925e-05, "loss": 0.81, "step": 13473 }, { "epoch": 2.306007188088311, "grad_norm": 4.6326165199279785, "learning_rate": 2.9997008791721397e-05, "loss": 0.4199, "step": 13474 }, { "epoch": 2.306178333048092, "grad_norm": 3.982828140258789, "learning_rate": 2.9996774282318344e-05, "loss": 0.2879, "step": 13475 }, { "epoch": 2.306349478007873, "grad_norm": 17.469873428344727, "learning_rate": 2.9996530924796127e-05, "loss": 1.4722, "step": 13476 }, { "epoch": 2.3065206229676534, "grad_norm": 113.55764770507812, "learning_rate": 2.9996278719298328e-05, "loss": 7.0471, "step": 13477 }, { "epoch": 2.3066917679274344, "grad_norm": 9.771876335144043, "learning_rate": 2.999601766597375e-05, "loss": 0.7606, "step": 13478 }, { "epoch": 2.3068629128872153, "grad_norm": 15.654747009277344, "learning_rate": 2.9995747764976414e-05, "loss": 0.9017, "step": 13479 }, { "epoch": 2.3070340578469963, "grad_norm": 3.7923543453216553, "learning_rate": 2.999546901646556e-05, "loss": 0.2609, "step": 13480 }, { "epoch": 2.3072052028067773, 
"grad_norm": 6.711348056793213, "learning_rate": 2.9995181420605653e-05, "loss": 0.6736, "step": 13481 }, { "epoch": 2.3073763477665583, "grad_norm": 8.716449737548828, "learning_rate": 2.9994884977566372e-05, "loss": 0.553, "step": 13482 }, { "epoch": 2.3075474927263393, "grad_norm": 9.724431991577148, "learning_rate": 2.9994579687522615e-05, "loss": 0.7369, "step": 13483 }, { "epoch": 2.3077186376861203, "grad_norm": 41.8806037902832, "learning_rate": 2.9994265550654512e-05, "loss": 5.878, "step": 13484 }, { "epoch": 2.3078897826459013, "grad_norm": 14.9148588180542, "learning_rate": 2.9993942567147402e-05, "loss": 1.0895, "step": 13485 }, { "epoch": 2.308060927605682, "grad_norm": 5.252044677734375, "learning_rate": 2.999361073719184e-05, "loss": 0.4002, "step": 13486 }, { "epoch": 2.308232072565463, "grad_norm": 2.6469576358795166, "learning_rate": 2.999327006098362e-05, "loss": 0.2599, "step": 13487 }, { "epoch": 2.308403217525244, "grad_norm": 12.733126640319824, "learning_rate": 2.9992920538723722e-05, "loss": 0.6704, "step": 13488 }, { "epoch": 2.308574362485025, "grad_norm": 20.472309112548828, "learning_rate": 2.999256217061838e-05, "loss": 1.5762, "step": 13489 }, { "epoch": 2.3087455074448058, "grad_norm": 14.49013900756836, "learning_rate": 2.9992194956879027e-05, "loss": 0.9488, "step": 13490 }, { "epoch": 2.3089166524045868, "grad_norm": 18.990846633911133, "learning_rate": 2.9991818897722315e-05, "loss": 1.579, "step": 13491 }, { "epoch": 2.3090877973643678, "grad_norm": 20.917959213256836, "learning_rate": 2.999143399337013e-05, "loss": 2.447, "step": 13492 }, { "epoch": 2.3092589423241487, "grad_norm": 26.9874210357666, "learning_rate": 2.9991040244049556e-05, "loss": 5.3456, "step": 13493 }, { "epoch": 2.3094300872839293, "grad_norm": 17.81285858154297, "learning_rate": 2.999063764999291e-05, "loss": 1.3382, "step": 13494 }, { "epoch": 2.3096012322437103, "grad_norm": 7.484838485717773, "learning_rate": 2.9990226211437717e-05, "loss": 0.7602, 
"step": 13495 }, { "epoch": 2.3097723772034913, "grad_norm": 14.649654388427734, "learning_rate": 2.9989805928626736e-05, "loss": 1.1365, "step": 13496 }, { "epoch": 2.3099435221632723, "grad_norm": 0.512509286403656, "learning_rate": 2.9989376801807933e-05, "loss": 0.1051, "step": 13497 }, { "epoch": 2.3101146671230532, "grad_norm": 16.358963012695312, "learning_rate": 2.998893883123449e-05, "loss": 1.7518, "step": 13498 }, { "epoch": 2.3102858120828342, "grad_norm": 11.641204833984375, "learning_rate": 2.9988492017164812e-05, "loss": 0.8963, "step": 13499 }, { "epoch": 2.3104569570426152, "grad_norm": 10.063977241516113, "learning_rate": 2.9988036359862517e-05, "loss": 0.8237, "step": 13500 }, { "epoch": 2.310628102002396, "grad_norm": 6.366086959838867, "learning_rate": 2.9987571859596446e-05, "loss": 0.3845, "step": 13501 }, { "epoch": 2.310799246962177, "grad_norm": 142.8118896484375, "learning_rate": 2.9987098516640656e-05, "loss": 10.0252, "step": 13502 }, { "epoch": 2.3109703919219577, "grad_norm": 20.527841567993164, "learning_rate": 2.9986616331274415e-05, "loss": 2.3167, "step": 13503 }, { "epoch": 2.3111415368817387, "grad_norm": 17.55230140686035, "learning_rate": 2.998612530378222e-05, "loss": 1.8065, "step": 13504 }, { "epoch": 2.3113126818415197, "grad_norm": 24.12261962890625, "learning_rate": 2.9985625434453774e-05, "loss": 5.0346, "step": 13505 }, { "epoch": 2.3114838268013007, "grad_norm": 3.026292562484741, "learning_rate": 2.9985116723584e-05, "loss": 0.2504, "step": 13506 }, { "epoch": 2.3116549717610817, "grad_norm": 13.314135551452637, "learning_rate": 2.998459917147304e-05, "loss": 0.902, "step": 13507 }, { "epoch": 2.3118261167208627, "grad_norm": 6.1508283615112305, "learning_rate": 2.9984072778426246e-05, "loss": 0.5602, "step": 13508 }, { "epoch": 2.3119972616806437, "grad_norm": 6.550236701965332, "learning_rate": 2.9983537544754203e-05, "loss": 0.6474, "step": 13509 }, { "epoch": 2.3121684066404242, "grad_norm": 9.560619354248047, 
"learning_rate": 2.9982993470772684e-05, "loss": 0.641, "step": 13510 }, { "epoch": 2.312339551600205, "grad_norm": 0.3937772810459137, "learning_rate": 2.99824405568027e-05, "loss": 0.1012, "step": 13511 }, { "epoch": 2.312510696559986, "grad_norm": 6.345602035522461, "learning_rate": 2.9981878803170476e-05, "loss": 0.3704, "step": 13512 }, { "epoch": 2.312681841519767, "grad_norm": 12.034383773803711, "learning_rate": 2.9981308210207444e-05, "loss": 0.8872, "step": 13513 }, { "epoch": 2.312852986479548, "grad_norm": 10.373887062072754, "learning_rate": 2.998072877825025e-05, "loss": 0.7619, "step": 13514 }, { "epoch": 2.313024131439329, "grad_norm": 8.032447814941406, "learning_rate": 2.9980140507640764e-05, "loss": 0.5126, "step": 13515 }, { "epoch": 2.31319527639911, "grad_norm": 17.439775466918945, "learning_rate": 2.9979543398726073e-05, "loss": 1.8122, "step": 13516 }, { "epoch": 2.313366421358891, "grad_norm": 11.42000961303711, "learning_rate": 2.9978937451858457e-05, "loss": 0.8524, "step": 13517 }, { "epoch": 2.313537566318672, "grad_norm": 21.92038345336914, "learning_rate": 2.997832266739544e-05, "loss": 4.8309, "step": 13518 }, { "epoch": 2.3137087112784527, "grad_norm": 2.194136619567871, "learning_rate": 2.9977699045699735e-05, "loss": 0.1839, "step": 13519 }, { "epoch": 2.3138798562382337, "grad_norm": 2.6012978553771973, "learning_rate": 2.9977066587139287e-05, "loss": 0.2142, "step": 13520 }, { "epoch": 2.3140510011980147, "grad_norm": 0.4266297519207001, "learning_rate": 2.9976425292087245e-05, "loss": 0.1005, "step": 13521 }, { "epoch": 2.3142221461577956, "grad_norm": 12.075187683105469, "learning_rate": 2.997577516092197e-05, "loss": 0.9264, "step": 13522 }, { "epoch": 2.3143932911175766, "grad_norm": 5.369175910949707, "learning_rate": 2.9975116194027046e-05, "loss": 0.493, "step": 13523 }, { "epoch": 2.3145644360773576, "grad_norm": 9.043492317199707, "learning_rate": 2.9974448391791265e-05, "loss": 0.7004, "step": 13524 }, { "epoch": 
2.3147355810371386, "grad_norm": 0.7850273251533508, "learning_rate": 2.997377175460863e-05, "loss": 0.1122, "step": 13525 }, { "epoch": 2.314906725996919, "grad_norm": 16.037662506103516, "learning_rate": 2.9973086282878353e-05, "loss": 1.4624, "step": 13526 }, { "epoch": 2.3150778709567, "grad_norm": 16.76435661315918, "learning_rate": 2.9972391977004867e-05, "loss": 1.0742, "step": 13527 }, { "epoch": 2.315249015916481, "grad_norm": 13.71143627166748, "learning_rate": 2.9971688837397816e-05, "loss": 1.0321, "step": 13528 }, { "epoch": 2.315420160876262, "grad_norm": 10.121928215026855, "learning_rate": 2.997097686447205e-05, "loss": 0.7134, "step": 13529 }, { "epoch": 2.315591305836043, "grad_norm": 1.8966169357299805, "learning_rate": 2.9970256058647636e-05, "loss": 0.1612, "step": 13530 }, { "epoch": 2.315762450795824, "grad_norm": 25.548995971679688, "learning_rate": 2.996952642034985e-05, "loss": 5.1037, "step": 13531 }, { "epoch": 2.315933595755605, "grad_norm": 15.279106140136719, "learning_rate": 2.996878795000918e-05, "loss": 1.4143, "step": 13532 }, { "epoch": 2.316104740715386, "grad_norm": 14.476461410522461, "learning_rate": 2.9968040648061328e-05, "loss": 1.137, "step": 13533 }, { "epoch": 2.316275885675167, "grad_norm": 9.378069877624512, "learning_rate": 2.9967284514947192e-05, "loss": 0.5779, "step": 13534 }, { "epoch": 2.3164470306349476, "grad_norm": 14.250051498413086, "learning_rate": 2.9966519551112904e-05, "loss": 0.9389, "step": 13535 }, { "epoch": 2.3166181755947286, "grad_norm": 13.766907691955566, "learning_rate": 2.9965745757009784e-05, "loss": 1.0328, "step": 13536 }, { "epoch": 2.3167893205545096, "grad_norm": 14.353287696838379, "learning_rate": 2.996496313309438e-05, "loss": 1.0963, "step": 13537 }, { "epoch": 2.3169604655142906, "grad_norm": 10.25852108001709, "learning_rate": 2.9964171679828438e-05, "loss": 0.6613, "step": 13538 }, { "epoch": 2.3171316104740716, "grad_norm": 10.67443561553955, "learning_rate": 
2.9963371397678912e-05, "loss": 0.77, "step": 13539 }, { "epoch": 2.3173027554338526, "grad_norm": 14.299753189086914, "learning_rate": 2.9962562287117978e-05, "loss": 1.6681, "step": 13540 }, { "epoch": 2.3174739003936335, "grad_norm": 6.408872127532959, "learning_rate": 2.996174434862301e-05, "loss": 0.5752, "step": 13541 }, { "epoch": 2.3176450453534145, "grad_norm": 6.911219596862793, "learning_rate": 2.996091758267659e-05, "loss": 0.5009, "step": 13542 }, { "epoch": 2.317816190313195, "grad_norm": 16.417011260986328, "learning_rate": 2.996008198976651e-05, "loss": 1.8421, "step": 13543 }, { "epoch": 2.317987335272976, "grad_norm": 9.013779640197754, "learning_rate": 2.9959237570385777e-05, "loss": 0.6316, "step": 13544 }, { "epoch": 2.318158480232757, "grad_norm": 19.166305541992188, "learning_rate": 2.995838432503259e-05, "loss": 1.014, "step": 13545 }, { "epoch": 2.318329625192538, "grad_norm": 10.058900833129883, "learning_rate": 2.995752225421038e-05, "loss": 0.8148, "step": 13546 }, { "epoch": 2.318500770152319, "grad_norm": 2.0493733882904053, "learning_rate": 2.9956651358427758e-05, "loss": 0.2157, "step": 13547 }, { "epoch": 2.3186719151121, "grad_norm": 16.41203498840332, "learning_rate": 2.9955771638198552e-05, "loss": 1.8167, "step": 13548 }, { "epoch": 2.318843060071881, "grad_norm": 2.5444204807281494, "learning_rate": 2.9954883094041813e-05, "loss": 0.2169, "step": 13549 }, { "epoch": 2.319014205031662, "grad_norm": 0.4898788630962372, "learning_rate": 2.995398572648177e-05, "loss": 0.1075, "step": 13550 }, { "epoch": 2.319185349991443, "grad_norm": 27.161680221557617, "learning_rate": 2.995307953604787e-05, "loss": 1.0701, "step": 13551 }, { "epoch": 2.3193564949512235, "grad_norm": 8.024944305419922, "learning_rate": 2.9952164523274775e-05, "loss": 0.6824, "step": 13552 }, { "epoch": 2.3195276399110045, "grad_norm": 8.705029487609863, "learning_rate": 2.9951240688702342e-05, "loss": 0.6371, "step": 13553 }, { "epoch": 2.3196987848707855, 
"grad_norm": 8.669548034667969, "learning_rate": 2.9950308032875634e-05, "loss": 0.6821, "step": 13554 }, { "epoch": 2.3198699298305665, "grad_norm": 53.236175537109375, "learning_rate": 2.9949366556344916e-05, "loss": 6.0931, "step": 13555 }, { "epoch": 2.3200410747903475, "grad_norm": 12.099916458129883, "learning_rate": 2.9948416259665665e-05, "loss": 0.8741, "step": 13556 }, { "epoch": 2.3202122197501285, "grad_norm": 1.9536808729171753, "learning_rate": 2.9947457143398554e-05, "loss": 0.2053, "step": 13557 }, { "epoch": 2.3203833647099095, "grad_norm": 11.091364860534668, "learning_rate": 2.9946489208109468e-05, "loss": 0.8208, "step": 13558 }, { "epoch": 2.32055450966969, "grad_norm": 3.978388547897339, "learning_rate": 2.9945512454369482e-05, "loss": 0.425, "step": 13559 }, { "epoch": 2.320725654629471, "grad_norm": 8.882526397705078, "learning_rate": 2.9944526882754894e-05, "loss": 0.7787, "step": 13560 }, { "epoch": 2.320896799589252, "grad_norm": 13.874956130981445, "learning_rate": 2.994353249384718e-05, "loss": 1.49, "step": 13561 }, { "epoch": 2.321067944549033, "grad_norm": 28.158525466918945, "learning_rate": 2.994252928823304e-05, "loss": 5.144, "step": 13562 }, { "epoch": 2.321239089508814, "grad_norm": 0.4509497880935669, "learning_rate": 2.9941517266504363e-05, "loss": 0.107, "step": 13563 }, { "epoch": 2.321410234468595, "grad_norm": 17.02088165283203, "learning_rate": 2.994049642925824e-05, "loss": 1.5822, "step": 13564 }, { "epoch": 2.321581379428376, "grad_norm": 3.0687336921691895, "learning_rate": 2.9939466777096975e-05, "loss": 0.2425, "step": 13565 }, { "epoch": 2.321752524388157, "grad_norm": 15.129426002502441, "learning_rate": 2.9938428310628057e-05, "loss": 1.2529, "step": 13566 }, { "epoch": 2.321923669347938, "grad_norm": 11.726180076599121, "learning_rate": 2.9937381030464187e-05, "loss": 0.8374, "step": 13567 }, { "epoch": 2.3220948143077185, "grad_norm": 9.523625373840332, "learning_rate": 2.9936324937223266e-05, "loss": 0.766, 
"step": 13568 }, { "epoch": 2.3222659592674995, "grad_norm": 8.773947715759277, "learning_rate": 2.9935260031528377e-05, "loss": 0.7165, "step": 13569 }, { "epoch": 2.3224371042272804, "grad_norm": 11.207303047180176, "learning_rate": 2.993418631400783e-05, "loss": 0.9701, "step": 13570 }, { "epoch": 2.3226082491870614, "grad_norm": 0.4360193610191345, "learning_rate": 2.993310378529511e-05, "loss": 0.1042, "step": 13571 }, { "epoch": 2.3227793941468424, "grad_norm": 12.791950225830078, "learning_rate": 2.9932012446028916e-05, "loss": 0.8737, "step": 13572 }, { "epoch": 2.3229505391066234, "grad_norm": 0.282757043838501, "learning_rate": 2.9930912296853136e-05, "loss": 0.0921, "step": 13573 }, { "epoch": 2.3231216840664044, "grad_norm": 17.103870391845703, "learning_rate": 2.9929803338416863e-05, "loss": 1.553, "step": 13574 }, { "epoch": 2.323292829026185, "grad_norm": 0.4984229803085327, "learning_rate": 2.992868557137438e-05, "loss": 0.1055, "step": 13575 }, { "epoch": 2.323463973985966, "grad_norm": 10.396992683410645, "learning_rate": 2.992755899638518e-05, "loss": 0.829, "step": 13576 }, { "epoch": 2.323635118945747, "grad_norm": 2.2982115745544434, "learning_rate": 2.9926423614113935e-05, "loss": 0.2054, "step": 13577 }, { "epoch": 2.323806263905528, "grad_norm": 6.576963424682617, "learning_rate": 2.9925279425230528e-05, "loss": 0.4213, "step": 13578 }, { "epoch": 2.323977408865309, "grad_norm": 16.13943862915039, "learning_rate": 2.9924126430410022e-05, "loss": 1.0505, "step": 13579 }, { "epoch": 2.32414855382509, "grad_norm": 18.86705780029297, "learning_rate": 2.99229646303327e-05, "loss": 1.9488, "step": 13580 }, { "epoch": 2.324319698784871, "grad_norm": 20.89632225036621, "learning_rate": 2.992179402568402e-05, "loss": 2.0681, "step": 13581 }, { "epoch": 2.324490843744652, "grad_norm": 20.368547439575195, "learning_rate": 2.9920614617154634e-05, "loss": 2.0414, "step": 13582 }, { "epoch": 2.324661988704433, "grad_norm": 2.075364112854004, 
"learning_rate": 2.9919426405440406e-05, "loss": 0.1877, "step": 13583 }, { "epoch": 2.3248331336642134, "grad_norm": 5.38266658782959, "learning_rate": 2.991822939124237e-05, "loss": 0.3743, "step": 13584 }, { "epoch": 2.3250042786239944, "grad_norm": 10.65040111541748, "learning_rate": 2.9917023575266778e-05, "loss": 0.7766, "step": 13585 }, { "epoch": 2.3251754235837754, "grad_norm": 14.419092178344727, "learning_rate": 2.9915808958225057e-05, "loss": 1.2346, "step": 13586 }, { "epoch": 2.3253465685435564, "grad_norm": 13.391254425048828, "learning_rate": 2.9914585540833836e-05, "loss": 0.6888, "step": 13587 }, { "epoch": 2.3255177135033374, "grad_norm": 5.2972540855407715, "learning_rate": 2.9913353323814928e-05, "loss": 0.4483, "step": 13588 }, { "epoch": 2.3256888584631183, "grad_norm": 0.4817509055137634, "learning_rate": 2.991211230789535e-05, "loss": 0.1059, "step": 13589 }, { "epoch": 2.3258600034228993, "grad_norm": 8.15400218963623, "learning_rate": 2.9910862493807297e-05, "loss": 0.5673, "step": 13590 }, { "epoch": 2.32603114838268, "grad_norm": 8.92547607421875, "learning_rate": 2.9909603882288167e-05, "loss": 0.6382, "step": 13591 }, { "epoch": 2.326202293342461, "grad_norm": 18.289764404296875, "learning_rate": 2.9908336474080534e-05, "loss": 1.4232, "step": 13592 }, { "epoch": 2.326373438302242, "grad_norm": 2.053123712539673, "learning_rate": 2.9907060269932176e-05, "loss": 0.2317, "step": 13593 }, { "epoch": 2.326544583262023, "grad_norm": 9.714186668395996, "learning_rate": 2.9905775270596058e-05, "loss": 0.5306, "step": 13594 }, { "epoch": 2.326715728221804, "grad_norm": 17.172286987304688, "learning_rate": 2.990448147683033e-05, "loss": 1.0989, "step": 13595 }, { "epoch": 2.326886873181585, "grad_norm": 6.693421840667725, "learning_rate": 2.9903178889398325e-05, "loss": 0.3307, "step": 13596 }, { "epoch": 2.327058018141366, "grad_norm": 7.3236002922058105, "learning_rate": 2.9901867509068582e-05, "loss": 0.3592, "step": 13597 }, { "epoch": 
2.327229163101147, "grad_norm": 18.839374542236328, "learning_rate": 2.9900547336614815e-05, "loss": 2.074, "step": 13598 }, { "epoch": 2.327400308060928, "grad_norm": 18.971776962280273, "learning_rate": 2.9899218372815923e-05, "loss": 1.4347, "step": 13599 }, { "epoch": 2.327571453020709, "grad_norm": 24.544099807739258, "learning_rate": 2.9897880618456e-05, "loss": 2.6806, "step": 13600 }, { "epoch": 2.3277425979804893, "grad_norm": 8.732476234436035, "learning_rate": 2.989653407432432e-05, "loss": 0.7522, "step": 13601 }, { "epoch": 2.3279137429402703, "grad_norm": 15.744263648986816, "learning_rate": 2.989517874121535e-05, "loss": 0.9242, "step": 13602 }, { "epoch": 2.3280848879000513, "grad_norm": 0.4875379800796509, "learning_rate": 2.9893814619928737e-05, "loss": 0.1062, "step": 13603 }, { "epoch": 2.3282560328598323, "grad_norm": 4.948946475982666, "learning_rate": 2.9892441711269315e-05, "loss": 0.4243, "step": 13604 }, { "epoch": 2.3284271778196133, "grad_norm": 5.619512557983398, "learning_rate": 2.9891060016047097e-05, "loss": 0.3446, "step": 13605 }, { "epoch": 2.3285983227793943, "grad_norm": 13.998258590698242, "learning_rate": 2.9889669535077294e-05, "loss": 1.128, "step": 13606 }, { "epoch": 2.3287694677391753, "grad_norm": 13.922667503356934, "learning_rate": 2.988827026918029e-05, "loss": 1.0015, "step": 13607 }, { "epoch": 2.328940612698956, "grad_norm": 17.19293212890625, "learning_rate": 2.988686221918165e-05, "loss": 1.1726, "step": 13608 }, { "epoch": 2.329111757658737, "grad_norm": 10.506680488586426, "learning_rate": 2.9885445385912124e-05, "loss": 0.8039, "step": 13609 }, { "epoch": 2.329282902618518, "grad_norm": 5.490754127502441, "learning_rate": 2.988401977020765e-05, "loss": 0.4805, "step": 13610 }, { "epoch": 2.3294540475782988, "grad_norm": 11.616114616394043, "learning_rate": 2.9882585372909348e-05, "loss": 0.8198, "step": 13611 }, { "epoch": 2.3296251925380798, "grad_norm": 30.416839599609375, "learning_rate": 
2.9881142194863503e-05, "loss": 5.6632, "step": 13612 }, { "epoch": 2.3297963374978607, "grad_norm": 119.79792785644531, "learning_rate": 2.9879690236921604e-05, "loss": 7.7069, "step": 13613 }, { "epoch": 2.3299674824576417, "grad_norm": 11.294135093688965, "learning_rate": 2.9878229499940296e-05, "loss": 0.9175, "step": 13614 }, { "epoch": 2.3301386274174227, "grad_norm": 19.286046981811523, "learning_rate": 2.987675998478143e-05, "loss": 1.472, "step": 13615 }, { "epoch": 2.3303097723772037, "grad_norm": 8.879682540893555, "learning_rate": 2.9875281692312005e-05, "loss": 0.5536, "step": 13616 }, { "epoch": 2.3304809173369843, "grad_norm": 19.329744338989258, "learning_rate": 2.987379462340423e-05, "loss": 1.6448, "step": 13617 }, { "epoch": 2.3306520622967652, "grad_norm": 19.19766616821289, "learning_rate": 2.9872298778935472e-05, "loss": 2.626, "step": 13618 }, { "epoch": 2.3308232072565462, "grad_norm": 1.2703230381011963, "learning_rate": 2.987079415978829e-05, "loss": 0.1807, "step": 13619 }, { "epoch": 2.3309943522163272, "grad_norm": 6.04038143157959, "learning_rate": 2.9869280766850397e-05, "loss": 0.5156, "step": 13620 }, { "epoch": 2.331165497176108, "grad_norm": 17.113452911376953, "learning_rate": 2.986775860101471e-05, "loss": 1.5124, "step": 13621 }, { "epoch": 2.331336642135889, "grad_norm": 77.8150863647461, "learning_rate": 2.98662276631793e-05, "loss": 8.2717, "step": 13622 }, { "epoch": 2.33150778709567, "grad_norm": 9.993204116821289, "learning_rate": 2.9864687954247423e-05, "loss": 0.6149, "step": 13623 }, { "epoch": 2.3316789320554507, "grad_norm": 14.614229202270508, "learning_rate": 2.9863139475127515e-05, "loss": 1.0528, "step": 13624 }, { "epoch": 2.3318500770152317, "grad_norm": 7.8276686668396, "learning_rate": 2.9861582226733176e-05, "loss": 0.5441, "step": 13625 }, { "epoch": 2.3320212219750127, "grad_norm": 0.47913220524787903, "learning_rate": 2.9860016209983187e-05, "loss": 0.1137, "step": 13626 }, { "epoch": 2.3321923669347937, 
"grad_norm": 5.279212951660156, "learning_rate": 2.985844142580151e-05, "loss": 0.408, "step": 13627 }, { "epoch": 2.3323635118945747, "grad_norm": 10.494958877563477, "learning_rate": 2.9856857875117254e-05, "loss": 0.7636, "step": 13628 }, { "epoch": 2.3325346568543557, "grad_norm": 14.683012008666992, "learning_rate": 2.985526555886472e-05, "loss": 1.0069, "step": 13629 }, { "epoch": 2.3327058018141367, "grad_norm": 40.30543899536133, "learning_rate": 2.9853664477983382e-05, "loss": 5.178, "step": 13630 }, { "epoch": 2.3328769467739177, "grad_norm": 12.541032791137695, "learning_rate": 2.9852054633417885e-05, "loss": 0.9646, "step": 13631 }, { "epoch": 2.3330480917336986, "grad_norm": 10.344924926757812, "learning_rate": 2.985043602611803e-05, "loss": 0.6384, "step": 13632 }, { "epoch": 2.333219236693479, "grad_norm": 130.45193481445312, "learning_rate": 2.9848808657038795e-05, "loss": 8.8699, "step": 13633 }, { "epoch": 2.33339038165326, "grad_norm": 17.202491760253906, "learning_rate": 2.9847172527140338e-05, "loss": 1.918, "step": 13634 }, { "epoch": 2.333561526613041, "grad_norm": 14.694887161254883, "learning_rate": 2.9845527637387974e-05, "loss": 0.9756, "step": 13635 }, { "epoch": 2.333732671572822, "grad_norm": 15.296918869018555, "learning_rate": 2.9843873988752198e-05, "loss": 1.3816, "step": 13636 }, { "epoch": 2.333903816532603, "grad_norm": 16.934423446655273, "learning_rate": 2.9842211582208656e-05, "loss": 0.878, "step": 13637 }, { "epoch": 2.334074961492384, "grad_norm": 21.434900283813477, "learning_rate": 2.984054041873817e-05, "loss": 2.5928, "step": 13638 }, { "epoch": 2.334246106452165, "grad_norm": 18.25446319580078, "learning_rate": 2.983886049932674e-05, "loss": 2.1814, "step": 13639 }, { "epoch": 2.3344172514119457, "grad_norm": 3.20064377784729, "learning_rate": 2.9837171824965506e-05, "loss": 0.4068, "step": 13640 }, { "epoch": 2.3345883963717267, "grad_norm": 9.222538948059082, "learning_rate": 2.9835474396650802e-05, "loss": 0.5608, 
"step": 13641 }, { "epoch": 2.3347595413315076, "grad_norm": 15.78625202178955, "learning_rate": 2.98337682153841e-05, "loss": 1.0487, "step": 13642 }, { "epoch": 2.3349306862912886, "grad_norm": 24.31590461730957, "learning_rate": 2.9832053282172065e-05, "loss": 5.1187, "step": 13643 }, { "epoch": 2.3351018312510696, "grad_norm": 10.210701942443848, "learning_rate": 2.9830329598026498e-05, "loss": 0.8297, "step": 13644 }, { "epoch": 2.3352729762108506, "grad_norm": 4.183729648590088, "learning_rate": 2.982859716396438e-05, "loss": 0.3438, "step": 13645 }, { "epoch": 2.3354441211706316, "grad_norm": 101.57716369628906, "learning_rate": 2.9826855981007845e-05, "loss": 7.8894, "step": 13646 }, { "epoch": 2.3356152661304126, "grad_norm": 17.536670684814453, "learning_rate": 2.9825106050184198e-05, "loss": 1.6956, "step": 13647 }, { "epoch": 2.3357864110901936, "grad_norm": 18.1890811920166, "learning_rate": 2.9823347372525905e-05, "loss": 1.2158, "step": 13648 }, { "epoch": 2.3359575560499746, "grad_norm": 12.747384071350098, "learning_rate": 2.9821579949070573e-05, "loss": 1.0252, "step": 13649 }, { "epoch": 2.336128701009755, "grad_norm": 13.792305946350098, "learning_rate": 2.9819803780861006e-05, "loss": 1.1196, "step": 13650 }, { "epoch": 2.336299845969536, "grad_norm": 13.73145580291748, "learning_rate": 2.981801886894513e-05, "loss": 1.1894, "step": 13651 }, { "epoch": 2.336470990929317, "grad_norm": 26.322633743286133, "learning_rate": 2.9816225214376052e-05, "loss": 4.9301, "step": 13652 }, { "epoch": 2.336642135889098, "grad_norm": 0.9272482991218567, "learning_rate": 2.9814422818212032e-05, "loss": 0.1622, "step": 13653 }, { "epoch": 2.336813280848879, "grad_norm": 1.991828441619873, "learning_rate": 2.981261168151648e-05, "loss": 0.1887, "step": 13654 }, { "epoch": 2.33698442580866, "grad_norm": 12.076228141784668, "learning_rate": 2.981079180535797e-05, "loss": 0.9187, "step": 13655 }, { "epoch": 2.337155570768441, "grad_norm": 11.75818157196045, 
"learning_rate": 2.980896319081024e-05, "loss": 1.1033, "step": 13656 }, { "epoch": 2.3373267157282216, "grad_norm": 16.91015625, "learning_rate": 2.9807125838952168e-05, "loss": 1.7965, "step": 13657 }, { "epoch": 2.3374978606880026, "grad_norm": 0.7032408714294434, "learning_rate": 2.98052797508678e-05, "loss": 0.1584, "step": 13658 }, { "epoch": 2.3376690056477836, "grad_norm": 9.60195541381836, "learning_rate": 2.980342492764632e-05, "loss": 0.751, "step": 13659 }, { "epoch": 2.3378401506075646, "grad_norm": 19.59812355041504, "learning_rate": 2.980156137038209e-05, "loss": 1.4133, "step": 13660 }, { "epoch": 2.3380112955673455, "grad_norm": 17.834918975830078, "learning_rate": 2.97996890801746e-05, "loss": 1.3906, "step": 13661 }, { "epoch": 2.3381824405271265, "grad_norm": 9.582483291625977, "learning_rate": 2.9797808058128513e-05, "loss": 0.6168, "step": 13662 }, { "epoch": 2.3383535854869075, "grad_norm": 18.737794876098633, "learning_rate": 2.979591830535363e-05, "loss": 1.4623, "step": 13663 }, { "epoch": 2.3385247304466885, "grad_norm": 12.913202285766602, "learning_rate": 2.979401982296491e-05, "loss": 0.8554, "step": 13664 }, { "epoch": 2.3386958754064695, "grad_norm": 0.42290905117988586, "learning_rate": 2.9792112612082455e-05, "loss": 0.103, "step": 13665 }, { "epoch": 2.33886702036625, "grad_norm": 2.219910144805908, "learning_rate": 2.9790196673831532e-05, "loss": 0.2009, "step": 13666 }, { "epoch": 2.339038165326031, "grad_norm": 7.253404140472412, "learning_rate": 2.9788272009342534e-05, "loss": 0.5455, "step": 13667 }, { "epoch": 2.339209310285812, "grad_norm": 13.232219696044922, "learning_rate": 2.978633861975103e-05, "loss": 1.0165, "step": 13668 }, { "epoch": 2.339380455245593, "grad_norm": 20.18370246887207, "learning_rate": 2.978439650619772e-05, "loss": 2.0782, "step": 13669 }, { "epoch": 2.339551600205374, "grad_norm": 18.754037857055664, "learning_rate": 2.9782445669828445e-05, "loss": 1.6514, "step": 13670 }, { "epoch": 
2.339722745165155, "grad_norm": 0.4430415630340576, "learning_rate": 2.978048611179421e-05, "loss": 0.1004, "step": 13671 }, { "epoch": 2.339893890124936, "grad_norm": 110.7207260131836, "learning_rate": 2.977851783325115e-05, "loss": 7.0141, "step": 13672 }, { "epoch": 2.3400650350847165, "grad_norm": 2.5121407508850098, "learning_rate": 2.977654083536056e-05, "loss": 0.2178, "step": 13673 }, { "epoch": 2.3402361800444975, "grad_norm": 5.153434753417969, "learning_rate": 2.9774555119288868e-05, "loss": 0.4074, "step": 13674 }, { "epoch": 2.3404073250042785, "grad_norm": 65.05607604980469, "learning_rate": 2.977256068620765e-05, "loss": 6.4684, "step": 13675 }, { "epoch": 2.3405784699640595, "grad_norm": 4.350557327270508, "learning_rate": 2.9770557537293624e-05, "loss": 0.5253, "step": 13676 }, { "epoch": 2.3407496149238405, "grad_norm": 12.657404899597168, "learning_rate": 2.976854567372865e-05, "loss": 0.983, "step": 13677 }, { "epoch": 2.3409207598836215, "grad_norm": 4.565290927886963, "learning_rate": 2.976652509669973e-05, "loss": 0.3869, "step": 13678 }, { "epoch": 2.3410919048434025, "grad_norm": 14.06399917602539, "learning_rate": 2.976449580739901e-05, "loss": 0.7895, "step": 13679 }, { "epoch": 2.3412630498031834, "grad_norm": 16.707382202148438, "learning_rate": 2.976245780702377e-05, "loss": 1.1729, "step": 13680 }, { "epoch": 2.3414341947629644, "grad_norm": 12.520792961120605, "learning_rate": 2.9760411096776442e-05, "loss": 1.0882, "step": 13681 }, { "epoch": 2.341605339722745, "grad_norm": 7.543610572814941, "learning_rate": 2.9758355677864574e-05, "loss": 0.532, "step": 13682 }, { "epoch": 2.341776484682526, "grad_norm": 6.221946716308594, "learning_rate": 2.975629155150088e-05, "loss": 0.6056, "step": 13683 }, { "epoch": 2.341947629642307, "grad_norm": 10.001294136047363, "learning_rate": 2.975421871890319e-05, "loss": 0.8762, "step": 13684 }, { "epoch": 2.342118774602088, "grad_norm": 35.06763458251953, "learning_rate": 2.975213718129448e-05, 
"loss": 5.0711, "step": 13685 }, { "epoch": 2.342289919561869, "grad_norm": 8.910524368286133, "learning_rate": 2.975004693990286e-05, "loss": 0.5254, "step": 13686 }, { "epoch": 2.34246106452165, "grad_norm": 0.4518713355064392, "learning_rate": 2.9747947995961572e-05, "loss": 0.1019, "step": 13687 }, { "epoch": 2.342632209481431, "grad_norm": 18.958961486816406, "learning_rate": 2.9745840350709006e-05, "loss": 1.1865, "step": 13688 }, { "epoch": 2.3428033544412115, "grad_norm": 1.6435120105743408, "learning_rate": 2.974372400538867e-05, "loss": 0.1919, "step": 13689 }, { "epoch": 2.3429744994009924, "grad_norm": 10.764307975769043, "learning_rate": 2.9741598961249205e-05, "loss": 0.8232, "step": 13690 }, { "epoch": 2.3431456443607734, "grad_norm": 14.981731414794922, "learning_rate": 2.97394652195444e-05, "loss": 1.1402, "step": 13691 }, { "epoch": 2.3433167893205544, "grad_norm": 15.344682693481445, "learning_rate": 2.9737322781533158e-05, "loss": 1.249, "step": 13692 }, { "epoch": 2.3434879342803354, "grad_norm": 9.51612663269043, "learning_rate": 2.9735171648479534e-05, "loss": 0.5791, "step": 13693 }, { "epoch": 2.3436590792401164, "grad_norm": 11.520572662353516, "learning_rate": 2.973301182165268e-05, "loss": 0.8384, "step": 13694 }, { "epoch": 2.3438302241998974, "grad_norm": 4.6705169677734375, "learning_rate": 2.973084330232691e-05, "loss": 0.4922, "step": 13695 }, { "epoch": 2.3440013691596784, "grad_norm": 1.0082848072052002, "learning_rate": 2.972866609178165e-05, "loss": 0.1449, "step": 13696 }, { "epoch": 2.3441725141194594, "grad_norm": 7.143579006195068, "learning_rate": 2.972648019130146e-05, "loss": 1.0498, "step": 13697 }, { "epoch": 2.3443436590792404, "grad_norm": 6.577096462249756, "learning_rate": 2.972428560217602e-05, "loss": 0.5404, "step": 13698 }, { "epoch": 2.344514804039021, "grad_norm": 13.688665390014648, "learning_rate": 2.972208232570015e-05, "loss": 1.0094, "step": 13699 }, { "epoch": 2.344685948998802, "grad_norm": 
13.726154327392578, "learning_rate": 2.971987036317377e-05, "loss": 0.8359, "step": 13700 }, { "epoch": 2.344857093958583, "grad_norm": 14.923665046691895, "learning_rate": 2.9717649715901956e-05, "loss": 0.9738, "step": 13701 }, { "epoch": 2.345028238918364, "grad_norm": 10.530733108520508, "learning_rate": 2.971542038519489e-05, "loss": 0.6797, "step": 13702 }, { "epoch": 2.345199383878145, "grad_norm": 9.854372024536133, "learning_rate": 2.9713182372367877e-05, "loss": 1.0083, "step": 13703 }, { "epoch": 2.345370528837926, "grad_norm": 18.520357131958008, "learning_rate": 2.9710935678741347e-05, "loss": 1.3566, "step": 13704 }, { "epoch": 2.345541673797707, "grad_norm": 19.08574104309082, "learning_rate": 2.970868030564086e-05, "loss": 2.0367, "step": 13705 }, { "epoch": 2.3457128187574874, "grad_norm": 10.670495986938477, "learning_rate": 2.9706416254397077e-05, "loss": 0.682, "step": 13706 }, { "epoch": 2.3458839637172684, "grad_norm": 12.748114585876465, "learning_rate": 2.970414352634581e-05, "loss": 0.6034, "step": 13707 }, { "epoch": 2.3460551086770494, "grad_norm": 0.46030059456825256, "learning_rate": 2.9701862122827953e-05, "loss": 0.1011, "step": 13708 }, { "epoch": 2.3462262536368304, "grad_norm": 5.507357120513916, "learning_rate": 2.9699572045189546e-05, "loss": 0.3089, "step": 13709 }, { "epoch": 2.3463973985966113, "grad_norm": 4.074306964874268, "learning_rate": 2.9697273294781744e-05, "loss": 0.276, "step": 13710 }, { "epoch": 2.3465685435563923, "grad_norm": 10.090456008911133, "learning_rate": 2.96949658729608e-05, "loss": 0.7904, "step": 13711 }, { "epoch": 2.3467396885161733, "grad_norm": 14.861419677734375, "learning_rate": 2.9692649781088103e-05, "loss": 1.2557, "step": 13712 }, { "epoch": 2.3469108334759543, "grad_norm": 11.074277877807617, "learning_rate": 2.969032502053016e-05, "loss": 0.7781, "step": 13713 }, { "epoch": 2.3470819784357353, "grad_norm": 22.564245223999023, "learning_rate": 2.968799159265857e-05, "loss": 2.4523, "step": 
13714 }, { "epoch": 2.347253123395516, "grad_norm": 12.7288179397583, "learning_rate": 2.968564949885006e-05, "loss": 0.7924, "step": 13715 }, { "epoch": 2.347424268355297, "grad_norm": 16.045734405517578, "learning_rate": 2.968329874048647e-05, "loss": 1.1793, "step": 13716 }, { "epoch": 2.347595413315078, "grad_norm": 18.389976501464844, "learning_rate": 2.968093931895476e-05, "loss": 1.7231, "step": 13717 }, { "epoch": 2.347766558274859, "grad_norm": 3.509699583053589, "learning_rate": 2.9678571235646986e-05, "loss": 0.2603, "step": 13718 }, { "epoch": 2.34793770323464, "grad_norm": 5.619528293609619, "learning_rate": 2.9676194491960313e-05, "loss": 0.2963, "step": 13719 }, { "epoch": 2.348108848194421, "grad_norm": 19.894046783447266, "learning_rate": 2.9673809089297034e-05, "loss": 2.0399, "step": 13720 }, { "epoch": 2.3482799931542018, "grad_norm": 10.68710994720459, "learning_rate": 2.967141502906454e-05, "loss": 1.0016, "step": 13721 }, { "epoch": 2.3484511381139823, "grad_norm": 10.290054321289062, "learning_rate": 2.9669012312675324e-05, "loss": 0.6946, "step": 13722 }, { "epoch": 2.3486222830737633, "grad_norm": 1.4068338871002197, "learning_rate": 2.966660094154699e-05, "loss": 0.1543, "step": 13723 }, { "epoch": 2.3487934280335443, "grad_norm": 9.512953758239746, "learning_rate": 2.9664180917102255e-05, "loss": 0.6003, "step": 13724 }, { "epoch": 2.3489645729933253, "grad_norm": 19.7421817779541, "learning_rate": 2.966175224076894e-05, "loss": 2.1443, "step": 13725 }, { "epoch": 2.3491357179531063, "grad_norm": 10.73427677154541, "learning_rate": 2.965931491397997e-05, "loss": 0.7377, "step": 13726 }, { "epoch": 2.3493068629128873, "grad_norm": 1.2298129796981812, "learning_rate": 2.9656868938173357e-05, "loss": 0.1304, "step": 13727 }, { "epoch": 2.3494780078726683, "grad_norm": 0.5983521342277527, "learning_rate": 2.965441431479224e-05, "loss": 0.1045, "step": 13728 }, { "epoch": 2.3496491528324492, "grad_norm": 0.41896769404411316, "learning_rate": 
2.9651951045284857e-05, "loss": 0.1012, "step": 13729 }, { "epoch": 2.3498202977922302, "grad_norm": 3.764270782470703, "learning_rate": 2.9649479131104533e-05, "loss": 0.3782, "step": 13730 }, { "epoch": 2.3499914427520108, "grad_norm": 0.4089379906654358, "learning_rate": 2.9646998573709693e-05, "loss": 0.1033, "step": 13731 }, { "epoch": 2.3501625877117918, "grad_norm": 13.618885040283203, "learning_rate": 2.9644509374563887e-05, "loss": 1.0345, "step": 13732 }, { "epoch": 2.3503337326715728, "grad_norm": 1.5260969400405884, "learning_rate": 2.9642011535135733e-05, "loss": 0.185, "step": 13733 }, { "epoch": 2.3505048776313537, "grad_norm": 6.8031005859375, "learning_rate": 2.963950505689897e-05, "loss": 0.4948, "step": 13734 }, { "epoch": 2.3506760225911347, "grad_norm": 9.363670349121094, "learning_rate": 2.9636989941332415e-05, "loss": 0.6767, "step": 13735 }, { "epoch": 2.3508471675509157, "grad_norm": 18.322998046875, "learning_rate": 2.9634466189919995e-05, "loss": 1.4919, "step": 13736 }, { "epoch": 2.3510183125106967, "grad_norm": 128.09046936035156, "learning_rate": 2.9631933804150722e-05, "loss": 10.2138, "step": 13737 }, { "epoch": 2.3511894574704773, "grad_norm": 6.728992462158203, "learning_rate": 2.9629392785518714e-05, "loss": 0.4702, "step": 13738 }, { "epoch": 2.3513606024302582, "grad_norm": 14.951433181762695, "learning_rate": 2.9626843135523174e-05, "loss": 1.0818, "step": 13739 }, { "epoch": 2.3515317473900392, "grad_norm": 11.443788528442383, "learning_rate": 2.9624284855668394e-05, "loss": 0.8004, "step": 13740 }, { "epoch": 2.35170289234982, "grad_norm": 4.229185104370117, "learning_rate": 2.9621717947463768e-05, "loss": 0.5019, "step": 13741 }, { "epoch": 2.351874037309601, "grad_norm": 17.978229522705078, "learning_rate": 2.9619142412423775e-05, "loss": 1.9422, "step": 13742 }, { "epoch": 2.352045182269382, "grad_norm": 3.637451410293579, "learning_rate": 2.9616558252067985e-05, "loss": 0.2508, "step": 13743 }, { "epoch": 
2.352216327229163, "grad_norm": 13.21918773651123, "learning_rate": 2.961396546792106e-05, "loss": 0.7922, "step": 13744 }, { "epoch": 2.352387472188944, "grad_norm": 11.739056587219238, "learning_rate": 2.9611364061512733e-05, "loss": 1.0276, "step": 13745 }, { "epoch": 2.352558617148725, "grad_norm": 93.84841918945312, "learning_rate": 2.9608754034377853e-05, "loss": 8.1467, "step": 13746 }, { "epoch": 2.352729762108506, "grad_norm": 1.9068375825881958, "learning_rate": 2.960613538805633e-05, "loss": 0.1841, "step": 13747 }, { "epoch": 2.3529009070682867, "grad_norm": 2.3761565685272217, "learning_rate": 2.9603508124093173e-05, "loss": 0.1807, "step": 13748 }, { "epoch": 2.3530720520280677, "grad_norm": 16.223819732666016, "learning_rate": 2.9600872244038473e-05, "loss": 1.9115, "step": 13749 }, { "epoch": 2.3532431969878487, "grad_norm": 18.096757888793945, "learning_rate": 2.9598227749447404e-05, "loss": 1.5446, "step": 13750 }, { "epoch": 2.3534143419476297, "grad_norm": 17.498674392700195, "learning_rate": 2.9595574641880213e-05, "loss": 2.0874, "step": 13751 }, { "epoch": 2.3535854869074107, "grad_norm": 2.4044742584228516, "learning_rate": 2.9592912922902246e-05, "loss": 0.2435, "step": 13752 }, { "epoch": 2.3537566318671916, "grad_norm": 15.699995994567871, "learning_rate": 2.9590242594083913e-05, "loss": 1.5788, "step": 13753 }, { "epoch": 2.3539277768269726, "grad_norm": 0.3551577925682068, "learning_rate": 2.958756365700073e-05, "loss": 0.0974, "step": 13754 }, { "epoch": 2.354098921786753, "grad_norm": 13.783117294311523, "learning_rate": 2.958487611323326e-05, "loss": 1.0008, "step": 13755 }, { "epoch": 2.354270066746534, "grad_norm": 12.435408592224121, "learning_rate": 2.9582179964367155e-05, "loss": 0.8749, "step": 13756 }, { "epoch": 2.354441211706315, "grad_norm": 26.706514358520508, "learning_rate": 2.957947521199315e-05, "loss": 5.2005, "step": 13757 }, { "epoch": 2.354612356666096, "grad_norm": 14.963375091552734, "learning_rate": 
2.9576761857707058e-05, "loss": 1.3759, "step": 13758 }, { "epoch": 2.354783501625877, "grad_norm": 22.634431838989258, "learning_rate": 2.9574039903109762e-05, "loss": 2.7111, "step": 13759 }, { "epoch": 2.354954646585658, "grad_norm": 0.4186314344406128, "learning_rate": 2.957130934980721e-05, "loss": 0.0993, "step": 13760 }, { "epoch": 2.355125791545439, "grad_norm": 5.741849899291992, "learning_rate": 2.9568570199410436e-05, "loss": 0.5004, "step": 13761 }, { "epoch": 2.35529693650522, "grad_norm": 5.944777965545654, "learning_rate": 2.9565822453535553e-05, "loss": 0.6081, "step": 13762 }, { "epoch": 2.355468081465001, "grad_norm": 15.513654708862305, "learning_rate": 2.956306611380372e-05, "loss": 1.2744, "step": 13763 }, { "epoch": 2.3556392264247816, "grad_norm": 13.603975296020508, "learning_rate": 2.956030118184119e-05, "loss": 1.4133, "step": 13764 }, { "epoch": 2.3558103713845626, "grad_norm": 3.2988924980163574, "learning_rate": 2.9557527659279275e-05, "loss": 0.2556, "step": 13765 }, { "epoch": 2.3559815163443436, "grad_norm": 12.233057022094727, "learning_rate": 2.9554745547754364e-05, "loss": 0.9195, "step": 13766 }, { "epoch": 2.3561526613041246, "grad_norm": 11.486918449401855, "learning_rate": 2.95519548489079e-05, "loss": 1.1361, "step": 13767 }, { "epoch": 2.3563238062639056, "grad_norm": 18.043169021606445, "learning_rate": 2.9549155564386396e-05, "loss": 1.6197, "step": 13768 }, { "epoch": 2.3564949512236866, "grad_norm": 14.55087661743164, "learning_rate": 2.954634769584144e-05, "loss": 1.4634, "step": 13769 }, { "epoch": 2.3566660961834676, "grad_norm": 10.740273475646973, "learning_rate": 2.9543531244929677e-05, "loss": 0.8499, "step": 13770 }, { "epoch": 2.356837241143248, "grad_norm": 7.992386817932129, "learning_rate": 2.9540706213312823e-05, "loss": 0.4706, "step": 13771 }, { "epoch": 2.357008386103029, "grad_norm": 13.888870239257812, "learning_rate": 2.953787260265764e-05, "loss": 0.9593, "step": 13772 }, { "epoch": 2.35717953106281, 
"grad_norm": 7.161427021026611, "learning_rate": 2.953503041463597e-05, "loss": 0.4937, "step": 13773 }, { "epoch": 2.357350676022591, "grad_norm": 8.725404739379883, "learning_rate": 2.95321796509247e-05, "loss": 0.7047, "step": 13774 }, { "epoch": 2.357521820982372, "grad_norm": 13.974394798278809, "learning_rate": 2.9529320313205797e-05, "loss": 1.0055, "step": 13775 }, { "epoch": 2.357692965942153, "grad_norm": 18.212261199951172, "learning_rate": 2.952645240316626e-05, "loss": 1.4629, "step": 13776 }, { "epoch": 2.357864110901934, "grad_norm": 13.429821968078613, "learning_rate": 2.952357592249818e-05, "loss": 0.8827, "step": 13777 }, { "epoch": 2.358035255861715, "grad_norm": 11.195442199707031, "learning_rate": 2.952069087289867e-05, "loss": 0.62, "step": 13778 }, { "epoch": 2.358206400821496, "grad_norm": 1.2533843517303467, "learning_rate": 2.9517797256069917e-05, "loss": 0.1784, "step": 13779 }, { "epoch": 2.3583775457812766, "grad_norm": 17.146242141723633, "learning_rate": 2.951489507371916e-05, "loss": 1.9416, "step": 13780 }, { "epoch": 2.3585486907410576, "grad_norm": 0.41867658495903015, "learning_rate": 2.951198432755869e-05, "loss": 0.0983, "step": 13781 }, { "epoch": 2.3587198357008385, "grad_norm": 3.7335875034332275, "learning_rate": 2.950906501930585e-05, "loss": 0.1927, "step": 13782 }, { "epoch": 2.3588909806606195, "grad_norm": 11.162370681762695, "learning_rate": 2.9506137150683034e-05, "loss": 0.791, "step": 13783 }, { "epoch": 2.3590621256204005, "grad_norm": 14.339361190795898, "learning_rate": 2.9503200723417697e-05, "loss": 0.6149, "step": 13784 }, { "epoch": 2.3592332705801815, "grad_norm": 9.353601455688477, "learning_rate": 2.9500255739242333e-05, "loss": 0.6579, "step": 13785 }, { "epoch": 2.3594044155399625, "grad_norm": 0.3832778036594391, "learning_rate": 2.9497302199894482e-05, "loss": 0.0987, "step": 13786 }, { "epoch": 2.359575560499743, "grad_norm": 11.579385757446289, "learning_rate": 2.949434010711674e-05, "loss": 0.8065, 
"step": 13787 }, { "epoch": 2.359746705459524, "grad_norm": 4.029344081878662, "learning_rate": 2.949136946265675e-05, "loss": 0.2523, "step": 13788 }, { "epoch": 2.359917850419305, "grad_norm": 1.80219566822052, "learning_rate": 2.9488390268267193e-05, "loss": 0.1955, "step": 13789 }, { "epoch": 2.360088995379086, "grad_norm": 12.509167671203613, "learning_rate": 2.948540252570579e-05, "loss": 0.7935, "step": 13790 }, { "epoch": 2.360260140338867, "grad_norm": 16.19281005859375, "learning_rate": 2.9482406236735328e-05, "loss": 1.2888, "step": 13791 }, { "epoch": 2.360431285298648, "grad_norm": 0.9559103846549988, "learning_rate": 2.947940140312361e-05, "loss": 0.1699, "step": 13792 }, { "epoch": 2.360602430258429, "grad_norm": 7.657926082611084, "learning_rate": 2.9476388026643508e-05, "loss": 0.5127, "step": 13793 }, { "epoch": 2.36077357521821, "grad_norm": 21.728221893310547, "learning_rate": 2.9473366109072895e-05, "loss": 5.1605, "step": 13794 }, { "epoch": 2.360944720177991, "grad_norm": 15.935368537902832, "learning_rate": 2.9470335652194726e-05, "loss": 1.84, "step": 13795 }, { "epoch": 2.361115865137772, "grad_norm": 9.342744827270508, "learning_rate": 2.9467296657796975e-05, "loss": 0.6874, "step": 13796 }, { "epoch": 2.3612870100975525, "grad_norm": 94.91766357421875, "learning_rate": 2.9464249127672638e-05, "loss": 8.4964, "step": 13797 }, { "epoch": 2.3614581550573335, "grad_norm": 1.9580347537994385, "learning_rate": 2.9461193063619773e-05, "loss": 0.1987, "step": 13798 }, { "epoch": 2.3616293000171145, "grad_norm": 26.056055068969727, "learning_rate": 2.945812846744147e-05, "loss": 5.0928, "step": 13799 }, { "epoch": 2.3618004449768955, "grad_norm": 20.35849380493164, "learning_rate": 2.9455055340945834e-05, "loss": 1.6768, "step": 13800 }, { "epoch": 2.3619715899366764, "grad_norm": 6.172688007354736, "learning_rate": 2.9451973685946008e-05, "loss": 0.4312, "step": 13801 }, { "epoch": 2.3621427348964574, "grad_norm": 10.99133586883545, 
"learning_rate": 2.9448883504260186e-05, "loss": 1.0068, "step": 13802 }, { "epoch": 2.3623138798562384, "grad_norm": 20.267276763916016, "learning_rate": 2.944578479771158e-05, "loss": 2.2266, "step": 13803 }, { "epoch": 2.362485024816019, "grad_norm": 1.498047113418579, "learning_rate": 2.9442677568128426e-05, "loss": 0.1611, "step": 13804 }, { "epoch": 2.3626561697758, "grad_norm": 13.581354141235352, "learning_rate": 2.943956181734399e-05, "loss": 1.0332, "step": 13805 }, { "epoch": 2.362827314735581, "grad_norm": 13.277029991149902, "learning_rate": 2.9436437547196575e-05, "loss": 1.0024, "step": 13806 }, { "epoch": 2.362998459695362, "grad_norm": 3.513739824295044, "learning_rate": 2.943330475952951e-05, "loss": 0.2178, "step": 13807 }, { "epoch": 2.363169604655143, "grad_norm": 7.4882001876831055, "learning_rate": 2.943016345619114e-05, "loss": 0.5408, "step": 13808 }, { "epoch": 2.363340749614924, "grad_norm": 2.282379627227783, "learning_rate": 2.9427013639034825e-05, "loss": 0.2064, "step": 13809 }, { "epoch": 2.363511894574705, "grad_norm": 1.9806654453277588, "learning_rate": 2.9423855309918982e-05, "loss": 0.1924, "step": 13810 }, { "epoch": 2.363683039534486, "grad_norm": 8.37846851348877, "learning_rate": 2.942068847070703e-05, "loss": 1.121, "step": 13811 }, { "epoch": 2.363854184494267, "grad_norm": 15.134040832519531, "learning_rate": 2.9417513123267393e-05, "loss": 1.0459, "step": 13812 }, { "epoch": 2.3640253294540474, "grad_norm": 14.109121322631836, "learning_rate": 2.941432926947354e-05, "loss": 1.5477, "step": 13813 }, { "epoch": 2.3641964744138284, "grad_norm": 30.21404266357422, "learning_rate": 2.9411136911203945e-05, "loss": 5.228, "step": 13814 }, { "epoch": 2.3643676193736094, "grad_norm": 0.4001495838165283, "learning_rate": 2.9407936050342118e-05, "loss": 0.0933, "step": 13815 }, { "epoch": 2.3645387643333904, "grad_norm": 8.658340454101562, "learning_rate": 2.9404726688776555e-05, "loss": 0.5989, "step": 13816 }, { "epoch": 
2.3647099092931714, "grad_norm": 32.939369201660156, "learning_rate": 2.9401508828400782e-05, "loss": 5.468, "step": 13817 }, { "epoch": 2.3648810542529524, "grad_norm": 13.19131088256836, "learning_rate": 2.939828247111336e-05, "loss": 0.9904, "step": 13818 }, { "epoch": 2.3650521992127334, "grad_norm": 12.606679916381836, "learning_rate": 2.939504761881783e-05, "loss": 0.895, "step": 13819 }, { "epoch": 2.365223344172514, "grad_norm": 8.798978805541992, "learning_rate": 2.939180427342277e-05, "loss": 0.6629, "step": 13820 }, { "epoch": 2.365394489132295, "grad_norm": 11.077106475830078, "learning_rate": 2.9388552436841746e-05, "loss": 0.858, "step": 13821 }, { "epoch": 2.365565634092076, "grad_norm": 1.2896209955215454, "learning_rate": 2.9385292110993364e-05, "loss": 0.1154, "step": 13822 }, { "epoch": 2.365736779051857, "grad_norm": 1.7195796966552734, "learning_rate": 2.9382023297801196e-05, "loss": 0.1836, "step": 13823 }, { "epoch": 2.365907924011638, "grad_norm": 86.87683868408203, "learning_rate": 2.937874599919387e-05, "loss": 9.5465, "step": 13824 }, { "epoch": 2.366079068971419, "grad_norm": 16.566246032714844, "learning_rate": 2.937546021710498e-05, "loss": 1.1711, "step": 13825 }, { "epoch": 2.3662502139312, "grad_norm": 13.157147407531738, "learning_rate": 2.937216595347316e-05, "loss": 1.2405, "step": 13826 }, { "epoch": 2.366421358890981, "grad_norm": 17.46339225769043, "learning_rate": 2.9368863210242015e-05, "loss": 0.9133, "step": 13827 }, { "epoch": 2.366592503850762, "grad_norm": 13.502808570861816, "learning_rate": 2.9365551989360176e-05, "loss": 0.9024, "step": 13828 }, { "epoch": 2.3667636488105424, "grad_norm": 4.762353897094727, "learning_rate": 2.9362232292781264e-05, "loss": 0.4062, "step": 13829 }, { "epoch": 2.3669347937703233, "grad_norm": 5.96539831161499, "learning_rate": 2.9358904122463914e-05, "loss": 0.3325, "step": 13830 }, { "epoch": 2.3671059387301043, "grad_norm": 15.752725601196289, "learning_rate": 2.9355567480371734e-05, 
"loss": 0.8002, "step": 13831 }, { "epoch": 2.3672770836898853, "grad_norm": 16.962108612060547, "learning_rate": 2.9352222368473366e-05, "loss": 1.2061, "step": 13832 }, { "epoch": 2.3674482286496663, "grad_norm": 11.395620346069336, "learning_rate": 2.934886878874242e-05, "loss": 0.7728, "step": 13833 }, { "epoch": 2.3676193736094473, "grad_norm": 6.070862770080566, "learning_rate": 2.934550674315752e-05, "loss": 0.5949, "step": 13834 }, { "epoch": 2.3677905185692283, "grad_norm": 15.73746109008789, "learning_rate": 2.9342136233702272e-05, "loss": 1.0781, "step": 13835 }, { "epoch": 2.367961663529009, "grad_norm": 14.543675422668457, "learning_rate": 2.9338757262365288e-05, "loss": 1.699, "step": 13836 }, { "epoch": 2.36813280848879, "grad_norm": 1.2054356336593628, "learning_rate": 2.9335369831140155e-05, "loss": 0.1691, "step": 13837 }, { "epoch": 2.368303953448571, "grad_norm": 12.869241714477539, "learning_rate": 2.9331973942025472e-05, "loss": 1.1467, "step": 13838 }, { "epoch": 2.368475098408352, "grad_norm": 9.577197074890137, "learning_rate": 2.9328569597024814e-05, "loss": 0.6671, "step": 13839 }, { "epoch": 2.368646243368133, "grad_norm": 0.7080820798873901, "learning_rate": 2.932515679814676e-05, "loss": 0.1048, "step": 13840 }, { "epoch": 2.3688173883279138, "grad_norm": 15.494146347045898, "learning_rate": 2.932173554740486e-05, "loss": 1.1603, "step": 13841 }, { "epoch": 2.3689885332876948, "grad_norm": 8.734058380126953, "learning_rate": 2.9318305846817646e-05, "loss": 0.6308, "step": 13842 }, { "epoch": 2.3691596782474758, "grad_norm": 6.21943998336792, "learning_rate": 2.9314867698408653e-05, "loss": 0.3869, "step": 13843 }, { "epoch": 2.3693308232072567, "grad_norm": 10.981721878051758, "learning_rate": 2.9311421104206407e-05, "loss": 0.7338, "step": 13844 }, { "epoch": 2.3695019681670373, "grad_norm": 18.26055145263672, "learning_rate": 2.9307966066244395e-05, "loss": 1.2251, "step": 13845 }, { "epoch": 2.3696731131268183, "grad_norm": 
15.519186019897461, "learning_rate": 2.9304502586561082e-05, "loss": 1.1348, "step": 13846 }, { "epoch": 2.3698442580865993, "grad_norm": 1.0084937810897827, "learning_rate": 2.930103066719994e-05, "loss": 0.1626, "step": 13847 }, { "epoch": 2.3700154030463803, "grad_norm": 9.358965873718262, "learning_rate": 2.929755031020941e-05, "loss": 0.704, "step": 13848 }, { "epoch": 2.3701865480061612, "grad_norm": 6.842755317687988, "learning_rate": 2.92940615176429e-05, "loss": 0.4844, "step": 13849 }, { "epoch": 2.3703576929659422, "grad_norm": 13.688385963439941, "learning_rate": 2.92905642915588e-05, "loss": 0.9157, "step": 13850 }, { "epoch": 2.370528837925723, "grad_norm": 12.14505672454834, "learning_rate": 2.928705863402048e-05, "loss": 1.3332, "step": 13851 }, { "epoch": 2.370699982885504, "grad_norm": 16.24831771850586, "learning_rate": 2.9283544547096292e-05, "loss": 0.9503, "step": 13852 }, { "epoch": 2.3708711278452848, "grad_norm": 11.304767608642578, "learning_rate": 2.9280022032859546e-05, "loss": 0.9289, "step": 13853 }, { "epoch": 2.3710422728050657, "grad_norm": 16.237560272216797, "learning_rate": 2.927649109338853e-05, "loss": 1.2198, "step": 13854 }, { "epoch": 2.3712134177648467, "grad_norm": 10.913284301757812, "learning_rate": 2.9272951730766496e-05, "loss": 0.9169, "step": 13855 }, { "epoch": 2.3713845627246277, "grad_norm": 12.9520263671875, "learning_rate": 2.9269403947081697e-05, "loss": 0.9437, "step": 13856 }, { "epoch": 2.3715557076844087, "grad_norm": 15.201899528503418, "learning_rate": 2.926584774442731e-05, "loss": 1.1456, "step": 13857 }, { "epoch": 2.3717268526441897, "grad_norm": 1.5297011137008667, "learning_rate": 2.92622831249015e-05, "loss": 0.1985, "step": 13858 }, { "epoch": 2.3718979976039707, "grad_norm": 0.6970651149749756, "learning_rate": 2.9258710090607405e-05, "loss": 0.1025, "step": 13859 }, { "epoch": 2.3720691425637517, "grad_norm": 3.5106606483459473, "learning_rate": 2.9255128643653117e-05, "loss": 0.2443, "step": 
13860 }, { "epoch": 2.3722402875235327, "grad_norm": 18.453481674194336, "learning_rate": 2.9251538786151702e-05, "loss": 1.5195, "step": 13861 }, { "epoch": 2.372411432483313, "grad_norm": 19.77035140991211, "learning_rate": 2.9247940520221176e-05, "loss": 2.1936, "step": 13862 }, { "epoch": 2.372582577443094, "grad_norm": 16.532407760620117, "learning_rate": 2.9244333847984522e-05, "loss": 1.1554, "step": 13863 }, { "epoch": 2.372753722402875, "grad_norm": 14.818830490112305, "learning_rate": 2.9240718771569676e-05, "loss": 0.6578, "step": 13864 }, { "epoch": 2.372924867362656, "grad_norm": 11.528362274169922, "learning_rate": 2.923709529310955e-05, "loss": 0.7533, "step": 13865 }, { "epoch": 2.373096012322437, "grad_norm": 30.12575340270996, "learning_rate": 2.9233463414741984e-05, "loss": 5.8831, "step": 13866 }, { "epoch": 2.373267157282218, "grad_norm": 67.10313415527344, "learning_rate": 2.922982313860982e-05, "loss": 8.0176, "step": 13867 }, { "epoch": 2.373438302241999, "grad_norm": 12.559006690979004, "learning_rate": 2.9226174466860797e-05, "loss": 0.865, "step": 13868 }, { "epoch": 2.3736094472017797, "grad_norm": 15.757425308227539, "learning_rate": 2.9222517401647655e-05, "loss": 1.6827, "step": 13869 }, { "epoch": 2.3737805921615607, "grad_norm": 24.07862091064453, "learning_rate": 2.9218851945128058e-05, "loss": 5.4629, "step": 13870 }, { "epoch": 2.3739517371213417, "grad_norm": 5.029224395751953, "learning_rate": 2.9215178099464644e-05, "loss": 0.4169, "step": 13871 }, { "epoch": 2.3741228820811227, "grad_norm": 24.814659118652344, "learning_rate": 2.9211495866824966e-05, "loss": 5.0643, "step": 13872 }, { "epoch": 2.3742940270409036, "grad_norm": 17.59563636779785, "learning_rate": 2.920780524938157e-05, "loss": 1.3776, "step": 13873 }, { "epoch": 2.3744651720006846, "grad_norm": 8.589835166931152, "learning_rate": 2.9204106249311904e-05, "loss": 0.892, "step": 13874 }, { "epoch": 2.3746363169604656, "grad_norm": 18.964452743530273, 
"learning_rate": 2.9200398868798404e-05, "loss": 2.1783, "step": 13875 }, { "epoch": 2.3748074619202466, "grad_norm": 22.2451171875, "learning_rate": 2.9196683110028412e-05, "loss": 5.0051, "step": 13876 }, { "epoch": 2.3749786068800276, "grad_norm": 12.25952434539795, "learning_rate": 2.919295897519425e-05, "loss": 1.0435, "step": 13877 }, { "epoch": 2.375149751839808, "grad_norm": 12.748000144958496, "learning_rate": 2.9189226466493146e-05, "loss": 0.9638, "step": 13878 }, { "epoch": 2.375320896799589, "grad_norm": 0.3841175436973572, "learning_rate": 2.91854855861273e-05, "loss": 0.0972, "step": 13879 }, { "epoch": 2.37549204175937, "grad_norm": 10.616653442382812, "learning_rate": 2.9181736336303814e-05, "loss": 0.783, "step": 13880 }, { "epoch": 2.375663186719151, "grad_norm": 17.931745529174805, "learning_rate": 2.9177978719234775e-05, "loss": 1.6782, "step": 13881 }, { "epoch": 2.375834331678932, "grad_norm": 11.16501235961914, "learning_rate": 2.9174212737137177e-05, "loss": 0.9039, "step": 13882 }, { "epoch": 2.376005476638713, "grad_norm": 23.922630310058594, "learning_rate": 2.9170438392232947e-05, "loss": 5.2092, "step": 13883 }, { "epoch": 2.376176621598494, "grad_norm": 14.709573745727539, "learning_rate": 2.9166655686748964e-05, "loss": 1.2348, "step": 13884 }, { "epoch": 2.3763477665582746, "grad_norm": 16.957683563232422, "learning_rate": 2.916286462291702e-05, "loss": 1.2592, "step": 13885 }, { "epoch": 2.3765189115180556, "grad_norm": 10.839428901672363, "learning_rate": 2.9159065202973862e-05, "loss": 0.7732, "step": 13886 }, { "epoch": 2.3766900564778366, "grad_norm": 69.25241088867188, "learning_rate": 2.9155257429161136e-05, "loss": 8.1186, "step": 13887 }, { "epoch": 2.3768612014376176, "grad_norm": 7.763872146606445, "learning_rate": 2.9151441303725444e-05, "loss": 0.7564, "step": 13888 }, { "epoch": 2.3770323463973986, "grad_norm": 0.9499492645263672, "learning_rate": 2.914761682891831e-05, "loss": 0.1629, "step": 13889 }, { "epoch": 
2.3772034913571796, "grad_norm": 20.75343132019043, "learning_rate": 2.914378400699618e-05, "loss": 2.4828, "step": 13890 }, { "epoch": 2.3773746363169606, "grad_norm": 22.876901626586914, "learning_rate": 2.9139942840220407e-05, "loss": 1.826, "step": 13891 }, { "epoch": 2.3775457812767415, "grad_norm": 0.4839036762714386, "learning_rate": 2.9136093330857298e-05, "loss": 0.0986, "step": 13892 }, { "epoch": 2.3777169262365225, "grad_norm": 20.732179641723633, "learning_rate": 2.9132235481178077e-05, "loss": 1.4727, "step": 13893 }, { "epoch": 2.377888071196303, "grad_norm": 14.663887023925781, "learning_rate": 2.912836929345887e-05, "loss": 1.202, "step": 13894 }, { "epoch": 2.378059216156084, "grad_norm": 19.696918487548828, "learning_rate": 2.9124494769980734e-05, "loss": 2.112, "step": 13895 }, { "epoch": 2.378230361115865, "grad_norm": 23.019502639770508, "learning_rate": 2.9120611913029642e-05, "loss": 0.8844, "step": 13896 }, { "epoch": 2.378401506075646, "grad_norm": 13.973206520080566, "learning_rate": 2.9116720724896495e-05, "loss": 1.0748, "step": 13897 }, { "epoch": 2.378572651035427, "grad_norm": 2.4960007667541504, "learning_rate": 2.91128212078771e-05, "loss": 0.2131, "step": 13898 }, { "epoch": 2.378743795995208, "grad_norm": 2.06769061088562, "learning_rate": 2.910891336427216e-05, "loss": 0.1982, "step": 13899 }, { "epoch": 2.378914940954989, "grad_norm": 13.368046760559082, "learning_rate": 2.910499719638732e-05, "loss": 0.9325, "step": 13900 }, { "epoch": 2.37908608591477, "grad_norm": 2.931258201599121, "learning_rate": 2.9101072706533134e-05, "loss": 0.4108, "step": 13901 }, { "epoch": 2.3792572308745505, "grad_norm": 13.211956977844238, "learning_rate": 2.909713989702505e-05, "loss": 0.7875, "step": 13902 }, { "epoch": 2.3794283758343315, "grad_norm": 16.328636169433594, "learning_rate": 2.9093198770183412e-05, "loss": 1.6243, "step": 13903 }, { "epoch": 2.3795995207941125, "grad_norm": 53.06248092651367, "learning_rate": 
2.9089249328333528e-05, "loss": 8.5931, "step": 13904 }, { "epoch": 2.3797706657538935, "grad_norm": 14.898456573486328, "learning_rate": 2.9085291573805543e-05, "loss": 1.2901, "step": 13905 }, { "epoch": 2.3799418107136745, "grad_norm": 0.4610017240047455, "learning_rate": 2.9081325508934556e-05, "loss": 0.096, "step": 13906 }, { "epoch": 2.3801129556734555, "grad_norm": 3.3614704608917236, "learning_rate": 2.9077351136060542e-05, "loss": 0.2131, "step": 13907 }, { "epoch": 2.3802841006332365, "grad_norm": 3.0489695072174072, "learning_rate": 2.907336845752839e-05, "loss": 0.2342, "step": 13908 }, { "epoch": 2.3804552455930175, "grad_norm": 3.489628314971924, "learning_rate": 2.9069377475687886e-05, "loss": 0.2361, "step": 13909 }, { "epoch": 2.3806263905527985, "grad_norm": 12.323399543762207, "learning_rate": 2.9065378192893723e-05, "loss": 1.4322, "step": 13910 }, { "epoch": 2.380797535512579, "grad_norm": 15.32873249053955, "learning_rate": 2.906137061150547e-05, "loss": 1.5155, "step": 13911 }, { "epoch": 2.38096868047236, "grad_norm": 15.967973709106445, "learning_rate": 2.9057354733887616e-05, "loss": 1.9899, "step": 13912 }, { "epoch": 2.381139825432141, "grad_norm": 14.929649353027344, "learning_rate": 2.9053330562409525e-05, "loss": 1.0201, "step": 13913 }, { "epoch": 2.381310970391922, "grad_norm": 18.098020553588867, "learning_rate": 2.9049298099445474e-05, "loss": 2.2357, "step": 13914 }, { "epoch": 2.381482115351703, "grad_norm": 8.080223083496094, "learning_rate": 2.9045257347374616e-05, "loss": 0.6169, "step": 13915 }, { "epoch": 2.381653260311484, "grad_norm": 5.7460784912109375, "learning_rate": 2.9041208308581012e-05, "loss": 0.5386, "step": 13916 }, { "epoch": 2.381824405271265, "grad_norm": 6.741369724273682, "learning_rate": 2.903715098545358e-05, "loss": 0.68, "step": 13917 }, { "epoch": 2.3819955502310455, "grad_norm": 17.742116928100586, "learning_rate": 2.903308538038617e-05, "loss": 1.9164, "step": 13918 }, { "epoch": 
2.3821666951908265, "grad_norm": 15.757413864135742, "learning_rate": 2.902901149577747e-05, "loss": 1.1369, "step": 13919 }, { "epoch": 2.3823378401506075, "grad_norm": 5.4556355476379395, "learning_rate": 2.9024929334031105e-05, "loss": 0.2995, "step": 13920 }, { "epoch": 2.3825089851103884, "grad_norm": 7.547480583190918, "learning_rate": 2.9020838897555538e-05, "loss": 0.5822, "step": 13921 }, { "epoch": 2.3826801300701694, "grad_norm": 9.700549125671387, "learning_rate": 2.9016740188764137e-05, "loss": 0.658, "step": 13922 }, { "epoch": 2.3828512750299504, "grad_norm": 50.94050598144531, "learning_rate": 2.9012633210075146e-05, "loss": 6.9322, "step": 13923 }, { "epoch": 2.3830224199897314, "grad_norm": 2.771986722946167, "learning_rate": 2.900851796391169e-05, "loss": 0.2426, "step": 13924 }, { "epoch": 2.3831935649495124, "grad_norm": 4.5597100257873535, "learning_rate": 2.900439445270177e-05, "loss": 0.291, "step": 13925 }, { "epoch": 2.3833647099092934, "grad_norm": 11.881745338439941, "learning_rate": 2.9000262678878266e-05, "loss": 0.7799, "step": 13926 }, { "epoch": 2.383535854869074, "grad_norm": 12.484890937805176, "learning_rate": 2.8996122644878938e-05, "loss": 0.8632, "step": 13927 }, { "epoch": 2.383706999828855, "grad_norm": 2.431853771209717, "learning_rate": 2.8991974353146388e-05, "loss": 0.2017, "step": 13928 }, { "epoch": 2.383878144788636, "grad_norm": 9.242044448852539, "learning_rate": 2.898781780612813e-05, "loss": 0.585, "step": 13929 }, { "epoch": 2.384049289748417, "grad_norm": 6.900981903076172, "learning_rate": 2.8983653006276544e-05, "loss": 0.5883, "step": 13930 }, { "epoch": 2.384220434708198, "grad_norm": 0.5738440155982971, "learning_rate": 2.897947995604885e-05, "loss": 0.1067, "step": 13931 }, { "epoch": 2.384391579667979, "grad_norm": 5.669466495513916, "learning_rate": 2.8975298657907158e-05, "loss": 0.334, "step": 13932 }, { "epoch": 2.38456272462776, "grad_norm": 8.501715660095215, "learning_rate": 2.8971109114318442e-05, 
"loss": 0.6334, "step": 13933 }, { "epoch": 2.3847338695875404, "grad_norm": 37.18186950683594, "learning_rate": 2.8966911327754543e-05, "loss": 5.5143, "step": 13934 }, { "epoch": 2.3849050145473214, "grad_norm": 7.615840911865234, "learning_rate": 2.896270530069216e-05, "loss": 0.8354, "step": 13935 }, { "epoch": 2.3850761595071024, "grad_norm": 5.1893310546875, "learning_rate": 2.8958491035612842e-05, "loss": 0.4339, "step": 13936 }, { "epoch": 2.3852473044668834, "grad_norm": 7.524728298187256, "learning_rate": 2.8954268535003022e-05, "loss": 0.4833, "step": 13937 }, { "epoch": 2.3854184494266644, "grad_norm": 18.081893920898438, "learning_rate": 2.8950037801353995e-05, "loss": 1.2583, "step": 13938 }, { "epoch": 2.3855895943864454, "grad_norm": 0.3987780213356018, "learning_rate": 2.8945798837161884e-05, "loss": 0.099, "step": 13939 }, { "epoch": 2.3857607393462263, "grad_norm": 0.3429018557071686, "learning_rate": 2.894155164492768e-05, "loss": 0.0969, "step": 13940 }, { "epoch": 2.3859318843060073, "grad_norm": 0.7678021788597107, "learning_rate": 2.8937296227157242e-05, "loss": 0.1053, "step": 13941 }, { "epoch": 2.3861030292657883, "grad_norm": 11.245859146118164, "learning_rate": 2.8933032586361278e-05, "loss": 0.8156, "step": 13942 }, { "epoch": 2.386274174225569, "grad_norm": 5.14680290222168, "learning_rate": 2.8928760725055335e-05, "loss": 0.4833, "step": 13943 }, { "epoch": 2.38644531918535, "grad_norm": 3.099090814590454, "learning_rate": 2.8924480645759805e-05, "loss": 0.219, "step": 13944 }, { "epoch": 2.386616464145131, "grad_norm": 10.786515235900879, "learning_rate": 2.892019235099996e-05, "loss": 0.7081, "step": 13945 }, { "epoch": 2.386787609104912, "grad_norm": 7.958423614501953, "learning_rate": 2.8915895843305896e-05, "loss": 0.6596, "step": 13946 }, { "epoch": 2.386958754064693, "grad_norm": 2.792550563812256, "learning_rate": 2.891159112521256e-05, "loss": 0.1962, "step": 13947 }, { "epoch": 2.387129899024474, "grad_norm": 
13.622743606567383, "learning_rate": 2.8907278199259734e-05, "loss": 1.0205, "step": 13948 }, { "epoch": 2.387301043984255, "grad_norm": 14.159062385559082, "learning_rate": 2.8902957067992063e-05, "loss": 1.1103, "step": 13949 }, { "epoch": 2.387472188944036, "grad_norm": 9.74541187286377, "learning_rate": 2.8898627733959008e-05, "loss": 0.6317, "step": 13950 }, { "epoch": 2.3876433339038163, "grad_norm": 0.44634154438972473, "learning_rate": 2.8894290199714893e-05, "loss": 0.0941, "step": 13951 }, { "epoch": 2.3878144788635973, "grad_norm": 7.8267974853515625, "learning_rate": 2.8889944467818858e-05, "loss": 0.7829, "step": 13952 }, { "epoch": 2.3879856238233783, "grad_norm": 16.2908992767334, "learning_rate": 2.888559054083491e-05, "loss": 1.9923, "step": 13953 }, { "epoch": 2.3881567687831593, "grad_norm": 15.75640869140625, "learning_rate": 2.888122842133185e-05, "loss": 1.1189, "step": 13954 }, { "epoch": 2.3883279137429403, "grad_norm": 33.76850128173828, "learning_rate": 2.8876858111883355e-05, "loss": 0.9369, "step": 13955 }, { "epoch": 2.3884990587027213, "grad_norm": 0.3519725203514099, "learning_rate": 2.8872479615067897e-05, "loss": 0.0934, "step": 13956 }, { "epoch": 2.3886702036625023, "grad_norm": 4.972943305969238, "learning_rate": 2.8868092933468808e-05, "loss": 0.2819, "step": 13957 }, { "epoch": 2.3888413486222833, "grad_norm": 10.420768737792969, "learning_rate": 2.8863698069674227e-05, "loss": 0.7836, "step": 13958 }, { "epoch": 2.3890124935820642, "grad_norm": 15.43856430053711, "learning_rate": 2.8859295026277144e-05, "loss": 0.9737, "step": 13959 }, { "epoch": 2.389183638541845, "grad_norm": 121.28633117675781, "learning_rate": 2.8854883805875346e-05, "loss": 8.7668, "step": 13960 }, { "epoch": 2.389354783501626, "grad_norm": 17.783742904663086, "learning_rate": 2.8850464411071472e-05, "loss": 1.7386, "step": 13961 }, { "epoch": 2.3895259284614068, "grad_norm": 24.571277618408203, "learning_rate": 2.884603684447296e-05, "loss": 5.0999, 
"step": 13962 }, { "epoch": 2.3896970734211878, "grad_norm": 14.718461036682129, "learning_rate": 2.884160110869209e-05, "loss": 1.2203, "step": 13963 }, { "epoch": 2.3898682183809687, "grad_norm": 10.542510986328125, "learning_rate": 2.8837157206345945e-05, "loss": 0.701, "step": 13964 }, { "epoch": 2.3900393633407497, "grad_norm": 10.472046852111816, "learning_rate": 2.8832705140056447e-05, "loss": 0.7994, "step": 13965 }, { "epoch": 2.3902105083005307, "grad_norm": 12.404297828674316, "learning_rate": 2.8828244912450305e-05, "loss": 0.9246, "step": 13966 }, { "epoch": 2.3903816532603113, "grad_norm": 0.7443149089813232, "learning_rate": 2.8823776526159063e-05, "loss": 0.1053, "step": 13967 }, { "epoch": 2.3905527982200923, "grad_norm": 16.207639694213867, "learning_rate": 2.8819299983819096e-05, "loss": 1.3895, "step": 13968 }, { "epoch": 2.3907239431798732, "grad_norm": 6.036145210266113, "learning_rate": 2.8814815288071547e-05, "loss": 0.355, "step": 13969 }, { "epoch": 2.3908950881396542, "grad_norm": 7.137734413146973, "learning_rate": 2.8810322441562397e-05, "loss": 0.5325, "step": 13970 }, { "epoch": 2.3910662330994352, "grad_norm": 15.590974807739258, "learning_rate": 2.8805821446942442e-05, "loss": 1.1997, "step": 13971 }, { "epoch": 2.391237378059216, "grad_norm": 15.002717971801758, "learning_rate": 2.8801312306867275e-05, "loss": 1.0902, "step": 13972 }, { "epoch": 2.391408523018997, "grad_norm": 3.974515438079834, "learning_rate": 2.8796795023997282e-05, "loss": 0.226, "step": 13973 }, { "epoch": 2.391579667978778, "grad_norm": 9.29769229888916, "learning_rate": 2.8792269600997673e-05, "loss": 0.5618, "step": 13974 }, { "epoch": 2.391750812938559, "grad_norm": 18.92521858215332, "learning_rate": 2.8787736040538466e-05, "loss": 1.6355, "step": 13975 }, { "epoch": 2.3919219578983397, "grad_norm": 14.80100154876709, "learning_rate": 2.8783194345294458e-05, "loss": 0.9362, "step": 13976 }, { "epoch": 2.3920931028581207, "grad_norm": 14.1998872756958, 
"learning_rate": 2.877864451794525e-05, "loss": 0.9387, "step": 13977 }, { "epoch": 2.3922642478179017, "grad_norm": 17.581785202026367, "learning_rate": 2.877408656117525e-05, "loss": 1.4591, "step": 13978 }, { "epoch": 2.3924353927776827, "grad_norm": 95.49395751953125, "learning_rate": 2.8769520477673678e-05, "loss": 9.1713, "step": 13979 }, { "epoch": 2.3926065377374637, "grad_norm": 23.818078994750977, "learning_rate": 2.876494627013451e-05, "loss": 3.516, "step": 13980 }, { "epoch": 2.3927776826972447, "grad_norm": 13.651363372802734, "learning_rate": 2.8760363941256532e-05, "loss": 0.9572, "step": 13981 }, { "epoch": 2.3929488276570257, "grad_norm": 11.349719047546387, "learning_rate": 2.8755773493743334e-05, "loss": 0.8693, "step": 13982 }, { "epoch": 2.393119972616806, "grad_norm": 15.032410621643066, "learning_rate": 2.8751174930303295e-05, "loss": 1.2222, "step": 13983 }, { "epoch": 2.393291117576587, "grad_norm": 10.658223152160645, "learning_rate": 2.874656825364957e-05, "loss": 0.866, "step": 13984 }, { "epoch": 2.393462262536368, "grad_norm": 3.2427821159362793, "learning_rate": 2.8741953466500084e-05, "loss": 0.235, "step": 13985 }, { "epoch": 2.393633407496149, "grad_norm": 10.46068286895752, "learning_rate": 2.8737330571577584e-05, "loss": 0.8807, "step": 13986 }, { "epoch": 2.39380455245593, "grad_norm": 9.498750686645508, "learning_rate": 2.87326995716096e-05, "loss": 0.753, "step": 13987 }, { "epoch": 2.393975697415711, "grad_norm": 18.12687110900879, "learning_rate": 2.8728060469328404e-05, "loss": 1.2103, "step": 13988 }, { "epoch": 2.394146842375492, "grad_norm": 13.39859676361084, "learning_rate": 2.8723413267471083e-05, "loss": 0.9841, "step": 13989 }, { "epoch": 2.394317987335273, "grad_norm": 11.658907890319824, "learning_rate": 2.8718757968779503e-05, "loss": 1.0762, "step": 13990 }, { "epoch": 2.394489132295054, "grad_norm": 0.5293299555778503, "learning_rate": 2.8714094576000277e-05, "loss": 0.1074, "step": 13991 }, { "epoch": 
2.3946602772548347, "grad_norm": 11.591958045959473, "learning_rate": 2.8709423091884836e-05, "loss": 0.7848, "step": 13992 }, { "epoch": 2.3948314222146156, "grad_norm": 17.578792572021484, "learning_rate": 2.870474351918934e-05, "loss": 1.2729, "step": 13993 }, { "epoch": 2.3950025671743966, "grad_norm": 16.04994773864746, "learning_rate": 2.8700055860674765e-05, "loss": 1.2288, "step": 13994 }, { "epoch": 2.3951737121341776, "grad_norm": 10.772201538085938, "learning_rate": 2.869536011910682e-05, "loss": 0.7875, "step": 13995 }, { "epoch": 2.3953448570939586, "grad_norm": 11.919108390808105, "learning_rate": 2.8690656297256014e-05, "loss": 0.804, "step": 13996 }, { "epoch": 2.3955160020537396, "grad_norm": 13.723003387451172, "learning_rate": 2.868594439789759e-05, "loss": 1.0554, "step": 13997 }, { "epoch": 2.3956871470135206, "grad_norm": 18.478771209716797, "learning_rate": 2.8681224423811595e-05, "loss": 2.0906, "step": 13998 }, { "epoch": 2.3958582919733016, "grad_norm": 0.4737374186515808, "learning_rate": 2.8676496377782805e-05, "loss": 0.1025, "step": 13999 }, { "epoch": 2.396029436933082, "grad_norm": 4.966257095336914, "learning_rate": 2.8671760262600793e-05, "loss": 0.4318, "step": 14000 }, { "epoch": 2.396200581892863, "grad_norm": 11.292283058166504, "learning_rate": 2.866701608105985e-05, "loss": 0.7158, "step": 14001 }, { "epoch": 2.396371726852644, "grad_norm": 4.196849822998047, "learning_rate": 2.866226383595907e-05, "loss": 0.2421, "step": 14002 }, { "epoch": 2.396542871812425, "grad_norm": 12.231025695800781, "learning_rate": 2.865750353010227e-05, "loss": 0.6696, "step": 14003 }, { "epoch": 2.396714016772206, "grad_norm": 18.521150588989258, "learning_rate": 2.8652735166298053e-05, "loss": 1.0692, "step": 14004 }, { "epoch": 2.396885161731987, "grad_norm": 0.8464030027389526, "learning_rate": 2.864795874735975e-05, "loss": 0.1775, "step": 14005 }, { "epoch": 2.397056306691768, "grad_norm": 10.29248332977295, "learning_rate": 
2.8643174276105462e-05, "loss": 0.8836, "step": 14006 }, { "epoch": 2.397227451651549, "grad_norm": 12.049858093261719, "learning_rate": 2.8638381755358024e-05, "loss": 0.7855, "step": 14007 }, { "epoch": 2.39739859661133, "grad_norm": 14.435182571411133, "learning_rate": 2.863358118794504e-05, "loss": 1.139, "step": 14008 }, { "epoch": 2.3975697415711106, "grad_norm": 10.865316390991211, "learning_rate": 2.862877257669884e-05, "loss": 0.8377, "step": 14009 }, { "epoch": 2.3977408865308916, "grad_norm": 12.41232967376709, "learning_rate": 2.8623955924456525e-05, "loss": 0.886, "step": 14010 }, { "epoch": 2.3979120314906726, "grad_norm": 14.456991195678711, "learning_rate": 2.861913123405992e-05, "loss": 1.428, "step": 14011 }, { "epoch": 2.3980831764504535, "grad_norm": 11.669357299804688, "learning_rate": 2.861429850835561e-05, "loss": 0.8058, "step": 14012 }, { "epoch": 2.3982543214102345, "grad_norm": 21.823827743530273, "learning_rate": 2.8609457750194903e-05, "loss": 2.006, "step": 14013 }, { "epoch": 2.3984254663700155, "grad_norm": 16.10972785949707, "learning_rate": 2.8604608962433847e-05, "loss": 1.2239, "step": 14014 }, { "epoch": 2.3985966113297965, "grad_norm": 0.46458151936531067, "learning_rate": 2.8599752147933236e-05, "loss": 0.0991, "step": 14015 }, { "epoch": 2.398767756289577, "grad_norm": 8.503589630126953, "learning_rate": 2.859488730955861e-05, "loss": 0.8046, "step": 14016 }, { "epoch": 2.398938901249358, "grad_norm": 17.08364486694336, "learning_rate": 2.859001445018022e-05, "loss": 0.9012, "step": 14017 }, { "epoch": 2.399110046209139, "grad_norm": 10.751371383666992, "learning_rate": 2.858513357267306e-05, "loss": 0.6009, "step": 14018 }, { "epoch": 2.39928119116892, "grad_norm": 11.274413108825684, "learning_rate": 2.8580244679916852e-05, "loss": 0.9102, "step": 14019 }, { "epoch": 2.399452336128701, "grad_norm": 17.162029266357422, "learning_rate": 2.8575347774796066e-05, "loss": 1.7428, "step": 14020 }, { "epoch": 2.399623481088482, 
"grad_norm": 9.907596588134766, "learning_rate": 2.8570442860199876e-05, "loss": 0.8831, "step": 14021 }, { "epoch": 2.399794626048263, "grad_norm": 18.76157569885254, "learning_rate": 2.8565529939022174e-05, "loss": 1.6408, "step": 14022 }, { "epoch": 2.399965771008044, "grad_norm": 21.254547119140625, "learning_rate": 2.85606090141616e-05, "loss": 1.5145, "step": 14023 }, { "epoch": 2.400136915967825, "grad_norm": 24.68286895751953, "learning_rate": 2.8555680088521526e-05, "loss": 5.3342, "step": 14024 }, { "epoch": 2.4003080609276055, "grad_norm": 17.595165252685547, "learning_rate": 2.8550743165010006e-05, "loss": 1.7326, "step": 14025 }, { "epoch": 2.4004792058873865, "grad_norm": 13.986950874328613, "learning_rate": 2.8545798246539824e-05, "loss": 1.1405, "step": 14026 }, { "epoch": 2.4006503508471675, "grad_norm": 11.07463550567627, "learning_rate": 2.8540845336028503e-05, "loss": 0.9386, "step": 14027 }, { "epoch": 2.4008214958069485, "grad_norm": 4.983100414276123, "learning_rate": 2.853588443639827e-05, "loss": 0.2671, "step": 14028 }, { "epoch": 2.4009926407667295, "grad_norm": 10.670504570007324, "learning_rate": 2.8530915550576063e-05, "loss": 0.8501, "step": 14029 }, { "epoch": 2.4011637857265105, "grad_norm": 13.154547691345215, "learning_rate": 2.852593868149352e-05, "loss": 1.1281, "step": 14030 }, { "epoch": 2.4013349306862914, "grad_norm": 5.916591167449951, "learning_rate": 2.8520953832087005e-05, "loss": 0.3343, "step": 14031 }, { "epoch": 2.401506075646072, "grad_norm": 15.19194507598877, "learning_rate": 2.851596100529759e-05, "loss": 0.8737, "step": 14032 }, { "epoch": 2.401506075646072, "eval_nli-pairs_loss": 1.2560908794403076, "eval_nli-pairs_runtime": 4.7049, "eval_nli-pairs_samples_per_second": 42.509, "eval_nli-pairs_steps_per_second": 1.488, "eval_sts-test_pearson_cosine": 0.778743376150553, "eval_sts-test_pearson_dot": 0.6195868666572915, "eval_sts-test_pearson_euclidean": 0.7648204545303492, "eval_sts-test_pearson_manhattan": 
0.7662269904341282, "eval_sts-test_pearson_max": 0.778743376150553, "eval_sts-test_spearman_cosine": 0.7852198018923544, "eval_sts-test_spearman_dot": 0.6005518117733305, "eval_sts-test_spearman_euclidean": 0.757358453872815, "eval_sts-test_spearman_manhattan": 0.7612800268863865, "eval_sts-test_spearman_max": 0.7852198018923544, "step": 14032 }, { "epoch": 2.401506075646072, "eval_vitaminc-pairs_loss": 0.6099990010261536, "eval_vitaminc-pairs_runtime": 2.8924, "eval_vitaminc-pairs_samples_per_second": 69.146, "eval_vitaminc-pairs_steps_per_second": 2.42, "step": 14032 }, { "epoch": 2.401506075646072, "eval_qnli-contrastive_loss": 1.1883658170700073, "eval_qnli-contrastive_runtime": 0.6716, "eval_qnli-contrastive_samples_per_second": 297.814, "eval_qnli-contrastive_steps_per_second": 10.423, "step": 14032 }, { "epoch": 2.401506075646072, "eval_scitail-pairs-qa_loss": 0.08014583587646484, "eval_scitail-pairs-qa_runtime": 1.6727, "eval_scitail-pairs-qa_samples_per_second": 119.568, "eval_scitail-pairs-qa_steps_per_second": 4.185, "step": 14032 }, { "epoch": 2.401506075646072, "eval_scitail-pairs-pos_loss": 0.5815625786781311, "eval_scitail-pairs-pos_runtime": 2.6153, "eval_scitail-pairs-pos_samples_per_second": 76.473, "eval_scitail-pairs-pos_steps_per_second": 2.677, "step": 14032 }, { "epoch": 2.401506075646072, "eval_xsum-pairs_loss": 0.5843955874443054, "eval_xsum-pairs_runtime": 2.6969, "eval_xsum-pairs_samples_per_second": 64.889, "eval_xsum-pairs_steps_per_second": 2.225, "step": 14032 }, { "epoch": 2.401506075646072, "eval_compression-pairs_loss": 0.17852792143821716, "eval_compression-pairs_runtime": 0.5247, "eval_compression-pairs_samples_per_second": 381.15, "eval_compression-pairs_steps_per_second": 13.34, "step": 14032 }, { "epoch": 2.401506075646072, "eval_sciq_pairs_loss": 0.3364640176296234, "eval_sciq_pairs_runtime": 9.482, "eval_sciq_pairs_samples_per_second": 21.093, "eval_sciq_pairs_steps_per_second": 0.738, "step": 14032 }, { "epoch": 
2.401506075646072, "eval_qasc_pairs_loss": 5.433475971221924, "eval_qasc_pairs_runtime": 2.8175, "eval_qasc_pairs_samples_per_second": 70.985, "eval_qasc_pairs_steps_per_second": 2.484, "step": 14032 }, { "epoch": 2.401506075646072, "eval_openbookqa_pairs_loss": 2.2755446434020996, "eval_openbookqa_pairs_runtime": 0.6709, "eval_openbookqa_pairs_samples_per_second": 102.84, "eval_openbookqa_pairs_steps_per_second": 4.471, "step": 14032 }, { "epoch": 2.401506075646072, "eval_msmarco_pairs_loss": 0.8218420147895813, "eval_msmarco_pairs_runtime": 4.026, "eval_msmarco_pairs_samples_per_second": 49.677, "eval_msmarco_pairs_steps_per_second": 1.739, "step": 14032 }, { "epoch": 2.401506075646072, "eval_nq_pairs_loss": 0.9934337139129639, "eval_nq_pairs_runtime": 8.6676, "eval_nq_pairs_samples_per_second": 23.074, "eval_nq_pairs_steps_per_second": 0.808, "step": 14032 }, { "epoch": 2.401506075646072, "eval_trivia_pairs_loss": 1.4804171323776245, "eval_trivia_pairs_runtime": 12.7839, "eval_trivia_pairs_samples_per_second": 15.645, "eval_trivia_pairs_steps_per_second": 0.548, "step": 14032 }, { "epoch": 2.401506075646072, "eval_quora_pairs_loss": 0.1613619178533554, "eval_quora_pairs_runtime": 1.5788, "eval_quora_pairs_samples_per_second": 126.68, "eval_quora_pairs_steps_per_second": 4.434, "step": 14032 }, { "epoch": 2.401506075646072, "eval_gooaq_pairs_loss": 0.7359638810157776, "eval_gooaq_pairs_runtime": 2.6283, "eval_gooaq_pairs_samples_per_second": 76.094, "eval_gooaq_pairs_steps_per_second": 2.663, "step": 14032 }, { "epoch": 2.401677220605853, "grad_norm": 9.097579956054688, "learning_rate": 2.851096020407106e-05, "loss": 0.79, "step": 14033 }, { "epoch": 2.401848365565634, "grad_norm": 18.430513381958008, "learning_rate": 2.8505951431357878e-05, "loss": 1.9566, "step": 14034 }, { "epoch": 2.402019510525415, "grad_norm": 16.36053466796875, "learning_rate": 2.850093469011324e-05, "loss": 1.893, "step": 14035 }, { "epoch": 2.402190655485196, "grad_norm": 
9.419352531433105, "learning_rate": 2.8495909983297022e-05, "loss": 0.6642, "step": 14036 }, { "epoch": 2.402361800444977, "grad_norm": 2.6637392044067383, "learning_rate": 2.8490877313873814e-05, "loss": 0.2367, "step": 14037 }, { "epoch": 2.402532945404758, "grad_norm": 0.458404004573822, "learning_rate": 2.8485836684812893e-05, "loss": 0.1026, "step": 14038 }, { "epoch": 2.402704090364539, "grad_norm": 14.779921531677246, "learning_rate": 2.848078809908825e-05, "loss": 1.6887, "step": 14039 }, { "epoch": 2.40287523532432, "grad_norm": 16.22422981262207, "learning_rate": 2.8475731559678542e-05, "loss": 1.247, "step": 14040 }, { "epoch": 2.4030463802841004, "grad_norm": 9.610102653503418, "learning_rate": 2.8470667069567146e-05, "loss": 0.778, "step": 14041 }, { "epoch": 2.4032175252438814, "grad_norm": 18.88636016845703, "learning_rate": 2.8465594631742113e-05, "loss": 1.9663, "step": 14042 }, { "epoch": 2.4033886702036624, "grad_norm": 14.550506591796875, "learning_rate": 2.8460514249196197e-05, "loss": 0.8457, "step": 14043 }, { "epoch": 2.4035598151634434, "grad_norm": 0.9553149342536926, "learning_rate": 2.8455425924926812e-05, "loss": 0.1162, "step": 14044 }, { "epoch": 2.4037309601232244, "grad_norm": 12.769843101501465, "learning_rate": 2.84503296619361e-05, "loss": 0.7371, "step": 14045 }, { "epoch": 2.4039021050830054, "grad_norm": 5.775295257568359, "learning_rate": 2.8445225463230852e-05, "loss": 0.3644, "step": 14046 }, { "epoch": 2.4040732500427864, "grad_norm": 2.230868101119995, "learning_rate": 2.8440113331822556e-05, "loss": 0.3403, "step": 14047 }, { "epoch": 2.404244395002567, "grad_norm": 16.393827438354492, "learning_rate": 2.843499327072737e-05, "loss": 1.1653, "step": 14048 }, { "epoch": 2.404415539962348, "grad_norm": 7.141784191131592, "learning_rate": 2.8429865282966153e-05, "loss": 0.5874, "step": 14049 }, { "epoch": 2.404586684922129, "grad_norm": 12.893696784973145, "learning_rate": 2.8424729371564404e-05, "loss": 0.8362, "step": 
14050 }, { "epoch": 2.40475782988191, "grad_norm": 11.29074764251709, "learning_rate": 2.841958553955234e-05, "loss": 0.8384, "step": 14051 }, { "epoch": 2.404928974841691, "grad_norm": 13.886877059936523, "learning_rate": 2.841443378996481e-05, "loss": 1.1666, "step": 14052 }, { "epoch": 2.405100119801472, "grad_norm": 13.410027503967285, "learning_rate": 2.8409274125841366e-05, "loss": 0.9696, "step": 14053 }, { "epoch": 2.405271264761253, "grad_norm": 15.933171272277832, "learning_rate": 2.8404106550226224e-05, "loss": 1.6613, "step": 14054 }, { "epoch": 2.405442409721034, "grad_norm": 11.972365379333496, "learning_rate": 2.839893106616824e-05, "loss": 0.9173, "step": 14055 }, { "epoch": 2.405613554680815, "grad_norm": 7.899214744567871, "learning_rate": 2.8393747676720975e-05, "loss": 0.5345, "step": 14056 }, { "epoch": 2.405784699640596, "grad_norm": 21.24188232421875, "learning_rate": 2.8388556384942638e-05, "loss": 2.0827, "step": 14057 }, { "epoch": 2.4059558446003764, "grad_norm": 24.387590408325195, "learning_rate": 2.838335719389609e-05, "loss": 5.4967, "step": 14058 }, { "epoch": 2.4061269895601574, "grad_norm": 32.41388702392578, "learning_rate": 2.8378150106648857e-05, "loss": 5.3522, "step": 14059 }, { "epoch": 2.4062981345199383, "grad_norm": 9.83592414855957, "learning_rate": 2.8372935126273137e-05, "loss": 0.6992, "step": 14060 }, { "epoch": 2.4064692794797193, "grad_norm": 0.8665664196014404, "learning_rate": 2.8367712255845776e-05, "loss": 0.1648, "step": 14061 }, { "epoch": 2.4066404244395003, "grad_norm": 29.388748168945312, "learning_rate": 2.836248149844828e-05, "loss": 5.5525, "step": 14062 }, { "epoch": 2.4068115693992813, "grad_norm": 6.783276081085205, "learning_rate": 2.8357242857166787e-05, "loss": 0.528, "step": 14063 }, { "epoch": 2.4069827143590623, "grad_norm": 15.343708038330078, "learning_rate": 2.835199633509211e-05, "loss": 1.4214, "step": 14064 }, { "epoch": 2.407153859318843, "grad_norm": 16.694421768188477, "learning_rate": 
2.8346741935319716e-05, "loss": 1.3766, "step": 14065 }, { "epoch": 2.407325004278624, "grad_norm": 10.816000938415527, "learning_rate": 2.834147966094971e-05, "loss": 0.8299, "step": 14066 }, { "epoch": 2.407496149238405, "grad_norm": 27.410747528076172, "learning_rate": 2.8336209515086813e-05, "loss": 5.3851, "step": 14067 }, { "epoch": 2.407667294198186, "grad_norm": 14.646197319030762, "learning_rate": 2.8330931500840443e-05, "loss": 1.1396, "step": 14068 }, { "epoch": 2.407838439157967, "grad_norm": 0.4411049485206604, "learning_rate": 2.8325645621324642e-05, "loss": 0.1014, "step": 14069 }, { "epoch": 2.408009584117748, "grad_norm": 3.652146816253662, "learning_rate": 2.8320351879658075e-05, "loss": 0.3094, "step": 14070 }, { "epoch": 2.408180729077529, "grad_norm": 10.615584373474121, "learning_rate": 2.831505027896405e-05, "loss": 0.7818, "step": 14071 }, { "epoch": 2.4083518740373098, "grad_norm": 21.435148239135742, "learning_rate": 2.8309740822370526e-05, "loss": 2.6262, "step": 14072 }, { "epoch": 2.4085230189970908, "grad_norm": 16.534366607666016, "learning_rate": 2.83044235130101e-05, "loss": 1.4077, "step": 14073 }, { "epoch": 2.4086941639568713, "grad_norm": 8.432446479797363, "learning_rate": 2.8299098354019984e-05, "loss": 0.6552, "step": 14074 }, { "epoch": 2.4088653089166523, "grad_norm": 128.39862060546875, "learning_rate": 2.8293765348542024e-05, "loss": 10.2264, "step": 14075 }, { "epoch": 2.4090364538764333, "grad_norm": 6.508200645446777, "learning_rate": 2.8288424499722717e-05, "loss": 0.5447, "step": 14076 }, { "epoch": 2.4092075988362143, "grad_norm": 21.060564041137695, "learning_rate": 2.8283075810713156e-05, "loss": 4.7343, "step": 14077 }, { "epoch": 2.4093787437959953, "grad_norm": 15.640949249267578, "learning_rate": 2.827771928466909e-05, "loss": 1.1644, "step": 14078 }, { "epoch": 2.4095498887557762, "grad_norm": 3.15975022315979, "learning_rate": 2.827235492475086e-05, "loss": 0.2618, "step": 14079 }, { "epoch": 
2.4097210337155572, "grad_norm": 15.497106552124023, "learning_rate": 2.8266982734123462e-05, "loss": 1.1669, "step": 14080 }, { "epoch": 2.409892178675338, "grad_norm": 11.226163864135742, "learning_rate": 2.826160271595649e-05, "loss": 0.8744, "step": 14081 }, { "epoch": 2.4100633236351188, "grad_norm": 10.729740142822266, "learning_rate": 2.8256214873424167e-05, "loss": 0.7472, "step": 14082 }, { "epoch": 2.4102344685948998, "grad_norm": 15.658041954040527, "learning_rate": 2.8250819209705313e-05, "loss": 1.1319, "step": 14083 }, { "epoch": 2.4104056135546807, "grad_norm": 7.535942554473877, "learning_rate": 2.8245415727983398e-05, "loss": 0.7118, "step": 14084 }, { "epoch": 2.4105767585144617, "grad_norm": 19.016681671142578, "learning_rate": 2.824000443144647e-05, "loss": 2.1901, "step": 14085 }, { "epoch": 2.4107479034742427, "grad_norm": 9.700361251831055, "learning_rate": 2.8234585323287212e-05, "loss": 0.7516, "step": 14086 }, { "epoch": 2.4109190484340237, "grad_norm": 15.03188705444336, "learning_rate": 2.822915840670289e-05, "loss": 1.0163, "step": 14087 }, { "epoch": 2.4110901933938047, "grad_norm": 2.519787549972534, "learning_rate": 2.822372368489542e-05, "loss": 0.2314, "step": 14088 }, { "epoch": 2.4112613383535857, "grad_norm": 11.502283096313477, "learning_rate": 2.8218281161071262e-05, "loss": 0.7401, "step": 14089 }, { "epoch": 2.4114324833133662, "grad_norm": 15.430981636047363, "learning_rate": 2.8212830838441547e-05, "loss": 1.1554, "step": 14090 }, { "epoch": 2.4116036282731472, "grad_norm": 9.724127769470215, "learning_rate": 2.8207372720221944e-05, "loss": 0.845, "step": 14091 }, { "epoch": 2.411774773232928, "grad_norm": 11.749658584594727, "learning_rate": 2.8201906809632775e-05, "loss": 1.0637, "step": 14092 }, { "epoch": 2.411945918192709, "grad_norm": 13.096307754516602, "learning_rate": 2.8196433109898917e-05, "loss": 1.0235, "step": 14093 }, { "epoch": 2.41211706315249, "grad_norm": 14.948675155639648, "learning_rate": 
2.8190951624249876e-05, "loss": 1.7195, "step": 14094 }, { "epoch": 2.412288208112271, "grad_norm": 17.108829498291016, "learning_rate": 2.818546235591972e-05, "loss": 1.7854, "step": 14095 }, { "epoch": 2.412459353072052, "grad_norm": 11.196515083312988, "learning_rate": 2.8179965308147136e-05, "loss": 0.8086, "step": 14096 }, { "epoch": 2.4126304980318327, "grad_norm": 13.273808479309082, "learning_rate": 2.817446048417539e-05, "loss": 0.8916, "step": 14097 }, { "epoch": 2.4128016429916137, "grad_norm": 17.198759078979492, "learning_rate": 2.816894788725234e-05, "loss": 1.7191, "step": 14098 }, { "epoch": 2.4129727879513947, "grad_norm": 12.244705200195312, "learning_rate": 2.8163427520630427e-05, "loss": 0.7319, "step": 14099 }, { "epoch": 2.4131439329111757, "grad_norm": 12.265054702758789, "learning_rate": 2.8157899387566658e-05, "loss": 0.9313, "step": 14100 }, { "epoch": 2.4133150778709567, "grad_norm": 17.702238082885742, "learning_rate": 2.815236349132265e-05, "loss": 1.7688, "step": 14101 }, { "epoch": 2.4134862228307377, "grad_norm": 9.178650856018066, "learning_rate": 2.8146819835164608e-05, "loss": 0.632, "step": 14102 }, { "epoch": 2.4136573677905186, "grad_norm": 57.62971496582031, "learning_rate": 2.8141268422363283e-05, "loss": 7.8967, "step": 14103 }, { "epoch": 2.4138285127502996, "grad_norm": 13.328060150146484, "learning_rate": 2.8135709256194e-05, "loss": 1.0077, "step": 14104 }, { "epoch": 2.4139996577100806, "grad_norm": 0.4923412799835205, "learning_rate": 2.8130142339936692e-05, "loss": 0.105, "step": 14105 }, { "epoch": 2.4141708026698616, "grad_norm": 12.535155296325684, "learning_rate": 2.8124567676875854e-05, "loss": 0.6859, "step": 14106 }, { "epoch": 2.414341947629642, "grad_norm": 0.9592524766921997, "learning_rate": 2.811898527030054e-05, "loss": 0.1665, "step": 14107 }, { "epoch": 2.414513092589423, "grad_norm": 0.4188377857208252, "learning_rate": 2.811339512350437e-05, "loss": 0.0973, "step": 14108 }, { "epoch": 
2.414684237549204, "grad_norm": 7.833198547363281, "learning_rate": 2.810779723978554e-05, "loss": 0.7152, "step": 14109 }, { "epoch": 2.414855382508985, "grad_norm": 18.741771697998047, "learning_rate": 2.8102191622446825e-05, "loss": 1.8095, "step": 14110 }, { "epoch": 2.415026527468766, "grad_norm": 8.305549621582031, "learning_rate": 2.8096578274795545e-05, "loss": 0.5357, "step": 14111 }, { "epoch": 2.415197672428547, "grad_norm": 5.178909778594971, "learning_rate": 2.809095720014356e-05, "loss": 0.4332, "step": 14112 }, { "epoch": 2.415368817388328, "grad_norm": 22.977750778198242, "learning_rate": 2.8085328401807328e-05, "loss": 1.628, "step": 14113 }, { "epoch": 2.4155399623481086, "grad_norm": 9.666526794433594, "learning_rate": 2.807969188310786e-05, "loss": 0.8849, "step": 14114 }, { "epoch": 2.4157111073078896, "grad_norm": 26.50626564025879, "learning_rate": 2.80740476473707e-05, "loss": 5.3255, "step": 14115 }, { "epoch": 2.4158822522676706, "grad_norm": 6.024598121643066, "learning_rate": 2.8068395697925946e-05, "loss": 0.4889, "step": 14116 }, { "epoch": 2.4160533972274516, "grad_norm": 12.69047737121582, "learning_rate": 2.8062736038108263e-05, "loss": 0.8225, "step": 14117 }, { "epoch": 2.4162245421872326, "grad_norm": 15.371095657348633, "learning_rate": 2.8057068671256862e-05, "loss": 1.1557, "step": 14118 }, { "epoch": 2.4163956871470136, "grad_norm": 25.02386474609375, "learning_rate": 2.8051393600715507e-05, "loss": 5.1982, "step": 14119 }, { "epoch": 2.4165668321067946, "grad_norm": 10.767854690551758, "learning_rate": 2.804571082983248e-05, "loss": 0.9169, "step": 14120 }, { "epoch": 2.4167379770665756, "grad_norm": 15.424456596374512, "learning_rate": 2.804002036196064e-05, "loss": 1.2275, "step": 14121 }, { "epoch": 2.4169091220263565, "grad_norm": 15.442620277404785, "learning_rate": 2.803432220045735e-05, "loss": 1.1497, "step": 14122 }, { "epoch": 2.417080266986137, "grad_norm": 15.694896697998047, "learning_rate": 
2.802861634868456e-05, "loss": 1.2342, "step": 14123 }, { "epoch": 2.417251411945918, "grad_norm": 15.724752426147461, "learning_rate": 2.8022902810008715e-05, "loss": 1.2421, "step": 14124 }, { "epoch": 2.417422556905699, "grad_norm": 13.600316047668457, "learning_rate": 2.801718158780082e-05, "loss": 1.4829, "step": 14125 }, { "epoch": 2.41759370186548, "grad_norm": 16.973175048828125, "learning_rate": 2.801145268543639e-05, "loss": 0.91, "step": 14126 }, { "epoch": 2.417764846825261, "grad_norm": 13.525860786437988, "learning_rate": 2.8005716106295502e-05, "loss": 0.7601, "step": 14127 }, { "epoch": 2.417935991785042, "grad_norm": 13.257013320922852, "learning_rate": 2.7999971853762733e-05, "loss": 0.9521, "step": 14128 }, { "epoch": 2.418107136744823, "grad_norm": 13.706380844116211, "learning_rate": 2.7994219931227218e-05, "loss": 1.3078, "step": 14129 }, { "epoch": 2.4182782817046036, "grad_norm": 6.726744651794434, "learning_rate": 2.7988460342082582e-05, "loss": 0.4547, "step": 14130 }, { "epoch": 2.4184494266643846, "grad_norm": 9.860395431518555, "learning_rate": 2.7982693089727003e-05, "loss": 0.7292, "step": 14131 }, { "epoch": 2.4186205716241655, "grad_norm": 11.905856132507324, "learning_rate": 2.7976918177563157e-05, "loss": 0.6807, "step": 14132 }, { "epoch": 2.4187917165839465, "grad_norm": 15.168338775634766, "learning_rate": 2.7971135608998267e-05, "loss": 1.1277, "step": 14133 }, { "epoch": 2.4189628615437275, "grad_norm": 11.142333984375, "learning_rate": 2.7965345387444035e-05, "loss": 0.8123, "step": 14134 }, { "epoch": 2.4191340065035085, "grad_norm": 34.28571319580078, "learning_rate": 2.795954751631673e-05, "loss": 5.9432, "step": 14135 }, { "epoch": 2.4193051514632895, "grad_norm": 10.36127758026123, "learning_rate": 2.7953741999037074e-05, "loss": 0.8222, "step": 14136 }, { "epoch": 2.4194762964230705, "grad_norm": 16.674129486083984, "learning_rate": 2.794792883903035e-05, "loss": 1.2083, "step": 14137 }, { "epoch": 2.4196474413828515, 
"grad_norm": 13.073434829711914, "learning_rate": 2.794210803972632e-05, "loss": 0.983, "step": 14138 }, { "epoch": 2.419818586342632, "grad_norm": 0.5108442902565002, "learning_rate": 2.7936279604559268e-05, "loss": 0.1062, "step": 14139 }, { "epoch": 2.419989731302413, "grad_norm": 8.997053146362305, "learning_rate": 2.7930443536967992e-05, "loss": 0.6514, "step": 14140 }, { "epoch": 2.420160876262194, "grad_norm": 2.0622286796569824, "learning_rate": 2.792459984039576e-05, "loss": 0.2063, "step": 14141 }, { "epoch": 2.420332021221975, "grad_norm": 12.403244972229004, "learning_rate": 2.7918748518290372e-05, "loss": 0.9169, "step": 14142 }, { "epoch": 2.420503166181756, "grad_norm": 15.700401306152344, "learning_rate": 2.7912889574104127e-05, "loss": 1.2051, "step": 14143 }, { "epoch": 2.420674311141537, "grad_norm": 11.183328628540039, "learning_rate": 2.7907023011293797e-05, "loss": 0.7429, "step": 14144 }, { "epoch": 2.420845456101318, "grad_norm": 18.298290252685547, "learning_rate": 2.790114883332066e-05, "loss": 1.5347, "step": 14145 }, { "epoch": 2.4210166010610985, "grad_norm": 8.242451667785645, "learning_rate": 2.789526704365049e-05, "loss": 0.5602, "step": 14146 }, { "epoch": 2.4211877460208795, "grad_norm": 11.75820541381836, "learning_rate": 2.7889377645753573e-05, "loss": 0.8419, "step": 14147 }, { "epoch": 2.4213588909806605, "grad_norm": 1.8395276069641113, "learning_rate": 2.7883480643104643e-05, "loss": 0.2157, "step": 14148 }, { "epoch": 2.4215300359404415, "grad_norm": 8.093130111694336, "learning_rate": 2.7877576039182934e-05, "loss": 0.5057, "step": 14149 }, { "epoch": 2.4217011809002225, "grad_norm": 16.15241241455078, "learning_rate": 2.7871663837472183e-05, "loss": 1.0997, "step": 14150 }, { "epoch": 2.4218723258600034, "grad_norm": 11.44933032989502, "learning_rate": 2.786574404146061e-05, "loss": 0.7904, "step": 14151 }, { "epoch": 2.4220434708197844, "grad_norm": 3.160348415374756, "learning_rate": 2.7859816654640883e-05, "loss": 
0.3837, "step": 14152 }, { "epoch": 2.4222146157795654, "grad_norm": 29.739276885986328, "learning_rate": 2.785388168051017e-05, "loss": 5.7877, "step": 14153 }, { "epoch": 2.4223857607393464, "grad_norm": 1.222548007965088, "learning_rate": 2.7847939122570114e-05, "loss": 0.2011, "step": 14154 }, { "epoch": 2.4225569056991274, "grad_norm": 4.669407367706299, "learning_rate": 2.7841988984326857e-05, "loss": 0.4832, "step": 14155 }, { "epoch": 2.422728050658908, "grad_norm": 3.892198324203491, "learning_rate": 2.783603126929097e-05, "loss": 0.3338, "step": 14156 }, { "epoch": 2.422899195618689, "grad_norm": 12.628219604492188, "learning_rate": 2.78300659809775e-05, "loss": 0.907, "step": 14157 }, { "epoch": 2.42307034057847, "grad_norm": 58.39409255981445, "learning_rate": 2.7824093122905993e-05, "loss": 8.012, "step": 14158 }, { "epoch": 2.423241485538251, "grad_norm": 18.517927169799805, "learning_rate": 2.7818112698600456e-05, "loss": 1.4316, "step": 14159 }, { "epoch": 2.423412630498032, "grad_norm": 4.349628448486328, "learning_rate": 2.7812124711589323e-05, "loss": 0.3638, "step": 14160 }, { "epoch": 2.423583775457813, "grad_norm": 0.5906945466995239, "learning_rate": 2.780612916540553e-05, "loss": 0.1047, "step": 14161 }, { "epoch": 2.423754920417594, "grad_norm": 13.096047401428223, "learning_rate": 2.780012606358646e-05, "loss": 0.9738, "step": 14162 }, { "epoch": 2.4239260653773744, "grad_norm": 6.467690944671631, "learning_rate": 2.7794115409673936e-05, "loss": 0.5262, "step": 14163 }, { "epoch": 2.4240972103371554, "grad_norm": 20.008193969726562, "learning_rate": 2.778809720721428e-05, "loss": 2.2389, "step": 14164 }, { "epoch": 2.4242683552969364, "grad_norm": 0.5166319608688354, "learning_rate": 2.7782071459758208e-05, "loss": 0.1091, "step": 14165 }, { "epoch": 2.4244395002567174, "grad_norm": 14.947403907775879, "learning_rate": 2.7776038170860952e-05, "loss": 1.1171, "step": 14166 }, { "epoch": 2.4246106452164984, "grad_norm": 15.35404109954834, 
"learning_rate": 2.7769997344082136e-05, "loss": 1.0461, "step": 14167 }, { "epoch": 2.4247817901762794, "grad_norm": 2.217679738998413, "learning_rate": 2.7763948982985874e-05, "loss": 0.2003, "step": 14168 }, { "epoch": 2.4249529351360604, "grad_norm": 18.515127182006836, "learning_rate": 2.775789309114069e-05, "loss": 1.4398, "step": 14169 }, { "epoch": 2.4251240800958414, "grad_norm": 12.891633033752441, "learning_rate": 2.775182967211959e-05, "loss": 0.9744, "step": 14170 }, { "epoch": 2.4252952250556223, "grad_norm": 13.591495513916016, "learning_rate": 2.774575872949998e-05, "loss": 0.9816, "step": 14171 }, { "epoch": 2.425466370015403, "grad_norm": 6.287905216217041, "learning_rate": 2.773968026686375e-05, "loss": 0.4694, "step": 14172 }, { "epoch": 2.425637514975184, "grad_norm": 11.304561614990234, "learning_rate": 2.773359428779717e-05, "loss": 0.8278, "step": 14173 }, { "epoch": 2.425808659934965, "grad_norm": 9.930416107177734, "learning_rate": 2.772750079589101e-05, "loss": 0.7421, "step": 14174 }, { "epoch": 2.425979804894746, "grad_norm": 0.8840548992156982, "learning_rate": 2.7721399794740412e-05, "loss": 0.1192, "step": 14175 }, { "epoch": 2.426150949854527, "grad_norm": 12.452665328979492, "learning_rate": 2.7715291287944996e-05, "loss": 0.8006, "step": 14176 }, { "epoch": 2.426322094814308, "grad_norm": 14.232076644897461, "learning_rate": 2.770917527910877e-05, "loss": 1.4419, "step": 14177 }, { "epoch": 2.426493239774089, "grad_norm": 11.725752830505371, "learning_rate": 2.770305177184021e-05, "loss": 0.9189, "step": 14178 }, { "epoch": 2.4266643847338694, "grad_norm": 59.20233917236328, "learning_rate": 2.7696920769752176e-05, "loss": 7.5354, "step": 14179 }, { "epoch": 2.4268355296936504, "grad_norm": 4.807818412780762, "learning_rate": 2.7690782276461983e-05, "loss": 0.2466, "step": 14180 }, { "epoch": 2.4270066746534313, "grad_norm": 17.006162643432617, "learning_rate": 2.768463629559134e-05, "loss": 1.1105, "step": 14181 }, { "epoch": 
2.4271778196132123, "grad_norm": 14.36371898651123, "learning_rate": 2.7678482830766384e-05, "loss": 1.285, "step": 14182 }, { "epoch": 2.4273489645729933, "grad_norm": 18.02627182006836, "learning_rate": 2.7672321885617673e-05, "loss": 2.158, "step": 14183 }, { "epoch": 2.4275201095327743, "grad_norm": 0.40289387106895447, "learning_rate": 2.766615346378019e-05, "loss": 0.1075, "step": 14184 }, { "epoch": 2.4276912544925553, "grad_norm": 39.97178268432617, "learning_rate": 2.7659977568893294e-05, "loss": 7.0111, "step": 14185 }, { "epoch": 2.4278623994523363, "grad_norm": 17.540721893310547, "learning_rate": 2.7653794204600764e-05, "loss": 1.221, "step": 14186 }, { "epoch": 2.4280335444121173, "grad_norm": 21.037158966064453, "learning_rate": 2.764760337455081e-05, "loss": 2.1096, "step": 14187 }, { "epoch": 2.428204689371898, "grad_norm": 5.504323482513428, "learning_rate": 2.7641405082396038e-05, "loss": 0.3258, "step": 14188 }, { "epoch": 2.428375834331679, "grad_norm": 7.2169365882873535, "learning_rate": 2.7635199331793437e-05, "loss": 0.4337, "step": 14189 }, { "epoch": 2.42854697929146, "grad_norm": 15.251007080078125, "learning_rate": 2.7628986126404398e-05, "loss": 1.8252, "step": 14190 }, { "epoch": 2.428718124251241, "grad_norm": 5.62943172454834, "learning_rate": 2.7622765469894733e-05, "loss": 0.439, "step": 14191 }, { "epoch": 2.4288892692110218, "grad_norm": 14.855351448059082, "learning_rate": 2.7616537365934652e-05, "loss": 1.3852, "step": 14192 }, { "epoch": 2.4290604141708028, "grad_norm": 16.5765438079834, "learning_rate": 2.7610301818198738e-05, "loss": 1.6256, "step": 14193 }, { "epoch": 2.4292315591305838, "grad_norm": 7.607579708099365, "learning_rate": 2.7604058830365952e-05, "loss": 0.467, "step": 14194 }, { "epoch": 2.4294027040903643, "grad_norm": 20.640003204345703, "learning_rate": 2.7597808406119685e-05, "loss": 1.5033, "step": 14195 }, { "epoch": 2.4295738490501453, "grad_norm": 4.43795919418335, "learning_rate": 
2.7591550549147704e-05, "loss": 0.2981, "step": 14196 }, { "epoch": 2.4297449940099263, "grad_norm": 13.51389217376709, "learning_rate": 2.758528526314215e-05, "loss": 0.888, "step": 14197 }, { "epoch": 2.4299161389697073, "grad_norm": 27.27247428894043, "learning_rate": 2.7579012551799526e-05, "loss": 5.2371, "step": 14198 }, { "epoch": 2.4300872839294883, "grad_norm": 10.629762649536133, "learning_rate": 2.757273241882077e-05, "loss": 0.5826, "step": 14199 }, { "epoch": 2.4302584288892692, "grad_norm": 8.212379455566406, "learning_rate": 2.7566444867911165e-05, "loss": 0.5468, "step": 14200 }, { "epoch": 2.4304295738490502, "grad_norm": 13.139976501464844, "learning_rate": 2.756014990278037e-05, "loss": 0.8328, "step": 14201 }, { "epoch": 2.430600718808831, "grad_norm": 18.45878028869629, "learning_rate": 2.7553847527142416e-05, "loss": 1.9908, "step": 14202 }, { "epoch": 2.430771863768612, "grad_norm": 17.66438865661621, "learning_rate": 2.7547537744715722e-05, "loss": 1.1384, "step": 14203 }, { "epoch": 2.430943008728393, "grad_norm": 15.118637084960938, "learning_rate": 2.7541220559223062e-05, "loss": 0.9096, "step": 14204 }, { "epoch": 2.4311141536881737, "grad_norm": 4.560638904571533, "learning_rate": 2.7534895974391614e-05, "loss": 0.2257, "step": 14205 }, { "epoch": 2.4312852986479547, "grad_norm": 12.974631309509277, "learning_rate": 2.752856399395286e-05, "loss": 1.0523, "step": 14206 }, { "epoch": 2.4314564436077357, "grad_norm": 13.394937515258789, "learning_rate": 2.7522224621642692e-05, "loss": 0.6995, "step": 14207 }, { "epoch": 2.4316275885675167, "grad_norm": 0.41368749737739563, "learning_rate": 2.751587786120134e-05, "loss": 0.0998, "step": 14208 }, { "epoch": 2.4317987335272977, "grad_norm": 15.924103736877441, "learning_rate": 2.750952371637342e-05, "loss": 1.7624, "step": 14209 }, { "epoch": 2.4319698784870787, "grad_norm": 10.429113388061523, "learning_rate": 2.750316219090786e-05, "loss": 0.5733, "step": 14210 }, { "epoch": 
2.4321410234468597, "grad_norm": 16.58104705810547, "learning_rate": 2.7496793288558e-05, "loss": 1.5759, "step": 14211 }, { "epoch": 2.43231216840664, "grad_norm": 11.191537857055664, "learning_rate": 2.7490417013081472e-05, "loss": 0.8365, "step": 14212 }, { "epoch": 2.432483313366421, "grad_norm": 3.4741806983947754, "learning_rate": 2.7484033368240316e-05, "loss": 0.2235, "step": 14213 }, { "epoch": 2.432654458326202, "grad_norm": 17.41012191772461, "learning_rate": 2.7477642357800863e-05, "loss": 1.5105, "step": 14214 }, { "epoch": 2.432825603285983, "grad_norm": 3.299332618713379, "learning_rate": 2.747124398553385e-05, "loss": 0.2887, "step": 14215 }, { "epoch": 2.432996748245764, "grad_norm": 2.036177158355713, "learning_rate": 2.7464838255214296e-05, "loss": 0.2032, "step": 14216 }, { "epoch": 2.433167893205545, "grad_norm": 0.687885046005249, "learning_rate": 2.7458425170621617e-05, "loss": 0.105, "step": 14217 }, { "epoch": 2.433339038165326, "grad_norm": 12.940757751464844, "learning_rate": 2.745200473553952e-05, "loss": 1.1401, "step": 14218 }, { "epoch": 2.433510183125107, "grad_norm": 16.283721923828125, "learning_rate": 2.7445576953756095e-05, "loss": 1.3229, "step": 14219 }, { "epoch": 2.433681328084888, "grad_norm": 14.680545806884766, "learning_rate": 2.7439141829063718e-05, "loss": 0.9548, "step": 14220 }, { "epoch": 2.4338524730446687, "grad_norm": 18.31490707397461, "learning_rate": 2.7432699365259143e-05, "loss": 1.5903, "step": 14221 }, { "epoch": 2.4340236180044497, "grad_norm": 15.488758087158203, "learning_rate": 2.742624956614341e-05, "loss": 1.3972, "step": 14222 }, { "epoch": 2.4341947629642307, "grad_norm": 12.587503433227539, "learning_rate": 2.7419792435521942e-05, "loss": 0.9295, "step": 14223 }, { "epoch": 2.4343659079240116, "grad_norm": 17.967069625854492, "learning_rate": 2.7413327977204413e-05, "loss": 1.4736, "step": 14224 }, { "epoch": 2.4345370528837926, "grad_norm": 17.53647804260254, "learning_rate": 
2.7406856195004914e-05, "loss": 1.3355, "step": 14225 }, { "epoch": 2.4347081978435736, "grad_norm": 75.30220031738281, "learning_rate": 2.740037709274178e-05, "loss": 8.5803, "step": 14226 }, { "epoch": 2.4348793428033546, "grad_norm": 10.546639442443848, "learning_rate": 2.739389067423768e-05, "loss": 0.8379, "step": 14227 }, { "epoch": 2.435050487763135, "grad_norm": 6.172989845275879, "learning_rate": 2.7387396943319618e-05, "loss": 0.5636, "step": 14228 }, { "epoch": 2.435221632722916, "grad_norm": 1.886457085609436, "learning_rate": 2.7380895903818927e-05, "loss": 0.1966, "step": 14229 }, { "epoch": 2.435392777682697, "grad_norm": 25.003862380981445, "learning_rate": 2.7374387559571206e-05, "loss": 4.9902, "step": 14230 }, { "epoch": 2.435563922642478, "grad_norm": 1.6909300088882446, "learning_rate": 2.7367871914416383e-05, "loss": 0.1737, "step": 14231 }, { "epoch": 2.435735067602259, "grad_norm": 19.198863983154297, "learning_rate": 2.736134897219871e-05, "loss": 2.2457, "step": 14232 }, { "epoch": 2.43590621256204, "grad_norm": 2.4915549755096436, "learning_rate": 2.7354818736766747e-05, "loss": 0.2523, "step": 14233 }, { "epoch": 2.436077357521821, "grad_norm": 0.35343417525291443, "learning_rate": 2.7348281211973324e-05, "loss": 0.0984, "step": 14234 }, { "epoch": 2.436248502481602, "grad_norm": 16.052297592163086, "learning_rate": 2.7341736401675578e-05, "loss": 1.0875, "step": 14235 }, { "epoch": 2.436419647441383, "grad_norm": 7.68033504486084, "learning_rate": 2.733518430973498e-05, "loss": 0.6867, "step": 14236 }, { "epoch": 2.4365907924011636, "grad_norm": 12.109743118286133, "learning_rate": 2.7328624940017273e-05, "loss": 0.863, "step": 14237 }, { "epoch": 2.4367619373609446, "grad_norm": 9.602211952209473, "learning_rate": 2.732205829639249e-05, "loss": 0.6855, "step": 14238 }, { "epoch": 2.4369330823207256, "grad_norm": 12.405147552490234, "learning_rate": 2.731548438273495e-05, "loss": 0.7994, "step": 14239 }, { "epoch": 2.4371042272805066, 
"grad_norm": 11.461498260498047, "learning_rate": 2.7308903202923277e-05, "loss": 0.8237, "step": 14240 }, { "epoch": 2.4372753722402876, "grad_norm": 15.221641540527344, "learning_rate": 2.7302314760840392e-05, "loss": 1.1217, "step": 14241 }, { "epoch": 2.4374465172000686, "grad_norm": 18.321094512939453, "learning_rate": 2.7295719060373475e-05, "loss": 1.9224, "step": 14242 }, { "epoch": 2.4376176621598495, "grad_norm": 79.25579833984375, "learning_rate": 2.728911610541399e-05, "loss": 8.0167, "step": 14243 }, { "epoch": 2.43778880711963, "grad_norm": 12.199490547180176, "learning_rate": 2.728250589985769e-05, "loss": 0.8383, "step": 14244 }, { "epoch": 2.437959952079411, "grad_norm": 14.084924697875977, "learning_rate": 2.7275888447604635e-05, "loss": 1.0837, "step": 14245 }, { "epoch": 2.438131097039192, "grad_norm": 2.046492576599121, "learning_rate": 2.7269263752559102e-05, "loss": 0.2122, "step": 14246 }, { "epoch": 2.438302241998973, "grad_norm": 16.41877555847168, "learning_rate": 2.726263181862967e-05, "loss": 1.4978, "step": 14247 }, { "epoch": 2.438473386958754, "grad_norm": 16.61231803894043, "learning_rate": 2.7255992649729222e-05, "loss": 1.2531, "step": 14248 }, { "epoch": 2.438644531918535, "grad_norm": 6.9494147300720215, "learning_rate": 2.724934624977484e-05, "loss": 0.5818, "step": 14249 }, { "epoch": 2.438815676878316, "grad_norm": 13.305373191833496, "learning_rate": 2.7242692622687934e-05, "loss": 0.9515, "step": 14250 }, { "epoch": 2.438986821838097, "grad_norm": 14.454103469848633, "learning_rate": 2.723603177239414e-05, "loss": 1.1783, "step": 14251 }, { "epoch": 2.439157966797878, "grad_norm": 17.140716552734375, "learning_rate": 2.7229363702823385e-05, "loss": 1.2731, "step": 14252 }, { "epoch": 2.439329111757659, "grad_norm": 0.33619147539138794, "learning_rate": 2.7222688417909817e-05, "loss": 0.0902, "step": 14253 }, { "epoch": 2.4395002567174395, "grad_norm": 10.22806453704834, "learning_rate": 2.7216005921591886e-05, "loss": 
0.6436, "step": 14254 }, { "epoch": 2.4396714016772205, "grad_norm": 15.783626556396484, "learning_rate": 2.720931621781226e-05, "loss": 0.9557, "step": 14255 }, { "epoch": 2.4398425466370015, "grad_norm": 0.6908251643180847, "learning_rate": 2.7202619310517892e-05, "loss": 0.1162, "step": 14256 }, { "epoch": 2.4400136915967825, "grad_norm": 0.7647294402122498, "learning_rate": 2.719591520365994e-05, "loss": 0.1574, "step": 14257 }, { "epoch": 2.4401848365565635, "grad_norm": 21.834888458251953, "learning_rate": 2.7189203901193866e-05, "loss": 4.84, "step": 14258 }, { "epoch": 2.4403559815163445, "grad_norm": 12.288422584533691, "learning_rate": 2.7182485407079323e-05, "loss": 0.7876, "step": 14259 }, { "epoch": 2.4405271264761255, "grad_norm": 12.00680923461914, "learning_rate": 2.7175759725280258e-05, "loss": 0.9172, "step": 14260 }, { "epoch": 2.440698271435906, "grad_norm": 12.766034126281738, "learning_rate": 2.7169026859764806e-05, "loss": 0.9225, "step": 14261 }, { "epoch": 2.440869416395687, "grad_norm": 5.024628639221191, "learning_rate": 2.7162286814505392e-05, "loss": 0.3094, "step": 14262 }, { "epoch": 2.441040561355468, "grad_norm": 12.24208927154541, "learning_rate": 2.7155539593478633e-05, "loss": 0.9574, "step": 14263 }, { "epoch": 2.441211706315249, "grad_norm": 19.38558578491211, "learning_rate": 2.7148785200665418e-05, "loss": 1.5054, "step": 14264 }, { "epoch": 2.44138285127503, "grad_norm": 0.9621898531913757, "learning_rate": 2.714202364005083e-05, "loss": 0.1684, "step": 14265 }, { "epoch": 2.441553996234811, "grad_norm": 9.641449928283691, "learning_rate": 2.713525491562422e-05, "loss": 0.6212, "step": 14266 }, { "epoch": 2.441725141194592, "grad_norm": 16.866167068481445, "learning_rate": 2.712847903137912e-05, "loss": 0.7798, "step": 14267 }, { "epoch": 2.441896286154373, "grad_norm": 1.8975228071212769, "learning_rate": 2.7121695991313332e-05, "loss": 0.1932, "step": 14268 }, { "epoch": 2.442067431114154, "grad_norm": 4.9077467918396, 
"learning_rate": 2.7114905799428853e-05, "loss": 0.3728, "step": 14269 }, { "epoch": 2.4422385760739345, "grad_norm": 22.115041732788086, "learning_rate": 2.7108108459731917e-05, "loss": 5.2076, "step": 14270 }, { "epoch": 2.4424097210337155, "grad_norm": 10.398249626159668, "learning_rate": 2.710130397623296e-05, "loss": 0.7509, "step": 14271 }, { "epoch": 2.4425808659934964, "grad_norm": 24.33930015563965, "learning_rate": 2.7094492352946612e-05, "loss": 5.1722, "step": 14272 }, { "epoch": 2.4427520109532774, "grad_norm": 17.45279884338379, "learning_rate": 2.708767359389177e-05, "loss": 1.066, "step": 14273 }, { "epoch": 2.4429231559130584, "grad_norm": 13.824460983276367, "learning_rate": 2.708084770309151e-05, "loss": 1.1491, "step": 14274 }, { "epoch": 2.4430943008728394, "grad_norm": 0.45958346128463745, "learning_rate": 2.7074014684573116e-05, "loss": 0.1006, "step": 14275 }, { "epoch": 2.4432654458326204, "grad_norm": 5.739128589630127, "learning_rate": 2.7067174542368064e-05, "loss": 0.4006, "step": 14276 }, { "epoch": 2.443436590792401, "grad_norm": 0.35255980491638184, "learning_rate": 2.706032728051205e-05, "loss": 0.0974, "step": 14277 }, { "epoch": 2.443607735752182, "grad_norm": 0.5631693005561829, "learning_rate": 2.7053472903044994e-05, "loss": 0.1067, "step": 14278 }, { "epoch": 2.443778880711963, "grad_norm": 12.854817390441895, "learning_rate": 2.7046611414010975e-05, "loss": 0.8943, "step": 14279 }, { "epoch": 2.443950025671744, "grad_norm": 2.9958572387695312, "learning_rate": 2.7039742817458263e-05, "loss": 0.2071, "step": 14280 }, { "epoch": 2.444121170631525, "grad_norm": 9.51891040802002, "learning_rate": 2.7032867117439356e-05, "loss": 0.7809, "step": 14281 }, { "epoch": 2.444292315591306, "grad_norm": 47.94428253173828, "learning_rate": 2.7025984318010942e-05, "loss": 7.2773, "step": 14282 }, { "epoch": 2.444463460551087, "grad_norm": 9.55189037322998, "learning_rate": 2.701909442323386e-05, "loss": 0.821, "step": 14283 }, { "epoch": 
2.444634605510868, "grad_norm": 9.44992446899414, "learning_rate": 2.7012197437173163e-05, "loss": 0.7286, "step": 14284 }, { "epoch": 2.444805750470649, "grad_norm": 9.707310676574707, "learning_rate": 2.7005293363898085e-05, "loss": 0.6524, "step": 14285 }, { "epoch": 2.4449768954304294, "grad_norm": 6.030359268188477, "learning_rate": 2.6998382207482048e-05, "loss": 0.4065, "step": 14286 }, { "epoch": 2.4451480403902104, "grad_norm": 12.310891151428223, "learning_rate": 2.6991463972002643e-05, "loss": 0.821, "step": 14287 }, { "epoch": 2.4453191853499914, "grad_norm": 5.576077938079834, "learning_rate": 2.698453866154162e-05, "loss": 0.5959, "step": 14288 }, { "epoch": 2.4454903303097724, "grad_norm": 7.223038673400879, "learning_rate": 2.6977606280184937e-05, "loss": 0.5781, "step": 14289 }, { "epoch": 2.4456614752695534, "grad_norm": 12.37451457977295, "learning_rate": 2.6970666832022706e-05, "loss": 0.7596, "step": 14290 }, { "epoch": 2.4458326202293343, "grad_norm": 3.223525047302246, "learning_rate": 2.696372032114923e-05, "loss": 0.1922, "step": 14291 }, { "epoch": 2.4460037651891153, "grad_norm": 9.237861633300781, "learning_rate": 2.695676675166293e-05, "loss": 0.7651, "step": 14292 }, { "epoch": 2.446174910148896, "grad_norm": 14.784784317016602, "learning_rate": 2.694980612766645e-05, "loss": 1.3485, "step": 14293 }, { "epoch": 2.446346055108677, "grad_norm": 20.30532455444336, "learning_rate": 2.694283845326654e-05, "loss": 1.4246, "step": 14294 }, { "epoch": 2.446517200068458, "grad_norm": 0.5101381540298462, "learning_rate": 2.6935863732574174e-05, "loss": 0.0943, "step": 14295 }, { "epoch": 2.446688345028239, "grad_norm": 14.033026695251465, "learning_rate": 2.692888196970441e-05, "loss": 0.9655, "step": 14296 }, { "epoch": 2.44685948998802, "grad_norm": 1.429619312286377, "learning_rate": 2.6921893168776534e-05, "loss": 0.1804, "step": 14297 }, { "epoch": 2.447030634947801, "grad_norm": 72.43138885498047, "learning_rate": 2.6914897333913914e-05, 
"loss": 6.9341, "step": 14298 }, { "epoch": 2.447201779907582, "grad_norm": 7.9695563316345215, "learning_rate": 2.6907894469244134e-05, "loss": 0.6044, "step": 14299 }, { "epoch": 2.447372924867363, "grad_norm": 16.899761199951172, "learning_rate": 2.690088457889887e-05, "loss": 1.2036, "step": 14300 }, { "epoch": 2.447544069827144, "grad_norm": 2.584798812866211, "learning_rate": 2.6893867667013987e-05, "loss": 0.2038, "step": 14301 }, { "epoch": 2.4477152147869243, "grad_norm": 8.224316596984863, "learning_rate": 2.6886843737729453e-05, "loss": 0.5771, "step": 14302 }, { "epoch": 2.4478863597467053, "grad_norm": 15.516573905944824, "learning_rate": 2.6879812795189424e-05, "loss": 1.2771, "step": 14303 }, { "epoch": 2.4480575047064863, "grad_norm": 17.428138732910156, "learning_rate": 2.687277484354214e-05, "loss": 1.7909, "step": 14304 }, { "epoch": 2.4482286496662673, "grad_norm": 1.195208191871643, "learning_rate": 2.6865729886940022e-05, "loss": 0.1671, "step": 14305 }, { "epoch": 2.4483997946260483, "grad_norm": 8.907599449157715, "learning_rate": 2.685867792953959e-05, "loss": 0.5945, "step": 14306 }, { "epoch": 2.4485709395858293, "grad_norm": 10.895401954650879, "learning_rate": 2.6851618975501528e-05, "loss": 0.7454, "step": 14307 }, { "epoch": 2.4487420845456103, "grad_norm": 14.817546844482422, "learning_rate": 2.6844553028990602e-05, "loss": 0.9737, "step": 14308 }, { "epoch": 2.4489132295053913, "grad_norm": 8.262884140014648, "learning_rate": 2.6837480094175763e-05, "loss": 0.4916, "step": 14309 }, { "epoch": 2.449084374465172, "grad_norm": 79.84688568115234, "learning_rate": 2.683040017523001e-05, "loss": 9.0646, "step": 14310 }, { "epoch": 2.449255519424953, "grad_norm": 7.583674430847168, "learning_rate": 2.6823313276330557e-05, "loss": 0.4255, "step": 14311 }, { "epoch": 2.4494266643847338, "grad_norm": 10.880108833312988, "learning_rate": 2.6816219401658664e-05, "loss": 0.8368, "step": 14312 }, { "epoch": 2.4495978093445148, "grad_norm": 
14.518631935119629, "learning_rate": 2.680911855539971e-05, "loss": 1.0049, "step": 14313 }, { "epoch": 2.4497689543042958, "grad_norm": 26.180015563964844, "learning_rate": 2.6802010741743214e-05, "loss": 5.1965, "step": 14314 }, { "epoch": 2.4499400992640767, "grad_norm": 10.16732406616211, "learning_rate": 2.679489596488282e-05, "loss": 0.7546, "step": 14315 }, { "epoch": 2.4501112442238577, "grad_norm": 24.781461715698242, "learning_rate": 2.678777422901624e-05, "loss": 1.8569, "step": 14316 }, { "epoch": 2.4502823891836387, "grad_norm": 14.712135314941406, "learning_rate": 2.678064553834529e-05, "loss": 1.5454, "step": 14317 }, { "epoch": 2.4504535341434197, "grad_norm": 11.036540985107422, "learning_rate": 2.6773509897075933e-05, "loss": 0.6444, "step": 14318 }, { "epoch": 2.4506246791032003, "grad_norm": 2.6540367603302, "learning_rate": 2.6766367309418206e-05, "loss": 0.1925, "step": 14319 }, { "epoch": 2.4507958240629812, "grad_norm": 11.485629081726074, "learning_rate": 2.6759217779586244e-05, "loss": 0.8875, "step": 14320 }, { "epoch": 2.4509669690227622, "grad_norm": 22.38067054748535, "learning_rate": 2.6752061311798263e-05, "loss": 2.9125, "step": 14321 }, { "epoch": 2.451138113982543, "grad_norm": 4.864941120147705, "learning_rate": 2.6744897910276605e-05, "loss": 0.359, "step": 14322 }, { "epoch": 2.451309258942324, "grad_norm": 16.684341430664062, "learning_rate": 2.6737727579247696e-05, "loss": 1.749, "step": 14323 }, { "epoch": 2.451480403902105, "grad_norm": 3.515782356262207, "learning_rate": 2.673055032294204e-05, "loss": 0.2217, "step": 14324 }, { "epoch": 2.451651548861886, "grad_norm": 2.8836817741394043, "learning_rate": 2.67233661455942e-05, "loss": 0.2137, "step": 14325 }, { "epoch": 2.4518226938216667, "grad_norm": 1.9990898370742798, "learning_rate": 2.6716175051442877e-05, "loss": 0.1992, "step": 14326 }, { "epoch": 2.4519938387814477, "grad_norm": 16.09250831604004, "learning_rate": 2.6708977044730832e-05, "loss": 1.3021, "step": 
14327 }, { "epoch": 2.4521649837412287, "grad_norm": 16.23906898498535, "learning_rate": 2.6701772129704894e-05, "loss": 1.4574, "step": 14328 }, { "epoch": 2.4523361287010097, "grad_norm": 8.207794189453125, "learning_rate": 2.6694560310615958e-05, "loss": 0.7196, "step": 14329 }, { "epoch": 2.4525072736607907, "grad_norm": 1.1460460424423218, "learning_rate": 2.6687341591719016e-05, "loss": 0.176, "step": 14330 }, { "epoch": 2.4526784186205717, "grad_norm": 11.399039268493652, "learning_rate": 2.6680115977273147e-05, "loss": 0.7195, "step": 14331 }, { "epoch": 2.4528495635803527, "grad_norm": 1.4221856594085693, "learning_rate": 2.6672883471541436e-05, "loss": 0.1883, "step": 14332 }, { "epoch": 2.4530207085401337, "grad_norm": 9.839604377746582, "learning_rate": 2.666564407879109e-05, "loss": 0.6083, "step": 14333 }, { "epoch": 2.4531918534999146, "grad_norm": 40.16059112548828, "learning_rate": 2.6658397803293376e-05, "loss": 6.877, "step": 14334 }, { "epoch": 2.453362998459695, "grad_norm": 17.63550567626953, "learning_rate": 2.6651144649323582e-05, "loss": 1.8164, "step": 14335 }, { "epoch": 2.453534143419476, "grad_norm": 11.50011157989502, "learning_rate": 2.6643884621161102e-05, "loss": 0.799, "step": 14336 }, { "epoch": 2.453705288379257, "grad_norm": 22.261171340942383, "learning_rate": 2.6636617723089342e-05, "loss": 5.1523, "step": 14337 }, { "epoch": 2.453876433339038, "grad_norm": 9.185544967651367, "learning_rate": 2.6629343959395805e-05, "loss": 0.6906, "step": 14338 }, { "epoch": 2.454047578298819, "grad_norm": 0.5411891937255859, "learning_rate": 2.6622063334371995e-05, "loss": 0.1019, "step": 14339 }, { "epoch": 2.4542187232586, "grad_norm": 10.342248916625977, "learning_rate": 2.6614775852313523e-05, "loss": 0.7225, "step": 14340 }, { "epoch": 2.454389868218381, "grad_norm": 6.384377479553223, "learning_rate": 2.6607481517519984e-05, "loss": 0.5816, "step": 14341 }, { "epoch": 2.4545610131781617, "grad_norm": 20.26117706298828, "learning_rate": 
2.6600180334295073e-05, "loss": 2.5073, "step": 14342 }, { "epoch": 2.4547321581379427, "grad_norm": 11.082470893859863, "learning_rate": 2.6592872306946476e-05, "loss": 0.707, "step": 14343 }, { "epoch": 2.4549033030977236, "grad_norm": 23.301559448242188, "learning_rate": 2.658555743978596e-05, "loss": 5.116, "step": 14344 }, { "epoch": 2.4550744480575046, "grad_norm": 11.75782299041748, "learning_rate": 2.6578235737129292e-05, "loss": 0.8383, "step": 14345 }, { "epoch": 2.4552455930172856, "grad_norm": 0.6588224172592163, "learning_rate": 2.6570907203296306e-05, "loss": 0.1051, "step": 14346 }, { "epoch": 2.4554167379770666, "grad_norm": 10.16492748260498, "learning_rate": 2.6563571842610817e-05, "loss": 0.7598, "step": 14347 }, { "epoch": 2.4555878829368476, "grad_norm": 12.387700080871582, "learning_rate": 2.655622965940073e-05, "loss": 0.9873, "step": 14348 }, { "epoch": 2.4557590278966286, "grad_norm": 16.76618766784668, "learning_rate": 2.6548880657997922e-05, "loss": 1.9154, "step": 14349 }, { "epoch": 2.4559301728564096, "grad_norm": 20.862821578979492, "learning_rate": 2.654152484273834e-05, "loss": 2.5141, "step": 14350 }, { "epoch": 2.45610131781619, "grad_norm": 15.00993824005127, "learning_rate": 2.6534162217961893e-05, "loss": 1.0163, "step": 14351 }, { "epoch": 2.456272462775971, "grad_norm": 13.529190063476562, "learning_rate": 2.652679278801257e-05, "loss": 0.8556, "step": 14352 }, { "epoch": 2.456443607735752, "grad_norm": 7.6101579666137695, "learning_rate": 2.651941655723832e-05, "loss": 0.5763, "step": 14353 }, { "epoch": 2.456614752695533, "grad_norm": 18.150043487548828, "learning_rate": 2.6512033529991148e-05, "loss": 1.7316, "step": 14354 }, { "epoch": 2.456785897655314, "grad_norm": 22.605453491210938, "learning_rate": 2.650464371062704e-05, "loss": 5.2636, "step": 14355 }, { "epoch": 2.456957042615095, "grad_norm": 12.342305183410645, "learning_rate": 2.6497247103506023e-05, "loss": 1.053, "step": 14356 }, { "epoch": 2.457128187574876, 
"grad_norm": 11.815692901611328, "learning_rate": 2.6489843712992097e-05, "loss": 0.9082, "step": 14357 }, { "epoch": 2.457299332534657, "grad_norm": 5.671293258666992, "learning_rate": 2.6482433543453245e-05, "loss": 0.4845, "step": 14358 }, { "epoch": 2.4574704774944376, "grad_norm": 15.395242691040039, "learning_rate": 2.6475016599261502e-05, "loss": 1.1708, "step": 14359 }, { "epoch": 2.4576416224542186, "grad_norm": 2.4131014347076416, "learning_rate": 2.6467592884792892e-05, "loss": 0.1629, "step": 14360 }, { "epoch": 2.4578127674139996, "grad_norm": 0.3766633868217468, "learning_rate": 2.646016240442739e-05, "loss": 0.0983, "step": 14361 }, { "epoch": 2.4579839123737806, "grad_norm": 4.9578680992126465, "learning_rate": 2.6452725162548994e-05, "loss": 0.4456, "step": 14362 }, { "epoch": 2.4581550573335615, "grad_norm": 68.1369857788086, "learning_rate": 2.644528116354569e-05, "loss": 7.4022, "step": 14363 }, { "epoch": 2.4583262022933425, "grad_norm": 14.40063762664795, "learning_rate": 2.643783041180947e-05, "loss": 1.1981, "step": 14364 }, { "epoch": 2.4584973472531235, "grad_norm": 10.628440856933594, "learning_rate": 2.6430372911736266e-05, "loss": 0.6536, "step": 14365 }, { "epoch": 2.4586684922129045, "grad_norm": 16.850582122802734, "learning_rate": 2.642290866772601e-05, "loss": 1.5933, "step": 14366 }, { "epoch": 2.4588396371726855, "grad_norm": 15.053182601928711, "learning_rate": 2.641543768418262e-05, "loss": 1.1794, "step": 14367 }, { "epoch": 2.459010782132466, "grad_norm": 12.267268180847168, "learning_rate": 2.640795996551401e-05, "loss": 0.8238, "step": 14368 }, { "epoch": 2.459181927092247, "grad_norm": 8.214422225952148, "learning_rate": 2.6400475516132033e-05, "loss": 0.5772, "step": 14369 }, { "epoch": 2.459353072052028, "grad_norm": 5.6866774559021, "learning_rate": 2.63929843404525e-05, "loss": 0.6787, "step": 14370 }, { "epoch": 2.459524217011809, "grad_norm": 0.36026531457901, "learning_rate": 2.6385486442895244e-05, "loss": 0.0948, 
"step": 14371 }, { "epoch": 2.45969536197159, "grad_norm": 9.499744415283203, "learning_rate": 2.6377981827884035e-05, "loss": 0.6605, "step": 14372 }, { "epoch": 2.459866506931371, "grad_norm": 0.8194761276245117, "learning_rate": 2.6370470499846603e-05, "loss": 0.1085, "step": 14373 }, { "epoch": 2.460037651891152, "grad_norm": 5.923460960388184, "learning_rate": 2.6362952463214614e-05, "loss": 0.495, "step": 14374 }, { "epoch": 2.4602087968509325, "grad_norm": 15.998412132263184, "learning_rate": 2.6355427722423774e-05, "loss": 1.0517, "step": 14375 }, { "epoch": 2.4603799418107135, "grad_norm": 13.601499557495117, "learning_rate": 2.634789628191365e-05, "loss": 0.9844, "step": 14376 }, { "epoch": 2.4605510867704945, "grad_norm": 18.352291107177734, "learning_rate": 2.6340358146127835e-05, "loss": 1.5082, "step": 14377 }, { "epoch": 2.4607222317302755, "grad_norm": 16.232019424438477, "learning_rate": 2.6332813319513806e-05, "loss": 1.0866, "step": 14378 }, { "epoch": 2.4608933766900565, "grad_norm": 9.254181861877441, "learning_rate": 2.6325261806523055e-05, "loss": 0.6477, "step": 14379 }, { "epoch": 2.4610645216498375, "grad_norm": 24.415681838989258, "learning_rate": 2.631770361161095e-05, "loss": 2.1588, "step": 14380 }, { "epoch": 2.4612356666096185, "grad_norm": 12.35239028930664, "learning_rate": 2.6310138739236873e-05, "loss": 0.8786, "step": 14381 }, { "epoch": 2.4614068115693994, "grad_norm": 2.7586510181427, "learning_rate": 2.6302567193864084e-05, "loss": 0.2085, "step": 14382 }, { "epoch": 2.4615779565291804, "grad_norm": 13.927287101745605, "learning_rate": 2.6294988979959822e-05, "loss": 1.2793, "step": 14383 }, { "epoch": 2.461749101488961, "grad_norm": 12.367339134216309, "learning_rate": 2.6287404101995228e-05, "loss": 0.8103, "step": 14384 }, { "epoch": 2.461920246448742, "grad_norm": 12.37490177154541, "learning_rate": 2.627981256444542e-05, "loss": 0.803, "step": 14385 }, { "epoch": 2.462091391408523, "grad_norm": 10.95678997039795, 
"learning_rate": 2.6272214371789382e-05, "loss": 0.7091, "step": 14386 }, { "epoch": 2.462262536368304, "grad_norm": 9.36892032623291, "learning_rate": 2.6264609528510087e-05, "loss": 0.672, "step": 14387 }, { "epoch": 2.462433681328085, "grad_norm": 11.300817489624023, "learning_rate": 2.6256998039094383e-05, "loss": 0.7335, "step": 14388 }, { "epoch": 2.462604826287866, "grad_norm": 14.421119689941406, "learning_rate": 2.6249379908033078e-05, "loss": 1.5722, "step": 14389 }, { "epoch": 2.462775971247647, "grad_norm": 17.359142303466797, "learning_rate": 2.6241755139820857e-05, "loss": 1.6154, "step": 14390 }, { "epoch": 2.4629471162074275, "grad_norm": 2.0066730976104736, "learning_rate": 2.6234123738956378e-05, "loss": 0.1849, "step": 14391 }, { "epoch": 2.4631182611672084, "grad_norm": 13.959449768066406, "learning_rate": 2.622648570994214e-05, "loss": 1.1824, "step": 14392 }, { "epoch": 2.4632894061269894, "grad_norm": 18.263277053833008, "learning_rate": 2.621884105728463e-05, "loss": 1.5711, "step": 14393 }, { "epoch": 2.4634605510867704, "grad_norm": 8.09199047088623, "learning_rate": 2.621118978549417e-05, "loss": 0.74, "step": 14394 }, { "epoch": 2.4636316960465514, "grad_norm": 23.259504318237305, "learning_rate": 2.6203531899085055e-05, "loss": 5.3334, "step": 14395 }, { "epoch": 2.4638028410063324, "grad_norm": 14.859175682067871, "learning_rate": 2.6195867402575408e-05, "loss": 1.0749, "step": 14396 }, { "epoch": 2.4639739859661134, "grad_norm": 3.293239116668701, "learning_rate": 2.618819630048734e-05, "loss": 0.2341, "step": 14397 }, { "epoch": 2.4641451309258944, "grad_norm": 10.450746536254883, "learning_rate": 2.6180518597346788e-05, "loss": 0.7069, "step": 14398 }, { "epoch": 2.4643162758856754, "grad_norm": 10.34240436553955, "learning_rate": 2.61728342976836e-05, "loss": 0.739, "step": 14399 }, { "epoch": 2.464487420845456, "grad_norm": 15.398393630981445, "learning_rate": 2.6165143406031534e-05, "loss": 1.3295, "step": 14400 }, { "epoch": 
2.464658565805237, "grad_norm": 10.938633918762207, "learning_rate": 2.615744592692824e-05, "loss": 0.816, "step": 14401 }, { "epoch": 2.464829710765018, "grad_norm": 14.80673885345459, "learning_rate": 2.6149741864915236e-05, "loss": 1.8126, "step": 14402 }, { "epoch": 2.465000855724799, "grad_norm": 12.120914459228516, "learning_rate": 2.6142031224537907e-05, "loss": 1.0667, "step": 14403 }, { "epoch": 2.46517200068458, "grad_norm": 15.62583065032959, "learning_rate": 2.613431401034556e-05, "loss": 0.8856, "step": 14404 }, { "epoch": 2.465343145644361, "grad_norm": 16.945981979370117, "learning_rate": 2.612659022689138e-05, "loss": 0.9949, "step": 14405 }, { "epoch": 2.465514290604142, "grad_norm": 13.460371017456055, "learning_rate": 2.611885987873239e-05, "loss": 1.0574, "step": 14406 }, { "epoch": 2.465685435563923, "grad_norm": 22.009889602661133, "learning_rate": 2.6111122970429495e-05, "loss": 2.0325, "step": 14407 }, { "epoch": 2.4658565805237034, "grad_norm": 6.17164421081543, "learning_rate": 2.6103379506547506e-05, "loss": 0.3061, "step": 14408 }, { "epoch": 2.4660277254834844, "grad_norm": 0.466234028339386, "learning_rate": 2.6095629491655078e-05, "loss": 0.1044, "step": 14409 }, { "epoch": 2.4661988704432654, "grad_norm": 7.108079433441162, "learning_rate": 2.6087872930324724e-05, "loss": 0.5598, "step": 14410 }, { "epoch": 2.4663700154030463, "grad_norm": 0.4567773640155792, "learning_rate": 2.608010982713281e-05, "loss": 0.092, "step": 14411 }, { "epoch": 2.4665411603628273, "grad_norm": 9.88395881652832, "learning_rate": 2.6072340186659594e-05, "loss": 0.7538, "step": 14412 }, { "epoch": 2.4667123053226083, "grad_norm": 10.364333152770996, "learning_rate": 2.6064564013489195e-05, "loss": 0.7849, "step": 14413 }, { "epoch": 2.4668834502823893, "grad_norm": 22.418636322021484, "learning_rate": 2.6056781312209547e-05, "loss": 1.9313, "step": 14414 }, { "epoch": 2.4670545952421703, "grad_norm": 19.4090576171875, "learning_rate": 
2.6048992087412437e-05, "loss": 1.5664, "step": 14415 }, { "epoch": 2.4672257402019513, "grad_norm": 1.4765784740447998, "learning_rate": 2.604119634369355e-05, "loss": 0.1042, "step": 14416 }, { "epoch": 2.467396885161732, "grad_norm": 9.50800895690918, "learning_rate": 2.6033394085652362e-05, "loss": 0.5848, "step": 14417 }, { "epoch": 2.467568030121513, "grad_norm": 12.019543647766113, "learning_rate": 2.602558531789225e-05, "loss": 0.9216, "step": 14418 }, { "epoch": 2.467739175081294, "grad_norm": 15.234990119934082, "learning_rate": 2.601777004502036e-05, "loss": 1.0877, "step": 14419 }, { "epoch": 2.467910320041075, "grad_norm": 3.9635884761810303, "learning_rate": 2.6009948271647753e-05, "loss": 0.257, "step": 14420 }, { "epoch": 2.468081465000856, "grad_norm": 10.827518463134766, "learning_rate": 2.6002120002389247e-05, "loss": 0.7583, "step": 14421 }, { "epoch": 2.468252609960637, "grad_norm": 17.963951110839844, "learning_rate": 2.5994285241863572e-05, "loss": 1.4457, "step": 14422 }, { "epoch": 2.4684237549204178, "grad_norm": 17.545825958251953, "learning_rate": 2.598644399469321e-05, "loss": 0.8924, "step": 14423 }, { "epoch": 2.4685948998801983, "grad_norm": 25.90056610107422, "learning_rate": 2.5978596265504545e-05, "loss": 5.0642, "step": 14424 }, { "epoch": 2.4687660448399793, "grad_norm": 8.369790077209473, "learning_rate": 2.5970742058927716e-05, "loss": 0.5869, "step": 14425 }, { "epoch": 2.4689371897997603, "grad_norm": 26.67466926574707, "learning_rate": 2.5962881379596744e-05, "loss": 5.2028, "step": 14426 }, { "epoch": 2.4691083347595413, "grad_norm": 2.650683879852295, "learning_rate": 2.5955014232149416e-05, "loss": 0.191, "step": 14427 }, { "epoch": 2.4692794797193223, "grad_norm": 0.3533748984336853, "learning_rate": 2.594714062122739e-05, "loss": 0.0977, "step": 14428 }, { "epoch": 2.4694506246791033, "grad_norm": 11.722600936889648, "learning_rate": 2.5939260551476075e-05, "loss": 0.8714, "step": 14429 }, { "epoch": 
2.4696217696388842, "grad_norm": 19.681283950805664, "learning_rate": 2.593137402754476e-05, "loss": 1.3403, "step": 14430 }, { "epoch": 2.4697929145986652, "grad_norm": 0.4833071231842041, "learning_rate": 2.5923481054086467e-05, "loss": 0.1038, "step": 14431 }, { "epoch": 2.4699640595584462, "grad_norm": 10.87557315826416, "learning_rate": 2.5915581635758096e-05, "loss": 0.9325, "step": 14432 }, { "epoch": 2.4701352045182268, "grad_norm": 9.290080070495605, "learning_rate": 2.590767577722029e-05, "loss": 0.7235, "step": 14433 }, { "epoch": 2.4703063494780078, "grad_norm": 7.555256366729736, "learning_rate": 2.5899763483137545e-05, "loss": 0.6556, "step": 14434 }, { "epoch": 2.4704774944377887, "grad_norm": 5.5722880363464355, "learning_rate": 2.5891844758178092e-05, "loss": 0.4306, "step": 14435 }, { "epoch": 2.4706486393975697, "grad_norm": 42.15092468261719, "learning_rate": 2.5883919607014025e-05, "loss": 7.221, "step": 14436 }, { "epoch": 2.4708197843573507, "grad_norm": 0.6545300483703613, "learning_rate": 2.5875988034321167e-05, "loss": 0.1105, "step": 14437 }, { "epoch": 2.4709909293171317, "grad_norm": 12.298563957214355, "learning_rate": 2.586805004477918e-05, "loss": 0.8696, "step": 14438 }, { "epoch": 2.4711620742769127, "grad_norm": 6.234464168548584, "learning_rate": 2.5860105643071466e-05, "loss": 0.6777, "step": 14439 }, { "epoch": 2.4713332192366932, "grad_norm": 22.581737518310547, "learning_rate": 2.5852154833885245e-05, "loss": 4.903, "step": 14440 }, { "epoch": 2.4715043641964742, "grad_norm": 10.928167343139648, "learning_rate": 2.584419762191151e-05, "loss": 0.7779, "step": 14441 }, { "epoch": 2.4716755091562552, "grad_norm": 14.362181663513184, "learning_rate": 2.5836234011845042e-05, "loss": 1.2487, "step": 14442 }, { "epoch": 2.471846654116036, "grad_norm": 19.478580474853516, "learning_rate": 2.5828264008384365e-05, "loss": 2.7019, "step": 14443 }, { "epoch": 2.472017799075817, "grad_norm": 13.918542861938477, "learning_rate": 
2.582028761623178e-05, "loss": 0.9674, "step": 14444 }, { "epoch": 2.472188944035598, "grad_norm": 14.001805305480957, "learning_rate": 2.5812304840093384e-05, "loss": 0.7157, "step": 14445 }, { "epoch": 2.472360088995379, "grad_norm": 1.982899785041809, "learning_rate": 2.5804315684679046e-05, "loss": 0.1864, "step": 14446 }, { "epoch": 2.47253123395516, "grad_norm": 13.674560546875, "learning_rate": 2.579632015470236e-05, "loss": 1.0723, "step": 14447 }, { "epoch": 2.472702378914941, "grad_norm": 1.3001809120178223, "learning_rate": 2.578831825488069e-05, "loss": 0.1095, "step": 14448 }, { "epoch": 2.4728735238747217, "grad_norm": 11.776524543762207, "learning_rate": 2.5780309989935174e-05, "loss": 0.9064, "step": 14449 }, { "epoch": 2.4730446688345027, "grad_norm": 11.664833068847656, "learning_rate": 2.5772295364590726e-05, "loss": 0.6207, "step": 14450 }, { "epoch": 2.4732158137942837, "grad_norm": 13.91102409362793, "learning_rate": 2.576427438357597e-05, "loss": 1.0032, "step": 14451 }, { "epoch": 2.4733869587540647, "grad_norm": 1.138609528541565, "learning_rate": 2.5756247051623274e-05, "loss": 0.1721, "step": 14452 }, { "epoch": 2.4735581037138457, "grad_norm": 11.511601448059082, "learning_rate": 2.57482133734688e-05, "loss": 1.0061, "step": 14453 }, { "epoch": 2.4737292486736266, "grad_norm": 9.717206954956055, "learning_rate": 2.5740173353852443e-05, "loss": 1.0846, "step": 14454 }, { "epoch": 2.4739003936334076, "grad_norm": 0.3845316171646118, "learning_rate": 2.5732126997517805e-05, "loss": 0.0954, "step": 14455 }, { "epoch": 2.4740715385931886, "grad_norm": 0.36258038878440857, "learning_rate": 2.5724074309212246e-05, "loss": 0.094, "step": 14456 }, { "epoch": 2.474242683552969, "grad_norm": 14.754484176635742, "learning_rate": 2.5716015293686865e-05, "loss": 1.2654, "step": 14457 }, { "epoch": 2.47441382851275, "grad_norm": 10.861266136169434, "learning_rate": 2.570794995569652e-05, "loss": 0.9013, "step": 14458 }, { "epoch": 2.474584973472531, 
"grad_norm": 0.3085983097553253, "learning_rate": 2.5699878299999748e-05, "loss": 0.0918, "step": 14459 }, { "epoch": 2.474756118432312, "grad_norm": 31.472570419311523, "learning_rate": 2.5691800331358815e-05, "loss": 5.3676, "step": 14460 }, { "epoch": 2.474927263392093, "grad_norm": 3.1321446895599365, "learning_rate": 2.5683716054539787e-05, "loss": 0.2738, "step": 14461 }, { "epoch": 2.475098408351874, "grad_norm": 5.117201805114746, "learning_rate": 2.5675625474312362e-05, "loss": 0.3888, "step": 14462 }, { "epoch": 2.475269553311655, "grad_norm": 17.370853424072266, "learning_rate": 2.5667528595450024e-05, "loss": 2.0321, "step": 14463 }, { "epoch": 2.475440698271436, "grad_norm": 0.8653531074523926, "learning_rate": 2.5659425422729904e-05, "loss": 0.1041, "step": 14464 }, { "epoch": 2.475611843231217, "grad_norm": 6.25007438659668, "learning_rate": 2.5651315960932926e-05, "loss": 0.4785, "step": 14465 }, { "epoch": 2.4757829881909976, "grad_norm": 17.44664764404297, "learning_rate": 2.564320021484365e-05, "loss": 2.2073, "step": 14466 }, { "epoch": 2.4759541331507786, "grad_norm": 37.80023193359375, "learning_rate": 2.5635078189250418e-05, "loss": 5.5817, "step": 14467 }, { "epoch": 2.4761252781105596, "grad_norm": 14.851840019226074, "learning_rate": 2.562694988894519e-05, "loss": 1.1252, "step": 14468 }, { "epoch": 2.4762964230703406, "grad_norm": 33.09085464477539, "learning_rate": 2.5618815318723716e-05, "loss": 5.3908, "step": 14469 }, { "epoch": 2.4764675680301216, "grad_norm": 0.44161102175712585, "learning_rate": 2.5610674483385373e-05, "loss": 0.0939, "step": 14470 }, { "epoch": 2.4766387129899026, "grad_norm": 10.392792701721191, "learning_rate": 2.5602527387733297e-05, "loss": 0.8039, "step": 14471 }, { "epoch": 2.4768098579496836, "grad_norm": 1.359745740890503, "learning_rate": 2.559437403657425e-05, "loss": 0.1588, "step": 14472 }, { "epoch": 2.476981002909464, "grad_norm": 16.023914337158203, "learning_rate": 2.5586214434718763e-05, "loss": 
1.1853, "step": 14473 }, { "epoch": 2.477152147869245, "grad_norm": 14.545693397521973, "learning_rate": 2.5578048586980974e-05, "loss": 1.1026, "step": 14474 }, { "epoch": 2.477323292829026, "grad_norm": 14.978209495544434, "learning_rate": 2.556987649817878e-05, "loss": 1.0813, "step": 14475 }, { "epoch": 2.477494437788807, "grad_norm": 0.6708654165267944, "learning_rate": 2.556169817313369e-05, "loss": 0.0944, "step": 14476 }, { "epoch": 2.477665582748588, "grad_norm": 7.150966644287109, "learning_rate": 2.5553513616670964e-05, "loss": 0.9656, "step": 14477 }, { "epoch": 2.477836727708369, "grad_norm": 6.708449840545654, "learning_rate": 2.554532283361947e-05, "loss": 0.4421, "step": 14478 }, { "epoch": 2.47800787266815, "grad_norm": 0.29211658239364624, "learning_rate": 2.553712582881181e-05, "loss": 0.0867, "step": 14479 }, { "epoch": 2.478179017627931, "grad_norm": 13.883329391479492, "learning_rate": 2.5528922607084203e-05, "loss": 0.9324, "step": 14480 }, { "epoch": 2.478350162587712, "grad_norm": 14.379722595214844, "learning_rate": 2.552071317327659e-05, "loss": 0.8433, "step": 14481 }, { "epoch": 2.4785213075474926, "grad_norm": 1.8460192680358887, "learning_rate": 2.5512497532232507e-05, "loss": 0.1734, "step": 14482 }, { "epoch": 2.4786924525072735, "grad_norm": 15.41537857055664, "learning_rate": 2.550427568879925e-05, "loss": 1.1164, "step": 14483 }, { "epoch": 2.4788635974670545, "grad_norm": 44.197181701660156, "learning_rate": 2.5496047647827694e-05, "loss": 6.8026, "step": 14484 }, { "epoch": 2.4790347424268355, "grad_norm": 16.152755737304688, "learning_rate": 2.5487813414172374e-05, "loss": 1.8367, "step": 14485 }, { "epoch": 2.4792058873866165, "grad_norm": 14.460108757019043, "learning_rate": 2.547957299269152e-05, "loss": 0.8293, "step": 14486 }, { "epoch": 2.4793770323463975, "grad_norm": 21.376873016357422, "learning_rate": 2.5471326388247003e-05, "loss": 3.098, "step": 14487 }, { "epoch": 2.4795481773061785, "grad_norm": 
11.223711967468262, "learning_rate": 2.5463073605704326e-05, "loss": 0.7697, "step": 14488 }, { "epoch": 2.479719322265959, "grad_norm": 21.395654678344727, "learning_rate": 2.5454814649932616e-05, "loss": 0.6642, "step": 14489 }, { "epoch": 2.47989046722574, "grad_norm": 4.796473979949951, "learning_rate": 2.5446549525804696e-05, "loss": 0.2661, "step": 14490 }, { "epoch": 2.480061612185521, "grad_norm": 26.2740478515625, "learning_rate": 2.5438278238197005e-05, "loss": 5.3332, "step": 14491 }, { "epoch": 2.480232757145302, "grad_norm": 9.821441650390625, "learning_rate": 2.543000079198961e-05, "loss": 0.835, "step": 14492 }, { "epoch": 2.480403902105083, "grad_norm": 11.909598350524902, "learning_rate": 2.5421717192066202e-05, "loss": 0.9271, "step": 14493 }, { "epoch": 2.480575047064864, "grad_norm": 19.718547821044922, "learning_rate": 2.5413427443314124e-05, "loss": 1.6688, "step": 14494 }, { "epoch": 2.480746192024645, "grad_norm": 22.689167022705078, "learning_rate": 2.5405131550624355e-05, "loss": 3.0726, "step": 14495 }, { "epoch": 2.480917336984426, "grad_norm": 10.45872688293457, "learning_rate": 2.539682951889148e-05, "loss": 0.6946, "step": 14496 }, { "epoch": 2.481088481944207, "grad_norm": 1.0609910488128662, "learning_rate": 2.5388521353013683e-05, "loss": 0.1735, "step": 14497 }, { "epoch": 2.4812596269039875, "grad_norm": 5.442974090576172, "learning_rate": 2.5380207057892815e-05, "loss": 0.3505, "step": 14498 }, { "epoch": 2.4814307718637685, "grad_norm": 0.3742998540401459, "learning_rate": 2.5371886638434335e-05, "loss": 0.096, "step": 14499 }, { "epoch": 2.4816019168235495, "grad_norm": 13.438188552856445, "learning_rate": 2.5363560099547296e-05, "loss": 1.0515, "step": 14500 }, { "epoch": 2.4817730617833305, "grad_norm": 17.364797592163086, "learning_rate": 2.5355227446144343e-05, "loss": 1.8114, "step": 14501 }, { "epoch": 2.4819442067431114, "grad_norm": 12.848957061767578, "learning_rate": 2.5346888683141776e-05, "loss": 1.2374, "step": 
14502 }, { "epoch": 2.4821153517028924, "grad_norm": 15.015218734741211, "learning_rate": 2.533854381545947e-05, "loss": 1.0869, "step": 14503 }, { "epoch": 2.4822864966626734, "grad_norm": 14.73206901550293, "learning_rate": 2.5330192848020935e-05, "loss": 1.0973, "step": 14504 }, { "epoch": 2.482457641622454, "grad_norm": 9.241649627685547, "learning_rate": 2.5321835785753215e-05, "loss": 0.709, "step": 14505 }, { "epoch": 2.482628786582235, "grad_norm": 2.8755922317504883, "learning_rate": 2.5313472633587025e-05, "loss": 0.2616, "step": 14506 }, { "epoch": 2.482799931542016, "grad_norm": 53.02229690551758, "learning_rate": 2.53051033964566e-05, "loss": 7.5488, "step": 14507 }, { "epoch": 2.482971076501797, "grad_norm": 8.252564430236816, "learning_rate": 2.529672807929984e-05, "loss": 0.4815, "step": 14508 }, { "epoch": 2.483142221461578, "grad_norm": 12.516328811645508, "learning_rate": 2.528834668705816e-05, "loss": 1.3951, "step": 14509 }, { "epoch": 2.483313366421359, "grad_norm": 3.451993703842163, "learning_rate": 2.5279959224676627e-05, "loss": 0.2645, "step": 14510 }, { "epoch": 2.48348451138114, "grad_norm": 21.66196060180664, "learning_rate": 2.527156569710382e-05, "loss": 4.7236, "step": 14511 }, { "epoch": 2.483655656340921, "grad_norm": 0.30332982540130615, "learning_rate": 2.5263166109291976e-05, "loss": 0.093, "step": 14512 }, { "epoch": 2.483826801300702, "grad_norm": 19.58452796936035, "learning_rate": 2.5254760466196822e-05, "loss": 1.4736, "step": 14513 }, { "epoch": 2.483997946260483, "grad_norm": 13.417981147766113, "learning_rate": 2.5246348772777737e-05, "loss": 1.3074, "step": 14514 }, { "epoch": 2.4841690912202634, "grad_norm": 4.364748001098633, "learning_rate": 2.5237931033997594e-05, "loss": 0.2885, "step": 14515 }, { "epoch": 2.4843402361800444, "grad_norm": 12.222046852111816, "learning_rate": 2.522950725482291e-05, "loss": 0.857, "step": 14516 }, { "epoch": 2.4845113811398254, "grad_norm": 17.944873809814453, "learning_rate": 
2.5221077440223696e-05, "loss": 2.0304, "step": 14517 }, { "epoch": 2.4846825260996064, "grad_norm": 11.555278778076172, "learning_rate": 2.5212641595173577e-05, "loss": 0.9471, "step": 14518 }, { "epoch": 2.4848536710593874, "grad_norm": 10.878954887390137, "learning_rate": 2.5204199724649696e-05, "loss": 0.8059, "step": 14519 }, { "epoch": 2.4850248160191684, "grad_norm": 6.676743030548096, "learning_rate": 2.5195751833632794e-05, "loss": 0.5633, "step": 14520 }, { "epoch": 2.4851959609789493, "grad_norm": 9.187894821166992, "learning_rate": 2.5187297927107106e-05, "loss": 0.6769, "step": 14521 }, { "epoch": 2.48536710593873, "grad_norm": 11.271055221557617, "learning_rate": 2.5178838010060482e-05, "loss": 0.8918, "step": 14522 }, { "epoch": 2.485538250898511, "grad_norm": 13.582656860351562, "learning_rate": 2.517037208748426e-05, "loss": 0.9057, "step": 14523 }, { "epoch": 2.485709395858292, "grad_norm": 19.863039016723633, "learning_rate": 2.5161900164373368e-05, "loss": 1.5352, "step": 14524 }, { "epoch": 2.485880540818073, "grad_norm": 37.604408264160156, "learning_rate": 2.5153422245726232e-05, "loss": 6.6622, "step": 14525 }, { "epoch": 2.486051685777854, "grad_norm": 13.025396347045898, "learning_rate": 2.5144938336544846e-05, "loss": 1.1776, "step": 14526 }, { "epoch": 2.486222830737635, "grad_norm": 10.491693496704102, "learning_rate": 2.5136448441834733e-05, "loss": 0.8853, "step": 14527 }, { "epoch": 2.486393975697416, "grad_norm": 0.32772424817085266, "learning_rate": 2.5127952566604953e-05, "loss": 0.0933, "step": 14528 }, { "epoch": 2.486565120657197, "grad_norm": 2.421790838241577, "learning_rate": 2.5119450715868078e-05, "loss": 0.2112, "step": 14529 }, { "epoch": 2.486736265616978, "grad_norm": 15.089143753051758, "learning_rate": 2.5110942894640192e-05, "loss": 1.5874, "step": 14530 }, { "epoch": 2.4869074105767583, "grad_norm": 5.358714580535889, "learning_rate": 2.5102429107940937e-05, "loss": 0.4228, "step": 14531 }, { "epoch": 
2.4870785555365393, "grad_norm": 13.96886157989502, "learning_rate": 2.5093909360793476e-05, "loss": 1.1553, "step": 14532 }, { "epoch": 2.4872497004963203, "grad_norm": 7.248575210571289, "learning_rate": 2.508538365822446e-05, "loss": 0.8302, "step": 14533 }, { "epoch": 2.4874208454561013, "grad_norm": 12.299983978271484, "learning_rate": 2.5076852005264045e-05, "loss": 1.0877, "step": 14534 }, { "epoch": 2.4875919904158823, "grad_norm": 21.179508209228516, "learning_rate": 2.506831440694594e-05, "loss": 2.6994, "step": 14535 }, { "epoch": 2.4877631353756633, "grad_norm": 13.74756908416748, "learning_rate": 2.5059770868307353e-05, "loss": 1.141, "step": 14536 }, { "epoch": 2.4879342803354443, "grad_norm": 0.2790202498435974, "learning_rate": 2.505122139438897e-05, "loss": 0.0857, "step": 14537 }, { "epoch": 2.488105425295225, "grad_norm": 9.119311332702637, "learning_rate": 2.504266599023498e-05, "loss": 0.8198, "step": 14538 }, { "epoch": 2.488276570255006, "grad_norm": 16.012781143188477, "learning_rate": 2.5034104660893096e-05, "loss": 0.9407, "step": 14539 }, { "epoch": 2.488447715214787, "grad_norm": 26.537410736083984, "learning_rate": 2.5025537411414532e-05, "loss": 5.498, "step": 14540 }, { "epoch": 2.488618860174568, "grad_norm": 11.560832023620605, "learning_rate": 2.5016964246853966e-05, "loss": 0.7872, "step": 14541 }, { "epoch": 2.488790005134349, "grad_norm": 16.098012924194336, "learning_rate": 2.5008385172269556e-05, "loss": 1.7256, "step": 14542 }, { "epoch": 2.4889611500941298, "grad_norm": 10.749532699584961, "learning_rate": 2.4999800192722988e-05, "loss": 0.9395, "step": 14543 }, { "epoch": 2.4891322950539108, "grad_norm": 9.283292770385742, "learning_rate": 2.4991209313279425e-05, "loss": 0.5765, "step": 14544 }, { "epoch": 2.4893034400136917, "grad_norm": 14.967839241027832, "learning_rate": 2.4982612539007488e-05, "loss": 0.9999, "step": 14545 }, { "epoch": 2.4894745849734727, "grad_norm": 14.720806121826172, "learning_rate": 
2.497400987497925e-05, "loss": 1.2788, "step": 14546 }, { "epoch": 2.4896457299332533, "grad_norm": 30.299829483032227, "learning_rate": 2.4965401326270365e-05, "loss": 5.5046, "step": 14547 }, { "epoch": 2.4898168748930343, "grad_norm": 17.99800682067871, "learning_rate": 2.495678689795984e-05, "loss": 2.1953, "step": 14548 }, { "epoch": 2.4899880198528153, "grad_norm": 1.4501234292984009, "learning_rate": 2.4948166595130227e-05, "loss": 0.184, "step": 14549 }, { "epoch": 2.4901591648125962, "grad_norm": 11.489253044128418, "learning_rate": 2.4939540422867487e-05, "loss": 0.7793, "step": 14550 }, { "epoch": 2.4903303097723772, "grad_norm": 11.009381294250488, "learning_rate": 2.493090838626112e-05, "loss": 0.739, "step": 14551 }, { "epoch": 2.4905014547321582, "grad_norm": 14.745049476623535, "learning_rate": 2.492227049040399e-05, "loss": 1.3673, "step": 14552 }, { "epoch": 2.490672599691939, "grad_norm": 0.2870556712150574, "learning_rate": 2.4913626740392505e-05, "loss": 0.0887, "step": 14553 }, { "epoch": 2.4908437446517198, "grad_norm": 14.423157691955566, "learning_rate": 2.4904977141326465e-05, "loss": 1.045, "step": 14554 }, { "epoch": 2.4910148896115007, "grad_norm": 10.950271606445312, "learning_rate": 2.4896321698309176e-05, "loss": 0.8662, "step": 14555 }, { "epoch": 2.4911860345712817, "grad_norm": 11.371031761169434, "learning_rate": 2.4887660416447322e-05, "loss": 0.8647, "step": 14556 }, { "epoch": 2.4913571795310627, "grad_norm": 18.148456573486328, "learning_rate": 2.4878993300851115e-05, "loss": 1.3387, "step": 14557 }, { "epoch": 2.4915283244908437, "grad_norm": 2.3638315200805664, "learning_rate": 2.487032035663413e-05, "loss": 0.1867, "step": 14558 }, { "epoch": 2.4916994694506247, "grad_norm": 12.719754219055176, "learning_rate": 2.486164158891345e-05, "loss": 0.7863, "step": 14559 }, { "epoch": 2.4918706144104057, "grad_norm": 0.4815010130405426, "learning_rate": 2.4852957002809534e-05, "loss": 0.0995, "step": 14560 }, { "epoch": 
2.4920417593701867, "grad_norm": 9.398436546325684, "learning_rate": 2.4844266603446318e-05, "loss": 0.6471, "step": 14561 }, { "epoch": 2.4922129043299677, "grad_norm": 27.794591903686523, "learning_rate": 2.483557039595113e-05, "loss": 5.2028, "step": 14562 }, { "epoch": 2.4923840492897487, "grad_norm": 3.8123369216918945, "learning_rate": 2.4826868385454774e-05, "loss": 0.2335, "step": 14563 }, { "epoch": 2.492555194249529, "grad_norm": 8.629185676574707, "learning_rate": 2.4818160577091417e-05, "loss": 0.6985, "step": 14564 }, { "epoch": 2.49272633920931, "grad_norm": 24.973142623901367, "learning_rate": 2.4809446975998717e-05, "loss": 2.7435, "step": 14565 }, { "epoch": 2.492897484169091, "grad_norm": 0.35664206743240356, "learning_rate": 2.480072758731767e-05, "loss": 0.0924, "step": 14566 }, { "epoch": 2.493068629128872, "grad_norm": 10.630910873413086, "learning_rate": 2.4792002416192764e-05, "loss": 0.9349, "step": 14567 }, { "epoch": 2.493239774088653, "grad_norm": 0.346587598323822, "learning_rate": 2.478327146777182e-05, "loss": 0.0868, "step": 14568 }, { "epoch": 2.493410919048434, "grad_norm": 20.77900505065918, "learning_rate": 2.4774534747206165e-05, "loss": 3.1294, "step": 14569 }, { "epoch": 2.493582064008215, "grad_norm": 1.253547191619873, "learning_rate": 2.4765792259650456e-05, "loss": 0.1591, "step": 14570 }, { "epoch": 2.4937532089679957, "grad_norm": 20.472095489501953, "learning_rate": 2.4757044010262747e-05, "loss": 2.7798, "step": 14571 }, { "epoch": 2.4939243539277767, "grad_norm": 12.379570960998535, "learning_rate": 2.474829000420454e-05, "loss": 0.8081, "step": 14572 }, { "epoch": 2.4940954988875577, "grad_norm": 10.227797508239746, "learning_rate": 2.473953024664073e-05, "loss": 0.6296, "step": 14573 }, { "epoch": 2.4942666438473386, "grad_norm": 19.47867202758789, "learning_rate": 2.473076474273956e-05, "loss": 2.2772, "step": 14574 }, { "epoch": 2.4944377888071196, "grad_norm": 13.436195373535156, "learning_rate": 
2.4721993497672693e-05, "loss": 0.9318, "step": 14575 }, { "epoch": 2.4946089337669006, "grad_norm": 20.97713851928711, "learning_rate": 2.4713216516615172e-05, "loss": 2.8317, "step": 14576 }, { "epoch": 2.4947800787266816, "grad_norm": 13.571615219116211, "learning_rate": 2.4704433804745465e-05, "loss": 0.8646, "step": 14577 }, { "epoch": 2.4949512236864626, "grad_norm": 14.414464950561523, "learning_rate": 2.469564536724535e-05, "loss": 0.8546, "step": 14578 }, { "epoch": 2.4951223686462436, "grad_norm": 15.295833587646484, "learning_rate": 2.4686851209300017e-05, "loss": 0.941, "step": 14579 }, { "epoch": 2.495293513606024, "grad_norm": 11.113797187805176, "learning_rate": 2.467805133609804e-05, "loss": 0.757, "step": 14580 }, { "epoch": 2.495464658565805, "grad_norm": 16.6119441986084, "learning_rate": 2.4669245752831375e-05, "loss": 1.0516, "step": 14581 }, { "epoch": 2.495635803525586, "grad_norm": 11.651034355163574, "learning_rate": 2.4660434464695314e-05, "loss": 0.8132, "step": 14582 }, { "epoch": 2.495806948485367, "grad_norm": 18.838590621948242, "learning_rate": 2.465161747688851e-05, "loss": 0.7193, "step": 14583 }, { "epoch": 2.495978093445148, "grad_norm": 19.409914016723633, "learning_rate": 2.4642794794613023e-05, "loss": 2.2994, "step": 14584 }, { "epoch": 2.496149238404929, "grad_norm": 4.355964660644531, "learning_rate": 2.4633966423074267e-05, "loss": 0.2479, "step": 14585 }, { "epoch": 2.49632038336471, "grad_norm": 13.12771224975586, "learning_rate": 2.462513236748096e-05, "loss": 0.8035, "step": 14586 }, { "epoch": 2.4964915283244906, "grad_norm": 5.415434837341309, "learning_rate": 2.461629263304521e-05, "loss": 0.3543, "step": 14587 }, { "epoch": 2.4966626732842716, "grad_norm": 8.136317253112793, "learning_rate": 2.4607447224982484e-05, "loss": 0.7153, "step": 14588 }, { "epoch": 2.4968338182440526, "grad_norm": 11.366291999816895, "learning_rate": 2.4598596148511585e-05, "loss": 0.7843, "step": 14589 }, { "epoch": 2.4970049632038336, 
"grad_norm": 15.149494171142578, "learning_rate": 2.4589739408854678e-05, "loss": 0.9671, "step": 14590 }, { "epoch": 2.4971761081636146, "grad_norm": 8.3561372756958, "learning_rate": 2.4580877011237218e-05, "loss": 0.5666, "step": 14591 }, { "epoch": 2.4973472531233956, "grad_norm": 0.48620977997779846, "learning_rate": 2.457200896088807e-05, "loss": 0.0961, "step": 14592 }, { "epoch": 2.4975183980831765, "grad_norm": 10.495131492614746, "learning_rate": 2.4563135263039357e-05, "loss": 0.7325, "step": 14593 }, { "epoch": 2.4976895430429575, "grad_norm": 13.02835750579834, "learning_rate": 2.4554255922926618e-05, "loss": 0.9236, "step": 14594 }, { "epoch": 2.4978606880027385, "grad_norm": 7.566399574279785, "learning_rate": 2.4545370945788636e-05, "loss": 0.7957, "step": 14595 }, { "epoch": 2.498031832962519, "grad_norm": 9.411153793334961, "learning_rate": 2.453648033686759e-05, "loss": 0.516, "step": 14596 }, { "epoch": 2.4982029779223, "grad_norm": 16.608848571777344, "learning_rate": 2.4527584101408927e-05, "loss": 1.0266, "step": 14597 }, { "epoch": 2.498374122882081, "grad_norm": 5.962610721588135, "learning_rate": 2.451868224466147e-05, "loss": 0.4331, "step": 14598 }, { "epoch": 2.498545267841862, "grad_norm": 17.371429443359375, "learning_rate": 2.4509774771877292e-05, "loss": 2.1271, "step": 14599 }, { "epoch": 2.498716412801643, "grad_norm": 21.05018424987793, "learning_rate": 2.4500861688311856e-05, "loss": 2.8771, "step": 14600 }, { "epoch": 2.498887557761424, "grad_norm": 2.97570538520813, "learning_rate": 2.4491942999223856e-05, "loss": 0.2053, "step": 14601 }, { "epoch": 2.499058702721205, "grad_norm": 8.781641960144043, "learning_rate": 2.448301870987536e-05, "loss": 0.7722, "step": 14602 }, { "epoch": 2.4992298476809856, "grad_norm": 19.725696563720703, "learning_rate": 2.4474088825531687e-05, "loss": 2.4722, "step": 14603 }, { "epoch": 2.4994009926407665, "grad_norm": 5.053024768829346, "learning_rate": 2.4465153351461514e-05, "loss": 0.3174, 
"step": 14604 }, { "epoch": 2.4995721376005475, "grad_norm": 13.599452018737793, "learning_rate": 2.4456212292936744e-05, "loss": 1.0567, "step": 14605 }, { "epoch": 2.4997432825603285, "grad_norm": 16.693878173828125, "learning_rate": 2.4447265655232656e-05, "loss": 1.2958, "step": 14606 }, { "epoch": 2.4999144275201095, "grad_norm": 6.62552547454834, "learning_rate": 2.4438313443627748e-05, "loss": 0.5467, "step": 14607 }, { "epoch": 2.5000855724798905, "grad_norm": 3.9215126037597656, "learning_rate": 2.4429355663403855e-05, "loss": 0.381, "step": 14608 }, { "epoch": 2.5002567174396715, "grad_norm": 33.17057418823242, "learning_rate": 2.442039231984607e-05, "loss": 5.6576, "step": 14609 }, { "epoch": 2.5004278623994525, "grad_norm": 17.766109466552734, "learning_rate": 2.4411423418242783e-05, "loss": 1.4412, "step": 14610 }, { "epoch": 2.5005990073592335, "grad_norm": 12.371880531311035, "learning_rate": 2.4402448963885676e-05, "loss": 1.0017, "step": 14611 }, { "epoch": 2.5007701523190144, "grad_norm": 20.277158737182617, "learning_rate": 2.4393468962069663e-05, "loss": 2.5513, "step": 14612 }, { "epoch": 2.500941297278795, "grad_norm": 25.277820587158203, "learning_rate": 2.4384483418092966e-05, "loss": 5.5843, "step": 14613 }, { "epoch": 2.501112442238576, "grad_norm": 2.1314682960510254, "learning_rate": 2.4375492337257093e-05, "loss": 0.2156, "step": 14614 }, { "epoch": 2.501283587198357, "grad_norm": 7.321349143981934, "learning_rate": 2.436649572486678e-05, "loss": 0.5212, "step": 14615 }, { "epoch": 2.501454732158138, "grad_norm": 10.451953887939453, "learning_rate": 2.4357493586230015e-05, "loss": 0.735, "step": 14616 }, { "epoch": 2.501625877117919, "grad_norm": 2.189490795135498, "learning_rate": 2.43484859266581e-05, "loss": 0.1458, "step": 14617 }, { "epoch": 2.5017970220777, "grad_norm": 23.438396453857422, "learning_rate": 2.433947275146558e-05, "loss": 4.9337, "step": 14618 }, { "epoch": 2.5019681670374805, "grad_norm": 8.848040580749512, 
"learning_rate": 2.4330454065970227e-05, "loss": 0.5836, "step": 14619 }, { "epoch": 2.5021393119972615, "grad_norm": 26.163606643676758, "learning_rate": 2.432142987549306e-05, "loss": 5.2362, "step": 14620 }, { "epoch": 2.5023104569570425, "grad_norm": 16.638065338134766, "learning_rate": 2.4312400185358386e-05, "loss": 1.2142, "step": 14621 }, { "epoch": 2.5024816019168235, "grad_norm": 10.820052146911621, "learning_rate": 2.4303365000893744e-05, "loss": 0.8749, "step": 14622 }, { "epoch": 2.5026527468766044, "grad_norm": 4.578121662139893, "learning_rate": 2.4294324327429894e-05, "loss": 0.5153, "step": 14623 }, { "epoch": 2.5028238918363854, "grad_norm": 10.976085662841797, "learning_rate": 2.4285278170300835e-05, "loss": 0.8646, "step": 14624 }, { "epoch": 2.5029950367961664, "grad_norm": 6.886414527893066, "learning_rate": 2.427622653484382e-05, "loss": 0.6988, "step": 14625 }, { "epoch": 2.5031661817559474, "grad_norm": 17.63956642150879, "learning_rate": 2.4267169426399356e-05, "loss": 1.8748, "step": 14626 }, { "epoch": 2.5033373267157284, "grad_norm": 8.20398235321045, "learning_rate": 2.4258106850311124e-05, "loss": 0.6412, "step": 14627 }, { "epoch": 2.5035084716755094, "grad_norm": 11.417510032653809, "learning_rate": 2.424903881192605e-05, "loss": 0.7291, "step": 14628 }, { "epoch": 2.50367961663529, "grad_norm": 9.25893783569336, "learning_rate": 2.423996531659429e-05, "loss": 0.5362, "step": 14629 }, { "epoch": 2.503850761595071, "grad_norm": 7.014880180358887, "learning_rate": 2.4230886369669255e-05, "loss": 0.5598, "step": 14630 }, { "epoch": 2.504021906554852, "grad_norm": 1.9710155725479126, "learning_rate": 2.422180197650751e-05, "loss": 0.1819, "step": 14631 }, { "epoch": 2.504193051514633, "grad_norm": 3.535234212875366, "learning_rate": 2.421271214246883e-05, "loss": 0.2194, "step": 14632 }, { "epoch": 2.504364196474414, "grad_norm": 18.9307861328125, "learning_rate": 2.42036168729163e-05, "loss": 2.0963, "step": 14633 }, { "epoch": 
2.504535341434195, "grad_norm": 0.38863322138786316, "learning_rate": 2.4194516173216083e-05, "loss": 0.0999, "step": 14634 }, { "epoch": 2.504706486393976, "grad_norm": 15.239171028137207, "learning_rate": 2.4185410048737654e-05, "loss": 1.3328, "step": 14635 }, { "epoch": 2.5048776313537564, "grad_norm": 9.268189430236816, "learning_rate": 2.4176298504853594e-05, "loss": 0.673, "step": 14636 }, { "epoch": 2.5050487763135374, "grad_norm": 10.160786628723145, "learning_rate": 2.4167181546939765e-05, "loss": 0.9098, "step": 14637 }, { "epoch": 2.5052199212733184, "grad_norm": 19.15043830871582, "learning_rate": 2.4158059180375154e-05, "loss": 1.5879, "step": 14638 }, { "epoch": 2.5053910662330994, "grad_norm": 6.558884620666504, "learning_rate": 2.4148931410542002e-05, "loss": 0.8693, "step": 14639 }, { "epoch": 2.5055622111928804, "grad_norm": 9.716113090515137, "learning_rate": 2.4139798242825677e-05, "loss": 0.7377, "step": 14640 }, { "epoch": 2.5057333561526614, "grad_norm": 9.919028282165527, "learning_rate": 2.413065968261479e-05, "loss": 1.1799, "step": 14641 }, { "epoch": 2.5059045011124423, "grad_norm": 13.744864463806152, "learning_rate": 2.412151573530107e-05, "loss": 1.1077, "step": 14642 }, { "epoch": 2.5060756460722233, "grad_norm": 26.9238224029541, "learning_rate": 2.41123664062795e-05, "loss": 4.7603, "step": 14643 }, { "epoch": 2.5062467910320043, "grad_norm": 2.2069363594055176, "learning_rate": 2.4103211700948163e-05, "loss": 0.1989, "step": 14644 }, { "epoch": 2.5064179359917853, "grad_norm": 12.869248390197754, "learning_rate": 2.4094051624708377e-05, "loss": 0.8308, "step": 14645 }, { "epoch": 2.506589080951566, "grad_norm": 16.189023971557617, "learning_rate": 2.408488618296457e-05, "loss": 1.1156, "step": 14646 }, { "epoch": 2.506760225911347, "grad_norm": 10.860884666442871, "learning_rate": 2.4075715381124404e-05, "loss": 0.7507, "step": 14647 }, { "epoch": 2.506931370871128, "grad_norm": 12.093406677246094, "learning_rate": 
2.4066539224598623e-05, "loss": 0.7421, "step": 14648 }, { "epoch": 2.507102515830909, "grad_norm": 11.906426429748535, "learning_rate": 2.4057357718801216e-05, "loss": 0.9662, "step": 14649 }, { "epoch": 2.50727366079069, "grad_norm": 15.80009651184082, "learning_rate": 2.4048170869149248e-05, "loss": 1.9594, "step": 14650 }, { "epoch": 2.507444805750471, "grad_norm": 26.19715690612793, "learning_rate": 2.4038978681063005e-05, "loss": 5.2032, "step": 14651 }, { "epoch": 2.5076159507102513, "grad_norm": 6.305425643920898, "learning_rate": 2.402978115996586e-05, "loss": 0.4284, "step": 14652 }, { "epoch": 2.5077870956700323, "grad_norm": 9.213685035705566, "learning_rate": 2.4020578311284383e-05, "loss": 0.728, "step": 14653 }, { "epoch": 2.5079582406298133, "grad_norm": 17.400115966796875, "learning_rate": 2.4011370140448264e-05, "loss": 1.4199, "step": 14654 }, { "epoch": 2.5081293855895943, "grad_norm": 2.091099262237549, "learning_rate": 2.400215665289036e-05, "loss": 0.1821, "step": 14655 }, { "epoch": 2.5083005305493753, "grad_norm": 10.247763633728027, "learning_rate": 2.3992937854046624e-05, "loss": 0.9412, "step": 14656 }, { "epoch": 2.5084716755091563, "grad_norm": 10.958451271057129, "learning_rate": 2.3983713749356138e-05, "loss": 0.833, "step": 14657 }, { "epoch": 2.5086428204689373, "grad_norm": 10.709365844726562, "learning_rate": 2.397448434426116e-05, "loss": 0.8384, "step": 14658 }, { "epoch": 2.5088139654287183, "grad_norm": 9.836774826049805, "learning_rate": 2.3965249644207072e-05, "loss": 0.7917, "step": 14659 }, { "epoch": 2.5089851103884993, "grad_norm": 3.700845956802368, "learning_rate": 2.3956009654642336e-05, "loss": 0.2911, "step": 14660 }, { "epoch": 2.5091562553482802, "grad_norm": 1.2296513319015503, "learning_rate": 2.3946764381018548e-05, "loss": 0.1918, "step": 14661 }, { "epoch": 2.509327400308061, "grad_norm": 5.9709906578063965, "learning_rate": 2.393751382879045e-05, "loss": 0.5181, "step": 14662 }, { "epoch": 
2.5094985452678418, "grad_norm": 12.804922103881836, "learning_rate": 2.3928258003415902e-05, "loss": 0.753, "step": 14663 }, { "epoch": 2.5096696902276228, "grad_norm": 3.493795394897461, "learning_rate": 2.391899691035583e-05, "loss": 0.2323, "step": 14664 }, { "epoch": 2.5098408351874038, "grad_norm": 2.3334310054779053, "learning_rate": 2.390973055507428e-05, "loss": 0.1771, "step": 14665 }, { "epoch": 2.5100119801471847, "grad_norm": 39.235965728759766, "learning_rate": 2.390045894303843e-05, "loss": 6.5223, "step": 14666 }, { "epoch": 2.5101831251069657, "grad_norm": 14.768915176391602, "learning_rate": 2.3891182079718563e-05, "loss": 1.0535, "step": 14667 }, { "epoch": 2.5103542700667463, "grad_norm": 18.048480987548828, "learning_rate": 2.3881899970588027e-05, "loss": 1.3634, "step": 14668 }, { "epoch": 2.5105254150265273, "grad_norm": 13.416892051696777, "learning_rate": 2.3872612621123265e-05, "loss": 1.2749, "step": 14669 }, { "epoch": 2.5106965599863083, "grad_norm": 5.486839771270752, "learning_rate": 2.3863320036803843e-05, "loss": 0.3151, "step": 14670 }, { "epoch": 2.5108677049460892, "grad_norm": 17.10948944091797, "learning_rate": 2.3854022223112408e-05, "loss": 1.1909, "step": 14671 }, { "epoch": 2.5110388499058702, "grad_norm": 12.881095886230469, "learning_rate": 2.3844719185534677e-05, "loss": 0.936, "step": 14672 }, { "epoch": 2.511209994865651, "grad_norm": 8.777505874633789, "learning_rate": 2.3835410929559435e-05, "loss": 0.6572, "step": 14673 }, { "epoch": 2.511381139825432, "grad_norm": 19.757871627807617, "learning_rate": 2.3826097460678584e-05, "loss": 2.0739, "step": 14674 }, { "epoch": 2.511552284785213, "grad_norm": 25.17220687866211, "learning_rate": 2.3816778784387087e-05, "loss": 4.9549, "step": 14675 }, { "epoch": 2.511723429744994, "grad_norm": 12.111425399780273, "learning_rate": 2.3807454906182992e-05, "loss": 0.9207, "step": 14676 }, { "epoch": 2.511894574704775, "grad_norm": 9.688026428222656, "learning_rate": 
2.3798125831567362e-05, "loss": 0.6154, "step": 14677 }, { "epoch": 2.5120657196645557, "grad_norm": 0.6713597178459167, "learning_rate": 2.378879156604441e-05, "loss": 0.1302, "step": 14678 }, { "epoch": 2.5122368646243367, "grad_norm": 8.393242835998535, "learning_rate": 2.377945211512133e-05, "loss": 0.5523, "step": 14679 }, { "epoch": 2.5124080095841177, "grad_norm": 16.707626342773438, "learning_rate": 2.3770107484308435e-05, "loss": 1.3357, "step": 14680 }, { "epoch": 2.5125791545438987, "grad_norm": 49.35841369628906, "learning_rate": 2.3760757679119044e-05, "loss": 7.0855, "step": 14681 }, { "epoch": 2.5127502995036797, "grad_norm": 6.789189338684082, "learning_rate": 2.3751402705069595e-05, "loss": 0.5502, "step": 14682 }, { "epoch": 2.5129214444634607, "grad_norm": 8.347942352294922, "learning_rate": 2.3742042567679495e-05, "loss": 0.4973, "step": 14683 }, { "epoch": 2.5130925894232417, "grad_norm": 23.72980499267578, "learning_rate": 2.3732677272471276e-05, "loss": 4.7583, "step": 14684 }, { "epoch": 2.513263734383022, "grad_norm": 3.2899582386016846, "learning_rate": 2.3723306824970443e-05, "loss": 0.2588, "step": 14685 }, { "epoch": 2.513434879342803, "grad_norm": 2.4259939193725586, "learning_rate": 2.3713931230705606e-05, "loss": 0.1755, "step": 14686 }, { "epoch": 2.513606024302584, "grad_norm": 12.703667640686035, "learning_rate": 2.3704550495208353e-05, "loss": 0.8696, "step": 14687 }, { "epoch": 2.513777169262365, "grad_norm": 19.923269271850586, "learning_rate": 2.3695164624013354e-05, "loss": 2.7331, "step": 14688 }, { "epoch": 2.513948314222146, "grad_norm": 19.050975799560547, "learning_rate": 2.368577362265826e-05, "loss": 1.3175, "step": 14689 }, { "epoch": 2.514119459181927, "grad_norm": 5.3114752769470215, "learning_rate": 2.367637749668381e-05, "loss": 0.6186, "step": 14690 }, { "epoch": 2.514290604141708, "grad_norm": 20.33905792236328, "learning_rate": 2.366697625163369e-05, "loss": 2.5167, "step": 14691 }, { "epoch": 
2.514461749101489, "grad_norm": 10.460938453674316, "learning_rate": 2.3657569893054697e-05, "loss": 0.5951, "step": 14692 }, { "epoch": 2.51463289406127, "grad_norm": 0.36616262793540955, "learning_rate": 2.3648158426496556e-05, "loss": 0.0937, "step": 14693 }, { "epoch": 2.514804039021051, "grad_norm": 8.398286819458008, "learning_rate": 2.3638741857512076e-05, "loss": 0.6422, "step": 14694 }, { "epoch": 2.5149751839808316, "grad_norm": 5.314393997192383, "learning_rate": 2.3629320191657016e-05, "loss": 0.5044, "step": 14695 }, { "epoch": 2.5151463289406126, "grad_norm": 13.878609657287598, "learning_rate": 2.3619893434490187e-05, "loss": 1.0739, "step": 14696 }, { "epoch": 2.5153174739003936, "grad_norm": 86.79373931884766, "learning_rate": 2.361046159157341e-05, "loss": 8.4728, "step": 14697 }, { "epoch": 2.5154886188601746, "grad_norm": 20.265270233154297, "learning_rate": 2.360102466847146e-05, "loss": 2.1489, "step": 14698 }, { "epoch": 2.5156597638199556, "grad_norm": 0.34380513429641724, "learning_rate": 2.3591582670752137e-05, "loss": 0.0905, "step": 14699 }, { "epoch": 2.5158309087797366, "grad_norm": 10.887395858764648, "learning_rate": 2.358213560398626e-05, "loss": 0.9013, "step": 14700 }, { "epoch": 2.516002053739517, "grad_norm": 18.937129974365234, "learning_rate": 2.3572683473747593e-05, "loss": 1.9514, "step": 14701 }, { "epoch": 2.516173198699298, "grad_norm": 15.265789985656738, "learning_rate": 2.35632262856129e-05, "loss": 1.0513, "step": 14702 }, { "epoch": 2.516344343659079, "grad_norm": 16.585676193237305, "learning_rate": 2.355376404516193e-05, "loss": 1.8088, "step": 14703 }, { "epoch": 2.51651548861886, "grad_norm": 8.843396186828613, "learning_rate": 2.3544296757977458e-05, "loss": 0.7196, "step": 14704 }, { "epoch": 2.516686633578641, "grad_norm": 10.249777793884277, "learning_rate": 2.3534824429645173e-05, "loss": 0.734, "step": 14705 }, { "epoch": 2.516857778538422, "grad_norm": 13.046167373657227, "learning_rate": 
2.352534706575374e-05, "loss": 0.9524, "step": 14706 }, { "epoch": 2.517028923498203, "grad_norm": 46.65620422363281, "learning_rate": 2.3515864671894843e-05, "loss": 6.9564, "step": 14707 }, { "epoch": 2.517200068457984, "grad_norm": 6.273788928985596, "learning_rate": 2.3506377253663125e-05, "loss": 0.4636, "step": 14708 }, { "epoch": 2.517371213417765, "grad_norm": 20.379297256469727, "learning_rate": 2.3496884816656155e-05, "loss": 2.0818, "step": 14709 }, { "epoch": 2.517542358377546, "grad_norm": 13.116079330444336, "learning_rate": 2.348738736647447e-05, "loss": 0.9301, "step": 14710 }, { "epoch": 2.5177135033373266, "grad_norm": 15.757540702819824, "learning_rate": 2.34778849087216e-05, "loss": 1.3219, "step": 14711 }, { "epoch": 2.5178846482971076, "grad_norm": 7.168689250946045, "learning_rate": 2.3468377449004018e-05, "loss": 0.6255, "step": 14712 }, { "epoch": 2.5180557932568886, "grad_norm": 0.5727549195289612, "learning_rate": 2.345886499293113e-05, "loss": 0.1078, "step": 14713 }, { "epoch": 2.5182269382166695, "grad_norm": 12.99024486541748, "learning_rate": 2.344934754611528e-05, "loss": 1.0117, "step": 14714 }, { "epoch": 2.5183980831764505, "grad_norm": 7.1233673095703125, "learning_rate": 2.34398251141718e-05, "loss": 0.5314, "step": 14715 }, { "epoch": 2.5185692281362315, "grad_norm": 22.25042724609375, "learning_rate": 2.3430297702718936e-05, "loss": 1.6919, "step": 14716 }, { "epoch": 2.518740373096012, "grad_norm": 18.022750854492188, "learning_rate": 2.3420765317377877e-05, "loss": 1.4421, "step": 14717 }, { "epoch": 2.518911518055793, "grad_norm": 9.959421157836914, "learning_rate": 2.3411227963772706e-05, "loss": 0.8657, "step": 14718 }, { "epoch": 2.519082663015574, "grad_norm": 4.0943498611450195, "learning_rate": 2.340168564753054e-05, "loss": 0.3286, "step": 14719 }, { "epoch": 2.519253807975355, "grad_norm": 10.004379272460938, "learning_rate": 2.3392138374281315e-05, "loss": 0.6334, "step": 14720 }, { "epoch": 2.519424952935136, 
"grad_norm": 6.428358554840088, "learning_rate": 2.338258614965796e-05, "loss": 0.5394, "step": 14721 }, { "epoch": 2.519596097894917, "grad_norm": 14.667947769165039, "learning_rate": 2.337302897929628e-05, "loss": 1.032, "step": 14722 }, { "epoch": 2.519767242854698, "grad_norm": 18.24533462524414, "learning_rate": 2.3363466868835045e-05, "loss": 1.237, "step": 14723 }, { "epoch": 2.519938387814479, "grad_norm": 8.165216445922852, "learning_rate": 2.335389982391588e-05, "loss": 0.636, "step": 14724 }, { "epoch": 2.52010953277426, "grad_norm": 17.474393844604492, "learning_rate": 2.3344327850183398e-05, "loss": 1.8973, "step": 14725 }, { "epoch": 2.520280677734041, "grad_norm": 0.48041558265686035, "learning_rate": 2.3334750953285037e-05, "loss": 0.1012, "step": 14726 }, { "epoch": 2.5204518226938215, "grad_norm": 13.757022857666016, "learning_rate": 2.3325169138871218e-05, "loss": 0.9932, "step": 14727 }, { "epoch": 2.5206229676536025, "grad_norm": 18.18849754333496, "learning_rate": 2.3315582412595188e-05, "loss": 1.2899, "step": 14728 }, { "epoch": 2.5207941126133835, "grad_norm": 16.81682777404785, "learning_rate": 2.3305990780113166e-05, "loss": 1.2939, "step": 14729 }, { "epoch": 2.5209652575731645, "grad_norm": 4.60380220413208, "learning_rate": 2.32963942470842e-05, "loss": 0.38, "step": 14730 }, { "epoch": 2.5211364025329455, "grad_norm": 18.26609230041504, "learning_rate": 2.3286792819170283e-05, "loss": 1.3618, "step": 14731 }, { "epoch": 2.5213075474927265, "grad_norm": 17.683521270751953, "learning_rate": 2.327718650203624e-05, "loss": 1.0943, "step": 14732 }, { "epoch": 2.521478692452507, "grad_norm": 14.937235832214355, "learning_rate": 2.3267575301349856e-05, "loss": 0.9207, "step": 14733 }, { "epoch": 2.521649837412288, "grad_norm": 7.406940460205078, "learning_rate": 2.3257959222781708e-05, "loss": 0.5485, "step": 14734 }, { "epoch": 2.521820982372069, "grad_norm": 8.481680870056152, "learning_rate": 2.3248338272005342e-05, "loss": 0.4802, 
"step": 14735 }, { "epoch": 2.52199212733185, "grad_norm": 14.911299705505371, "learning_rate": 2.323871245469709e-05, "loss": 1.031, "step": 14736 }, { "epoch": 2.522163272291631, "grad_norm": 29.44999885559082, "learning_rate": 2.3229081776536234e-05, "loss": 5.295, "step": 14737 }, { "epoch": 2.522334417251412, "grad_norm": 14.505546569824219, "learning_rate": 2.3219446243204856e-05, "loss": 1.052, "step": 14738 }, { "epoch": 2.522505562211193, "grad_norm": 15.519218444824219, "learning_rate": 2.320980586038795e-05, "loss": 0.6305, "step": 14739 }, { "epoch": 2.522676707170974, "grad_norm": 103.4175033569336, "learning_rate": 2.320016063377336e-05, "loss": 9.3529, "step": 14740 }, { "epoch": 2.522847852130755, "grad_norm": 12.77975082397461, "learning_rate": 2.3190510569051803e-05, "loss": 0.9816, "step": 14741 }, { "epoch": 2.523018997090536, "grad_norm": 6.507275581359863, "learning_rate": 2.3180855671916807e-05, "loss": 0.4457, "step": 14742 }, { "epoch": 2.523190142050317, "grad_norm": 20.96272087097168, "learning_rate": 2.317119594806476e-05, "loss": 1.6405, "step": 14743 }, { "epoch": 2.5233612870100974, "grad_norm": 15.110397338867188, "learning_rate": 2.3161531403194937e-05, "loss": 1.218, "step": 14744 }, { "epoch": 2.5235324319698784, "grad_norm": 13.587836265563965, "learning_rate": 2.3151862043009443e-05, "loss": 0.9663, "step": 14745 }, { "epoch": 2.5237035769296594, "grad_norm": 3.8424298763275146, "learning_rate": 2.314218787321321e-05, "loss": 0.3474, "step": 14746 }, { "epoch": 2.5238747218894404, "grad_norm": 4.582193374633789, "learning_rate": 2.313250889951398e-05, "loss": 0.2951, "step": 14747 }, { "epoch": 2.5240458668492214, "grad_norm": 7.1230573654174805, "learning_rate": 2.3122825127622386e-05, "loss": 0.4583, "step": 14748 }, { "epoch": 2.5242170118090024, "grad_norm": 15.793252944946289, "learning_rate": 2.311313656325189e-05, "loss": 1.1305, "step": 14749 }, { "epoch": 2.524388156768783, "grad_norm": 13.861077308654785, 
"learning_rate": 2.3103443212118735e-05, "loss": 1.2464, "step": 14750 }, { "epoch": 2.524559301728564, "grad_norm": 16.1656551361084, "learning_rate": 2.3093745079942e-05, "loss": 1.5306, "step": 14751 }, { "epoch": 2.524730446688345, "grad_norm": 19.04994773864746, "learning_rate": 2.3084042172443605e-05, "loss": 1.3477, "step": 14752 }, { "epoch": 2.524901591648126, "grad_norm": 0.4357701241970062, "learning_rate": 2.307433449534831e-05, "loss": 0.0987, "step": 14753 }, { "epoch": 2.525072736607907, "grad_norm": 79.52932739257812, "learning_rate": 2.3064622054383638e-05, "loss": 7.5853, "step": 14754 }, { "epoch": 2.525243881567688, "grad_norm": 16.456806182861328, "learning_rate": 2.3054904855279924e-05, "loss": 1.6284, "step": 14755 }, { "epoch": 2.525415026527469, "grad_norm": 8.45326042175293, "learning_rate": 2.3045182903770343e-05, "loss": 0.473, "step": 14756 }, { "epoch": 2.52558617148725, "grad_norm": 17.314315795898438, "learning_rate": 2.303545620559089e-05, "loss": 2.0723, "step": 14757 }, { "epoch": 2.525757316447031, "grad_norm": 14.217031478881836, "learning_rate": 2.3025724766480308e-05, "loss": 1.5255, "step": 14758 }, { "epoch": 2.525928461406812, "grad_norm": 36.21196365356445, "learning_rate": 2.3015988592180136e-05, "loss": 6.894, "step": 14759 }, { "epoch": 2.5260996063665924, "grad_norm": 14.527505874633789, "learning_rate": 2.3006247688434758e-05, "loss": 1.5436, "step": 14760 }, { "epoch": 2.5262707513263734, "grad_norm": 13.87662124633789, "learning_rate": 2.299650206099132e-05, "loss": 1.0372, "step": 14761 }, { "epoch": 2.5264418962861543, "grad_norm": 15.428047180175781, "learning_rate": 2.2986751715599767e-05, "loss": 1.7734, "step": 14762 }, { "epoch": 2.5266130412459353, "grad_norm": 1.6423033475875854, "learning_rate": 2.2976996658012794e-05, "loss": 0.1414, "step": 14763 }, { "epoch": 2.5267841862057163, "grad_norm": 12.761268615722656, "learning_rate": 2.296723689398593e-05, "loss": 0.6838, "step": 14764 }, { "epoch": 
2.5269553311654973, "grad_norm": 11.950900077819824, "learning_rate": 2.2957472429277407e-05, "loss": 0.7497, "step": 14765 }, { "epoch": 2.527126476125278, "grad_norm": 28.040447235107422, "learning_rate": 2.294770326964833e-05, "loss": 5.3155, "step": 14766 }, { "epoch": 2.527297621085059, "grad_norm": 18.570659637451172, "learning_rate": 2.2937929420862457e-05, "loss": 2.1338, "step": 14767 }, { "epoch": 2.52746876604484, "grad_norm": 8.387194633483887, "learning_rate": 2.2928150888686418e-05, "loss": 0.4945, "step": 14768 }, { "epoch": 2.527639911004621, "grad_norm": 29.481292724609375, "learning_rate": 2.2918367678889535e-05, "loss": 5.5599, "step": 14769 }, { "epoch": 2.527811055964402, "grad_norm": 8.76069450378418, "learning_rate": 2.2908579797243946e-05, "loss": 0.4533, "step": 14770 }, { "epoch": 2.527982200924183, "grad_norm": 5.085324287414551, "learning_rate": 2.2898787249524474e-05, "loss": 0.3383, "step": 14771 }, { "epoch": 2.528153345883964, "grad_norm": 53.00185775756836, "learning_rate": 2.2888990041508778e-05, "loss": 7.22, "step": 14772 }, { "epoch": 2.5283244908437448, "grad_norm": 15.327123641967773, "learning_rate": 2.2879188178977194e-05, "loss": 1.3947, "step": 14773 }, { "epoch": 2.5284956358035258, "grad_norm": 10.696853637695312, "learning_rate": 2.2869381667712863e-05, "loss": 0.9818, "step": 14774 }, { "epoch": 2.5286667807633068, "grad_norm": 0.3308296203613281, "learning_rate": 2.285957051350161e-05, "loss": 0.093, "step": 14775 }, { "epoch": 2.5288379257230873, "grad_norm": 16.96717643737793, "learning_rate": 2.2849754722132064e-05, "loss": 1.134, "step": 14776 }, { "epoch": 2.5290090706828683, "grad_norm": 0.3712121546268463, "learning_rate": 2.2839934299395526e-05, "loss": 0.0957, "step": 14777 }, { "epoch": 2.5291802156426493, "grad_norm": 15.035158157348633, "learning_rate": 2.283010925108609e-05, "loss": 1.2604, "step": 14778 }, { "epoch": 2.5293513606024303, "grad_norm": 0.4311560094356537, "learning_rate": 
2.2820279583000514e-05, "loss": 0.0965, "step": 14779 }, { "epoch": 2.5295225055622113, "grad_norm": 11.89159107208252, "learning_rate": 2.2810445300938356e-05, "loss": 0.9702, "step": 14780 }, { "epoch": 2.5296936505219922, "grad_norm": 11.383295059204102, "learning_rate": 2.2800606410701817e-05, "loss": 0.9481, "step": 14781 }, { "epoch": 2.529864795481773, "grad_norm": 9.119539260864258, "learning_rate": 2.279076291809588e-05, "loss": 0.7685, "step": 14782 }, { "epoch": 2.5300359404415538, "grad_norm": 23.090713500976562, "learning_rate": 2.278091482892823e-05, "loss": 5.2059, "step": 14783 }, { "epoch": 2.5302070854013348, "grad_norm": 9.357633590698242, "learning_rate": 2.277106214900923e-05, "loss": 0.9213, "step": 14784 }, { "epoch": 2.5303782303611158, "grad_norm": 21.8276309967041, "learning_rate": 2.2761204884151983e-05, "loss": 5.0996, "step": 14785 }, { "epoch": 2.5305493753208967, "grad_norm": 17.570703506469727, "learning_rate": 2.275134304017231e-05, "loss": 1.1656, "step": 14786 }, { "epoch": 2.5307205202806777, "grad_norm": 1.653861165046692, "learning_rate": 2.2741476622888704e-05, "loss": 0.1936, "step": 14787 }, { "epoch": 2.5308916652404587, "grad_norm": 37.85600662231445, "learning_rate": 2.273160563812234e-05, "loss": 6.3732, "step": 14788 }, { "epoch": 2.5310628102002397, "grad_norm": 9.018007278442383, "learning_rate": 2.272173009169713e-05, "loss": 0.681, "step": 14789 }, { "epoch": 2.5312339551600207, "grad_norm": 7.42921257019043, "learning_rate": 2.271184998943969e-05, "loss": 0.5683, "step": 14790 }, { "epoch": 2.5314051001198017, "grad_norm": 17.091533660888672, "learning_rate": 2.2701965337179264e-05, "loss": 1.3542, "step": 14791 }, { "epoch": 2.5315762450795827, "grad_norm": 6.81679105758667, "learning_rate": 2.26920761407478e-05, "loss": 0.4529, "step": 14792 }, { "epoch": 2.531747390039363, "grad_norm": 9.321266174316406, "learning_rate": 2.2682182405979953e-05, "loss": 0.675, "step": 14793 }, { "epoch": 2.531918534999144, 
"grad_norm": 0.6089408993721008, "learning_rate": 2.2672284138713066e-05, "loss": 0.0972, "step": 14794 }, { "epoch": 2.532089679958925, "grad_norm": 2.3100063800811768, "learning_rate": 2.2662381344787112e-05, "loss": 0.188, "step": 14795 }, { "epoch": 2.532260824918706, "grad_norm": 14.153358459472656, "learning_rate": 2.265247403004473e-05, "loss": 1.039, "step": 14796 }, { "epoch": 2.532431969878487, "grad_norm": 7.542194843292236, "learning_rate": 2.2642562200331273e-05, "loss": 0.7313, "step": 14797 }, { "epoch": 2.532603114838268, "grad_norm": 6.960524082183838, "learning_rate": 2.2632645861494755e-05, "loss": 0.6595, "step": 14798 }, { "epoch": 2.5327742597980487, "grad_norm": 7.379673480987549, "learning_rate": 2.2622725019385807e-05, "loss": 0.4587, "step": 14799 }, { "epoch": 2.5329454047578297, "grad_norm": 8.736422538757324, "learning_rate": 2.2612799679857726e-05, "loss": 0.7138, "step": 14800 }, { "epoch": 2.5331165497176107, "grad_norm": 2.135769844055176, "learning_rate": 2.2602869848766497e-05, "loss": 0.2304, "step": 14801 }, { "epoch": 2.5332876946773917, "grad_norm": 0.8696700930595398, "learning_rate": 2.259293553197075e-05, "loss": 0.1485, "step": 14802 }, { "epoch": 2.5334588396371727, "grad_norm": 7.406271934509277, "learning_rate": 2.258299673533171e-05, "loss": 0.6294, "step": 14803 }, { "epoch": 2.5336299845969537, "grad_norm": 1.7431578636169434, "learning_rate": 2.2573053464713304e-05, "loss": 0.1772, "step": 14804 }, { "epoch": 2.5338011295567346, "grad_norm": 5.66493034362793, "learning_rate": 2.2563105725982094e-05, "loss": 0.2347, "step": 14805 }, { "epoch": 2.5339722745165156, "grad_norm": 15.189970016479492, "learning_rate": 2.255315352500722e-05, "loss": 0.9543, "step": 14806 }, { "epoch": 2.5341434194762966, "grad_norm": 20.411930084228516, "learning_rate": 2.2543196867660534e-05, "loss": 2.2729, "step": 14807 }, { "epoch": 2.5343145644360776, "grad_norm": 12.698857307434082, "learning_rate": 2.253323575981644e-05, "loss": 
1.0045, "step": 14808 }, { "epoch": 2.534485709395858, "grad_norm": 3.864426612854004, "learning_rate": 2.2523270207352052e-05, "loss": 0.3824, "step": 14809 }, { "epoch": 2.534656854355639, "grad_norm": 16.947818756103516, "learning_rate": 2.2513300216147013e-05, "loss": 1.3262, "step": 14810 }, { "epoch": 2.53482799931542, "grad_norm": 11.127943992614746, "learning_rate": 2.250332579208367e-05, "loss": 0.8442, "step": 14811 }, { "epoch": 2.534999144275201, "grad_norm": 12.753450393676758, "learning_rate": 2.2493346941046922e-05, "loss": 0.9088, "step": 14812 }, { "epoch": 2.535170289234982, "grad_norm": 7.026861190795898, "learning_rate": 2.248336366892434e-05, "loss": 0.6788, "step": 14813 }, { "epoch": 2.535341434194763, "grad_norm": 15.89734172821045, "learning_rate": 2.2473375981606024e-05, "loss": 1.2512, "step": 14814 }, { "epoch": 2.5355125791545436, "grad_norm": 0.3184797763824463, "learning_rate": 2.2463383884984763e-05, "loss": 0.0913, "step": 14815 }, { "epoch": 2.5356837241143246, "grad_norm": 11.891545295715332, "learning_rate": 2.2453387384955877e-05, "loss": 1.3036, "step": 14816 }, { "epoch": 2.5358548690741056, "grad_norm": 14.315423965454102, "learning_rate": 2.244338648741735e-05, "loss": 0.9891, "step": 14817 }, { "epoch": 2.5360260140338866, "grad_norm": 10.697434425354004, "learning_rate": 2.243338119826969e-05, "loss": 0.9002, "step": 14818 }, { "epoch": 2.5361971589936676, "grad_norm": 10.515617370605469, "learning_rate": 2.2423371523416075e-05, "loss": 0.6732, "step": 14819 }, { "epoch": 2.5363683039534486, "grad_norm": 3.8096933364868164, "learning_rate": 2.2413357468762182e-05, "loss": 0.2348, "step": 14820 }, { "epoch": 2.5365394489132296, "grad_norm": 7.914472579956055, "learning_rate": 2.2403339040216358e-05, "loss": 0.5535, "step": 14821 }, { "epoch": 2.5367105938730106, "grad_norm": 23.99715232849121, "learning_rate": 2.239331624368946e-05, "loss": 2.6468, "step": 14822 }, { "epoch": 2.5368817388327916, "grad_norm": 
1.7412183284759521, "learning_rate": 2.2383289085094976e-05, "loss": 0.1346, "step": 14823 }, { "epoch": 2.5370528837925725, "grad_norm": 14.085664749145508, "learning_rate": 2.2373257570348924e-05, "loss": 1.0026, "step": 14824 }, { "epoch": 2.537224028752353, "grad_norm": 2.4265151023864746, "learning_rate": 2.236322170536992e-05, "loss": 0.1915, "step": 14825 }, { "epoch": 2.537395173712134, "grad_norm": 9.647686004638672, "learning_rate": 2.2353181496079143e-05, "loss": 0.5639, "step": 14826 }, { "epoch": 2.537566318671915, "grad_norm": 2.2418553829193115, "learning_rate": 2.234313694840035e-05, "loss": 0.2044, "step": 14827 }, { "epoch": 2.537737463631696, "grad_norm": 20.247314453125, "learning_rate": 2.2333088068259816e-05, "loss": 1.5628, "step": 14828 }, { "epoch": 2.537908608591477, "grad_norm": 11.692889213562012, "learning_rate": 2.2323034861586385e-05, "loss": 0.8244, "step": 14829 }, { "epoch": 2.538079753551258, "grad_norm": 11.490840911865234, "learning_rate": 2.2312977334311475e-05, "loss": 0.8679, "step": 14830 }, { "epoch": 2.5382508985110386, "grad_norm": 4.355506420135498, "learning_rate": 2.230291549236907e-05, "loss": 0.2557, "step": 14831 }, { "epoch": 2.5384220434708196, "grad_norm": 10.936756134033203, "learning_rate": 2.2292849341695644e-05, "loss": 0.7491, "step": 14832 }, { "epoch": 2.5385931884306006, "grad_norm": 5.783020496368408, "learning_rate": 2.2282778888230224e-05, "loss": 0.3692, "step": 14833 }, { "epoch": 2.5387643333903815, "grad_norm": 9.403080940246582, "learning_rate": 2.2272704137914415e-05, "loss": 0.6683, "step": 14834 }, { "epoch": 2.5389354783501625, "grad_norm": 0.42916685342788696, "learning_rate": 2.226262509669235e-05, "loss": 0.1004, "step": 14835 }, { "epoch": 2.5391066233099435, "grad_norm": 14.160784721374512, "learning_rate": 2.2252541770510662e-05, "loss": 0.9216, "step": 14836 }, { "epoch": 2.5392777682697245, "grad_norm": 0.589654803276062, "learning_rate": 2.2242454165318507e-05, "loss": 0.0959, "step": 
14837 }, { "epoch": 2.5394489132295055, "grad_norm": 13.865631103515625, "learning_rate": 2.2232362287067604e-05, "loss": 1.0218, "step": 14838 }, { "epoch": 2.5396200581892865, "grad_norm": 8.55599594116211, "learning_rate": 2.22222661417122e-05, "loss": 0.5843, "step": 14839 }, { "epoch": 2.5397912031490675, "grad_norm": 18.53800392150879, "learning_rate": 2.2212165735209014e-05, "loss": 1.8839, "step": 14840 }, { "epoch": 2.5399623481088485, "grad_norm": 9.631828308105469, "learning_rate": 2.2202061073517288e-05, "loss": 0.6755, "step": 14841 }, { "epoch": 2.540133493068629, "grad_norm": 10.829450607299805, "learning_rate": 2.2191952162598798e-05, "loss": 0.7724, "step": 14842 }, { "epoch": 2.54030463802841, "grad_norm": 13.621152877807617, "learning_rate": 2.2181839008417835e-05, "loss": 0.7983, "step": 14843 }, { "epoch": 2.540475782988191, "grad_norm": 5.24722146987915, "learning_rate": 2.2171721616941167e-05, "loss": 0.2585, "step": 14844 }, { "epoch": 2.540646927947972, "grad_norm": 5.13550329208374, "learning_rate": 2.2161599994138046e-05, "loss": 0.5344, "step": 14845 }, { "epoch": 2.540818072907753, "grad_norm": 12.812666893005371, "learning_rate": 2.2151474145980255e-05, "loss": 0.737, "step": 14846 }, { "epoch": 2.540989217867534, "grad_norm": 1.3931697607040405, "learning_rate": 2.2141344078442066e-05, "loss": 0.1571, "step": 14847 }, { "epoch": 2.5411603628273145, "grad_norm": 8.335688591003418, "learning_rate": 2.2131209797500253e-05, "loss": 1.5738, "step": 14848 }, { "epoch": 2.5413315077870955, "grad_norm": 12.297319412231445, "learning_rate": 2.2121071309134023e-05, "loss": 0.8164, "step": 14849 }, { "epoch": 2.5415026527468765, "grad_norm": 0.5159737467765808, "learning_rate": 2.211092861932513e-05, "loss": 0.1012, "step": 14850 }, { "epoch": 2.5416737977066575, "grad_norm": 1.5781267881393433, "learning_rate": 2.210078173405774e-05, "loss": 0.1822, "step": 14851 }, { "epoch": 2.5418449426664385, "grad_norm": 10.817215919494629, 
"learning_rate": 2.209063065931857e-05, "loss": 0.8218, "step": 14852 }, { "epoch": 2.5420160876262194, "grad_norm": 15.264995574951172, "learning_rate": 2.2080475401096736e-05, "loss": 1.4324, "step": 14853 }, { "epoch": 2.5421872325860004, "grad_norm": 2.434833526611328, "learning_rate": 2.2070315965383886e-05, "loss": 0.1804, "step": 14854 }, { "epoch": 2.5423583775457814, "grad_norm": 64.04412841796875, "learning_rate": 2.2060152358174063e-05, "loss": 7.5448, "step": 14855 }, { "epoch": 2.5425295225055624, "grad_norm": 10.27996826171875, "learning_rate": 2.2049984585463854e-05, "loss": 0.7197, "step": 14856 }, { "epoch": 2.5427006674653434, "grad_norm": 11.611339569091797, "learning_rate": 2.203981265325222e-05, "loss": 0.8052, "step": 14857 }, { "epoch": 2.542871812425124, "grad_norm": 13.633689880371094, "learning_rate": 2.2029636567540652e-05, "loss": 1.028, "step": 14858 }, { "epoch": 2.543042957384905, "grad_norm": 8.97692584991455, "learning_rate": 2.2019456334333023e-05, "loss": 0.5909, "step": 14859 }, { "epoch": 2.543214102344686, "grad_norm": 14.345952987670898, "learning_rate": 2.2009271959635715e-05, "loss": 1.1281, "step": 14860 }, { "epoch": 2.543385247304467, "grad_norm": 25.936738967895508, "learning_rate": 2.19990834494575e-05, "loss": 5.5474, "step": 14861 }, { "epoch": 2.543556392264248, "grad_norm": 12.348350524902344, "learning_rate": 2.1988890809809642e-05, "loss": 0.9628, "step": 14862 }, { "epoch": 2.543727537224029, "grad_norm": 38.14033508300781, "learning_rate": 2.1978694046705773e-05, "loss": 6.6386, "step": 14863 }, { "epoch": 2.5438986821838094, "grad_norm": 20.159040451049805, "learning_rate": 2.1968493166162042e-05, "loss": 2.3438, "step": 14864 }, { "epoch": 2.5440698271435904, "grad_norm": 0.3866449296474457, "learning_rate": 2.1958288174196947e-05, "loss": 0.0955, "step": 14865 }, { "epoch": 2.5442409721033714, "grad_norm": 14.329410552978516, "learning_rate": 2.1948079076831482e-05, "loss": 0.9018, "step": 14866 }, { "epoch": 
2.5444121170631524, "grad_norm": 24.51056480407715, "learning_rate": 2.1937865880088994e-05, "loss": 3.7353, "step": 14867 }, { "epoch": 2.5445832620229334, "grad_norm": 0.30827730894088745, "learning_rate": 2.1927648589995305e-05, "loss": 0.0931, "step": 14868 }, { "epoch": 2.5447544069827144, "grad_norm": 0.326984703540802, "learning_rate": 2.191742721257865e-05, "loss": 0.0915, "step": 14869 }, { "epoch": 2.5449255519424954, "grad_norm": 4.498813629150391, "learning_rate": 2.1907201753869618e-05, "loss": 0.3864, "step": 14870 }, { "epoch": 2.5450966969022764, "grad_norm": 39.71647644042969, "learning_rate": 2.189697221990126e-05, "loss": 6.6565, "step": 14871 }, { "epoch": 2.5452678418620573, "grad_norm": 17.917158126831055, "learning_rate": 2.1886738616709038e-05, "loss": 1.3816, "step": 14872 }, { "epoch": 2.5454389868218383, "grad_norm": 8.810152053833008, "learning_rate": 2.187650095033077e-05, "loss": 0.5572, "step": 14873 }, { "epoch": 2.545610131781619, "grad_norm": 8.417913436889648, "learning_rate": 2.1866259226806687e-05, "loss": 0.5775, "step": 14874 }, { "epoch": 2.5457812767414, "grad_norm": 3.2366271018981934, "learning_rate": 2.1856013452179433e-05, "loss": 0.2501, "step": 14875 }, { "epoch": 2.545952421701181, "grad_norm": 0.28882524371147156, "learning_rate": 2.1845763632494046e-05, "loss": 0.083, "step": 14876 }, { "epoch": 2.546123566660962, "grad_norm": 16.2208194732666, "learning_rate": 2.1835509773797922e-05, "loss": 1.1197, "step": 14877 }, { "epoch": 2.546294711620743, "grad_norm": 8.216126441955566, "learning_rate": 2.182525188214083e-05, "loss": 0.7078, "step": 14878 }, { "epoch": 2.546465856580524, "grad_norm": 9.191536903381348, "learning_rate": 2.181498996357496e-05, "loss": 0.5262, "step": 14879 }, { "epoch": 2.5466370015403044, "grad_norm": 12.568198204040527, "learning_rate": 2.1804724024154883e-05, "loss": 0.9777, "step": 14880 }, { "epoch": 2.5468081465000854, "grad_norm": 4.361909866333008, "learning_rate": 
2.1794454069937495e-05, "loss": 0.454, "step": 14881 }, { "epoch": 2.5469792914598663, "grad_norm": 15.561259269714355, "learning_rate": 2.1784180106982066e-05, "loss": 1.2742, "step": 14882 }, { "epoch": 2.5471504364196473, "grad_norm": 13.630328178405762, "learning_rate": 2.177390214135027e-05, "loss": 1.431, "step": 14883 }, { "epoch": 2.5473215813794283, "grad_norm": 16.8635311126709, "learning_rate": 2.1763620179106137e-05, "loss": 1.6303, "step": 14884 }, { "epoch": 2.5474927263392093, "grad_norm": 0.3227163553237915, "learning_rate": 2.1753334226316033e-05, "loss": 0.0858, "step": 14885 }, { "epoch": 2.5476638712989903, "grad_norm": 18.632020950317383, "learning_rate": 2.1743044289048654e-05, "loss": 2.0206, "step": 14886 }, { "epoch": 2.5478350162587713, "grad_norm": 13.8741455078125, "learning_rate": 2.1732750373375098e-05, "loss": 1.6832, "step": 14887 }, { "epoch": 2.5480061612185523, "grad_norm": 12.148030281066895, "learning_rate": 2.1722452485368815e-05, "loss": 1.0131, "step": 14888 }, { "epoch": 2.5481773061783333, "grad_norm": 29.41322898864746, "learning_rate": 2.1712150631105528e-05, "loss": 5.3903, "step": 14889 }, { "epoch": 2.5483484511381143, "grad_norm": 12.678583145141602, "learning_rate": 2.1701844816663377e-05, "loss": 0.7445, "step": 14890 }, { "epoch": 2.548519596097895, "grad_norm": 11.547226905822754, "learning_rate": 2.1691535048122818e-05, "loss": 0.8211, "step": 14891 }, { "epoch": 2.548690741057676, "grad_norm": 8.636588096618652, "learning_rate": 2.1681221331566598e-05, "loss": 0.7865, "step": 14892 }, { "epoch": 2.548861886017457, "grad_norm": 7.975541114807129, "learning_rate": 2.167090367307986e-05, "loss": 0.5964, "step": 14893 }, { "epoch": 2.5490330309772378, "grad_norm": 1.6466413736343384, "learning_rate": 2.1660582078749993e-05, "loss": 0.1667, "step": 14894 }, { "epoch": 2.5492041759370188, "grad_norm": 10.69886589050293, "learning_rate": 2.1650256554666804e-05, "loss": 0.915, "step": 14895 }, { "epoch": 
2.5493753208967997, "grad_norm": 10.410268783569336, "learning_rate": 2.1639927106922324e-05, "loss": 0.7933, "step": 14896 }, { "epoch": 2.5495464658565803, "grad_norm": 6.350156784057617, "learning_rate": 2.162959374161098e-05, "loss": 0.3851, "step": 14897 }, { "epoch": 2.5497176108163613, "grad_norm": 14.016349792480469, "learning_rate": 2.1619256464829433e-05, "loss": 0.8286, "step": 14898 }, { "epoch": 2.5498887557761423, "grad_norm": 0.27360934019088745, "learning_rate": 2.1608915282676728e-05, "loss": 0.0891, "step": 14899 }, { "epoch": 2.5500599007359233, "grad_norm": 11.159873962402344, "learning_rate": 2.159857020125415e-05, "loss": 0.7185, "step": 14900 }, { "epoch": 2.5502310456957042, "grad_norm": 10.134807586669922, "learning_rate": 2.1588221226665344e-05, "loss": 1.0953, "step": 14901 }, { "epoch": 2.5504021906554852, "grad_norm": 60.004615783691406, "learning_rate": 2.157786836501618e-05, "loss": 7.5149, "step": 14902 }, { "epoch": 2.5505733356152662, "grad_norm": 14.00865364074707, "learning_rate": 2.15675116224149e-05, "loss": 0.9257, "step": 14903 }, { "epoch": 2.550744480575047, "grad_norm": 18.80005645751953, "learning_rate": 2.1557151004971965e-05, "loss": 2.1279, "step": 14904 }, { "epoch": 2.550915625534828, "grad_norm": 13.166810035705566, "learning_rate": 2.154678651880019e-05, "loss": 1.5046, "step": 14905 }, { "epoch": 2.551086770494609, "grad_norm": 1.4229991436004639, "learning_rate": 2.1536418170014595e-05, "loss": 0.1169, "step": 14906 }, { "epoch": 2.5512579154543897, "grad_norm": 19.945756912231445, "learning_rate": 2.1526045964732566e-05, "loss": 4.7664, "step": 14907 }, { "epoch": 2.5514290604141707, "grad_norm": 16.982746124267578, "learning_rate": 2.1515669909073675e-05, "loss": 2.0395, "step": 14908 }, { "epoch": 2.5516002053739517, "grad_norm": 16.958377838134766, "learning_rate": 2.1505290009159857e-05, "loss": 1.4035, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_nli-pairs_loss": 1.120631217956543, 
"eval_nli-pairs_runtime": 4.2627, "eval_nli-pairs_samples_per_second": 46.919, "eval_nli-pairs_steps_per_second": 1.642, "eval_sts-test_pearson_cosine": 0.7834909659305883, "eval_sts-test_pearson_dot": 0.633810775281594, "eval_sts-test_pearson_euclidean": 0.7655395893514084, "eval_sts-test_pearson_manhattan": 0.7686048699481917, "eval_sts-test_pearson_max": 0.7834909659305883, "eval_sts-test_spearman_cosine": 0.791696889956414, "eval_sts-test_spearman_dot": 0.6110073285561313, "eval_sts-test_spearman_euclidean": 0.7542411302358768, "eval_sts-test_spearman_manhattan": 0.7601652358630435, "eval_sts-test_spearman_max": 0.791696889956414, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_vitaminc-pairs_loss": 0.5565792918205261, "eval_vitaminc-pairs_runtime": 2.7085, "eval_vitaminc-pairs_samples_per_second": 73.843, "eval_vitaminc-pairs_steps_per_second": 2.584, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_qnli-contrastive_loss": 1.1673836708068848, "eval_qnli-contrastive_runtime": 0.6599, "eval_qnli-contrastive_samples_per_second": 303.066, "eval_qnli-contrastive_steps_per_second": 10.607, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_scitail-pairs-qa_loss": 0.07468931376934052, "eval_scitail-pairs-qa_runtime": 1.6094, "eval_scitail-pairs-qa_samples_per_second": 124.273, "eval_scitail-pairs-qa_steps_per_second": 4.35, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_scitail-pairs-pos_loss": 0.537910521030426, "eval_scitail-pairs-pos_runtime": 2.5747, "eval_scitail-pairs-pos_samples_per_second": 77.679, "eval_scitail-pairs-pos_steps_per_second": 2.719, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_xsum-pairs_loss": 0.6066221594810486, "eval_xsum-pairs_runtime": 2.6415, "eval_xsum-pairs_samples_per_second": 66.25, "eval_xsum-pairs_steps_per_second": 2.271, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_compression-pairs_loss": 0.158778578042984, "eval_compression-pairs_runtime": 0.5069, 
"eval_compression-pairs_samples_per_second": 394.568, "eval_compression-pairs_steps_per_second": 13.81, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_sciq_pairs_loss": 0.33066776394844055, "eval_sciq_pairs_runtime": 9.0797, "eval_sciq_pairs_samples_per_second": 22.027, "eval_sciq_pairs_steps_per_second": 0.771, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_qasc_pairs_loss": 5.162327766418457, "eval_qasc_pairs_runtime": 2.6445, "eval_qasc_pairs_samples_per_second": 75.628, "eval_qasc_pairs_steps_per_second": 2.647, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_openbookqa_pairs_loss": 2.2789571285247803, "eval_openbookqa_pairs_runtime": 0.6319, "eval_openbookqa_pairs_samples_per_second": 109.197, "eval_openbookqa_pairs_steps_per_second": 4.748, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_msmarco_pairs_loss": 0.7646405100822449, "eval_msmarco_pairs_runtime": 3.9114, "eval_msmarco_pairs_samples_per_second": 51.133, "eval_msmarco_pairs_steps_per_second": 1.79, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_nq_pairs_loss": 0.9478620886802673, "eval_nq_pairs_runtime": 8.5804, "eval_nq_pairs_samples_per_second": 23.309, "eval_nq_pairs_steps_per_second": 0.816, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_trivia_pairs_loss": 1.3344215154647827, "eval_trivia_pairs_runtime": 12.7365, "eval_trivia_pairs_samples_per_second": 15.703, "eval_trivia_pairs_steps_per_second": 0.55, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_quora_pairs_loss": 0.14927494525909424, "eval_quora_pairs_runtime": 1.5929, "eval_quora_pairs_samples_per_second": 125.555, "eval_quora_pairs_steps_per_second": 4.394, "step": 14909 }, { "epoch": 2.5516002053739517, "eval_gooaq_pairs_loss": 0.6557897925376892, "eval_gooaq_pairs_runtime": 2.6299, "eval_gooaq_pairs_samples_per_second": 76.048, "eval_gooaq_pairs_steps_per_second": 2.662, "step": 14909 }, { "epoch": 2.5517713503337327, "grad_norm": 23.04192543029785, "learning_rate": 2.1494906271115225e-05, 
"loss": 1.7202, "step": 14910 }, { "epoch": 2.5519424952935137, "grad_norm": 19.534320831298828, "learning_rate": 2.1484518701066216e-05, "loss": 4.7485, "step": 14911 }, { "epoch": 2.5521136402532947, "grad_norm": 12.829976081848145, "learning_rate": 2.1474127305141514e-05, "loss": 0.7576, "step": 14912 }, { "epoch": 2.5522847852130752, "grad_norm": 6.833683013916016, "learning_rate": 2.146373208947208e-05, "loss": 0.4711, "step": 14913 }, { "epoch": 2.552455930172856, "grad_norm": 4.855682373046875, "learning_rate": 2.1453333060191083e-05, "loss": 0.3376, "step": 14914 }, { "epoch": 2.552627075132637, "grad_norm": 9.678744316101074, "learning_rate": 2.1442930223433952e-05, "loss": 0.8591, "step": 14915 }, { "epoch": 2.552798220092418, "grad_norm": 9.4759521484375, "learning_rate": 2.1432523585338393e-05, "loss": 0.6573, "step": 14916 }, { "epoch": 2.552969365052199, "grad_norm": 0.33439746499061584, "learning_rate": 2.1422113152044354e-05, "loss": 0.0973, "step": 14917 }, { "epoch": 2.55314051001198, "grad_norm": 11.356155395507812, "learning_rate": 2.1411698929693996e-05, "loss": 1.16, "step": 14918 }, { "epoch": 2.553311654971761, "grad_norm": 10.032844543457031, "learning_rate": 2.1401280924431694e-05, "loss": 0.682, "step": 14919 }, { "epoch": 2.553482799931542, "grad_norm": 2.930114269256592, "learning_rate": 2.1390859142404114e-05, "loss": 0.2002, "step": 14920 }, { "epoch": 2.553653944891323, "grad_norm": 15.541784286499023, "learning_rate": 2.1380433589760144e-05, "loss": 1.2242, "step": 14921 }, { "epoch": 2.553825089851104, "grad_norm": 8.50974178314209, "learning_rate": 2.1370004272650847e-05, "loss": 0.6792, "step": 14922 }, { "epoch": 2.5539962348108847, "grad_norm": 10.010719299316406, "learning_rate": 2.1359571197229526e-05, "loss": 0.8043, "step": 14923 }, { "epoch": 2.5541673797706657, "grad_norm": 0.3111683428287506, "learning_rate": 2.1349134369651726e-05, "loss": 0.0926, "step": 14924 }, { "epoch": 2.5543385247304466, "grad_norm": 
15.324975967407227, "learning_rate": 2.1338693796075205e-05, "loss": 1.1748, "step": 14925 }, { "epoch": 2.5545096696902276, "grad_norm": 17.022727966308594, "learning_rate": 2.132824948265991e-05, "loss": 1.2669, "step": 14926 }, { "epoch": 2.5546808146500086, "grad_norm": 14.866715431213379, "learning_rate": 2.1317801435567974e-05, "loss": 1.2419, "step": 14927 }, { "epoch": 2.5548519596097896, "grad_norm": 19.098058700561523, "learning_rate": 2.130734966096378e-05, "loss": 1.8095, "step": 14928 }, { "epoch": 2.55502310456957, "grad_norm": 0.32640114426612854, "learning_rate": 2.1296894165013907e-05, "loss": 0.0926, "step": 14929 }, { "epoch": 2.555194249529351, "grad_norm": 3.7135837078094482, "learning_rate": 2.1286434953887102e-05, "loss": 0.2108, "step": 14930 }, { "epoch": 2.555365394489132, "grad_norm": 39.058292388916016, "learning_rate": 2.127597203375429e-05, "loss": 7.083, "step": 14931 }, { "epoch": 2.555536539448913, "grad_norm": 18.135663986206055, "learning_rate": 2.126550541078863e-05, "loss": 2.2722, "step": 14932 }, { "epoch": 2.555707684408694, "grad_norm": 18.42775535583496, "learning_rate": 2.125503509116545e-05, "loss": 1.3974, "step": 14933 }, { "epoch": 2.555878829368475, "grad_norm": 14.927212715148926, "learning_rate": 2.1244561081062262e-05, "loss": 1.1124, "step": 14934 }, { "epoch": 2.556049974328256, "grad_norm": 25.008014678955078, "learning_rate": 2.123408338665872e-05, "loss": 5.1284, "step": 14935 }, { "epoch": 2.556221119288037, "grad_norm": 15.350872993469238, "learning_rate": 2.1223602014136712e-05, "loss": 1.2146, "step": 14936 }, { "epoch": 2.556392264247818, "grad_norm": 15.238006591796875, "learning_rate": 2.121311696968023e-05, "loss": 1.905, "step": 14937 }, { "epoch": 2.556563409207599, "grad_norm": 21.918561935424805, "learning_rate": 2.1202628259475498e-05, "loss": 3.3005, "step": 14938 }, { "epoch": 2.55673455416738, "grad_norm": 4.679481506347656, "learning_rate": 2.1192135889710837e-05, "loss": 0.3217, "step": 14939 
}, { "epoch": 2.5569056991271606, "grad_norm": 5.3423075675964355, "learning_rate": 2.118163986657679e-05, "loss": 0.4768, "step": 14940 }, { "epoch": 2.5570768440869416, "grad_norm": 13.814794540405273, "learning_rate": 2.1171140196266005e-05, "loss": 1.0074, "step": 14941 }, { "epoch": 2.5572479890467226, "grad_norm": 9.153532028198242, "learning_rate": 2.116063688497333e-05, "loss": 0.6338, "step": 14942 }, { "epoch": 2.5574191340065036, "grad_norm": 11.833871841430664, "learning_rate": 2.1150129938895692e-05, "loss": 0.6006, "step": 14943 }, { "epoch": 2.5575902789662845, "grad_norm": 3.368509292602539, "learning_rate": 2.1139619364232253e-05, "loss": 0.2308, "step": 14944 }, { "epoch": 2.5577614239260655, "grad_norm": 0.33954349160194397, "learning_rate": 2.1129105167184223e-05, "loss": 0.0928, "step": 14945 }, { "epoch": 2.557932568885846, "grad_norm": 20.24551773071289, "learning_rate": 2.1118587353955037e-05, "loss": 1.427, "step": 14946 }, { "epoch": 2.558103713845627, "grad_norm": 39.546966552734375, "learning_rate": 2.1108065930750177e-05, "loss": 7.6556, "step": 14947 }, { "epoch": 2.558274858805408, "grad_norm": 3.718235731124878, "learning_rate": 2.1097540903777336e-05, "loss": 0.2275, "step": 14948 }, { "epoch": 2.558446003765189, "grad_norm": 8.186110496520996, "learning_rate": 2.108701227924627e-05, "loss": 0.6181, "step": 14949 }, { "epoch": 2.55861714872497, "grad_norm": 10.332124710083008, "learning_rate": 2.107648006336891e-05, "loss": 0.8186, "step": 14950 }, { "epoch": 2.558788293684751, "grad_norm": 12.202836036682129, "learning_rate": 2.1065944262359238e-05, "loss": 0.6572, "step": 14951 }, { "epoch": 2.558959438644532, "grad_norm": 13.431889533996582, "learning_rate": 2.1055404882433435e-05, "loss": 0.956, "step": 14952 }, { "epoch": 2.559130583604313, "grad_norm": 9.904834747314453, "learning_rate": 2.1044861929809715e-05, "loss": 0.61, "step": 14953 }, { "epoch": 2.559301728564094, "grad_norm": 4.145629405975342, "learning_rate": 
2.1034315410708455e-05, "loss": 0.4618, "step": 14954 }, { "epoch": 2.559472873523875, "grad_norm": 0.9454410076141357, "learning_rate": 2.1023765331352134e-05, "loss": 0.1313, "step": 14955 }, { "epoch": 2.5596440184836555, "grad_norm": 1.0192196369171143, "learning_rate": 2.1013211697965273e-05, "loss": 0.1635, "step": 14956 }, { "epoch": 2.5598151634434365, "grad_norm": 2.512600898742676, "learning_rate": 2.1002654516774553e-05, "loss": 0.219, "step": 14957 }, { "epoch": 2.5599863084032175, "grad_norm": 14.576485633850098, "learning_rate": 2.099209379400875e-05, "loss": 1.4803, "step": 14958 }, { "epoch": 2.5601574533629985, "grad_norm": 43.57606506347656, "learning_rate": 2.0981529535898686e-05, "loss": 6.9266, "step": 14959 }, { "epoch": 2.5603285983227795, "grad_norm": 0.9998489022254944, "learning_rate": 2.097096174867726e-05, "loss": 0.1578, "step": 14960 }, { "epoch": 2.5604997432825605, "grad_norm": 11.67483139038086, "learning_rate": 2.0960390438579514e-05, "loss": 0.7173, "step": 14961 }, { "epoch": 2.560670888242341, "grad_norm": 3.6552770137786865, "learning_rate": 2.094981561184255e-05, "loss": 0.3029, "step": 14962 }, { "epoch": 2.560842033202122, "grad_norm": 0.30459484457969666, "learning_rate": 2.093923727470551e-05, "loss": 0.0876, "step": 14963 }, { "epoch": 2.561013178161903, "grad_norm": 10.111140251159668, "learning_rate": 2.0928655433409614e-05, "loss": 0.7835, "step": 14964 }, { "epoch": 2.561184323121684, "grad_norm": 2.9463531970977783, "learning_rate": 2.0918070094198185e-05, "loss": 0.2501, "step": 14965 }, { "epoch": 2.561355468081465, "grad_norm": 8.904659271240234, "learning_rate": 2.09074812633166e-05, "loss": 0.4727, "step": 14966 }, { "epoch": 2.561526613041246, "grad_norm": 0.9995426535606384, "learning_rate": 2.0896888947012275e-05, "loss": 0.1617, "step": 14967 }, { "epoch": 2.561697758001027, "grad_norm": 24.442790985107422, "learning_rate": 2.0886293151534666e-05, "loss": 5.1037, "step": 14968 }, { "epoch": 
2.561868902960808, "grad_norm": 16.466705322265625, "learning_rate": 2.087569388313533e-05, "loss": 1.3126, "step": 14969 }, { "epoch": 2.562040047920589, "grad_norm": 12.439037322998047, "learning_rate": 2.0865091148067874e-05, "loss": 0.8174, "step": 14970 }, { "epoch": 2.56221119288037, "grad_norm": 8.18338394165039, "learning_rate": 2.0854484952587897e-05, "loss": 0.6564, "step": 14971 }, { "epoch": 2.5623823378401505, "grad_norm": 10.432333946228027, "learning_rate": 2.0843875302953067e-05, "loss": 0.7201, "step": 14972 }, { "epoch": 2.5625534827999314, "grad_norm": 12.981709480285645, "learning_rate": 2.08332622054231e-05, "loss": 0.9254, "step": 14973 }, { "epoch": 2.5627246277597124, "grad_norm": 5.060718536376953, "learning_rate": 2.0822645666259768e-05, "loss": 0.5194, "step": 14974 }, { "epoch": 2.5628957727194934, "grad_norm": 8.997509002685547, "learning_rate": 2.0812025691726795e-05, "loss": 0.7755, "step": 14975 }, { "epoch": 2.5630669176792744, "grad_norm": 1.3221774101257324, "learning_rate": 2.0801402288090013e-05, "loss": 0.1693, "step": 14976 }, { "epoch": 2.5632380626390554, "grad_norm": 9.987198829650879, "learning_rate": 2.079077546161725e-05, "loss": 0.7452, "step": 14977 }, { "epoch": 2.563409207598836, "grad_norm": 15.780447006225586, "learning_rate": 2.0780145218578327e-05, "loss": 1.1272, "step": 14978 }, { "epoch": 2.563580352558617, "grad_norm": 14.939083099365234, "learning_rate": 2.076951156524513e-05, "loss": 1.0128, "step": 14979 }, { "epoch": 2.563751497518398, "grad_norm": 7.640585422515869, "learning_rate": 2.0758874507891503e-05, "loss": 0.9087, "step": 14980 }, { "epoch": 2.563922642478179, "grad_norm": 19.50883674621582, "learning_rate": 2.0748234052793353e-05, "loss": 2.3203, "step": 14981 }, { "epoch": 2.56409378743796, "grad_norm": 1.8413156270980835, "learning_rate": 2.0737590206228533e-05, "loss": 0.1811, "step": 14982 }, { "epoch": 2.564264932397741, "grad_norm": 9.871707916259766, "learning_rate": 
2.072694297447697e-05, "loss": 1.0044, "step": 14983 }, { "epoch": 2.564436077357522, "grad_norm": 14.288470268249512, "learning_rate": 2.0716292363820497e-05, "loss": 1.5474, "step": 14984 }, { "epoch": 2.564607222317303, "grad_norm": 10.653759002685547, "learning_rate": 2.0705638380543034e-05, "loss": 0.8872, "step": 14985 }, { "epoch": 2.564778367277084, "grad_norm": 13.399293899536133, "learning_rate": 2.069498103093041e-05, "loss": 0.9144, "step": 14986 }, { "epoch": 2.564949512236865, "grad_norm": 20.413545608520508, "learning_rate": 2.0684320321270516e-05, "loss": 1.7884, "step": 14987 }, { "epoch": 2.5651206571966454, "grad_norm": 13.422945976257324, "learning_rate": 2.067365625785314e-05, "loss": 0.7918, "step": 14988 }, { "epoch": 2.5652918021564264, "grad_norm": 13.752655029296875, "learning_rate": 2.066298884697015e-05, "loss": 0.9543, "step": 14989 }, { "epoch": 2.5654629471162074, "grad_norm": 5.870984077453613, "learning_rate": 2.065231809491528e-05, "loss": 0.3328, "step": 14990 }, { "epoch": 2.5656340920759884, "grad_norm": 2.369518280029297, "learning_rate": 2.064164400798434e-05, "loss": 0.1925, "step": 14991 }, { "epoch": 2.5658052370357693, "grad_norm": 18.735166549682617, "learning_rate": 2.0630966592475006e-05, "loss": 2.1465, "step": 14992 }, { "epoch": 2.5659763819955503, "grad_norm": 14.432852745056152, "learning_rate": 2.062028585468702e-05, "loss": 1.536, "step": 14993 }, { "epoch": 2.5661475269553313, "grad_norm": 17.422643661499023, "learning_rate": 2.0609601800921984e-05, "loss": 1.8163, "step": 14994 }, { "epoch": 2.566318671915112, "grad_norm": 17.029266357421875, "learning_rate": 2.059891443748355e-05, "loss": 1.4096, "step": 14995 }, { "epoch": 2.566489816874893, "grad_norm": 18.954832077026367, "learning_rate": 2.0588223770677247e-05, "loss": 2.0827, "step": 14996 }, { "epoch": 2.566660961834674, "grad_norm": 15.571535110473633, "learning_rate": 2.057752980681059e-05, "loss": 1.152, "step": 14997 }, { "epoch": 2.566832106794455, 
"grad_norm": 45.537742614746094, "learning_rate": 2.056683255219304e-05, "loss": 7.3696, "step": 14998 }, { "epoch": 2.567003251754236, "grad_norm": 7.563534736633301, "learning_rate": 2.055613201313601e-05, "loss": 0.5683, "step": 14999 }, { "epoch": 2.567174396714017, "grad_norm": 13.762846946716309, "learning_rate": 2.054542819595282e-05, "loss": 1.0238, "step": 15000 }, { "epoch": 2.567345541673798, "grad_norm": 9.096762657165527, "learning_rate": 2.0534721106958715e-05, "loss": 0.7346, "step": 15001 }, { "epoch": 2.567516686633579, "grad_norm": 6.424158096313477, "learning_rate": 2.0524010752470913e-05, "loss": 0.4661, "step": 15002 }, { "epoch": 2.56768783159336, "grad_norm": 24.1102237701416, "learning_rate": 2.0513297138808555e-05, "loss": 2.7669, "step": 15003 }, { "epoch": 2.5678589765531408, "grad_norm": 9.185404777526855, "learning_rate": 2.0502580272292677e-05, "loss": 0.543, "step": 15004 }, { "epoch": 2.5680301215129213, "grad_norm": 9.390907287597656, "learning_rate": 2.0491860159246223e-05, "loss": 0.5725, "step": 15005 }, { "epoch": 2.5682012664727023, "grad_norm": 14.363121032714844, "learning_rate": 2.0481136805994094e-05, "loss": 1.6365, "step": 15006 }, { "epoch": 2.5683724114324833, "grad_norm": 0.4636465907096863, "learning_rate": 2.0470410218863106e-05, "loss": 0.0973, "step": 15007 }, { "epoch": 2.5685435563922643, "grad_norm": 0.5244224667549133, "learning_rate": 2.045968040418194e-05, "loss": 0.129, "step": 15008 }, { "epoch": 2.5687147013520453, "grad_norm": 14.858797073364258, "learning_rate": 2.0448947368281183e-05, "loss": 1.2017, "step": 15009 }, { "epoch": 2.5688858463118263, "grad_norm": 13.200790405273438, "learning_rate": 2.0438211117493363e-05, "loss": 0.8717, "step": 15010 }, { "epoch": 2.569056991271607, "grad_norm": 3.513781785964966, "learning_rate": 2.0427471658152902e-05, "loss": 0.301, "step": 15011 }, { "epoch": 2.569228136231388, "grad_norm": 11.256268501281738, "learning_rate": 2.0416728996596083e-05, "loss": 1.0063, 
"step": 15012 }, { "epoch": 2.569399281191169, "grad_norm": 13.875487327575684, "learning_rate": 2.0405983139161067e-05, "loss": 0.9874, "step": 15013 }, { "epoch": 2.5695704261509498, "grad_norm": 0.9143614172935486, "learning_rate": 2.0395234092187946e-05, "loss": 0.1158, "step": 15014 }, { "epoch": 2.5697415711107308, "grad_norm": 5.517627716064453, "learning_rate": 2.0384481862018697e-05, "loss": 0.5442, "step": 15015 }, { "epoch": 2.5699127160705117, "grad_norm": 5.315635681152344, "learning_rate": 2.0373726454997126e-05, "loss": 0.4804, "step": 15016 }, { "epoch": 2.5700838610302927, "grad_norm": 15.102041244506836, "learning_rate": 2.0362967877468922e-05, "loss": 1.7647, "step": 15017 }, { "epoch": 2.5702550059900737, "grad_norm": 19.14992904663086, "learning_rate": 2.0352206135781683e-05, "loss": 2.1326, "step": 15018 }, { "epoch": 2.5704261509498547, "grad_norm": 18.47158432006836, "learning_rate": 2.0341441236284855e-05, "loss": 2.2118, "step": 15019 }, { "epoch": 2.5705972959096357, "grad_norm": 0.3034180700778961, "learning_rate": 2.033067318532976e-05, "loss": 0.0914, "step": 15020 }, { "epoch": 2.5707684408694162, "grad_norm": 11.107064247131348, "learning_rate": 2.0319901989269523e-05, "loss": 0.704, "step": 15021 }, { "epoch": 2.5709395858291972, "grad_norm": 5.984079360961914, "learning_rate": 2.0309127654459216e-05, "loss": 0.4428, "step": 15022 }, { "epoch": 2.5711107307889782, "grad_norm": 1.358727216720581, "learning_rate": 2.029835018725566e-05, "loss": 0.1772, "step": 15023 }, { "epoch": 2.571281875748759, "grad_norm": 14.07783317565918, "learning_rate": 2.028756959401762e-05, "loss": 1.2199, "step": 15024 }, { "epoch": 2.57145302070854, "grad_norm": 0.9886142015457153, "learning_rate": 2.0276785881105628e-05, "loss": 0.1166, "step": 15025 }, { "epoch": 2.571624165668321, "grad_norm": 8.623197555541992, "learning_rate": 2.0265999054882124e-05, "loss": 0.6494, "step": 15026 }, { "epoch": 2.5717953106281017, "grad_norm": 7.860516548156738, 
"learning_rate": 2.0255209121711313e-05, "loss": 0.4086, "step": 15027 }, { "epoch": 2.5719664555878827, "grad_norm": 15.51563549041748, "learning_rate": 2.0244416087959312e-05, "loss": 1.0779, "step": 15028 }, { "epoch": 2.5721376005476637, "grad_norm": 17.072086334228516, "learning_rate": 2.023361995999399e-05, "loss": 1.2842, "step": 15029 }, { "epoch": 2.5723087455074447, "grad_norm": 6.532835960388184, "learning_rate": 2.022282074418512e-05, "loss": 0.5273, "step": 15030 }, { "epoch": 2.5724798904672257, "grad_norm": 17.24150848388672, "learning_rate": 2.021201844690421e-05, "loss": 1.8301, "step": 15031 }, { "epoch": 2.5726510354270067, "grad_norm": 11.531495094299316, "learning_rate": 2.020121307452467e-05, "loss": 0.7696, "step": 15032 }, { "epoch": 2.5728221803867877, "grad_norm": 0.4274217486381531, "learning_rate": 2.019040463342165e-05, "loss": 0.0951, "step": 15033 }, { "epoch": 2.5729933253465687, "grad_norm": 21.09198570251465, "learning_rate": 2.0179593129972188e-05, "loss": 1.9351, "step": 15034 }, { "epoch": 2.5731644703063496, "grad_norm": 0.34131792187690735, "learning_rate": 2.016877857055504e-05, "loss": 0.0947, "step": 15035 }, { "epoch": 2.5733356152661306, "grad_norm": 46.55397415161133, "learning_rate": 2.015796096155086e-05, "loss": 7.1444, "step": 15036 }, { "epoch": 2.573506760225911, "grad_norm": 11.437747955322266, "learning_rate": 2.0147140309342008e-05, "loss": 0.696, "step": 15037 }, { "epoch": 2.573677905185692, "grad_norm": 12.17674446105957, "learning_rate": 2.013631662031273e-05, "loss": 0.7658, "step": 15038 }, { "epoch": 2.573849050145473, "grad_norm": 16.181354522705078, "learning_rate": 2.0125489900848954e-05, "loss": 1.2364, "step": 15039 }, { "epoch": 2.574020195105254, "grad_norm": 4.297463893890381, "learning_rate": 2.0114660157338545e-05, "loss": 0.4328, "step": 15040 }, { "epoch": 2.574191340065035, "grad_norm": 5.374021053314209, "learning_rate": 2.0103827396171017e-05, "loss": 0.3639, "step": 15041 }, { "epoch": 
2.574362485024816, "grad_norm": 12.3497953414917, "learning_rate": 2.0092991623737713e-05, "loss": 0.7905, "step": 15042 }, { "epoch": 2.574533629984597, "grad_norm": 12.127429008483887, "learning_rate": 2.0082152846431758e-05, "loss": 0.7652, "step": 15043 }, { "epoch": 2.5747047749443777, "grad_norm": 16.298364639282227, "learning_rate": 2.0071311070648076e-05, "loss": 1.6324, "step": 15044 }, { "epoch": 2.5748759199041586, "grad_norm": 13.82253360748291, "learning_rate": 2.006046630278331e-05, "loss": 1.4785, "step": 15045 }, { "epoch": 2.5750470648639396, "grad_norm": 5.7285075187683105, "learning_rate": 2.004961854923587e-05, "loss": 0.5033, "step": 15046 }, { "epoch": 2.5752182098237206, "grad_norm": 0.3507691025733948, "learning_rate": 2.0038767816405962e-05, "loss": 0.0912, "step": 15047 }, { "epoch": 2.5753893547835016, "grad_norm": 5.285333156585693, "learning_rate": 2.0027914110695558e-05, "loss": 0.2492, "step": 15048 }, { "epoch": 2.5755604997432826, "grad_norm": 31.662006378173828, "learning_rate": 2.0017057438508338e-05, "loss": 5.1494, "step": 15049 }, { "epoch": 2.5757316447030636, "grad_norm": 97.16339874267578, "learning_rate": 2.0006197806249737e-05, "loss": 8.4305, "step": 15050 }, { "epoch": 2.5759027896628446, "grad_norm": 11.20245361328125, "learning_rate": 1.9995335220326975e-05, "loss": 0.705, "step": 15051 }, { "epoch": 2.5760739346226256, "grad_norm": 0.9684906005859375, "learning_rate": 1.998446968714901e-05, "loss": 0.1152, "step": 15052 }, { "epoch": 2.5762450795824066, "grad_norm": 21.63077163696289, "learning_rate": 1.997360121312651e-05, "loss": 4.9183, "step": 15053 }, { "epoch": 2.576416224542187, "grad_norm": 11.631574630737305, "learning_rate": 1.9962729804671868e-05, "loss": 0.7147, "step": 15054 }, { "epoch": 2.576587369501968, "grad_norm": 9.479214668273926, "learning_rate": 1.995185546819925e-05, "loss": 0.6449, "step": 15055 }, { "epoch": 2.576758514461749, "grad_norm": 10.53096866607666, "learning_rate": 
1.9940978210124538e-05, "loss": 0.8513, "step": 15056 }, { "epoch": 2.57692965942153, "grad_norm": 12.310078620910645, "learning_rate": 1.993009803686533e-05, "loss": 0.8301, "step": 15057 }, { "epoch": 2.577100804381311, "grad_norm": 8.389551162719727, "learning_rate": 1.9919214954840918e-05, "loss": 0.447, "step": 15058 }, { "epoch": 2.577271949341092, "grad_norm": 12.011335372924805, "learning_rate": 1.9908328970472357e-05, "loss": 0.785, "step": 15059 }, { "epoch": 2.5774430943008726, "grad_norm": 8.316442489624023, "learning_rate": 1.9897440090182412e-05, "loss": 0.601, "step": 15060 }, { "epoch": 2.5776142392606536, "grad_norm": 0.6412540078163147, "learning_rate": 1.9886548320395496e-05, "loss": 0.1054, "step": 15061 }, { "epoch": 2.5777853842204346, "grad_norm": 0.3269597291946411, "learning_rate": 1.9875653667537794e-05, "loss": 0.0954, "step": 15062 }, { "epoch": 2.5779565291802156, "grad_norm": 5.142387866973877, "learning_rate": 1.9864756138037188e-05, "loss": 0.4576, "step": 15063 }, { "epoch": 2.5781276741399966, "grad_norm": 16.568979263305664, "learning_rate": 1.9853855738323194e-05, "loss": 2.0292, "step": 15064 }, { "epoch": 2.5782988190997775, "grad_norm": 10.97111701965332, "learning_rate": 1.9842952474827102e-05, "loss": 0.751, "step": 15065 }, { "epoch": 2.5784699640595585, "grad_norm": 0.3032510578632355, "learning_rate": 1.983204635398182e-05, "loss": 0.0847, "step": 15066 }, { "epoch": 2.5786411090193395, "grad_norm": 5.169975757598877, "learning_rate": 1.9821137382222012e-05, "loss": 0.3254, "step": 15067 }, { "epoch": 2.5788122539791205, "grad_norm": 12.333659172058105, "learning_rate": 1.9810225565983946e-05, "loss": 0.8785, "step": 15068 }, { "epoch": 2.5789833989389015, "grad_norm": 11.574036598205566, "learning_rate": 1.9799310911705654e-05, "loss": 1.0074, "step": 15069 }, { "epoch": 2.579154543898682, "grad_norm": 1.5504649877548218, "learning_rate": 1.978839342582675e-05, "loss": 0.1739, "step": 15070 }, { "epoch": 
2.579325688858463, "grad_norm": 15.960871696472168, "learning_rate": 1.977747311478862e-05, "loss": 1.5712, "step": 15071 }, { "epoch": 2.579496833818244, "grad_norm": 17.1929931640625, "learning_rate": 1.976654998503421e-05, "loss": 1.8358, "step": 15072 }, { "epoch": 2.579667978778025, "grad_norm": 13.11357593536377, "learning_rate": 1.975562404300823e-05, "loss": 0.8932, "step": 15073 }, { "epoch": 2.579839123737806, "grad_norm": 9.601709365844727, "learning_rate": 1.9744695295156966e-05, "loss": 0.5624, "step": 15074 }, { "epoch": 2.580010268697587, "grad_norm": 10.024223327636719, "learning_rate": 1.9733763747928425e-05, "loss": 0.7247, "step": 15075 }, { "epoch": 2.5801814136573675, "grad_norm": 8.410579681396484, "learning_rate": 1.9722829407772208e-05, "loss": 0.5701, "step": 15076 }, { "epoch": 2.5803525586171485, "grad_norm": 5.880515098571777, "learning_rate": 1.971189228113962e-05, "loss": 0.4324, "step": 15077 }, { "epoch": 2.5805237035769295, "grad_norm": 11.545395851135254, "learning_rate": 1.970095237448355e-05, "loss": 0.8226, "step": 15078 }, { "epoch": 2.5806948485367105, "grad_norm": 0.3842194080352783, "learning_rate": 1.9690009694258603e-05, "loss": 0.0911, "step": 15079 }, { "epoch": 2.5808659934964915, "grad_norm": 2.631908655166626, "learning_rate": 1.9679064246920923e-05, "loss": 0.2397, "step": 15080 }, { "epoch": 2.5810371384562725, "grad_norm": 4.888580799102783, "learning_rate": 1.9668116038928394e-05, "loss": 0.2031, "step": 15081 }, { "epoch": 2.5812082834160535, "grad_norm": 1.77593195438385, "learning_rate": 1.9657165076740426e-05, "loss": 0.1836, "step": 15082 }, { "epoch": 2.5813794283758345, "grad_norm": 16.804601669311523, "learning_rate": 1.9646211366818123e-05, "loss": 1.5903, "step": 15083 }, { "epoch": 2.5815505733356154, "grad_norm": 10.499201774597168, "learning_rate": 1.9635254915624197e-05, "loss": 0.6901, "step": 15084 }, { "epoch": 2.5817217182953964, "grad_norm": 0.36544427275657654, "learning_rate": 
1.9624295729622984e-05, "loss": 0.0903, "step": 15085 }, { "epoch": 2.581892863255177, "grad_norm": 1.3816996812820435, "learning_rate": 1.961333381528041e-05, "loss": 0.1653, "step": 15086 }, { "epoch": 2.582064008214958, "grad_norm": 0.3106927275657654, "learning_rate": 1.9602369179063987e-05, "loss": 0.0902, "step": 15087 }, { "epoch": 2.582235153174739, "grad_norm": 4.2508864402771, "learning_rate": 1.9591401827442894e-05, "loss": 0.3564, "step": 15088 }, { "epoch": 2.58240629813452, "grad_norm": 11.233048439025879, "learning_rate": 1.9580431766887904e-05, "loss": 0.753, "step": 15089 }, { "epoch": 2.582577443094301, "grad_norm": 15.830307006835938, "learning_rate": 1.9569459003871348e-05, "loss": 1.2489, "step": 15090 }, { "epoch": 2.582748588054082, "grad_norm": 4.842190265655518, "learning_rate": 1.955848354486716e-05, "loss": 0.3578, "step": 15091 }, { "epoch": 2.582919733013863, "grad_norm": 15.196305274963379, "learning_rate": 1.9547505396350883e-05, "loss": 1.7358, "step": 15092 }, { "epoch": 2.5830908779736435, "grad_norm": 15.167933464050293, "learning_rate": 1.9536524564799673e-05, "loss": 1.1765, "step": 15093 }, { "epoch": 2.5832620229334244, "grad_norm": 10.86551570892334, "learning_rate": 1.9525541056692213e-05, "loss": 0.6237, "step": 15094 }, { "epoch": 2.5834331678932054, "grad_norm": 1.482143521308899, "learning_rate": 1.951455487850877e-05, "loss": 0.1717, "step": 15095 }, { "epoch": 2.5836043128529864, "grad_norm": 13.119484901428223, "learning_rate": 1.9503566036731222e-05, "loss": 0.9115, "step": 15096 }, { "epoch": 2.5837754578127674, "grad_norm": 26.376602172851562, "learning_rate": 1.9492574537843024e-05, "loss": 5.044, "step": 15097 }, { "epoch": 2.5839466027725484, "grad_norm": 30.82118034362793, "learning_rate": 1.9481580388329158e-05, "loss": 5.4257, "step": 15098 }, { "epoch": 2.5841177477323294, "grad_norm": 20.17182159423828, "learning_rate": 1.9470583594676167e-05, "loss": 2.1697, "step": 15099 }, { "epoch": 2.5842888926921104, 
"grad_norm": 7.32912015914917, "learning_rate": 1.9459584163372197e-05, "loss": 0.3865, "step": 15100 }, { "epoch": 2.5844600376518914, "grad_norm": 9.577326774597168, "learning_rate": 1.9448582100906946e-05, "loss": 0.5724, "step": 15101 }, { "epoch": 2.5846311826116724, "grad_norm": 12.719517707824707, "learning_rate": 1.9437577413771633e-05, "loss": 0.9987, "step": 15102 }, { "epoch": 2.584802327571453, "grad_norm": 11.720426559448242, "learning_rate": 1.9426570108459017e-05, "loss": 0.8703, "step": 15103 }, { "epoch": 2.584973472531234, "grad_norm": 5.553788661956787, "learning_rate": 1.9415560191463444e-05, "loss": 0.3154, "step": 15104 }, { "epoch": 2.585144617491015, "grad_norm": 4.61550760269165, "learning_rate": 1.9404547669280778e-05, "loss": 0.3384, "step": 15105 }, { "epoch": 2.585315762450796, "grad_norm": 14.407800674438477, "learning_rate": 1.9393532548408447e-05, "loss": 0.7792, "step": 15106 }, { "epoch": 2.585486907410577, "grad_norm": 14.518190383911133, "learning_rate": 1.938251483534535e-05, "loss": 1.3431, "step": 15107 }, { "epoch": 2.585658052370358, "grad_norm": 16.46763038635254, "learning_rate": 1.937149453659199e-05, "loss": 1.3278, "step": 15108 }, { "epoch": 2.5858291973301384, "grad_norm": 2.465404510498047, "learning_rate": 1.9360471658650325e-05, "loss": 0.2265, "step": 15109 }, { "epoch": 2.5860003422899194, "grad_norm": 16.973480224609375, "learning_rate": 1.9349446208023903e-05, "loss": 1.8998, "step": 15110 }, { "epoch": 2.5861714872497004, "grad_norm": 7.180452823638916, "learning_rate": 1.9338418191217725e-05, "loss": 0.5943, "step": 15111 }, { "epoch": 2.5863426322094814, "grad_norm": 15.830549240112305, "learning_rate": 1.9327387614738375e-05, "loss": 1.2266, "step": 15112 }, { "epoch": 2.5865137771692623, "grad_norm": 1.519684910774231, "learning_rate": 1.931635448509386e-05, "loss": 0.163, "step": 15113 }, { "epoch": 2.5866849221290433, "grad_norm": 8.774735450744629, "learning_rate": 1.930531880879379e-05, "loss": 0.5928, 
"step": 15114 }, { "epoch": 2.5868560670888243, "grad_norm": 15.79123592376709, "learning_rate": 1.9294280592349193e-05, "loss": 1.143, "step": 15115 }, { "epoch": 2.5870272120486053, "grad_norm": 12.301125526428223, "learning_rate": 1.9283239842272668e-05, "loss": 0.7682, "step": 15116 }, { "epoch": 2.5871983570083863, "grad_norm": 10.727261543273926, "learning_rate": 1.9272196565078238e-05, "loss": 0.8187, "step": 15117 }, { "epoch": 2.5873695019681673, "grad_norm": 8.82696533203125, "learning_rate": 1.9261150767281493e-05, "loss": 0.6238, "step": 15118 }, { "epoch": 2.587540646927948, "grad_norm": 16.824838638305664, "learning_rate": 1.9250102455399427e-05, "loss": 1.1379, "step": 15119 }, { "epoch": 2.587711791887729, "grad_norm": 8.71042537689209, "learning_rate": 1.92390516359506e-05, "loss": 0.5987, "step": 15120 }, { "epoch": 2.58788293684751, "grad_norm": 6.368405342102051, "learning_rate": 1.9227998315454976e-05, "loss": 0.7994, "step": 15121 }, { "epoch": 2.588054081807291, "grad_norm": 11.070363998413086, "learning_rate": 1.9216942500434066e-05, "loss": 0.88, "step": 15122 }, { "epoch": 2.588225226767072, "grad_norm": 11.59894847869873, "learning_rate": 1.920588419741078e-05, "loss": 0.7056, "step": 15123 }, { "epoch": 2.5883963717268528, "grad_norm": 0.2717958390712738, "learning_rate": 1.9194823412909576e-05, "loss": 0.0875, "step": 15124 }, { "epoch": 2.5885675166866333, "grad_norm": 11.125103950500488, "learning_rate": 1.918376015345627e-05, "loss": 0.9034, "step": 15125 }, { "epoch": 2.5887386616464143, "grad_norm": 22.53141212463379, "learning_rate": 1.917269442557828e-05, "loss": 3.4078, "step": 15126 }, { "epoch": 2.5889098066061953, "grad_norm": 10.819095611572266, "learning_rate": 1.9161626235804368e-05, "loss": 0.6358, "step": 15127 }, { "epoch": 2.5890809515659763, "grad_norm": 20.78098487854004, "learning_rate": 1.9150555590664754e-05, "loss": 2.8535, "step": 15128 }, { "epoch": 2.5892520965257573, "grad_norm": 13.112188339233398, 
"learning_rate": 1.9139482496691158e-05, "loss": 1.0125, "step": 15129 }, { "epoch": 2.5894232414855383, "grad_norm": 11.732759475708008, "learning_rate": 1.9128406960416748e-05, "loss": 1.0718, "step": 15130 }, { "epoch": 2.5895943864453193, "grad_norm": 22.954689025878906, "learning_rate": 1.911732898837608e-05, "loss": 5.3309, "step": 15131 }, { "epoch": 2.5897655314051002, "grad_norm": 14.589348793029785, "learning_rate": 1.9106248587105154e-05, "loss": 1.0385, "step": 15132 }, { "epoch": 2.5899366763648812, "grad_norm": 0.3246939480304718, "learning_rate": 1.909516576314145e-05, "loss": 0.0894, "step": 15133 }, { "epoch": 2.590107821324662, "grad_norm": 9.372694969177246, "learning_rate": 1.9084080523023862e-05, "loss": 0.6371, "step": 15134 }, { "epoch": 2.5902789662844428, "grad_norm": 11.697389602661133, "learning_rate": 1.9072992873292686e-05, "loss": 0.8095, "step": 15135 }, { "epoch": 2.5904501112442238, "grad_norm": 6.507179260253906, "learning_rate": 1.9061902820489628e-05, "loss": 0.4906, "step": 15136 }, { "epoch": 2.5906212562040047, "grad_norm": 8.602190017700195, "learning_rate": 1.905081037115785e-05, "loss": 0.4829, "step": 15137 }, { "epoch": 2.5907924011637857, "grad_norm": 4.272670745849609, "learning_rate": 1.9039715531841946e-05, "loss": 0.462, "step": 15138 }, { "epoch": 2.5909635461235667, "grad_norm": 12.092643737792969, "learning_rate": 1.902861830908786e-05, "loss": 1.4451, "step": 15139 }, { "epoch": 2.5911346910833477, "grad_norm": 1.0485435724258423, "learning_rate": 1.901751870944295e-05, "loss": 0.1083, "step": 15140 }, { "epoch": 2.5913058360431287, "grad_norm": 0.3347090184688568, "learning_rate": 1.9006416739456018e-05, "loss": 0.0885, "step": 15141 }, { "epoch": 2.5914769810029092, "grad_norm": 9.732542037963867, "learning_rate": 1.8995312405677262e-05, "loss": 0.7686, "step": 15142 }, { "epoch": 2.5916481259626902, "grad_norm": 11.438066482543945, "learning_rate": 1.8984205714658236e-05, "loss": 0.901, "step": 15143 }, { 
"epoch": 2.591819270922471, "grad_norm": 7.826801300048828, "learning_rate": 1.8973096672951887e-05, "loss": 0.7584, "step": 15144 }, { "epoch": 2.591990415882252, "grad_norm": 4.112530708312988, "learning_rate": 1.896198528711258e-05, "loss": 0.2772, "step": 15145 }, { "epoch": 2.592161560842033, "grad_norm": 8.370685577392578, "learning_rate": 1.895087156369607e-05, "loss": 0.9727, "step": 15146 }, { "epoch": 2.592332705801814, "grad_norm": 14.213507652282715, "learning_rate": 1.893975550925943e-05, "loss": 1.4778, "step": 15147 }, { "epoch": 2.592503850761595, "grad_norm": 8.46274471282959, "learning_rate": 1.892863713036118e-05, "loss": 0.6572, "step": 15148 }, { "epoch": 2.592674995721376, "grad_norm": 18.516733169555664, "learning_rate": 1.891751643356119e-05, "loss": 1.748, "step": 15149 }, { "epoch": 2.592846140681157, "grad_norm": 0.39341068267822266, "learning_rate": 1.8906393425420644e-05, "loss": 0.0861, "step": 15150 }, { "epoch": 2.593017285640938, "grad_norm": 2.221566915512085, "learning_rate": 1.889526811250219e-05, "loss": 0.1834, "step": 15151 }, { "epoch": 2.5931884306007187, "grad_norm": 11.966039657592773, "learning_rate": 1.888414050136972e-05, "loss": 0.8094, "step": 15152 }, { "epoch": 2.5933595755604997, "grad_norm": 11.436786651611328, "learning_rate": 1.8873010598588583e-05, "loss": 0.9999, "step": 15153 }, { "epoch": 2.5935307205202807, "grad_norm": 24.73260498046875, "learning_rate": 1.8861878410725412e-05, "loss": 4.9874, "step": 15154 }, { "epoch": 2.5937018654800617, "grad_norm": 15.489234924316406, "learning_rate": 1.8850743944348244e-05, "loss": 1.149, "step": 15155 }, { "epoch": 2.5938730104398426, "grad_norm": 9.881089210510254, "learning_rate": 1.8839607206026393e-05, "loss": 0.6913, "step": 15156 }, { "epoch": 2.5940441553996236, "grad_norm": 0.8873459696769714, "learning_rate": 1.882846820233059e-05, "loss": 0.14, "step": 15157 }, { "epoch": 2.594215300359404, "grad_norm": 17.68792152404785, "learning_rate": 
1.8817326939832828e-05, "loss": 1.2925, "step": 15158 }, { "epoch": 2.594386445319185, "grad_norm": 11.319480895996094, "learning_rate": 1.8806183425106507e-05, "loss": 0.6008, "step": 15159 }, { "epoch": 2.594557590278966, "grad_norm": 19.31732177734375, "learning_rate": 1.8795037664726276e-05, "loss": 2.1755, "step": 15160 }, { "epoch": 2.594728735238747, "grad_norm": 12.915658950805664, "learning_rate": 1.8783889665268186e-05, "loss": 0.8061, "step": 15161 }, { "epoch": 2.594899880198528, "grad_norm": 15.459428787231445, "learning_rate": 1.877273943330954e-05, "loss": 0.99, "step": 15162 }, { "epoch": 2.595071025158309, "grad_norm": 7.267991542816162, "learning_rate": 1.8761586975429032e-05, "loss": 0.4741, "step": 15163 }, { "epoch": 2.59524217011809, "grad_norm": 11.592203140258789, "learning_rate": 1.875043229820658e-05, "loss": 0.6482, "step": 15164 }, { "epoch": 2.595413315077871, "grad_norm": 9.857067108154297, "learning_rate": 1.8739275408223507e-05, "loss": 0.8365, "step": 15165 }, { "epoch": 2.595584460037652, "grad_norm": 0.269639790058136, "learning_rate": 1.872811631206236e-05, "loss": 0.0854, "step": 15166 }, { "epoch": 2.595755604997433, "grad_norm": 5.790857791900635, "learning_rate": 1.871695501630705e-05, "loss": 0.4405, "step": 15167 }, { "epoch": 2.5959267499572136, "grad_norm": 6.271033763885498, "learning_rate": 1.870579152754273e-05, "loss": 0.3683, "step": 15168 }, { "epoch": 2.5960978949169946, "grad_norm": 5.780383110046387, "learning_rate": 1.869462585235588e-05, "loss": 0.4713, "step": 15169 }, { "epoch": 2.5962690398767756, "grad_norm": 9.594270706176758, "learning_rate": 1.868345799733428e-05, "loss": 0.6685, "step": 15170 }, { "epoch": 2.5964401848365566, "grad_norm": 6.118438243865967, "learning_rate": 1.8672287969066995e-05, "loss": 0.6524, "step": 15171 }, { "epoch": 2.5966113297963376, "grad_norm": 18.041297912597656, "learning_rate": 1.8661115774144333e-05, "loss": 1.7398, "step": 15172 }, { "epoch": 2.5967824747561186, 
"grad_norm": 6.446053981781006, "learning_rate": 1.8649941419157897e-05, "loss": 0.4889, "step": 15173 }, { "epoch": 2.596953619715899, "grad_norm": 21.298049926757812, "learning_rate": 1.8638764910700585e-05, "loss": 4.7381, "step": 15174 }, { "epoch": 2.59712476467568, "grad_norm": 15.968399047851562, "learning_rate": 1.862758625536658e-05, "loss": 0.9978, "step": 15175 }, { "epoch": 2.597295909635461, "grad_norm": 16.415910720825195, "learning_rate": 1.861640545975128e-05, "loss": 1.7307, "step": 15176 }, { "epoch": 2.597467054595242, "grad_norm": 12.115501403808594, "learning_rate": 1.860522253045135e-05, "loss": 0.781, "step": 15177 }, { "epoch": 2.597638199555023, "grad_norm": 19.219608306884766, "learning_rate": 1.8594037474064757e-05, "loss": 1.1359, "step": 15178 }, { "epoch": 2.597809344514804, "grad_norm": 14.244746208190918, "learning_rate": 1.858285029719072e-05, "loss": 0.8563, "step": 15179 }, { "epoch": 2.597980489474585, "grad_norm": 21.969738006591797, "learning_rate": 1.857166100642967e-05, "loss": 2.7444, "step": 15180 }, { "epoch": 2.598151634434366, "grad_norm": 24.648027420043945, "learning_rate": 1.8560469608383293e-05, "loss": 5.3106, "step": 15181 }, { "epoch": 2.598322779394147, "grad_norm": 8.941143035888672, "learning_rate": 1.854927610965454e-05, "loss": 0.6307, "step": 15182 }, { "epoch": 2.598493924353928, "grad_norm": 16.680683135986328, "learning_rate": 1.8538080516847615e-05, "loss": 1.0662, "step": 15183 }, { "epoch": 2.5986650693137086, "grad_norm": 0.7203956246376038, "learning_rate": 1.8526882836567914e-05, "loss": 0.135, "step": 15184 }, { "epoch": 2.5988362142734895, "grad_norm": 8.890251159667969, "learning_rate": 1.8515683075422073e-05, "loss": 0.6076, "step": 15185 }, { "epoch": 2.5990073592332705, "grad_norm": 2.6681928634643555, "learning_rate": 1.8504481240017977e-05, "loss": 0.2585, "step": 15186 }, { "epoch": 2.5991785041930515, "grad_norm": 11.974296569824219, "learning_rate": 1.8493277336964745e-05, "loss": 0.793, 
"step": 15187 }, { "epoch": 2.5993496491528325, "grad_norm": 7.292025566101074, "learning_rate": 1.8482071372872687e-05, "loss": 0.3974, "step": 15188 }, { "epoch": 2.5995207941126135, "grad_norm": 10.183197975158691, "learning_rate": 1.847086335435331e-05, "loss": 0.6509, "step": 15189 }, { "epoch": 2.599691939072394, "grad_norm": 6.272428035736084, "learning_rate": 1.8459653288019385e-05, "loss": 0.4593, "step": 15190 }, { "epoch": 2.599863084032175, "grad_norm": 8.958464622497559, "learning_rate": 1.8448441180484866e-05, "loss": 0.5367, "step": 15191 }, { "epoch": 2.600034228991956, "grad_norm": 5.96212911605835, "learning_rate": 1.8437227038364935e-05, "loss": 0.4588, "step": 15192 }, { "epoch": 2.600205373951737, "grad_norm": 9.121065139770508, "learning_rate": 1.8426010868275917e-05, "loss": 0.7625, "step": 15193 }, { "epoch": 2.600376518911518, "grad_norm": 10.296212196350098, "learning_rate": 1.8414792676835395e-05, "loss": 0.7589, "step": 15194 }, { "epoch": 2.600547663871299, "grad_norm": 14.770569801330566, "learning_rate": 1.840357247066209e-05, "loss": 1.324, "step": 15195 }, { "epoch": 2.60071880883108, "grad_norm": 18.49117088317871, "learning_rate": 1.839235025637598e-05, "loss": 2.1817, "step": 15196 }, { "epoch": 2.600889953790861, "grad_norm": 6.83669900894165, "learning_rate": 1.8381126040598147e-05, "loss": 0.4745, "step": 15197 }, { "epoch": 2.601061098750642, "grad_norm": 9.8988676071167, "learning_rate": 1.8369899829950928e-05, "loss": 0.5922, "step": 15198 }, { "epoch": 2.601232243710423, "grad_norm": 10.49427318572998, "learning_rate": 1.8358671631057772e-05, "loss": 0.6834, "step": 15199 }, { "epoch": 2.601403388670204, "grad_norm": 17.792938232421875, "learning_rate": 1.834744145054338e-05, "loss": 1.9382, "step": 15200 }, { "epoch": 2.6015745336299845, "grad_norm": 10.462258338928223, "learning_rate": 1.8336209295033516e-05, "loss": 0.6318, "step": 15201 }, { "epoch": 2.6017456785897655, "grad_norm": 13.703853607177734, "learning_rate": 
1.8324975171155224e-05, "loss": 1.4407, "step": 15202 }, { "epoch": 2.6019168235495465, "grad_norm": 6.98768424987793, "learning_rate": 1.8313739085536606e-05, "loss": 0.683, "step": 15203 }, { "epoch": 2.6020879685093274, "grad_norm": 0.38384926319122314, "learning_rate": 1.8302501044807012e-05, "loss": 0.0946, "step": 15204 }, { "epoch": 2.6022591134691084, "grad_norm": 14.0606689453125, "learning_rate": 1.8291261055596863e-05, "loss": 1.2293, "step": 15205 }, { "epoch": 2.6024302584288894, "grad_norm": 6.760662078857422, "learning_rate": 1.828001912453781e-05, "loss": 0.8046, "step": 15206 }, { "epoch": 2.60260140338867, "grad_norm": 4.6409077644348145, "learning_rate": 1.8268775258262567e-05, "loss": 0.3758, "step": 15207 }, { "epoch": 2.602772548348451, "grad_norm": 10.591435432434082, "learning_rate": 1.8257529463405073e-05, "loss": 0.7333, "step": 15208 }, { "epoch": 2.602943693308232, "grad_norm": 13.450599670410156, "learning_rate": 1.8246281746600325e-05, "loss": 1.3499, "step": 15209 }, { "epoch": 2.603114838268013, "grad_norm": 0.32653993368148804, "learning_rate": 1.8235032114484528e-05, "loss": 0.0895, "step": 15210 }, { "epoch": 2.603285983227794, "grad_norm": 12.166532516479492, "learning_rate": 1.822378057369493e-05, "loss": 0.8077, "step": 15211 }, { "epoch": 2.603457128187575, "grad_norm": 8.01476764678955, "learning_rate": 1.8212527130870025e-05, "loss": 0.5156, "step": 15212 }, { "epoch": 2.603628273147356, "grad_norm": 34.531002044677734, "learning_rate": 1.8201271792649334e-05, "loss": 6.5963, "step": 15213 }, { "epoch": 2.603799418107137, "grad_norm": 10.49241828918457, "learning_rate": 1.8190014565673493e-05, "loss": 0.6161, "step": 15214 }, { "epoch": 2.603970563066918, "grad_norm": 10.53573226928711, "learning_rate": 1.817875545658431e-05, "loss": 0.6302, "step": 15215 }, { "epoch": 2.604141708026699, "grad_norm": 10.276386260986328, "learning_rate": 1.8167494472024694e-05, "loss": 0.6368, "step": 15216 }, { "epoch": 2.6043128529864794, 
"grad_norm": 6.370922565460205, "learning_rate": 1.8156231618638616e-05, "loss": 0.557, "step": 15217 }, { "epoch": 2.6044839979462604, "grad_norm": 8.528852462768555, "learning_rate": 1.814496690307117e-05, "loss": 0.7008, "step": 15218 }, { "epoch": 2.6046551429060414, "grad_norm": 6.358438491821289, "learning_rate": 1.813370033196856e-05, "loss": 0.4458, "step": 15219 }, { "epoch": 2.6048262878658224, "grad_norm": 9.126945495605469, "learning_rate": 1.812243191197811e-05, "loss": 0.7879, "step": 15220 }, { "epoch": 2.6049974328256034, "grad_norm": 12.489262580871582, "learning_rate": 1.811116164974818e-05, "loss": 0.8161, "step": 15221 }, { "epoch": 2.6051685777853844, "grad_norm": 7.374971866607666, "learning_rate": 1.809988955192822e-05, "loss": 0.5142, "step": 15222 }, { "epoch": 2.605339722745165, "grad_norm": 6.993281841278076, "learning_rate": 1.808861562516881e-05, "loss": 0.6862, "step": 15223 }, { "epoch": 2.605510867704946, "grad_norm": 1.6778910160064697, "learning_rate": 1.8077339876121604e-05, "loss": 0.1896, "step": 15224 }, { "epoch": 2.605682012664727, "grad_norm": 1.008608341217041, "learning_rate": 1.8066062311439286e-05, "loss": 0.1448, "step": 15225 }, { "epoch": 2.605853157624508, "grad_norm": 1.723668098449707, "learning_rate": 1.8054782937775613e-05, "loss": 0.1785, "step": 15226 }, { "epoch": 2.606024302584289, "grad_norm": 0.7441108822822571, "learning_rate": 1.8043501761785458e-05, "loss": 0.1051, "step": 15227 }, { "epoch": 2.60619544754407, "grad_norm": 11.855443000793457, "learning_rate": 1.803221879012475e-05, "loss": 0.6315, "step": 15228 }, { "epoch": 2.606366592503851, "grad_norm": 0.3189244270324707, "learning_rate": 1.8020934029450443e-05, "loss": 0.0888, "step": 15229 }, { "epoch": 2.606537737463632, "grad_norm": 8.66945743560791, "learning_rate": 1.800964748642054e-05, "loss": 0.6042, "step": 15230 }, { "epoch": 2.606708882423413, "grad_norm": 0.7838638424873352, "learning_rate": 1.7998359167694134e-05, "loss": 0.1436, 
"step": 15231 }, { "epoch": 2.606880027383194, "grad_norm": 10.084344863891602, "learning_rate": 1.7987069079931373e-05, "loss": 0.9052, "step": 15232 }, { "epoch": 2.6070511723429743, "grad_norm": 4.142926216125488, "learning_rate": 1.7975777229793386e-05, "loss": 0.4287, "step": 15233 }, { "epoch": 2.6072223173027553, "grad_norm": 14.287060737609863, "learning_rate": 1.7964483623942403e-05, "loss": 1.3708, "step": 15234 }, { "epoch": 2.6073934622625363, "grad_norm": 1.2622969150543213, "learning_rate": 1.7953188269041686e-05, "loss": 0.1435, "step": 15235 }, { "epoch": 2.6075646072223173, "grad_norm": 8.369921684265137, "learning_rate": 1.7941891171755467e-05, "loss": 0.5316, "step": 15236 }, { "epoch": 2.6077357521820983, "grad_norm": 5.2494330406188965, "learning_rate": 1.79305923387491e-05, "loss": 0.4212, "step": 15237 }, { "epoch": 2.6079068971418793, "grad_norm": 6.756010055541992, "learning_rate": 1.7919291776688865e-05, "loss": 0.3466, "step": 15238 }, { "epoch": 2.60807804210166, "grad_norm": 0.8167133331298828, "learning_rate": 1.7907989492242157e-05, "loss": 0.1222, "step": 15239 }, { "epoch": 2.608249187061441, "grad_norm": 6.814412593841553, "learning_rate": 1.7896685492077292e-05, "loss": 0.5037, "step": 15240 }, { "epoch": 2.608420332021222, "grad_norm": 5.503702640533447, "learning_rate": 1.78853797828637e-05, "loss": 0.5358, "step": 15241 }, { "epoch": 2.608591476981003, "grad_norm": 15.422974586486816, "learning_rate": 1.7874072371271714e-05, "loss": 1.0536, "step": 15242 }, { "epoch": 2.608762621940784, "grad_norm": 16.168315887451172, "learning_rate": 1.7862763263972766e-05, "loss": 1.7006, "step": 15243 }, { "epoch": 2.6089337669005648, "grad_norm": 7.332607269287109, "learning_rate": 1.785145246763921e-05, "loss": 0.509, "step": 15244 }, { "epoch": 2.6091049118603458, "grad_norm": 15.448347091674805, "learning_rate": 1.7840139988944473e-05, "loss": 1.6394, "step": 15245 }, { "epoch": 2.6092760568201268, "grad_norm": 0.6104698181152344, 
"learning_rate": 1.782882583456289e-05, "loss": 0.1329, "step": 15246 }, { "epoch": 2.6094472017799077, "grad_norm": 0.7983872294425964, "learning_rate": 1.7817510011169872e-05, "loss": 0.1266, "step": 15247 }, { "epoch": 2.6096183467396887, "grad_norm": 9.955517768859863, "learning_rate": 1.7806192525441734e-05, "loss": 0.6968, "step": 15248 }, { "epoch": 2.6097894916994697, "grad_norm": 4.615678787231445, "learning_rate": 1.7794873384055842e-05, "loss": 0.4269, "step": 15249 }, { "epoch": 2.6099606366592503, "grad_norm": 0.3094068467617035, "learning_rate": 1.778355259369047e-05, "loss": 0.0857, "step": 15250 }, { "epoch": 2.6101317816190313, "grad_norm": 26.07895278930664, "learning_rate": 1.7772230161024945e-05, "loss": 5.1593, "step": 15251 }, { "epoch": 2.6103029265788122, "grad_norm": 36.71152877807617, "learning_rate": 1.776090609273947e-05, "loss": 6.3405, "step": 15252 }, { "epoch": 2.6104740715385932, "grad_norm": 8.006674766540527, "learning_rate": 1.774958039551531e-05, "loss": 0.5207, "step": 15253 }, { "epoch": 2.610645216498374, "grad_norm": 11.062516212463379, "learning_rate": 1.7738253076034608e-05, "loss": 1.2303, "step": 15254 }, { "epoch": 2.610816361458155, "grad_norm": 2.3548357486724854, "learning_rate": 1.7726924140980506e-05, "loss": 0.1925, "step": 15255 }, { "epoch": 2.6109875064179358, "grad_norm": 42.769561767578125, "learning_rate": 1.7715593597037103e-05, "loss": 6.6702, "step": 15256 }, { "epoch": 2.6111586513777167, "grad_norm": 14.620716094970703, "learning_rate": 1.7704261450889454e-05, "loss": 1.3411, "step": 15257 }, { "epoch": 2.6113297963374977, "grad_norm": 13.034462928771973, "learning_rate": 1.7692927709223518e-05, "loss": 1.4222, "step": 15258 }, { "epoch": 2.6115009412972787, "grad_norm": 13.7128324508667, "learning_rate": 1.76815923787262e-05, "loss": 1.3528, "step": 15259 }, { "epoch": 2.6116720862570597, "grad_norm": 6.604728698730469, "learning_rate": 1.7670255466085388e-05, "loss": 0.4968, "step": 15260 }, { 
"epoch": 2.6118432312168407, "grad_norm": 16.870691299438477, "learning_rate": 1.7658916977989894e-05, "loss": 1.9905, "step": 15261 }, { "epoch": 2.6120143761766217, "grad_norm": 8.262125015258789, "learning_rate": 1.7647576921129422e-05, "loss": 0.6176, "step": 15262 }, { "epoch": 2.6121855211364027, "grad_norm": 3.432337999343872, "learning_rate": 1.7636235302194598e-05, "loss": 0.383, "step": 15263 }, { "epoch": 2.6123566660961837, "grad_norm": 2.6762070655822754, "learning_rate": 1.7624892127877028e-05, "loss": 0.229, "step": 15264 }, { "epoch": 2.6125278110559647, "grad_norm": 0.8819395303726196, "learning_rate": 1.7613547404869208e-05, "loss": 0.1338, "step": 15265 }, { "epoch": 2.612698956015745, "grad_norm": 10.59372615814209, "learning_rate": 1.7602201139864528e-05, "loss": 0.9456, "step": 15266 }, { "epoch": 2.612870100975526, "grad_norm": 10.274118423461914, "learning_rate": 1.7590853339557276e-05, "loss": 0.7087, "step": 15267 }, { "epoch": 2.613041245935307, "grad_norm": 1.7101260423660278, "learning_rate": 1.7579504010642702e-05, "loss": 0.2532, "step": 15268 }, { "epoch": 2.613212390895088, "grad_norm": 2.3339691162109375, "learning_rate": 1.7568153159816933e-05, "loss": 0.2679, "step": 15269 }, { "epoch": 2.613383535854869, "grad_norm": 23.94843864440918, "learning_rate": 1.7556800793776978e-05, "loss": 4.8663, "step": 15270 }, { "epoch": 2.61355468081465, "grad_norm": 15.771297454833984, "learning_rate": 1.7545446919220723e-05, "loss": 1.4377, "step": 15271 }, { "epoch": 2.6137258257744307, "grad_norm": 2.7552733421325684, "learning_rate": 1.7534091542846998e-05, "loss": 0.1707, "step": 15272 }, { "epoch": 2.6138969707342117, "grad_norm": 3.3889026641845703, "learning_rate": 1.7522734671355506e-05, "loss": 0.2213, "step": 15273 }, { "epoch": 2.6140681156939927, "grad_norm": 5.925023078918457, "learning_rate": 1.7511376311446802e-05, "loss": 0.5841, "step": 15274 }, { "epoch": 2.6142392606537737, "grad_norm": 5.701343536376953, "learning_rate": 
1.750001646982229e-05, "loss": 0.46, "step": 15275 }, { "epoch": 2.6144104056135546, "grad_norm": 6.445212364196777, "learning_rate": 1.748865515318438e-05, "loss": 0.4204, "step": 15276 }, { "epoch": 2.6145815505733356, "grad_norm": 11.659846305847168, "learning_rate": 1.7477292368236204e-05, "loss": 1.0334, "step": 15277 }, { "epoch": 2.6147526955331166, "grad_norm": 8.786909103393555, "learning_rate": 1.7465928121681858e-05, "loss": 0.6349, "step": 15278 }, { "epoch": 2.6149238404928976, "grad_norm": 5.418483734130859, "learning_rate": 1.745456242022623e-05, "loss": 0.4791, "step": 15279 }, { "epoch": 2.6150949854526786, "grad_norm": 8.79396915435791, "learning_rate": 1.744319527057514e-05, "loss": 0.7363, "step": 15280 }, { "epoch": 2.6152661304124596, "grad_norm": 8.164962768554688, "learning_rate": 1.743182667943518e-05, "loss": 0.4844, "step": 15281 }, { "epoch": 2.61543727537224, "grad_norm": 0.7373983860015869, "learning_rate": 1.7420456653513874e-05, "loss": 0.1464, "step": 15282 }, { "epoch": 2.615608420332021, "grad_norm": 5.527731418609619, "learning_rate": 1.7409085199519517e-05, "loss": 0.5535, "step": 15283 }, { "epoch": 2.615779565291802, "grad_norm": 14.398109436035156, "learning_rate": 1.7397712324161326e-05, "loss": 0.8935, "step": 15284 }, { "epoch": 2.615950710251583, "grad_norm": 10.562129974365234, "learning_rate": 1.738633803414927e-05, "loss": 0.817, "step": 15285 }, { "epoch": 2.616121855211364, "grad_norm": 5.372660160064697, "learning_rate": 1.7374962336194243e-05, "loss": 0.4015, "step": 15286 }, { "epoch": 2.616293000171145, "grad_norm": 6.772414684295654, "learning_rate": 1.736358523700788e-05, "loss": 0.4674, "step": 15287 }, { "epoch": 2.6164641451309256, "grad_norm": 15.751577377319336, "learning_rate": 1.7352206743302724e-05, "loss": 1.5328, "step": 15288 }, { "epoch": 2.6166352900907066, "grad_norm": 9.547479629516602, "learning_rate": 1.734082686179207e-05, "loss": 0.5856, "step": 15289 }, { "epoch": 2.6168064350504876, 
"grad_norm": 10.867112159729004, "learning_rate": 1.7329445599190097e-05, "loss": 0.8435, "step": 15290 }, { "epoch": 2.6169775800102686, "grad_norm": 8.782581329345703, "learning_rate": 1.7318062962211734e-05, "loss": 0.5844, "step": 15291 }, { "epoch": 2.6171487249700496, "grad_norm": 4.673138618469238, "learning_rate": 1.7306678957572788e-05, "loss": 0.426, "step": 15292 }, { "epoch": 2.6173198699298306, "grad_norm": 7.4084858894348145, "learning_rate": 1.72952935919898e-05, "loss": 0.8249, "step": 15293 }, { "epoch": 2.6174910148896116, "grad_norm": 17.241498947143555, "learning_rate": 1.7283906872180195e-05, "loss": 1.741, "step": 15294 }, { "epoch": 2.6176621598493925, "grad_norm": 2.5243172645568848, "learning_rate": 1.7272518804862115e-05, "loss": 0.1976, "step": 15295 }, { "epoch": 2.6178333048091735, "grad_norm": 16.5325927734375, "learning_rate": 1.726112939675458e-05, "loss": 1.8949, "step": 15296 }, { "epoch": 2.6180044497689545, "grad_norm": 6.8588762283325195, "learning_rate": 1.7249738654577283e-05, "loss": 0.3775, "step": 15297 }, { "epoch": 2.6181755947287355, "grad_norm": 0.31104710698127747, "learning_rate": 1.7238346585050878e-05, "loss": 0.0858, "step": 15298 }, { "epoch": 2.618346739688516, "grad_norm": 7.256035327911377, "learning_rate": 1.7226953194896648e-05, "loss": 0.6712, "step": 15299 }, { "epoch": 2.618517884648297, "grad_norm": 8.193878173828125, "learning_rate": 1.72155584908367e-05, "loss": 0.5427, "step": 15300 }, { "epoch": 2.618689029608078, "grad_norm": 1.9335206747055054, "learning_rate": 1.720416247959394e-05, "loss": 0.2159, "step": 15301 }, { "epoch": 2.618860174567859, "grad_norm": 25.362106323242188, "learning_rate": 1.7192765167892054e-05, "loss": 4.7118, "step": 15302 }, { "epoch": 2.61903131952764, "grad_norm": 17.31849479675293, "learning_rate": 1.7181366562455456e-05, "loss": 1.4904, "step": 15303 }, { "epoch": 2.619202464487421, "grad_norm": 6.2245612144470215, "learning_rate": 1.716996667000931e-05, "loss": 0.5939, 
"step": 15304 }, { "epoch": 2.6193736094472015, "grad_norm": 10.427659034729004, "learning_rate": 1.7158565497279602e-05, "loss": 0.7532, "step": 15305 }, { "epoch": 2.6195447544069825, "grad_norm": 3.1957194805145264, "learning_rate": 1.7147163050993057e-05, "loss": 0.3374, "step": 15306 }, { "epoch": 2.6197158993667635, "grad_norm": 5.187853813171387, "learning_rate": 1.713575933787711e-05, "loss": 0.4778, "step": 15307 }, { "epoch": 2.6198870443265445, "grad_norm": 2.947824001312256, "learning_rate": 1.7124354364659955e-05, "loss": 0.3028, "step": 15308 }, { "epoch": 2.6200581892863255, "grad_norm": 5.944991588592529, "learning_rate": 1.711294813807056e-05, "loss": 0.5552, "step": 15309 }, { "epoch": 2.6202293342461065, "grad_norm": 3.5820629596710205, "learning_rate": 1.7101540664838635e-05, "loss": 0.3488, "step": 15310 }, { "epoch": 2.6204004792058875, "grad_norm": 6.443985462188721, "learning_rate": 1.709013195169459e-05, "loss": 0.5316, "step": 15311 }, { "epoch": 2.6205716241656685, "grad_norm": 0.32985883951187134, "learning_rate": 1.707872200536956e-05, "loss": 0.0893, "step": 15312 }, { "epoch": 2.6207427691254495, "grad_norm": 0.33715900778770447, "learning_rate": 1.706731083259545e-05, "loss": 0.0836, "step": 15313 }, { "epoch": 2.6209139140852304, "grad_norm": 7.2128777503967285, "learning_rate": 1.7055898440104887e-05, "loss": 0.5729, "step": 15314 }, { "epoch": 2.621085059045011, "grad_norm": 8.92819881439209, "learning_rate": 1.7044484834631184e-05, "loss": 0.5271, "step": 15315 }, { "epoch": 2.621256204004792, "grad_norm": 2.340161085128784, "learning_rate": 1.7033070022908364e-05, "loss": 0.297, "step": 15316 }, { "epoch": 2.621427348964573, "grad_norm": 1.1957939863204956, "learning_rate": 1.70216540116712e-05, "loss": 0.1263, "step": 15317 }, { "epoch": 2.621598493924354, "grad_norm": 2.6731600761413574, "learning_rate": 1.7010236807655162e-05, "loss": 0.3552, "step": 15318 }, { "epoch": 2.621769638884135, "grad_norm": 5.284124851226807, 
"learning_rate": 1.699881841759643e-05, "loss": 0.4538, "step": 15319 }, { "epoch": 2.621940783843916, "grad_norm": 10.857243537902832, "learning_rate": 1.6987398848231835e-05, "loss": 0.645, "step": 15320 }, { "epoch": 2.6221119288036965, "grad_norm": 7.422756195068359, "learning_rate": 1.6975978106298984e-05, "loss": 0.642, "step": 15321 }, { "epoch": 2.6222830737634775, "grad_norm": 0.28736940026283264, "learning_rate": 1.6964556198536083e-05, "loss": 0.0865, "step": 15322 }, { "epoch": 2.6224542187232585, "grad_norm": 16.991146087646484, "learning_rate": 1.6953133131682116e-05, "loss": 1.6143, "step": 15323 }, { "epoch": 2.6226253636830394, "grad_norm": 10.42922592163086, "learning_rate": 1.6941708912476677e-05, "loss": 0.4789, "step": 15324 }, { "epoch": 2.6227965086428204, "grad_norm": 3.7413253784179688, "learning_rate": 1.6930283547660106e-05, "loss": 0.4059, "step": 15325 }, { "epoch": 2.6229676536026014, "grad_norm": 10.67026138305664, "learning_rate": 1.6918857043973347e-05, "loss": 0.65, "step": 15326 }, { "epoch": 2.6231387985623824, "grad_norm": 1.2789674997329712, "learning_rate": 1.6907429408158092e-05, "loss": 0.174, "step": 15327 }, { "epoch": 2.6233099435221634, "grad_norm": 20.167665481567383, "learning_rate": 1.6896000646956625e-05, "loss": 2.4964, "step": 15328 }, { "epoch": 2.6234810884819444, "grad_norm": 9.229650497436523, "learning_rate": 1.688457076711198e-05, "loss": 1.0561, "step": 15329 }, { "epoch": 2.6236522334417254, "grad_norm": 6.288013935089111, "learning_rate": 1.687313977536775e-05, "loss": 0.4041, "step": 15330 }, { "epoch": 2.623823378401506, "grad_norm": 10.614367485046387, "learning_rate": 1.6861707678468282e-05, "loss": 0.6928, "step": 15331 }, { "epoch": 2.623994523361287, "grad_norm": 9.52440357208252, "learning_rate": 1.685027448315849e-05, "loss": 0.5724, "step": 15332 }, { "epoch": 2.624165668321068, "grad_norm": 11.350860595703125, "learning_rate": 1.6838840196184013e-05, "loss": 0.9663, "step": 15333 }, { "epoch": 
2.624336813280849, "grad_norm": 9.016491889953613, "learning_rate": 1.682740482429107e-05, "loss": 0.6183, "step": 15334 }, { "epoch": 2.62450795824063, "grad_norm": 13.974984169006348, "learning_rate": 1.6815968374226578e-05, "loss": 0.8873, "step": 15335 }, { "epoch": 2.624679103200411, "grad_norm": 4.702191352844238, "learning_rate": 1.6804530852738016e-05, "loss": 0.5891, "step": 15336 }, { "epoch": 2.6248502481601914, "grad_norm": 8.336148262023926, "learning_rate": 1.6793092266573583e-05, "loss": 0.5133, "step": 15337 }, { "epoch": 2.6250213931199724, "grad_norm": 15.071510314941406, "learning_rate": 1.6781652622482024e-05, "loss": 0.8542, "step": 15338 }, { "epoch": 2.6251925380797534, "grad_norm": 5.003264904022217, "learning_rate": 1.6770211927212785e-05, "loss": 0.3898, "step": 15339 }, { "epoch": 2.6253636830395344, "grad_norm": 5.783367156982422, "learning_rate": 1.6758770187515846e-05, "loss": 0.3392, "step": 15340 }, { "epoch": 2.6255348279993154, "grad_norm": 29.221561431884766, "learning_rate": 1.6747327410141883e-05, "loss": 5.186, "step": 15341 }, { "epoch": 2.6257059729590964, "grad_norm": 17.511343002319336, "learning_rate": 1.6735883601842144e-05, "loss": 1.968, "step": 15342 }, { "epoch": 2.6258771179188773, "grad_norm": 5.415096759796143, "learning_rate": 1.6724438769368516e-05, "loss": 0.4704, "step": 15343 }, { "epoch": 2.6260482628786583, "grad_norm": 7.83999490737915, "learning_rate": 1.6712992919473447e-05, "loss": 0.6433, "step": 15344 }, { "epoch": 2.6262194078384393, "grad_norm": 16.12394142150879, "learning_rate": 1.6701546058909978e-05, "loss": 1.6706, "step": 15345 }, { "epoch": 2.6263905527982203, "grad_norm": 10.261462211608887, "learning_rate": 1.6690098194431804e-05, "loss": 0.7795, "step": 15346 }, { "epoch": 2.6265616977580013, "grad_norm": 9.458169937133789, "learning_rate": 1.6678649332793198e-05, "loss": 0.7423, "step": 15347 }, { "epoch": 2.626732842717782, "grad_norm": 3.5165486335754395, "learning_rate": 
1.666719948074898e-05, "loss": 0.3543, "step": 15348 }, { "epoch": 2.626903987677563, "grad_norm": 0.3753510117530823, "learning_rate": 1.665574864505457e-05, "loss": 0.0895, "step": 15349 }, { "epoch": 2.627075132637344, "grad_norm": 5.303385257720947, "learning_rate": 1.664429683246599e-05, "loss": 0.48, "step": 15350 }, { "epoch": 2.627246277597125, "grad_norm": 7.073727607727051, "learning_rate": 1.6632844049739856e-05, "loss": 0.6109, "step": 15351 }, { "epoch": 2.627417422556906, "grad_norm": 9.899068832397461, "learning_rate": 1.6621390303633298e-05, "loss": 0.6008, "step": 15352 }, { "epoch": 2.627588567516687, "grad_norm": 0.8912720680236816, "learning_rate": 1.6609935600904025e-05, "loss": 0.1402, "step": 15353 }, { "epoch": 2.6277597124764673, "grad_norm": 5.706963539123535, "learning_rate": 1.659847994831035e-05, "loss": 0.4657, "step": 15354 }, { "epoch": 2.6279308574362483, "grad_norm": 10.250340461730957, "learning_rate": 1.6587023352611144e-05, "loss": 0.679, "step": 15355 }, { "epoch": 2.6281020023960293, "grad_norm": 13.851080894470215, "learning_rate": 1.6575565820565795e-05, "loss": 1.56, "step": 15356 }, { "epoch": 2.6282731473558103, "grad_norm": 5.847459316253662, "learning_rate": 1.6564107358934245e-05, "loss": 0.3182, "step": 15357 }, { "epoch": 2.6284442923155913, "grad_norm": 6.2985124588012695, "learning_rate": 1.6552647974477023e-05, "loss": 0.5451, "step": 15358 }, { "epoch": 2.6286154372753723, "grad_norm": 7.941428184509277, "learning_rate": 1.6541187673955206e-05, "loss": 0.6422, "step": 15359 }, { "epoch": 2.6287865822351533, "grad_norm": 0.293760746717453, "learning_rate": 1.652972646413037e-05, "loss": 0.086, "step": 15360 }, { "epoch": 2.6289577271949343, "grad_norm": 4.273711681365967, "learning_rate": 1.6518264351764593e-05, "loss": 0.3467, "step": 15361 }, { "epoch": 2.6291288721547152, "grad_norm": 1.0143873691558838, "learning_rate": 1.6506801343620635e-05, "loss": 0.1477, "step": 15362 }, { "epoch": 2.6293000171144962, 
"grad_norm": 7.5124335289001465, "learning_rate": 1.6495337446461613e-05, "loss": 0.6555, "step": 15363 }, { "epoch": 2.629471162074277, "grad_norm": 0.33844539523124695, "learning_rate": 1.648387266705129e-05, "loss": 0.0905, "step": 15364 }, { "epoch": 2.6296423070340578, "grad_norm": 35.32624435424805, "learning_rate": 1.647240701215387e-05, "loss": 5.4023, "step": 15365 }, { "epoch": 2.6298134519938388, "grad_norm": 14.502459526062012, "learning_rate": 1.6460940488534133e-05, "loss": 0.8612, "step": 15366 }, { "epoch": 2.6299845969536197, "grad_norm": 9.316868782043457, "learning_rate": 1.6449473102957317e-05, "loss": 0.5589, "step": 15367 }, { "epoch": 2.6301557419134007, "grad_norm": 9.102574348449707, "learning_rate": 1.643800486218923e-05, "loss": 0.6269, "step": 15368 }, { "epoch": 2.6303268868731817, "grad_norm": 15.743060111999512, "learning_rate": 1.6426535772996113e-05, "loss": 1.802, "step": 15369 }, { "epoch": 2.6304980318329623, "grad_norm": 5.7743425369262695, "learning_rate": 1.6415065842144783e-05, "loss": 0.3627, "step": 15370 }, { "epoch": 2.6306691767927433, "grad_norm": 6.3669586181640625, "learning_rate": 1.640359507640248e-05, "loss": 0.6371, "step": 15371 }, { "epoch": 2.6308403217525242, "grad_norm": 1.079591989517212, "learning_rate": 1.639212348253701e-05, "loss": 0.1502, "step": 15372 }, { "epoch": 2.6310114667123052, "grad_norm": 2.33381986618042, "learning_rate": 1.638065106731659e-05, "loss": 0.2927, "step": 15373 }, { "epoch": 2.6311826116720862, "grad_norm": 19.23689842224121, "learning_rate": 1.6369177837510006e-05, "loss": 2.3108, "step": 15374 }, { "epoch": 2.631353756631867, "grad_norm": 11.314587593078613, "learning_rate": 1.6357703799886442e-05, "loss": 0.7219, "step": 15375 }, { "epoch": 2.631524901591648, "grad_norm": 3.4419078826904297, "learning_rate": 1.6346228961215634e-05, "loss": 0.2878, "step": 15376 }, { "epoch": 2.631696046551429, "grad_norm": 10.518135070800781, "learning_rate": 1.6334753328267706e-05, "loss": 
0.8313, "step": 15377 }, { "epoch": 2.63186719151121, "grad_norm": 1.0202610492706299, "learning_rate": 1.632327690781335e-05, "loss": 0.138, "step": 15378 }, { "epoch": 2.632038336470991, "grad_norm": 8.168794631958008, "learning_rate": 1.631179970662363e-05, "loss": 0.6754, "step": 15379 }, { "epoch": 2.6322094814307717, "grad_norm": 4.993374824523926, "learning_rate": 1.6300321731470146e-05, "loss": 0.3867, "step": 15380 }, { "epoch": 2.6323806263905527, "grad_norm": 3.805640697479248, "learning_rate": 1.6288842989124883e-05, "loss": 0.2707, "step": 15381 }, { "epoch": 2.6325517713503337, "grad_norm": 12.324261665344238, "learning_rate": 1.6277363486360362e-05, "loss": 0.798, "step": 15382 }, { "epoch": 2.6327229163101147, "grad_norm": 8.196633338928223, "learning_rate": 1.6265883229949435e-05, "loss": 0.9612, "step": 15383 }, { "epoch": 2.6328940612698957, "grad_norm": 8.071084022521973, "learning_rate": 1.625440222666556e-05, "loss": 0.5829, "step": 15384 }, { "epoch": 2.6330652062296767, "grad_norm": 7.776893138885498, "learning_rate": 1.6242920483282506e-05, "loss": 0.3389, "step": 15385 }, { "epoch": 2.633236351189457, "grad_norm": 5.0663743019104, "learning_rate": 1.6231438006574493e-05, "loss": 0.4339, "step": 15386 }, { "epoch": 2.633407496149238, "grad_norm": 0.8951917886734009, "learning_rate": 1.621995480331622e-05, "loss": 0.1534, "step": 15387 }, { "epoch": 2.633578641109019, "grad_norm": 9.231096267700195, "learning_rate": 1.6208470880282816e-05, "loss": 0.5361, "step": 15388 }, { "epoch": 2.6337497860688, "grad_norm": 3.083308696746826, "learning_rate": 1.6196986244249793e-05, "loss": 0.3027, "step": 15389 }, { "epoch": 2.633920931028581, "grad_norm": 15.558186531066895, "learning_rate": 1.6185500901993086e-05, "loss": 1.1854, "step": 15390 }, { "epoch": 2.634092075988362, "grad_norm": 2.182664155960083, "learning_rate": 1.6174014860289077e-05, "loss": 0.1745, "step": 15391 }, { "epoch": 2.634263220948143, "grad_norm": 5.101382255554199, 
"learning_rate": 1.6162528125914575e-05, "loss": 0.4759, "step": 15392 }, { "epoch": 2.634434365907924, "grad_norm": 15.50987720489502, "learning_rate": 1.6151040705646748e-05, "loss": 1.4292, "step": 15393 }, { "epoch": 2.634605510867705, "grad_norm": 8.444144248962402, "learning_rate": 1.6139552606263167e-05, "loss": 0.6984, "step": 15394 }, { "epoch": 2.634776655827486, "grad_norm": 11.994129180908203, "learning_rate": 1.6128063834541852e-05, "loss": 0.914, "step": 15395 }, { "epoch": 2.634947800787267, "grad_norm": 3.4433228969573975, "learning_rate": 1.6116574397261217e-05, "loss": 0.3906, "step": 15396 }, { "epoch": 2.6351189457470476, "grad_norm": 15.774407386779785, "learning_rate": 1.610508430120002e-05, "loss": 1.5987, "step": 15397 }, { "epoch": 2.6352900907068286, "grad_norm": 7.939515590667725, "learning_rate": 1.609359355313742e-05, "loss": 0.4306, "step": 15398 }, { "epoch": 2.6354612356666096, "grad_norm": 1.3309015035629272, "learning_rate": 1.6082102159852998e-05, "loss": 0.1835, "step": 15399 }, { "epoch": 2.6356323806263906, "grad_norm": 5.83642578125, "learning_rate": 1.607061012812671e-05, "loss": 0.4149, "step": 15400 }, { "epoch": 2.6358035255861716, "grad_norm": 0.310150682926178, "learning_rate": 1.605911746473885e-05, "loss": 0.089, "step": 15401 }, { "epoch": 2.6359746705459526, "grad_norm": 6.335173606872559, "learning_rate": 1.604762417647009e-05, "loss": 0.8632, "step": 15402 }, { "epoch": 2.636145815505733, "grad_norm": 5.991723537445068, "learning_rate": 1.6036130270101503e-05, "loss": 0.4122, "step": 15403 }, { "epoch": 2.636316960465514, "grad_norm": 11.239047050476074, "learning_rate": 1.6024635752414513e-05, "loss": 0.598, "step": 15404 }, { "epoch": 2.636488105425295, "grad_norm": 12.121798515319824, "learning_rate": 1.6013140630190924e-05, "loss": 0.8132, "step": 15405 }, { "epoch": 2.636659250385076, "grad_norm": 0.3001841902732849, "learning_rate": 1.6001644910212833e-05, "loss": 0.0891, "step": 15406 }, { "epoch": 
2.636830395344857, "grad_norm": 5.199068069458008, "learning_rate": 1.5990148599262772e-05, "loss": 0.4141, "step": 15407 }, { "epoch": 2.637001540304638, "grad_norm": 42.375186920166016, "learning_rate": 1.5978651704123547e-05, "loss": 6.4962, "step": 15408 }, { "epoch": 2.637172685264419, "grad_norm": 3.5431177616119385, "learning_rate": 1.596715423157838e-05, "loss": 0.3153, "step": 15409 }, { "epoch": 2.6373438302242, "grad_norm": 4.011086463928223, "learning_rate": 1.595565618841075e-05, "loss": 0.391, "step": 15410 }, { "epoch": 2.637514975183981, "grad_norm": 12.603432655334473, "learning_rate": 1.5944157581404568e-05, "loss": 0.91, "step": 15411 }, { "epoch": 2.637686120143762, "grad_norm": 31.915395736694336, "learning_rate": 1.5932658417343988e-05, "loss": 5.8324, "step": 15412 }, { "epoch": 2.6378572651035426, "grad_norm": 12.355116844177246, "learning_rate": 1.592115870301357e-05, "loss": 1.2899, "step": 15413 }, { "epoch": 2.6380284100633236, "grad_norm": 0.2745956778526306, "learning_rate": 1.5909658445198128e-05, "loss": 0.0812, "step": 15414 }, { "epoch": 2.6381995550231045, "grad_norm": 4.2450785636901855, "learning_rate": 1.5898157650682865e-05, "loss": 0.4201, "step": 15415 }, { "epoch": 2.6383706999828855, "grad_norm": 2.685703754425049, "learning_rate": 1.5886656326253233e-05, "loss": 0.189, "step": 15416 }, { "epoch": 2.6385418449426665, "grad_norm": 3.100611686706543, "learning_rate": 1.587515447869507e-05, "loss": 0.2892, "step": 15417 }, { "epoch": 2.6387129899024475, "grad_norm": 8.159748077392578, "learning_rate": 1.586365211479444e-05, "loss": 0.6243, "step": 15418 }, { "epoch": 2.638884134862228, "grad_norm": 0.37525808811187744, "learning_rate": 1.585214924133779e-05, "loss": 0.0905, "step": 15419 }, { "epoch": 2.639055279822009, "grad_norm": 55.49412155151367, "learning_rate": 1.5840645865111804e-05, "loss": 6.6224, "step": 15420 }, { "epoch": 2.63922642478179, "grad_norm": 3.3415913581848145, "learning_rate": 1.5829141992903523e-05, 
"loss": 0.2926, "step": 15421 }, { "epoch": 2.639397569741571, "grad_norm": 7.509571552276611, "learning_rate": 1.5817637631500213e-05, "loss": 0.5808, "step": 15422 }, { "epoch": 2.639568714701352, "grad_norm": 6.828643321990967, "learning_rate": 1.5806132787689502e-05, "loss": 0.7372, "step": 15423 }, { "epoch": 2.639739859661133, "grad_norm": 3.630831718444824, "learning_rate": 1.5794627468259224e-05, "loss": 0.3238, "step": 15424 }, { "epoch": 2.639911004620914, "grad_norm": 8.597683906555176, "learning_rate": 1.578312167999758e-05, "loss": 0.4907, "step": 15425 }, { "epoch": 2.640082149580695, "grad_norm": 8.504083633422852, "learning_rate": 1.5771615429692958e-05, "loss": 0.6578, "step": 15426 }, { "epoch": 2.640253294540476, "grad_norm": 33.29721450805664, "learning_rate": 1.5760108724134078e-05, "loss": 5.9111, "step": 15427 }, { "epoch": 2.640424439500257, "grad_norm": 2.0628862380981445, "learning_rate": 1.574860157010992e-05, "loss": 0.1852, "step": 15428 }, { "epoch": 2.6405955844600375, "grad_norm": 6.221285343170166, "learning_rate": 1.5737093974409745e-05, "loss": 0.5676, "step": 15429 }, { "epoch": 2.6407667294198185, "grad_norm": 4.964844703674316, "learning_rate": 1.5725585943823022e-05, "loss": 0.3491, "step": 15430 }, { "epoch": 2.6409378743795995, "grad_norm": 3.4714267253875732, "learning_rate": 1.5714077485139496e-05, "loss": 0.4736, "step": 15431 }, { "epoch": 2.6411090193393805, "grad_norm": 9.079512596130371, "learning_rate": 1.5702568605149188e-05, "loss": 0.6165, "step": 15432 }, { "epoch": 2.6412801642991615, "grad_norm": 3.909158945083618, "learning_rate": 1.569105931064238e-05, "loss": 0.2101, "step": 15433 }, { "epoch": 2.6414513092589424, "grad_norm": 3.5572688579559326, "learning_rate": 1.567954960840954e-05, "loss": 0.4207, "step": 15434 }, { "epoch": 2.641622454218723, "grad_norm": 2.7544524669647217, "learning_rate": 1.5668039505241407e-05, "loss": 0.3462, "step": 15435 }, { "epoch": 2.641793599178504, "grad_norm": 
15.888957977294922, "learning_rate": 1.565652900792897e-05, "loss": 1.5924, "step": 15436 }, { "epoch": 2.641964744138285, "grad_norm": 2.0858304500579834, "learning_rate": 1.564501812326346e-05, "loss": 0.1731, "step": 15437 }, { "epoch": 2.642135889098066, "grad_norm": 0.3204922378063202, "learning_rate": 1.5633506858036297e-05, "loss": 0.0875, "step": 15438 }, { "epoch": 2.642307034057847, "grad_norm": 14.238653182983398, "learning_rate": 1.5621995219039122e-05, "loss": 1.6013, "step": 15439 }, { "epoch": 2.642478179017628, "grad_norm": 9.724506378173828, "learning_rate": 1.5610483213063842e-05, "loss": 0.6783, "step": 15440 }, { "epoch": 2.642649323977409, "grad_norm": 5.866201400756836, "learning_rate": 1.5598970846902578e-05, "loss": 0.5876, "step": 15441 }, { "epoch": 2.64282046893719, "grad_norm": 48.26438903808594, "learning_rate": 1.558745812734762e-05, "loss": 6.7772, "step": 15442 }, { "epoch": 2.642991613896971, "grad_norm": 8.727643966674805, "learning_rate": 1.5575945061191474e-05, "loss": 0.6086, "step": 15443 }, { "epoch": 2.643162758856752, "grad_norm": 7.082684516906738, "learning_rate": 1.5564431655226884e-05, "loss": 0.5913, "step": 15444 }, { "epoch": 2.6433339038165324, "grad_norm": 9.172520637512207, "learning_rate": 1.5552917916246796e-05, "loss": 0.6399, "step": 15445 }, { "epoch": 2.6435050487763134, "grad_norm": 4.995087146759033, "learning_rate": 1.5541403851044314e-05, "loss": 0.5241, "step": 15446 }, { "epoch": 2.6436761937360944, "grad_norm": 9.461006164550781, "learning_rate": 1.552988946641271e-05, "loss": 0.6837, "step": 15447 }, { "epoch": 2.6438473386958754, "grad_norm": 2.654177665710449, "learning_rate": 1.5518374769145577e-05, "loss": 0.2764, "step": 15448 }, { "epoch": 2.6440184836556564, "grad_norm": 4.422146320343018, "learning_rate": 1.5506859766036536e-05, "loss": 0.5072, "step": 15449 }, { "epoch": 2.6441896286154374, "grad_norm": 8.721612930297852, "learning_rate": 1.5495344463879502e-05, "loss": 0.7248, "step": 15450 
}, { "epoch": 2.6443607735752184, "grad_norm": 4.616147518157959, "learning_rate": 1.548382886946847e-05, "loss": 0.3909, "step": 15451 }, { "epoch": 2.644531918534999, "grad_norm": 22.336681365966797, "learning_rate": 1.5472312989597707e-05, "loss": 4.6769, "step": 15452 }, { "epoch": 2.64470306349478, "grad_norm": 0.3116171360015869, "learning_rate": 1.5460796831061557e-05, "loss": 0.0928, "step": 15453 }, { "epoch": 2.644874208454561, "grad_norm": 1.5781573057174683, "learning_rate": 1.544928040065461e-05, "loss": 0.1456, "step": 15454 }, { "epoch": 2.645045353414342, "grad_norm": 15.924553871154785, "learning_rate": 1.543776370517154e-05, "loss": 1.5215, "step": 15455 }, { "epoch": 2.645216498374123, "grad_norm": 7.349564075469971, "learning_rate": 1.5426246751407248e-05, "loss": 0.536, "step": 15456 }, { "epoch": 2.645387643333904, "grad_norm": 13.220769882202148, "learning_rate": 1.5414729546156717e-05, "loss": 1.28, "step": 15457 }, { "epoch": 2.645558788293685, "grad_norm": 1.215255856513977, "learning_rate": 1.5403212096215165e-05, "loss": 0.1332, "step": 15458 }, { "epoch": 2.645729933253466, "grad_norm": 0.9238847494125366, "learning_rate": 1.5391694408377847e-05, "loss": 0.1265, "step": 15459 }, { "epoch": 2.645901078213247, "grad_norm": 0.6280931234359741, "learning_rate": 1.5380176489440265e-05, "loss": 0.1343, "step": 15460 }, { "epoch": 2.646072223173028, "grad_norm": 5.4289774894714355, "learning_rate": 1.536865834619797e-05, "loss": 0.5381, "step": 15461 }, { "epoch": 2.6462433681328084, "grad_norm": 5.11730432510376, "learning_rate": 1.5357139985446722e-05, "loss": 0.4799, "step": 15462 }, { "epoch": 2.6464145130925893, "grad_norm": 0.6072238683700562, "learning_rate": 1.5345621413982327e-05, "loss": 0.1382, "step": 15463 }, { "epoch": 2.6465856580523703, "grad_norm": 6.4895806312561035, "learning_rate": 1.5334102638600807e-05, "loss": 0.4925, "step": 15464 }, { "epoch": 2.6467568030121513, "grad_norm": 33.01985168457031, "learning_rate": 
1.5322583666098214e-05, "loss": 6.4708, "step": 15465 }, { "epoch": 2.6469279479719323, "grad_norm": 11.518941879272461, "learning_rate": 1.5311064503270793e-05, "loss": 0.7694, "step": 15466 }, { "epoch": 2.6470990929317133, "grad_norm": 10.100695610046387, "learning_rate": 1.5299545156914833e-05, "loss": 0.6196, "step": 15467 }, { "epoch": 2.647270237891494, "grad_norm": 0.29943785071372986, "learning_rate": 1.5288025633826787e-05, "loss": 0.0842, "step": 15468 }, { "epoch": 2.647441382851275, "grad_norm": 1.4251099824905396, "learning_rate": 1.5276505940803186e-05, "loss": 0.1522, "step": 15469 }, { "epoch": 2.647612527811056, "grad_norm": 2.5821337699890137, "learning_rate": 1.5264986084640688e-05, "loss": 0.2151, "step": 15470 }, { "epoch": 2.647783672770837, "grad_norm": 2.4808146953582764, "learning_rate": 1.5253466072136005e-05, "loss": 0.3773, "step": 15471 }, { "epoch": 2.647954817730618, "grad_norm": 7.8650641441345215, "learning_rate": 1.5241945910085943e-05, "loss": 0.6718, "step": 15472 }, { "epoch": 2.648125962690399, "grad_norm": 14.039972305297852, "learning_rate": 1.523042560528743e-05, "loss": 1.348, "step": 15473 }, { "epoch": 2.64829710765018, "grad_norm": 15.058419227600098, "learning_rate": 1.5218905164537493e-05, "loss": 1.1075, "step": 15474 }, { "epoch": 2.6484682526099608, "grad_norm": 26.75159454345703, "learning_rate": 1.5207384594633181e-05, "loss": 4.7428, "step": 15475 }, { "epoch": 2.6486393975697418, "grad_norm": 14.139065742492676, "learning_rate": 1.5195863902371629e-05, "loss": 1.0346, "step": 15476 }, { "epoch": 2.6488105425295227, "grad_norm": 10.329983711242676, "learning_rate": 1.518434309455008e-05, "loss": 0.9054, "step": 15477 }, { "epoch": 2.6489816874893033, "grad_norm": 13.145374298095703, "learning_rate": 1.517282217796585e-05, "loss": 1.4194, "step": 15478 }, { "epoch": 2.6491528324490843, "grad_norm": 8.706025123596191, "learning_rate": 1.516130115941628e-05, "loss": 0.5202, "step": 15479 }, { "epoch": 
2.6493239774088653, "grad_norm": 15.32783031463623, "learning_rate": 1.5149780045698768e-05, "loss": 1.3171, "step": 15480 }, { "epoch": 2.6494951223686463, "grad_norm": 36.52410888671875, "learning_rate": 1.51382588436108e-05, "loss": 5.527, "step": 15481 }, { "epoch": 2.6496662673284272, "grad_norm": 1.6757413148880005, "learning_rate": 1.5126737559949937e-05, "loss": 0.1668, "step": 15482 }, { "epoch": 2.6498374122882082, "grad_norm": 8.751421928405762, "learning_rate": 1.511521620151372e-05, "loss": 0.5675, "step": 15483 }, { "epoch": 2.650008557247989, "grad_norm": 1.2299692630767822, "learning_rate": 1.5103694775099751e-05, "loss": 0.1691, "step": 15484 }, { "epoch": 2.6501797022077698, "grad_norm": 12.904131889343262, "learning_rate": 1.5092173287505712e-05, "loss": 0.7825, "step": 15485 }, { "epoch": 2.6503508471675508, "grad_norm": 7.590359687805176, "learning_rate": 1.5080651745529313e-05, "loss": 0.4954, "step": 15486 }, { "epoch": 2.6505219921273317, "grad_norm": 17.782958984375, "learning_rate": 1.506913015596827e-05, "loss": 1.7203, "step": 15487 }, { "epoch": 2.6506931370871127, "grad_norm": 9.827252388000488, "learning_rate": 1.5057608525620308e-05, "loss": 0.6298, "step": 15488 }, { "epoch": 2.6508642820468937, "grad_norm": 12.208246231079102, "learning_rate": 1.5046086861283228e-05, "loss": 0.6772, "step": 15489 }, { "epoch": 2.6510354270066747, "grad_norm": 23.547271728515625, "learning_rate": 1.5034565169754833e-05, "loss": 4.5673, "step": 15490 }, { "epoch": 2.6512065719664557, "grad_norm": 7.7194600105285645, "learning_rate": 1.502304345783296e-05, "loss": 0.4858, "step": 15491 }, { "epoch": 2.6513777169262367, "grad_norm": 2.499220609664917, "learning_rate": 1.501152173231539e-05, "loss": 0.1977, "step": 15492 }, { "epoch": 2.6515488618860177, "grad_norm": 11.411466598510742, "learning_rate": 1.5e-05, "loss": 0.7336, "step": 15493 }, { "epoch": 2.6517200068457982, "grad_norm": 8.775806427001953, "learning_rate": 1.498847826768459e-05, "loss": 
0.701, "step": 15494 }, { "epoch": 2.651891151805579, "grad_norm": 1.0577445030212402, "learning_rate": 1.497695654216704e-05, "loss": 0.1513, "step": 15495 }, { "epoch": 2.65206229676536, "grad_norm": 8.472949028015137, "learning_rate": 1.4965434830245144e-05, "loss": 0.741, "step": 15496 }, { "epoch": 2.652233441725141, "grad_norm": 11.14928150177002, "learning_rate": 1.4953913138716775e-05, "loss": 0.8425, "step": 15497 }, { "epoch": 2.652404586684922, "grad_norm": 29.732192993164062, "learning_rate": 1.4942391474379696e-05, "loss": 5.4415, "step": 15498 }, { "epoch": 2.652575731644703, "grad_norm": 4.235503673553467, "learning_rate": 1.4930869844031755e-05, "loss": 0.3987, "step": 15499 }, { "epoch": 2.652746876604484, "grad_norm": 5.892906188964844, "learning_rate": 1.4919348254470689e-05, "loss": 0.3872, "step": 15500 }, { "epoch": 2.6529180215642647, "grad_norm": 5.058338642120361, "learning_rate": 1.490782671249429e-05, "loss": 0.4246, "step": 15501 }, { "epoch": 2.6530891665240457, "grad_norm": 13.629232406616211, "learning_rate": 1.4896305224900251e-05, "loss": 1.4799, "step": 15502 }, { "epoch": 2.6532603114838267, "grad_norm": 1.0210224390029907, "learning_rate": 1.4884783798486305e-05, "loss": 0.132, "step": 15503 }, { "epoch": 2.6534314564436077, "grad_norm": 8.469715118408203, "learning_rate": 1.4873262440050065e-05, "loss": 0.5811, "step": 15504 }, { "epoch": 2.6536026014033887, "grad_norm": 5.267064094543457, "learning_rate": 1.48617411563892e-05, "loss": 0.5447, "step": 15505 }, { "epoch": 2.6537737463631696, "grad_norm": 19.719419479370117, "learning_rate": 1.4850219954301236e-05, "loss": 1.8579, "step": 15506 }, { "epoch": 2.6539448913229506, "grad_norm": 1.8736990690231323, "learning_rate": 1.4838698840583742e-05, "loss": 0.1935, "step": 15507 }, { "epoch": 2.6541160362827316, "grad_norm": 8.807021141052246, "learning_rate": 1.4827177822034152e-05, "loss": 0.6067, "step": 15508 }, { "epoch": 2.6542871812425126, "grad_norm": 17.51524543762207, 
"learning_rate": 1.4815656905449924e-05, "loss": 1.5079, "step": 15509 }, { "epoch": 2.6544583262022936, "grad_norm": 7.729092597961426, "learning_rate": 1.4804136097628375e-05, "loss": 0.5393, "step": 15510 }, { "epoch": 2.654629471162074, "grad_norm": 2.3021607398986816, "learning_rate": 1.4792615405366823e-05, "loss": 0.3485, "step": 15511 }, { "epoch": 2.654800616121855, "grad_norm": 9.451481819152832, "learning_rate": 1.478109483546251e-05, "loss": 0.6684, "step": 15512 }, { "epoch": 2.654971761081636, "grad_norm": 42.77833938598633, "learning_rate": 1.4769574394712548e-05, "loss": 7.3545, "step": 15513 }, { "epoch": 2.655142906041417, "grad_norm": 4.872589111328125, "learning_rate": 1.4758054089914037e-05, "loss": 0.312, "step": 15514 }, { "epoch": 2.655314051001198, "grad_norm": 12.244401931762695, "learning_rate": 1.4746533927863997e-05, "loss": 0.9, "step": 15515 }, { "epoch": 2.655485195960979, "grad_norm": 3.9373574256896973, "learning_rate": 1.4735013915359317e-05, "loss": 0.2861, "step": 15516 }, { "epoch": 2.6556563409207596, "grad_norm": 10.776229858398438, "learning_rate": 1.4723494059196796e-05, "loss": 0.7931, "step": 15517 }, { "epoch": 2.6558274858805406, "grad_norm": 30.107683181762695, "learning_rate": 1.4711974366173197e-05, "loss": 4.9749, "step": 15518 }, { "epoch": 2.6559986308403216, "grad_norm": 8.850568771362305, "learning_rate": 1.4700454843085167e-05, "loss": 0.76, "step": 15519 }, { "epoch": 2.6561697758001026, "grad_norm": 7.565058708190918, "learning_rate": 1.4688935496729228e-05, "loss": 0.524, "step": 15520 }, { "epoch": 2.6563409207598836, "grad_norm": 0.8224894404411316, "learning_rate": 1.4677416333901789e-05, "loss": 0.172, "step": 15521 }, { "epoch": 2.6565120657196646, "grad_norm": 1.2279188632965088, "learning_rate": 1.4665897361399192e-05, "loss": 0.1666, "step": 15522 }, { "epoch": 2.6566832106794456, "grad_norm": 21.630298614501953, "learning_rate": 1.4654378586017674e-05, "loss": 4.5965, "step": 15523 }, { "epoch": 
2.6568543556392266, "grad_norm": 12.808518409729004, "learning_rate": 1.4642860014553302e-05, "loss": 1.2709, "step": 15524 }, { "epoch": 2.6570255005990076, "grad_norm": 4.84402322769165, "learning_rate": 1.4631341653802032e-05, "loss": 0.4133, "step": 15525 }, { "epoch": 2.6571966455587885, "grad_norm": 10.186229705810547, "learning_rate": 1.4619823510559737e-05, "loss": 0.6494, "step": 15526 }, { "epoch": 2.657367790518569, "grad_norm": 8.29053020477295, "learning_rate": 1.4608305591622155e-05, "loss": 0.6366, "step": 15527 }, { "epoch": 2.65753893547835, "grad_norm": 5.159774303436279, "learning_rate": 1.4596787903784865e-05, "loss": 0.5069, "step": 15528 }, { "epoch": 2.657710080438131, "grad_norm": 0.6003122329711914, "learning_rate": 1.4585270453843284e-05, "loss": 0.1333, "step": 15529 }, { "epoch": 2.657881225397912, "grad_norm": 4.7638068199157715, "learning_rate": 1.4573753248592756e-05, "loss": 0.3833, "step": 15530 }, { "epoch": 2.658052370357693, "grad_norm": 0.3391852378845215, "learning_rate": 1.4562236294828465e-05, "loss": 0.0847, "step": 15531 }, { "epoch": 2.658223515317474, "grad_norm": 3.4312243461608887, "learning_rate": 1.455071959934541e-05, "loss": 0.2535, "step": 15532 }, { "epoch": 2.6583946602772546, "grad_norm": 6.980222702026367, "learning_rate": 1.4539203168938424e-05, "loss": 0.6152, "step": 15533 }, { "epoch": 2.6585658052370356, "grad_norm": 8.255572319030762, "learning_rate": 1.4527687010402294e-05, "loss": 0.4206, "step": 15534 }, { "epoch": 2.6587369501968166, "grad_norm": 11.683449745178223, "learning_rate": 1.4516171130531512e-05, "loss": 0.8114, "step": 15535 }, { "epoch": 2.6589080951565975, "grad_norm": 0.6261475086212158, "learning_rate": 1.4504655536120502e-05, "loss": 0.134, "step": 15536 }, { "epoch": 2.6590792401163785, "grad_norm": 14.707966804504395, "learning_rate": 1.4493140233963442e-05, "loss": 2.0046, "step": 15537 }, { "epoch": 2.6592503850761595, "grad_norm": 8.303256034851074, "learning_rate": 
1.4481625230854426e-05, "loss": 0.5323, "step": 15538 }, { "epoch": 2.6594215300359405, "grad_norm": 8.747976303100586, "learning_rate": 1.447011053358727e-05, "loss": 0.518, "step": 15539 }, { "epoch": 2.6595926749957215, "grad_norm": 1.9518808126449585, "learning_rate": 1.4458596148955709e-05, "loss": 0.2168, "step": 15540 }, { "epoch": 2.6597638199555025, "grad_norm": 14.827438354492188, "learning_rate": 1.4447082083753207e-05, "loss": 1.4572, "step": 15541 }, { "epoch": 2.6599349649152835, "grad_norm": 3.845144510269165, "learning_rate": 1.4435568344773118e-05, "loss": 0.3834, "step": 15542 }, { "epoch": 2.660106109875064, "grad_norm": 9.76317310333252, "learning_rate": 1.4424054938808527e-05, "loss": 0.8342, "step": 15543 }, { "epoch": 2.660277254834845, "grad_norm": 6.0353851318359375, "learning_rate": 1.44125418726524e-05, "loss": 0.5343, "step": 15544 }, { "epoch": 2.660448399794626, "grad_norm": 2.7325916290283203, "learning_rate": 1.4401029153097425e-05, "loss": 0.368, "step": 15545 }, { "epoch": 2.660619544754407, "grad_norm": 23.372852325439453, "learning_rate": 1.4389516786936162e-05, "loss": 4.9391, "step": 15546 }, { "epoch": 2.660790689714188, "grad_norm": 4.358602046966553, "learning_rate": 1.4378004780960882e-05, "loss": 0.6372, "step": 15547 }, { "epoch": 2.660961834673969, "grad_norm": 8.824239730834961, "learning_rate": 1.4366493141963728e-05, "loss": 0.6605, "step": 15548 }, { "epoch": 2.66113297963375, "grad_norm": 44.076927185058594, "learning_rate": 1.4354981876736543e-05, "loss": 6.2869, "step": 15549 }, { "epoch": 2.6613041245935305, "grad_norm": 18.821292877197266, "learning_rate": 1.4343470992071033e-05, "loss": 2.0477, "step": 15550 }, { "epoch": 2.6614752695533115, "grad_norm": 0.9430305361747742, "learning_rate": 1.4331960494758594e-05, "loss": 0.1553, "step": 15551 }, { "epoch": 2.6616464145130925, "grad_norm": 4.17100715637207, "learning_rate": 1.4320450391590478e-05, "loss": 0.362, "step": 15552 }, { "epoch": 2.6618175594728735, 
"grad_norm": 18.80425262451172, "learning_rate": 1.4308940689357624e-05, "loss": 0.8306, "step": 15553 }, { "epoch": 2.6619887044326545, "grad_norm": 4.200234889984131, "learning_rate": 1.4297431394850793e-05, "loss": 0.4488, "step": 15554 }, { "epoch": 2.6621598493924354, "grad_norm": 23.111360549926758, "learning_rate": 1.4285922514860485e-05, "loss": 4.9843, "step": 15555 }, { "epoch": 2.6623309943522164, "grad_norm": 0.7436122298240662, "learning_rate": 1.4274414056176978e-05, "loss": 0.1398, "step": 15556 }, { "epoch": 2.6625021393119974, "grad_norm": 15.731658935546875, "learning_rate": 1.4262906025590258e-05, "loss": 1.8167, "step": 15557 }, { "epoch": 2.6626732842717784, "grad_norm": 12.377107620239258, "learning_rate": 1.425139842989006e-05, "loss": 0.8171, "step": 15558 }, { "epoch": 2.6628444292315594, "grad_norm": 11.82387638092041, "learning_rate": 1.4239891275865903e-05, "loss": 0.6902, "step": 15559 }, { "epoch": 2.66301557419134, "grad_norm": 18.07465934753418, "learning_rate": 1.4228384570307047e-05, "loss": 1.8344, "step": 15560 }, { "epoch": 2.663186719151121, "grad_norm": 3.0058000087738037, "learning_rate": 1.4216878320002445e-05, "loss": 0.3952, "step": 15561 }, { "epoch": 2.663357864110902, "grad_norm": 4.959670066833496, "learning_rate": 1.4205372531740779e-05, "loss": 0.5506, "step": 15562 }, { "epoch": 2.663529009070683, "grad_norm": 5.419678211212158, "learning_rate": 1.4193867212310502e-05, "loss": 0.5066, "step": 15563 }, { "epoch": 2.663700154030464, "grad_norm": 0.2938271164894104, "learning_rate": 1.418236236849979e-05, "loss": 0.0872, "step": 15564 }, { "epoch": 2.663871298990245, "grad_norm": 4.015591621398926, "learning_rate": 1.41708580070965e-05, "loss": 0.4165, "step": 15565 }, { "epoch": 2.6640424439500254, "grad_norm": 2.192807197570801, "learning_rate": 1.4159354134888199e-05, "loss": 0.2166, "step": 15566 }, { "epoch": 2.6642135889098064, "grad_norm": 1.6383728981018066, "learning_rate": 1.4147850758662212e-05, "loss": 
0.1698, "step": 15567 }, { "epoch": 2.6643847338695874, "grad_norm": 5.298519134521484, "learning_rate": 1.4136347885205565e-05, "loss": 0.4204, "step": 15568 }, { "epoch": 2.6645558788293684, "grad_norm": 7.8603739738464355, "learning_rate": 1.4124845521304956e-05, "loss": 0.5902, "step": 15569 }, { "epoch": 2.6647270237891494, "grad_norm": 50.974884033203125, "learning_rate": 1.411334367374677e-05, "loss": 5.9373, "step": 15570 }, { "epoch": 2.6648981687489304, "grad_norm": 5.230668067932129, "learning_rate": 1.4101842349317137e-05, "loss": 0.387, "step": 15571 }, { "epoch": 2.6650693137087114, "grad_norm": 14.49184513092041, "learning_rate": 1.4090341554801876e-05, "loss": 1.5508, "step": 15572 }, { "epoch": 2.6652404586684924, "grad_norm": 12.852449417114258, "learning_rate": 1.4078841296986452e-05, "loss": 0.9538, "step": 15573 }, { "epoch": 2.6654116036282733, "grad_norm": 6.220725059509277, "learning_rate": 1.4067341582656015e-05, "loss": 0.6628, "step": 15574 }, { "epoch": 2.6655827485880543, "grad_norm": 5.071091651916504, "learning_rate": 1.4055842418595432e-05, "loss": 0.4846, "step": 15575 }, { "epoch": 2.665753893547835, "grad_norm": 10.916587829589844, "learning_rate": 1.4044343811589231e-05, "loss": 0.7286, "step": 15576 }, { "epoch": 2.665925038507616, "grad_norm": 5.625235557556152, "learning_rate": 1.4032845768421624e-05, "loss": 0.4774, "step": 15577 }, { "epoch": 2.666096183467397, "grad_norm": 3.9687068462371826, "learning_rate": 1.4021348295876434e-05, "loss": 0.221, "step": 15578 }, { "epoch": 2.666267328427178, "grad_norm": 14.998714447021484, "learning_rate": 1.4009851400737227e-05, "loss": 1.7903, "step": 15579 }, { "epoch": 2.666438473386959, "grad_norm": 12.404382705688477, "learning_rate": 1.3998355089787149e-05, "loss": 1.0612, "step": 15580 }, { "epoch": 2.66660961834674, "grad_norm": 0.2912103235721588, "learning_rate": 1.398685936980908e-05, "loss": 0.0842, "step": 15581 }, { "epoch": 2.6667807633065204, "grad_norm": 
14.653780937194824, "learning_rate": 1.397536424758547e-05, "loss": 1.3011, "step": 15582 }, { "epoch": 2.6669519082663014, "grad_norm": 2.5672290325164795, "learning_rate": 1.3963869729898501e-05, "loss": 0.2909, "step": 15583 }, { "epoch": 2.6671230532260823, "grad_norm": 6.251506805419922, "learning_rate": 1.3952375823529914e-05, "loss": 0.4052, "step": 15584 }, { "epoch": 2.6672941981858633, "grad_norm": 1.015556812286377, "learning_rate": 1.3940882535261173e-05, "loss": 0.1392, "step": 15585 }, { "epoch": 2.6674653431456443, "grad_norm": 5.636768817901611, "learning_rate": 1.3929389871873292e-05, "loss": 0.5183, "step": 15586 }, { "epoch": 2.6676364881054253, "grad_norm": 1.5955324172973633, "learning_rate": 1.3917897840147006e-05, "loss": 0.1823, "step": 15587 }, { "epoch": 2.6678076330652063, "grad_norm": 5.738802909851074, "learning_rate": 1.3906406446862581e-05, "loss": 0.4746, "step": 15588 }, { "epoch": 2.6679787780249873, "grad_norm": 9.05001449584961, "learning_rate": 1.3894915698800004e-05, "loss": 0.6922, "step": 15589 }, { "epoch": 2.6681499229847683, "grad_norm": 7.280507564544678, "learning_rate": 1.3883425602738787e-05, "loss": 0.5156, "step": 15590 }, { "epoch": 2.6683210679445493, "grad_norm": 0.49970361590385437, "learning_rate": 1.3871936165458149e-05, "loss": 0.128, "step": 15591 }, { "epoch": 2.66849221290433, "grad_norm": 14.837495803833008, "learning_rate": 1.3860447393736834e-05, "loss": 1.8002, "step": 15592 }, { "epoch": 2.668663357864111, "grad_norm": 12.60578441619873, "learning_rate": 1.3848959294353277e-05, "loss": 0.9146, "step": 15593 }, { "epoch": 2.668834502823892, "grad_norm": 23.197141647338867, "learning_rate": 1.3837471874085428e-05, "loss": 5.1039, "step": 15594 }, { "epoch": 2.6690056477836728, "grad_norm": 4.911342620849609, "learning_rate": 1.3825985139710926e-05, "loss": 0.3939, "step": 15595 }, { "epoch": 2.6691767927434538, "grad_norm": 10.033991813659668, "learning_rate": 1.3814499098006918e-05, "loss": 0.6771, 
"step": 15596 }, { "epoch": 2.6693479377032348, "grad_norm": 10.017093658447266, "learning_rate": 1.3803013755750208e-05, "loss": 0.9478, "step": 15597 }, { "epoch": 2.6695190826630153, "grad_norm": 9.723074913024902, "learning_rate": 1.3791529119717186e-05, "loss": 0.7745, "step": 15598 }, { "epoch": 2.6696902276227963, "grad_norm": 1.5167818069458008, "learning_rate": 1.3780045196683764e-05, "loss": 0.1666, "step": 15599 }, { "epoch": 2.6698613725825773, "grad_norm": 11.820952415466309, "learning_rate": 1.3768561993425491e-05, "loss": 1.1133, "step": 15600 }, { "epoch": 2.6700325175423583, "grad_norm": 25.17776107788086, "learning_rate": 1.3757079516717496e-05, "loss": 4.8678, "step": 15601 }, { "epoch": 2.6702036625021393, "grad_norm": 8.779361724853516, "learning_rate": 1.3745597773334443e-05, "loss": 0.6203, "step": 15602 }, { "epoch": 2.6703748074619202, "grad_norm": 0.281924843788147, "learning_rate": 1.3734116770050548e-05, "loss": 0.088, "step": 15603 }, { "epoch": 2.6705459524217012, "grad_norm": 7.942995548248291, "learning_rate": 1.3722636513639644e-05, "loss": 0.4606, "step": 15604 }, { "epoch": 2.670717097381482, "grad_norm": 14.32952880859375, "learning_rate": 1.371115701087512e-05, "loss": 1.2764, "step": 15605 }, { "epoch": 2.670888242341263, "grad_norm": 22.83847999572754, "learning_rate": 1.3699678268529877e-05, "loss": 4.7806, "step": 15606 }, { "epoch": 2.671059387301044, "grad_norm": 11.307907104492188, "learning_rate": 1.3688200293376372e-05, "loss": 0.6029, "step": 15607 }, { "epoch": 2.671230532260825, "grad_norm": 18.452299118041992, "learning_rate": 1.3676723092186652e-05, "loss": 1.5833, "step": 15608 }, { "epoch": 2.6714016772206057, "grad_norm": 10.743220329284668, "learning_rate": 1.3665246671732296e-05, "loss": 0.888, "step": 15609 }, { "epoch": 2.6715728221803867, "grad_norm": 3.003208875656128, "learning_rate": 1.3653771038784396e-05, "loss": 0.3714, "step": 15610 }, { "epoch": 2.6717439671401677, "grad_norm": 5.403586387634277, 
"learning_rate": 1.3642296200113564e-05, "loss": 0.3604, "step": 15611 }, { "epoch": 2.6719151120999487, "grad_norm": 7.213888645172119, "learning_rate": 1.3630822162489995e-05, "loss": 0.4634, "step": 15612 }, { "epoch": 2.6720862570597297, "grad_norm": 15.029621124267578, "learning_rate": 1.3619348932683411e-05, "loss": 0.9496, "step": 15613 }, { "epoch": 2.6722574020195107, "grad_norm": 8.197566032409668, "learning_rate": 1.3607876517463014e-05, "loss": 0.4885, "step": 15614 }, { "epoch": 2.672428546979291, "grad_norm": 0.7849375605583191, "learning_rate": 1.3596404923597524e-05, "loss": 0.1503, "step": 15615 }, { "epoch": 2.672599691939072, "grad_norm": 10.626391410827637, "learning_rate": 1.358493415785522e-05, "loss": 0.6028, "step": 15616 }, { "epoch": 2.672770836898853, "grad_norm": 2.942342519760132, "learning_rate": 1.3573464227003891e-05, "loss": 0.3729, "step": 15617 }, { "epoch": 2.672941981858634, "grad_norm": 1.178956389427185, "learning_rate": 1.3561995137810795e-05, "loss": 0.146, "step": 15618 }, { "epoch": 2.673113126818415, "grad_norm": 4.806873798370361, "learning_rate": 1.3550526897042663e-05, "loss": 0.3849, "step": 15619 }, { "epoch": 2.673284271778196, "grad_norm": 10.712030410766602, "learning_rate": 1.3539059511465868e-05, "loss": 0.6783, "step": 15620 }, { "epoch": 2.673455416737977, "grad_norm": 7.1282830238342285, "learning_rate": 1.3527592987846114e-05, "loss": 0.587, "step": 15621 }, { "epoch": 2.673626561697758, "grad_norm": 15.100757598876953, "learning_rate": 1.3516127332948709e-05, "loss": 1.3819, "step": 15622 }, { "epoch": 2.673797706657539, "grad_norm": 0.30615800619125366, "learning_rate": 1.350466255353837e-05, "loss": 0.0906, "step": 15623 }, { "epoch": 2.67396885161732, "grad_norm": 4.144843578338623, "learning_rate": 1.349319865637937e-05, "loss": 0.4347, "step": 15624 }, { "epoch": 2.6741399965771007, "grad_norm": 16.846349716186523, "learning_rate": 1.3481735648235388e-05, "loss": 1.6139, "step": 15625 }, { "epoch": 
2.6743111415368817, "grad_norm": 23.29416275024414, "learning_rate": 1.3470273535869658e-05, "loss": 5.2668, "step": 15626 }, { "epoch": 2.6744822864966626, "grad_norm": 8.89089584350586, "learning_rate": 1.3458812326044796e-05, "loss": 0.5765, "step": 15627 }, { "epoch": 2.6746534314564436, "grad_norm": 2.492835760116577, "learning_rate": 1.3447352025522978e-05, "loss": 0.1529, "step": 15628 }, { "epoch": 2.6748245764162246, "grad_norm": 4.697807312011719, "learning_rate": 1.3435892641065757e-05, "loss": 0.3752, "step": 15629 }, { "epoch": 2.6749957213760056, "grad_norm": 5.463619232177734, "learning_rate": 1.3424434179434227e-05, "loss": 0.4932, "step": 15630 }, { "epoch": 2.675166866335786, "grad_norm": 6.7512102127075195, "learning_rate": 1.341297664738886e-05, "loss": 0.5635, "step": 15631 }, { "epoch": 2.675338011295567, "grad_norm": 8.266084671020508, "learning_rate": 1.3401520051689654e-05, "loss": 0.6322, "step": 15632 }, { "epoch": 2.675509156255348, "grad_norm": 6.323330402374268, "learning_rate": 1.3390064399095977e-05, "loss": 0.5282, "step": 15633 }, { "epoch": 2.675680301215129, "grad_norm": 7.541857719421387, "learning_rate": 1.3378609696366727e-05, "loss": 0.4819, "step": 15634 }, { "epoch": 2.67585144617491, "grad_norm": 9.226752281188965, "learning_rate": 1.3367155950260148e-05, "loss": 0.6292, "step": 15635 }, { "epoch": 2.676022591134691, "grad_norm": 14.42416763305664, "learning_rate": 1.335570316753401e-05, "loss": 1.5114, "step": 15636 }, { "epoch": 2.676193736094472, "grad_norm": 9.525150299072266, "learning_rate": 1.3344251354945433e-05, "loss": 0.5907, "step": 15637 }, { "epoch": 2.676364881054253, "grad_norm": 0.46182844042778015, "learning_rate": 1.3332800519251041e-05, "loss": 0.0872, "step": 15638 }, { "epoch": 2.676536026014034, "grad_norm": 0.38990890979766846, "learning_rate": 1.3321350667206808e-05, "loss": 0.0925, "step": 15639 }, { "epoch": 2.676707170973815, "grad_norm": 0.2704499661922455, "learning_rate": 
1.330990180556818e-05, "loss": 0.0845, "step": 15640 }, { "epoch": 2.6768783159335956, "grad_norm": 28.0125675201416, "learning_rate": 1.3298453941090004e-05, "loss": 4.8862, "step": 15641 }, { "epoch": 2.6770494608933766, "grad_norm": 10.232093811035156, "learning_rate": 1.3287007080526557e-05, "loss": 0.7191, "step": 15642 }, { "epoch": 2.6772206058531576, "grad_norm": 0.29431185126304626, "learning_rate": 1.3275561230631488e-05, "loss": 0.0857, "step": 15643 }, { "epoch": 2.6773917508129386, "grad_norm": 5.832426071166992, "learning_rate": 1.326411639815784e-05, "loss": 0.3562, "step": 15644 }, { "epoch": 2.6775628957727196, "grad_norm": 13.347962379455566, "learning_rate": 1.32526725898581e-05, "loss": 0.9442, "step": 15645 }, { "epoch": 2.6777340407325005, "grad_norm": 1.1535459756851196, "learning_rate": 1.3241229812484153e-05, "loss": 0.169, "step": 15646 }, { "epoch": 2.677905185692281, "grad_norm": 6.865843296051025, "learning_rate": 1.322978807278724e-05, "loss": 0.4559, "step": 15647 }, { "epoch": 2.678076330652062, "grad_norm": 19.76642608642578, "learning_rate": 1.3218347377517979e-05, "loss": 2.3449, "step": 15648 }, { "epoch": 2.678247475611843, "grad_norm": 6.140285491943359, "learning_rate": 1.320690773342642e-05, "loss": 0.4338, "step": 15649 }, { "epoch": 2.678418620571624, "grad_norm": 12.117287635803223, "learning_rate": 1.3195469147261987e-05, "loss": 0.93, "step": 15650 }, { "epoch": 2.678589765531405, "grad_norm": 0.2778697609901428, "learning_rate": 1.318403162577345e-05, "loss": 0.0852, "step": 15651 }, { "epoch": 2.678760910491186, "grad_norm": 0.9695111513137817, "learning_rate": 1.3172595175708934e-05, "loss": 0.1707, "step": 15652 }, { "epoch": 2.678932055450967, "grad_norm": 7.0369486808776855, "learning_rate": 1.3161159803815986e-05, "loss": 0.6132, "step": 15653 }, { "epoch": 2.679103200410748, "grad_norm": 16.093168258666992, "learning_rate": 1.3149725516841514e-05, "loss": 0.732, "step": 15654 }, { "epoch": 2.679274345370529, 
"grad_norm": 1.3280994892120361, "learning_rate": 1.3138292321531742e-05, "loss": 0.1658, "step": 15655 }, { "epoch": 2.67944549033031, "grad_norm": 8.55648136138916, "learning_rate": 1.3126860224632255e-05, "loss": 0.5394, "step": 15656 }, { "epoch": 2.679616635290091, "grad_norm": 6.761679172515869, "learning_rate": 1.3115429232888025e-05, "loss": 0.5437, "step": 15657 }, { "epoch": 2.6797877802498715, "grad_norm": 3.300199508666992, "learning_rate": 1.3103999353043376e-05, "loss": 0.2172, "step": 15658 }, { "epoch": 2.6799589252096525, "grad_norm": 9.99261474609375, "learning_rate": 1.3092570591841929e-05, "loss": 0.8094, "step": 15659 }, { "epoch": 2.6801300701694335, "grad_norm": 24.478145599365234, "learning_rate": 1.3081142956026657e-05, "loss": 4.7234, "step": 15660 }, { "epoch": 2.6803012151292145, "grad_norm": 9.748064994812012, "learning_rate": 1.3069716452339897e-05, "loss": 0.7035, "step": 15661 }, { "epoch": 2.6804723600889955, "grad_norm": 3.5795645713806152, "learning_rate": 1.3058291087523305e-05, "loss": 0.3815, "step": 15662 }, { "epoch": 2.6806435050487765, "grad_norm": 5.7938032150268555, "learning_rate": 1.3046866868317883e-05, "loss": 0.442, "step": 15663 }, { "epoch": 2.680814650008557, "grad_norm": 4.308058738708496, "learning_rate": 1.30354438014639e-05, "loss": 0.3212, "step": 15664 }, { "epoch": 2.680985794968338, "grad_norm": 14.503430366516113, "learning_rate": 1.3024021893701019e-05, "loss": 1.0531, "step": 15665 }, { "epoch": 2.681156939928119, "grad_norm": 4.006903171539307, "learning_rate": 1.3012601151768144e-05, "loss": 0.4976, "step": 15666 }, { "epoch": 2.6813280848879, "grad_norm": 0.7797266840934753, "learning_rate": 1.3001181582403573e-05, "loss": 0.1453, "step": 15667 }, { "epoch": 2.681499229847681, "grad_norm": 15.863028526306152, "learning_rate": 1.298976319234482e-05, "loss": 1.4404, "step": 15668 }, { "epoch": 2.681670374807462, "grad_norm": 0.6970544457435608, "learning_rate": 1.29783459883288e-05, "loss": 0.1392, 
"step": 15669 }, { "epoch": 2.681841519767243, "grad_norm": 0.26754966378211975, "learning_rate": 1.296692997709164e-05, "loss": 0.0822, "step": 15670 }, { "epoch": 2.682012664727024, "grad_norm": 17.380647659301758, "learning_rate": 1.2955515165368839e-05, "loss": 1.1748, "step": 15671 }, { "epoch": 2.682183809686805, "grad_norm": 2.7184319496154785, "learning_rate": 1.2944101559895114e-05, "loss": 0.2373, "step": 15672 }, { "epoch": 2.682354954646586, "grad_norm": 9.026260375976562, "learning_rate": 1.2932689167404553e-05, "loss": 0.6513, "step": 15673 }, { "epoch": 2.6825260996063665, "grad_norm": 7.321512699127197, "learning_rate": 1.2921277994630444e-05, "loss": 0.4986, "step": 15674 }, { "epoch": 2.6826972445661474, "grad_norm": 10.214896202087402, "learning_rate": 1.2909868048305434e-05, "loss": 0.7033, "step": 15675 }, { "epoch": 2.6828683895259284, "grad_norm": 4.146488666534424, "learning_rate": 1.2898459335161369e-05, "loss": 0.4486, "step": 15676 }, { "epoch": 2.6830395344857094, "grad_norm": 2.6537482738494873, "learning_rate": 1.2887051861929445e-05, "loss": 0.184, "step": 15677 }, { "epoch": 2.6832106794454904, "grad_norm": 8.092536926269531, "learning_rate": 1.287564563534005e-05, "loss": 0.5296, "step": 15678 }, { "epoch": 2.6833818244052714, "grad_norm": 10.659708023071289, "learning_rate": 1.2864240662122913e-05, "loss": 0.8322, "step": 15679 }, { "epoch": 2.683552969365052, "grad_norm": 8.14775276184082, "learning_rate": 1.2852836949006946e-05, "loss": 0.4979, "step": 15680 }, { "epoch": 2.683724114324833, "grad_norm": 7.388331413269043, "learning_rate": 1.2841434502720402e-05, "loss": 0.505, "step": 15681 }, { "epoch": 2.683895259284614, "grad_norm": 6.120757102966309, "learning_rate": 1.2830033329990692e-05, "loss": 0.3654, "step": 15682 }, { "epoch": 2.684066404244395, "grad_norm": 8.91789436340332, "learning_rate": 1.2818633437544549e-05, "loss": 0.5459, "step": 15683 }, { "epoch": 2.684237549204176, "grad_norm": 14.642288208007812, 
"learning_rate": 1.280723483210795e-05, "loss": 1.3279, "step": 15684 }, { "epoch": 2.684408694163957, "grad_norm": 0.32882776856422424, "learning_rate": 1.2795837520406045e-05, "loss": 0.0894, "step": 15685 }, { "epoch": 2.684579839123738, "grad_norm": 0.49750518798828125, "learning_rate": 1.2784441509163283e-05, "loss": 0.1226, "step": 15686 }, { "epoch": 2.684750984083519, "grad_norm": 3.730806589126587, "learning_rate": 1.2773046805103353e-05, "loss": 0.3036, "step": 15687 }, { "epoch": 2.6849221290433, "grad_norm": 8.511749267578125, "learning_rate": 1.2761653414949125e-05, "loss": 0.6971, "step": 15688 }, { "epoch": 2.685093274003081, "grad_norm": 6.38276481628418, "learning_rate": 1.2750261345422696e-05, "loss": 0.5427, "step": 15689 }, { "epoch": 2.6852644189628614, "grad_norm": 9.81951904296875, "learning_rate": 1.2738870603245424e-05, "loss": 0.8291, "step": 15690 }, { "epoch": 2.6854355639226424, "grad_norm": 6.778170108795166, "learning_rate": 1.2727481195137886e-05, "loss": 0.6162, "step": 15691 }, { "epoch": 2.6856067088824234, "grad_norm": 6.1214375495910645, "learning_rate": 1.2716093127819826e-05, "loss": 0.5127, "step": 15692 }, { "epoch": 2.6857778538422044, "grad_norm": 14.155745506286621, "learning_rate": 1.2704706408010203e-05, "loss": 1.3681, "step": 15693 }, { "epoch": 2.6859489988019853, "grad_norm": 7.950896739959717, "learning_rate": 1.2693321042427216e-05, "loss": 0.876, "step": 15694 }, { "epoch": 2.6861201437617663, "grad_norm": 28.893869400024414, "learning_rate": 1.2681937037788272e-05, "loss": 4.7201, "step": 15695 }, { "epoch": 2.686291288721547, "grad_norm": 7.184755325317383, "learning_rate": 1.2670554400809925e-05, "loss": 0.7183, "step": 15696 }, { "epoch": 2.686462433681328, "grad_norm": 5.090479850769043, "learning_rate": 1.2659173138207933e-05, "loss": 0.3873, "step": 15697 }, { "epoch": 2.686633578641109, "grad_norm": 0.3015848994255066, "learning_rate": 1.2647793256697277e-05, "loss": 0.0837, "step": 15698 }, { "epoch": 
2.68680472360089, "grad_norm": 6.854918003082275, "learning_rate": 1.2636414762992125e-05, "loss": 0.4272, "step": 15699 }, { "epoch": 2.686975868560671, "grad_norm": 15.517809867858887, "learning_rate": 1.262503766380578e-05, "loss": 1.8529, "step": 15700 }, { "epoch": 2.687147013520452, "grad_norm": 2.4599037170410156, "learning_rate": 1.2613661965850732e-05, "loss": 0.2331, "step": 15701 }, { "epoch": 2.687318158480233, "grad_norm": 1.1916369199752808, "learning_rate": 1.2602287675838679e-05, "loss": 0.1444, "step": 15702 }, { "epoch": 2.687489303440014, "grad_norm": 3.818796157836914, "learning_rate": 1.2590914800480486e-05, "loss": 0.3882, "step": 15703 }, { "epoch": 2.687660448399795, "grad_norm": 4.174842357635498, "learning_rate": 1.2579543346486132e-05, "loss": 0.4553, "step": 15704 }, { "epoch": 2.6878315933595758, "grad_norm": 18.382287979125977, "learning_rate": 1.2568173320564805e-05, "loss": 2.2784, "step": 15705 }, { "epoch": 2.6880027383193568, "grad_norm": 0.84431391954422, "learning_rate": 1.2556804729424863e-05, "loss": 0.1423, "step": 15706 }, { "epoch": 2.6881738832791373, "grad_norm": 11.054018020629883, "learning_rate": 1.254543757977375e-05, "loss": 0.7262, "step": 15707 }, { "epoch": 2.6883450282389183, "grad_norm": 7.32551908493042, "learning_rate": 1.2534071878318143e-05, "loss": 0.4765, "step": 15708 }, { "epoch": 2.6885161731986993, "grad_norm": 0.2768929898738861, "learning_rate": 1.252270763176378e-05, "loss": 0.0882, "step": 15709 }, { "epoch": 2.6886873181584803, "grad_norm": 15.73495101928711, "learning_rate": 1.2511344846815621e-05, "loss": 1.1576, "step": 15710 }, { "epoch": 2.6888584631182613, "grad_norm": 7.036859035491943, "learning_rate": 1.249998353017769e-05, "loss": 0.6044, "step": 15711 }, { "epoch": 2.6890296080780423, "grad_norm": 6.721189022064209, "learning_rate": 1.248862368855322e-05, "loss": 0.5776, "step": 15712 }, { "epoch": 2.689200753037823, "grad_norm": 2.300388813018799, "learning_rate": 
1.2477265328644495e-05, "loss": 0.35, "step": 15713 }, { "epoch": 2.689371897997604, "grad_norm": 6.91254186630249, "learning_rate": 1.2465908457153006e-05, "loss": 0.4716, "step": 15714 }, { "epoch": 2.6895430429573848, "grad_norm": 5.926042079925537, "learning_rate": 1.245455308077928e-05, "loss": 0.5309, "step": 15715 }, { "epoch": 2.6897141879171658, "grad_norm": 6.186888217926025, "learning_rate": 1.2443199206223046e-05, "loss": 0.5105, "step": 15716 }, { "epoch": 2.6898853328769468, "grad_norm": 6.343082904815674, "learning_rate": 1.243184684018307e-05, "loss": 0.3985, "step": 15717 }, { "epoch": 2.6900564778367277, "grad_norm": 4.338169574737549, "learning_rate": 1.2420495989357304e-05, "loss": 0.4046, "step": 15718 }, { "epoch": 2.6902276227965087, "grad_norm": 18.460243225097656, "learning_rate": 1.2409146660442723e-05, "loss": 1.9613, "step": 15719 }, { "epoch": 2.6903987677562897, "grad_norm": 9.934530258178711, "learning_rate": 1.2397798860135493e-05, "loss": 0.6922, "step": 15720 }, { "epoch": 2.6905699127160707, "grad_norm": 15.34299373626709, "learning_rate": 1.2386452595130793e-05, "loss": 1.5136, "step": 15721 }, { "epoch": 2.6907410576758517, "grad_norm": 13.844565391540527, "learning_rate": 1.2375107872122977e-05, "loss": 1.4607, "step": 15722 }, { "epoch": 2.6909122026356322, "grad_norm": 7.438872337341309, "learning_rate": 1.2363764697805402e-05, "loss": 0.435, "step": 15723 }, { "epoch": 2.6910833475954132, "grad_norm": 4.114236831665039, "learning_rate": 1.2352423078870602e-05, "loss": 0.3769, "step": 15724 }, { "epoch": 2.6912544925551942, "grad_norm": 4.422876834869385, "learning_rate": 1.234108302201011e-05, "loss": 0.4363, "step": 15725 }, { "epoch": 2.691425637514975, "grad_norm": 3.572704315185547, "learning_rate": 1.2329744533914596e-05, "loss": 0.3446, "step": 15726 }, { "epoch": 2.691596782474756, "grad_norm": 4.208020210266113, "learning_rate": 1.231840762127378e-05, "loss": 0.3049, "step": 15727 }, { "epoch": 2.691767927434537, 
"grad_norm": 22.265148162841797, "learning_rate": 1.2307072290776485e-05, "loss": 4.544, "step": 15728 }, { "epoch": 2.6919390723943177, "grad_norm": 0.31280046701431274, "learning_rate": 1.229573854911055e-05, "loss": 0.0866, "step": 15729 }, { "epoch": 2.6921102173540987, "grad_norm": 3.4406251907348633, "learning_rate": 1.2284406402962877e-05, "loss": 0.1703, "step": 15730 }, { "epoch": 2.6922813623138797, "grad_norm": 11.930238723754883, "learning_rate": 1.2273075859019473e-05, "loss": 0.838, "step": 15731 }, { "epoch": 2.6924525072736607, "grad_norm": 9.023502349853516, "learning_rate": 1.2261746923965395e-05, "loss": 0.6628, "step": 15732 }, { "epoch": 2.6926236522334417, "grad_norm": 8.395051956176758, "learning_rate": 1.2250419604484708e-05, "loss": 0.5401, "step": 15733 }, { "epoch": 2.6927947971932227, "grad_norm": 11.712132453918457, "learning_rate": 1.223909390726053e-05, "loss": 0.7845, "step": 15734 }, { "epoch": 2.6929659421530037, "grad_norm": 23.777132034301758, "learning_rate": 1.2227769838975059e-05, "loss": 4.6139, "step": 15735 }, { "epoch": 2.6931370871127847, "grad_norm": 6.199190139770508, "learning_rate": 1.221644740630953e-05, "loss": 0.4553, "step": 15736 }, { "epoch": 2.6933082320725656, "grad_norm": 7.5311126708984375, "learning_rate": 1.220512661594418e-05, "loss": 0.6551, "step": 15737 }, { "epoch": 2.6934793770323466, "grad_norm": 14.30656623840332, "learning_rate": 1.2193807474558268e-05, "loss": 1.3899, "step": 15738 }, { "epoch": 2.693650521992127, "grad_norm": 2.0986149311065674, "learning_rate": 1.2182489988830127e-05, "loss": 0.2008, "step": 15739 }, { "epoch": 2.693821666951908, "grad_norm": 8.187783241271973, "learning_rate": 1.2171174165437112e-05, "loss": 0.5712, "step": 15740 }, { "epoch": 2.693992811911689, "grad_norm": 8.988706588745117, "learning_rate": 1.215986001105555e-05, "loss": 0.6994, "step": 15741 }, { "epoch": 2.69416395687147, "grad_norm": 6.192677021026611, "learning_rate": 1.2148547532360791e-05, "loss": 
0.6515, "step": 15742 }, { "epoch": 2.694335101831251, "grad_norm": 7.772089958190918, "learning_rate": 1.2137236736027236e-05, "loss": 0.6158, "step": 15743 }, { "epoch": 2.694506246791032, "grad_norm": 0.5674598813056946, "learning_rate": 1.2125927628728288e-05, "loss": 0.1387, "step": 15744 }, { "epoch": 2.6946773917508127, "grad_norm": 4.3516645431518555, "learning_rate": 1.2114620217136326e-05, "loss": 0.4454, "step": 15745 }, { "epoch": 2.6948485367105937, "grad_norm": 8.59583854675293, "learning_rate": 1.2103314507922709e-05, "loss": 0.6137, "step": 15746 }, { "epoch": 2.6950196816703746, "grad_norm": 0.2519618272781372, "learning_rate": 1.2092010507757849e-05, "loss": 0.0855, "step": 15747 }, { "epoch": 2.6951908266301556, "grad_norm": 6.931314945220947, "learning_rate": 1.2080708223311117e-05, "loss": 0.5085, "step": 15748 }, { "epoch": 2.6953619715899366, "grad_norm": 8.683560371398926, "learning_rate": 1.2069407661250903e-05, "loss": 0.4682, "step": 15749 }, { "epoch": 2.6955331165497176, "grad_norm": 1.544816493988037, "learning_rate": 1.2058108828244514e-05, "loss": 0.1743, "step": 15750 }, { "epoch": 2.6957042615094986, "grad_norm": 6.079012870788574, "learning_rate": 1.204681173095832e-05, "loss": 0.4615, "step": 15751 }, { "epoch": 2.6958754064692796, "grad_norm": 15.962564468383789, "learning_rate": 1.203551637605758e-05, "loss": 1.5836, "step": 15752 }, { "epoch": 2.6960465514290606, "grad_norm": 0.2702110707759857, "learning_rate": 1.2024222770206614e-05, "loss": 0.0866, "step": 15753 }, { "epoch": 2.6962176963888416, "grad_norm": 1.070860743522644, "learning_rate": 1.2012930920068628e-05, "loss": 0.1547, "step": 15754 }, { "epoch": 2.6963888413486226, "grad_norm": 3.2316360473632812, "learning_rate": 1.2001640832305872e-05, "loss": 0.5088, "step": 15755 }, { "epoch": 2.696559986308403, "grad_norm": 8.262500762939453, "learning_rate": 1.1990352513579466e-05, "loss": 0.5829, "step": 15756 }, { "epoch": 2.696731131268184, "grad_norm": 
11.615005493164062, "learning_rate": 1.197906597054958e-05, "loss": 1.2347, "step": 15757 }, { "epoch": 2.696902276227965, "grad_norm": 10.268115043640137, "learning_rate": 1.1967781209875251e-05, "loss": 0.7155, "step": 15758 }, { "epoch": 2.697073421187746, "grad_norm": 6.271699905395508, "learning_rate": 1.1956498238214543e-05, "loss": 0.6385, "step": 15759 }, { "epoch": 2.697244566147527, "grad_norm": 8.920348167419434, "learning_rate": 1.194521706222439e-05, "loss": 0.676, "step": 15760 }, { "epoch": 2.697415711107308, "grad_norm": 0.26926833391189575, "learning_rate": 1.193393768856074e-05, "loss": 0.0844, "step": 15761 }, { "epoch": 2.6975868560670886, "grad_norm": 0.7299058437347412, "learning_rate": 1.19226601238784e-05, "loss": 0.144, "step": 15762 }, { "epoch": 2.6977580010268696, "grad_norm": 55.87983703613281, "learning_rate": 1.1911384374831194e-05, "loss": 5.7693, "step": 15763 }, { "epoch": 2.6979291459866506, "grad_norm": 0.2671976387500763, "learning_rate": 1.1900110448071781e-05, "loss": 0.0885, "step": 15764 }, { "epoch": 2.6981002909464316, "grad_norm": 1.0287240743637085, "learning_rate": 1.1888838350251845e-05, "loss": 0.1349, "step": 15765 }, { "epoch": 2.6982714359062125, "grad_norm": 5.344680309295654, "learning_rate": 1.1877568088021896e-05, "loss": 0.3275, "step": 15766 }, { "epoch": 2.6984425808659935, "grad_norm": 1.3648954629898071, "learning_rate": 1.1866299668031442e-05, "loss": 0.198, "step": 15767 }, { "epoch": 2.6986137258257745, "grad_norm": 4.387040138244629, "learning_rate": 1.1855033096928834e-05, "loss": 0.3721, "step": 15768 }, { "epoch": 2.6987848707855555, "grad_norm": 8.777769088745117, "learning_rate": 1.1843768381361384e-05, "loss": 0.6655, "step": 15769 }, { "epoch": 2.6989560157453365, "grad_norm": 7.758626461029053, "learning_rate": 1.1832505527975309e-05, "loss": 0.6551, "step": 15770 }, { "epoch": 2.6991271607051175, "grad_norm": 0.8305807709693909, "learning_rate": 1.1821244543415671e-05, "loss": 0.1491, "step": 
15771 }, { "epoch": 2.699298305664898, "grad_norm": 0.49470382928848267, "learning_rate": 1.1809985434326487e-05, "loss": 0.1237, "step": 15772 }, { "epoch": 2.699469450624679, "grad_norm": 6.89596700668335, "learning_rate": 1.179872820735067e-05, "loss": 0.5238, "step": 15773 }, { "epoch": 2.69964059558446, "grad_norm": 4.45634126663208, "learning_rate": 1.1787472869129975e-05, "loss": 0.4636, "step": 15774 }, { "epoch": 2.699811740544241, "grad_norm": 7.970472812652588, "learning_rate": 1.1776219426305055e-05, "loss": 0.68, "step": 15775 }, { "epoch": 2.699982885504022, "grad_norm": 3.5207178592681885, "learning_rate": 1.1764967885515473e-05, "loss": 0.3875, "step": 15776 }, { "epoch": 2.700154030463803, "grad_norm": 1.7363307476043701, "learning_rate": 1.1753718253399677e-05, "loss": 0.1524, "step": 15777 } ], "logging_steps": 1, "max_steps": 17529, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1753, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }