{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1599997440004096, "eval_steps": 200000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001599997440004096, "grad_norm": 84.32501983642578, "learning_rate": 3.103950336794611e-08, "loss": 10.8792, "step": 100 }, { "epoch": 0.0003199994880008192, "grad_norm": 60.63747024536133, "learning_rate": 6.303899137613798e-08, "loss": 10.9284, "step": 200 }, { "epoch": 0.00047999923200122877, "grad_norm": 55.71075439453125, "learning_rate": 9.503847938432986e-08, "loss": 10.6466, "step": 300 }, { "epoch": 0.0006399989760016384, "grad_norm": 57.63307189941406, "learning_rate": 1.2703796739252173e-07, "loss": 10.841, "step": 400 }, { "epoch": 0.000799998720002048, "grad_norm": 89.1032485961914, "learning_rate": 1.590374554007136e-07, "loss": 10.8094, "step": 500 }, { "epoch": 0.0009599984640024575, "grad_norm": 57.2479362487793, "learning_rate": 1.9103694340890547e-07, "loss": 10.4323, "step": 600 }, { "epoch": 0.0011199982080028672, "grad_norm": 51.17530059814453, "learning_rate": 2.2303643141709733e-07, "loss": 10.3032, "step": 700 }, { "epoch": 0.0012799979520032767, "grad_norm": 60.76409912109375, "learning_rate": 2.550359194252892e-07, "loss": 10.4006, "step": 800 }, { "epoch": 0.0014399976960036865, "grad_norm": 67.00859069824219, "learning_rate": 2.870354074334811e-07, "loss": 10.4743, "step": 900 }, { "epoch": 0.001599997440004096, "grad_norm": 68.4343032836914, "learning_rate": 3.19034895441673e-07, "loss": 10.2334, "step": 1000 }, { "epoch": 0.0017599971840045055, "grad_norm": 48.704105377197266, "learning_rate": 3.510343834498648e-07, "loss": 10.0135, "step": 1100 }, { "epoch": 0.001919996928004915, "grad_norm": 45.30134963989258, "learning_rate": 3.830338714580567e-07, "loss": 9.7874, "step": 1200 }, { "epoch": 0.002079996672005325, "grad_norm": 84.56024169921875, "learning_rate": 4.150333594662486e-07, "loss": 9.7419, "step": 1300 }, { "epoch": 0.0022399964160057344, "grad_norm": 45.73213195800781, "learning_rate": 4.470328474744404e-07, "loss": 9.7412, "step": 1400 }, { "epoch": 0.002399996160006144, "grad_norm": 50.21996307373047, "learning_rate": 4.790323354826324e-07, "loss": 9.4585, "step": 1500 }, { "epoch": 0.0025599959040065534, "grad_norm": 59.475799560546875, "learning_rate": 5.110318234908241e-07, "loss": 9.5339, "step": 1600 }, { "epoch": 0.002719995648006963, "grad_norm": 82.53620910644531, "learning_rate": 5.43031311499016e-07, "loss": 9.4345, "step": 1700 }, { "epoch": 0.002879995392007373, "grad_norm": 39.44235610961914, "learning_rate": 5.750307995072079e-07, "loss": 9.1733, "step": 1800 }, { "epoch": 0.0030399951360077825, "grad_norm": 37.58698654174805, "learning_rate": 6.070302875153998e-07, "loss": 8.9952, "step": 1900 }, { "epoch": 0.003199994880008192, "grad_norm": 40.35204315185547, "learning_rate": 6.390297755235917e-07, "loss": 8.9669, "step": 2000 }, { "epoch": 0.0033599946240086016, "grad_norm": 57.84451675415039, "learning_rate": 6.707092686517017e-07, "loss": 8.8152, "step": 2100 }, { "epoch": 0.003519994368009011, "grad_norm": 40.126953125, "learning_rate": 7.027087566598935e-07, "loss": 8.7936, "step": 2200 }, { "epoch": 0.0036799941120094206, "grad_norm": 35.435707092285156, "learning_rate": 7.347082446680854e-07, "loss": 8.6771, "step": 2300 }, { "epoch": 0.00383999385600983, "grad_norm": 42.3509635925293, "learning_rate": 7.667077326762773e-07, "loss": 8.4648, "step": 2400 }, { "epoch": 0.00399999360001024, "grad_norm": 33.58556365966797, "learning_rate": 7.987072206844691e-07, "loss": 8.5764, "step": 2500 }, { "epoch": 0.00415999334401065, "grad_norm": 34.014678955078125, "learning_rate": 8.30706708692661e-07, "loss": 8.4587, "step": 2600 }, { "epoch": 0.004319993088011059, "grad_norm": 36.43831253051758, "learning_rate": 8.627061967008528e-07, "loss": 8.2966, "step": 2700 }, { "epoch": 0.004479992832011469, "grad_norm": 31.411684036254883, "learning_rate": 8.947056847090448e-07, "loss": 8.2329, "step": 2800 }, { "epoch": 0.004639992576011879, "grad_norm": 47.570125579833984, "learning_rate": 9.267051727172366e-07, "loss": 8.1415, "step": 2900 }, { "epoch": 0.004799992320012288, "grad_norm": 30.771928787231445, "learning_rate": 9.587046607254284e-07, "loss": 8.0404, "step": 3000 }, { "epoch": 0.004959992064012698, "grad_norm": 26.92803955078125, "learning_rate": 9.907041487336204e-07, "loss": 7.9698, "step": 3100 }, { "epoch": 0.005119991808013107, "grad_norm": 31.121917724609375, "learning_rate": 1.0227036367418122e-06, "loss": 7.9205, "step": 3200 }, { "epoch": 0.005279991552013517, "grad_norm": 33.991416931152344, "learning_rate": 1.054703124750004e-06, "loss": 7.8314, "step": 3300 }, { "epoch": 0.005439991296013926, "grad_norm": 31.278030395507812, "learning_rate": 1.086702612758196e-06, "loss": 7.8369, "step": 3400 }, { "epoch": 0.005599991040014336, "grad_norm": 28.116140365600586, "learning_rate": 1.1187021007663878e-06, "loss": 7.6403, "step": 3500 }, { "epoch": 0.005759990784014746, "grad_norm": 30.954113006591797, "learning_rate": 1.1507015887745798e-06, "loss": 7.5842, "step": 3600 }, { "epoch": 0.005919990528015155, "grad_norm": 36.53567886352539, "learning_rate": 1.1827010767827715e-06, "loss": 7.5812, "step": 3700 }, { "epoch": 0.006079990272015565, "grad_norm": 36.81153106689453, "learning_rate": 1.2147005647909635e-06, "loss": 7.4335, "step": 3800 }, { "epoch": 0.006239990016015974, "grad_norm": 22.556833267211914, "learning_rate": 1.2467000527991553e-06, "loss": 7.4917, "step": 3900 }, { "epoch": 0.006399989760016384, "grad_norm": 40.195579528808594, "learning_rate": 1.278699540807347e-06, "loss": 7.3204, "step": 4000 }, { "epoch": 0.006559989504016793, "grad_norm": 21.862642288208008, "learning_rate": 1.310699028815539e-06, "loss": 7.2971, "step": 4100 }, { "epoch": 0.006719989248017203, "grad_norm": 29.61161231994629, "learning_rate": 1.3426985168237308e-06, "loss": 7.2233, "step": 4200 }, { "epoch": 0.006879988992017613, "grad_norm": 22.342451095581055, "learning_rate": 1.3746980048319228e-06, "loss": 7.2081, "step": 4300 }, { "epoch": 0.007039988736018022, "grad_norm": 36.36684799194336, "learning_rate": 1.4066974928401148e-06, "loss": 7.1364, "step": 4400 }, { "epoch": 0.007199988480018432, "grad_norm": 25.563953399658203, "learning_rate": 1.4386969808483064e-06, "loss": 7.0663, "step": 4500 }, { "epoch": 0.007359988224018841, "grad_norm": 22.50385856628418, "learning_rate": 1.4706964688564984e-06, "loss": 6.9601, "step": 4600 }, { "epoch": 0.007519987968019251, "grad_norm": 31.61231231689453, "learning_rate": 1.5026959568646904e-06, "loss": 6.9546, "step": 4700 }, { "epoch": 0.00767998771201966, "grad_norm": 18.862520217895508, "learning_rate": 1.5346954448728822e-06, "loss": 6.9019, "step": 4800 }, { "epoch": 0.00783998745602007, "grad_norm": 32.594539642333984, "learning_rate": 1.5666949328810741e-06, "loss": 6.8801, "step": 4900 }, { "epoch": 0.00799998720002048, "grad_norm": 21.06804084777832, "learning_rate": 1.598694420889266e-06, "loss": 6.7734, "step": 5000 }, { "epoch": 0.00815998694402089, "grad_norm": 31.783803939819336, "learning_rate": 1.6303739140173757e-06, "loss": 6.7648, "step": 5100 }, { "epoch": 0.0083199866880213, "grad_norm": 49.79084777832031, "learning_rate": 1.6623734020255677e-06, "loss": 6.7498, "step": 5200 }, { "epoch": 0.008479986432021708, "grad_norm": 26.1977481842041, "learning_rate": 1.6943728900337597e-06, "loss": 6.6872, "step": 5300 }, { "epoch": 0.008639986176022118, "grad_norm": 21.942001342773438, "learning_rate": 1.7263723780419515e-06, "loss": 6.6264, "step": 5400 }, { "epoch": 0.008799985920022528, "grad_norm": 32.572959899902344, "learning_rate": 1.7583718660501433e-06, "loss": 6.579, "step": 5500 }, { "epoch": 0.008959985664022938, "grad_norm": 20.728240966796875, "learning_rate": 1.7903713540583353e-06, "loss": 6.6001, "step": 5600 }, { "epoch": 0.009119985408023347, "grad_norm": 24.334205627441406, "learning_rate": 1.822370842066527e-06, "loss": 6.5971, "step": 5700 }, { "epoch": 0.009279985152023757, "grad_norm": 27.025753021240234, "learning_rate": 1.854370330074719e-06, "loss": 6.4694, "step": 5800 }, { "epoch": 0.009439984896024167, "grad_norm": 23.506013870239258, "learning_rate": 1.8863698180829106e-06, "loss": 6.3983, "step": 5900 }, { "epoch": 0.009599984640024576, "grad_norm": 35.65713882446289, "learning_rate": 1.9183693060911026e-06, "loss": 6.4477, "step": 6000 }, { "epoch": 0.009759984384024985, "grad_norm": 22.977373123168945, "learning_rate": 1.950368794099295e-06, "loss": 6.4308, "step": 6100 }, { "epoch": 0.009919984128025396, "grad_norm": 22.127635955810547, "learning_rate": 1.982368282107486e-06, "loss": 6.4248, "step": 6200 }, { "epoch": 0.010079983872025805, "grad_norm": 33.53960418701172, "learning_rate": 2.0143677701156784e-06, "loss": 6.2642, "step": 6300 }, { "epoch": 0.010239983616026214, "grad_norm": 24.39597511291504, "learning_rate": 2.04636725812387e-06, "loss": 6.2763, "step": 6400 }, { "epoch": 0.010399983360026625, "grad_norm": 24.471288681030273, "learning_rate": 2.078366746132062e-06, "loss": 6.3878, "step": 6500 }, { "epoch": 0.010559983104027034, "grad_norm": 34.05498123168945, "learning_rate": 2.110366234140254e-06, "loss": 6.2601, "step": 6600 }, { "epoch": 0.010719982848027443, "grad_norm": 30.60455322265625, "learning_rate": 2.142365722148446e-06, "loss": 6.1789, "step": 6700 }, { "epoch": 0.010879982592027852, "grad_norm": 27.737686157226562, "learning_rate": 2.1743652101566377e-06, "loss": 6.1773, "step": 6800 }, { "epoch": 0.011039982336028263, "grad_norm": 24.246810913085938, "learning_rate": 2.2063646981648294e-06, "loss": 6.1439, "step": 6900 }, { "epoch": 0.011199982080028672, "grad_norm": 27.53533363342285, "learning_rate": 2.2383641861730217e-06, "loss": 6.1863, "step": 7000 }, { "epoch": 0.011359981824029081, "grad_norm": 27.81687355041504, "learning_rate": 2.2703636741812134e-06, "loss": 6.0513, "step": 7100 }, { "epoch": 0.011519981568029492, "grad_norm": 28.00519371032715, "learning_rate": 2.3020431673093234e-06, "loss": 6.0671, "step": 7200 }, { "epoch": 0.011679981312029901, "grad_norm": 29.347061157226562, "learning_rate": 2.3340426553175152e-06, "loss": 6.0212, "step": 7300 }, { "epoch": 0.01183998105603031, "grad_norm": 29.621200561523438, "learning_rate": 2.365722148445625e-06, "loss": 6.0043, "step": 7400 }, { "epoch": 0.011999980800030719, "grad_norm": 31.689117431640625, "learning_rate": 2.397721636453817e-06, "loss": 6.0166, "step": 7500 }, { "epoch": 0.01215998054403113, "grad_norm": 46.79508972167969, "learning_rate": 2.429721124462009e-06, "loss": 5.9754, "step": 7600 }, { "epoch": 0.012319980288031539, "grad_norm": 28.857833862304688, "learning_rate": 2.4617206124702006e-06, "loss": 5.9211, "step": 7700 }, { "epoch": 0.012479980032031948, "grad_norm": 58.34132766723633, "learning_rate": 2.4937201004783928e-06, "loss": 5.7867, "step": 7800 }, { "epoch": 0.012639979776032359, "grad_norm": 49.33425521850586, "learning_rate": 2.525719588486584e-06, "loss": 5.8534, "step": 7900 }, { "epoch": 0.012799979520032768, "grad_norm": 39.17392349243164, "learning_rate": 2.5577190764947763e-06, "loss": 5.7708, "step": 8000 }, { "epoch": 0.012959979264033177, "grad_norm": 45.94136428833008, "learning_rate": 2.589718564502968e-06, "loss": 5.8328, "step": 8100 }, { "epoch": 0.013119979008033586, "grad_norm": 36.19196319580078, "learning_rate": 2.6217180525111603e-06, "loss": 5.7417, "step": 8200 }, { "epoch": 0.013279978752033997, "grad_norm": 37.051658630371094, "learning_rate": 2.653717540519352e-06, "loss": 5.8097, "step": 8300 }, { "epoch": 0.013439978496034406, "grad_norm": 90.0757064819336, "learning_rate": 2.6857170285275435e-06, "loss": 5.7578, "step": 8400 }, { "epoch": 0.013599978240034815, "grad_norm": 92.7857894897461, "learning_rate": 2.7177165165357357e-06, "loss": 5.643, "step": 8500 }, { "epoch": 0.013759977984035226, "grad_norm": 26.648149490356445, "learning_rate": 2.7497160045439274e-06, "loss": 5.6401, "step": 8600 }, { "epoch": 0.013919977728035635, "grad_norm": 45.42919158935547, "learning_rate": 2.7817154925521196e-06, "loss": 5.6627, "step": 8700 }, { "epoch": 0.014079977472036044, "grad_norm": 48.3182487487793, "learning_rate": 2.8137149805603114e-06, "loss": 5.6167, "step": 8800 }, { "epoch": 0.014239977216036454, "grad_norm": 51.463653564453125, "learning_rate": 2.8457144685685028e-06, "loss": 5.6539, "step": 8900 }, { "epoch": 0.014399976960036864, "grad_norm": 47.81680679321289, "learning_rate": 2.877713956576695e-06, "loss": 5.4513, "step": 9000 }, { "epoch": 0.014559976704037273, "grad_norm": 42.410667419433594, "learning_rate": 2.9097134445848868e-06, "loss": 5.4132, "step": 9100 }, { "epoch": 0.014719976448037683, "grad_norm": 55.33562088012695, "learning_rate": 2.941712932593079e-06, "loss": 5.4714, "step": 9200 }, { "epoch": 0.014879976192038093, "grad_norm": 38.538246154785156, "learning_rate": 2.9737124206012707e-06, "loss": 5.4786, "step": 9300 }, { "epoch": 0.015039975936038502, "grad_norm": 43.42023468017578, "learning_rate": 3.0057119086094625e-06, "loss": 5.3928, "step": 9400 }, { "epoch": 0.015199975680038912, "grad_norm": 24.861467361450195, "learning_rate": 3.037391401737572e-06, "loss": 5.4774, "step": 9500 }, { "epoch": 0.01535997542403932, "grad_norm": 98.92141723632812, "learning_rate": 3.0693908897457643e-06, "loss": 5.2881, "step": 9600 }, { "epoch": 0.015519975168039732, "grad_norm": 62.839866638183594, "learning_rate": 3.101390377753956e-06, "loss": 5.3699, "step": 9700 }, { "epoch": 0.01567997491204014, "grad_norm": 46.006065368652344, "learning_rate": 3.133069870882066e-06, "loss": 5.1483, "step": 9800 }, { "epoch": 0.01583997465604055, "grad_norm": 89.62445068359375, "learning_rate": 3.1650693588902583e-06, "loss": 5.3051, "step": 9900 }, { "epoch": 0.01599997440004096, "grad_norm": 41.113609313964844, "learning_rate": 3.19706884689845e-06, "loss": 5.2546, "step": 10000 }, { "epoch": 0.01615997414404137, "grad_norm": 46.37376403808594, "learning_rate": 3.2290683349066414e-06, "loss": 5.2314, "step": 10100 }, { "epoch": 0.01631997388804178, "grad_norm": 60.3846321105957, "learning_rate": 3.2610678229148337e-06, "loss": 5.1783, "step": 10200 }, { "epoch": 0.016479973632042188, "grad_norm": 145.4359130859375, "learning_rate": 3.2930673109230254e-06, "loss": 5.2074, "step": 10300 }, { "epoch": 0.0166399733760426, "grad_norm": 69.00183868408203, "learning_rate": 3.325066798931217e-06, "loss": 5.2825, "step": 10400 }, { "epoch": 0.01679997312004301, "grad_norm": 48.03580093383789, "learning_rate": 3.3570662869394094e-06, "loss": 5.1715, "step": 10500 }, { "epoch": 0.016959972864043417, "grad_norm": 58.56736755371094, "learning_rate": 3.389065774947601e-06, "loss": 5.087, "step": 10600 }, { "epoch": 0.017119972608043828, "grad_norm": 54.484527587890625, "learning_rate": 3.421065262955793e-06, "loss": 5.082, "step": 10700 }, { "epoch": 0.017279972352044235, "grad_norm": 74.30866241455078, "learning_rate": 3.4530647509639847e-06, "loss": 4.9111, "step": 10800 }, { "epoch": 0.017439972096044646, "grad_norm": 60.489505767822266, "learning_rate": 3.4850642389721765e-06, "loss": 5.0213, "step": 10900 }, { "epoch": 0.017599971840045057, "grad_norm": 61.25093460083008, "learning_rate": 3.5170637269803687e-06, "loss": 4.9898, "step": 11000 }, { "epoch": 0.017759971584045464, "grad_norm": 51.98568344116211, "learning_rate": 3.5490632149885605e-06, "loss": 4.7734, "step": 11100 }, { "epoch": 0.017919971328045875, "grad_norm": 64.08167266845703, "learning_rate": 3.581062702996752e-06, "loss": 4.9511, "step": 11200 }, { "epoch": 0.018079971072046286, "grad_norm": 61.8354606628418, "learning_rate": 3.613062191004944e-06, "loss": 5.0481, "step": 11300 }, { "epoch": 0.018239970816046693, "grad_norm": 97.53675842285156, "learning_rate": 3.645061679013136e-06, "loss": 4.8441, "step": 11400 }, { "epoch": 0.018399970560047104, "grad_norm": 49.35017013549805, "learning_rate": 3.677061167021328e-06, "loss": 4.873, "step": 11500 }, { "epoch": 0.018559970304047515, "grad_norm": 44.33409118652344, "learning_rate": 3.70906065502952e-06, "loss": 4.9988, "step": 11600 }, { "epoch": 0.018719970048047922, "grad_norm": 140.5505828857422, "learning_rate": 3.741060143037712e-06, "loss": 4.7653, "step": 11700 }, { "epoch": 0.018879969792048333, "grad_norm": 68.21163177490234, "learning_rate": 3.7730596310459034e-06, "loss": 4.804, "step": 11800 }, { "epoch": 0.019039969536048744, "grad_norm": 48.678226470947266, "learning_rate": 3.805059119054095e-06, "loss": 4.8288, "step": 11900 }, { "epoch": 0.01919996928004915, "grad_norm": 76.32611083984375, "learning_rate": 3.837058607062287e-06, "loss": 4.7053, "step": 12000 }, { "epoch": 0.019359969024049562, "grad_norm": 70.85586547851562, "learning_rate": 3.869058095070479e-06, "loss": 4.6887, "step": 12100 }, { "epoch": 0.01951996876804997, "grad_norm": 66.46036529541016, "learning_rate": 3.901057583078671e-06, "loss": 4.7832, "step": 12200 }, { "epoch": 0.01967996851205038, "grad_norm": 165.13221740722656, "learning_rate": 3.9330570710868636e-06, "loss": 4.6817, "step": 12300 }, { "epoch": 0.01983996825605079, "grad_norm": 118.48895263671875, "learning_rate": 3.965056559095055e-06, "loss": 4.6252, "step": 12400 }, { "epoch": 0.0199999680000512, "grad_norm": 64.3436050415039, "learning_rate": 3.997056047103246e-06, "loss": 4.5936, "step": 12500 }, { "epoch": 0.02015996774405161, "grad_norm": 42.27592468261719, "learning_rate": 4.0290555351114385e-06, "loss": 4.7452, "step": 12600 }, { "epoch": 0.02031996748805202, "grad_norm": 60.829036712646484, "learning_rate": 4.061055023119631e-06, "loss": 4.5321, "step": 12700 }, { "epoch": 0.020479967232052428, "grad_norm": 161.975830078125, "learning_rate": 4.093054511127823e-06, "loss": 4.4964, "step": 12800 }, { "epoch": 0.02063996697605284, "grad_norm": 99.2963638305664, "learning_rate": 4.125053999136014e-06, "loss": 4.4421, "step": 12900 }, { "epoch": 0.02079996672005325, "grad_norm": 68.78880310058594, "learning_rate": 4.156733492264124e-06, "loss": 4.3782, "step": 13000 }, { "epoch": 0.020959966464053657, "grad_norm": 80.74951171875, "learning_rate": 4.188732980272316e-06, "loss": 4.5169, "step": 13100 }, { "epoch": 0.021119966208054067, "grad_norm": 157.87254333496094, "learning_rate": 4.220412473400426e-06, "loss": 4.533, "step": 13200 }, { "epoch": 0.02127996595205448, "grad_norm": 148.68331909179688, "learning_rate": 4.252411961408618e-06, "loss": 4.3725, "step": 13300 }, { "epoch": 0.021439965696054886, "grad_norm": 72.9531021118164, "learning_rate": 4.28441144941681e-06, "loss": 4.2911, "step": 13400 }, { "epoch": 0.021599965440055297, "grad_norm": 73.24847412109375, "learning_rate": 4.316410937425001e-06, "loss": 4.2261, "step": 13500 }, { "epoch": 0.021759965184055704, "grad_norm": 94.57313537597656, "learning_rate": 4.348410425433194e-06, "loss": 4.2467, "step": 13600 }, { "epoch": 0.021919964928056115, "grad_norm": 105.674560546875, "learning_rate": 4.380409913441385e-06, "loss": 4.1558, "step": 13700 }, { "epoch": 0.022079964672056526, "grad_norm": 63.658287048339844, "learning_rate": 4.412409401449577e-06, "loss": 4.2794, "step": 13800 }, { "epoch": 0.022239964416056933, "grad_norm": 77.69287109375, "learning_rate": 4.444408889457769e-06, "loss": 4.2383, "step": 13900 }, { "epoch": 0.022399964160057344, "grad_norm": 82.83360290527344, "learning_rate": 4.4764083774659615e-06, "loss": 4.1654, "step": 14000 }, { "epoch": 0.022559963904057755, "grad_norm": 47.373531341552734, "learning_rate": 4.508407865474153e-06, "loss": 4.158, "step": 14100 }, { "epoch": 0.022719963648058162, "grad_norm": 97.64757537841797, "learning_rate": 4.540407353482344e-06, "loss": 4.1299, "step": 14200 }, { "epoch": 0.022879963392058573, "grad_norm": 54.75618362426758, "learning_rate": 4.5724068414905365e-06, "loss": 4.1902, "step": 14300 }, { "epoch": 0.023039963136058984, "grad_norm": 258.4887390136719, "learning_rate": 4.604406329498729e-06, "loss": 3.7853, "step": 14400 }, { "epoch": 0.02319996288005939, "grad_norm": 104.63798522949219, "learning_rate": 4.63640581750692e-06, "loss": 4.0514, "step": 14500 }, { "epoch": 0.023359962624059802, "grad_norm": 60.090843200683594, "learning_rate": 4.668405305515112e-06, "loss": 4.1655, "step": 14600 }, { "epoch": 0.023519962368060213, "grad_norm": 44.36670684814453, "learning_rate": 4.7004047935233036e-06, "loss": 4.051, "step": 14700 }, { "epoch": 0.02367996211206062, "grad_norm": 41.61213302612305, "learning_rate": 4.732404281531496e-06, "loss": 4.078, "step": 14800 }, { "epoch": 0.02383996185606103, "grad_norm": 73.2448501586914, "learning_rate": 4.764403769539688e-06, "loss": 4.1193, "step": 14900 }, { "epoch": 0.023999961600061438, "grad_norm": 77.30301666259766, "learning_rate": 4.796403257547879e-06, "loss": 4.1536, "step": 15000 }, { "epoch": 0.02415996134406185, "grad_norm": 48.1458854675293, "learning_rate": 4.8284027455560715e-06, "loss": 3.935, "step": 15100 }, { "epoch": 0.02431996108806226, "grad_norm": 129.59295654296875, "learning_rate": 4.860402233564263e-06, "loss": 3.9535, "step": 15200 }, { "epoch": 0.024479960832062667, "grad_norm": 163.0813751220703, "learning_rate": 4.892401721572455e-06, "loss": 3.7051, "step": 15300 }, { "epoch": 0.024639960576063078, "grad_norm": 102.2786865234375, "learning_rate": 4.924401209580647e-06, "loss": 3.8329, "step": 15400 }, { "epoch": 0.02479996032006349, "grad_norm": 160.66392517089844, "learning_rate": 4.956400697588839e-06, "loss": 3.9412, "step": 15500 }, { "epoch": 0.024959960064063896, "grad_norm": 136.77218627929688, "learning_rate": 4.988400185597031e-06, "loss": 3.6668, "step": 15600 }, { "epoch": 0.025119959808064307, "grad_norm": 63.87991714477539, "learning_rate": 5.0200796787251404e-06, "loss": 3.7758, "step": 15700 }, { "epoch": 0.025279959552064718, "grad_norm": 352.977294921875, "learning_rate": 5.052079166733333e-06, "loss": 3.8805, "step": 15800 }, { "epoch": 0.025439959296065125, "grad_norm": 148.54776000976562, "learning_rate": 5.084078654741524e-06, "loss": 3.8848, "step": 15900 }, { "epoch": 0.025599959040065536, "grad_norm": 105.01113891601562, "learning_rate": 5.116078142749716e-06, "loss": 3.75, "step": 16000 }, { "epoch": 0.025759958784065947, "grad_norm": 170.62828063964844, "learning_rate": 5.148077630757908e-06, "loss": 3.5685, "step": 16100 }, { "epoch": 0.025919958528066354, "grad_norm": 164.85324096679688, "learning_rate": 5.180077118766101e-06, "loss": 3.7016, "step": 16200 }, { "epoch": 0.026079958272066765, "grad_norm": 79.85810852050781, "learning_rate": 5.212076606774292e-06, "loss": 4.0955, "step": 16300 }, { "epoch": 0.026239958016067173, "grad_norm": 109.73529815673828, "learning_rate": 5.244076094782484e-06, "loss": 3.7577, "step": 16400 }, { "epoch": 0.026399957760067583, "grad_norm": 105.98066711425781, "learning_rate": 5.276075582790676e-06, "loss": 3.7485, "step": 16500 }, { "epoch": 0.026559957504067994, "grad_norm": 71.02545166015625, "learning_rate": 5.3080750707988686e-06, "loss": 3.8263, "step": 16600 }, { "epoch": 0.0267199572480684, "grad_norm": 245.44224548339844, "learning_rate": 5.340074558807059e-06, "loss": 3.6922, "step": 16700 }, { "epoch": 0.026879956992068813, "grad_norm": 42.178157806396484, "learning_rate": 5.372074046815251e-06, "loss": 3.6568, "step": 16800 }, { "epoch": 0.027039956736069223, "grad_norm": 114.55894470214844, "learning_rate": 5.404073534823443e-06, "loss": 3.7317, "step": 16900 }, { "epoch": 0.02719995648006963, "grad_norm": 86.70626831054688, "learning_rate": 5.436073022831635e-06, "loss": 3.5089, "step": 17000 }, { "epoch": 0.02735995622407004, "grad_norm": 202.02505493164062, "learning_rate": 5.468072510839827e-06, "loss": 3.7377, "step": 17100 }, { "epoch": 0.027519955968070452, "grad_norm": 114.00701141357422, "learning_rate": 5.500071998848019e-06, "loss": 3.6206, "step": 17200 }, { "epoch": 0.02767995571207086, "grad_norm": 152.38311767578125, "learning_rate": 5.532071486856211e-06, "loss": 3.3702, "step": 17300 }, { "epoch": 0.02783995545607127, "grad_norm": 156.1048126220703, "learning_rate": 5.564070974864403e-06, "loss": 3.5126, "step": 17400 }, { "epoch": 0.02799995520007168, "grad_norm": 117.87386322021484, "learning_rate": 5.596070462872595e-06, "loss": 3.4841, "step": 17500 }, { "epoch": 0.02815995494407209, "grad_norm": 616.7991333007812, "learning_rate": 5.628069950880786e-06, "loss": 3.1464, "step": 17600 }, { "epoch": 0.0283199546880725, "grad_norm": 131.32760620117188, "learning_rate": 5.6600694388889786e-06, "loss": 3.7012, "step": 17700 }, { "epoch": 0.028479954432072907, "grad_norm": 60.172969818115234, "learning_rate": 5.69206892689717e-06, "loss": 3.5802, "step": 17800 }, { "epoch": 0.028639954176073318, "grad_norm": 169.24374389648438, "learning_rate": 5.724068414905361e-06, "loss": 3.4952, "step": 17900 }, { "epoch": 0.02879995392007373, "grad_norm": 158.77391052246094, "learning_rate": 5.7560679029135535e-06, "loss": 3.1174, "step": 18000 }, { "epoch": 0.028959953664074136, "grad_norm": 218.98867797851562, "learning_rate": 5.787747396041664e-06, "loss": 3.3134, "step": 18100 }, { "epoch": 0.029119953408074547, "grad_norm": 185.3249053955078, "learning_rate": 5.819746884049856e-06, "loss": 3.3578, "step": 18200 }, { "epoch": 0.029279953152074958, "grad_norm": 93.69242858886719, "learning_rate": 5.851746372058048e-06, "loss": 3.0209, "step": 18300 }, { "epoch": 0.029439952896075365, "grad_norm": 85.82784271240234, "learning_rate": 5.883745860066239e-06, "loss": 3.3796, "step": 18400 }, { "epoch": 0.029599952640075776, "grad_norm": 125.96697998046875, "learning_rate": 5.915745348074431e-06, "loss": 3.2287, "step": 18500 }, { "epoch": 0.029759952384076187, "grad_norm": 235.71075439453125, "learning_rate": 5.947744836082623e-06, "loss": 3.1537, "step": 18600 }, { "epoch": 0.029919952128076594, "grad_norm": 139.5558319091797, "learning_rate": 5.979744324090815e-06, "loss": 2.9073, "step": 18700 }, { "epoch": 0.030079951872077005, "grad_norm": 204.2928924560547, "learning_rate": 6.011743812099007e-06, "loss": 3.3444, "step": 18800 }, { "epoch": 0.030239951616077416, "grad_norm": 165.4457244873047, "learning_rate": 6.043743300107199e-06, "loss": 3.1341, "step": 18900 }, { "epoch": 0.030399951360077823, "grad_norm": 66.5983657836914, "learning_rate": 6.07574278811539e-06, "loss": 2.8862, "step": 19000 }, { "epoch": 0.030559951104078234, "grad_norm": 219.95774841308594, "learning_rate": 6.1077422761235826e-06, "loss": 3.2033, "step": 19100 }, { "epoch": 0.03071995084807864, "grad_norm": 125.15766906738281, "learning_rate": 6.139741764131775e-06, "loss": 3.2764, "step": 19200 }, { "epoch": 0.030879950592079052, "grad_norm": 207.95970153808594, "learning_rate": 6.171741252139967e-06, "loss": 3.0725, "step": 19300 }, { "epoch": 0.031039950336079463, "grad_norm": 368.32781982421875, "learning_rate": 6.203740740148158e-06, "loss": 3.0436, "step": 19400 }, { "epoch": 0.03119995008007987, "grad_norm": 412.2764587402344, "learning_rate": 6.23574022815635e-06, "loss": 3.3493, "step": 19500 }, { "epoch": 0.03135994982408028, "grad_norm": 155.46766662597656, "learning_rate": 6.267739716164542e-06, "loss": 3.0141, "step": 19600 }, { "epoch": 0.03151994956808069, "grad_norm": 89.32569885253906, "learning_rate": 6.299739204172733e-06, "loss": 2.779, "step": 19700 }, { "epoch": 0.0316799493120811, "grad_norm": 241.4378204345703, "learning_rate": 6.3317386921809254e-06, "loss": 3.3543, "step": 19800 }, { "epoch": 0.03183994905608151, "grad_norm": 13.20569133758545, "learning_rate": 6.363738180189118e-06, "loss": 3.1526, "step": 19900 }, { "epoch": 0.03199994880008192, "grad_norm": 270.6402893066406, "learning_rate": 6.395737668197309e-06, "loss": 2.7896, "step": 20000 }, { "epoch": 0.03215994854408233, "grad_norm": 106.38632202148438, "learning_rate": 6.427737156205501e-06, "loss": 2.9398, "step": 20100 }, { "epoch": 0.03231994828808274, "grad_norm": 191.7210693359375, "learning_rate": 6.459416649333611e-06, "loss": 3.1254, "step": 20200 }, { "epoch": 0.03247994803208315, "grad_norm": 143.96151733398438, "learning_rate": 6.491416137341803e-06, "loss": 2.8832, "step": 20300 }, { "epoch": 0.03263994777608356, "grad_norm": 150.26368713378906, "learning_rate": 6.523415625349994e-06, "loss": 3.0542, "step": 20400 }, { "epoch": 0.032799947520083965, "grad_norm": 178.11705017089844, "learning_rate": 6.5554151133581865e-06, "loss": 2.9722, "step": 20500 }, { "epoch": 0.032959947264084376, "grad_norm": 222.4794921875, "learning_rate": 6.587414601366379e-06, "loss": 2.9321, "step": 20600 }, { "epoch": 0.03311994700808479, "grad_norm": 155.37796020507812, "learning_rate": 6.619414089374571e-06, "loss": 2.6448, "step": 20700 }, { "epoch": 0.0332799467520852, "grad_norm": 155.5786590576172, "learning_rate": 6.651413577382762e-06, "loss": 3.4006, "step": 20800 }, { "epoch": 0.03343994649608561, "grad_norm": 684.525146484375, "learning_rate": 6.6834130653909545e-06, "loss": 3.0022, "step": 20900 }, { "epoch": 0.03359994624008602, "grad_norm": 545.5623168945312, "learning_rate": 6.715412553399147e-06, "loss": 2.6366, "step": 21000 }, { "epoch": 0.03375994598408642, "grad_norm": 292.9093017578125, "learning_rate": 6.747412041407339e-06, "loss": 3.0112, "step": 21100 }, { "epoch": 0.033919945728086834, "grad_norm": 2.531680107116699, "learning_rate": 6.7794115294155294e-06, "loss": 2.7856, "step": 21200 }, { "epoch": 0.034079945472087245, "grad_norm": 216.7860565185547, "learning_rate": 6.811411017423722e-06, "loss": 3.0967, "step": 21300 }, { "epoch": 0.034239945216087656, "grad_norm": 138.73028564453125, "learning_rate": 6.843410505431913e-06, "loss": 2.8754, "step": 21400 }, { "epoch": 0.034399944960088066, "grad_norm": 78.2362060546875, "learning_rate": 6.875409993440105e-06, "loss": 3.1269, "step": 21500 }, { "epoch": 0.03455994470408847, "grad_norm": 144.1228790283203, "learning_rate": 6.907409481448297e-06, "loss": 2.8235, "step": 21600 }, { "epoch": 0.03471994444808888, "grad_norm": 275.1159973144531, "learning_rate": 6.93940896945649e-06, "loss": 2.4912, "step": 21700 }, { "epoch": 0.03487994419208929, "grad_norm": 216.12060546875, "learning_rate": 6.971408457464681e-06, "loss": 2.5079, "step": 21800 }, { "epoch": 0.0350399439360897, "grad_norm": 398.5049133300781, "learning_rate": 7.003407945472873e-06, "loss": 3.2942, "step": 21900 }, { "epoch": 0.035199943680090114, "grad_norm": 116.13761901855469, "learning_rate": 7.035407433481065e-06, "loss": 2.4184, "step": 22000 }, { "epoch": 0.035359943424090524, "grad_norm": 425.1556091308594, "learning_rate": 7.067406921489257e-06, "loss": 2.782, "step": 22100 }, { "epoch": 0.03551994316809093, "grad_norm": 17.029335021972656, "learning_rate": 7.099086414617366e-06, "loss": 2.7652, "step": 22200 }, { "epoch": 0.03567994291209134, "grad_norm": 307.45343017578125, "learning_rate": 7.1310859026255585e-06, "loss": 3.113, "step": 22300 }, { "epoch": 0.03583994265609175, "grad_norm": 69.89311981201172, "learning_rate": 7.163085390633751e-06, "loss": 2.7451, "step": 22400 }, { "epoch": 0.03599994240009216, "grad_norm": 28.0865535736084, "learning_rate": 7.195084878641943e-06, "loss": 2.7473, "step": 22500 }, { "epoch": 0.03615994214409257, "grad_norm": 108.03202056884766, "learning_rate": 7.227084366650134e-06, "loss": 2.5116, "step": 22600 }, { "epoch": 0.03631994188809298, "grad_norm": 299.888427734375, "learning_rate": 7.2590838546583265e-06, "loss": 2.8531, "step": 22700 }, { "epoch": 0.036479941632093386, "grad_norm": 87.79664611816406, "learning_rate": 7.291083342666519e-06, "loss": 2.9171, "step": 22800 }, { "epoch": 0.0366399413760938, "grad_norm": 388.6871337890625, "learning_rate": 7.323082830674709e-06, "loss": 2.7954, "step": 22900 }, { "epoch": 0.03679994112009421, "grad_norm": 87.27410888671875, "learning_rate": 7.355082318682901e-06, "loss": 2.5376, "step": 23000 }, { "epoch": 0.03695994086409462, "grad_norm": 159.74534606933594, "learning_rate": 7.387081806691094e-06, "loss": 3.2488, "step": 23100 }, { "epoch": 0.03711994060809503, "grad_norm": 169.96243286132812, "learning_rate": 7.419081294699285e-06, "loss": 2.6131, "step": 23200 }, { "epoch": 0.037279940352095434, "grad_norm": 221.1896514892578, "learning_rate": 7.451080782707477e-06, "loss": 3.1343, "step": 23300 }, { "epoch": 0.037439940096095845, "grad_norm": 67.28482818603516, "learning_rate": 7.482760275835588e-06, "loss": 2.3159, "step": 23400 }, { "epoch": 0.037599939840096255, "grad_norm": 341.05975341796875, "learning_rate": 7.514759763843779e-06, "loss": 2.4225, "step": 23500 }, { "epoch": 0.037759939584096666, "grad_norm": 250.44683837890625, "learning_rate": 7.54675925185197e-06, "loss": 2.5034, "step": 23600 }, { "epoch": 0.03791993932809708, "grad_norm": 423.6518249511719, "learning_rate": 7.5787587398601625e-06, "loss": 3.0067, "step": 23700 }, { "epoch": 0.03807993907209749, "grad_norm": 169.45944213867188, "learning_rate": 7.610758227868355e-06, "loss": 2.313, "step": 23800 }, { "epoch": 0.03823993881609789, "grad_norm": 80.43399047851562, "learning_rate": 7.642757715876546e-06, "loss": 2.5363, "step": 23900 }, { "epoch": 0.0383999385600983, "grad_norm": 248.08848571777344, "learning_rate": 7.674757203884739e-06, "loss": 2.7929, "step": 24000 }, { "epoch": 0.03855993830409871, "grad_norm": 3.7647440433502197, "learning_rate": 7.70675669189293e-06, "loss": 2.617, "step": 24100 }, { "epoch": 0.038719938048099124, "grad_norm": 3.100020170211792, "learning_rate": 7.738756179901122e-06, "loss": 2.9711, "step": 24200 }, { "epoch": 0.038879937792099535, "grad_norm": 69.79640197753906, "learning_rate": 7.770755667909315e-06, "loss": 2.7726, "step": 24300 }, { "epoch": 0.03903993753609994, "grad_norm": 190.2179412841797, "learning_rate": 7.802755155917506e-06, "loss": 2.5849, "step": 24400 }, { "epoch": 0.03919993728010035, "grad_norm": 75.47491455078125, "learning_rate": 7.834754643925698e-06, "loss": 2.3231, "step": 24500 }, { "epoch": 0.03935993702410076, "grad_norm": 13.3529691696167, "learning_rate": 7.866754131933889e-06, "loss": 2.2477, "step": 24600 }, { "epoch": 0.03951993676810117, "grad_norm": 280.162109375, "learning_rate": 7.89875361994208e-06, "loss": 2.5487, "step": 24700 }, { "epoch": 0.03967993651210158, "grad_norm": 376.9624938964844, "learning_rate": 7.930753107950273e-06, "loss": 2.5175, "step": 24800 }, { "epoch": 0.03983993625610199, "grad_norm": 341.099609375, "learning_rate": 7.962752595958465e-06, "loss": 2.6758, "step": 24900 }, { "epoch": 0.0399999360001024, "grad_norm": 436.5195007324219, "learning_rate": 7.994752083966658e-06, "loss": 2.7313, "step": 25000 }, { "epoch": 0.04015993574410281, "grad_norm": 274.91363525390625, "learning_rate": 8.026751571974849e-06, "loss": 2.4846, "step": 25100 }, { "epoch": 0.04031993548810322, "grad_norm": 183.5716094970703, "learning_rate": 8.05875105998304e-06, "loss": 2.8697, "step": 25200 }, { "epoch": 0.04047993523210363, "grad_norm": 70.23844909667969, "learning_rate": 8.090750547991234e-06, "loss": 2.5289, "step": 25300 }, { "epoch": 0.04063993497610404, "grad_norm": 139.3669891357422, "learning_rate": 8.122750035999425e-06, "loss": 2.235, "step": 25400 }, { "epoch": 0.04079993472010445, "grad_norm": 242.79315185546875, "learning_rate": 8.154429529127534e-06, "loss": 2.5028, "step": 25500 }, { "epoch": 0.040959934464104855, "grad_norm": 257.0070495605469, "learning_rate": 8.186429017135727e-06, "loss": 2.6295, "step": 25600 }, { "epoch": 0.041119934208105266, "grad_norm": 314.8670959472656, "learning_rate": 8.218428505143918e-06, "loss": 2.6159, "step": 25700 }, { "epoch": 0.04127993395210568, "grad_norm": 284.12762451171875, "learning_rate": 8.250427993152111e-06, "loss": 2.4447, "step": 25800 }, { "epoch": 0.04143993369610609, "grad_norm": 5.427358627319336, "learning_rate": 8.282427481160302e-06, "loss": 2.7233, "step": 25900 }, { "epoch": 0.0415999334401065, "grad_norm": 240.23260498046875, "learning_rate": 8.314426969168494e-06, "loss": 2.5651, "step": 26000 }, { "epoch": 0.0417599331841069, "grad_norm": 15.093184471130371, "learning_rate": 8.346426457176687e-06, "loss": 2.1317, "step": 26100 }, { "epoch": 0.04191993292810731, "grad_norm": 14.953177452087402, "learning_rate": 8.378425945184878e-06, "loss": 2.6157, "step": 26200 }, { "epoch": 0.042079932672107724, "grad_norm": 242.84718322753906, "learning_rate": 8.410105438312987e-06, "loss": 2.7385, "step": 26300 }, { "epoch": 0.042239932416108135, "grad_norm": 1.3409643173217773, "learning_rate": 8.44210492632118e-06, "loss": 2.4642, "step": 26400 }, { "epoch": 0.042399932160108546, "grad_norm": 90.02801513671875, "learning_rate": 8.474104414329371e-06, "loss": 2.0621, "step": 26500 }, { "epoch": 0.04255993190410896, "grad_norm": 11.879080772399902, "learning_rate": 8.506103902337564e-06, "loss": 2.3864, "step": 26600 }, { "epoch": 0.04271993164810936, "grad_norm": 598.356689453125, "learning_rate": 8.538103390345756e-06, "loss": 2.6951, "step": 26700 }, { "epoch": 0.04287993139210977, "grad_norm": 144.25924682617188, "learning_rate": 8.570102878353947e-06, "loss": 2.2628, "step": 26800 }, { "epoch": 0.04303993113611018, "grad_norm": 521.1145629882812, "learning_rate": 8.602102366362138e-06, "loss": 2.7538, "step": 26900 }, { "epoch": 0.04319993088011059, "grad_norm": 86.13031005859375, "learning_rate": 8.63410185437033e-06, "loss": 2.6871, "step": 27000 }, { "epoch": 0.043359930624111004, "grad_norm": 268.4532775878906, "learning_rate": 8.666101342378523e-06, "loss": 2.2453, "step": 27100 }, { "epoch": 0.04351993036811141, "grad_norm": 531.1592407226562, "learning_rate": 8.698100830386714e-06, "loss": 1.6334, "step": 27200 }, { "epoch": 0.04367993011211182, "grad_norm": 166.83230590820312, "learning_rate": 8.730100318394906e-06, "loss": 2.666, "step": 27300 }, { "epoch": 0.04383992985611223, "grad_norm": 208.4716033935547, "learning_rate": 8.762099806403099e-06, "loss": 2.128, "step": 27400 }, { "epoch": 0.04399992960011264, "grad_norm": 257.9130859375, "learning_rate": 8.79409929441129e-06, "loss": 2.7573, "step": 27500 }, { "epoch": 0.04415992934411305, "grad_norm": 85.08763885498047, "learning_rate": 8.826098782419481e-06, "loss": 2.5276, "step": 27600 }, { "epoch": 0.04431992908811346, "grad_norm": 8.960221290588379, "learning_rate": 8.858098270427674e-06, "loss": 2.2438, "step": 27700 }, { "epoch": 0.044479928832113866, "grad_norm": 404.66558837890625, "learning_rate": 8.890097758435866e-06, "loss": 2.3156, "step": 27800 }, { "epoch": 0.04463992857611428, "grad_norm": 151.23495483398438, "learning_rate": 8.922097246444059e-06, "loss": 2.1735, "step": 27900 }, { "epoch": 0.04479992832011469, "grad_norm": 151.7221221923828, "learning_rate": 8.95409673445225e-06, "loss": 2.1733, "step": 28000 }, { "epoch": 0.0449599280641151, "grad_norm": 228.95974731445312, "learning_rate": 8.986096222460441e-06, "loss": 2.4094, "step": 28100 }, { "epoch": 0.04511992780811551, "grad_norm": 522.6806640625, "learning_rate": 9.018095710468633e-06, "loss": 2.8484, "step": 28200 }, { "epoch": 0.04527992755211592, "grad_norm": 16.065011978149414, "learning_rate": 9.050095198476824e-06, "loss": 2.4507, "step": 28300 }, { "epoch": 0.045439927296116324, "grad_norm": 227.2984619140625, "learning_rate": 9.082094686485017e-06, "loss": 2.6822, "step": 28400 }, { "epoch": 0.045599927040116735, "grad_norm": 430.3262634277344, "learning_rate": 9.114094174493209e-06, "loss": 2.1191, "step": 28500 }, { "epoch": 0.045759926784117146, "grad_norm": 0.1830236166715622, "learning_rate": 9.1460936625014e-06, "loss": 2.0696, "step": 28600 }, { "epoch": 0.045919926528117556, "grad_norm": 97.45941162109375, "learning_rate": 9.178093150509593e-06, "loss": 2.4027, "step": 28700 }, { "epoch": 0.04607992627211797, "grad_norm": 22.469968795776367, "learning_rate": 9.210092638517784e-06, "loss": 1.7958, "step": 28800 }, { "epoch": 0.04623992601611837, "grad_norm": 103.27215576171875, "learning_rate": 9.242092126525977e-06, "loss": 2.5874, "step": 28900 }, { "epoch": 0.04639992576011878, "grad_norm": 578.951171875, "learning_rate": 9.274091614534169e-06, "loss": 2.2679, "step": 29000 }, { "epoch": 0.04655992550411919, "grad_norm": 6.261137008666992, "learning_rate": 9.30609110254236e-06, "loss": 2.6394, "step": 29100 }, { "epoch": 0.046719925248119604, "grad_norm": 113.35989379882812, "learning_rate": 9.338090590550551e-06, "loss": 1.7998, "step": 29200 }, { "epoch": 0.046879924992120015, "grad_norm": 116.46363830566406, "learning_rate": 9.370090078558743e-06, "loss": 2.6834, "step": 29300 }, { "epoch": 0.047039924736120425, "grad_norm": 84.5538101196289, "learning_rate": 9.402089566566936e-06, "loss": 2.1242, "step": 29400 }, { "epoch": 0.04719992448012083, "grad_norm": 150.44454956054688, "learning_rate": 9.434089054575127e-06, "loss": 2.0039, "step": 29500 }, { "epoch": 0.04735992422412124, "grad_norm": 12.482616424560547, "learning_rate": 9.466088542583319e-06, "loss": 2.018, "step": 29600 }, { "epoch": 0.04751992396812165, "grad_norm": 1.1050609350204468, "learning_rate": 9.498088030591512e-06, "loss": 2.9357, "step": 29700 }, { "epoch": 0.04767992371212206, "grad_norm": 256.4771423339844, "learning_rate": 9.530087518599703e-06, "loss": 2.1914, "step": 29800 }, { "epoch": 0.04783992345612247, "grad_norm": 178.9422149658203, "learning_rate": 9.562087006607896e-06, "loss": 2.0968, "step": 29900 }, { "epoch": 0.047999923200122876, "grad_norm": 160.1494140625, "learning_rate": 9.594086494616087e-06, "loss": 1.9762, "step": 30000 }, { "epoch": 0.04815992294412329, "grad_norm": 86.46272277832031, "learning_rate": 9.626085982624279e-06, "loss": 2.1436, "step": 30100 }, { "epoch": 0.0483199226881237, "grad_norm": 76.13285064697266, "learning_rate": 9.658085470632472e-06, "loss": 2.1919, "step": 30200 }, { "epoch": 0.04847992243212411, "grad_norm": 2.952242374420166, "learning_rate": 9.690084958640661e-06, "loss": 1.9683, "step": 30300 }, { "epoch": 0.04863992217612452, "grad_norm": 33.4036979675293, "learning_rate": 9.722084446648855e-06, "loss": 2.3543, "step": 30400 }, { "epoch": 0.04879992192012493, "grad_norm": 173.6257781982422, "learning_rate": 9.753763939776965e-06, "loss": 2.0642, "step": 30500 }, { "epoch": 0.048959921664125335, "grad_norm": 0.08548393100500107, "learning_rate": 9.785763427785156e-06, "loss": 1.8447, "step": 30600 }, { "epoch": 0.049119921408125745, "grad_norm": 111.82203674316406, "learning_rate": 9.817762915793348e-06, "loss": 2.3467, "step": 30700 }, { "epoch": 0.049279921152126156, "grad_norm": 142.97500610351562, "learning_rate": 9.84976240380154e-06, "loss": 2.6461, "step": 30800 }, { "epoch": 0.04943992089612657, "grad_norm": 417.88677978515625, "learning_rate": 9.881761891809732e-06, "loss": 2.028, "step": 30900 }, { "epoch": 0.04959992064012698, "grad_norm": 4.543129920959473, "learning_rate": 9.913761379817923e-06, "loss": 1.4188, "step": 31000 }, { "epoch": 0.04975992038412739, "grad_norm": 205.02293395996094, "learning_rate": 9.945760867826115e-06, "loss": 2.7219, "step": 31100 }, { "epoch": 0.04991992012812779, "grad_norm": 123.40583038330078, "learning_rate": 9.977760355834308e-06, "loss": 2.2345, "step": 31200 }, { "epoch": 0.050079919872128204, "grad_norm": 0.9410820603370667, "learning_rate": 1.00097598438425e-05, "loss": 2.201, "step": 31300 }, { "epoch": 0.050239919616128614, "grad_norm": 51.27448272705078, "learning_rate": 1.004175933185069e-05, "loss": 2.092, "step": 31400 }, { "epoch": 0.050399919360129025, "grad_norm": 258.7269592285156, "learning_rate": 1.0073758819858884e-05, "loss": 2.2871, "step": 31500 }, { "epoch": 0.050559919104129436, "grad_norm": 108.11058044433594, "learning_rate": 1.0105758307867075e-05, "loss": 2.0167, "step": 31600 }, { "epoch": 0.05071991884812984, "grad_norm": 229.5725555419922, "learning_rate": 1.0137757795875266e-05, "loss": 1.9175, "step": 31700 }, { "epoch": 0.05087991859213025, "grad_norm": 204.41357421875, "learning_rate": 1.016975728388346e-05, "loss": 2.2229, "step": 31800 }, { "epoch": 0.05103991833613066, "grad_norm": 8.951689720153809, "learning_rate": 1.020175677189165e-05, "loss": 2.1196, "step": 31900 }, { "epoch": 0.05119991808013107, "grad_norm": 275.85198974609375, "learning_rate": 1.0233756259899844e-05, "loss": 2.2192, "step": 32000 }, { "epoch": 0.05135991782413148, "grad_norm": 359.066650390625, "learning_rate": 1.0265755747908035e-05, "loss": 1.6462, "step": 32100 }, { "epoch": 0.051519917568131894, "grad_norm": 0.10183493793010712, "learning_rate": 1.0297755235916226e-05, "loss": 2.099, "step": 32200 }, { "epoch": 0.0516799173121323, "grad_norm": 43.3016357421875, "learning_rate": 1.0329434729044337e-05, "loss": 2.0914, "step": 32300 }, { "epoch": 0.05183991705613271, "grad_norm": 97.42915344238281, "learning_rate": 1.0361434217052528e-05, "loss": 2.3295, "step": 32400 }, { "epoch": 0.05199991680013312, "grad_norm": 1.9172292947769165, "learning_rate": 1.039343370506072e-05, "loss": 2.256, "step": 32500 }, { "epoch": 0.05215991654413353, "grad_norm": 157.83743286132812, "learning_rate": 1.0425433193068913e-05, "loss": 1.7662, "step": 32600 }, { "epoch": 0.05231991628813394, "grad_norm": 1.3025041818618774, "learning_rate": 1.0457432681077104e-05, "loss": 1.7234, "step": 32700 }, { "epoch": 0.052479916032134345, "grad_norm": 234.8426971435547, "learning_rate": 1.0489432169085297e-05, "loss": 1.984, "step": 32800 }, { "epoch": 0.052639915776134756, "grad_norm": 9.249500274658203, "learning_rate": 1.0521431657093488e-05, "loss": 2.1815, "step": 32900 }, { "epoch": 0.05279991552013517, "grad_norm": 164.6519012451172, "learning_rate": 1.055343114510168e-05, "loss": 1.4987, "step": 33000 }, { "epoch": 0.05295991526413558, "grad_norm": 145.9049072265625, "learning_rate": 1.0585430633109873e-05, "loss": 2.0034, "step": 33100 }, { "epoch": 0.05311991500813599, "grad_norm": 79.73159790039062, "learning_rate": 1.0617430121118062e-05, "loss": 2.6008, "step": 33200 }, { "epoch": 0.0532799147521364, "grad_norm": 131.95318603515625, "learning_rate": 1.0649429609126254e-05, "loss": 2.4585, "step": 33300 }, { "epoch": 0.0534399144961368, "grad_norm": 44.75098419189453, "learning_rate": 1.0681429097134445e-05, "loss": 1.881, "step": 33400 }, { "epoch": 0.053599914240137214, "grad_norm": 0.9141740202903748, "learning_rate": 1.0713428585142638e-05, "loss": 1.8738, "step": 33500 }, { "epoch": 0.053759913984137625, "grad_norm": 248.49734497070312, "learning_rate": 1.074542807315083e-05, "loss": 1.9726, "step": 33600 }, { "epoch": 0.053919913728138036, "grad_norm": 167.88706970214844, "learning_rate": 1.0777427561159021e-05, "loss": 2.3734, "step": 33700 }, { "epoch": 0.05407991347213845, "grad_norm": 0.43971773982048035, "learning_rate": 1.0809427049167214e-05, "loss": 1.6898, "step": 33800 }, { "epoch": 0.05423991321613886, "grad_norm": 713.7942504882812, "learning_rate": 1.0841426537175405e-05, "loss": 2.2171, "step": 33900 }, { "epoch": 0.05439991296013926, "grad_norm": 57.55624771118164, "learning_rate": 1.0873426025183598e-05, "loss": 1.4453, "step": 34000 }, { "epoch": 0.05455991270413967, "grad_norm": 409.5030822753906, "learning_rate": 1.090542551319179e-05, "loss": 1.5057, "step": 34100 }, { "epoch": 0.05471991244814008, "grad_norm": 60.115047454833984, "learning_rate": 1.0937425001199981e-05, "loss": 2.1497, "step": 34200 }, { "epoch": 0.054879912192140494, "grad_norm": 0.7692262530326843, "learning_rate": 1.0969424489208174e-05, "loss": 1.8618, "step": 34300 }, { "epoch": 0.055039911936140905, "grad_norm": 698.8638916015625, "learning_rate": 1.1001423977216366e-05, "loss": 1.7878, "step": 34400 }, { "epoch": 0.05519991168014131, "grad_norm": 0.5103877186775208, "learning_rate": 1.1033423465224557e-05, "loss": 1.8199, "step": 34500 }, { "epoch": 0.05535991142414172, "grad_norm": 347.1667175292969, "learning_rate": 1.106542295323275e-05, "loss": 2.1649, "step": 34600 }, { "epoch": 0.05551991116814213, "grad_norm": 99.95459747314453, "learning_rate": 1.1097422441240941e-05, "loss": 1.7906, "step": 34700 }, { "epoch": 0.05567991091214254, "grad_norm": 211.90087890625, "learning_rate": 1.1129421929249133e-05, "loss": 1.6816, "step": 34800 }, { "epoch": 0.05583991065614295, "grad_norm": 60.790199279785156, "learning_rate": 1.1161421417257326e-05, "loss": 2.1464, "step": 34900 }, { "epoch": 0.05599991040014336, "grad_norm": 585.09716796875, "learning_rate": 1.1193420905265517e-05, "loss": 2.0039, "step": 35000 }, { "epoch": 0.05615991014414377, "grad_norm": 0.2061644047498703, "learning_rate": 1.122542039327371e-05, "loss": 1.735, "step": 35100 }, { "epoch": 0.05631990988814418, "grad_norm": 204.5592498779297, "learning_rate": 1.1257419881281901e-05, "loss": 1.853, "step": 35200 }, { "epoch": 0.05647990963214459, "grad_norm": 695.4961547851562, "learning_rate": 1.1289419369290093e-05, "loss": 1.6068, "step": 35300 }, { "epoch": 0.056639909376145, "grad_norm": 220.02767944335938, "learning_rate": 1.1321418857298282e-05, "loss": 1.6349, "step": 35400 }, { "epoch": 0.05679990912014541, "grad_norm": 0.07823936641216278, "learning_rate": 1.1353418345306476e-05, "loss": 1.9571, "step": 35500 }, { "epoch": 0.056959908864145814, "grad_norm": 31.91838836669922, "learning_rate": 1.1385417833314667e-05, "loss": 1.5854, "step": 35600 }, { "epoch": 0.057119908608146225, "grad_norm": 1040.179931640625, "learning_rate": 1.1417417321322858e-05, "loss": 1.9756, "step": 35700 }, { "epoch": 0.057279908352146636, "grad_norm": 16.008800506591797, "learning_rate": 1.1449416809331051e-05, "loss": 1.9816, "step": 35800 }, { "epoch": 0.057439908096147047, "grad_norm": 226.522705078125, "learning_rate": 1.1481416297339243e-05, "loss": 1.6758, "step": 35900 }, { "epoch": 0.05759990784014746, "grad_norm": 85.04449462890625, "learning_rate": 1.1513415785347436e-05, "loss": 2.2583, "step": 36000 }, { "epoch": 0.05775990758414787, "grad_norm": 3.989626884460449, "learning_rate": 1.1545415273355627e-05, "loss": 1.7584, "step": 36100 }, { "epoch": 0.05791990732814827, "grad_norm": 63.272911071777344, "learning_rate": 1.1577414761363818e-05, "loss": 1.9894, "step": 36200 }, { "epoch": 0.05807990707214868, "grad_norm": 175.4257049560547, "learning_rate": 1.1609414249372011e-05, "loss": 2.3922, "step": 36300 }, { "epoch": 0.058239906816149094, "grad_norm": 160.36253356933594, "learning_rate": 1.164109374250012e-05, "loss": 2.0077, "step": 36400 }, { "epoch": 0.058399906560149505, "grad_norm": 95.23787689208984, "learning_rate": 1.1673093230508312e-05, "loss": 2.3684, "step": 36500 }, { "epoch": 0.058559906304149915, "grad_norm": 173.092041015625, "learning_rate": 1.1705092718516505e-05, "loss": 2.1103, "step": 36600 }, { "epoch": 0.058719906048150326, "grad_norm": 719.3712768554688, "learning_rate": 1.1736772211644613e-05, "loss": 2.0728, "step": 36700 }, { "epoch": 0.05887990579215073, "grad_norm": 1.7120122909545898, "learning_rate": 1.1768771699652806e-05, "loss": 1.9364, "step": 36800 }, { "epoch": 0.05903990553615114, "grad_norm": 120.16387176513672, "learning_rate": 1.1800771187660998e-05, "loss": 2.5203, "step": 36900 }, { "epoch": 0.05919990528015155, "grad_norm": 46.504329681396484, "learning_rate": 1.1832770675669189e-05, "loss": 1.8473, "step": 37000 }, { "epoch": 0.05935990502415196, "grad_norm": 255.33987426757812, "learning_rate": 1.1864770163677382e-05, "loss": 1.8076, "step": 37100 }, { "epoch": 0.059519904768152374, "grad_norm": 130.05715942382812, "learning_rate": 1.1896769651685574e-05, "loss": 2.0157, "step": 37200 }, { "epoch": 0.05967990451215278, "grad_norm": 201.22866821289062, "learning_rate": 1.1928769139693765e-05, "loss": 2.1587, "step": 37300 }, { "epoch": 0.05983990425615319, "grad_norm": 0.2600236237049103, "learning_rate": 1.1960768627701958e-05, "loss": 1.9825, "step": 37400 }, { "epoch": 0.0599999040001536, "grad_norm": 0.20701654255390167, "learning_rate": 1.199276811571015e-05, "loss": 2.0693, "step": 37500 }, { "epoch": 0.06015990374415401, "grad_norm": 247.0039520263672, "learning_rate": 1.202476760371834e-05, "loss": 1.5505, "step": 37600 }, { "epoch": 0.06031990348815442, "grad_norm": 15.698258399963379, "learning_rate": 1.2056767091726534e-05, "loss": 1.5472, "step": 37700 }, { "epoch": 0.06047990323215483, "grad_norm": 357.7092590332031, "learning_rate": 1.2088766579734725e-05, "loss": 2.0568, "step": 37800 }, { "epoch": 0.060639902976155236, "grad_norm": 54.52446365356445, "learning_rate": 1.2120766067742918e-05, "loss": 1.9219, "step": 37900 }, { "epoch": 0.060799902720155646, "grad_norm": 240.81784057617188, "learning_rate": 1.215276555575111e-05, "loss": 2.091, "step": 38000 }, { "epoch": 0.06095990246415606, "grad_norm": 0.14063161611557007, "learning_rate": 1.21847650437593e-05, "loss": 2.0523, "step": 38100 }, { "epoch": 0.06111990220815647, "grad_norm": 101.88555145263672, "learning_rate": 1.2216764531767494e-05, "loss": 1.7628, "step": 38200 }, { "epoch": 0.06127990195215688, "grad_norm": 1.7761729955673218, "learning_rate": 1.2248764019775685e-05, "loss": 1.8753, "step": 38300 }, { "epoch": 0.06143990169615728, "grad_norm": 183.46917724609375, "learning_rate": 1.2280763507783877e-05, "loss": 1.846, "step": 38400 }, { "epoch": 0.061599901440157694, "grad_norm": 0.008245576173067093, "learning_rate": 1.231276299579207e-05, "loss": 1.803, "step": 38500 }, { "epoch": 0.061759901184158104, "grad_norm": 389.3524169921875, "learning_rate": 1.2344762483800261e-05, "loss": 2.1226, "step": 38600 }, { "epoch": 0.061919900928158515, "grad_norm": 457.38519287109375, "learning_rate": 1.2376761971808452e-05, "loss": 2.0906, "step": 38700 }, { "epoch": 0.062079900672158926, "grad_norm": 95.94575500488281, "learning_rate": 1.2408441464936563e-05, "loss": 1.4321, "step": 38800 }, { "epoch": 0.06223990041615934, "grad_norm": 0.09420862793922424, "learning_rate": 1.2440440952944754e-05, "loss": 2.5214, "step": 38900 }, { "epoch": 0.06239990016015974, "grad_norm": 7.472883224487305, "learning_rate": 1.2472440440952947e-05, "loss": 1.5412, "step": 39000 }, { "epoch": 0.06255989990416015, "grad_norm": 198.42828369140625, "learning_rate": 1.2504439928961139e-05, "loss": 1.4382, "step": 39100 }, { "epoch": 0.06271989964816056, "grad_norm": 1.2646727561950684, "learning_rate": 1.253643941696933e-05, "loss": 1.8417, "step": 39200 }, { "epoch": 0.06287989939216097, "grad_norm": 85.20125579833984, "learning_rate": 1.2568438904977523e-05, "loss": 2.1105, "step": 39300 }, { "epoch": 0.06303989913616138, "grad_norm": 6.063973903656006, "learning_rate": 1.2600438392985714e-05, "loss": 1.6347, "step": 39400 }, { "epoch": 0.0631998988801618, "grad_norm": 1.7712761163711548, "learning_rate": 1.2632437880993904e-05, "loss": 2.0372, "step": 39500 }, { "epoch": 0.0633598986241622, "grad_norm": 105.22515106201172, "learning_rate": 1.2664437369002095e-05, "loss": 1.6222, "step": 39600 }, { "epoch": 0.06351989836816262, "grad_norm": 152.34910583496094, "learning_rate": 1.2696436857010288e-05, "loss": 1.8033, "step": 39700 }, { "epoch": 0.06367989811216301, "grad_norm": 0.4972204864025116, "learning_rate": 1.272843634501848e-05, "loss": 1.9847, "step": 39800 }, { "epoch": 0.06383989785616342, "grad_norm": 145.8481903076172, "learning_rate": 1.2760435833026673e-05, "loss": 2.1354, "step": 39900 }, { "epoch": 0.06399989760016384, "grad_norm": 0.4929490089416504, "learning_rate": 1.2792435321034864e-05, "loss": 1.6792, "step": 40000 }, { "epoch": 0.06415989734416425, "grad_norm": 0.004757192451506853, "learning_rate": 1.2824434809043055e-05, "loss": 2.1055, "step": 40100 }, { "epoch": 0.06431989708816466, "grad_norm": 133.86878967285156, "learning_rate": 1.2856434297051249e-05, "loss": 2.0657, "step": 40200 }, { "epoch": 0.06447989683216507, "grad_norm": 75.14216613769531, "learning_rate": 1.288843378505944e-05, "loss": 1.9618, "step": 40300 }, { "epoch": 0.06463989657616548, "grad_norm": 0.47655782103538513, "learning_rate": 1.2920433273067631e-05, "loss": 1.5807, "step": 40400 }, { "epoch": 0.06479989632016589, "grad_norm": 0.25797244906425476, "learning_rate": 1.2952432761075824e-05, "loss": 1.6451, "step": 40500 }, { "epoch": 0.0649598960641663, "grad_norm": 0.013840774074196815, "learning_rate": 1.2984432249084016e-05, "loss": 2.1299, "step": 40600 }, { "epoch": 0.06511989580816671, "grad_norm": 0.016265127807855606, "learning_rate": 1.3016431737092207e-05, "loss": 1.9912, "step": 40700 }, { "epoch": 0.06527989555216712, "grad_norm": 91.05821228027344, "learning_rate": 1.30484312251004e-05, "loss": 1.6392, "step": 40800 }, { "epoch": 0.06543989529616753, "grad_norm": 0.5753430724143982, "learning_rate": 1.3080430713108591e-05, "loss": 1.8049, "step": 40900 }, { "epoch": 0.06559989504016793, "grad_norm": 1.7056798934936523, "learning_rate": 1.3112430201116784e-05, "loss": 1.9832, "step": 41000 }, { "epoch": 0.06575989478416834, "grad_norm": 115.96708679199219, "learning_rate": 1.3144109694244893e-05, "loss": 2.0309, "step": 41100 }, { "epoch": 0.06591989452816875, "grad_norm": 128.86553955078125, "learning_rate": 1.3176109182253085e-05, "loss": 1.8362, "step": 41200 }, { "epoch": 0.06607989427216916, "grad_norm": 8.644057273864746, "learning_rate": 1.3208108670261278e-05, "loss": 2.2709, "step": 41300 }, { "epoch": 0.06623989401616957, "grad_norm": 105.3166732788086, "learning_rate": 1.3240108158269469e-05, "loss": 2.0785, "step": 41400 }, { "epoch": 0.06639989376016998, "grad_norm": 66.77593231201172, "learning_rate": 1.327210764627766e-05, "loss": 1.5627, "step": 41500 }, { "epoch": 0.0665598935041704, "grad_norm": 0.6800107359886169, "learning_rate": 1.3304107134285853e-05, "loss": 1.6058, "step": 41600 }, { "epoch": 0.0667198932481708, "grad_norm": 138.5995330810547, "learning_rate": 1.3336106622294045e-05, "loss": 1.7099, "step": 41700 }, { "epoch": 0.06687989299217122, "grad_norm": 0.2328547090291977, "learning_rate": 1.3368106110302238e-05, "loss": 1.7096, "step": 41800 }, { "epoch": 0.06703989273617163, "grad_norm": 82.12950897216797, "learning_rate": 1.3400105598310429e-05, "loss": 1.6429, "step": 41900 }, { "epoch": 0.06719989248017204, "grad_norm": 1.3431618213653564, "learning_rate": 1.343210508631862e-05, "loss": 1.2514, "step": 42000 }, { "epoch": 0.06735989222417244, "grad_norm": 30.410139083862305, "learning_rate": 1.3464104574326814e-05, "loss": 1.5746, "step": 42100 }, { "epoch": 0.06751989196817285, "grad_norm": 106.41495513916016, "learning_rate": 1.3496104062335005e-05, "loss": 1.7186, "step": 42200 }, { "epoch": 0.06767989171217326, "grad_norm": 200.46978759765625, "learning_rate": 1.3528103550343195e-05, "loss": 1.8152, "step": 42300 }, { "epoch": 0.06783989145617367, "grad_norm": 0.09822285175323486, "learning_rate": 1.3560103038351386e-05, "loss": 1.705, "step": 42400 }, { "epoch": 0.06799989120017408, "grad_norm": 438.903564453125, "learning_rate": 1.3592102526359579e-05, "loss": 1.6779, "step": 42500 }, { "epoch": 0.06815989094417449, "grad_norm": 0.03262553736567497, "learning_rate": 1.362410201436777e-05, "loss": 1.8157, "step": 42600 }, { "epoch": 0.0683198906881749, "grad_norm": 376.0313720703125, "learning_rate": 1.3656101502375962e-05, "loss": 1.8464, "step": 42700 }, { "epoch": 0.06847989043217531, "grad_norm": 29.421518325805664, "learning_rate": 1.3688100990384155e-05, "loss": 1.748, "step": 42800 }, { "epoch": 0.06863989017617572, "grad_norm": 183.51832580566406, "learning_rate": 1.3720100478392346e-05, "loss": 1.6836, "step": 42900 }, { "epoch": 0.06879988992017613, "grad_norm": 0.0013067092513665557, "learning_rate": 1.3752099966400539e-05, "loss": 1.65, "step": 43000 }, { "epoch": 0.06895988966417654, "grad_norm": 0.006181403063237667, "learning_rate": 1.378409945440873e-05, "loss": 1.5632, "step": 43100 }, { "epoch": 0.06911988940817694, "grad_norm": 0.134628027677536, "learning_rate": 1.3816098942416922e-05, "loss": 2.0987, "step": 43200 }, { "epoch": 0.06927988915217735, "grad_norm": 235.39088439941406, "learning_rate": 1.3848098430425115e-05, "loss": 1.5783, "step": 43300 }, { "epoch": 0.06943988889617776, "grad_norm": 89.28943634033203, "learning_rate": 1.3880097918433306e-05, "loss": 1.8029, "step": 43400 }, { "epoch": 0.06959988864017817, "grad_norm": 197.04258728027344, "learning_rate": 1.3911777411561415e-05, "loss": 1.7154, "step": 43500 }, { "epoch": 0.06975988838417858, "grad_norm": 96.05148315429688, "learning_rate": 1.3943776899569608e-05, "loss": 1.663, "step": 43600 }, { "epoch": 0.069919888128179, "grad_norm": 8.378194808959961, "learning_rate": 1.39757763875778e-05, "loss": 1.4403, "step": 43700 }, { "epoch": 0.0700798878721794, "grad_norm": 1932.4417724609375, "learning_rate": 1.4007775875585992e-05, "loss": 1.6513, "step": 43800 }, { "epoch": 0.07023988761617982, "grad_norm": 185.06163024902344, "learning_rate": 1.4039775363594184e-05, "loss": 2.2041, "step": 43900 }, { "epoch": 0.07039988736018023, "grad_norm": 1.7904412746429443, "learning_rate": 1.4071774851602375e-05, "loss": 2.3908, "step": 44000 }, { "epoch": 0.07055988710418064, "grad_norm": 0.07365602254867554, "learning_rate": 1.4103774339610568e-05, "loss": 1.7153, "step": 44100 }, { "epoch": 0.07071988684818105, "grad_norm": 117.01744842529297, "learning_rate": 1.413577382761876e-05, "loss": 2.2112, "step": 44200 }, { "epoch": 0.07087988659218145, "grad_norm": 509.897216796875, "learning_rate": 1.4167773315626951e-05, "loss": 1.8663, "step": 44300 }, { "epoch": 0.07103988633618186, "grad_norm": 188.78509521484375, "learning_rate": 1.4199772803635144e-05, "loss": 1.8206, "step": 44400 }, { "epoch": 0.07119988608018227, "grad_norm": 122.20122528076172, "learning_rate": 1.4231772291643335e-05, "loss": 2.2269, "step": 44500 }, { "epoch": 0.07135988582418268, "grad_norm": 894.0123901367188, "learning_rate": 1.4263771779651527e-05, "loss": 1.8159, "step": 44600 }, { "epoch": 0.07151988556818309, "grad_norm": 217.02325439453125, "learning_rate": 1.429577126765972e-05, "loss": 1.9257, "step": 44700 }, { "epoch": 0.0716798853121835, "grad_norm": 0.4048191010951996, "learning_rate": 1.4327770755667911e-05, "loss": 2.087, "step": 44800 }, { "epoch": 0.07183988505618391, "grad_norm": 85.02055358886719, "learning_rate": 1.4359770243676104e-05, "loss": 1.3623, "step": 44900 }, { "epoch": 0.07199988480018432, "grad_norm": 90.52297973632812, "learning_rate": 1.4391769731684295e-05, "loss": 1.5747, "step": 45000 }, { "epoch": 0.07215988454418473, "grad_norm": 28.14681053161621, "learning_rate": 1.4423769219692485e-05, "loss": 1.8051, "step": 45100 }, { "epoch": 0.07231988428818514, "grad_norm": 0.31391066312789917, "learning_rate": 1.4455768707700676e-05, "loss": 2.3691, "step": 45200 }, { "epoch": 0.07247988403218555, "grad_norm": 0.08174788951873779, "learning_rate": 1.448776819570887e-05, "loss": 2.1125, "step": 45300 }, { "epoch": 0.07263988377618597, "grad_norm": 107.4333267211914, "learning_rate": 1.4519767683717061e-05, "loss": 1.566, "step": 45400 }, { "epoch": 0.07279988352018636, "grad_norm": 119.36717224121094, "learning_rate": 1.4551767171725252e-05, "loss": 1.5042, "step": 45500 }, { "epoch": 0.07295988326418677, "grad_norm": 157.90757751464844, "learning_rate": 1.4583446664853364e-05, "loss": 1.9469, "step": 45600 }, { "epoch": 0.07311988300818718, "grad_norm": 172.0279998779297, "learning_rate": 1.4615446152861554e-05, "loss": 1.9346, "step": 45700 }, { "epoch": 0.0732798827521876, "grad_norm": 961.58935546875, "learning_rate": 1.4647445640869747e-05, "loss": 1.4362, "step": 45800 }, { "epoch": 0.073439882496188, "grad_norm": 0.0017953349743038416, "learning_rate": 1.4679445128877938e-05, "loss": 1.9164, "step": 45900 }, { "epoch": 0.07359988224018842, "grad_norm": 67.61031341552734, "learning_rate": 1.471144461688613e-05, "loss": 1.511, "step": 46000 }, { "epoch": 0.07375988198418883, "grad_norm": 223.40682983398438, "learning_rate": 1.4743444104894323e-05, "loss": 1.4523, "step": 46100 }, { "epoch": 0.07391988172818924, "grad_norm": 86.30171966552734, "learning_rate": 1.4775443592902514e-05, "loss": 1.1247, "step": 46200 }, { "epoch": 0.07407988147218965, "grad_norm": 147.9749755859375, "learning_rate": 1.4807443080910706e-05, "loss": 1.9694, "step": 46300 }, { "epoch": 0.07423988121619006, "grad_norm": 0.01015425007790327, "learning_rate": 1.4839442568918899e-05, "loss": 2.1909, "step": 46400 }, { "epoch": 0.07439988096019047, "grad_norm": 259.0996398925781, "learning_rate": 1.487144205692709e-05, "loss": 2.0247, "step": 46500 }, { "epoch": 0.07455988070419087, "grad_norm": 95.31226348876953, "learning_rate": 1.4903441544935281e-05, "loss": 1.2061, "step": 46600 }, { "epoch": 0.07471988044819128, "grad_norm": 173.83978271484375, "learning_rate": 1.4935441032943474e-05, "loss": 1.6151, "step": 46700 }, { "epoch": 0.07487988019219169, "grad_norm": 2.386795997619629, "learning_rate": 1.4967440520951666e-05, "loss": 1.6184, "step": 46800 }, { "epoch": 0.0750398799361921, "grad_norm": 429.2137756347656, "learning_rate": 1.4999440008959859e-05, "loss": 2.0375, "step": 46900 }, { "epoch": 0.07519987968019251, "grad_norm": 387.4931945800781, "learning_rate": 1.503143949696805e-05, "loss": 1.8357, "step": 47000 }, { "epoch": 0.07535987942419292, "grad_norm": 412.69384765625, "learning_rate": 1.5063438984976241e-05, "loss": 1.7605, "step": 47100 }, { "epoch": 0.07551987916819333, "grad_norm": 64.52519989013672, "learning_rate": 1.5095438472984435e-05, "loss": 2.1139, "step": 47200 }, { "epoch": 0.07567987891219374, "grad_norm": 2.706088066101074, "learning_rate": 1.5127437960992626e-05, "loss": 1.2971, "step": 47300 }, { "epoch": 0.07583987865619415, "grad_norm": 3.714489459991455, "learning_rate": 1.5159437449000817e-05, "loss": 1.7242, "step": 47400 }, { "epoch": 0.07599987840019456, "grad_norm": 5.220536708831787, "learning_rate": 1.519143693700901e-05, "loss": 1.2726, "step": 47500 }, { "epoch": 0.07615987814419498, "grad_norm": 133.04861450195312, "learning_rate": 1.5223436425017202e-05, "loss": 1.9947, "step": 47600 }, { "epoch": 0.07631987788819537, "grad_norm": 0.19635449349880219, "learning_rate": 1.5255435913025393e-05, "loss": 2.2796, "step": 47700 }, { "epoch": 0.07647987763219578, "grad_norm": 168.06861877441406, "learning_rate": 1.5287115406153503e-05, "loss": 1.6232, "step": 47800 }, { "epoch": 0.0766398773761962, "grad_norm": 0.004633053671568632, "learning_rate": 1.5319114894161697e-05, "loss": 1.3513, "step": 47900 }, { "epoch": 0.0767998771201966, "grad_norm": 0.0009558099554851651, "learning_rate": 1.5351114382169886e-05, "loss": 1.291, "step": 48000 }, { "epoch": 0.07695987686419702, "grad_norm": 1.2679246664047241, "learning_rate": 1.538311387017808e-05, "loss": 1.5954, "step": 48100 }, { "epoch": 0.07711987660819743, "grad_norm": 511.6206970214844, "learning_rate": 1.5415113358186272e-05, "loss": 1.6232, "step": 48200 }, { "epoch": 0.07727987635219784, "grad_norm": 190.66940307617188, "learning_rate": 1.5447112846194462e-05, "loss": 1.8858, "step": 48300 }, { "epoch": 0.07743987609619825, "grad_norm": 368.29022216796875, "learning_rate": 1.5479112334202655e-05, "loss": 1.6235, "step": 48400 }, { "epoch": 0.07759987584019866, "grad_norm": 0.00299979280680418, "learning_rate": 1.5511111822210848e-05, "loss": 1.9061, "step": 48500 }, { "epoch": 0.07775987558419907, "grad_norm": 96.37437438964844, "learning_rate": 1.5543111310219038e-05, "loss": 1.5919, "step": 48600 }, { "epoch": 0.07791987532819948, "grad_norm": 141.4491729736328, "learning_rate": 1.5575110798227227e-05, "loss": 1.8474, "step": 48700 }, { "epoch": 0.07807987507219988, "grad_norm": 9.810319900512695, "learning_rate": 1.560711028623542e-05, "loss": 1.7112, "step": 48800 }, { "epoch": 0.07823987481620029, "grad_norm": 0.20426060259342194, "learning_rate": 1.5639109774243613e-05, "loss": 1.8007, "step": 48900 }, { "epoch": 0.0783998745602007, "grad_norm": 2.3212544918060303, "learning_rate": 1.5671109262251803e-05, "loss": 1.7499, "step": 49000 }, { "epoch": 0.07855987430420111, "grad_norm": 2386.313232421875, "learning_rate": 1.5703108750259996e-05, "loss": 1.4046, "step": 49100 }, { "epoch": 0.07871987404820152, "grad_norm": 123.7901611328125, "learning_rate": 1.573510823826819e-05, "loss": 2.0843, "step": 49200 }, { "epoch": 0.07887987379220193, "grad_norm": 190.3510284423828, "learning_rate": 1.576710772627638e-05, "loss": 1.52, "step": 49300 }, { "epoch": 0.07903987353620234, "grad_norm": 0.0007205315632745624, "learning_rate": 1.5799107214284572e-05, "loss": 1.8708, "step": 49400 }, { "epoch": 0.07919987328020275, "grad_norm": 0.013503137975931168, "learning_rate": 1.5831106702292765e-05, "loss": 1.673, "step": 49500 }, { "epoch": 0.07935987302420316, "grad_norm": 65.62171936035156, "learning_rate": 1.5863106190300958e-05, "loss": 1.8457, "step": 49600 }, { "epoch": 0.07951987276820358, "grad_norm": 92.97589874267578, "learning_rate": 1.5895105678309148e-05, "loss": 1.5627, "step": 49700 }, { "epoch": 0.07967987251220399, "grad_norm": 21.424842834472656, "learning_rate": 1.5926785171437256e-05, "loss": 1.6497, "step": 49800 }, { "epoch": 0.07983987225620438, "grad_norm": 0.0011259341845288873, "learning_rate": 1.595878465944545e-05, "loss": 1.5787, "step": 49900 }, { "epoch": 0.0799998720002048, "grad_norm": 0.01448867842555046, "learning_rate": 1.5990784147453643e-05, "loss": 1.8507, "step": 50000 }, { "epoch": 0.0801598717442052, "grad_norm": 66.43374633789062, "learning_rate": 1.6022783635461832e-05, "loss": 1.4336, "step": 50100 }, { "epoch": 0.08031987148820562, "grad_norm": 27.450525283813477, "learning_rate": 1.6054783123470025e-05, "loss": 2.152, "step": 50200 }, { "epoch": 0.08047987123220603, "grad_norm": 33.92656707763672, "learning_rate": 1.6086782611478218e-05, "loss": 1.6311, "step": 50300 }, { "epoch": 0.08063987097620644, "grad_norm": 20.14742660522461, "learning_rate": 1.611878209948641e-05, "loss": 1.7442, "step": 50400 }, { "epoch": 0.08079987072020685, "grad_norm": 190.49342346191406, "learning_rate": 1.61507815874946e-05, "loss": 1.8063, "step": 50500 }, { "epoch": 0.08095987046420726, "grad_norm": 673.4315185546875, "learning_rate": 1.6182781075502794e-05, "loss": 1.4, "step": 50600 }, { "epoch": 0.08111987020820767, "grad_norm": 87.93296813964844, "learning_rate": 1.6214780563510987e-05, "loss": 1.6401, "step": 50700 }, { "epoch": 0.08127986995220808, "grad_norm": 10.013740539550781, "learning_rate": 1.6246780051519177e-05, "loss": 1.9426, "step": 50800 }, { "epoch": 0.08143986969620849, "grad_norm": 84.70770263671875, "learning_rate": 1.627877953952737e-05, "loss": 2.0937, "step": 50900 }, { "epoch": 0.0815998694402089, "grad_norm": 66.33674621582031, "learning_rate": 1.6310779027535563e-05, "loss": 1.8187, "step": 51000 }, { "epoch": 0.0817598691842093, "grad_norm": 0.013598043471574783, "learning_rate": 1.6342778515543753e-05, "loss": 2.1751, "step": 51100 }, { "epoch": 0.08191986892820971, "grad_norm": 0.8764291405677795, "learning_rate": 1.6374778003551946e-05, "loss": 2.1703, "step": 51200 }, { "epoch": 0.08207986867221012, "grad_norm": 14.436594009399414, "learning_rate": 1.640677749156014e-05, "loss": 1.4443, "step": 51300 }, { "epoch": 0.08223986841621053, "grad_norm": 0.27148157358169556, "learning_rate": 1.6438776979568328e-05, "loss": 1.9266, "step": 51400 }, { "epoch": 0.08239986816021094, "grad_norm": 11.139505386352539, "learning_rate": 1.6470776467576518e-05, "loss": 1.8226, "step": 51500 }, { "epoch": 0.08255986790421135, "grad_norm": 105.84121704101562, "learning_rate": 1.650277595558471e-05, "loss": 1.4394, "step": 51600 }, { "epoch": 0.08271986764821176, "grad_norm": 161.04141235351562, "learning_rate": 1.6534775443592904e-05, "loss": 1.052, "step": 51700 }, { "epoch": 0.08287986739221218, "grad_norm": 11.454148292541504, "learning_rate": 1.6566774931601094e-05, "loss": 1.0614, "step": 51800 }, { "epoch": 0.08303986713621259, "grad_norm": 0.03253089264035225, "learning_rate": 1.6598774419609287e-05, "loss": 1.4591, "step": 51900 }, { "epoch": 0.083199866880213, "grad_norm": 0.2750859558582306, "learning_rate": 1.663077390761748e-05, "loss": 1.6479, "step": 52000 }, { "epoch": 0.08335986662421341, "grad_norm": 3.381882667541504, "learning_rate": 1.666245340074559e-05, "loss": 1.7548, "step": 52100 }, { "epoch": 0.0835198663682138, "grad_norm": 102.45317840576172, "learning_rate": 1.669445288875378e-05, "loss": 1.6293, "step": 52200 }, { "epoch": 0.08367986611221422, "grad_norm": 122.25707244873047, "learning_rate": 1.672645237676197e-05, "loss": 1.7183, "step": 52300 }, { "epoch": 0.08383986585621463, "grad_norm": 1.0929410457611084, "learning_rate": 1.6758451864770164e-05, "loss": 1.2329, "step": 52400 }, { "epoch": 0.08399986560021504, "grad_norm": 0.0009238706552423537, "learning_rate": 1.6790451352778357e-05, "loss": 1.5292, "step": 52500 }, { "epoch": 0.08415986534421545, "grad_norm": 15.874957084655762, "learning_rate": 1.6822450840786547e-05, "loss": 1.6752, "step": 52600 }, { "epoch": 0.08431986508821586, "grad_norm": 8.129535675048828, "learning_rate": 1.685445032879474e-05, "loss": 1.3228, "step": 52700 }, { "epoch": 0.08447986483221627, "grad_norm": 196.6626434326172, "learning_rate": 1.6886449816802933e-05, "loss": 1.485, "step": 52800 }, { "epoch": 0.08463986457621668, "grad_norm": 257.9208679199219, "learning_rate": 1.6918449304811123e-05, "loss": 1.4228, "step": 52900 }, { "epoch": 0.08479986432021709, "grad_norm": 126.92493438720703, "learning_rate": 1.6950448792819316e-05, "loss": 1.1385, "step": 53000 }, { "epoch": 0.0849598640642175, "grad_norm": 218.52455139160156, "learning_rate": 1.698244828082751e-05, "loss": 1.1812, "step": 53100 }, { "epoch": 0.08511986380821791, "grad_norm": 0.32875338196754456, "learning_rate": 1.70144477688357e-05, "loss": 1.4763, "step": 53200 }, { "epoch": 0.08527986355221831, "grad_norm": 6.30516242980957, "learning_rate": 1.704644725684389e-05, "loss": 1.9444, "step": 53300 }, { "epoch": 0.08543986329621872, "grad_norm": 0.10023212432861328, "learning_rate": 1.7078446744852085e-05, "loss": 1.5316, "step": 53400 }, { "epoch": 0.08559986304021913, "grad_norm": 0.16311447322368622, "learning_rate": 1.7110446232860278e-05, "loss": 1.6928, "step": 53500 }, { "epoch": 0.08575986278421954, "grad_norm": 113.52496337890625, "learning_rate": 1.7142445720868467e-05, "loss": 1.4466, "step": 53600 }, { "epoch": 0.08591986252821995, "grad_norm": 202.2323760986328, "learning_rate": 1.717444520887666e-05, "loss": 1.438, "step": 53700 }, { "epoch": 0.08607986227222036, "grad_norm": 0.009465747512876987, "learning_rate": 1.7206444696884853e-05, "loss": 1.1629, "step": 53800 }, { "epoch": 0.08623986201622078, "grad_norm": 36.7415771484375, "learning_rate": 1.7238444184893043e-05, "loss": 1.3017, "step": 53900 }, { "epoch": 0.08639986176022119, "grad_norm": 238.30662536621094, "learning_rate": 1.7270443672901236e-05, "loss": 1.6614, "step": 54000 }, { "epoch": 0.0865598615042216, "grad_norm": 92.3208236694336, "learning_rate": 1.730244316090943e-05, "loss": 1.4535, "step": 54100 }, { "epoch": 0.08671986124822201, "grad_norm": 78.44776153564453, "learning_rate": 1.733444264891762e-05, "loss": 1.7061, "step": 54200 }, { "epoch": 0.08687986099222242, "grad_norm": 0.22147144377231598, "learning_rate": 1.736644213692581e-05, "loss": 1.4681, "step": 54300 }, { "epoch": 0.08703986073622282, "grad_norm": 11.244450569152832, "learning_rate": 1.7398441624934e-05, "loss": 1.3449, "step": 54400 }, { "epoch": 0.08719986048022323, "grad_norm": 0.002357147866860032, "learning_rate": 1.7430441112942195e-05, "loss": 1.8814, "step": 54500 }, { "epoch": 0.08735986022422364, "grad_norm": 0.008252524770796299, "learning_rate": 1.7462440600950384e-05, "loss": 1.5989, "step": 54600 }, { "epoch": 0.08751985996822405, "grad_norm": 61.25815200805664, "learning_rate": 1.7494440088958577e-05, "loss": 1.3711, "step": 54700 }, { "epoch": 0.08767985971222446, "grad_norm": 0.536085307598114, "learning_rate": 1.752643957696677e-05, "loss": 1.3199, "step": 54800 }, { "epoch": 0.08783985945622487, "grad_norm": 261.3522033691406, "learning_rate": 1.755811907009488e-05, "loss": 1.3713, "step": 54900 }, { "epoch": 0.08799985920022528, "grad_norm": 0.18219026923179626, "learning_rate": 1.7590118558103072e-05, "loss": 1.441, "step": 55000 }, { "epoch": 0.08815985894422569, "grad_norm": 2.0045886039733887, "learning_rate": 1.7622118046111262e-05, "loss": 1.268, "step": 55100 }, { "epoch": 0.0883198586882261, "grad_norm": 119.64205932617188, "learning_rate": 1.7654117534119455e-05, "loss": 1.1648, "step": 55200 }, { "epoch": 0.08847985843222651, "grad_norm": 1316.0831298828125, "learning_rate": 1.7686117022127648e-05, "loss": 1.8108, "step": 55300 }, { "epoch": 0.08863985817622692, "grad_norm": 0.016518862918019295, "learning_rate": 1.7718116510135838e-05, "loss": 1.4904, "step": 55400 }, { "epoch": 0.08879985792022732, "grad_norm": 0.0020672741811722517, "learning_rate": 1.774979600326395e-05, "loss": 1.2555, "step": 55500 }, { "epoch": 0.08895985766422773, "grad_norm": 0.0013748366618528962, "learning_rate": 1.778179549127214e-05, "loss": 1.2733, "step": 55600 }, { "epoch": 0.08911985740822814, "grad_norm": 156.89892578125, "learning_rate": 1.7813794979280332e-05, "loss": 1.5194, "step": 55700 }, { "epoch": 0.08927985715222855, "grad_norm": 0.034991975873708725, "learning_rate": 1.7845794467288526e-05, "loss": 1.7587, "step": 55800 }, { "epoch": 0.08943985689622896, "grad_norm": 25.602022171020508, "learning_rate": 1.7877793955296715e-05, "loss": 1.6183, "step": 55900 }, { "epoch": 0.08959985664022938, "grad_norm": 0.6393762230873108, "learning_rate": 1.7909793443304908e-05, "loss": 1.3596, "step": 56000 }, { "epoch": 0.08975985638422979, "grad_norm": 48.19321823120117, "learning_rate": 1.79417929313131e-05, "loss": 1.5248, "step": 56100 }, { "epoch": 0.0899198561282302, "grad_norm": 88.53876495361328, "learning_rate": 1.797379241932129e-05, "loss": 1.5177, "step": 56200 }, { "epoch": 0.09007985587223061, "grad_norm": 4.195464611053467, "learning_rate": 1.8005791907329484e-05, "loss": 1.7579, "step": 56300 }, { "epoch": 0.09023985561623102, "grad_norm": 4.309329986572266, "learning_rate": 1.8037791395337677e-05, "loss": 1.5508, "step": 56400 }, { "epoch": 0.09039985536023143, "grad_norm": 781.6338500976562, "learning_rate": 1.8069790883345867e-05, "loss": 1.5965, "step": 56500 }, { "epoch": 0.09055985510423184, "grad_norm": 0.2752499580383301, "learning_rate": 1.810179037135406e-05, "loss": 1.5762, "step": 56600 }, { "epoch": 0.09071985484823224, "grad_norm": 107.10926818847656, "learning_rate": 1.8133789859362253e-05, "loss": 1.7441, "step": 56700 }, { "epoch": 0.09087985459223265, "grad_norm": 97.79568481445312, "learning_rate": 1.8165789347370442e-05, "loss": 2.0257, "step": 56800 }, { "epoch": 0.09103985433623306, "grad_norm": 0.0016732424264773726, "learning_rate": 1.8197788835378635e-05, "loss": 1.1371, "step": 56900 }, { "epoch": 0.09119985408023347, "grad_norm": 7.662989139556885, "learning_rate": 1.822978832338683e-05, "loss": 1.8825, "step": 57000 }, { "epoch": 0.09135985382423388, "grad_norm": 16.23094940185547, "learning_rate": 1.8261787811395018e-05, "loss": 1.0455, "step": 57100 }, { "epoch": 0.09151985356823429, "grad_norm": 0.025669243186712265, "learning_rate": 1.829378729940321e-05, "loss": 1.5889, "step": 57200 }, { "epoch": 0.0916798533122347, "grad_norm": 0.4638320505619049, "learning_rate": 1.8325786787411404e-05, "loss": 1.192, "step": 57300 }, { "epoch": 0.09183985305623511, "grad_norm": 61.32036209106445, "learning_rate": 1.8357786275419597e-05, "loss": 1.5374, "step": 57400 }, { "epoch": 0.09199985280023552, "grad_norm": 0.0012545910431072116, "learning_rate": 1.8389785763427787e-05, "loss": 1.6236, "step": 57500 }, { "epoch": 0.09215985254423593, "grad_norm": 92.61576080322266, "learning_rate": 1.842178525143598e-05, "loss": 1.8945, "step": 57600 }, { "epoch": 0.09231985228823635, "grad_norm": 22.349824905395508, "learning_rate": 1.845378473944417e-05, "loss": 1.607, "step": 57700 }, { "epoch": 0.09247985203223674, "grad_norm": 126.0189208984375, "learning_rate": 1.8485784227452363e-05, "loss": 1.8133, "step": 57800 }, { "epoch": 0.09263985177623715, "grad_norm": 78.0487060546875, "learning_rate": 1.8517783715460552e-05, "loss": 1.5777, "step": 57900 }, { "epoch": 0.09279985152023756, "grad_norm": 0.0007238721009343863, "learning_rate": 1.8549783203468745e-05, "loss": 1.5043, "step": 58000 }, { "epoch": 0.09295985126423797, "grad_norm": 123.57425689697266, "learning_rate": 1.858178269147694e-05, "loss": 1.7681, "step": 58100 }, { "epoch": 0.09311985100823839, "grad_norm": 0.2985190153121948, "learning_rate": 1.8613782179485128e-05, "loss": 1.623, "step": 58200 }, { "epoch": 0.0932798507522388, "grad_norm": 64.27520751953125, "learning_rate": 1.864578166749332e-05, "loss": 2.2137, "step": 58300 }, { "epoch": 0.09343985049623921, "grad_norm": 207.88841247558594, "learning_rate": 1.8677781155501514e-05, "loss": 2.2447, "step": 58400 }, { "epoch": 0.09359985024023962, "grad_norm": 94.58351135253906, "learning_rate": 1.8709780643509704e-05, "loss": 2.3013, "step": 58500 }, { "epoch": 0.09375984998424003, "grad_norm": 0.29524990916252136, "learning_rate": 1.8741780131517897e-05, "loss": 1.3105, "step": 58600 }, { "epoch": 0.09391984972824044, "grad_norm": 110.85052490234375, "learning_rate": 1.877377961952609e-05, "loss": 1.4461, "step": 58700 }, { "epoch": 0.09407984947224085, "grad_norm": 44.61641311645508, "learning_rate": 1.880577910753428e-05, "loss": 2.1321, "step": 58800 }, { "epoch": 0.09423984921624125, "grad_norm": 699.353759765625, "learning_rate": 1.8837778595542473e-05, "loss": 1.7541, "step": 58900 }, { "epoch": 0.09439984896024166, "grad_norm": 88.22161865234375, "learning_rate": 1.8869778083550666e-05, "loss": 1.7894, "step": 59000 }, { "epoch": 0.09455984870424207, "grad_norm": 25.957782745361328, "learning_rate": 1.8901777571558855e-05, "loss": 1.693, "step": 59100 }, { "epoch": 0.09471984844824248, "grad_norm": 1.7580465078353882, "learning_rate": 1.8933457064686968e-05, "loss": 1.7073, "step": 59200 }, { "epoch": 0.09487984819224289, "grad_norm": 5.568783283233643, "learning_rate": 1.8965456552695157e-05, "loss": 2.0305, "step": 59300 }, { "epoch": 0.0950398479362433, "grad_norm": 0.21757324039936066, "learning_rate": 1.899745604070335e-05, "loss": 1.3684, "step": 59400 }, { "epoch": 0.09519984768024371, "grad_norm": 123.5767593383789, "learning_rate": 1.9029455528711543e-05, "loss": 1.8754, "step": 59500 }, { "epoch": 0.09535984742424412, "grad_norm": 66.91508483886719, "learning_rate": 1.9061455016719733e-05, "loss": 2.0225, "step": 59600 }, { "epoch": 0.09551984716824453, "grad_norm": 0.00018894312961492687, "learning_rate": 1.9093454504727926e-05, "loss": 2.1975, "step": 59700 }, { "epoch": 0.09567984691224495, "grad_norm": 84.60813903808594, "learning_rate": 1.912545399273612e-05, "loss": 1.7173, "step": 59800 }, { "epoch": 0.09583984665624536, "grad_norm": 67.51477813720703, "learning_rate": 1.915745348074431e-05, "loss": 1.4302, "step": 59900 }, { "epoch": 0.09599984640024575, "grad_norm": 59.47672653198242, "learning_rate": 1.9189452968752502e-05, "loss": 1.2497, "step": 60000 }, { "epoch": 0.09615984614424616, "grad_norm": 19.75477409362793, "learning_rate": 1.9221452456760695e-05, "loss": 1.4058, "step": 60100 }, { "epoch": 0.09631984588824657, "grad_norm": 91.08583068847656, "learning_rate": 1.9253451944768885e-05, "loss": 1.0956, "step": 60200 }, { "epoch": 0.09647984563224699, "grad_norm": 133.5473175048828, "learning_rate": 1.9285451432777078e-05, "loss": 1.3731, "step": 60300 }, { "epoch": 0.0966398453762474, "grad_norm": 0.010973370634019375, "learning_rate": 1.931745092078527e-05, "loss": 1.2953, "step": 60400 }, { "epoch": 0.09679984512024781, "grad_norm": 0.08579988777637482, "learning_rate": 1.934945040879346e-05, "loss": 1.0987, "step": 60500 }, { "epoch": 0.09695984486424822, "grad_norm": 1.1617801189422607, "learning_rate": 1.9381449896801653e-05, "loss": 1.5104, "step": 60600 }, { "epoch": 0.09711984460824863, "grad_norm": 0.3544386029243469, "learning_rate": 1.9413449384809843e-05, "loss": 1.5224, "step": 60700 }, { "epoch": 0.09727984435224904, "grad_norm": 44.28148651123047, "learning_rate": 1.9445448872818036e-05, "loss": 1.3982, "step": 60800 }, { "epoch": 0.09743984409624945, "grad_norm": 167.9832305908203, "learning_rate": 1.947744836082623e-05, "loss": 1.2785, "step": 60900 }, { "epoch": 0.09759984384024986, "grad_norm": 129.58119201660156, "learning_rate": 1.950944784883442e-05, "loss": 1.6018, "step": 61000 }, { "epoch": 0.09775984358425026, "grad_norm": 0.048871856182813644, "learning_rate": 1.9541447336842612e-05, "loss": 1.4968, "step": 61100 }, { "epoch": 0.09791984332825067, "grad_norm": 0.14592894911766052, "learning_rate": 1.9573446824850805e-05, "loss": 1.2423, "step": 61200 }, { "epoch": 0.09807984307225108, "grad_norm": 0.548117458820343, "learning_rate": 1.9605446312858995e-05, "loss": 1.9973, "step": 61300 }, { "epoch": 0.09823984281625149, "grad_norm": 52.4393424987793, "learning_rate": 1.9637445800867188e-05, "loss": 1.2149, "step": 61400 }, { "epoch": 0.0983998425602519, "grad_norm": 101.79759216308594, "learning_rate": 1.966944528887538e-05, "loss": 1.731, "step": 61500 }, { "epoch": 0.09855984230425231, "grad_norm": 0.04176723212003708, "learning_rate": 1.970144477688357e-05, "loss": 1.2889, "step": 61600 }, { "epoch": 0.09871984204825272, "grad_norm": 8.380585670471191, "learning_rate": 1.9733444264891763e-05, "loss": 1.856, "step": 61700 }, { "epoch": 0.09887984179225313, "grad_norm": 0.014852323569357395, "learning_rate": 1.9765443752899956e-05, "loss": 0.8942, "step": 61800 }, { "epoch": 0.09903984153625355, "grad_norm": 0.3229600787162781, "learning_rate": 1.9797443240908146e-05, "loss": 1.3371, "step": 61900 }, { "epoch": 0.09919984128025396, "grad_norm": 141.8211212158203, "learning_rate": 1.982944272891634e-05, "loss": 1.5222, "step": 62000 }, { "epoch": 0.09935984102425437, "grad_norm": 0.025253353640437126, "learning_rate": 1.9861442216924532e-05, "loss": 1.5435, "step": 62100 }, { "epoch": 0.09951984076825478, "grad_norm": 0.0009790909243747592, "learning_rate": 1.9893441704932722e-05, "loss": 1.1172, "step": 62200 }, { "epoch": 0.09967984051225517, "grad_norm": 43.73761749267578, "learning_rate": 1.9925441192940915e-05, "loss": 1.6024, "step": 62300 }, { "epoch": 0.09983984025625559, "grad_norm": 18.400936126708984, "learning_rate": 1.9957440680949108e-05, "loss": 1.3914, "step": 62400 }, { "epoch": 0.099999840000256, "grad_norm": 937.4790649414062, "learning_rate": 1.9989440168957298e-05, "loss": 1.4714, "step": 62500 }, { "epoch": 0.10015983974425641, "grad_norm": 0.14736099541187286, "learning_rate": 1.999761777777778e-05, "loss": 1.2922, "step": 62600 }, { "epoch": 0.10031983948825682, "grad_norm": 0.606549084186554, "learning_rate": 1.999409777777778e-05, "loss": 1.4263, "step": 62700 }, { "epoch": 0.10047983923225723, "grad_norm": 426.79400634765625, "learning_rate": 1.9990542222222224e-05, "loss": 1.4586, "step": 62800 }, { "epoch": 0.10063983897625764, "grad_norm": 0.007887039333581924, "learning_rate": 1.9986986666666668e-05, "loss": 1.6312, "step": 62900 }, { "epoch": 0.10079983872025805, "grad_norm": 0.9442864060401917, "learning_rate": 1.9983431111111113e-05, "loss": 1.9607, "step": 63000 }, { "epoch": 0.10095983846425846, "grad_norm": 0.002317711478099227, "learning_rate": 1.9979875555555557e-05, "loss": 1.5771, "step": 63100 }, { "epoch": 0.10111983820825887, "grad_norm": 75.09770965576172, "learning_rate": 1.9976320000000002e-05, "loss": 1.6721, "step": 63200 }, { "epoch": 0.10127983795225928, "grad_norm": 135.64022827148438, "learning_rate": 1.9972764444444446e-05, "loss": 1.8461, "step": 63300 }, { "epoch": 0.10143983769625968, "grad_norm": 0.1608121395111084, "learning_rate": 1.996920888888889e-05, "loss": 1.5256, "step": 63400 }, { "epoch": 0.10159983744026009, "grad_norm": 266.8143615722656, "learning_rate": 1.9965653333333336e-05, "loss": 1.9736, "step": 63500 }, { "epoch": 0.1017598371842605, "grad_norm": 125.29386138916016, "learning_rate": 1.996209777777778e-05, "loss": 1.4735, "step": 63600 }, { "epoch": 0.10191983692826091, "grad_norm": 1.8401292562484741, "learning_rate": 1.9958542222222225e-05, "loss": 1.4619, "step": 63700 }, { "epoch": 0.10207983667226132, "grad_norm": 352.0743103027344, "learning_rate": 1.995498666666667e-05, "loss": 1.6571, "step": 63800 }, { "epoch": 0.10223983641626173, "grad_norm": 546.5570068359375, "learning_rate": 1.9951431111111114e-05, "loss": 1.5888, "step": 63900 }, { "epoch": 0.10239983616026214, "grad_norm": 0.0009566646185703576, "learning_rate": 1.994787555555556e-05, "loss": 2.0457, "step": 64000 }, { "epoch": 0.10255983590426256, "grad_norm": 717.4028930664062, "learning_rate": 1.9944320000000003e-05, "loss": 1.7843, "step": 64100 }, { "epoch": 0.10271983564826297, "grad_norm": 0.16068622469902039, "learning_rate": 1.9940764444444447e-05, "loss": 1.5116, "step": 64200 }, { "epoch": 0.10287983539226338, "grad_norm": 69.0772705078125, "learning_rate": 1.9937208888888892e-05, "loss": 1.6682, "step": 64300 }, { "epoch": 0.10303983513626379, "grad_norm": 0.0007585228304378688, "learning_rate": 1.9933653333333337e-05, "loss": 1.2137, "step": 64400 }, { "epoch": 0.10319983488026419, "grad_norm": 140.95750427246094, "learning_rate": 1.9930097777777778e-05, "loss": 1.1308, "step": 64500 }, { "epoch": 0.1033598346242646, "grad_norm": 1.2280133962631226, "learning_rate": 1.9926542222222226e-05, "loss": 2.031, "step": 64600 }, { "epoch": 0.103519834368265, "grad_norm": 51.01097106933594, "learning_rate": 1.9922986666666667e-05, "loss": 1.6903, "step": 64700 }, { "epoch": 0.10367983411226542, "grad_norm": 354.4974365234375, "learning_rate": 1.9919431111111115e-05, "loss": 1.3365, "step": 64800 }, { "epoch": 0.10383983385626583, "grad_norm": 98.43709564208984, "learning_rate": 1.9915875555555556e-05, "loss": 1.5736, "step": 64900 }, { "epoch": 0.10399983360026624, "grad_norm": 176.26661682128906, "learning_rate": 1.991232e-05, "loss": 1.7264, "step": 65000 }, { "epoch": 0.10415983334426665, "grad_norm": 55.52714920043945, "learning_rate": 1.9908764444444445e-05, "loss": 1.1781, "step": 65100 }, { "epoch": 0.10431983308826706, "grad_norm": 0.0009407736943103373, "learning_rate": 1.990520888888889e-05, "loss": 1.2503, "step": 65200 }, { "epoch": 0.10447983283226747, "grad_norm": 0.001376794883981347, "learning_rate": 1.9901653333333334e-05, "loss": 0.9432, "step": 65300 }, { "epoch": 0.10463983257626788, "grad_norm": 52.175819396972656, "learning_rate": 1.9898133333333335e-05, "loss": 1.264, "step": 65400 }, { "epoch": 0.1047998323202683, "grad_norm": 147.7506866455078, "learning_rate": 1.989457777777778e-05, "loss": 1.2086, "step": 65500 }, { "epoch": 0.10495983206426869, "grad_norm": 31.214550018310547, "learning_rate": 1.9891022222222224e-05, "loss": 1.8692, "step": 65600 }, { "epoch": 0.1051198318082691, "grad_norm": 168.40858459472656, "learning_rate": 1.988746666666667e-05, "loss": 1.2745, "step": 65700 }, { "epoch": 0.10527983155226951, "grad_norm": 401.31842041015625, "learning_rate": 1.9883911111111113e-05, "loss": 1.6839, "step": 65800 }, { "epoch": 0.10543983129626992, "grad_norm": 139.64588928222656, "learning_rate": 1.9880355555555558e-05, "loss": 1.4509, "step": 65900 }, { "epoch": 0.10559983104027033, "grad_norm": 125.26469421386719, "learning_rate": 1.98768e-05, "loss": 1.1615, "step": 66000 }, { "epoch": 0.10575983078427074, "grad_norm": 0.1609152853488922, "learning_rate": 1.9873244444444447e-05, "loss": 1.4458, "step": 66100 }, { "epoch": 0.10591983052827116, "grad_norm": 0.000580300809815526, "learning_rate": 1.9869688888888888e-05, "loss": 1.8329, "step": 66200 }, { "epoch": 0.10607983027227157, "grad_norm": 101.93132781982422, "learning_rate": 1.9866133333333336e-05, "loss": 1.567, "step": 66300 }, { "epoch": 0.10623983001627198, "grad_norm": 99.72083282470703, "learning_rate": 1.9862577777777777e-05, "loss": 1.6746, "step": 66400 }, { "epoch": 0.10639982976027239, "grad_norm": 1651.70263671875, "learning_rate": 1.9859022222222225e-05, "loss": 1.65, "step": 66500 }, { "epoch": 0.1065598295042728, "grad_norm": 132.80343627929688, "learning_rate": 1.9855466666666666e-05, "loss": 1.5497, "step": 66600 }, { "epoch": 0.1067198292482732, "grad_norm": 0.004364237189292908, "learning_rate": 1.9851911111111114e-05, "loss": 1.4009, "step": 66700 }, { "epoch": 0.1068798289922736, "grad_norm": 4.1050825119018555, "learning_rate": 1.9848355555555556e-05, "loss": 2.058, "step": 66800 }, { "epoch": 0.10703982873627402, "grad_norm": 0.047410767525434494, "learning_rate": 1.9844800000000004e-05, "loss": 1.6306, "step": 66900 }, { "epoch": 0.10719982848027443, "grad_norm": 6.651243686676025, "learning_rate": 1.9841244444444445e-05, "loss": 1.4377, "step": 67000 }, { "epoch": 0.10735982822427484, "grad_norm": 0.010524190030992031, "learning_rate": 1.983768888888889e-05, "loss": 1.4501, "step": 67100 }, { "epoch": 0.10751982796827525, "grad_norm": 0.0009414692758582532, "learning_rate": 1.9834133333333334e-05, "loss": 1.2648, "step": 67200 }, { "epoch": 0.10767982771227566, "grad_norm": 1019.7636108398438, "learning_rate": 1.983057777777778e-05, "loss": 1.3186, "step": 67300 }, { "epoch": 0.10783982745627607, "grad_norm": 0.006541971582919359, "learning_rate": 1.982705777777778e-05, "loss": 1.1313, "step": 67400 }, { "epoch": 0.10799982720027648, "grad_norm": 323.63226318359375, "learning_rate": 1.9823502222222224e-05, "loss": 2.2523, "step": 67500 }, { "epoch": 0.1081598269442769, "grad_norm": 114.84791564941406, "learning_rate": 1.981994666666667e-05, "loss": 1.9146, "step": 67600 }, { "epoch": 0.1083198266882773, "grad_norm": 4.059427738189697, "learning_rate": 1.9816391111111113e-05, "loss": 1.7334, "step": 67700 }, { "epoch": 0.10847982643227772, "grad_norm": 3.274331569671631, "learning_rate": 1.9812835555555558e-05, "loss": 1.7195, "step": 67800 }, { "epoch": 0.10863982617627811, "grad_norm": 0.0480005145072937, "learning_rate": 1.9809280000000002e-05, "loss": 1.4661, "step": 67900 }, { "epoch": 0.10879982592027852, "grad_norm": 45.43354415893555, "learning_rate": 1.9805724444444447e-05, "loss": 1.3503, "step": 68000 }, { "epoch": 0.10895982566427893, "grad_norm": 0.0006582220084965229, "learning_rate": 1.980216888888889e-05, "loss": 1.0129, "step": 68100 }, { "epoch": 0.10911982540827934, "grad_norm": 111.87661743164062, "learning_rate": 1.9798613333333332e-05, "loss": 1.6036, "step": 68200 }, { "epoch": 0.10927982515227976, "grad_norm": 122.35249328613281, "learning_rate": 1.979505777777778e-05, "loss": 0.9312, "step": 68300 }, { "epoch": 0.10943982489628017, "grad_norm": 0.5635089874267578, "learning_rate": 1.979150222222222e-05, "loss": 1.5817, "step": 68400 }, { "epoch": 0.10959982464028058, "grad_norm": 2.6275858879089355, "learning_rate": 1.978794666666667e-05, "loss": 1.2024, "step": 68500 }, { "epoch": 0.10975982438428099, "grad_norm": 0.6521372199058533, "learning_rate": 1.978439111111111e-05, "loss": 0.985, "step": 68600 }, { "epoch": 0.1099198241282814, "grad_norm": 2.0386836528778076, "learning_rate": 1.978083555555556e-05, "loss": 1.1712, "step": 68700 }, { "epoch": 0.11007982387228181, "grad_norm": 132.18045043945312, "learning_rate": 1.977728e-05, "loss": 1.5874, "step": 68800 }, { "epoch": 0.11023982361628222, "grad_norm": 0.0068659852258861065, "learning_rate": 1.9773724444444448e-05, "loss": 1.8551, "step": 68900 }, { "epoch": 0.11039982336028262, "grad_norm": 84.89590454101562, "learning_rate": 1.977016888888889e-05, "loss": 1.232, "step": 69000 }, { "epoch": 0.11055982310428303, "grad_norm": 191.7918243408203, "learning_rate": 1.9766613333333337e-05, "loss": 1.4688, "step": 69100 }, { "epoch": 0.11071982284828344, "grad_norm": 10.109711647033691, "learning_rate": 1.9763057777777778e-05, "loss": 1.1107, "step": 69200 }, { "epoch": 0.11087982259228385, "grad_norm": 63.09272766113281, "learning_rate": 1.9759502222222226e-05, "loss": 1.6495, "step": 69300 }, { "epoch": 0.11103982233628426, "grad_norm": 66.24422454833984, "learning_rate": 1.9755946666666667e-05, "loss": 1.6278, "step": 69400 }, { "epoch": 0.11119982208028467, "grad_norm": 95.56941223144531, "learning_rate": 1.975239111111111e-05, "loss": 1.7135, "step": 69500 }, { "epoch": 0.11135982182428508, "grad_norm": 324.1082763671875, "learning_rate": 1.9748835555555556e-05, "loss": 1.5108, "step": 69600 }, { "epoch": 0.1115198215682855, "grad_norm": 123.81194305419922, "learning_rate": 1.974528e-05, "loss": 1.4056, "step": 69700 }, { "epoch": 0.1116798213122859, "grad_norm": 0.9657291769981384, "learning_rate": 1.9741724444444445e-05, "loss": 0.9324, "step": 69800 }, { "epoch": 0.11183982105628631, "grad_norm": 21.34449005126953, "learning_rate": 1.973816888888889e-05, "loss": 1.3613, "step": 69900 }, { "epoch": 0.11199982080028673, "grad_norm": 51.35042953491211, "learning_rate": 1.9734613333333334e-05, "loss": 1.5283, "step": 70000 }, { "epoch": 0.11215982054428712, "grad_norm": 16.527353286743164, "learning_rate": 1.973105777777778e-05, "loss": 1.3809, "step": 70100 }, { "epoch": 0.11231982028828753, "grad_norm": 0.032987259328365326, "learning_rate": 1.9727502222222224e-05, "loss": 1.5552, "step": 70200 }, { "epoch": 0.11247982003228794, "grad_norm": 96.99321746826172, "learning_rate": 1.9723946666666668e-05, "loss": 1.4567, "step": 70300 }, { "epoch": 0.11263981977628836, "grad_norm": 291.11444091796875, "learning_rate": 1.9720391111111113e-05, "loss": 1.4404, "step": 70400 }, { "epoch": 0.11279981952028877, "grad_norm": 1.33843195438385, "learning_rate": 1.9716835555555557e-05, "loss": 1.1805, "step": 70500 }, { "epoch": 0.11295981926428918, "grad_norm": 340.20355224609375, "learning_rate": 1.9713280000000002e-05, "loss": 2.514, "step": 70600 }, { "epoch": 0.11311981900828959, "grad_norm": 0.20241181552410126, "learning_rate": 1.9709724444444446e-05, "loss": 1.4821, "step": 70700 }, { "epoch": 0.11327981875229, "grad_norm": 0.044828109443187714, "learning_rate": 1.970616888888889e-05, "loss": 1.5156, "step": 70800 }, { "epoch": 0.11343981849629041, "grad_norm": 121.0267105102539, "learning_rate": 1.9702648888888892e-05, "loss": 1.5925, "step": 70900 }, { "epoch": 0.11359981824029082, "grad_norm": 1217.32373046875, "learning_rate": 1.9699093333333333e-05, "loss": 1.9517, "step": 71000 }, { "epoch": 0.11375981798429123, "grad_norm": 102.1255111694336, "learning_rate": 1.969553777777778e-05, "loss": 1.2685, "step": 71100 }, { "epoch": 0.11391981772829163, "grad_norm": 0.0009581278427504003, "learning_rate": 1.9691982222222222e-05, "loss": 1.6314, "step": 71200 }, { "epoch": 0.11407981747229204, "grad_norm": 105.03948974609375, "learning_rate": 1.968842666666667e-05, "loss": 1.5252, "step": 71300 }, { "epoch": 0.11423981721629245, "grad_norm": 2.6692819595336914, "learning_rate": 1.968487111111111e-05, "loss": 1.5176, "step": 71400 }, { "epoch": 0.11439981696029286, "grad_norm": 93.15460205078125, "learning_rate": 1.968131555555556e-05, "loss": 1.3461, "step": 71500 }, { "epoch": 0.11455981670429327, "grad_norm": 37.849117279052734, "learning_rate": 1.967776e-05, "loss": 1.3832, "step": 71600 }, { "epoch": 0.11471981644829368, "grad_norm": 3.6809959411621094, "learning_rate": 1.967420444444445e-05, "loss": 1.2962, "step": 71700 }, { "epoch": 0.11487981619229409, "grad_norm": 40.560264587402344, "learning_rate": 1.967064888888889e-05, "loss": 1.5179, "step": 71800 }, { "epoch": 0.1150398159362945, "grad_norm": 0.17644475400447845, "learning_rate": 1.9667093333333334e-05, "loss": 1.1041, "step": 71900 }, { "epoch": 0.11519981568029491, "grad_norm": 0.05514904111623764, "learning_rate": 1.966353777777778e-05, "loss": 1.5031, "step": 72000 }, { "epoch": 0.11535981542429533, "grad_norm": 23.659364700317383, "learning_rate": 1.9659982222222223e-05, "loss": 1.5412, "step": 72100 }, { "epoch": 0.11551981516829574, "grad_norm": 0.0025822233874350786, "learning_rate": 1.9656426666666668e-05, "loss": 1.2971, "step": 72200 }, { "epoch": 0.11567981491229615, "grad_norm": 2.382300853729248, "learning_rate": 1.9652871111111112e-05, "loss": 1.0979, "step": 72300 }, { "epoch": 0.11583981465629654, "grad_norm": 150.96646118164062, "learning_rate": 1.9649315555555557e-05, "loss": 1.307, "step": 72400 }, { "epoch": 0.11599981440029696, "grad_norm": 0.022950541228055954, "learning_rate": 1.964576e-05, "loss": 1.3418, "step": 72500 }, { "epoch": 0.11615981414429737, "grad_norm": 21.7007999420166, "learning_rate": 1.9642204444444446e-05, "loss": 1.7298, "step": 72600 }, { "epoch": 0.11631981388829778, "grad_norm": 100.60992431640625, "learning_rate": 1.963864888888889e-05, "loss": 1.68, "step": 72700 }, { "epoch": 0.11647981363229819, "grad_norm": 104.22400665283203, "learning_rate": 1.9635093333333335e-05, "loss": 1.3106, "step": 72800 }, { "epoch": 0.1166398133762986, "grad_norm": 0.01572352461516857, "learning_rate": 1.963153777777778e-05, "loss": 1.0954, "step": 72900 }, { "epoch": 0.11679981312029901, "grad_norm": 0.1720964014530182, "learning_rate": 1.9627982222222224e-05, "loss": 1.5994, "step": 73000 }, { "epoch": 0.11695981286429942, "grad_norm": 90.89932250976562, "learning_rate": 1.962442666666667e-05, "loss": 1.5953, "step": 73100 }, { "epoch": 0.11711981260829983, "grad_norm": 94.24946594238281, "learning_rate": 1.9620871111111113e-05, "loss": 1.9498, "step": 73200 }, { "epoch": 0.11727981235230024, "grad_norm": 0.08061110228300095, "learning_rate": 1.9617315555555554e-05, "loss": 0.9937, "step": 73300 }, { "epoch": 0.11743981209630065, "grad_norm": 9.990059852600098, "learning_rate": 1.9613760000000002e-05, "loss": 1.4753, "step": 73400 }, { "epoch": 0.11759981184030105, "grad_norm": 1.572757601737976, "learning_rate": 1.9610204444444444e-05, "loss": 1.417, "step": 73500 }, { "epoch": 0.11775981158430146, "grad_norm": 23.618915557861328, "learning_rate": 1.960664888888889e-05, "loss": 1.596, "step": 73600 }, { "epoch": 0.11791981132830187, "grad_norm": 90.75736999511719, "learning_rate": 1.9603093333333333e-05, "loss": 1.8794, "step": 73700 }, { "epoch": 0.11807981107230228, "grad_norm": 0.07401008158922195, "learning_rate": 1.959953777777778e-05, "loss": 1.3118, "step": 73800 }, { "epoch": 0.11823981081630269, "grad_norm": 80.61852264404297, "learning_rate": 1.9595982222222222e-05, "loss": 1.732, "step": 73900 }, { "epoch": 0.1183998105603031, "grad_norm": 0.18783807754516602, "learning_rate": 1.959242666666667e-05, "loss": 1.4504, "step": 74000 }, { "epoch": 0.11855981030430351, "grad_norm": 0.0010747779160737991, "learning_rate": 1.958887111111111e-05, "loss": 1.0878, "step": 74100 }, { "epoch": 0.11871981004830393, "grad_norm": 0.0007994744810275733, "learning_rate": 1.958531555555556e-05, "loss": 1.2488, "step": 74200 }, { "epoch": 0.11887980979230434, "grad_norm": 16.047822952270508, "learning_rate": 1.958176e-05, "loss": 1.3887, "step": 74300 }, { "epoch": 0.11903980953630475, "grad_norm": 105.44868469238281, "learning_rate": 1.9578204444444448e-05, "loss": 1.2265, "step": 74400 }, { "epoch": 0.11919980928030516, "grad_norm": 0.0008991442155092955, "learning_rate": 1.957464888888889e-05, "loss": 1.4668, "step": 74500 }, { "epoch": 0.11935980902430555, "grad_norm": 0.04507048800587654, "learning_rate": 1.9571093333333334e-05, "loss": 1.6258, "step": 74600 }, { "epoch": 0.11951980876830597, "grad_norm": 0.00036178340087644756, "learning_rate": 1.9567537777777778e-05, "loss": 1.9551, "step": 74700 }, { "epoch": 0.11967980851230638, "grad_norm": 0.0032805718947201967, "learning_rate": 1.9563982222222223e-05, "loss": 1.1811, "step": 74800 }, { "epoch": 0.11983980825630679, "grad_norm": 126.1537857055664, "learning_rate": 1.9560426666666667e-05, "loss": 1.2119, "step": 74900 }, { "epoch": 0.1199998080003072, "grad_norm": 1.2026222944259644, "learning_rate": 1.9556871111111112e-05, "loss": 1.4051, "step": 75000 }, { "epoch": 0.12015980774430761, "grad_norm": 1.8911128044128418, "learning_rate": 1.9553351111111113e-05, "loss": 1.2587, "step": 75100 }, { "epoch": 0.12031980748830802, "grad_norm": 0.4351516664028168, "learning_rate": 1.9549795555555558e-05, "loss": 1.4563, "step": 75200 }, { "epoch": 0.12047980723230843, "grad_norm": 0.004506128840148449, "learning_rate": 1.9546240000000002e-05, "loss": 1.5581, "step": 75300 }, { "epoch": 0.12063980697630884, "grad_norm": 0.0002510923077352345, "learning_rate": 1.9542684444444447e-05, "loss": 1.5457, "step": 75400 }, { "epoch": 0.12079980672030925, "grad_norm": 0.15088102221488953, "learning_rate": 1.953912888888889e-05, "loss": 1.2675, "step": 75500 }, { "epoch": 0.12095980646430966, "grad_norm": 0.05502159520983696, "learning_rate": 1.9535573333333336e-05, "loss": 1.0948, "step": 75600 }, { "epoch": 0.12111980620831006, "grad_norm": 0.09219387173652649, "learning_rate": 1.953201777777778e-05, "loss": 1.2045, "step": 75700 }, { "epoch": 0.12127980595231047, "grad_norm": 199.2202911376953, "learning_rate": 1.9528462222222225e-05, "loss": 1.5964, "step": 75800 }, { "epoch": 0.12143980569631088, "grad_norm": 11.821746826171875, "learning_rate": 1.9524906666666666e-05, "loss": 1.0517, "step": 75900 }, { "epoch": 0.12159980544031129, "grad_norm": 106.8429946899414, "learning_rate": 1.9521351111111114e-05, "loss": 1.2883, "step": 76000 }, { "epoch": 0.1217598051843117, "grad_norm": 0.08651433885097504, "learning_rate": 1.9517795555555555e-05, "loss": 1.2276, "step": 76100 }, { "epoch": 0.12191980492831211, "grad_norm": 79.67507934570312, "learning_rate": 1.9514240000000003e-05, "loss": 1.2463, "step": 76200 }, { "epoch": 0.12207980467231253, "grad_norm": 0.6488481163978577, "learning_rate": 1.9510684444444444e-05, "loss": 1.241, "step": 76300 }, { "epoch": 0.12223980441631294, "grad_norm": 6.853870391845703, "learning_rate": 1.9507128888888892e-05, "loss": 1.8648, "step": 76400 }, { "epoch": 0.12239980416031335, "grad_norm": 0.0021333652548491955, "learning_rate": 1.9503573333333333e-05, "loss": 1.4848, "step": 76500 }, { "epoch": 0.12255980390431376, "grad_norm": 0.0014837757917121053, "learning_rate": 1.950001777777778e-05, "loss": 1.413, "step": 76600 }, { "epoch": 0.12271980364831417, "grad_norm": 196.25413513183594, "learning_rate": 1.9496462222222222e-05, "loss": 1.594, "step": 76700 }, { "epoch": 0.12287980339231457, "grad_norm": 117.68331909179688, "learning_rate": 1.949290666666667e-05, "loss": 1.3682, "step": 76800 }, { "epoch": 0.12303980313631498, "grad_norm": 3.5699806213378906, "learning_rate": 1.948935111111111e-05, "loss": 1.159, "step": 76900 }, { "epoch": 0.12319980288031539, "grad_norm": 0.051335014402866364, "learning_rate": 1.9485795555555556e-05, "loss": 1.4702, "step": 77000 }, { "epoch": 0.1233598026243158, "grad_norm": 0.012798790819942951, "learning_rate": 1.948224e-05, "loss": 1.3251, "step": 77100 }, { "epoch": 0.12351980236831621, "grad_norm": 3.92969012260437, "learning_rate": 1.9478684444444445e-05, "loss": 1.0538, "step": 77200 }, { "epoch": 0.12367980211231662, "grad_norm": 2.8302226066589355, "learning_rate": 1.947512888888889e-05, "loss": 1.1708, "step": 77300 }, { "epoch": 0.12383980185631703, "grad_norm": 0.11278839409351349, "learning_rate": 1.947160888888889e-05, "loss": 1.2864, "step": 77400 }, { "epoch": 0.12399980160031744, "grad_norm": 99.1993408203125, "learning_rate": 1.9468053333333335e-05, "loss": 1.6501, "step": 77500 }, { "epoch": 0.12415980134431785, "grad_norm": 0.22377841174602509, "learning_rate": 1.946449777777778e-05, "loss": 1.0104, "step": 77600 }, { "epoch": 0.12431980108831826, "grad_norm": 99.57634735107422, "learning_rate": 1.9460942222222225e-05, "loss": 1.7969, "step": 77700 }, { "epoch": 0.12447980083231867, "grad_norm": 0.9174038767814636, "learning_rate": 1.945738666666667e-05, "loss": 1.0293, "step": 77800 }, { "epoch": 0.12463980057631908, "grad_norm": 61.01045227050781, "learning_rate": 1.9453831111111114e-05, "loss": 1.5593, "step": 77900 }, { "epoch": 0.12479980032031948, "grad_norm": 2412.276611328125, "learning_rate": 1.9450275555555558e-05, "loss": 0.9902, "step": 78000 }, { "epoch": 0.12495980006431989, "grad_norm": 241.99363708496094, "learning_rate": 1.9446720000000003e-05, "loss": 1.058, "step": 78100 }, { "epoch": 0.1251197998083203, "grad_norm": 96.88700866699219, "learning_rate": 1.9443164444444447e-05, "loss": 1.4039, "step": 78200 }, { "epoch": 0.12527979955232071, "grad_norm": 56.962181091308594, "learning_rate": 1.943960888888889e-05, "loss": 1.008, "step": 78300 }, { "epoch": 0.12543979929632113, "grad_norm": 23.272607803344727, "learning_rate": 1.9436053333333336e-05, "loss": 1.4593, "step": 78400 }, { "epoch": 0.12559979904032154, "grad_norm": 97.3494644165039, "learning_rate": 1.9432497777777778e-05, "loss": 1.563, "step": 78500 }, { "epoch": 0.12575979878432195, "grad_norm": 3.625567674636841, "learning_rate": 1.9428942222222226e-05, "loss": 1.1569, "step": 78600 }, { "epoch": 0.12591979852832236, "grad_norm": 63.88728332519531, "learning_rate": 1.9425386666666667e-05, "loss": 1.3886, "step": 78700 }, { "epoch": 0.12607979827232277, "grad_norm": 0.533359169960022, "learning_rate": 1.9421831111111115e-05, "loss": 1.061, "step": 78800 }, { "epoch": 0.12623979801632318, "grad_norm": 0.0005107554607093334, "learning_rate": 1.9418275555555556e-05, "loss": 1.2085, "step": 78900 }, { "epoch": 0.1263997977603236, "grad_norm": 66.7668685913086, "learning_rate": 1.9414720000000004e-05, "loss": 1.8553, "step": 79000 }, { "epoch": 0.126559797504324, "grad_norm": 2.3458669185638428, "learning_rate": 1.9411164444444445e-05, "loss": 1.7144, "step": 79100 }, { "epoch": 0.1267197972483244, "grad_norm": 101.0086669921875, "learning_rate": 1.9407608888888893e-05, "loss": 1.2216, "step": 79200 }, { "epoch": 0.12687979699232482, "grad_norm": 14.662532806396484, "learning_rate": 1.9404053333333334e-05, "loss": 1.1646, "step": 79300 }, { "epoch": 0.12703979673632523, "grad_norm": 70.46912384033203, "learning_rate": 1.9400497777777782e-05, "loss": 1.7768, "step": 79400 }, { "epoch": 0.12719979648032564, "grad_norm": 3.7776920795440674, "learning_rate": 1.9396942222222223e-05, "loss": 1.1314, "step": 79500 }, { "epoch": 0.12735979622432603, "grad_norm": 0.05991614609956741, "learning_rate": 1.9393386666666668e-05, "loss": 1.2374, "step": 79600 }, { "epoch": 0.12751979596832644, "grad_norm": 1.138396978378296, "learning_rate": 1.9389831111111112e-05, "loss": 1.2681, "step": 79700 }, { "epoch": 0.12767979571232685, "grad_norm": 117.04296875, "learning_rate": 1.9386275555555557e-05, "loss": 1.2624, "step": 79800 }, { "epoch": 0.12783979545632726, "grad_norm": 165.1708984375, "learning_rate": 1.9382755555555558e-05, "loss": 1.6775, "step": 79900 }, { "epoch": 0.12799979520032767, "grad_norm": 127.26524353027344, "learning_rate": 1.9379200000000002e-05, "loss": 1.3587, "step": 80000 }, { "epoch": 0.12815979494432808, "grad_norm": 128.8250274658203, "learning_rate": 1.9375644444444447e-05, "loss": 1.7402, "step": 80100 }, { "epoch": 0.1283197946883285, "grad_norm": 83.64952850341797, "learning_rate": 1.937208888888889e-05, "loss": 1.5349, "step": 80200 }, { "epoch": 0.1284797944323289, "grad_norm": 2.9033825397491455, "learning_rate": 1.9368533333333336e-05, "loss": 0.8546, "step": 80300 }, { "epoch": 0.12863979417632931, "grad_norm": 1.8563624620437622, "learning_rate": 1.936497777777778e-05, "loss": 1.3903, "step": 80400 }, { "epoch": 0.12879979392032972, "grad_norm": 0.020641742274165154, "learning_rate": 1.9361422222222225e-05, "loss": 1.0712, "step": 80500 }, { "epoch": 0.12895979366433014, "grad_norm": 0.030105268582701683, "learning_rate": 1.935786666666667e-05, "loss": 1.6633, "step": 80600 }, { "epoch": 0.12911979340833055, "grad_norm": 7.39204216003418, "learning_rate": 1.935431111111111e-05, "loss": 1.4125, "step": 80700 }, { "epoch": 0.12927979315233096, "grad_norm": 0.6996489763259888, "learning_rate": 1.935075555555556e-05, "loss": 0.6973, "step": 80800 }, { "epoch": 0.12943979289633137, "grad_norm": 45.01316452026367, "learning_rate": 1.93472e-05, "loss": 1.1729, "step": 80900 }, { "epoch": 0.12959979264033178, "grad_norm": 0.2586953938007355, "learning_rate": 1.9343644444444448e-05, "loss": 1.2217, "step": 81000 }, { "epoch": 0.1297597923843322, "grad_norm": 0.02437330223619938, "learning_rate": 1.934008888888889e-05, "loss": 1.3184, "step": 81100 }, { "epoch": 0.1299197921283326, "grad_norm": 86.13786315917969, "learning_rate": 1.9336533333333334e-05, "loss": 1.2718, "step": 81200 }, { "epoch": 0.130079791872333, "grad_norm": 129.18377685546875, "learning_rate": 1.9332977777777778e-05, "loss": 1.1913, "step": 81300 }, { "epoch": 0.13023979161633342, "grad_norm": 0.21126429736614227, "learning_rate": 1.9329422222222223e-05, "loss": 1.4728, "step": 81400 }, { "epoch": 0.13039979136033383, "grad_norm": 17.239547729492188, "learning_rate": 1.9325902222222224e-05, "loss": 1.1221, "step": 81500 }, { "epoch": 0.13055979110433424, "grad_norm": 3.2373907566070557, "learning_rate": 1.932234666666667e-05, "loss": 1.235, "step": 81600 }, { "epoch": 0.13071979084833465, "grad_norm": 5.152343273162842, "learning_rate": 1.9318791111111113e-05, "loss": 1.3497, "step": 81700 }, { "epoch": 0.13087979059233507, "grad_norm": 88.2583236694336, "learning_rate": 1.9315235555555558e-05, "loss": 1.2361, "step": 81800 }, { "epoch": 0.13103979033633545, "grad_norm": 0.001005143509246409, "learning_rate": 1.9311680000000002e-05, "loss": 2.0015, "step": 81900 }, { "epoch": 0.13119979008033586, "grad_norm": 0.3949466347694397, "learning_rate": 1.9308124444444447e-05, "loss": 1.2259, "step": 82000 }, { "epoch": 0.13135978982433627, "grad_norm": 0.6978406310081482, "learning_rate": 1.930456888888889e-05, "loss": 0.9236, "step": 82100 }, { "epoch": 0.13151978956833668, "grad_norm": 0.6740103363990784, "learning_rate": 1.9301013333333332e-05, "loss": 1.5339, "step": 82200 }, { "epoch": 0.1316797893123371, "grad_norm": 0.007084805518388748, "learning_rate": 1.929745777777778e-05, "loss": 1.2036, "step": 82300 }, { "epoch": 0.1318397890563375, "grad_norm": 88.51764678955078, "learning_rate": 1.929390222222222e-05, "loss": 1.2631, "step": 82400 }, { "epoch": 0.1319997888003379, "grad_norm": 0.000969950866419822, "learning_rate": 1.929034666666667e-05, "loss": 1.0858, "step": 82500 }, { "epoch": 0.13215978854433832, "grad_norm": 0.10399264842271805, "learning_rate": 1.928679111111111e-05, "loss": 1.635, "step": 82600 }, { "epoch": 0.13231978828833874, "grad_norm": 39.86758804321289, "learning_rate": 1.928323555555556e-05, "loss": 1.285, "step": 82700 }, { "epoch": 0.13247978803233915, "grad_norm": 2.6627957820892334, "learning_rate": 1.927968e-05, "loss": 1.1209, "step": 82800 }, { "epoch": 0.13263978777633956, "grad_norm": 0.2310008406639099, "learning_rate": 1.9276124444444448e-05, "loss": 1.4032, "step": 82900 }, { "epoch": 0.13279978752033997, "grad_norm": 76.39102935791016, "learning_rate": 1.927256888888889e-05, "loss": 1.1279, "step": 83000 }, { "epoch": 0.13295978726434038, "grad_norm": 0.5016289949417114, "learning_rate": 1.9269013333333337e-05, "loss": 1.5145, "step": 83100 }, { "epoch": 0.1331197870083408, "grad_norm": 0.2468506544828415, "learning_rate": 1.9265457777777778e-05, "loss": 1.4923, "step": 83200 }, { "epoch": 0.1332797867523412, "grad_norm": 0.03473009541630745, "learning_rate": 1.9261902222222222e-05, "loss": 0.9845, "step": 83300 }, { "epoch": 0.1334397864963416, "grad_norm": 15.979637145996094, "learning_rate": 1.9258346666666667e-05, "loss": 1.3847, "step": 83400 }, { "epoch": 0.13359978624034202, "grad_norm": 0.13443760573863983, "learning_rate": 1.925479111111111e-05, "loss": 1.0149, "step": 83500 }, { "epoch": 0.13375978598434243, "grad_norm": 0.09114881604909897, "learning_rate": 1.9251235555555556e-05, "loss": 1.2644, "step": 83600 }, { "epoch": 0.13391978572834284, "grad_norm": 67.14397430419922, "learning_rate": 1.924768e-05, "loss": 1.2981, "step": 83700 }, { "epoch": 0.13407978547234325, "grad_norm": 9.479357719421387, "learning_rate": 1.9244124444444445e-05, "loss": 1.6903, "step": 83800 }, { "epoch": 0.13423978521634367, "grad_norm": 222.48973083496094, "learning_rate": 1.924056888888889e-05, "loss": 1.2846, "step": 83900 }, { "epoch": 0.13439978496034408, "grad_norm": 0.24070465564727783, "learning_rate": 1.9237013333333334e-05, "loss": 1.4647, "step": 84000 }, { "epoch": 0.13455978470434446, "grad_norm": 1444.743896484375, "learning_rate": 1.923345777777778e-05, "loss": 1.1213, "step": 84100 }, { "epoch": 0.13471978444834487, "grad_norm": 1.174815058708191, "learning_rate": 1.9229902222222223e-05, "loss": 1.1379, "step": 84200 }, { "epoch": 0.13487978419234528, "grad_norm": 120.12804412841797, "learning_rate": 1.9226346666666668e-05, "loss": 1.2793, "step": 84300 }, { "epoch": 0.1350397839363457, "grad_norm": 89.50218200683594, "learning_rate": 1.9222791111111113e-05, "loss": 1.343, "step": 84400 }, { "epoch": 0.1351997836803461, "grad_norm": 100.86327362060547, "learning_rate": 1.9219235555555557e-05, "loss": 1.8342, "step": 84500 }, { "epoch": 0.1353597834243465, "grad_norm": 99.5421371459961, "learning_rate": 1.921568e-05, "loss": 1.0487, "step": 84600 }, { "epoch": 0.13551978316834692, "grad_norm": 0.0015393303474411368, "learning_rate": 1.9212124444444446e-05, "loss": 1.1531, "step": 84700 }, { "epoch": 0.13567978291234734, "grad_norm": 3.457564353942871, "learning_rate": 1.920856888888889e-05, "loss": 0.8552, "step": 84800 }, { "epoch": 0.13583978265634775, "grad_norm": 0.00041561800753697753, "learning_rate": 1.9205013333333335e-05, "loss": 1.1422, "step": 84900 }, { "epoch": 0.13599978240034816, "grad_norm": 0.0012223550584167242, "learning_rate": 1.920145777777778e-05, "loss": 1.0918, "step": 85000 }, { "epoch": 0.13615978214434857, "grad_norm": 0.21084783971309662, "learning_rate": 1.9197902222222224e-05, "loss": 1.2873, "step": 85100 }, { "epoch": 0.13631978188834898, "grad_norm": 24.241910934448242, "learning_rate": 1.919434666666667e-05, "loss": 1.547, "step": 85200 }, { "epoch": 0.1364797816323494, "grad_norm": 47.714027404785156, "learning_rate": 1.9190791111111114e-05, "loss": 1.5094, "step": 85300 }, { "epoch": 0.1366397813763498, "grad_norm": 6.054490089416504, "learning_rate": 1.9187235555555558e-05, "loss": 1.051, "step": 85400 }, { "epoch": 0.1367997811203502, "grad_norm": 0.001112865749746561, "learning_rate": 1.9183680000000003e-05, "loss": 0.9952, "step": 85500 }, { "epoch": 0.13695978086435062, "grad_norm": 0.015406353399157524, "learning_rate": 1.9180124444444447e-05, "loss": 1.1978, "step": 85600 }, { "epoch": 0.13711978060835103, "grad_norm": 17.65171241760254, "learning_rate": 1.9176604444444445e-05, "loss": 1.5221, "step": 85700 }, { "epoch": 0.13727978035235144, "grad_norm": 14.018333435058594, "learning_rate": 1.917304888888889e-05, "loss": 1.3841, "step": 85800 }, { "epoch": 0.13743978009635185, "grad_norm": 0.0006000687135383487, "learning_rate": 1.9169493333333334e-05, "loss": 1.3999, "step": 85900 }, { "epoch": 0.13759977984035227, "grad_norm": 1715.1507568359375, "learning_rate": 1.916593777777778e-05, "loss": 1.5574, "step": 86000 }, { "epoch": 0.13775977958435268, "grad_norm": 4.6950907707214355, "learning_rate": 1.9162382222222223e-05, "loss": 1.3267, "step": 86100 }, { "epoch": 0.1379197793283531, "grad_norm": 0.8909225463867188, "learning_rate": 1.9158826666666668e-05, "loss": 1.358, "step": 86200 }, { "epoch": 0.1380797790723535, "grad_norm": 27.72040367126465, "learning_rate": 1.9155271111111112e-05, "loss": 1.5441, "step": 86300 }, { "epoch": 0.13823977881635388, "grad_norm": 120.01333618164062, "learning_rate": 1.9151715555555557e-05, "loss": 1.4124, "step": 86400 }, { "epoch": 0.1383997785603543, "grad_norm": 27.406797409057617, "learning_rate": 1.914816e-05, "loss": 0.8352, "step": 86500 }, { "epoch": 0.1385597783043547, "grad_norm": 0.07549207657575607, "learning_rate": 1.9144604444444446e-05, "loss": 1.2549, "step": 86600 }, { "epoch": 0.1387197780483551, "grad_norm": 1.3746123313903809, "learning_rate": 1.914104888888889e-05, "loss": 1.4328, "step": 86700 }, { "epoch": 0.13887977779235552, "grad_norm": 0.002391360467299819, "learning_rate": 1.9137493333333335e-05, "loss": 1.2577, "step": 86800 }, { "epoch": 0.13903977753635594, "grad_norm": 39.3692626953125, "learning_rate": 1.913393777777778e-05, "loss": 1.4417, "step": 86900 }, { "epoch": 0.13919977728035635, "grad_norm": 0.0015022088773548603, "learning_rate": 1.9130382222222224e-05, "loss": 1.1927, "step": 87000 }, { "epoch": 0.13935977702435676, "grad_norm": 3.776437520980835, "learning_rate": 1.912682666666667e-05, "loss": 1.4435, "step": 87100 }, { "epoch": 0.13951977676835717, "grad_norm": 9.693673133850098, "learning_rate": 1.9123271111111113e-05, "loss": 1.3579, "step": 87200 }, { "epoch": 0.13967977651235758, "grad_norm": 47.54679870605469, "learning_rate": 1.9119751111111114e-05, "loss": 1.3883, "step": 87300 }, { "epoch": 0.139839776256358, "grad_norm": 57.24945068359375, "learning_rate": 1.9116195555555555e-05, "loss": 1.2645, "step": 87400 }, { "epoch": 0.1399997760003584, "grad_norm": 0.0025031184777617455, "learning_rate": 1.9112640000000003e-05, "loss": 1.1366, "step": 87500 }, { "epoch": 0.1401597757443588, "grad_norm": 0.015484058298170567, "learning_rate": 1.9109084444444445e-05, "loss": 1.4566, "step": 87600 }, { "epoch": 0.14031977548835922, "grad_norm": 0.24919560551643372, "learning_rate": 1.9105528888888893e-05, "loss": 1.447, "step": 87700 }, { "epoch": 0.14047977523235963, "grad_norm": 74.2865219116211, "learning_rate": 1.9101973333333334e-05, "loss": 1.0701, "step": 87800 }, { "epoch": 0.14063977497636004, "grad_norm": 127.0066909790039, "learning_rate": 1.909841777777778e-05, "loss": 1.3449, "step": 87900 }, { "epoch": 0.14079977472036045, "grad_norm": 87.54583740234375, "learning_rate": 1.9094862222222223e-05, "loss": 1.4331, "step": 88000 }, { "epoch": 0.14095977446436087, "grad_norm": 0.001399531727656722, "learning_rate": 1.909130666666667e-05, "loss": 1.3965, "step": 88100 }, { "epoch": 0.14111977420836128, "grad_norm": 69.87310028076172, "learning_rate": 1.9087751111111112e-05, "loss": 1.347, "step": 88200 }, { "epoch": 0.1412797739523617, "grad_norm": 69.38946533203125, "learning_rate": 1.9084195555555556e-05, "loss": 1.0262, "step": 88300 }, { "epoch": 0.1414397736963621, "grad_norm": 18.69589614868164, "learning_rate": 1.908064e-05, "loss": 1.0787, "step": 88400 }, { "epoch": 0.1415997734403625, "grad_norm": 0.20538243651390076, "learning_rate": 1.9077084444444446e-05, "loss": 1.3829, "step": 88500 }, { "epoch": 0.1417597731843629, "grad_norm": 0.0005450706230476499, "learning_rate": 1.907352888888889e-05, "loss": 1.2001, "step": 88600 }, { "epoch": 0.1419197729283633, "grad_norm": 2.548616409301758, "learning_rate": 1.9069973333333335e-05, "loss": 1.2407, "step": 88700 }, { "epoch": 0.1420797726723637, "grad_norm": 139.437744140625, "learning_rate": 1.906641777777778e-05, "loss": 1.6291, "step": 88800 }, { "epoch": 0.14223977241636412, "grad_norm": 10.629435539245605, "learning_rate": 1.9062862222222224e-05, "loss": 1.1502, "step": 88900 }, { "epoch": 0.14239977216036454, "grad_norm": 3.494685411453247, "learning_rate": 1.905930666666667e-05, "loss": 1.2155, "step": 89000 }, { "epoch": 0.14255977190436495, "grad_norm": 107.18891143798828, "learning_rate": 1.9055751111111113e-05, "loss": 1.3381, "step": 89100 }, { "epoch": 0.14271977164836536, "grad_norm": 2575.91796875, "learning_rate": 1.9052195555555557e-05, "loss": 0.819, "step": 89200 }, { "epoch": 0.14287977139236577, "grad_norm": 302.19500732421875, "learning_rate": 1.9048640000000002e-05, "loss": 1.0402, "step": 89300 }, { "epoch": 0.14303977113636618, "grad_norm": 87.07076263427734, "learning_rate": 1.9045084444444447e-05, "loss": 1.1062, "step": 89400 }, { "epoch": 0.1431997708803666, "grad_norm": 5.228755950927734, "learning_rate": 1.9041528888888888e-05, "loss": 1.6693, "step": 89500 }, { "epoch": 0.143359770624367, "grad_norm": 0.20638461410999298, "learning_rate": 1.9037973333333336e-05, "loss": 1.1991, "step": 89600 }, { "epoch": 0.1435197703683674, "grad_norm": 114.9300308227539, "learning_rate": 1.9034417777777777e-05, "loss": 1.3535, "step": 89700 }, { "epoch": 0.14367977011236782, "grad_norm": 86.26241302490234, "learning_rate": 1.9030862222222225e-05, "loss": 1.6776, "step": 89800 }, { "epoch": 0.14383976985636823, "grad_norm": 78.20118713378906, "learning_rate": 1.9027306666666666e-05, "loss": 1.2221, "step": 89900 }, { "epoch": 0.14399976960036864, "grad_norm": 7.4184088706970215, "learning_rate": 1.9023751111111114e-05, "loss": 1.0253, "step": 90000 }, { "epoch": 0.14415976934436905, "grad_norm": 99.789794921875, "learning_rate": 1.9020195555555555e-05, "loss": 1.0469, "step": 90100 }, { "epoch": 0.14431976908836947, "grad_norm": 0.003789502428844571, "learning_rate": 1.9016640000000003e-05, "loss": 1.2465, "step": 90200 }, { "epoch": 0.14447976883236988, "grad_norm": 0.27373766899108887, "learning_rate": 1.9013084444444444e-05, "loss": 1.4068, "step": 90300 }, { "epoch": 0.1446397685763703, "grad_norm": 101.30089569091797, "learning_rate": 1.9009528888888892e-05, "loss": 1.5961, "step": 90400 }, { "epoch": 0.1447997683203707, "grad_norm": 0.24238981306552887, "learning_rate": 1.9005973333333333e-05, "loss": 1.0579, "step": 90500 }, { "epoch": 0.1449597680643711, "grad_norm": 0.6612280011177063, "learning_rate": 1.900241777777778e-05, "loss": 0.941, "step": 90600 }, { "epoch": 0.14511976780837152, "grad_norm": 3.1052684783935547, "learning_rate": 1.8998862222222222e-05, "loss": 1.1861, "step": 90700 }, { "epoch": 0.14527976755237193, "grad_norm": 9.876090049743652, "learning_rate": 1.8995306666666667e-05, "loss": 1.4697, "step": 90800 }, { "epoch": 0.1454397672963723, "grad_norm": 32.829795837402344, "learning_rate": 1.899175111111111e-05, "loss": 0.6486, "step": 90900 }, { "epoch": 0.14559976704037272, "grad_norm": 2219.107177734375, "learning_rate": 1.8988195555555556e-05, "loss": 1.3865, "step": 91000 }, { "epoch": 0.14575976678437313, "grad_norm": 0.2465362697839737, "learning_rate": 1.898464e-05, "loss": 1.1494, "step": 91100 }, { "epoch": 0.14591976652837355, "grad_norm": 0.06304822117090225, "learning_rate": 1.8981084444444445e-05, "loss": 1.3623, "step": 91200 }, { "epoch": 0.14607976627237396, "grad_norm": 0.0003378583351150155, "learning_rate": 1.897752888888889e-05, "loss": 1.2193, "step": 91300 }, { "epoch": 0.14623976601637437, "grad_norm": 0.023890919983386993, "learning_rate": 1.8973973333333334e-05, "loss": 1.3003, "step": 91400 }, { "epoch": 0.14639976576037478, "grad_norm": 0.20042432844638824, "learning_rate": 1.8970453333333335e-05, "loss": 1.2608, "step": 91500 }, { "epoch": 0.1465597655043752, "grad_norm": 0.4310738742351532, "learning_rate": 1.896689777777778e-05, "loss": 1.2544, "step": 91600 }, { "epoch": 0.1467197652483756, "grad_norm": 6.881536960601807, "learning_rate": 1.8963342222222224e-05, "loss": 1.332, "step": 91700 }, { "epoch": 0.146879764992376, "grad_norm": 34.862266540527344, "learning_rate": 1.895978666666667e-05, "loss": 1.3548, "step": 91800 }, { "epoch": 0.14703976473637642, "grad_norm": 41.60286331176758, "learning_rate": 1.8956231111111114e-05, "loss": 1.54, "step": 91900 }, { "epoch": 0.14719976448037683, "grad_norm": 0.21723419427871704, "learning_rate": 1.8952675555555558e-05, "loss": 1.3125, "step": 92000 }, { "epoch": 0.14735976422437724, "grad_norm": 12.313715934753418, "learning_rate": 1.894912e-05, "loss": 0.897, "step": 92100 }, { "epoch": 0.14751976396837765, "grad_norm": 9.945670171873644e-05, "learning_rate": 1.8945564444444447e-05, "loss": 1.1594, "step": 92200 }, { "epoch": 0.14767976371237806, "grad_norm": 0.00018985375936608762, "learning_rate": 1.894200888888889e-05, "loss": 0.9194, "step": 92300 }, { "epoch": 0.14783976345637848, "grad_norm": 6.04590368270874, "learning_rate": 1.8938453333333336e-05, "loss": 1.2209, "step": 92400 }, { "epoch": 0.1479997632003789, "grad_norm": 0.19547709822654724, "learning_rate": 1.8934897777777777e-05, "loss": 1.0027, "step": 92500 }, { "epoch": 0.1481597629443793, "grad_norm": 0.00870482623577118, "learning_rate": 1.8931342222222225e-05, "loss": 1.4675, "step": 92600 }, { "epoch": 0.1483197626883797, "grad_norm": 112.5121078491211, "learning_rate": 1.8927786666666667e-05, "loss": 1.3982, "step": 92700 }, { "epoch": 0.14847976243238012, "grad_norm": 0.000352471019141376, "learning_rate": 1.8924231111111115e-05, "loss": 0.8595, "step": 92800 }, { "epoch": 0.14863976217638053, "grad_norm": 0.20730045437812805, "learning_rate": 1.8920675555555556e-05, "loss": 1.572, "step": 92900 }, { "epoch": 0.14879976192038094, "grad_norm": 5.9496917724609375, "learning_rate": 1.8917120000000004e-05, "loss": 1.2832, "step": 93000 }, { "epoch": 0.14895976166438132, "grad_norm": 0.012922318652272224, "learning_rate": 1.8913564444444445e-05, "loss": 1.2838, "step": 93100 }, { "epoch": 0.14911976140838173, "grad_norm": 40.53496170043945, "learning_rate": 1.8910008888888893e-05, "loss": 1.6535, "step": 93200 }, { "epoch": 0.14927976115238215, "grad_norm": 106.17526245117188, "learning_rate": 1.8906453333333334e-05, "loss": 1.5996, "step": 93300 }, { "epoch": 0.14943976089638256, "grad_norm": 93.25550079345703, "learning_rate": 1.890289777777778e-05, "loss": 1.058, "step": 93400 }, { "epoch": 0.14959976064038297, "grad_norm": 83.99794006347656, "learning_rate": 1.8899342222222223e-05, "loss": 1.3316, "step": 93500 }, { "epoch": 0.14975976038438338, "grad_norm": 0.0036302392836660147, "learning_rate": 1.8895822222222224e-05, "loss": 0.8627, "step": 93600 }, { "epoch": 0.1499197601283838, "grad_norm": 4.188179969787598, "learning_rate": 1.889226666666667e-05, "loss": 1.4411, "step": 93700 }, { "epoch": 0.1500797598723842, "grad_norm": 0.38291868567466736, "learning_rate": 1.8888711111111113e-05, "loss": 0.9331, "step": 93800 }, { "epoch": 0.1502397596163846, "grad_norm": 2.517091751098633, "learning_rate": 1.8885155555555558e-05, "loss": 1.0032, "step": 93900 }, { "epoch": 0.15039975936038502, "grad_norm": 128.78472900390625, "learning_rate": 1.8881600000000002e-05, "loss": 1.2341, "step": 94000 }, { "epoch": 0.15055975910438543, "grad_norm": 0.0004920060164295137, "learning_rate": 1.8878044444444447e-05, "loss": 1.3369, "step": 94100 }, { "epoch": 0.15071975884838584, "grad_norm": 36.320411682128906, "learning_rate": 1.887448888888889e-05, "loss": 1.2324, "step": 94200 }, { "epoch": 0.15087975859238625, "grad_norm": 34.70246887207031, "learning_rate": 1.8870933333333336e-05, "loss": 1.6952, "step": 94300 }, { "epoch": 0.15103975833638666, "grad_norm": 0.5894516110420227, "learning_rate": 1.886737777777778e-05, "loss": 1.2401, "step": 94400 }, { "epoch": 0.15119975808038708, "grad_norm": 0.0024842778220772743, "learning_rate": 1.8863822222222222e-05, "loss": 1.2998, "step": 94500 }, { "epoch": 0.1513597578243875, "grad_norm": 0.16776002943515778, "learning_rate": 1.886026666666667e-05, "loss": 1.1458, "step": 94600 }, { "epoch": 0.1515197575683879, "grad_norm": 0.008235426619648933, "learning_rate": 1.885671111111111e-05, "loss": 1.0211, "step": 94700 }, { "epoch": 0.1516797573123883, "grad_norm": 82.82616424560547, "learning_rate": 1.8853191111111112e-05, "loss": 0.9866, "step": 94800 }, { "epoch": 0.15183975705638872, "grad_norm": 0.0073872278444468975, "learning_rate": 1.8849635555555556e-05, "loss": 1.3636, "step": 94900 }, { "epoch": 0.15199975680038913, "grad_norm": 11.451104164123535, "learning_rate": 1.884608e-05, "loss": 1.1485, "step": 95000 }, { "epoch": 0.15215975654438954, "grad_norm": 100.47103118896484, "learning_rate": 1.8842524444444446e-05, "loss": 0.7671, "step": 95100 }, { "epoch": 0.15231975628838995, "grad_norm": 0.4101124703884125, "learning_rate": 1.883896888888889e-05, "loss": 1.0069, "step": 95200 }, { "epoch": 0.15247975603239033, "grad_norm": 37.40227508544922, "learning_rate": 1.8835413333333335e-05, "loss": 1.1276, "step": 95300 }, { "epoch": 0.15263975577639075, "grad_norm": 141.687744140625, "learning_rate": 1.883185777777778e-05, "loss": 1.4477, "step": 95400 }, { "epoch": 0.15279975552039116, "grad_norm": 0.008716798387467861, "learning_rate": 1.8828302222222224e-05, "loss": 0.9887, "step": 95500 }, { "epoch": 0.15295975526439157, "grad_norm": 7.543517858721316e-05, "learning_rate": 1.882474666666667e-05, "loss": 1.065, "step": 95600 }, { "epoch": 0.15311975500839198, "grad_norm": 6.7989821434021, "learning_rate": 1.8821191111111113e-05, "loss": 0.982, "step": 95700 }, { "epoch": 0.1532797547523924, "grad_norm": 27.62921714782715, "learning_rate": 1.8817635555555557e-05, "loss": 1.1166, "step": 95800 }, { "epoch": 0.1534397544963928, "grad_norm": 28.467132568359375, "learning_rate": 1.8814080000000002e-05, "loss": 1.3949, "step": 95900 }, { "epoch": 0.1535997542403932, "grad_norm": 89.31570434570312, "learning_rate": 1.8810524444444447e-05, "loss": 1.4164, "step": 96000 }, { "epoch": 0.15375975398439362, "grad_norm": 0.30763378739356995, "learning_rate": 1.880696888888889e-05, "loss": 1.7997, "step": 96100 }, { "epoch": 0.15391975372839403, "grad_norm": 90.01514434814453, "learning_rate": 1.8803413333333336e-05, "loss": 1.3941, "step": 96200 }, { "epoch": 0.15407975347239444, "grad_norm": 0.9498651027679443, "learning_rate": 1.879985777777778e-05, "loss": 1.0592, "step": 96300 }, { "epoch": 0.15423975321639485, "grad_norm": 88.10832977294922, "learning_rate": 1.8796302222222225e-05, "loss": 1.1661, "step": 96400 }, { "epoch": 0.15439975296039526, "grad_norm": 3.867802143096924, "learning_rate": 1.879274666666667e-05, "loss": 1.5968, "step": 96500 }, { "epoch": 0.15455975270439568, "grad_norm": 114.89385986328125, "learning_rate": 1.8789191111111114e-05, "loss": 1.2586, "step": 96600 }, { "epoch": 0.1547197524483961, "grad_norm": 0.009951179847121239, "learning_rate": 1.878563555555556e-05, "loss": 1.5164, "step": 96700 }, { "epoch": 0.1548797521923965, "grad_norm": 1.848288893699646, "learning_rate": 1.8782080000000003e-05, "loss": 1.5942, "step": 96800 }, { "epoch": 0.1550397519363969, "grad_norm": 56.26310348510742, "learning_rate": 1.8778524444444448e-05, "loss": 0.6635, "step": 96900 }, { "epoch": 0.15519975168039732, "grad_norm": 0.2863824963569641, "learning_rate": 1.8774968888888892e-05, "loss": 1.3037, "step": 97000 }, { "epoch": 0.15535975142439773, "grad_norm": 0.006738504860550165, "learning_rate": 1.8771413333333333e-05, "loss": 1.3557, "step": 97100 }, { "epoch": 0.15551975116839814, "grad_norm": 0.24526001513004303, "learning_rate": 1.876785777777778e-05, "loss": 1.0864, "step": 97200 }, { "epoch": 0.15567975091239855, "grad_norm": 70.70162963867188, "learning_rate": 1.8764302222222222e-05, "loss": 1.3139, "step": 97300 }, { "epoch": 0.15583975065639896, "grad_norm": 0.7548888921737671, "learning_rate": 1.8760746666666667e-05, "loss": 0.7139, "step": 97400 }, { "epoch": 0.15599975040039937, "grad_norm": 0.08793803304433823, "learning_rate": 1.875719111111111e-05, "loss": 1.1084, "step": 97500 }, { "epoch": 0.15615975014439976, "grad_norm": 8.044859886169434, "learning_rate": 1.8753635555555556e-05, "loss": 1.2294, "step": 97600 }, { "epoch": 0.15631974988840017, "grad_norm": 0.4635624587535858, "learning_rate": 1.875008e-05, "loss": 0.9581, "step": 97700 }, { "epoch": 0.15647974963240058, "grad_norm": 0.0022484343498945236, "learning_rate": 1.8746524444444445e-05, "loss": 1.2983, "step": 97800 }, { "epoch": 0.156639749376401, "grad_norm": 2.357697010040283, "learning_rate": 1.874296888888889e-05, "loss": 1.8281, "step": 97900 }, { "epoch": 0.1567997491204014, "grad_norm": 78.0554428100586, "learning_rate": 1.8739413333333334e-05, "loss": 1.2914, "step": 98000 }, { "epoch": 0.1569597488644018, "grad_norm": 0.6091700196266174, "learning_rate": 1.873585777777778e-05, "loss": 0.8656, "step": 98100 }, { "epoch": 0.15711974860840222, "grad_norm": 0.20535144209861755, "learning_rate": 1.8732302222222223e-05, "loss": 1.3438, "step": 98200 }, { "epoch": 0.15727974835240263, "grad_norm": 0.029342494904994965, "learning_rate": 1.8728746666666668e-05, "loss": 1.465, "step": 98300 }, { "epoch": 0.15743974809640304, "grad_norm": 0.20423032343387604, "learning_rate": 1.8725191111111112e-05, "loss": 1.2253, "step": 98400 }, { "epoch": 0.15759974784040345, "grad_norm": 0.020203936845064163, "learning_rate": 1.8721635555555557e-05, "loss": 1.3481, "step": 98500 }, { "epoch": 0.15775974758440386, "grad_norm": 0.001091059297323227, "learning_rate": 1.871808e-05, "loss": 1.5131, "step": 98600 }, { "epoch": 0.15791974732840428, "grad_norm": 42.3817253112793, "learning_rate": 1.8714524444444446e-05, "loss": 1.4852, "step": 98700 }, { "epoch": 0.15807974707240469, "grad_norm": 11.986414909362793, "learning_rate": 1.871096888888889e-05, "loss": 1.1317, "step": 98800 }, { "epoch": 0.1582397468164051, "grad_norm": 6.878232002258301, "learning_rate": 1.8707413333333335e-05, "loss": 1.0395, "step": 98900 }, { "epoch": 0.1583997465604055, "grad_norm": 0.011188351549208164, "learning_rate": 1.8703893333333333e-05, "loss": 0.9256, "step": 99000 }, { "epoch": 0.15855974630440592, "grad_norm": 0.03425045683979988, "learning_rate": 1.870033777777778e-05, "loss": 0.9774, "step": 99100 }, { "epoch": 0.15871974604840633, "grad_norm": 26.11473846435547, "learning_rate": 1.8696782222222222e-05, "loss": 0.9756, "step": 99200 }, { "epoch": 0.15887974579240674, "grad_norm": 0.0001582380209583789, "learning_rate": 1.869322666666667e-05, "loss": 1.4885, "step": 99300 }, { "epoch": 0.15903974553640715, "grad_norm": 0.1462339162826538, "learning_rate": 1.868967111111111e-05, "loss": 1.2373, "step": 99400 }, { "epoch": 0.15919974528040756, "grad_norm": 20.499425888061523, "learning_rate": 1.8686115555555556e-05, "loss": 1.3868, "step": 99500 }, { "epoch": 0.15935974502440797, "grad_norm": 89.42505645751953, "learning_rate": 1.868256e-05, "loss": 0.9238, "step": 99600 }, { "epoch": 0.15951974476840838, "grad_norm": 4.4118266105651855, "learning_rate": 1.8679004444444445e-05, "loss": 1.0793, "step": 99700 }, { "epoch": 0.15967974451240877, "grad_norm": 0.08320512622594833, "learning_rate": 1.867544888888889e-05, "loss": 1.2405, "step": 99800 }, { "epoch": 0.15983974425640918, "grad_norm": 1.5630072355270386, "learning_rate": 1.8671893333333334e-05, "loss": 1.2417, "step": 99900 }, { "epoch": 0.1599997440004096, "grad_norm": 1.7299790382385254, "learning_rate": 1.866833777777778e-05, "loss": 1.1264, "step": 100000 } ], "logging_steps": 100, "max_steps": 625001, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }