diff --git "a/checkpoint-105000/trainer_state.json" "b/checkpoint-105000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-105000/trainer_state.json" @@ -0,0 +1,7383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.16799973120043007, + "eval_steps": 200000, + "global_step": 105000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001599997440004096, + "grad_norm": 84.32501983642578, + "learning_rate": 3.103950336794611e-08, + "loss": 10.8792, + "step": 100 + }, + { + "epoch": 0.0003199994880008192, + "grad_norm": 60.63747024536133, + "learning_rate": 6.303899137613798e-08, + "loss": 10.9284, + "step": 200 + }, + { + "epoch": 0.00047999923200122877, + "grad_norm": 55.71075439453125, + "learning_rate": 9.503847938432986e-08, + "loss": 10.6466, + "step": 300 + }, + { + "epoch": 0.0006399989760016384, + "grad_norm": 57.63307189941406, + "learning_rate": 1.2703796739252173e-07, + "loss": 10.841, + "step": 400 + }, + { + "epoch": 0.000799998720002048, + "grad_norm": 89.1032485961914, + "learning_rate": 1.590374554007136e-07, + "loss": 10.8094, + "step": 500 + }, + { + "epoch": 0.0009599984640024575, + "grad_norm": 57.2479362487793, + "learning_rate": 1.9103694340890547e-07, + "loss": 10.4323, + "step": 600 + }, + { + "epoch": 0.0011199982080028672, + "grad_norm": 51.17530059814453, + "learning_rate": 2.2303643141709733e-07, + "loss": 10.3032, + "step": 700 + }, + { + "epoch": 0.0012799979520032767, + "grad_norm": 60.76409912109375, + "learning_rate": 2.550359194252892e-07, + "loss": 10.4006, + "step": 800 + }, + { + "epoch": 0.0014399976960036865, + "grad_norm": 67.00859069824219, + "learning_rate": 2.870354074334811e-07, + "loss": 10.4743, + "step": 900 + }, + { + "epoch": 0.001599997440004096, + "grad_norm": 68.4343032836914, + "learning_rate": 3.19034895441673e-07, + "loss": 10.2334, + "step": 1000 + }, + { + "epoch": 0.0017599971840045055, + "grad_norm": 48.704105377197266, + "learning_rate": 3.510343834498648e-07, + "loss": 10.0135, + "step": 1100 + }, + { + "epoch": 0.001919996928004915, + "grad_norm": 45.30134963989258, + "learning_rate": 3.830338714580567e-07, + "loss": 9.7874, + "step": 1200 + }, + { + "epoch": 0.002079996672005325, + "grad_norm": 84.56024169921875, + "learning_rate": 4.150333594662486e-07, + "loss": 9.7419, + "step": 1300 + }, + { + "epoch": 0.0022399964160057344, + "grad_norm": 45.73213195800781, + "learning_rate": 4.470328474744404e-07, + "loss": 9.7412, + "step": 1400 + }, + { + "epoch": 0.002399996160006144, + "grad_norm": 50.21996307373047, + "learning_rate": 4.790323354826324e-07, + "loss": 9.4585, + "step": 1500 + }, + { + "epoch": 0.0025599959040065534, + "grad_norm": 59.475799560546875, + "learning_rate": 5.110318234908241e-07, + "loss": 9.5339, + "step": 1600 + }, + { + "epoch": 0.002719995648006963, + "grad_norm": 82.53620910644531, + "learning_rate": 5.43031311499016e-07, + "loss": 9.4345, + "step": 1700 + }, + { + "epoch": 0.002879995392007373, + "grad_norm": 39.44235610961914, + "learning_rate": 5.750307995072079e-07, + "loss": 9.1733, + "step": 1800 + }, + { + "epoch": 0.0030399951360077825, + "grad_norm": 37.58698654174805, + "learning_rate": 6.070302875153998e-07, + "loss": 8.9952, + "step": 1900 + }, + { + "epoch": 0.003199994880008192, + "grad_norm": 40.35204315185547, + "learning_rate": 6.390297755235917e-07, + "loss": 8.9669, + "step": 2000 + }, + { + "epoch": 0.0033599946240086016, + "grad_norm": 
57.84451675415039, + "learning_rate": 6.707092686517017e-07, + "loss": 8.8152, + "step": 2100 + }, + { + "epoch": 0.003519994368009011, + "grad_norm": 40.126953125, + "learning_rate": 7.027087566598935e-07, + "loss": 8.7936, + "step": 2200 + }, + { + "epoch": 0.0036799941120094206, + "grad_norm": 35.435707092285156, + "learning_rate": 7.347082446680854e-07, + "loss": 8.6771, + "step": 2300 + }, + { + "epoch": 0.00383999385600983, + "grad_norm": 42.3509635925293, + "learning_rate": 7.667077326762773e-07, + "loss": 8.4648, + "step": 2400 + }, + { + "epoch": 0.00399999360001024, + "grad_norm": 33.58556365966797, + "learning_rate": 7.987072206844691e-07, + "loss": 8.5764, + "step": 2500 + }, + { + "epoch": 0.00415999334401065, + "grad_norm": 34.014678955078125, + "learning_rate": 8.30706708692661e-07, + "loss": 8.4587, + "step": 2600 + }, + { + "epoch": 0.004319993088011059, + "grad_norm": 36.43831253051758, + "learning_rate": 8.627061967008528e-07, + "loss": 8.2966, + "step": 2700 + }, + { + "epoch": 0.004479992832011469, + "grad_norm": 31.411684036254883, + "learning_rate": 8.947056847090448e-07, + "loss": 8.2329, + "step": 2800 + }, + { + "epoch": 0.004639992576011879, + "grad_norm": 47.570125579833984, + "learning_rate": 9.267051727172366e-07, + "loss": 8.1415, + "step": 2900 + }, + { + "epoch": 0.004799992320012288, + "grad_norm": 30.771928787231445, + "learning_rate": 9.587046607254284e-07, + "loss": 8.0404, + "step": 3000 + }, + { + "epoch": 0.004959992064012698, + "grad_norm": 26.92803955078125, + "learning_rate": 9.907041487336204e-07, + "loss": 7.9698, + "step": 3100 + }, + { + "epoch": 0.005119991808013107, + "grad_norm": 31.121917724609375, + "learning_rate": 1.0227036367418122e-06, + "loss": 7.9205, + "step": 3200 + }, + { + "epoch": 0.005279991552013517, + "grad_norm": 33.991416931152344, + "learning_rate": 1.054703124750004e-06, + "loss": 7.8314, + "step": 3300 + }, + { + "epoch": 0.005439991296013926, + "grad_norm": 31.278030395507812, + "learning_rate": 1.086702612758196e-06, + "loss": 7.8369, + "step": 3400 + }, + { + "epoch": 0.005599991040014336, + "grad_norm": 28.116140365600586, + "learning_rate": 1.1187021007663878e-06, + "loss": 7.6403, + "step": 3500 + }, + { + "epoch": 0.005759990784014746, + "grad_norm": 30.954113006591797, + "learning_rate": 1.1507015887745798e-06, + "loss": 7.5842, + "step": 3600 + }, + { + "epoch": 0.005919990528015155, + "grad_norm": 36.53567886352539, + "learning_rate": 1.1827010767827715e-06, + "loss": 7.5812, + "step": 3700 + }, + { + "epoch": 0.006079990272015565, + "grad_norm": 36.81153106689453, + "learning_rate": 1.2147005647909635e-06, + "loss": 7.4335, + "step": 3800 + }, + { + "epoch": 0.006239990016015974, + "grad_norm": 22.556833267211914, + "learning_rate": 1.2467000527991553e-06, + "loss": 7.4917, + "step": 3900 + }, + { + "epoch": 0.006399989760016384, + "grad_norm": 40.195579528808594, + "learning_rate": 1.278699540807347e-06, + "loss": 7.3204, + "step": 4000 + }, + { + "epoch": 0.006559989504016793, + "grad_norm": 21.862642288208008, + "learning_rate": 1.310699028815539e-06, + "loss": 7.2971, + "step": 4100 + }, + { + "epoch": 0.006719989248017203, + "grad_norm": 29.61161231994629, + "learning_rate": 1.3426985168237308e-06, + "loss": 7.2233, + "step": 4200 + }, + { + "epoch": 0.006879988992017613, + "grad_norm": 22.342451095581055, + "learning_rate": 1.3746980048319228e-06, + "loss": 7.2081, + "step": 4300 + }, + { + "epoch": 0.007039988736018022, + "grad_norm": 36.36684799194336, + "learning_rate": 1.4066974928401148e-06, + 
"loss": 7.1364, + "step": 4400 + }, + { + "epoch": 0.007199988480018432, + "grad_norm": 25.563953399658203, + "learning_rate": 1.4386969808483064e-06, + "loss": 7.0663, + "step": 4500 + }, + { + "epoch": 0.007359988224018841, + "grad_norm": 22.50385856628418, + "learning_rate": 1.4706964688564984e-06, + "loss": 6.9601, + "step": 4600 + }, + { + "epoch": 0.007519987968019251, + "grad_norm": 31.61231231689453, + "learning_rate": 1.5026959568646904e-06, + "loss": 6.9546, + "step": 4700 + }, + { + "epoch": 0.00767998771201966, + "grad_norm": 18.862520217895508, + "learning_rate": 1.5346954448728822e-06, + "loss": 6.9019, + "step": 4800 + }, + { + "epoch": 0.00783998745602007, + "grad_norm": 32.594539642333984, + "learning_rate": 1.5666949328810741e-06, + "loss": 6.8801, + "step": 4900 + }, + { + "epoch": 0.00799998720002048, + "grad_norm": 21.06804084777832, + "learning_rate": 1.598694420889266e-06, + "loss": 6.7734, + "step": 5000 + }, + { + "epoch": 0.00815998694402089, + "grad_norm": 31.783803939819336, + "learning_rate": 1.6303739140173757e-06, + "loss": 6.7648, + "step": 5100 + }, + { + "epoch": 0.0083199866880213, + "grad_norm": 49.79084777832031, + "learning_rate": 1.6623734020255677e-06, + "loss": 6.7498, + "step": 5200 + }, + { + "epoch": 0.008479986432021708, + "grad_norm": 26.1977481842041, + "learning_rate": 1.6943728900337597e-06, + "loss": 6.6872, + "step": 5300 + }, + { + "epoch": 0.008639986176022118, + "grad_norm": 21.942001342773438, + "learning_rate": 1.7263723780419515e-06, + "loss": 6.6264, + "step": 5400 + }, + { + "epoch": 0.008799985920022528, + "grad_norm": 32.572959899902344, + "learning_rate": 1.7583718660501433e-06, + "loss": 6.579, + "step": 5500 + }, + { + "epoch": 0.008959985664022938, + "grad_norm": 20.728240966796875, + "learning_rate": 1.7903713540583353e-06, + "loss": 6.6001, + "step": 5600 + }, + { + "epoch": 0.009119985408023347, + "grad_norm": 24.334205627441406, + "learning_rate": 1.822370842066527e-06, + "loss": 6.5971, + "step": 5700 + }, + { + "epoch": 0.009279985152023757, + "grad_norm": 27.025753021240234, + "learning_rate": 1.854370330074719e-06, + "loss": 6.4694, + "step": 5800 + }, + { + "epoch": 0.009439984896024167, + "grad_norm": 23.506013870239258, + "learning_rate": 1.8863698180829106e-06, + "loss": 6.3983, + "step": 5900 + }, + { + "epoch": 0.009599984640024576, + "grad_norm": 35.65713882446289, + "learning_rate": 1.9183693060911026e-06, + "loss": 6.4477, + "step": 6000 + }, + { + "epoch": 0.009759984384024985, + "grad_norm": 22.977373123168945, + "learning_rate": 1.950368794099295e-06, + "loss": 6.4308, + "step": 6100 + }, + { + "epoch": 0.009919984128025396, + "grad_norm": 22.127635955810547, + "learning_rate": 1.982368282107486e-06, + "loss": 6.4248, + "step": 6200 + }, + { + "epoch": 0.010079983872025805, + "grad_norm": 33.53960418701172, + "learning_rate": 2.0143677701156784e-06, + "loss": 6.2642, + "step": 6300 + }, + { + "epoch": 0.010239983616026214, + "grad_norm": 24.39597511291504, + "learning_rate": 2.04636725812387e-06, + "loss": 6.2763, + "step": 6400 + }, + { + "epoch": 0.010399983360026625, + "grad_norm": 24.471288681030273, + "learning_rate": 2.078366746132062e-06, + "loss": 6.3878, + "step": 6500 + }, + { + "epoch": 0.010559983104027034, + "grad_norm": 34.05498123168945, + "learning_rate": 2.110366234140254e-06, + "loss": 6.2601, + "step": 6600 + }, + { + "epoch": 0.010719982848027443, + "grad_norm": 30.60455322265625, + "learning_rate": 2.142365722148446e-06, + "loss": 6.1789, + "step": 6700 + }, + { + "epoch": 
0.010879982592027852, + "grad_norm": 27.737686157226562, + "learning_rate": 2.1743652101566377e-06, + "loss": 6.1773, + "step": 6800 + }, + { + "epoch": 0.011039982336028263, + "grad_norm": 24.246810913085938, + "learning_rate": 2.2063646981648294e-06, + "loss": 6.1439, + "step": 6900 + }, + { + "epoch": 0.011199982080028672, + "grad_norm": 27.53533363342285, + "learning_rate": 2.2383641861730217e-06, + "loss": 6.1863, + "step": 7000 + }, + { + "epoch": 0.011359981824029081, + "grad_norm": 27.81687355041504, + "learning_rate": 2.2703636741812134e-06, + "loss": 6.0513, + "step": 7100 + }, + { + "epoch": 0.011519981568029492, + "grad_norm": 28.00519371032715, + "learning_rate": 2.3020431673093234e-06, + "loss": 6.0671, + "step": 7200 + }, + { + "epoch": 0.011679981312029901, + "grad_norm": 29.347061157226562, + "learning_rate": 2.3340426553175152e-06, + "loss": 6.0212, + "step": 7300 + }, + { + "epoch": 0.01183998105603031, + "grad_norm": 29.621200561523438, + "learning_rate": 2.365722148445625e-06, + "loss": 6.0043, + "step": 7400 + }, + { + "epoch": 0.011999980800030719, + "grad_norm": 31.689117431640625, + "learning_rate": 2.397721636453817e-06, + "loss": 6.0166, + "step": 7500 + }, + { + "epoch": 0.01215998054403113, + "grad_norm": 46.79508972167969, + "learning_rate": 2.429721124462009e-06, + "loss": 5.9754, + "step": 7600 + }, + { + "epoch": 0.012319980288031539, + "grad_norm": 28.857833862304688, + "learning_rate": 2.4617206124702006e-06, + "loss": 5.9211, + "step": 7700 + }, + { + "epoch": 0.012479980032031948, + "grad_norm": 58.34132766723633, + "learning_rate": 2.4937201004783928e-06, + "loss": 5.7867, + "step": 7800 + }, + { + "epoch": 0.012639979776032359, + "grad_norm": 49.33425521850586, + "learning_rate": 2.525719588486584e-06, + "loss": 5.8534, + "step": 7900 + }, + { + "epoch": 0.012799979520032768, + "grad_norm": 39.17392349243164, + "learning_rate": 2.5577190764947763e-06, + "loss": 5.7708, + "step": 8000 + }, + { + "epoch": 0.012959979264033177, + "grad_norm": 45.94136428833008, + "learning_rate": 2.589718564502968e-06, + "loss": 5.8328, + "step": 8100 + }, + { + "epoch": 0.013119979008033586, + "grad_norm": 36.19196319580078, + "learning_rate": 2.6217180525111603e-06, + "loss": 5.7417, + "step": 8200 + }, + { + "epoch": 0.013279978752033997, + "grad_norm": 37.051658630371094, + "learning_rate": 2.653717540519352e-06, + "loss": 5.8097, + "step": 8300 + }, + { + "epoch": 0.013439978496034406, + "grad_norm": 90.0757064819336, + "learning_rate": 2.6857170285275435e-06, + "loss": 5.7578, + "step": 8400 + }, + { + "epoch": 0.013599978240034815, + "grad_norm": 92.7857894897461, + "learning_rate": 2.7177165165357357e-06, + "loss": 5.643, + "step": 8500 + }, + { + "epoch": 0.013759977984035226, + "grad_norm": 26.648149490356445, + "learning_rate": 2.7497160045439274e-06, + "loss": 5.6401, + "step": 8600 + }, + { + "epoch": 0.013919977728035635, + "grad_norm": 45.42919158935547, + "learning_rate": 2.7817154925521196e-06, + "loss": 5.6627, + "step": 8700 + }, + { + "epoch": 0.014079977472036044, + "grad_norm": 48.3182487487793, + "learning_rate": 2.8137149805603114e-06, + "loss": 5.6167, + "step": 8800 + }, + { + "epoch": 0.014239977216036454, + "grad_norm": 51.463653564453125, + "learning_rate": 2.8457144685685028e-06, + "loss": 5.6539, + "step": 8900 + }, + { + "epoch": 0.014399976960036864, + "grad_norm": 47.81680679321289, + "learning_rate": 2.877713956576695e-06, + "loss": 5.4513, + "step": 9000 + }, + { + "epoch": 0.014559976704037273, + "grad_norm": 42.410667419433594, + 
"learning_rate": 2.9097134445848868e-06, + "loss": 5.4132, + "step": 9100 + }, + { + "epoch": 0.014719976448037683, + "grad_norm": 55.33562088012695, + "learning_rate": 2.941712932593079e-06, + "loss": 5.4714, + "step": 9200 + }, + { + "epoch": 0.014879976192038093, + "grad_norm": 38.538246154785156, + "learning_rate": 2.9737124206012707e-06, + "loss": 5.4786, + "step": 9300 + }, + { + "epoch": 0.015039975936038502, + "grad_norm": 43.42023468017578, + "learning_rate": 3.0057119086094625e-06, + "loss": 5.3928, + "step": 9400 + }, + { + "epoch": 0.015199975680038912, + "grad_norm": 24.861467361450195, + "learning_rate": 3.037391401737572e-06, + "loss": 5.4774, + "step": 9500 + }, + { + "epoch": 0.01535997542403932, + "grad_norm": 98.92141723632812, + "learning_rate": 3.0693908897457643e-06, + "loss": 5.2881, + "step": 9600 + }, + { + "epoch": 0.015519975168039732, + "grad_norm": 62.839866638183594, + "learning_rate": 3.101390377753956e-06, + "loss": 5.3699, + "step": 9700 + }, + { + "epoch": 0.01567997491204014, + "grad_norm": 46.006065368652344, + "learning_rate": 3.133069870882066e-06, + "loss": 5.1483, + "step": 9800 + }, + { + "epoch": 0.01583997465604055, + "grad_norm": 89.62445068359375, + "learning_rate": 3.1650693588902583e-06, + "loss": 5.3051, + "step": 9900 + }, + { + "epoch": 0.01599997440004096, + "grad_norm": 41.113609313964844, + "learning_rate": 3.19706884689845e-06, + "loss": 5.2546, + "step": 10000 + }, + { + "epoch": 0.01615997414404137, + "grad_norm": 46.37376403808594, + "learning_rate": 3.2290683349066414e-06, + "loss": 5.2314, + "step": 10100 + }, + { + "epoch": 0.01631997388804178, + "grad_norm": 60.3846321105957, + "learning_rate": 3.2610678229148337e-06, + "loss": 5.1783, + "step": 10200 + }, + { + "epoch": 0.016479973632042188, + "grad_norm": 145.4359130859375, + "learning_rate": 3.2930673109230254e-06, + "loss": 5.2074, + "step": 10300 + }, + { + "epoch": 0.0166399733760426, + "grad_norm": 69.00183868408203, + "learning_rate": 3.325066798931217e-06, + "loss": 5.2825, + "step": 10400 + }, + { + "epoch": 0.01679997312004301, + "grad_norm": 48.03580093383789, + "learning_rate": 3.3570662869394094e-06, + "loss": 5.1715, + "step": 10500 + }, + { + "epoch": 0.016959972864043417, + "grad_norm": 58.56736755371094, + "learning_rate": 3.389065774947601e-06, + "loss": 5.087, + "step": 10600 + }, + { + "epoch": 0.017119972608043828, + "grad_norm": 54.484527587890625, + "learning_rate": 3.421065262955793e-06, + "loss": 5.082, + "step": 10700 + }, + { + "epoch": 0.017279972352044235, + "grad_norm": 74.30866241455078, + "learning_rate": 3.4530647509639847e-06, + "loss": 4.9111, + "step": 10800 + }, + { + "epoch": 0.017439972096044646, + "grad_norm": 60.489505767822266, + "learning_rate": 3.4850642389721765e-06, + "loss": 5.0213, + "step": 10900 + }, + { + "epoch": 0.017599971840045057, + "grad_norm": 61.25093460083008, + "learning_rate": 3.5170637269803687e-06, + "loss": 4.9898, + "step": 11000 + }, + { + "epoch": 0.017759971584045464, + "grad_norm": 51.98568344116211, + "learning_rate": 3.5490632149885605e-06, + "loss": 4.7734, + "step": 11100 + }, + { + "epoch": 0.017919971328045875, + "grad_norm": 64.08167266845703, + "learning_rate": 3.581062702996752e-06, + "loss": 4.9511, + "step": 11200 + }, + { + "epoch": 0.018079971072046286, + "grad_norm": 61.8354606628418, + "learning_rate": 3.613062191004944e-06, + "loss": 5.0481, + "step": 11300 + }, + { + "epoch": 0.018239970816046693, + "grad_norm": 97.53675842285156, + "learning_rate": 3.645061679013136e-06, + "loss": 4.8441, + 
"step": 11400 + }, + { + "epoch": 0.018399970560047104, + "grad_norm": 49.35017013549805, + "learning_rate": 3.677061167021328e-06, + "loss": 4.873, + "step": 11500 + }, + { + "epoch": 0.018559970304047515, + "grad_norm": 44.33409118652344, + "learning_rate": 3.70906065502952e-06, + "loss": 4.9988, + "step": 11600 + }, + { + "epoch": 0.018719970048047922, + "grad_norm": 140.5505828857422, + "learning_rate": 3.741060143037712e-06, + "loss": 4.7653, + "step": 11700 + }, + { + "epoch": 0.018879969792048333, + "grad_norm": 68.21163177490234, + "learning_rate": 3.7730596310459034e-06, + "loss": 4.804, + "step": 11800 + }, + { + "epoch": 0.019039969536048744, + "grad_norm": 48.678226470947266, + "learning_rate": 3.805059119054095e-06, + "loss": 4.8288, + "step": 11900 + }, + { + "epoch": 0.01919996928004915, + "grad_norm": 76.32611083984375, + "learning_rate": 3.837058607062287e-06, + "loss": 4.7053, + "step": 12000 + }, + { + "epoch": 0.019359969024049562, + "grad_norm": 70.85586547851562, + "learning_rate": 3.869058095070479e-06, + "loss": 4.6887, + "step": 12100 + }, + { + "epoch": 0.01951996876804997, + "grad_norm": 66.46036529541016, + "learning_rate": 3.901057583078671e-06, + "loss": 4.7832, + "step": 12200 + }, + { + "epoch": 0.01967996851205038, + "grad_norm": 165.13221740722656, + "learning_rate": 3.9330570710868636e-06, + "loss": 4.6817, + "step": 12300 + }, + { + "epoch": 0.01983996825605079, + "grad_norm": 118.48895263671875, + "learning_rate": 3.965056559095055e-06, + "loss": 4.6252, + "step": 12400 + }, + { + "epoch": 0.0199999680000512, + "grad_norm": 64.3436050415039, + "learning_rate": 3.997056047103246e-06, + "loss": 4.5936, + "step": 12500 + }, + { + "epoch": 0.02015996774405161, + "grad_norm": 42.27592468261719, + "learning_rate": 4.0290555351114385e-06, + "loss": 4.7452, + "step": 12600 + }, + { + "epoch": 0.02031996748805202, + "grad_norm": 60.829036712646484, + "learning_rate": 4.061055023119631e-06, + "loss": 4.5321, + "step": 12700 + }, + { + "epoch": 0.020479967232052428, + "grad_norm": 161.975830078125, + "learning_rate": 4.093054511127823e-06, + "loss": 4.4964, + "step": 12800 + }, + { + "epoch": 0.02063996697605284, + "grad_norm": 99.2963638305664, + "learning_rate": 4.125053999136014e-06, + "loss": 4.4421, + "step": 12900 + }, + { + "epoch": 0.02079996672005325, + "grad_norm": 68.78880310058594, + "learning_rate": 4.156733492264124e-06, + "loss": 4.3782, + "step": 13000 + }, + { + "epoch": 0.020959966464053657, + "grad_norm": 80.74951171875, + "learning_rate": 4.188732980272316e-06, + "loss": 4.5169, + "step": 13100 + }, + { + "epoch": 0.021119966208054067, + "grad_norm": 157.87254333496094, + "learning_rate": 4.220412473400426e-06, + "loss": 4.533, + "step": 13200 + }, + { + "epoch": 0.02127996595205448, + "grad_norm": 148.68331909179688, + "learning_rate": 4.252411961408618e-06, + "loss": 4.3725, + "step": 13300 + }, + { + "epoch": 0.021439965696054886, + "grad_norm": 72.9531021118164, + "learning_rate": 4.28441144941681e-06, + "loss": 4.2911, + "step": 13400 + }, + { + "epoch": 0.021599965440055297, + "grad_norm": 73.24847412109375, + "learning_rate": 4.316410937425001e-06, + "loss": 4.2261, + "step": 13500 + }, + { + "epoch": 0.021759965184055704, + "grad_norm": 94.57313537597656, + "learning_rate": 4.348410425433194e-06, + "loss": 4.2467, + "step": 13600 + }, + { + "epoch": 0.021919964928056115, + "grad_norm": 105.674560546875, + "learning_rate": 4.380409913441385e-06, + "loss": 4.1558, + "step": 13700 + }, + { + "epoch": 0.022079964672056526, + "grad_norm": 
63.658287048339844, + "learning_rate": 4.412409401449577e-06, + "loss": 4.2794, + "step": 13800 + }, + { + "epoch": 0.022239964416056933, + "grad_norm": 77.69287109375, + "learning_rate": 4.444408889457769e-06, + "loss": 4.2383, + "step": 13900 + }, + { + "epoch": 0.022399964160057344, + "grad_norm": 82.83360290527344, + "learning_rate": 4.4764083774659615e-06, + "loss": 4.1654, + "step": 14000 + }, + { + "epoch": 0.022559963904057755, + "grad_norm": 47.373531341552734, + "learning_rate": 4.508407865474153e-06, + "loss": 4.158, + "step": 14100 + }, + { + "epoch": 0.022719963648058162, + "grad_norm": 97.64757537841797, + "learning_rate": 4.540407353482344e-06, + "loss": 4.1299, + "step": 14200 + }, + { + "epoch": 0.022879963392058573, + "grad_norm": 54.75618362426758, + "learning_rate": 4.5724068414905365e-06, + "loss": 4.1902, + "step": 14300 + }, + { + "epoch": 0.023039963136058984, + "grad_norm": 258.4887390136719, + "learning_rate": 4.604406329498729e-06, + "loss": 3.7853, + "step": 14400 + }, + { + "epoch": 0.02319996288005939, + "grad_norm": 104.63798522949219, + "learning_rate": 4.63640581750692e-06, + "loss": 4.0514, + "step": 14500 + }, + { + "epoch": 0.023359962624059802, + "grad_norm": 60.090843200683594, + "learning_rate": 4.668405305515112e-06, + "loss": 4.1655, + "step": 14600 + }, + { + "epoch": 0.023519962368060213, + "grad_norm": 44.36670684814453, + "learning_rate": 4.7004047935233036e-06, + "loss": 4.051, + "step": 14700 + }, + { + "epoch": 0.02367996211206062, + "grad_norm": 41.61213302612305, + "learning_rate": 4.732404281531496e-06, + "loss": 4.078, + "step": 14800 + }, + { + "epoch": 0.02383996185606103, + "grad_norm": 73.2448501586914, + "learning_rate": 4.764403769539688e-06, + "loss": 4.1193, + "step": 14900 + }, + { + "epoch": 0.023999961600061438, + "grad_norm": 77.30301666259766, + "learning_rate": 4.796403257547879e-06, + "loss": 4.1536, + "step": 15000 + }, + { + "epoch": 0.02415996134406185, + "grad_norm": 48.1458854675293, + "learning_rate": 4.8284027455560715e-06, + "loss": 3.935, + "step": 15100 + }, + { + "epoch": 0.02431996108806226, + "grad_norm": 129.59295654296875, + "learning_rate": 4.860402233564263e-06, + "loss": 3.9535, + "step": 15200 + }, + { + "epoch": 0.024479960832062667, + "grad_norm": 163.0813751220703, + "learning_rate": 4.892401721572455e-06, + "loss": 3.7051, + "step": 15300 + }, + { + "epoch": 0.024639960576063078, + "grad_norm": 102.2786865234375, + "learning_rate": 4.924401209580647e-06, + "loss": 3.8329, + "step": 15400 + }, + { + "epoch": 0.02479996032006349, + "grad_norm": 160.66392517089844, + "learning_rate": 4.956400697588839e-06, + "loss": 3.9412, + "step": 15500 + }, + { + "epoch": 0.024959960064063896, + "grad_norm": 136.77218627929688, + "learning_rate": 4.988400185597031e-06, + "loss": 3.6668, + "step": 15600 + }, + { + "epoch": 0.025119959808064307, + "grad_norm": 63.87991714477539, + "learning_rate": 5.0200796787251404e-06, + "loss": 3.7758, + "step": 15700 + }, + { + "epoch": 0.025279959552064718, + "grad_norm": 352.977294921875, + "learning_rate": 5.052079166733333e-06, + "loss": 3.8805, + "step": 15800 + }, + { + "epoch": 0.025439959296065125, + "grad_norm": 148.54776000976562, + "learning_rate": 5.084078654741524e-06, + "loss": 3.8848, + "step": 15900 + }, + { + "epoch": 0.025599959040065536, + "grad_norm": 105.01113891601562, + "learning_rate": 5.116078142749716e-06, + "loss": 3.75, + "step": 16000 + }, + { + "epoch": 0.025759958784065947, + "grad_norm": 170.62828063964844, + "learning_rate": 5.148077630757908e-06, + 
"loss": 3.5685, + "step": 16100 + }, + { + "epoch": 0.025919958528066354, + "grad_norm": 164.85324096679688, + "learning_rate": 5.180077118766101e-06, + "loss": 3.7016, + "step": 16200 + }, + { + "epoch": 0.026079958272066765, + "grad_norm": 79.85810852050781, + "learning_rate": 5.212076606774292e-06, + "loss": 4.0955, + "step": 16300 + }, + { + "epoch": 0.026239958016067173, + "grad_norm": 109.73529815673828, + "learning_rate": 5.244076094782484e-06, + "loss": 3.7577, + "step": 16400 + }, + { + "epoch": 0.026399957760067583, + "grad_norm": 105.98066711425781, + "learning_rate": 5.276075582790676e-06, + "loss": 3.7485, + "step": 16500 + }, + { + "epoch": 0.026559957504067994, + "grad_norm": 71.02545166015625, + "learning_rate": 5.3080750707988686e-06, + "loss": 3.8263, + "step": 16600 + }, + { + "epoch": 0.0267199572480684, + "grad_norm": 245.44224548339844, + "learning_rate": 5.340074558807059e-06, + "loss": 3.6922, + "step": 16700 + }, + { + "epoch": 0.026879956992068813, + "grad_norm": 42.178157806396484, + "learning_rate": 5.372074046815251e-06, + "loss": 3.6568, + "step": 16800 + }, + { + "epoch": 0.027039956736069223, + "grad_norm": 114.55894470214844, + "learning_rate": 5.404073534823443e-06, + "loss": 3.7317, + "step": 16900 + }, + { + "epoch": 0.02719995648006963, + "grad_norm": 86.70626831054688, + "learning_rate": 5.436073022831635e-06, + "loss": 3.5089, + "step": 17000 + }, + { + "epoch": 0.02735995622407004, + "grad_norm": 202.02505493164062, + "learning_rate": 5.468072510839827e-06, + "loss": 3.7377, + "step": 17100 + }, + { + "epoch": 0.027519955968070452, + "grad_norm": 114.00701141357422, + "learning_rate": 5.500071998848019e-06, + "loss": 3.6206, + "step": 17200 + }, + { + "epoch": 0.02767995571207086, + "grad_norm": 152.38311767578125, + "learning_rate": 5.532071486856211e-06, + "loss": 3.3702, + "step": 17300 + }, + { + "epoch": 0.02783995545607127, + "grad_norm": 156.1048126220703, + "learning_rate": 5.564070974864403e-06, + "loss": 3.5126, + "step": 17400 + }, + { + "epoch": 0.02799995520007168, + "grad_norm": 117.87386322021484, + "learning_rate": 5.596070462872595e-06, + "loss": 3.4841, + "step": 17500 + }, + { + "epoch": 0.02815995494407209, + "grad_norm": 616.7991333007812, + "learning_rate": 5.628069950880786e-06, + "loss": 3.1464, + "step": 17600 + }, + { + "epoch": 0.0283199546880725, + "grad_norm": 131.32760620117188, + "learning_rate": 5.6600694388889786e-06, + "loss": 3.7012, + "step": 17700 + }, + { + "epoch": 0.028479954432072907, + "grad_norm": 60.172969818115234, + "learning_rate": 5.69206892689717e-06, + "loss": 3.5802, + "step": 17800 + }, + { + "epoch": 0.028639954176073318, + "grad_norm": 169.24374389648438, + "learning_rate": 5.724068414905361e-06, + "loss": 3.4952, + "step": 17900 + }, + { + "epoch": 0.02879995392007373, + "grad_norm": 158.77391052246094, + "learning_rate": 5.7560679029135535e-06, + "loss": 3.1174, + "step": 18000 + }, + { + "epoch": 0.028959953664074136, + "grad_norm": 218.98867797851562, + "learning_rate": 5.787747396041664e-06, + "loss": 3.3134, + "step": 18100 + }, + { + "epoch": 0.029119953408074547, + "grad_norm": 185.3249053955078, + "learning_rate": 5.819746884049856e-06, + "loss": 3.3578, + "step": 18200 + }, + { + "epoch": 0.029279953152074958, + "grad_norm": 93.69242858886719, + "learning_rate": 5.851746372058048e-06, + "loss": 3.0209, + "step": 18300 + }, + { + "epoch": 0.029439952896075365, + "grad_norm": 85.82784271240234, + "learning_rate": 5.883745860066239e-06, + "loss": 3.3796, + "step": 18400 + }, + { + "epoch": 
0.029599952640075776, + "grad_norm": 125.96697998046875, + "learning_rate": 5.915745348074431e-06, + "loss": 3.2287, + "step": 18500 + }, + { + "epoch": 0.029759952384076187, + "grad_norm": 235.71075439453125, + "learning_rate": 5.947744836082623e-06, + "loss": 3.1537, + "step": 18600 + }, + { + "epoch": 0.029919952128076594, + "grad_norm": 139.5558319091797, + "learning_rate": 5.979744324090815e-06, + "loss": 2.9073, + "step": 18700 + }, + { + "epoch": 0.030079951872077005, + "grad_norm": 204.2928924560547, + "learning_rate": 6.011743812099007e-06, + "loss": 3.3444, + "step": 18800 + }, + { + "epoch": 0.030239951616077416, + "grad_norm": 165.4457244873047, + "learning_rate": 6.043743300107199e-06, + "loss": 3.1341, + "step": 18900 + }, + { + "epoch": 0.030399951360077823, + "grad_norm": 66.5983657836914, + "learning_rate": 6.07574278811539e-06, + "loss": 2.8862, + "step": 19000 + }, + { + "epoch": 0.030559951104078234, + "grad_norm": 219.95774841308594, + "learning_rate": 6.1077422761235826e-06, + "loss": 3.2033, + "step": 19100 + }, + { + "epoch": 0.03071995084807864, + "grad_norm": 125.15766906738281, + "learning_rate": 6.139741764131775e-06, + "loss": 3.2764, + "step": 19200 + }, + { + "epoch": 0.030879950592079052, + "grad_norm": 207.95970153808594, + "learning_rate": 6.171741252139967e-06, + "loss": 3.0725, + "step": 19300 + }, + { + "epoch": 0.031039950336079463, + "grad_norm": 368.32781982421875, + "learning_rate": 6.203740740148158e-06, + "loss": 3.0436, + "step": 19400 + }, + { + "epoch": 0.03119995008007987, + "grad_norm": 412.2764587402344, + "learning_rate": 6.23574022815635e-06, + "loss": 3.3493, + "step": 19500 + }, + { + "epoch": 0.03135994982408028, + "grad_norm": 155.46766662597656, + "learning_rate": 6.267739716164542e-06, + "loss": 3.0141, + "step": 19600 + }, + { + "epoch": 0.03151994956808069, + "grad_norm": 89.32569885253906, + "learning_rate": 6.299739204172733e-06, + "loss": 2.779, + "step": 19700 + }, + { + "epoch": 0.0316799493120811, + "grad_norm": 241.4378204345703, + "learning_rate": 6.3317386921809254e-06, + "loss": 3.3543, + "step": 19800 + }, + { + "epoch": 0.03183994905608151, + "grad_norm": 13.20569133758545, + "learning_rate": 6.363738180189118e-06, + "loss": 3.1526, + "step": 19900 + }, + { + "epoch": 0.03199994880008192, + "grad_norm": 270.6402893066406, + "learning_rate": 6.395737668197309e-06, + "loss": 2.7896, + "step": 20000 + }, + { + "epoch": 0.03215994854408233, + "grad_norm": 106.38632202148438, + "learning_rate": 6.427737156205501e-06, + "loss": 2.9398, + "step": 20100 + }, + { + "epoch": 0.03231994828808274, + "grad_norm": 191.7210693359375, + "learning_rate": 6.459416649333611e-06, + "loss": 3.1254, + "step": 20200 + }, + { + "epoch": 0.03247994803208315, + "grad_norm": 143.96151733398438, + "learning_rate": 6.491416137341803e-06, + "loss": 2.8832, + "step": 20300 + }, + { + "epoch": 0.03263994777608356, + "grad_norm": 150.26368713378906, + "learning_rate": 6.523415625349994e-06, + "loss": 3.0542, + "step": 20400 + }, + { + "epoch": 0.032799947520083965, + "grad_norm": 178.11705017089844, + "learning_rate": 6.5554151133581865e-06, + "loss": 2.9722, + "step": 20500 + }, + { + "epoch": 0.032959947264084376, + "grad_norm": 222.4794921875, + "learning_rate": 6.587414601366379e-06, + "loss": 2.9321, + "step": 20600 + }, + { + "epoch": 0.03311994700808479, + "grad_norm": 155.37796020507812, + "learning_rate": 6.619414089374571e-06, + "loss": 2.6448, + "step": 20700 + }, + { + "epoch": 0.0332799467520852, + "grad_norm": 155.5786590576172, + 
"learning_rate": 6.651413577382762e-06, + "loss": 3.4006, + "step": 20800 + }, + { + "epoch": 0.03343994649608561, + "grad_norm": 684.525146484375, + "learning_rate": 6.6834130653909545e-06, + "loss": 3.0022, + "step": 20900 + }, + { + "epoch": 0.03359994624008602, + "grad_norm": 545.5623168945312, + "learning_rate": 6.715412553399147e-06, + "loss": 2.6366, + "step": 21000 + }, + { + "epoch": 0.03375994598408642, + "grad_norm": 292.9093017578125, + "learning_rate": 6.747412041407339e-06, + "loss": 3.0112, + "step": 21100 + }, + { + "epoch": 0.033919945728086834, + "grad_norm": 2.531680107116699, + "learning_rate": 6.7794115294155294e-06, + "loss": 2.7856, + "step": 21200 + }, + { + "epoch": 0.034079945472087245, + "grad_norm": 216.7860565185547, + "learning_rate": 6.811411017423722e-06, + "loss": 3.0967, + "step": 21300 + }, + { + "epoch": 0.034239945216087656, + "grad_norm": 138.73028564453125, + "learning_rate": 6.843410505431913e-06, + "loss": 2.8754, + "step": 21400 + }, + { + "epoch": 0.034399944960088066, + "grad_norm": 78.2362060546875, + "learning_rate": 6.875409993440105e-06, + "loss": 3.1269, + "step": 21500 + }, + { + "epoch": 0.03455994470408847, + "grad_norm": 144.1228790283203, + "learning_rate": 6.907409481448297e-06, + "loss": 2.8235, + "step": 21600 + }, + { + "epoch": 0.03471994444808888, + "grad_norm": 275.1159973144531, + "learning_rate": 6.93940896945649e-06, + "loss": 2.4912, + "step": 21700 + }, + { + "epoch": 0.03487994419208929, + "grad_norm": 216.12060546875, + "learning_rate": 6.971408457464681e-06, + "loss": 2.5079, + "step": 21800 + }, + { + "epoch": 0.0350399439360897, + "grad_norm": 398.5049133300781, + "learning_rate": 7.003407945472873e-06, + "loss": 3.2942, + "step": 21900 + }, + { + "epoch": 0.035199943680090114, + "grad_norm": 116.13761901855469, + "learning_rate": 7.035407433481065e-06, + "loss": 2.4184, + "step": 22000 + }, + { + "epoch": 0.035359943424090524, + "grad_norm": 425.1556091308594, + "learning_rate": 7.067406921489257e-06, + "loss": 2.782, + "step": 22100 + }, + { + "epoch": 0.03551994316809093, + "grad_norm": 17.029335021972656, + "learning_rate": 7.099086414617366e-06, + "loss": 2.7652, + "step": 22200 + }, + { + "epoch": 0.03567994291209134, + "grad_norm": 307.45343017578125, + "learning_rate": 7.1310859026255585e-06, + "loss": 3.113, + "step": 22300 + }, + { + "epoch": 0.03583994265609175, + "grad_norm": 69.89311981201172, + "learning_rate": 7.163085390633751e-06, + "loss": 2.7451, + "step": 22400 + }, + { + "epoch": 0.03599994240009216, + "grad_norm": 28.0865535736084, + "learning_rate": 7.195084878641943e-06, + "loss": 2.7473, + "step": 22500 + }, + { + "epoch": 0.03615994214409257, + "grad_norm": 108.03202056884766, + "learning_rate": 7.227084366650134e-06, + "loss": 2.5116, + "step": 22600 + }, + { + "epoch": 0.03631994188809298, + "grad_norm": 299.888427734375, + "learning_rate": 7.2590838546583265e-06, + "loss": 2.8531, + "step": 22700 + }, + { + "epoch": 0.036479941632093386, + "grad_norm": 87.79664611816406, + "learning_rate": 7.291083342666519e-06, + "loss": 2.9171, + "step": 22800 + }, + { + "epoch": 0.0366399413760938, + "grad_norm": 388.6871337890625, + "learning_rate": 7.323082830674709e-06, + "loss": 2.7954, + "step": 22900 + }, + { + "epoch": 0.03679994112009421, + "grad_norm": 87.27410888671875, + "learning_rate": 7.355082318682901e-06, + "loss": 2.5376, + "step": 23000 + }, + { + "epoch": 0.03695994086409462, + "grad_norm": 159.74534606933594, + "learning_rate": 7.387081806691094e-06, + "loss": 3.2488, + "step": 23100 + 
}, + { + "epoch": 0.03711994060809503, + "grad_norm": 169.96243286132812, + "learning_rate": 7.419081294699285e-06, + "loss": 2.6131, + "step": 23200 + }, + { + "epoch": 0.037279940352095434, + "grad_norm": 221.1896514892578, + "learning_rate": 7.451080782707477e-06, + "loss": 3.1343, + "step": 23300 + }, + { + "epoch": 0.037439940096095845, + "grad_norm": 67.28482818603516, + "learning_rate": 7.482760275835588e-06, + "loss": 2.3159, + "step": 23400 + }, + { + "epoch": 0.037599939840096255, + "grad_norm": 341.05975341796875, + "learning_rate": 7.514759763843779e-06, + "loss": 2.4225, + "step": 23500 + }, + { + "epoch": 0.037759939584096666, + "grad_norm": 250.44683837890625, + "learning_rate": 7.54675925185197e-06, + "loss": 2.5034, + "step": 23600 + }, + { + "epoch": 0.03791993932809708, + "grad_norm": 423.6518249511719, + "learning_rate": 7.5787587398601625e-06, + "loss": 3.0067, + "step": 23700 + }, + { + "epoch": 0.03807993907209749, + "grad_norm": 169.45944213867188, + "learning_rate": 7.610758227868355e-06, + "loss": 2.313, + "step": 23800 + }, + { + "epoch": 0.03823993881609789, + "grad_norm": 80.43399047851562, + "learning_rate": 7.642757715876546e-06, + "loss": 2.5363, + "step": 23900 + }, + { + "epoch": 0.0383999385600983, + "grad_norm": 248.08848571777344, + "learning_rate": 7.674757203884739e-06, + "loss": 2.7929, + "step": 24000 + }, + { + "epoch": 0.03855993830409871, + "grad_norm": 3.7647440433502197, + "learning_rate": 7.70675669189293e-06, + "loss": 2.617, + "step": 24100 + }, + { + "epoch": 0.038719938048099124, + "grad_norm": 3.100020170211792, + "learning_rate": 7.738756179901122e-06, + "loss": 2.9711, + "step": 24200 + }, + { + "epoch": 0.038879937792099535, + "grad_norm": 69.79640197753906, + "learning_rate": 7.770755667909315e-06, + "loss": 2.7726, + "step": 24300 + }, + { + "epoch": 0.03903993753609994, + "grad_norm": 190.2179412841797, + "learning_rate": 7.802755155917506e-06, + "loss": 2.5849, + "step": 24400 + }, + { + "epoch": 0.03919993728010035, + "grad_norm": 75.47491455078125, + "learning_rate": 7.834754643925698e-06, + "loss": 2.3231, + "step": 24500 + }, + { + "epoch": 0.03935993702410076, + "grad_norm": 13.3529691696167, + "learning_rate": 7.866754131933889e-06, + "loss": 2.2477, + "step": 24600 + }, + { + "epoch": 0.03951993676810117, + "grad_norm": 280.162109375, + "learning_rate": 7.89875361994208e-06, + "loss": 2.5487, + "step": 24700 + }, + { + "epoch": 0.03967993651210158, + "grad_norm": 376.9624938964844, + "learning_rate": 7.930753107950273e-06, + "loss": 2.5175, + "step": 24800 + }, + { + "epoch": 0.03983993625610199, + "grad_norm": 341.099609375, + "learning_rate": 7.962752595958465e-06, + "loss": 2.6758, + "step": 24900 + }, + { + "epoch": 0.0399999360001024, + "grad_norm": 436.5195007324219, + "learning_rate": 7.994752083966658e-06, + "loss": 2.7313, + "step": 25000 + }, + { + "epoch": 0.04015993574410281, + "grad_norm": 274.91363525390625, + "learning_rate": 8.026751571974849e-06, + "loss": 2.4846, + "step": 25100 + }, + { + "epoch": 0.04031993548810322, + "grad_norm": 183.5716094970703, + "learning_rate": 8.05875105998304e-06, + "loss": 2.8697, + "step": 25200 + }, + { + "epoch": 0.04047993523210363, + "grad_norm": 70.23844909667969, + "learning_rate": 8.090750547991234e-06, + "loss": 2.5289, + "step": 25300 + }, + { + "epoch": 0.04063993497610404, + "grad_norm": 139.3669891357422, + "learning_rate": 8.122750035999425e-06, + "loss": 2.235, + "step": 25400 + }, + { + "epoch": 0.04079993472010445, + "grad_norm": 242.79315185546875, + 
"learning_rate": 8.154429529127534e-06, + "loss": 2.5028, + "step": 25500 + }, + { + "epoch": 0.040959934464104855, + "grad_norm": 257.0070495605469, + "learning_rate": 8.186429017135727e-06, + "loss": 2.6295, + "step": 25600 + }, + { + "epoch": 0.041119934208105266, + "grad_norm": 314.8670959472656, + "learning_rate": 8.218428505143918e-06, + "loss": 2.6159, + "step": 25700 + }, + { + "epoch": 0.04127993395210568, + "grad_norm": 284.12762451171875, + "learning_rate": 8.250427993152111e-06, + "loss": 2.4447, + "step": 25800 + }, + { + "epoch": 0.04143993369610609, + "grad_norm": 5.427358627319336, + "learning_rate": 8.282427481160302e-06, + "loss": 2.7233, + "step": 25900 + }, + { + "epoch": 0.0415999334401065, + "grad_norm": 240.23260498046875, + "learning_rate": 8.314426969168494e-06, + "loss": 2.5651, + "step": 26000 + }, + { + "epoch": 0.0417599331841069, + "grad_norm": 15.093184471130371, + "learning_rate": 8.346426457176687e-06, + "loss": 2.1317, + "step": 26100 + }, + { + "epoch": 0.04191993292810731, + "grad_norm": 14.953177452087402, + "learning_rate": 8.378425945184878e-06, + "loss": 2.6157, + "step": 26200 + }, + { + "epoch": 0.042079932672107724, + "grad_norm": 242.84718322753906, + "learning_rate": 8.410105438312987e-06, + "loss": 2.7385, + "step": 26300 + }, + { + "epoch": 0.042239932416108135, + "grad_norm": 1.3409643173217773, + "learning_rate": 8.44210492632118e-06, + "loss": 2.4642, + "step": 26400 + }, + { + "epoch": 0.042399932160108546, + "grad_norm": 90.02801513671875, + "learning_rate": 8.474104414329371e-06, + "loss": 2.0621, + "step": 26500 + }, + { + "epoch": 0.04255993190410896, + "grad_norm": 11.879080772399902, + "learning_rate": 8.506103902337564e-06, + "loss": 2.3864, + "step": 26600 + }, + { + "epoch": 0.04271993164810936, + "grad_norm": 598.356689453125, + "learning_rate": 8.538103390345756e-06, + "loss": 2.6951, + "step": 26700 + }, + { + "epoch": 0.04287993139210977, + "grad_norm": 144.25924682617188, + "learning_rate": 8.570102878353947e-06, + "loss": 2.2628, + "step": 26800 + }, + { + "epoch": 0.04303993113611018, + "grad_norm": 521.1145629882812, + "learning_rate": 8.602102366362138e-06, + "loss": 2.7538, + "step": 26900 + }, + { + "epoch": 0.04319993088011059, + "grad_norm": 86.13031005859375, + "learning_rate": 8.63410185437033e-06, + "loss": 2.6871, + "step": 27000 + }, + { + "epoch": 0.043359930624111004, + "grad_norm": 268.4532775878906, + "learning_rate": 8.666101342378523e-06, + "loss": 2.2453, + "step": 27100 + }, + { + "epoch": 0.04351993036811141, + "grad_norm": 531.1592407226562, + "learning_rate": 8.698100830386714e-06, + "loss": 1.6334, + "step": 27200 + }, + { + "epoch": 0.04367993011211182, + "grad_norm": 166.83230590820312, + "learning_rate": 8.730100318394906e-06, + "loss": 2.666, + "step": 27300 + }, + { + "epoch": 0.04383992985611223, + "grad_norm": 208.4716033935547, + "learning_rate": 8.762099806403099e-06, + "loss": 2.128, + "step": 27400 + }, + { + "epoch": 0.04399992960011264, + "grad_norm": 257.9130859375, + "learning_rate": 8.79409929441129e-06, + "loss": 2.7573, + "step": 27500 + }, + { + "epoch": 0.04415992934411305, + "grad_norm": 85.08763885498047, + "learning_rate": 8.826098782419481e-06, + "loss": 2.5276, + "step": 27600 + }, + { + "epoch": 0.04431992908811346, + "grad_norm": 8.960221290588379, + "learning_rate": 8.858098270427674e-06, + "loss": 2.2438, + "step": 27700 + }, + { + "epoch": 0.044479928832113866, + "grad_norm": 404.66558837890625, + "learning_rate": 8.890097758435866e-06, + "loss": 2.3156, + "step": 27800 + 
}, + { + "epoch": 0.04463992857611428, + "grad_norm": 151.23495483398438, + "learning_rate": 8.922097246444059e-06, + "loss": 2.1735, + "step": 27900 + }, + { + "epoch": 0.04479992832011469, + "grad_norm": 151.7221221923828, + "learning_rate": 8.95409673445225e-06, + "loss": 2.1733, + "step": 28000 + }, + { + "epoch": 0.0449599280641151, + "grad_norm": 228.95974731445312, + "learning_rate": 8.986096222460441e-06, + "loss": 2.4094, + "step": 28100 + }, + { + "epoch": 0.04511992780811551, + "grad_norm": 522.6806640625, + "learning_rate": 9.018095710468633e-06, + "loss": 2.8484, + "step": 28200 + }, + { + "epoch": 0.04527992755211592, + "grad_norm": 16.065011978149414, + "learning_rate": 9.050095198476824e-06, + "loss": 2.4507, + "step": 28300 + }, + { + "epoch": 0.045439927296116324, + "grad_norm": 227.2984619140625, + "learning_rate": 9.082094686485017e-06, + "loss": 2.6822, + "step": 28400 + }, + { + "epoch": 0.045599927040116735, + "grad_norm": 430.3262634277344, + "learning_rate": 9.114094174493209e-06, + "loss": 2.1191, + "step": 28500 + }, + { + "epoch": 0.045759926784117146, + "grad_norm": 0.1830236166715622, + "learning_rate": 9.1460936625014e-06, + "loss": 2.0696, + "step": 28600 + }, + { + "epoch": 0.045919926528117556, + "grad_norm": 97.45941162109375, + "learning_rate": 9.178093150509593e-06, + "loss": 2.4027, + "step": 28700 + }, + { + "epoch": 0.04607992627211797, + "grad_norm": 22.469968795776367, + "learning_rate": 9.210092638517784e-06, + "loss": 1.7958, + "step": 28800 + }, + { + "epoch": 0.04623992601611837, + "grad_norm": 103.27215576171875, + "learning_rate": 9.242092126525977e-06, + "loss": 2.5874, + "step": 28900 + }, + { + "epoch": 0.04639992576011878, + "grad_norm": 578.951171875, + "learning_rate": 9.274091614534169e-06, + "loss": 2.2679, + "step": 29000 + }, + { + "epoch": 0.04655992550411919, + "grad_norm": 6.261137008666992, + "learning_rate": 9.30609110254236e-06, + "loss": 2.6394, + "step": 29100 + }, + { + "epoch": 0.046719925248119604, + "grad_norm": 113.35989379882812, + "learning_rate": 9.338090590550551e-06, + "loss": 1.7998, + "step": 29200 + }, + { + "epoch": 0.046879924992120015, + "grad_norm": 116.46363830566406, + "learning_rate": 9.370090078558743e-06, + "loss": 2.6834, + "step": 29300 + }, + { + "epoch": 0.047039924736120425, + "grad_norm": 84.5538101196289, + "learning_rate": 9.402089566566936e-06, + "loss": 2.1242, + "step": 29400 + }, + { + "epoch": 0.04719992448012083, + "grad_norm": 150.44454956054688, + "learning_rate": 9.434089054575127e-06, + "loss": 2.0039, + "step": 29500 + }, + { + "epoch": 0.04735992422412124, + "grad_norm": 12.482616424560547, + "learning_rate": 9.466088542583319e-06, + "loss": 2.018, + "step": 29600 + }, + { + "epoch": 0.04751992396812165, + "grad_norm": 1.1050609350204468, + "learning_rate": 9.498088030591512e-06, + "loss": 2.9357, + "step": 29700 + }, + { + "epoch": 0.04767992371212206, + "grad_norm": 256.4771423339844, + "learning_rate": 9.530087518599703e-06, + "loss": 2.1914, + "step": 29800 + }, + { + "epoch": 0.04783992345612247, + "grad_norm": 178.9422149658203, + "learning_rate": 9.562087006607896e-06, + "loss": 2.0968, + "step": 29900 + }, + { + "epoch": 0.047999923200122876, + "grad_norm": 160.1494140625, + "learning_rate": 9.594086494616087e-06, + "loss": 1.9762, + "step": 30000 + }, + { + "epoch": 0.04815992294412329, + "grad_norm": 86.46272277832031, + "learning_rate": 9.626085982624279e-06, + "loss": 2.1436, + "step": 30100 + }, + { + "epoch": 0.0483199226881237, + "grad_norm": 76.13285064697266, + 
"learning_rate": 9.658085470632472e-06, + "loss": 2.1919, + "step": 30200 + }, + { + "epoch": 0.04847992243212411, + "grad_norm": 2.952242374420166, + "learning_rate": 9.690084958640661e-06, + "loss": 1.9683, + "step": 30300 + }, + { + "epoch": 0.04863992217612452, + "grad_norm": 33.4036979675293, + "learning_rate": 9.722084446648855e-06, + "loss": 2.3543, + "step": 30400 + }, + { + "epoch": 0.04879992192012493, + "grad_norm": 173.6257781982422, + "learning_rate": 9.753763939776965e-06, + "loss": 2.0642, + "step": 30500 + }, + { + "epoch": 0.048959921664125335, + "grad_norm": 0.08548393100500107, + "learning_rate": 9.785763427785156e-06, + "loss": 1.8447, + "step": 30600 + }, + { + "epoch": 0.049119921408125745, + "grad_norm": 111.82203674316406, + "learning_rate": 9.817762915793348e-06, + "loss": 2.3467, + "step": 30700 + }, + { + "epoch": 0.049279921152126156, + "grad_norm": 142.97500610351562, + "learning_rate": 9.84976240380154e-06, + "loss": 2.6461, + "step": 30800 + }, + { + "epoch": 0.04943992089612657, + "grad_norm": 417.88677978515625, + "learning_rate": 9.881761891809732e-06, + "loss": 2.028, + "step": 30900 + }, + { + "epoch": 0.04959992064012698, + "grad_norm": 4.543129920959473, + "learning_rate": 9.913761379817923e-06, + "loss": 1.4188, + "step": 31000 + }, + { + "epoch": 0.04975992038412739, + "grad_norm": 205.02293395996094, + "learning_rate": 9.945760867826115e-06, + "loss": 2.7219, + "step": 31100 + }, + { + "epoch": 0.04991992012812779, + "grad_norm": 123.40583038330078, + "learning_rate": 9.977760355834308e-06, + "loss": 2.2345, + "step": 31200 + }, + { + "epoch": 0.050079919872128204, + "grad_norm": 0.9410820603370667, + "learning_rate": 1.00097598438425e-05, + "loss": 2.201, + "step": 31300 + }, + { + "epoch": 0.050239919616128614, + "grad_norm": 51.27448272705078, + "learning_rate": 1.004175933185069e-05, + "loss": 2.092, + "step": 31400 + }, + { + "epoch": 0.050399919360129025, + "grad_norm": 258.7269592285156, + "learning_rate": 1.0073758819858884e-05, + "loss": 2.2871, + "step": 31500 + }, + { + "epoch": 0.050559919104129436, + "grad_norm": 108.11058044433594, + "learning_rate": 1.0105758307867075e-05, + "loss": 2.0167, + "step": 31600 + }, + { + "epoch": 0.05071991884812984, + "grad_norm": 229.5725555419922, + "learning_rate": 1.0137757795875266e-05, + "loss": 1.9175, + "step": 31700 + }, + { + "epoch": 0.05087991859213025, + "grad_norm": 204.41357421875, + "learning_rate": 1.016975728388346e-05, + "loss": 2.2229, + "step": 31800 + }, + { + "epoch": 0.05103991833613066, + "grad_norm": 8.951689720153809, + "learning_rate": 1.020175677189165e-05, + "loss": 2.1196, + "step": 31900 + }, + { + "epoch": 0.05119991808013107, + "grad_norm": 275.85198974609375, + "learning_rate": 1.0233756259899844e-05, + "loss": 2.2192, + "step": 32000 + }, + { + "epoch": 0.05135991782413148, + "grad_norm": 359.066650390625, + "learning_rate": 1.0265755747908035e-05, + "loss": 1.6462, + "step": 32100 + }, + { + "epoch": 0.051519917568131894, + "grad_norm": 0.10183493793010712, + "learning_rate": 1.0297755235916226e-05, + "loss": 2.099, + "step": 32200 + }, + { + "epoch": 0.0516799173121323, + "grad_norm": 43.3016357421875, + "learning_rate": 1.0329434729044337e-05, + "loss": 2.0914, + "step": 32300 + }, + { + "epoch": 0.05183991705613271, + "grad_norm": 97.42915344238281, + "learning_rate": 1.0361434217052528e-05, + "loss": 2.3295, + "step": 32400 + }, + { + "epoch": 0.05199991680013312, + "grad_norm": 1.9172292947769165, + "learning_rate": 1.039343370506072e-05, + "loss": 2.256, + 
"step": 32500 + }, + { + "epoch": 0.05215991654413353, + "grad_norm": 157.83743286132812, + "learning_rate": 1.0425433193068913e-05, + "loss": 1.7662, + "step": 32600 + }, + { + "epoch": 0.05231991628813394, + "grad_norm": 1.3025041818618774, + "learning_rate": 1.0457432681077104e-05, + "loss": 1.7234, + "step": 32700 + }, + { + "epoch": 0.052479916032134345, + "grad_norm": 234.8426971435547, + "learning_rate": 1.0489432169085297e-05, + "loss": 1.984, + "step": 32800 + }, + { + "epoch": 0.052639915776134756, + "grad_norm": 9.249500274658203, + "learning_rate": 1.0521431657093488e-05, + "loss": 2.1815, + "step": 32900 + }, + { + "epoch": 0.05279991552013517, + "grad_norm": 164.6519012451172, + "learning_rate": 1.055343114510168e-05, + "loss": 1.4987, + "step": 33000 + }, + { + "epoch": 0.05295991526413558, + "grad_norm": 145.9049072265625, + "learning_rate": 1.0585430633109873e-05, + "loss": 2.0034, + "step": 33100 + }, + { + "epoch": 0.05311991500813599, + "grad_norm": 79.73159790039062, + "learning_rate": 1.0617430121118062e-05, + "loss": 2.6008, + "step": 33200 + }, + { + "epoch": 0.0532799147521364, + "grad_norm": 131.95318603515625, + "learning_rate": 1.0649429609126254e-05, + "loss": 2.4585, + "step": 33300 + }, + { + "epoch": 0.0534399144961368, + "grad_norm": 44.75098419189453, + "learning_rate": 1.0681429097134445e-05, + "loss": 1.881, + "step": 33400 + }, + { + "epoch": 0.053599914240137214, + "grad_norm": 0.9141740202903748, + "learning_rate": 1.0713428585142638e-05, + "loss": 1.8738, + "step": 33500 + }, + { + "epoch": 0.053759913984137625, + "grad_norm": 248.49734497070312, + "learning_rate": 1.074542807315083e-05, + "loss": 1.9726, + "step": 33600 + }, + { + "epoch": 0.053919913728138036, + "grad_norm": 167.88706970214844, + "learning_rate": 1.0777427561159021e-05, + "loss": 2.3734, + "step": 33700 + }, + { + "epoch": 0.05407991347213845, + "grad_norm": 0.43971773982048035, + "learning_rate": 1.0809427049167214e-05, + "loss": 1.6898, + "step": 33800 + }, + { + "epoch": 0.05423991321613886, + "grad_norm": 713.7942504882812, + "learning_rate": 1.0841426537175405e-05, + "loss": 2.2171, + "step": 33900 + }, + { + "epoch": 0.05439991296013926, + "grad_norm": 57.55624771118164, + "learning_rate": 1.0873426025183598e-05, + "loss": 1.4453, + "step": 34000 + }, + { + "epoch": 0.05455991270413967, + "grad_norm": 409.5030822753906, + "learning_rate": 1.090542551319179e-05, + "loss": 1.5057, + "step": 34100 + }, + { + "epoch": 0.05471991244814008, + "grad_norm": 60.115047454833984, + "learning_rate": 1.0937425001199981e-05, + "loss": 2.1497, + "step": 34200 + }, + { + "epoch": 0.054879912192140494, + "grad_norm": 0.7692262530326843, + "learning_rate": 1.0969424489208174e-05, + "loss": 1.8618, + "step": 34300 + }, + { + "epoch": 0.055039911936140905, + "grad_norm": 698.8638916015625, + "learning_rate": 1.1001423977216366e-05, + "loss": 1.7878, + "step": 34400 + }, + { + "epoch": 0.05519991168014131, + "grad_norm": 0.5103877186775208, + "learning_rate": 1.1033423465224557e-05, + "loss": 1.8199, + "step": 34500 + }, + { + "epoch": 0.05535991142414172, + "grad_norm": 347.1667175292969, + "learning_rate": 1.106542295323275e-05, + "loss": 2.1649, + "step": 34600 + }, + { + "epoch": 0.05551991116814213, + "grad_norm": 99.95459747314453, + "learning_rate": 1.1097422441240941e-05, + "loss": 1.7906, + "step": 34700 + }, + { + "epoch": 0.05567991091214254, + "grad_norm": 211.90087890625, + "learning_rate": 1.1129421929249133e-05, + "loss": 1.6816, + "step": 34800 + }, + { + "epoch": 
0.05583991065614295, + "grad_norm": 60.790199279785156, + "learning_rate": 1.1161421417257326e-05, + "loss": 2.1464, + "step": 34900 + }, + { + "epoch": 0.05599991040014336, + "grad_norm": 585.09716796875, + "learning_rate": 1.1193420905265517e-05, + "loss": 2.0039, + "step": 35000 + }, + { + "epoch": 0.05615991014414377, + "grad_norm": 0.2061644047498703, + "learning_rate": 1.122542039327371e-05, + "loss": 1.735, + "step": 35100 + }, + { + "epoch": 0.05631990988814418, + "grad_norm": 204.5592498779297, + "learning_rate": 1.1257419881281901e-05, + "loss": 1.853, + "step": 35200 + }, + { + "epoch": 0.05647990963214459, + "grad_norm": 695.4961547851562, + "learning_rate": 1.1289419369290093e-05, + "loss": 1.6068, + "step": 35300 + }, + { + "epoch": 0.056639909376145, + "grad_norm": 220.02767944335938, + "learning_rate": 1.1321418857298282e-05, + "loss": 1.6349, + "step": 35400 + }, + { + "epoch": 0.05679990912014541, + "grad_norm": 0.07823936641216278, + "learning_rate": 1.1353418345306476e-05, + "loss": 1.9571, + "step": 35500 + }, + { + "epoch": 0.056959908864145814, + "grad_norm": 31.91838836669922, + "learning_rate": 1.1385417833314667e-05, + "loss": 1.5854, + "step": 35600 + }, + { + "epoch": 0.057119908608146225, + "grad_norm": 1040.179931640625, + "learning_rate": 1.1417417321322858e-05, + "loss": 1.9756, + "step": 35700 + }, + { + "epoch": 0.057279908352146636, + "grad_norm": 16.008800506591797, + "learning_rate": 1.1449416809331051e-05, + "loss": 1.9816, + "step": 35800 + }, + { + "epoch": 0.057439908096147047, + "grad_norm": 226.522705078125, + "learning_rate": 1.1481416297339243e-05, + "loss": 1.6758, + "step": 35900 + }, + { + "epoch": 0.05759990784014746, + "grad_norm": 85.04449462890625, + "learning_rate": 1.1513415785347436e-05, + "loss": 2.2583, + "step": 36000 + }, + { + "epoch": 0.05775990758414787, + "grad_norm": 3.989626884460449, + "learning_rate": 1.1545415273355627e-05, + "loss": 1.7584, + "step": 36100 + }, + { + "epoch": 0.05791990732814827, + "grad_norm": 63.272911071777344, + "learning_rate": 1.1577414761363818e-05, + "loss": 1.9894, + "step": 36200 + }, + { + "epoch": 0.05807990707214868, + "grad_norm": 175.4257049560547, + "learning_rate": 1.1609414249372011e-05, + "loss": 2.3922, + "step": 36300 + }, + { + "epoch": 0.058239906816149094, + "grad_norm": 160.36253356933594, + "learning_rate": 1.164109374250012e-05, + "loss": 2.0077, + "step": 36400 + }, + { + "epoch": 0.058399906560149505, + "grad_norm": 95.23787689208984, + "learning_rate": 1.1673093230508312e-05, + "loss": 2.3684, + "step": 36500 + }, + { + "epoch": 0.058559906304149915, + "grad_norm": 173.092041015625, + "learning_rate": 1.1705092718516505e-05, + "loss": 2.1103, + "step": 36600 + }, + { + "epoch": 0.058719906048150326, + "grad_norm": 719.3712768554688, + "learning_rate": 1.1736772211644613e-05, + "loss": 2.0728, + "step": 36700 + }, + { + "epoch": 0.05887990579215073, + "grad_norm": 1.7120122909545898, + "learning_rate": 1.1768771699652806e-05, + "loss": 1.9364, + "step": 36800 + }, + { + "epoch": 0.05903990553615114, + "grad_norm": 120.16387176513672, + "learning_rate": 1.1800771187660998e-05, + "loss": 2.5203, + "step": 36900 + }, + { + "epoch": 0.05919990528015155, + "grad_norm": 46.504329681396484, + "learning_rate": 1.1832770675669189e-05, + "loss": 1.8473, + "step": 37000 + }, + { + "epoch": 0.05935990502415196, + "grad_norm": 255.33987426757812, + "learning_rate": 1.1864770163677382e-05, + "loss": 1.8076, + "step": 37100 + }, + { + "epoch": 0.059519904768152374, + "grad_norm": 
130.05715942382812, + "learning_rate": 1.1896769651685574e-05, + "loss": 2.0157, + "step": 37200 + }, + { + "epoch": 0.05967990451215278, + "grad_norm": 201.22866821289062, + "learning_rate": 1.1928769139693765e-05, + "loss": 2.1587, + "step": 37300 + }, + { + "epoch": 0.05983990425615319, + "grad_norm": 0.2600236237049103, + "learning_rate": 1.1960768627701958e-05, + "loss": 1.9825, + "step": 37400 + }, + { + "epoch": 0.0599999040001536, + "grad_norm": 0.20701654255390167, + "learning_rate": 1.199276811571015e-05, + "loss": 2.0693, + "step": 37500 + }, + { + "epoch": 0.06015990374415401, + "grad_norm": 247.0039520263672, + "learning_rate": 1.202476760371834e-05, + "loss": 1.5505, + "step": 37600 + }, + { + "epoch": 0.06031990348815442, + "grad_norm": 15.698258399963379, + "learning_rate": 1.2056767091726534e-05, + "loss": 1.5472, + "step": 37700 + }, + { + "epoch": 0.06047990323215483, + "grad_norm": 357.7092590332031, + "learning_rate": 1.2088766579734725e-05, + "loss": 2.0568, + "step": 37800 + }, + { + "epoch": 0.060639902976155236, + "grad_norm": 54.52446365356445, + "learning_rate": 1.2120766067742918e-05, + "loss": 1.9219, + "step": 37900 + }, + { + "epoch": 0.060799902720155646, + "grad_norm": 240.81784057617188, + "learning_rate": 1.215276555575111e-05, + "loss": 2.091, + "step": 38000 + }, + { + "epoch": 0.06095990246415606, + "grad_norm": 0.14063161611557007, + "learning_rate": 1.21847650437593e-05, + "loss": 2.0523, + "step": 38100 + }, + { + "epoch": 0.06111990220815647, + "grad_norm": 101.88555145263672, + "learning_rate": 1.2216764531767494e-05, + "loss": 1.7628, + "step": 38200 + }, + { + "epoch": 0.06127990195215688, + "grad_norm": 1.7761729955673218, + "learning_rate": 1.2248764019775685e-05, + "loss": 1.8753, + "step": 38300 + }, + { + "epoch": 0.06143990169615728, + "grad_norm": 183.46917724609375, + "learning_rate": 1.2280763507783877e-05, + "loss": 1.846, + "step": 38400 + }, + { + "epoch": 0.061599901440157694, + "grad_norm": 0.008245576173067093, + "learning_rate": 1.231276299579207e-05, + "loss": 1.803, + "step": 38500 + }, + { + "epoch": 0.061759901184158104, + "grad_norm": 389.3524169921875, + "learning_rate": 1.2344762483800261e-05, + "loss": 2.1226, + "step": 38600 + }, + { + "epoch": 0.061919900928158515, + "grad_norm": 457.38519287109375, + "learning_rate": 1.2376761971808452e-05, + "loss": 2.0906, + "step": 38700 + }, + { + "epoch": 0.062079900672158926, + "grad_norm": 95.94575500488281, + "learning_rate": 1.2408441464936563e-05, + "loss": 1.4321, + "step": 38800 + }, + { + "epoch": 0.06223990041615934, + "grad_norm": 0.09420862793922424, + "learning_rate": 1.2440440952944754e-05, + "loss": 2.5214, + "step": 38900 + }, + { + "epoch": 0.06239990016015974, + "grad_norm": 7.472883224487305, + "learning_rate": 1.2472440440952947e-05, + "loss": 1.5412, + "step": 39000 + }, + { + "epoch": 0.06255989990416015, + "grad_norm": 198.42828369140625, + "learning_rate": 1.2504439928961139e-05, + "loss": 1.4382, + "step": 39100 + }, + { + "epoch": 0.06271989964816056, + "grad_norm": 1.2646727561950684, + "learning_rate": 1.253643941696933e-05, + "loss": 1.8417, + "step": 39200 + }, + { + "epoch": 0.06287989939216097, + "grad_norm": 85.20125579833984, + "learning_rate": 1.2568438904977523e-05, + "loss": 2.1105, + "step": 39300 + }, + { + "epoch": 0.06303989913616138, + "grad_norm": 6.063973903656006, + "learning_rate": 1.2600438392985714e-05, + "loss": 1.6347, + "step": 39400 + }, + { + "epoch": 0.0631998988801618, + "grad_norm": 1.7712761163711548, + "learning_rate": 
1.2632437880993904e-05, + "loss": 2.0372, + "step": 39500 + }, + { + "epoch": 0.0633598986241622, + "grad_norm": 105.22515106201172, + "learning_rate": 1.2664437369002095e-05, + "loss": 1.6222, + "step": 39600 + }, + { + "epoch": 0.06351989836816262, + "grad_norm": 152.34910583496094, + "learning_rate": 1.2696436857010288e-05, + "loss": 1.8033, + "step": 39700 + }, + { + "epoch": 0.06367989811216301, + "grad_norm": 0.4972204864025116, + "learning_rate": 1.272843634501848e-05, + "loss": 1.9847, + "step": 39800 + }, + { + "epoch": 0.06383989785616342, + "grad_norm": 145.8481903076172, + "learning_rate": 1.2760435833026673e-05, + "loss": 2.1354, + "step": 39900 + }, + { + "epoch": 0.06399989760016384, + "grad_norm": 0.4929490089416504, + "learning_rate": 1.2792435321034864e-05, + "loss": 1.6792, + "step": 40000 + }, + { + "epoch": 0.06415989734416425, + "grad_norm": 0.004757192451506853, + "learning_rate": 1.2824434809043055e-05, + "loss": 2.1055, + "step": 40100 + }, + { + "epoch": 0.06431989708816466, + "grad_norm": 133.86878967285156, + "learning_rate": 1.2856434297051249e-05, + "loss": 2.0657, + "step": 40200 + }, + { + "epoch": 0.06447989683216507, + "grad_norm": 75.14216613769531, + "learning_rate": 1.288843378505944e-05, + "loss": 1.9618, + "step": 40300 + }, + { + "epoch": 0.06463989657616548, + "grad_norm": 0.47655782103538513, + "learning_rate": 1.2920433273067631e-05, + "loss": 1.5807, + "step": 40400 + }, + { + "epoch": 0.06479989632016589, + "grad_norm": 0.25797244906425476, + "learning_rate": 1.2952432761075824e-05, + "loss": 1.6451, + "step": 40500 + }, + { + "epoch": 0.0649598960641663, + "grad_norm": 0.013840774074196815, + "learning_rate": 1.2984432249084016e-05, + "loss": 2.1299, + "step": 40600 + }, + { + "epoch": 0.06511989580816671, + "grad_norm": 0.016265127807855606, + "learning_rate": 1.3016431737092207e-05, + "loss": 1.9912, + "step": 40700 + }, + { + "epoch": 0.06527989555216712, + "grad_norm": 91.05821228027344, + "learning_rate": 1.30484312251004e-05, + "loss": 1.6392, + "step": 40800 + }, + { + "epoch": 0.06543989529616753, + "grad_norm": 0.5753430724143982, + "learning_rate": 1.3080430713108591e-05, + "loss": 1.8049, + "step": 40900 + }, + { + "epoch": 0.06559989504016793, + "grad_norm": 1.7056798934936523, + "learning_rate": 1.3112430201116784e-05, + "loss": 1.9832, + "step": 41000 + }, + { + "epoch": 0.06575989478416834, + "grad_norm": 115.96708679199219, + "learning_rate": 1.3144109694244893e-05, + "loss": 2.0309, + "step": 41100 + }, + { + "epoch": 0.06591989452816875, + "grad_norm": 128.86553955078125, + "learning_rate": 1.3176109182253085e-05, + "loss": 1.8362, + "step": 41200 + }, + { + "epoch": 0.06607989427216916, + "grad_norm": 8.644057273864746, + "learning_rate": 1.3208108670261278e-05, + "loss": 2.2709, + "step": 41300 + }, + { + "epoch": 0.06623989401616957, + "grad_norm": 105.3166732788086, + "learning_rate": 1.3240108158269469e-05, + "loss": 2.0785, + "step": 41400 + }, + { + "epoch": 0.06639989376016998, + "grad_norm": 66.77593231201172, + "learning_rate": 1.327210764627766e-05, + "loss": 1.5627, + "step": 41500 + }, + { + "epoch": 0.0665598935041704, + "grad_norm": 0.6800107359886169, + "learning_rate": 1.3304107134285853e-05, + "loss": 1.6058, + "step": 41600 + }, + { + "epoch": 0.0667198932481708, + "grad_norm": 138.5995330810547, + "learning_rate": 1.3336106622294045e-05, + "loss": 1.7099, + "step": 41700 + }, + { + "epoch": 0.06687989299217122, + "grad_norm": 0.2328547090291977, + "learning_rate": 1.3368106110302238e-05, + "loss": 1.7096, + 
"step": 41800 + }, + { + "epoch": 0.06703989273617163, + "grad_norm": 82.12950897216797, + "learning_rate": 1.3400105598310429e-05, + "loss": 1.6429, + "step": 41900 + }, + { + "epoch": 0.06719989248017204, + "grad_norm": 1.3431618213653564, + "learning_rate": 1.343210508631862e-05, + "loss": 1.2514, + "step": 42000 + }, + { + "epoch": 0.06735989222417244, + "grad_norm": 30.410139083862305, + "learning_rate": 1.3464104574326814e-05, + "loss": 1.5746, + "step": 42100 + }, + { + "epoch": 0.06751989196817285, + "grad_norm": 106.41495513916016, + "learning_rate": 1.3496104062335005e-05, + "loss": 1.7186, + "step": 42200 + }, + { + "epoch": 0.06767989171217326, + "grad_norm": 200.46978759765625, + "learning_rate": 1.3528103550343195e-05, + "loss": 1.8152, + "step": 42300 + }, + { + "epoch": 0.06783989145617367, + "grad_norm": 0.09822285175323486, + "learning_rate": 1.3560103038351386e-05, + "loss": 1.705, + "step": 42400 + }, + { + "epoch": 0.06799989120017408, + "grad_norm": 438.903564453125, + "learning_rate": 1.3592102526359579e-05, + "loss": 1.6779, + "step": 42500 + }, + { + "epoch": 0.06815989094417449, + "grad_norm": 0.03262553736567497, + "learning_rate": 1.362410201436777e-05, + "loss": 1.8157, + "step": 42600 + }, + { + "epoch": 0.0683198906881749, + "grad_norm": 376.0313720703125, + "learning_rate": 1.3656101502375962e-05, + "loss": 1.8464, + "step": 42700 + }, + { + "epoch": 0.06847989043217531, + "grad_norm": 29.421518325805664, + "learning_rate": 1.3688100990384155e-05, + "loss": 1.748, + "step": 42800 + }, + { + "epoch": 0.06863989017617572, + "grad_norm": 183.51832580566406, + "learning_rate": 1.3720100478392346e-05, + "loss": 1.6836, + "step": 42900 + }, + { + "epoch": 0.06879988992017613, + "grad_norm": 0.0013067092513665557, + "learning_rate": 1.3752099966400539e-05, + "loss": 1.65, + "step": 43000 + }, + { + "epoch": 0.06895988966417654, + "grad_norm": 0.006181403063237667, + "learning_rate": 1.378409945440873e-05, + "loss": 1.5632, + "step": 43100 + }, + { + "epoch": 0.06911988940817694, + "grad_norm": 0.134628027677536, + "learning_rate": 1.3816098942416922e-05, + "loss": 2.0987, + "step": 43200 + }, + { + "epoch": 0.06927988915217735, + "grad_norm": 235.39088439941406, + "learning_rate": 1.3848098430425115e-05, + "loss": 1.5783, + "step": 43300 + }, + { + "epoch": 0.06943988889617776, + "grad_norm": 89.28943634033203, + "learning_rate": 1.3880097918433306e-05, + "loss": 1.8029, + "step": 43400 + }, + { + "epoch": 0.06959988864017817, + "grad_norm": 197.04258728027344, + "learning_rate": 1.3911777411561415e-05, + "loss": 1.7154, + "step": 43500 + }, + { + "epoch": 0.06975988838417858, + "grad_norm": 96.05148315429688, + "learning_rate": 1.3943776899569608e-05, + "loss": 1.663, + "step": 43600 + }, + { + "epoch": 0.069919888128179, + "grad_norm": 8.378194808959961, + "learning_rate": 1.39757763875778e-05, + "loss": 1.4403, + "step": 43700 + }, + { + "epoch": 0.0700798878721794, + "grad_norm": 1932.4417724609375, + "learning_rate": 1.4007775875585992e-05, + "loss": 1.6513, + "step": 43800 + }, + { + "epoch": 0.07023988761617982, + "grad_norm": 185.06163024902344, + "learning_rate": 1.4039775363594184e-05, + "loss": 2.2041, + "step": 43900 + }, + { + "epoch": 0.07039988736018023, + "grad_norm": 1.7904412746429443, + "learning_rate": 1.4071774851602375e-05, + "loss": 2.3908, + "step": 44000 + }, + { + "epoch": 0.07055988710418064, + "grad_norm": 0.07365602254867554, + "learning_rate": 1.4103774339610568e-05, + "loss": 1.7153, + "step": 44100 + }, + { + "epoch": 
0.07071988684818105, + "grad_norm": 117.01744842529297, + "learning_rate": 1.413577382761876e-05, + "loss": 2.2112, + "step": 44200 + }, + { + "epoch": 0.07087988659218145, + "grad_norm": 509.897216796875, + "learning_rate": 1.4167773315626951e-05, + "loss": 1.8663, + "step": 44300 + }, + { + "epoch": 0.07103988633618186, + "grad_norm": 188.78509521484375, + "learning_rate": 1.4199772803635144e-05, + "loss": 1.8206, + "step": 44400 + }, + { + "epoch": 0.07119988608018227, + "grad_norm": 122.20122528076172, + "learning_rate": 1.4231772291643335e-05, + "loss": 2.2269, + "step": 44500 + }, + { + "epoch": 0.07135988582418268, + "grad_norm": 894.0123901367188, + "learning_rate": 1.4263771779651527e-05, + "loss": 1.8159, + "step": 44600 + }, + { + "epoch": 0.07151988556818309, + "grad_norm": 217.02325439453125, + "learning_rate": 1.429577126765972e-05, + "loss": 1.9257, + "step": 44700 + }, + { + "epoch": 0.0716798853121835, + "grad_norm": 0.4048191010951996, + "learning_rate": 1.4327770755667911e-05, + "loss": 2.087, + "step": 44800 + }, + { + "epoch": 0.07183988505618391, + "grad_norm": 85.02055358886719, + "learning_rate": 1.4359770243676104e-05, + "loss": 1.3623, + "step": 44900 + }, + { + "epoch": 0.07199988480018432, + "grad_norm": 90.52297973632812, + "learning_rate": 1.4391769731684295e-05, + "loss": 1.5747, + "step": 45000 + }, + { + "epoch": 0.07215988454418473, + "grad_norm": 28.14681053161621, + "learning_rate": 1.4423769219692485e-05, + "loss": 1.8051, + "step": 45100 + }, + { + "epoch": 0.07231988428818514, + "grad_norm": 0.31391066312789917, + "learning_rate": 1.4455768707700676e-05, + "loss": 2.3691, + "step": 45200 + }, + { + "epoch": 0.07247988403218555, + "grad_norm": 0.08174788951873779, + "learning_rate": 1.448776819570887e-05, + "loss": 2.1125, + "step": 45300 + }, + { + "epoch": 0.07263988377618597, + "grad_norm": 107.4333267211914, + "learning_rate": 1.4519767683717061e-05, + "loss": 1.566, + "step": 45400 + }, + { + "epoch": 0.07279988352018636, + "grad_norm": 119.36717224121094, + "learning_rate": 1.4551767171725252e-05, + "loss": 1.5042, + "step": 45500 + }, + { + "epoch": 0.07295988326418677, + "grad_norm": 157.90757751464844, + "learning_rate": 1.4583446664853364e-05, + "loss": 1.9469, + "step": 45600 + }, + { + "epoch": 0.07311988300818718, + "grad_norm": 172.0279998779297, + "learning_rate": 1.4615446152861554e-05, + "loss": 1.9346, + "step": 45700 + }, + { + "epoch": 0.0732798827521876, + "grad_norm": 961.58935546875, + "learning_rate": 1.4647445640869747e-05, + "loss": 1.4362, + "step": 45800 + }, + { + "epoch": 0.073439882496188, + "grad_norm": 0.0017953349743038416, + "learning_rate": 1.4679445128877938e-05, + "loss": 1.9164, + "step": 45900 + }, + { + "epoch": 0.07359988224018842, + "grad_norm": 67.61031341552734, + "learning_rate": 1.471144461688613e-05, + "loss": 1.511, + "step": 46000 + }, + { + "epoch": 0.07375988198418883, + "grad_norm": 223.40682983398438, + "learning_rate": 1.4743444104894323e-05, + "loss": 1.4523, + "step": 46100 + }, + { + "epoch": 0.07391988172818924, + "grad_norm": 86.30171966552734, + "learning_rate": 1.4775443592902514e-05, + "loss": 1.1247, + "step": 46200 + }, + { + "epoch": 0.07407988147218965, + "grad_norm": 147.9749755859375, + "learning_rate": 1.4807443080910706e-05, + "loss": 1.9694, + "step": 46300 + }, + { + "epoch": 0.07423988121619006, + "grad_norm": 0.01015425007790327, + "learning_rate": 1.4839442568918899e-05, + "loss": 2.1909, + "step": 46400 + }, + { + "epoch": 0.07439988096019047, + "grad_norm": 259.0996398925781, 
+ "learning_rate": 1.487144205692709e-05, + "loss": 2.0247, + "step": 46500 + }, + { + "epoch": 0.07455988070419087, + "grad_norm": 95.31226348876953, + "learning_rate": 1.4903441544935281e-05, + "loss": 1.2061, + "step": 46600 + }, + { + "epoch": 0.07471988044819128, + "grad_norm": 173.83978271484375, + "learning_rate": 1.4935441032943474e-05, + "loss": 1.6151, + "step": 46700 + }, + { + "epoch": 0.07487988019219169, + "grad_norm": 2.386795997619629, + "learning_rate": 1.4967440520951666e-05, + "loss": 1.6184, + "step": 46800 + }, + { + "epoch": 0.0750398799361921, + "grad_norm": 429.2137756347656, + "learning_rate": 1.4999440008959859e-05, + "loss": 2.0375, + "step": 46900 + }, + { + "epoch": 0.07519987968019251, + "grad_norm": 387.4931945800781, + "learning_rate": 1.503143949696805e-05, + "loss": 1.8357, + "step": 47000 + }, + { + "epoch": 0.07535987942419292, + "grad_norm": 412.69384765625, + "learning_rate": 1.5063438984976241e-05, + "loss": 1.7605, + "step": 47100 + }, + { + "epoch": 0.07551987916819333, + "grad_norm": 64.52519989013672, + "learning_rate": 1.5095438472984435e-05, + "loss": 2.1139, + "step": 47200 + }, + { + "epoch": 0.07567987891219374, + "grad_norm": 2.706088066101074, + "learning_rate": 1.5127437960992626e-05, + "loss": 1.2971, + "step": 47300 + }, + { + "epoch": 0.07583987865619415, + "grad_norm": 3.714489459991455, + "learning_rate": 1.5159437449000817e-05, + "loss": 1.7242, + "step": 47400 + }, + { + "epoch": 0.07599987840019456, + "grad_norm": 5.220536708831787, + "learning_rate": 1.519143693700901e-05, + "loss": 1.2726, + "step": 47500 + }, + { + "epoch": 0.07615987814419498, + "grad_norm": 133.04861450195312, + "learning_rate": 1.5223436425017202e-05, + "loss": 1.9947, + "step": 47600 + }, + { + "epoch": 0.07631987788819537, + "grad_norm": 0.19635449349880219, + "learning_rate": 1.5255435913025393e-05, + "loss": 2.2796, + "step": 47700 + }, + { + "epoch": 0.07647987763219578, + "grad_norm": 168.06861877441406, + "learning_rate": 1.5287115406153503e-05, + "loss": 1.6232, + "step": 47800 + }, + { + "epoch": 0.0766398773761962, + "grad_norm": 0.004633053671568632, + "learning_rate": 1.5319114894161697e-05, + "loss": 1.3513, + "step": 47900 + }, + { + "epoch": 0.0767998771201966, + "grad_norm": 0.0009558099554851651, + "learning_rate": 1.5351114382169886e-05, + "loss": 1.291, + "step": 48000 + }, + { + "epoch": 0.07695987686419702, + "grad_norm": 1.2679246664047241, + "learning_rate": 1.538311387017808e-05, + "loss": 1.5954, + "step": 48100 + }, + { + "epoch": 0.07711987660819743, + "grad_norm": 511.6206970214844, + "learning_rate": 1.5415113358186272e-05, + "loss": 1.6232, + "step": 48200 + }, + { + "epoch": 0.07727987635219784, + "grad_norm": 190.66940307617188, + "learning_rate": 1.5447112846194462e-05, + "loss": 1.8858, + "step": 48300 + }, + { + "epoch": 0.07743987609619825, + "grad_norm": 368.29022216796875, + "learning_rate": 1.5479112334202655e-05, + "loss": 1.6235, + "step": 48400 + }, + { + "epoch": 0.07759987584019866, + "grad_norm": 0.00299979280680418, + "learning_rate": 1.5511111822210848e-05, + "loss": 1.9061, + "step": 48500 + }, + { + "epoch": 0.07775987558419907, + "grad_norm": 96.37437438964844, + "learning_rate": 1.5543111310219038e-05, + "loss": 1.5919, + "step": 48600 + }, + { + "epoch": 0.07791987532819948, + "grad_norm": 141.4491729736328, + "learning_rate": 1.5575110798227227e-05, + "loss": 1.8474, + "step": 48700 + }, + { + "epoch": 0.07807987507219988, + "grad_norm": 9.810319900512695, + "learning_rate": 1.560711028623542e-05, + "loss": 
1.7112, + "step": 48800 + }, + { + "epoch": 0.07823987481620029, + "grad_norm": 0.20426060259342194, + "learning_rate": 1.5639109774243613e-05, + "loss": 1.8007, + "step": 48900 + }, + { + "epoch": 0.0783998745602007, + "grad_norm": 2.3212544918060303, + "learning_rate": 1.5671109262251803e-05, + "loss": 1.7499, + "step": 49000 + }, + { + "epoch": 0.07855987430420111, + "grad_norm": 2386.313232421875, + "learning_rate": 1.5703108750259996e-05, + "loss": 1.4046, + "step": 49100 + }, + { + "epoch": 0.07871987404820152, + "grad_norm": 123.7901611328125, + "learning_rate": 1.573510823826819e-05, + "loss": 2.0843, + "step": 49200 + }, + { + "epoch": 0.07887987379220193, + "grad_norm": 190.3510284423828, + "learning_rate": 1.576710772627638e-05, + "loss": 1.52, + "step": 49300 + }, + { + "epoch": 0.07903987353620234, + "grad_norm": 0.0007205315632745624, + "learning_rate": 1.5799107214284572e-05, + "loss": 1.8708, + "step": 49400 + }, + { + "epoch": 0.07919987328020275, + "grad_norm": 0.013503137975931168, + "learning_rate": 1.5831106702292765e-05, + "loss": 1.673, + "step": 49500 + }, + { + "epoch": 0.07935987302420316, + "grad_norm": 65.62171936035156, + "learning_rate": 1.5863106190300958e-05, + "loss": 1.8457, + "step": 49600 + }, + { + "epoch": 0.07951987276820358, + "grad_norm": 92.97589874267578, + "learning_rate": 1.5895105678309148e-05, + "loss": 1.5627, + "step": 49700 + }, + { + "epoch": 0.07967987251220399, + "grad_norm": 21.424842834472656, + "learning_rate": 1.5926785171437256e-05, + "loss": 1.6497, + "step": 49800 + }, + { + "epoch": 0.07983987225620438, + "grad_norm": 0.0011259341845288873, + "learning_rate": 1.595878465944545e-05, + "loss": 1.5787, + "step": 49900 + }, + { + "epoch": 0.0799998720002048, + "grad_norm": 0.01448867842555046, + "learning_rate": 1.5990784147453643e-05, + "loss": 1.8507, + "step": 50000 + }, + { + "epoch": 0.0801598717442052, + "grad_norm": 66.43374633789062, + "learning_rate": 1.6022783635461832e-05, + "loss": 1.4336, + "step": 50100 + }, + { + "epoch": 0.08031987148820562, + "grad_norm": 27.450525283813477, + "learning_rate": 1.6054783123470025e-05, + "loss": 2.152, + "step": 50200 + }, + { + "epoch": 0.08047987123220603, + "grad_norm": 33.92656707763672, + "learning_rate": 1.6086782611478218e-05, + "loss": 1.6311, + "step": 50300 + }, + { + "epoch": 0.08063987097620644, + "grad_norm": 20.14742660522461, + "learning_rate": 1.611878209948641e-05, + "loss": 1.7442, + "step": 50400 + }, + { + "epoch": 0.08079987072020685, + "grad_norm": 190.49342346191406, + "learning_rate": 1.61507815874946e-05, + "loss": 1.8063, + "step": 50500 + }, + { + "epoch": 0.08095987046420726, + "grad_norm": 673.4315185546875, + "learning_rate": 1.6182781075502794e-05, + "loss": 1.4, + "step": 50600 + }, + { + "epoch": 0.08111987020820767, + "grad_norm": 87.93296813964844, + "learning_rate": 1.6214780563510987e-05, + "loss": 1.6401, + "step": 50700 + }, + { + "epoch": 0.08127986995220808, + "grad_norm": 10.013740539550781, + "learning_rate": 1.6246780051519177e-05, + "loss": 1.9426, + "step": 50800 + }, + { + "epoch": 0.08143986969620849, + "grad_norm": 84.70770263671875, + "learning_rate": 1.627877953952737e-05, + "loss": 2.0937, + "step": 50900 + }, + { + "epoch": 0.0815998694402089, + "grad_norm": 66.33674621582031, + "learning_rate": 1.6310779027535563e-05, + "loss": 1.8187, + "step": 51000 + }, + { + "epoch": 0.0817598691842093, + "grad_norm": 0.013598043471574783, + "learning_rate": 1.6342778515543753e-05, + "loss": 2.1751, + "step": 51100 + }, + { + "epoch": 
0.08191986892820971, + "grad_norm": 0.8764291405677795, + "learning_rate": 1.6374778003551946e-05, + "loss": 2.1703, + "step": 51200 + }, + { + "epoch": 0.08207986867221012, + "grad_norm": 14.436594009399414, + "learning_rate": 1.640677749156014e-05, + "loss": 1.4443, + "step": 51300 + }, + { + "epoch": 0.08223986841621053, + "grad_norm": 0.27148157358169556, + "learning_rate": 1.6438776979568328e-05, + "loss": 1.9266, + "step": 51400 + }, + { + "epoch": 0.08239986816021094, + "grad_norm": 11.139505386352539, + "learning_rate": 1.6470776467576518e-05, + "loss": 1.8226, + "step": 51500 + }, + { + "epoch": 0.08255986790421135, + "grad_norm": 105.84121704101562, + "learning_rate": 1.650277595558471e-05, + "loss": 1.4394, + "step": 51600 + }, + { + "epoch": 0.08271986764821176, + "grad_norm": 161.04141235351562, + "learning_rate": 1.6534775443592904e-05, + "loss": 1.052, + "step": 51700 + }, + { + "epoch": 0.08287986739221218, + "grad_norm": 11.454148292541504, + "learning_rate": 1.6566774931601094e-05, + "loss": 1.0614, + "step": 51800 + }, + { + "epoch": 0.08303986713621259, + "grad_norm": 0.03253089264035225, + "learning_rate": 1.6598774419609287e-05, + "loss": 1.4591, + "step": 51900 + }, + { + "epoch": 0.083199866880213, + "grad_norm": 0.2750859558582306, + "learning_rate": 1.663077390761748e-05, + "loss": 1.6479, + "step": 52000 + }, + { + "epoch": 0.08335986662421341, + "grad_norm": 3.381882667541504, + "learning_rate": 1.666245340074559e-05, + "loss": 1.7548, + "step": 52100 + }, + { + "epoch": 0.0835198663682138, + "grad_norm": 102.45317840576172, + "learning_rate": 1.669445288875378e-05, + "loss": 1.6293, + "step": 52200 + }, + { + "epoch": 0.08367986611221422, + "grad_norm": 122.25707244873047, + "learning_rate": 1.672645237676197e-05, + "loss": 1.7183, + "step": 52300 + }, + { + "epoch": 0.08383986585621463, + "grad_norm": 1.0929410457611084, + "learning_rate": 1.6758451864770164e-05, + "loss": 1.2329, + "step": 52400 + }, + { + "epoch": 0.08399986560021504, + "grad_norm": 0.0009238706552423537, + "learning_rate": 1.6790451352778357e-05, + "loss": 1.5292, + "step": 52500 + }, + { + "epoch": 0.08415986534421545, + "grad_norm": 15.874957084655762, + "learning_rate": 1.6822450840786547e-05, + "loss": 1.6752, + "step": 52600 + }, + { + "epoch": 0.08431986508821586, + "grad_norm": 8.129535675048828, + "learning_rate": 1.685445032879474e-05, + "loss": 1.3228, + "step": 52700 + }, + { + "epoch": 0.08447986483221627, + "grad_norm": 196.6626434326172, + "learning_rate": 1.6886449816802933e-05, + "loss": 1.485, + "step": 52800 + }, + { + "epoch": 0.08463986457621668, + "grad_norm": 257.9208679199219, + "learning_rate": 1.6918449304811123e-05, + "loss": 1.4228, + "step": 52900 + }, + { + "epoch": 0.08479986432021709, + "grad_norm": 126.92493438720703, + "learning_rate": 1.6950448792819316e-05, + "loss": 1.1385, + "step": 53000 + }, + { + "epoch": 0.0849598640642175, + "grad_norm": 218.52455139160156, + "learning_rate": 1.698244828082751e-05, + "loss": 1.1812, + "step": 53100 + }, + { + "epoch": 0.08511986380821791, + "grad_norm": 0.32875338196754456, + "learning_rate": 1.70144477688357e-05, + "loss": 1.4763, + "step": 53200 + }, + { + "epoch": 0.08527986355221831, + "grad_norm": 6.30516242980957, + "learning_rate": 1.704644725684389e-05, + "loss": 1.9444, + "step": 53300 + }, + { + "epoch": 0.08543986329621872, + "grad_norm": 0.10023212432861328, + "learning_rate": 1.7078446744852085e-05, + "loss": 1.5316, + "step": 53400 + }, + { + "epoch": 0.08559986304021913, + "grad_norm": 
0.16311447322368622, + "learning_rate": 1.7110446232860278e-05, + "loss": 1.6928, + "step": 53500 + }, + { + "epoch": 0.08575986278421954, + "grad_norm": 113.52496337890625, + "learning_rate": 1.7142445720868467e-05, + "loss": 1.4466, + "step": 53600 + }, + { + "epoch": 0.08591986252821995, + "grad_norm": 202.2323760986328, + "learning_rate": 1.717444520887666e-05, + "loss": 1.438, + "step": 53700 + }, + { + "epoch": 0.08607986227222036, + "grad_norm": 0.009465747512876987, + "learning_rate": 1.7206444696884853e-05, + "loss": 1.1629, + "step": 53800 + }, + { + "epoch": 0.08623986201622078, + "grad_norm": 36.7415771484375, + "learning_rate": 1.7238444184893043e-05, + "loss": 1.3017, + "step": 53900 + }, + { + "epoch": 0.08639986176022119, + "grad_norm": 238.30662536621094, + "learning_rate": 1.7270443672901236e-05, + "loss": 1.6614, + "step": 54000 + }, + { + "epoch": 0.0865598615042216, + "grad_norm": 92.3208236694336, + "learning_rate": 1.730244316090943e-05, + "loss": 1.4535, + "step": 54100 + }, + { + "epoch": 0.08671986124822201, + "grad_norm": 78.44776153564453, + "learning_rate": 1.733444264891762e-05, + "loss": 1.7061, + "step": 54200 + }, + { + "epoch": 0.08687986099222242, + "grad_norm": 0.22147144377231598, + "learning_rate": 1.736644213692581e-05, + "loss": 1.4681, + "step": 54300 + }, + { + "epoch": 0.08703986073622282, + "grad_norm": 11.244450569152832, + "learning_rate": 1.7398441624934e-05, + "loss": 1.3449, + "step": 54400 + }, + { + "epoch": 0.08719986048022323, + "grad_norm": 0.002357147866860032, + "learning_rate": 1.7430441112942195e-05, + "loss": 1.8814, + "step": 54500 + }, + { + "epoch": 0.08735986022422364, + "grad_norm": 0.008252524770796299, + "learning_rate": 1.7462440600950384e-05, + "loss": 1.5989, + "step": 54600 + }, + { + "epoch": 0.08751985996822405, + "grad_norm": 61.25815200805664, + "learning_rate": 1.7494440088958577e-05, + "loss": 1.3711, + "step": 54700 + }, + { + "epoch": 0.08767985971222446, + "grad_norm": 0.536085307598114, + "learning_rate": 1.752643957696677e-05, + "loss": 1.3199, + "step": 54800 + }, + { + "epoch": 0.08783985945622487, + "grad_norm": 261.3522033691406, + "learning_rate": 1.755811907009488e-05, + "loss": 1.3713, + "step": 54900 + }, + { + "epoch": 0.08799985920022528, + "grad_norm": 0.18219026923179626, + "learning_rate": 1.7590118558103072e-05, + "loss": 1.441, + "step": 55000 + }, + { + "epoch": 0.08815985894422569, + "grad_norm": 2.0045886039733887, + "learning_rate": 1.7622118046111262e-05, + "loss": 1.268, + "step": 55100 + }, + { + "epoch": 0.0883198586882261, + "grad_norm": 119.64205932617188, + "learning_rate": 1.7654117534119455e-05, + "loss": 1.1648, + "step": 55200 + }, + { + "epoch": 0.08847985843222651, + "grad_norm": 1316.0831298828125, + "learning_rate": 1.7686117022127648e-05, + "loss": 1.8108, + "step": 55300 + }, + { + "epoch": 0.08863985817622692, + "grad_norm": 0.016518862918019295, + "learning_rate": 1.7718116510135838e-05, + "loss": 1.4904, + "step": 55400 + }, + { + "epoch": 0.08879985792022732, + "grad_norm": 0.0020672741811722517, + "learning_rate": 1.774979600326395e-05, + "loss": 1.2555, + "step": 55500 + }, + { + "epoch": 0.08895985766422773, + "grad_norm": 0.0013748366618528962, + "learning_rate": 1.778179549127214e-05, + "loss": 1.2733, + "step": 55600 + }, + { + "epoch": 0.08911985740822814, + "grad_norm": 156.89892578125, + "learning_rate": 1.7813794979280332e-05, + "loss": 1.5194, + "step": 55700 + }, + { + "epoch": 0.08927985715222855, + "grad_norm": 0.034991975873708725, + "learning_rate": 
1.7845794467288526e-05, + "loss": 1.7587, + "step": 55800 + }, + { + "epoch": 0.08943985689622896, + "grad_norm": 25.602022171020508, + "learning_rate": 1.7877793955296715e-05, + "loss": 1.6183, + "step": 55900 + }, + { + "epoch": 0.08959985664022938, + "grad_norm": 0.6393762230873108, + "learning_rate": 1.7909793443304908e-05, + "loss": 1.3596, + "step": 56000 + }, + { + "epoch": 0.08975985638422979, + "grad_norm": 48.19321823120117, + "learning_rate": 1.79417929313131e-05, + "loss": 1.5248, + "step": 56100 + }, + { + "epoch": 0.0899198561282302, + "grad_norm": 88.53876495361328, + "learning_rate": 1.797379241932129e-05, + "loss": 1.5177, + "step": 56200 + }, + { + "epoch": 0.09007985587223061, + "grad_norm": 4.195464611053467, + "learning_rate": 1.8005791907329484e-05, + "loss": 1.7579, + "step": 56300 + }, + { + "epoch": 0.09023985561623102, + "grad_norm": 4.309329986572266, + "learning_rate": 1.8037791395337677e-05, + "loss": 1.5508, + "step": 56400 + }, + { + "epoch": 0.09039985536023143, + "grad_norm": 781.6338500976562, + "learning_rate": 1.8069790883345867e-05, + "loss": 1.5965, + "step": 56500 + }, + { + "epoch": 0.09055985510423184, + "grad_norm": 0.2752499580383301, + "learning_rate": 1.810179037135406e-05, + "loss": 1.5762, + "step": 56600 + }, + { + "epoch": 0.09071985484823224, + "grad_norm": 107.10926818847656, + "learning_rate": 1.8133789859362253e-05, + "loss": 1.7441, + "step": 56700 + }, + { + "epoch": 0.09087985459223265, + "grad_norm": 97.79568481445312, + "learning_rate": 1.8165789347370442e-05, + "loss": 2.0257, + "step": 56800 + }, + { + "epoch": 0.09103985433623306, + "grad_norm": 0.0016732424264773726, + "learning_rate": 1.8197788835378635e-05, + "loss": 1.1371, + "step": 56900 + }, + { + "epoch": 0.09119985408023347, + "grad_norm": 7.662989139556885, + "learning_rate": 1.822978832338683e-05, + "loss": 1.8825, + "step": 57000 + }, + { + "epoch": 0.09135985382423388, + "grad_norm": 16.23094940185547, + "learning_rate": 1.8261787811395018e-05, + "loss": 1.0455, + "step": 57100 + }, + { + "epoch": 0.09151985356823429, + "grad_norm": 0.025669243186712265, + "learning_rate": 1.829378729940321e-05, + "loss": 1.5889, + "step": 57200 + }, + { + "epoch": 0.0916798533122347, + "grad_norm": 0.4638320505619049, + "learning_rate": 1.8325786787411404e-05, + "loss": 1.192, + "step": 57300 + }, + { + "epoch": 0.09183985305623511, + "grad_norm": 61.32036209106445, + "learning_rate": 1.8357786275419597e-05, + "loss": 1.5374, + "step": 57400 + }, + { + "epoch": 0.09199985280023552, + "grad_norm": 0.0012545910431072116, + "learning_rate": 1.8389785763427787e-05, + "loss": 1.6236, + "step": 57500 + }, + { + "epoch": 0.09215985254423593, + "grad_norm": 92.61576080322266, + "learning_rate": 1.842178525143598e-05, + "loss": 1.8945, + "step": 57600 + }, + { + "epoch": 0.09231985228823635, + "grad_norm": 22.349824905395508, + "learning_rate": 1.845378473944417e-05, + "loss": 1.607, + "step": 57700 + }, + { + "epoch": 0.09247985203223674, + "grad_norm": 126.0189208984375, + "learning_rate": 1.8485784227452363e-05, + "loss": 1.8133, + "step": 57800 + }, + { + "epoch": 0.09263985177623715, + "grad_norm": 78.0487060546875, + "learning_rate": 1.8517783715460552e-05, + "loss": 1.5777, + "step": 57900 + }, + { + "epoch": 0.09279985152023756, + "grad_norm": 0.0007238721009343863, + "learning_rate": 1.8549783203468745e-05, + "loss": 1.5043, + "step": 58000 + }, + { + "epoch": 0.09295985126423797, + "grad_norm": 123.57425689697266, + "learning_rate": 1.858178269147694e-05, + "loss": 1.7681, + 
"step": 58100 + }, + { + "epoch": 0.09311985100823839, + "grad_norm": 0.2985190153121948, + "learning_rate": 1.8613782179485128e-05, + "loss": 1.623, + "step": 58200 + }, + { + "epoch": 0.0932798507522388, + "grad_norm": 64.27520751953125, + "learning_rate": 1.864578166749332e-05, + "loss": 2.2137, + "step": 58300 + }, + { + "epoch": 0.09343985049623921, + "grad_norm": 207.88841247558594, + "learning_rate": 1.8677781155501514e-05, + "loss": 2.2447, + "step": 58400 + }, + { + "epoch": 0.09359985024023962, + "grad_norm": 94.58351135253906, + "learning_rate": 1.8709780643509704e-05, + "loss": 2.3013, + "step": 58500 + }, + { + "epoch": 0.09375984998424003, + "grad_norm": 0.29524990916252136, + "learning_rate": 1.8741780131517897e-05, + "loss": 1.3105, + "step": 58600 + }, + { + "epoch": 0.09391984972824044, + "grad_norm": 110.85052490234375, + "learning_rate": 1.877377961952609e-05, + "loss": 1.4461, + "step": 58700 + }, + { + "epoch": 0.09407984947224085, + "grad_norm": 44.61641311645508, + "learning_rate": 1.880577910753428e-05, + "loss": 2.1321, + "step": 58800 + }, + { + "epoch": 0.09423984921624125, + "grad_norm": 699.353759765625, + "learning_rate": 1.8837778595542473e-05, + "loss": 1.7541, + "step": 58900 + }, + { + "epoch": 0.09439984896024166, + "grad_norm": 88.22161865234375, + "learning_rate": 1.8869778083550666e-05, + "loss": 1.7894, + "step": 59000 + }, + { + "epoch": 0.09455984870424207, + "grad_norm": 25.957782745361328, + "learning_rate": 1.8901777571558855e-05, + "loss": 1.693, + "step": 59100 + }, + { + "epoch": 0.09471984844824248, + "grad_norm": 1.7580465078353882, + "learning_rate": 1.8933457064686968e-05, + "loss": 1.7073, + "step": 59200 + }, + { + "epoch": 0.09487984819224289, + "grad_norm": 5.568783283233643, + "learning_rate": 1.8965456552695157e-05, + "loss": 2.0305, + "step": 59300 + }, + { + "epoch": 0.0950398479362433, + "grad_norm": 0.21757324039936066, + "learning_rate": 1.899745604070335e-05, + "loss": 1.3684, + "step": 59400 + }, + { + "epoch": 0.09519984768024371, + "grad_norm": 123.5767593383789, + "learning_rate": 1.9029455528711543e-05, + "loss": 1.8754, + "step": 59500 + }, + { + "epoch": 0.09535984742424412, + "grad_norm": 66.91508483886719, + "learning_rate": 1.9061455016719733e-05, + "loss": 2.0225, + "step": 59600 + }, + { + "epoch": 0.09551984716824453, + "grad_norm": 0.00018894312961492687, + "learning_rate": 1.9093454504727926e-05, + "loss": 2.1975, + "step": 59700 + }, + { + "epoch": 0.09567984691224495, + "grad_norm": 84.60813903808594, + "learning_rate": 1.912545399273612e-05, + "loss": 1.7173, + "step": 59800 + }, + { + "epoch": 0.09583984665624536, + "grad_norm": 67.51477813720703, + "learning_rate": 1.915745348074431e-05, + "loss": 1.4302, + "step": 59900 + }, + { + "epoch": 0.09599984640024575, + "grad_norm": 59.47672653198242, + "learning_rate": 1.9189452968752502e-05, + "loss": 1.2497, + "step": 60000 + }, + { + "epoch": 0.09615984614424616, + "grad_norm": 19.75477409362793, + "learning_rate": 1.9221452456760695e-05, + "loss": 1.4058, + "step": 60100 + }, + { + "epoch": 0.09631984588824657, + "grad_norm": 91.08583068847656, + "learning_rate": 1.9253451944768885e-05, + "loss": 1.0956, + "step": 60200 + }, + { + "epoch": 0.09647984563224699, + "grad_norm": 133.5473175048828, + "learning_rate": 1.9285451432777078e-05, + "loss": 1.3731, + "step": 60300 + }, + { + "epoch": 0.0966398453762474, + "grad_norm": 0.010973370634019375, + "learning_rate": 1.931745092078527e-05, + "loss": 1.2953, + "step": 60400 + }, + { + "epoch": 0.09679984512024781, 
+ "grad_norm": 0.08579988777637482, + "learning_rate": 1.934945040879346e-05, + "loss": 1.0987, + "step": 60500 + }, + { + "epoch": 0.09695984486424822, + "grad_norm": 1.1617801189422607, + "learning_rate": 1.9381449896801653e-05, + "loss": 1.5104, + "step": 60600 + }, + { + "epoch": 0.09711984460824863, + "grad_norm": 0.3544386029243469, + "learning_rate": 1.9413449384809843e-05, + "loss": 1.5224, + "step": 60700 + }, + { + "epoch": 0.09727984435224904, + "grad_norm": 44.28148651123047, + "learning_rate": 1.9445448872818036e-05, + "loss": 1.3982, + "step": 60800 + }, + { + "epoch": 0.09743984409624945, + "grad_norm": 167.9832305908203, + "learning_rate": 1.947744836082623e-05, + "loss": 1.2785, + "step": 60900 + }, + { + "epoch": 0.09759984384024986, + "grad_norm": 129.58119201660156, + "learning_rate": 1.950944784883442e-05, + "loss": 1.6018, + "step": 61000 + }, + { + "epoch": 0.09775984358425026, + "grad_norm": 0.048871856182813644, + "learning_rate": 1.9541447336842612e-05, + "loss": 1.4968, + "step": 61100 + }, + { + "epoch": 0.09791984332825067, + "grad_norm": 0.14592894911766052, + "learning_rate": 1.9573446824850805e-05, + "loss": 1.2423, + "step": 61200 + }, + { + "epoch": 0.09807984307225108, + "grad_norm": 0.548117458820343, + "learning_rate": 1.9605446312858995e-05, + "loss": 1.9973, + "step": 61300 + }, + { + "epoch": 0.09823984281625149, + "grad_norm": 52.4393424987793, + "learning_rate": 1.9637445800867188e-05, + "loss": 1.2149, + "step": 61400 + }, + { + "epoch": 0.0983998425602519, + "grad_norm": 101.79759216308594, + "learning_rate": 1.966944528887538e-05, + "loss": 1.731, + "step": 61500 + }, + { + "epoch": 0.09855984230425231, + "grad_norm": 0.04176723212003708, + "learning_rate": 1.970144477688357e-05, + "loss": 1.2889, + "step": 61600 + }, + { + "epoch": 0.09871984204825272, + "grad_norm": 8.380585670471191, + "learning_rate": 1.9733444264891763e-05, + "loss": 1.856, + "step": 61700 + }, + { + "epoch": 0.09887984179225313, + "grad_norm": 0.014852323569357395, + "learning_rate": 1.9765443752899956e-05, + "loss": 0.8942, + "step": 61800 + }, + { + "epoch": 0.09903984153625355, + "grad_norm": 0.3229600787162781, + "learning_rate": 1.9797443240908146e-05, + "loss": 1.3371, + "step": 61900 + }, + { + "epoch": 0.09919984128025396, + "grad_norm": 141.8211212158203, + "learning_rate": 1.982944272891634e-05, + "loss": 1.5222, + "step": 62000 + }, + { + "epoch": 0.09935984102425437, + "grad_norm": 0.025253353640437126, + "learning_rate": 1.9861442216924532e-05, + "loss": 1.5435, + "step": 62100 + }, + { + "epoch": 0.09951984076825478, + "grad_norm": 0.0009790909243747592, + "learning_rate": 1.9893441704932722e-05, + "loss": 1.1172, + "step": 62200 + }, + { + "epoch": 0.09967984051225517, + "grad_norm": 43.73761749267578, + "learning_rate": 1.9925441192940915e-05, + "loss": 1.6024, + "step": 62300 + }, + { + "epoch": 0.09983984025625559, + "grad_norm": 18.400936126708984, + "learning_rate": 1.9957440680949108e-05, + "loss": 1.3914, + "step": 62400 + }, + { + "epoch": 0.099999840000256, + "grad_norm": 937.4790649414062, + "learning_rate": 1.9989440168957298e-05, + "loss": 1.4714, + "step": 62500 + }, + { + "epoch": 0.10015983974425641, + "grad_norm": 0.14736099541187286, + "learning_rate": 1.999761777777778e-05, + "loss": 1.2922, + "step": 62600 + }, + { + "epoch": 0.10031983948825682, + "grad_norm": 0.606549084186554, + "learning_rate": 1.999409777777778e-05, + "loss": 1.4263, + "step": 62700 + }, + { + "epoch": 0.10047983923225723, + "grad_norm": 426.79400634765625, + 
"learning_rate": 1.9990542222222224e-05, + "loss": 1.4586, + "step": 62800 + }, + { + "epoch": 0.10063983897625764, + "grad_norm": 0.007887039333581924, + "learning_rate": 1.9986986666666668e-05, + "loss": 1.6312, + "step": 62900 + }, + { + "epoch": 0.10079983872025805, + "grad_norm": 0.9442864060401917, + "learning_rate": 1.9983431111111113e-05, + "loss": 1.9607, + "step": 63000 + }, + { + "epoch": 0.10095983846425846, + "grad_norm": 0.002317711478099227, + "learning_rate": 1.9979875555555557e-05, + "loss": 1.5771, + "step": 63100 + }, + { + "epoch": 0.10111983820825887, + "grad_norm": 75.09770965576172, + "learning_rate": 1.9976320000000002e-05, + "loss": 1.6721, + "step": 63200 + }, + { + "epoch": 0.10127983795225928, + "grad_norm": 135.64022827148438, + "learning_rate": 1.9972764444444446e-05, + "loss": 1.8461, + "step": 63300 + }, + { + "epoch": 0.10143983769625968, + "grad_norm": 0.1608121395111084, + "learning_rate": 1.996920888888889e-05, + "loss": 1.5256, + "step": 63400 + }, + { + "epoch": 0.10159983744026009, + "grad_norm": 266.8143615722656, + "learning_rate": 1.9965653333333336e-05, + "loss": 1.9736, + "step": 63500 + }, + { + "epoch": 0.1017598371842605, + "grad_norm": 125.29386138916016, + "learning_rate": 1.996209777777778e-05, + "loss": 1.4735, + "step": 63600 + }, + { + "epoch": 0.10191983692826091, + "grad_norm": 1.8401292562484741, + "learning_rate": 1.9958542222222225e-05, + "loss": 1.4619, + "step": 63700 + }, + { + "epoch": 0.10207983667226132, + "grad_norm": 352.0743103027344, + "learning_rate": 1.995498666666667e-05, + "loss": 1.6571, + "step": 63800 + }, + { + "epoch": 0.10223983641626173, + "grad_norm": 546.5570068359375, + "learning_rate": 1.9951431111111114e-05, + "loss": 1.5888, + "step": 63900 + }, + { + "epoch": 0.10239983616026214, + "grad_norm": 0.0009566646185703576, + "learning_rate": 1.994787555555556e-05, + "loss": 2.0457, + "step": 64000 + }, + { + "epoch": 0.10255983590426256, + "grad_norm": 717.4028930664062, + "learning_rate": 1.9944320000000003e-05, + "loss": 1.7843, + "step": 64100 + }, + { + "epoch": 0.10271983564826297, + "grad_norm": 0.16068622469902039, + "learning_rate": 1.9940764444444447e-05, + "loss": 1.5116, + "step": 64200 + }, + { + "epoch": 0.10287983539226338, + "grad_norm": 69.0772705078125, + "learning_rate": 1.9937208888888892e-05, + "loss": 1.6682, + "step": 64300 + }, + { + "epoch": 0.10303983513626379, + "grad_norm": 0.0007585228304378688, + "learning_rate": 1.9933653333333337e-05, + "loss": 1.2137, + "step": 64400 + }, + { + "epoch": 0.10319983488026419, + "grad_norm": 140.95750427246094, + "learning_rate": 1.9930097777777778e-05, + "loss": 1.1308, + "step": 64500 + }, + { + "epoch": 0.1033598346242646, + "grad_norm": 1.2280133962631226, + "learning_rate": 1.9926542222222226e-05, + "loss": 2.031, + "step": 64600 + }, + { + "epoch": 0.103519834368265, + "grad_norm": 51.01097106933594, + "learning_rate": 1.9922986666666667e-05, + "loss": 1.6903, + "step": 64700 + }, + { + "epoch": 0.10367983411226542, + "grad_norm": 354.4974365234375, + "learning_rate": 1.9919431111111115e-05, + "loss": 1.3365, + "step": 64800 + }, + { + "epoch": 0.10383983385626583, + "grad_norm": 98.43709564208984, + "learning_rate": 1.9915875555555556e-05, + "loss": 1.5736, + "step": 64900 + }, + { + "epoch": 0.10399983360026624, + "grad_norm": 176.26661682128906, + "learning_rate": 1.991232e-05, + "loss": 1.7264, + "step": 65000 + }, + { + "epoch": 0.10415983334426665, + "grad_norm": 55.52714920043945, + "learning_rate": 1.9908764444444445e-05, + "loss": 
1.1781, + "step": 65100 + }, + { + "epoch": 0.10431983308826706, + "grad_norm": 0.0009407736943103373, + "learning_rate": 1.990520888888889e-05, + "loss": 1.2503, + "step": 65200 + }, + { + "epoch": 0.10447983283226747, + "grad_norm": 0.001376794883981347, + "learning_rate": 1.9901653333333334e-05, + "loss": 0.9432, + "step": 65300 + }, + { + "epoch": 0.10463983257626788, + "grad_norm": 52.175819396972656, + "learning_rate": 1.9898133333333335e-05, + "loss": 1.264, + "step": 65400 + }, + { + "epoch": 0.1047998323202683, + "grad_norm": 147.7506866455078, + "learning_rate": 1.989457777777778e-05, + "loss": 1.2086, + "step": 65500 + }, + { + "epoch": 0.10495983206426869, + "grad_norm": 31.214550018310547, + "learning_rate": 1.9891022222222224e-05, + "loss": 1.8692, + "step": 65600 + }, + { + "epoch": 0.1051198318082691, + "grad_norm": 168.40858459472656, + "learning_rate": 1.988746666666667e-05, + "loss": 1.2745, + "step": 65700 + }, + { + "epoch": 0.10527983155226951, + "grad_norm": 401.31842041015625, + "learning_rate": 1.9883911111111113e-05, + "loss": 1.6839, + "step": 65800 + }, + { + "epoch": 0.10543983129626992, + "grad_norm": 139.64588928222656, + "learning_rate": 1.9880355555555558e-05, + "loss": 1.4509, + "step": 65900 + }, + { + "epoch": 0.10559983104027033, + "grad_norm": 125.26469421386719, + "learning_rate": 1.98768e-05, + "loss": 1.1615, + "step": 66000 + }, + { + "epoch": 0.10575983078427074, + "grad_norm": 0.1609152853488922, + "learning_rate": 1.9873244444444447e-05, + "loss": 1.4458, + "step": 66100 + }, + { + "epoch": 0.10591983052827116, + "grad_norm": 0.000580300809815526, + "learning_rate": 1.9869688888888888e-05, + "loss": 1.8329, + "step": 66200 + }, + { + "epoch": 0.10607983027227157, + "grad_norm": 101.93132781982422, + "learning_rate": 1.9866133333333336e-05, + "loss": 1.567, + "step": 66300 + }, + { + "epoch": 0.10623983001627198, + "grad_norm": 99.72083282470703, + "learning_rate": 1.9862577777777777e-05, + "loss": 1.6746, + "step": 66400 + }, + { + "epoch": 0.10639982976027239, + "grad_norm": 1651.70263671875, + "learning_rate": 1.9859022222222225e-05, + "loss": 1.65, + "step": 66500 + }, + { + "epoch": 0.1065598295042728, + "grad_norm": 132.80343627929688, + "learning_rate": 1.9855466666666666e-05, + "loss": 1.5497, + "step": 66600 + }, + { + "epoch": 0.1067198292482732, + "grad_norm": 0.004364237189292908, + "learning_rate": 1.9851911111111114e-05, + "loss": 1.4009, + "step": 66700 + }, + { + "epoch": 0.1068798289922736, + "grad_norm": 4.1050825119018555, + "learning_rate": 1.9848355555555556e-05, + "loss": 2.058, + "step": 66800 + }, + { + "epoch": 0.10703982873627402, + "grad_norm": 0.047410767525434494, + "learning_rate": 1.9844800000000004e-05, + "loss": 1.6306, + "step": 66900 + }, + { + "epoch": 0.10719982848027443, + "grad_norm": 6.651243686676025, + "learning_rate": 1.9841244444444445e-05, + "loss": 1.4377, + "step": 67000 + }, + { + "epoch": 0.10735982822427484, + "grad_norm": 0.010524190030992031, + "learning_rate": 1.983768888888889e-05, + "loss": 1.4501, + "step": 67100 + }, + { + "epoch": 0.10751982796827525, + "grad_norm": 0.0009414692758582532, + "learning_rate": 1.9834133333333334e-05, + "loss": 1.2648, + "step": 67200 + }, + { + "epoch": 0.10767982771227566, + "grad_norm": 1019.7636108398438, + "learning_rate": 1.983057777777778e-05, + "loss": 1.3186, + "step": 67300 + }, + { + "epoch": 0.10783982745627607, + "grad_norm": 0.006541971582919359, + "learning_rate": 1.982705777777778e-05, + "loss": 1.1313, + "step": 67400 + }, + { + "epoch": 
0.10799982720027648, + "grad_norm": 323.63226318359375, + "learning_rate": 1.9823502222222224e-05, + "loss": 2.2523, + "step": 67500 + }, + { + "epoch": 0.1081598269442769, + "grad_norm": 114.84791564941406, + "learning_rate": 1.981994666666667e-05, + "loss": 1.9146, + "step": 67600 + }, + { + "epoch": 0.1083198266882773, + "grad_norm": 4.059427738189697, + "learning_rate": 1.9816391111111113e-05, + "loss": 1.7334, + "step": 67700 + }, + { + "epoch": 0.10847982643227772, + "grad_norm": 3.274331569671631, + "learning_rate": 1.9812835555555558e-05, + "loss": 1.7195, + "step": 67800 + }, + { + "epoch": 0.10863982617627811, + "grad_norm": 0.0480005145072937, + "learning_rate": 1.9809280000000002e-05, + "loss": 1.4661, + "step": 67900 + }, + { + "epoch": 0.10879982592027852, + "grad_norm": 45.43354415893555, + "learning_rate": 1.9805724444444447e-05, + "loss": 1.3503, + "step": 68000 + }, + { + "epoch": 0.10895982566427893, + "grad_norm": 0.0006582220084965229, + "learning_rate": 1.980216888888889e-05, + "loss": 1.0129, + "step": 68100 + }, + { + "epoch": 0.10911982540827934, + "grad_norm": 111.87661743164062, + "learning_rate": 1.9798613333333332e-05, + "loss": 1.6036, + "step": 68200 + }, + { + "epoch": 0.10927982515227976, + "grad_norm": 122.35249328613281, + "learning_rate": 1.979505777777778e-05, + "loss": 0.9312, + "step": 68300 + }, + { + "epoch": 0.10943982489628017, + "grad_norm": 0.5635089874267578, + "learning_rate": 1.979150222222222e-05, + "loss": 1.5817, + "step": 68400 + }, + { + "epoch": 0.10959982464028058, + "grad_norm": 2.6275858879089355, + "learning_rate": 1.978794666666667e-05, + "loss": 1.2024, + "step": 68500 + }, + { + "epoch": 0.10975982438428099, + "grad_norm": 0.6521372199058533, + "learning_rate": 1.978439111111111e-05, + "loss": 0.985, + "step": 68600 + }, + { + "epoch": 0.1099198241282814, + "grad_norm": 2.0386836528778076, + "learning_rate": 1.978083555555556e-05, + "loss": 1.1712, + "step": 68700 + }, + { + "epoch": 0.11007982387228181, + "grad_norm": 132.18045043945312, + "learning_rate": 1.977728e-05, + "loss": 1.5874, + "step": 68800 + }, + { + "epoch": 0.11023982361628222, + "grad_norm": 0.0068659852258861065, + "learning_rate": 1.9773724444444448e-05, + "loss": 1.8551, + "step": 68900 + }, + { + "epoch": 0.11039982336028262, + "grad_norm": 84.89590454101562, + "learning_rate": 1.977016888888889e-05, + "loss": 1.232, + "step": 69000 + }, + { + "epoch": 0.11055982310428303, + "grad_norm": 191.7918243408203, + "learning_rate": 1.9766613333333337e-05, + "loss": 1.4688, + "step": 69100 + }, + { + "epoch": 0.11071982284828344, + "grad_norm": 10.109711647033691, + "learning_rate": 1.9763057777777778e-05, + "loss": 1.1107, + "step": 69200 + }, + { + "epoch": 0.11087982259228385, + "grad_norm": 63.09272766113281, + "learning_rate": 1.9759502222222226e-05, + "loss": 1.6495, + "step": 69300 + }, + { + "epoch": 0.11103982233628426, + "grad_norm": 66.24422454833984, + "learning_rate": 1.9755946666666667e-05, + "loss": 1.6278, + "step": 69400 + }, + { + "epoch": 0.11119982208028467, + "grad_norm": 95.56941223144531, + "learning_rate": 1.975239111111111e-05, + "loss": 1.7135, + "step": 69500 + }, + { + "epoch": 0.11135982182428508, + "grad_norm": 324.1082763671875, + "learning_rate": 1.9748835555555556e-05, + "loss": 1.5108, + "step": 69600 + }, + { + "epoch": 0.1115198215682855, + "grad_norm": 123.81194305419922, + "learning_rate": 1.974528e-05, + "loss": 1.4056, + "step": 69700 + }, + { + "epoch": 0.1116798213122859, + "grad_norm": 0.9657291769981384, + "learning_rate": 
1.9741724444444445e-05, + "loss": 0.9324, + "step": 69800 + }, + { + "epoch": 0.11183982105628631, + "grad_norm": 21.34449005126953, + "learning_rate": 1.973816888888889e-05, + "loss": 1.3613, + "step": 69900 + }, + { + "epoch": 0.11199982080028673, + "grad_norm": 51.35042953491211, + "learning_rate": 1.9734613333333334e-05, + "loss": 1.5283, + "step": 70000 + }, + { + "epoch": 0.11215982054428712, + "grad_norm": 16.527353286743164, + "learning_rate": 1.973105777777778e-05, + "loss": 1.3809, + "step": 70100 + }, + { + "epoch": 0.11231982028828753, + "grad_norm": 0.032987259328365326, + "learning_rate": 1.9727502222222224e-05, + "loss": 1.5552, + "step": 70200 + }, + { + "epoch": 0.11247982003228794, + "grad_norm": 96.99321746826172, + "learning_rate": 1.9723946666666668e-05, + "loss": 1.4567, + "step": 70300 + }, + { + "epoch": 0.11263981977628836, + "grad_norm": 291.11444091796875, + "learning_rate": 1.9720391111111113e-05, + "loss": 1.4404, + "step": 70400 + }, + { + "epoch": 0.11279981952028877, + "grad_norm": 1.33843195438385, + "learning_rate": 1.9716835555555557e-05, + "loss": 1.1805, + "step": 70500 + }, + { + "epoch": 0.11295981926428918, + "grad_norm": 340.20355224609375, + "learning_rate": 1.9713280000000002e-05, + "loss": 2.514, + "step": 70600 + }, + { + "epoch": 0.11311981900828959, + "grad_norm": 0.20241181552410126, + "learning_rate": 1.9709724444444446e-05, + "loss": 1.4821, + "step": 70700 + }, + { + "epoch": 0.11327981875229, + "grad_norm": 0.044828109443187714, + "learning_rate": 1.970616888888889e-05, + "loss": 1.5156, + "step": 70800 + }, + { + "epoch": 0.11343981849629041, + "grad_norm": 121.0267105102539, + "learning_rate": 1.9702648888888892e-05, + "loss": 1.5925, + "step": 70900 + }, + { + "epoch": 0.11359981824029082, + "grad_norm": 1217.32373046875, + "learning_rate": 1.9699093333333333e-05, + "loss": 1.9517, + "step": 71000 + }, + { + "epoch": 0.11375981798429123, + "grad_norm": 102.1255111694336, + "learning_rate": 1.969553777777778e-05, + "loss": 1.2685, + "step": 71100 + }, + { + "epoch": 0.11391981772829163, + "grad_norm": 0.0009581278427504003, + "learning_rate": 1.9691982222222222e-05, + "loss": 1.6314, + "step": 71200 + }, + { + "epoch": 0.11407981747229204, + "grad_norm": 105.03948974609375, + "learning_rate": 1.968842666666667e-05, + "loss": 1.5252, + "step": 71300 + }, + { + "epoch": 0.11423981721629245, + "grad_norm": 2.6692819595336914, + "learning_rate": 1.968487111111111e-05, + "loss": 1.5176, + "step": 71400 + }, + { + "epoch": 0.11439981696029286, + "grad_norm": 93.15460205078125, + "learning_rate": 1.968131555555556e-05, + "loss": 1.3461, + "step": 71500 + }, + { + "epoch": 0.11455981670429327, + "grad_norm": 37.849117279052734, + "learning_rate": 1.967776e-05, + "loss": 1.3832, + "step": 71600 + }, + { + "epoch": 0.11471981644829368, + "grad_norm": 3.6809959411621094, + "learning_rate": 1.967420444444445e-05, + "loss": 1.2962, + "step": 71700 + }, + { + "epoch": 0.11487981619229409, + "grad_norm": 40.560264587402344, + "learning_rate": 1.967064888888889e-05, + "loss": 1.5179, + "step": 71800 + }, + { + "epoch": 0.1150398159362945, + "grad_norm": 0.17644475400447845, + "learning_rate": 1.9667093333333334e-05, + "loss": 1.1041, + "step": 71900 + }, + { + "epoch": 0.11519981568029491, + "grad_norm": 0.05514904111623764, + "learning_rate": 1.966353777777778e-05, + "loss": 1.5031, + "step": 72000 + }, + { + "epoch": 0.11535981542429533, + "grad_norm": 23.659364700317383, + "learning_rate": 1.9659982222222223e-05, + "loss": 1.5412, + "step": 72100 + 
}, + { + "epoch": 0.11551981516829574, + "grad_norm": 0.0025822233874350786, + "learning_rate": 1.9656426666666668e-05, + "loss": 1.2971, + "step": 72200 + }, + { + "epoch": 0.11567981491229615, + "grad_norm": 2.382300853729248, + "learning_rate": 1.9652871111111112e-05, + "loss": 1.0979, + "step": 72300 + }, + { + "epoch": 0.11583981465629654, + "grad_norm": 150.96646118164062, + "learning_rate": 1.9649315555555557e-05, + "loss": 1.307, + "step": 72400 + }, + { + "epoch": 0.11599981440029696, + "grad_norm": 0.022950541228055954, + "learning_rate": 1.964576e-05, + "loss": 1.3418, + "step": 72500 + }, + { + "epoch": 0.11615981414429737, + "grad_norm": 21.7007999420166, + "learning_rate": 1.9642204444444446e-05, + "loss": 1.7298, + "step": 72600 + }, + { + "epoch": 0.11631981388829778, + "grad_norm": 100.60992431640625, + "learning_rate": 1.963864888888889e-05, + "loss": 1.68, + "step": 72700 + }, + { + "epoch": 0.11647981363229819, + "grad_norm": 104.22400665283203, + "learning_rate": 1.9635093333333335e-05, + "loss": 1.3106, + "step": 72800 + }, + { + "epoch": 0.1166398133762986, + "grad_norm": 0.01572352461516857, + "learning_rate": 1.963153777777778e-05, + "loss": 1.0954, + "step": 72900 + }, + { + "epoch": 0.11679981312029901, + "grad_norm": 0.1720964014530182, + "learning_rate": 1.9627982222222224e-05, + "loss": 1.5994, + "step": 73000 + }, + { + "epoch": 0.11695981286429942, + "grad_norm": 90.89932250976562, + "learning_rate": 1.962442666666667e-05, + "loss": 1.5953, + "step": 73100 + }, + { + "epoch": 0.11711981260829983, + "grad_norm": 94.24946594238281, + "learning_rate": 1.9620871111111113e-05, + "loss": 1.9498, + "step": 73200 + }, + { + "epoch": 0.11727981235230024, + "grad_norm": 0.08061110228300095, + "learning_rate": 1.9617315555555554e-05, + "loss": 0.9937, + "step": 73300 + }, + { + "epoch": 0.11743981209630065, + "grad_norm": 9.990059852600098, + "learning_rate": 1.9613760000000002e-05, + "loss": 1.4753, + "step": 73400 + }, + { + "epoch": 0.11759981184030105, + "grad_norm": 1.572757601737976, + "learning_rate": 1.9610204444444444e-05, + "loss": 1.417, + "step": 73500 + }, + { + "epoch": 0.11775981158430146, + "grad_norm": 23.618915557861328, + "learning_rate": 1.960664888888889e-05, + "loss": 1.596, + "step": 73600 + }, + { + "epoch": 0.11791981132830187, + "grad_norm": 90.75736999511719, + "learning_rate": 1.9603093333333333e-05, + "loss": 1.8794, + "step": 73700 + }, + { + "epoch": 0.11807981107230228, + "grad_norm": 0.07401008158922195, + "learning_rate": 1.959953777777778e-05, + "loss": 1.3118, + "step": 73800 + }, + { + "epoch": 0.11823981081630269, + "grad_norm": 80.61852264404297, + "learning_rate": 1.9595982222222222e-05, + "loss": 1.732, + "step": 73900 + }, + { + "epoch": 0.1183998105603031, + "grad_norm": 0.18783807754516602, + "learning_rate": 1.959242666666667e-05, + "loss": 1.4504, + "step": 74000 + }, + { + "epoch": 0.11855981030430351, + "grad_norm": 0.0010747779160737991, + "learning_rate": 1.958887111111111e-05, + "loss": 1.0878, + "step": 74100 + }, + { + "epoch": 0.11871981004830393, + "grad_norm": 0.0007994744810275733, + "learning_rate": 1.958531555555556e-05, + "loss": 1.2488, + "step": 74200 + }, + { + "epoch": 0.11887980979230434, + "grad_norm": 16.047822952270508, + "learning_rate": 1.958176e-05, + "loss": 1.3887, + "step": 74300 + }, + { + "epoch": 0.11903980953630475, + "grad_norm": 105.44868469238281, + "learning_rate": 1.9578204444444448e-05, + "loss": 1.2265, + "step": 74400 + }, + { + "epoch": 0.11919980928030516, + "grad_norm": 
0.0008991442155092955, + "learning_rate": 1.957464888888889e-05, + "loss": 1.4668, + "step": 74500 + }, + { + "epoch": 0.11935980902430555, + "grad_norm": 0.04507048800587654, + "learning_rate": 1.9571093333333334e-05, + "loss": 1.6258, + "step": 74600 + }, + { + "epoch": 0.11951980876830597, + "grad_norm": 0.00036178340087644756, + "learning_rate": 1.9567537777777778e-05, + "loss": 1.9551, + "step": 74700 + }, + { + "epoch": 0.11967980851230638, + "grad_norm": 0.0032805718947201967, + "learning_rate": 1.9563982222222223e-05, + "loss": 1.1811, + "step": 74800 + }, + { + "epoch": 0.11983980825630679, + "grad_norm": 126.1537857055664, + "learning_rate": 1.9560426666666667e-05, + "loss": 1.2119, + "step": 74900 + }, + { + "epoch": 0.1199998080003072, + "grad_norm": 1.2026222944259644, + "learning_rate": 1.9556871111111112e-05, + "loss": 1.4051, + "step": 75000 + }, + { + "epoch": 0.12015980774430761, + "grad_norm": 1.8911128044128418, + "learning_rate": 1.9553351111111113e-05, + "loss": 1.2587, + "step": 75100 + }, + { + "epoch": 0.12031980748830802, + "grad_norm": 0.4351516664028168, + "learning_rate": 1.9549795555555558e-05, + "loss": 1.4563, + "step": 75200 + }, + { + "epoch": 0.12047980723230843, + "grad_norm": 0.004506128840148449, + "learning_rate": 1.9546240000000002e-05, + "loss": 1.5581, + "step": 75300 + }, + { + "epoch": 0.12063980697630884, + "grad_norm": 0.0002510923077352345, + "learning_rate": 1.9542684444444447e-05, + "loss": 1.5457, + "step": 75400 + }, + { + "epoch": 0.12079980672030925, + "grad_norm": 0.15088102221488953, + "learning_rate": 1.953912888888889e-05, + "loss": 1.2675, + "step": 75500 + }, + { + "epoch": 0.12095980646430966, + "grad_norm": 0.05502159520983696, + "learning_rate": 1.9535573333333336e-05, + "loss": 1.0948, + "step": 75600 + }, + { + "epoch": 0.12111980620831006, + "grad_norm": 0.09219387173652649, + "learning_rate": 1.953201777777778e-05, + "loss": 1.2045, + "step": 75700 + }, + { + "epoch": 0.12127980595231047, + "grad_norm": 199.2202911376953, + "learning_rate": 1.9528462222222225e-05, + "loss": 1.5964, + "step": 75800 + }, + { + "epoch": 0.12143980569631088, + "grad_norm": 11.821746826171875, + "learning_rate": 1.9524906666666666e-05, + "loss": 1.0517, + "step": 75900 + }, + { + "epoch": 0.12159980544031129, + "grad_norm": 106.8429946899414, + "learning_rate": 1.9521351111111114e-05, + "loss": 1.2883, + "step": 76000 + }, + { + "epoch": 0.1217598051843117, + "grad_norm": 0.08651433885097504, + "learning_rate": 1.9517795555555555e-05, + "loss": 1.2276, + "step": 76100 + }, + { + "epoch": 0.12191980492831211, + "grad_norm": 79.67507934570312, + "learning_rate": 1.9514240000000003e-05, + "loss": 1.2463, + "step": 76200 + }, + { + "epoch": 0.12207980467231253, + "grad_norm": 0.6488481163978577, + "learning_rate": 1.9510684444444444e-05, + "loss": 1.241, + "step": 76300 + }, + { + "epoch": 0.12223980441631294, + "grad_norm": 6.853870391845703, + "learning_rate": 1.9507128888888892e-05, + "loss": 1.8648, + "step": 76400 + }, + { + "epoch": 0.12239980416031335, + "grad_norm": 0.0021333652548491955, + "learning_rate": 1.9503573333333333e-05, + "loss": 1.4848, + "step": 76500 + }, + { + "epoch": 0.12255980390431376, + "grad_norm": 0.0014837757917121053, + "learning_rate": 1.950001777777778e-05, + "loss": 1.413, + "step": 76600 + }, + { + "epoch": 0.12271980364831417, + "grad_norm": 196.25413513183594, + "learning_rate": 1.9496462222222222e-05, + "loss": 1.594, + "step": 76700 + }, + { + "epoch": 0.12287980339231457, + "grad_norm": 117.68331909179688, + 
"learning_rate": 1.949290666666667e-05, + "loss": 1.3682, + "step": 76800 + }, + { + "epoch": 0.12303980313631498, + "grad_norm": 3.5699806213378906, + "learning_rate": 1.948935111111111e-05, + "loss": 1.159, + "step": 76900 + }, + { + "epoch": 0.12319980288031539, + "grad_norm": 0.051335014402866364, + "learning_rate": 1.9485795555555556e-05, + "loss": 1.4702, + "step": 77000 + }, + { + "epoch": 0.1233598026243158, + "grad_norm": 0.012798790819942951, + "learning_rate": 1.948224e-05, + "loss": 1.3251, + "step": 77100 + }, + { + "epoch": 0.12351980236831621, + "grad_norm": 3.92969012260437, + "learning_rate": 1.9478684444444445e-05, + "loss": 1.0538, + "step": 77200 + }, + { + "epoch": 0.12367980211231662, + "grad_norm": 2.8302226066589355, + "learning_rate": 1.947512888888889e-05, + "loss": 1.1708, + "step": 77300 + }, + { + "epoch": 0.12383980185631703, + "grad_norm": 0.11278839409351349, + "learning_rate": 1.947160888888889e-05, + "loss": 1.2864, + "step": 77400 + }, + { + "epoch": 0.12399980160031744, + "grad_norm": 99.1993408203125, + "learning_rate": 1.9468053333333335e-05, + "loss": 1.6501, + "step": 77500 + }, + { + "epoch": 0.12415980134431785, + "grad_norm": 0.22377841174602509, + "learning_rate": 1.946449777777778e-05, + "loss": 1.0104, + "step": 77600 + }, + { + "epoch": 0.12431980108831826, + "grad_norm": 99.57634735107422, + "learning_rate": 1.9460942222222225e-05, + "loss": 1.7969, + "step": 77700 + }, + { + "epoch": 0.12447980083231867, + "grad_norm": 0.9174038767814636, + "learning_rate": 1.945738666666667e-05, + "loss": 1.0293, + "step": 77800 + }, + { + "epoch": 0.12463980057631908, + "grad_norm": 61.01045227050781, + "learning_rate": 1.9453831111111114e-05, + "loss": 1.5593, + "step": 77900 + }, + { + "epoch": 0.12479980032031948, + "grad_norm": 2412.276611328125, + "learning_rate": 1.9450275555555558e-05, + "loss": 0.9902, + "step": 78000 + }, + { + "epoch": 0.12495980006431989, + "grad_norm": 241.99363708496094, + "learning_rate": 1.9446720000000003e-05, + "loss": 1.058, + "step": 78100 + }, + { + "epoch": 0.1251197998083203, + "grad_norm": 96.88700866699219, + "learning_rate": 1.9443164444444447e-05, + "loss": 1.4039, + "step": 78200 + }, + { + "epoch": 0.12527979955232071, + "grad_norm": 56.962181091308594, + "learning_rate": 1.943960888888889e-05, + "loss": 1.008, + "step": 78300 + }, + { + "epoch": 0.12543979929632113, + "grad_norm": 23.272607803344727, + "learning_rate": 1.9436053333333336e-05, + "loss": 1.4593, + "step": 78400 + }, + { + "epoch": 0.12559979904032154, + "grad_norm": 97.3494644165039, + "learning_rate": 1.9432497777777778e-05, + "loss": 1.563, + "step": 78500 + }, + { + "epoch": 0.12575979878432195, + "grad_norm": 3.625567674636841, + "learning_rate": 1.9428942222222226e-05, + "loss": 1.1569, + "step": 78600 + }, + { + "epoch": 0.12591979852832236, + "grad_norm": 63.88728332519531, + "learning_rate": 1.9425386666666667e-05, + "loss": 1.3886, + "step": 78700 + }, + { + "epoch": 0.12607979827232277, + "grad_norm": 0.533359169960022, + "learning_rate": 1.9421831111111115e-05, + "loss": 1.061, + "step": 78800 + }, + { + "epoch": 0.12623979801632318, + "grad_norm": 0.0005107554607093334, + "learning_rate": 1.9418275555555556e-05, + "loss": 1.2085, + "step": 78900 + }, + { + "epoch": 0.1263997977603236, + "grad_norm": 66.7668685913086, + "learning_rate": 1.9414720000000004e-05, + "loss": 1.8553, + "step": 79000 + }, + { + "epoch": 0.126559797504324, + "grad_norm": 2.3458669185638428, + "learning_rate": 1.9411164444444445e-05, + "loss": 1.7144, + "step": 
79100 + }, + { + "epoch": 0.1267197972483244, + "grad_norm": 101.0086669921875, + "learning_rate": 1.9407608888888893e-05, + "loss": 1.2216, + "step": 79200 + }, + { + "epoch": 0.12687979699232482, + "grad_norm": 14.662532806396484, + "learning_rate": 1.9404053333333334e-05, + "loss": 1.1646, + "step": 79300 + }, + { + "epoch": 0.12703979673632523, + "grad_norm": 70.46912384033203, + "learning_rate": 1.9400497777777782e-05, + "loss": 1.7768, + "step": 79400 + }, + { + "epoch": 0.12719979648032564, + "grad_norm": 3.7776920795440674, + "learning_rate": 1.9396942222222223e-05, + "loss": 1.1314, + "step": 79500 + }, + { + "epoch": 0.12735979622432603, + "grad_norm": 0.05991614609956741, + "learning_rate": 1.9393386666666668e-05, + "loss": 1.2374, + "step": 79600 + }, + { + "epoch": 0.12751979596832644, + "grad_norm": 1.138396978378296, + "learning_rate": 1.9389831111111112e-05, + "loss": 1.2681, + "step": 79700 + }, + { + "epoch": 0.12767979571232685, + "grad_norm": 117.04296875, + "learning_rate": 1.9386275555555557e-05, + "loss": 1.2624, + "step": 79800 + }, + { + "epoch": 0.12783979545632726, + "grad_norm": 165.1708984375, + "learning_rate": 1.9382755555555558e-05, + "loss": 1.6775, + "step": 79900 + }, + { + "epoch": 0.12799979520032767, + "grad_norm": 127.26524353027344, + "learning_rate": 1.9379200000000002e-05, + "loss": 1.3587, + "step": 80000 + }, + { + "epoch": 0.12815979494432808, + "grad_norm": 128.8250274658203, + "learning_rate": 1.9375644444444447e-05, + "loss": 1.7402, + "step": 80100 + }, + { + "epoch": 0.1283197946883285, + "grad_norm": 83.64952850341797, + "learning_rate": 1.937208888888889e-05, + "loss": 1.5349, + "step": 80200 + }, + { + "epoch": 0.1284797944323289, + "grad_norm": 2.9033825397491455, + "learning_rate": 1.9368533333333336e-05, + "loss": 0.8546, + "step": 80300 + }, + { + "epoch": 0.12863979417632931, + "grad_norm": 1.8563624620437622, + "learning_rate": 1.936497777777778e-05, + "loss": 1.3903, + "step": 80400 + }, + { + "epoch": 0.12879979392032972, + "grad_norm": 0.020641742274165154, + "learning_rate": 1.9361422222222225e-05, + "loss": 1.0712, + "step": 80500 + }, + { + "epoch": 0.12895979366433014, + "grad_norm": 0.030105268582701683, + "learning_rate": 1.935786666666667e-05, + "loss": 1.6633, + "step": 80600 + }, + { + "epoch": 0.12911979340833055, + "grad_norm": 7.39204216003418, + "learning_rate": 1.935431111111111e-05, + "loss": 1.4125, + "step": 80700 + }, + { + "epoch": 0.12927979315233096, + "grad_norm": 0.6996489763259888, + "learning_rate": 1.935075555555556e-05, + "loss": 0.6973, + "step": 80800 + }, + { + "epoch": 0.12943979289633137, + "grad_norm": 45.01316452026367, + "learning_rate": 1.93472e-05, + "loss": 1.1729, + "step": 80900 + }, + { + "epoch": 0.12959979264033178, + "grad_norm": 0.2586953938007355, + "learning_rate": 1.9343644444444448e-05, + "loss": 1.2217, + "step": 81000 + }, + { + "epoch": 0.1297597923843322, + "grad_norm": 0.02437330223619938, + "learning_rate": 1.934008888888889e-05, + "loss": 1.3184, + "step": 81100 + }, + { + "epoch": 0.1299197921283326, + "grad_norm": 86.13786315917969, + "learning_rate": 1.9336533333333334e-05, + "loss": 1.2718, + "step": 81200 + }, + { + "epoch": 0.130079791872333, + "grad_norm": 129.18377685546875, + "learning_rate": 1.9332977777777778e-05, + "loss": 1.1913, + "step": 81300 + }, + { + "epoch": 0.13023979161633342, + "grad_norm": 0.21126429736614227, + "learning_rate": 1.9329422222222223e-05, + "loss": 1.4728, + "step": 81400 + }, + { + "epoch": 0.13039979136033383, + "grad_norm": 
17.239547729492188, + "learning_rate": 1.9325902222222224e-05, + "loss": 1.1221, + "step": 81500 + }, + { + "epoch": 0.13055979110433424, + "grad_norm": 3.2373907566070557, + "learning_rate": 1.932234666666667e-05, + "loss": 1.235, + "step": 81600 + }, + { + "epoch": 0.13071979084833465, + "grad_norm": 5.152343273162842, + "learning_rate": 1.9318791111111113e-05, + "loss": 1.3497, + "step": 81700 + }, + { + "epoch": 0.13087979059233507, + "grad_norm": 88.2583236694336, + "learning_rate": 1.9315235555555558e-05, + "loss": 1.2361, + "step": 81800 + }, + { + "epoch": 0.13103979033633545, + "grad_norm": 0.001005143509246409, + "learning_rate": 1.9311680000000002e-05, + "loss": 2.0015, + "step": 81900 + }, + { + "epoch": 0.13119979008033586, + "grad_norm": 0.3949466347694397, + "learning_rate": 1.9308124444444447e-05, + "loss": 1.2259, + "step": 82000 + }, + { + "epoch": 0.13135978982433627, + "grad_norm": 0.6978406310081482, + "learning_rate": 1.930456888888889e-05, + "loss": 0.9236, + "step": 82100 + }, + { + "epoch": 0.13151978956833668, + "grad_norm": 0.6740103363990784, + "learning_rate": 1.9301013333333332e-05, + "loss": 1.5339, + "step": 82200 + }, + { + "epoch": 0.1316797893123371, + "grad_norm": 0.007084805518388748, + "learning_rate": 1.929745777777778e-05, + "loss": 1.2036, + "step": 82300 + }, + { + "epoch": 0.1318397890563375, + "grad_norm": 88.51764678955078, + "learning_rate": 1.929390222222222e-05, + "loss": 1.2631, + "step": 82400 + }, + { + "epoch": 0.1319997888003379, + "grad_norm": 0.000969950866419822, + "learning_rate": 1.929034666666667e-05, + "loss": 1.0858, + "step": 82500 + }, + { + "epoch": 0.13215978854433832, + "grad_norm": 0.10399264842271805, + "learning_rate": 1.928679111111111e-05, + "loss": 1.635, + "step": 82600 + }, + { + "epoch": 0.13231978828833874, + "grad_norm": 39.86758804321289, + "learning_rate": 1.928323555555556e-05, + "loss": 1.285, + "step": 82700 + }, + { + "epoch": 0.13247978803233915, + "grad_norm": 2.6627957820892334, + "learning_rate": 1.927968e-05, + "loss": 1.1209, + "step": 82800 + }, + { + "epoch": 0.13263978777633956, + "grad_norm": 0.2310008406639099, + "learning_rate": 1.9276124444444448e-05, + "loss": 1.4032, + "step": 82900 + }, + { + "epoch": 0.13279978752033997, + "grad_norm": 76.39102935791016, + "learning_rate": 1.927256888888889e-05, + "loss": 1.1279, + "step": 83000 + }, + { + "epoch": 0.13295978726434038, + "grad_norm": 0.5016289949417114, + "learning_rate": 1.9269013333333337e-05, + "loss": 1.5145, + "step": 83100 + }, + { + "epoch": 0.1331197870083408, + "grad_norm": 0.2468506544828415, + "learning_rate": 1.9265457777777778e-05, + "loss": 1.4923, + "step": 83200 + }, + { + "epoch": 0.1332797867523412, + "grad_norm": 0.03473009541630745, + "learning_rate": 1.9261902222222222e-05, + "loss": 0.9845, + "step": 83300 + }, + { + "epoch": 0.1334397864963416, + "grad_norm": 15.979637145996094, + "learning_rate": 1.9258346666666667e-05, + "loss": 1.3847, + "step": 83400 + }, + { + "epoch": 0.13359978624034202, + "grad_norm": 0.13443760573863983, + "learning_rate": 1.925479111111111e-05, + "loss": 1.0149, + "step": 83500 + }, + { + "epoch": 0.13375978598434243, + "grad_norm": 0.09114881604909897, + "learning_rate": 1.9251235555555556e-05, + "loss": 1.2644, + "step": 83600 + }, + { + "epoch": 0.13391978572834284, + "grad_norm": 67.14397430419922, + "learning_rate": 1.924768e-05, + "loss": 1.2981, + "step": 83700 + }, + { + "epoch": 0.13407978547234325, + "grad_norm": 9.479357719421387, + "learning_rate": 1.9244124444444445e-05, + 
"loss": 1.6903, + "step": 83800 + }, + { + "epoch": 0.13423978521634367, + "grad_norm": 222.48973083496094, + "learning_rate": 1.924056888888889e-05, + "loss": 1.2846, + "step": 83900 + }, + { + "epoch": 0.13439978496034408, + "grad_norm": 0.24070465564727783, + "learning_rate": 1.9237013333333334e-05, + "loss": 1.4647, + "step": 84000 + }, + { + "epoch": 0.13455978470434446, + "grad_norm": 1444.743896484375, + "learning_rate": 1.923345777777778e-05, + "loss": 1.1213, + "step": 84100 + }, + { + "epoch": 0.13471978444834487, + "grad_norm": 1.174815058708191, + "learning_rate": 1.9229902222222223e-05, + "loss": 1.1379, + "step": 84200 + }, + { + "epoch": 0.13487978419234528, + "grad_norm": 120.12804412841797, + "learning_rate": 1.9226346666666668e-05, + "loss": 1.2793, + "step": 84300 + }, + { + "epoch": 0.1350397839363457, + "grad_norm": 89.50218200683594, + "learning_rate": 1.9222791111111113e-05, + "loss": 1.343, + "step": 84400 + }, + { + "epoch": 0.1351997836803461, + "grad_norm": 100.86327362060547, + "learning_rate": 1.9219235555555557e-05, + "loss": 1.8342, + "step": 84500 + }, + { + "epoch": 0.1353597834243465, + "grad_norm": 99.5421371459961, + "learning_rate": 1.921568e-05, + "loss": 1.0487, + "step": 84600 + }, + { + "epoch": 0.13551978316834692, + "grad_norm": 0.0015393303474411368, + "learning_rate": 1.9212124444444446e-05, + "loss": 1.1531, + "step": 84700 + }, + { + "epoch": 0.13567978291234734, + "grad_norm": 3.457564353942871, + "learning_rate": 1.920856888888889e-05, + "loss": 0.8552, + "step": 84800 + }, + { + "epoch": 0.13583978265634775, + "grad_norm": 0.00041561800753697753, + "learning_rate": 1.9205013333333335e-05, + "loss": 1.1422, + "step": 84900 + }, + { + "epoch": 0.13599978240034816, + "grad_norm": 0.0012223550584167242, + "learning_rate": 1.920145777777778e-05, + "loss": 1.0918, + "step": 85000 + }, + { + "epoch": 0.13615978214434857, + "grad_norm": 0.21084783971309662, + "learning_rate": 1.9197902222222224e-05, + "loss": 1.2873, + "step": 85100 + }, + { + "epoch": 0.13631978188834898, + "grad_norm": 24.241910934448242, + "learning_rate": 1.919434666666667e-05, + "loss": 1.547, + "step": 85200 + }, + { + "epoch": 0.1364797816323494, + "grad_norm": 47.714027404785156, + "learning_rate": 1.9190791111111114e-05, + "loss": 1.5094, + "step": 85300 + }, + { + "epoch": 0.1366397813763498, + "grad_norm": 6.054490089416504, + "learning_rate": 1.9187235555555558e-05, + "loss": 1.051, + "step": 85400 + }, + { + "epoch": 0.1367997811203502, + "grad_norm": 0.001112865749746561, + "learning_rate": 1.9183680000000003e-05, + "loss": 0.9952, + "step": 85500 + }, + { + "epoch": 0.13695978086435062, + "grad_norm": 0.015406353399157524, + "learning_rate": 1.9180124444444447e-05, + "loss": 1.1978, + "step": 85600 + }, + { + "epoch": 0.13711978060835103, + "grad_norm": 17.65171241760254, + "learning_rate": 1.9176604444444445e-05, + "loss": 1.5221, + "step": 85700 + }, + { + "epoch": 0.13727978035235144, + "grad_norm": 14.018333435058594, + "learning_rate": 1.917304888888889e-05, + "loss": 1.3841, + "step": 85800 + }, + { + "epoch": 0.13743978009635185, + "grad_norm": 0.0006000687135383487, + "learning_rate": 1.9169493333333334e-05, + "loss": 1.3999, + "step": 85900 + }, + { + "epoch": 0.13759977984035227, + "grad_norm": 1715.1507568359375, + "learning_rate": 1.916593777777778e-05, + "loss": 1.5574, + "step": 86000 + }, + { + "epoch": 0.13775977958435268, + "grad_norm": 4.6950907707214355, + "learning_rate": 1.9162382222222223e-05, + "loss": 1.3267, + "step": 86100 + }, + { + 
"epoch": 0.1379197793283531, + "grad_norm": 0.8909225463867188, + "learning_rate": 1.9158826666666668e-05, + "loss": 1.358, + "step": 86200 + }, + { + "epoch": 0.1380797790723535, + "grad_norm": 27.72040367126465, + "learning_rate": 1.9155271111111112e-05, + "loss": 1.5441, + "step": 86300 + }, + { + "epoch": 0.13823977881635388, + "grad_norm": 120.01333618164062, + "learning_rate": 1.9151715555555557e-05, + "loss": 1.4124, + "step": 86400 + }, + { + "epoch": 0.1383997785603543, + "grad_norm": 27.406797409057617, + "learning_rate": 1.914816e-05, + "loss": 0.8352, + "step": 86500 + }, + { + "epoch": 0.1385597783043547, + "grad_norm": 0.07549207657575607, + "learning_rate": 1.9144604444444446e-05, + "loss": 1.2549, + "step": 86600 + }, + { + "epoch": 0.1387197780483551, + "grad_norm": 1.3746123313903809, + "learning_rate": 1.914104888888889e-05, + "loss": 1.4328, + "step": 86700 + }, + { + "epoch": 0.13887977779235552, + "grad_norm": 0.002391360467299819, + "learning_rate": 1.9137493333333335e-05, + "loss": 1.2577, + "step": 86800 + }, + { + "epoch": 0.13903977753635594, + "grad_norm": 39.3692626953125, + "learning_rate": 1.913393777777778e-05, + "loss": 1.4417, + "step": 86900 + }, + { + "epoch": 0.13919977728035635, + "grad_norm": 0.0015022088773548603, + "learning_rate": 1.9130382222222224e-05, + "loss": 1.1927, + "step": 87000 + }, + { + "epoch": 0.13935977702435676, + "grad_norm": 3.776437520980835, + "learning_rate": 1.912682666666667e-05, + "loss": 1.4435, + "step": 87100 + }, + { + "epoch": 0.13951977676835717, + "grad_norm": 9.693673133850098, + "learning_rate": 1.9123271111111113e-05, + "loss": 1.3579, + "step": 87200 + }, + { + "epoch": 0.13967977651235758, + "grad_norm": 47.54679870605469, + "learning_rate": 1.9119751111111114e-05, + "loss": 1.3883, + "step": 87300 + }, + { + "epoch": 0.139839776256358, + "grad_norm": 57.24945068359375, + "learning_rate": 1.9116195555555555e-05, + "loss": 1.2645, + "step": 87400 + }, + { + "epoch": 0.1399997760003584, + "grad_norm": 0.0025031184777617455, + "learning_rate": 1.9112640000000003e-05, + "loss": 1.1366, + "step": 87500 + }, + { + "epoch": 0.1401597757443588, + "grad_norm": 0.015484058298170567, + "learning_rate": 1.9109084444444445e-05, + "loss": 1.4566, + "step": 87600 + }, + { + "epoch": 0.14031977548835922, + "grad_norm": 0.24919560551643372, + "learning_rate": 1.9105528888888893e-05, + "loss": 1.447, + "step": 87700 + }, + { + "epoch": 0.14047977523235963, + "grad_norm": 74.2865219116211, + "learning_rate": 1.9101973333333334e-05, + "loss": 1.0701, + "step": 87800 + }, + { + "epoch": 0.14063977497636004, + "grad_norm": 127.0066909790039, + "learning_rate": 1.909841777777778e-05, + "loss": 1.3449, + "step": 87900 + }, + { + "epoch": 0.14079977472036045, + "grad_norm": 87.54583740234375, + "learning_rate": 1.9094862222222223e-05, + "loss": 1.4331, + "step": 88000 + }, + { + "epoch": 0.14095977446436087, + "grad_norm": 0.001399531727656722, + "learning_rate": 1.909130666666667e-05, + "loss": 1.3965, + "step": 88100 + }, + { + "epoch": 0.14111977420836128, + "grad_norm": 69.87310028076172, + "learning_rate": 1.9087751111111112e-05, + "loss": 1.347, + "step": 88200 + }, + { + "epoch": 0.1412797739523617, + "grad_norm": 69.38946533203125, + "learning_rate": 1.9084195555555556e-05, + "loss": 1.0262, + "step": 88300 + }, + { + "epoch": 0.1414397736963621, + "grad_norm": 18.69589614868164, + "learning_rate": 1.908064e-05, + "loss": 1.0787, + "step": 88400 + }, + { + "epoch": 0.1415997734403625, + "grad_norm": 0.20538243651390076, + 
"learning_rate": 1.9077084444444446e-05, + "loss": 1.3829, + "step": 88500 + }, + { + "epoch": 0.1417597731843629, + "grad_norm": 0.0005450706230476499, + "learning_rate": 1.907352888888889e-05, + "loss": 1.2001, + "step": 88600 + }, + { + "epoch": 0.1419197729283633, + "grad_norm": 2.548616409301758, + "learning_rate": 1.9069973333333335e-05, + "loss": 1.2407, + "step": 88700 + }, + { + "epoch": 0.1420797726723637, + "grad_norm": 139.437744140625, + "learning_rate": 1.906641777777778e-05, + "loss": 1.6291, + "step": 88800 + }, + { + "epoch": 0.14223977241636412, + "grad_norm": 10.629435539245605, + "learning_rate": 1.9062862222222224e-05, + "loss": 1.1502, + "step": 88900 + }, + { + "epoch": 0.14239977216036454, + "grad_norm": 3.494685411453247, + "learning_rate": 1.905930666666667e-05, + "loss": 1.2155, + "step": 89000 + }, + { + "epoch": 0.14255977190436495, + "grad_norm": 107.18891143798828, + "learning_rate": 1.9055751111111113e-05, + "loss": 1.3381, + "step": 89100 + }, + { + "epoch": 0.14271977164836536, + "grad_norm": 2575.91796875, + "learning_rate": 1.9052195555555557e-05, + "loss": 0.819, + "step": 89200 + }, + { + "epoch": 0.14287977139236577, + "grad_norm": 302.19500732421875, + "learning_rate": 1.9048640000000002e-05, + "loss": 1.0402, + "step": 89300 + }, + { + "epoch": 0.14303977113636618, + "grad_norm": 87.07076263427734, + "learning_rate": 1.9045084444444447e-05, + "loss": 1.1062, + "step": 89400 + }, + { + "epoch": 0.1431997708803666, + "grad_norm": 5.228755950927734, + "learning_rate": 1.9041528888888888e-05, + "loss": 1.6693, + "step": 89500 + }, + { + "epoch": 0.143359770624367, + "grad_norm": 0.20638461410999298, + "learning_rate": 1.9037973333333336e-05, + "loss": 1.1991, + "step": 89600 + }, + { + "epoch": 0.1435197703683674, + "grad_norm": 114.9300308227539, + "learning_rate": 1.9034417777777777e-05, + "loss": 1.3535, + "step": 89700 + }, + { + "epoch": 0.14367977011236782, + "grad_norm": 86.26241302490234, + "learning_rate": 1.9030862222222225e-05, + "loss": 1.6776, + "step": 89800 + }, + { + "epoch": 0.14383976985636823, + "grad_norm": 78.20118713378906, + "learning_rate": 1.9027306666666666e-05, + "loss": 1.2221, + "step": 89900 + }, + { + "epoch": 0.14399976960036864, + "grad_norm": 7.4184088706970215, + "learning_rate": 1.9023751111111114e-05, + "loss": 1.0253, + "step": 90000 + }, + { + "epoch": 0.14415976934436905, + "grad_norm": 99.789794921875, + "learning_rate": 1.9020195555555555e-05, + "loss": 1.0469, + "step": 90100 + }, + { + "epoch": 0.14431976908836947, + "grad_norm": 0.003789502428844571, + "learning_rate": 1.9016640000000003e-05, + "loss": 1.2465, + "step": 90200 + }, + { + "epoch": 0.14447976883236988, + "grad_norm": 0.27373766899108887, + "learning_rate": 1.9013084444444444e-05, + "loss": 1.4068, + "step": 90300 + }, + { + "epoch": 0.1446397685763703, + "grad_norm": 101.30089569091797, + "learning_rate": 1.9009528888888892e-05, + "loss": 1.5961, + "step": 90400 + }, + { + "epoch": 0.1447997683203707, + "grad_norm": 0.24238981306552887, + "learning_rate": 1.9005973333333333e-05, + "loss": 1.0579, + "step": 90500 + }, + { + "epoch": 0.1449597680643711, + "grad_norm": 0.6612280011177063, + "learning_rate": 1.900241777777778e-05, + "loss": 0.941, + "step": 90600 + }, + { + "epoch": 0.14511976780837152, + "grad_norm": 3.1052684783935547, + "learning_rate": 1.8998862222222222e-05, + "loss": 1.1861, + "step": 90700 + }, + { + "epoch": 0.14527976755237193, + "grad_norm": 9.876090049743652, + "learning_rate": 1.8995306666666667e-05, + "loss": 1.4697, + 
"step": 90800 + }, + { + "epoch": 0.1454397672963723, + "grad_norm": 32.829795837402344, + "learning_rate": 1.899175111111111e-05, + "loss": 0.6486, + "step": 90900 + }, + { + "epoch": 0.14559976704037272, + "grad_norm": 2219.107177734375, + "learning_rate": 1.8988195555555556e-05, + "loss": 1.3865, + "step": 91000 + }, + { + "epoch": 0.14575976678437313, + "grad_norm": 0.2465362697839737, + "learning_rate": 1.898464e-05, + "loss": 1.1494, + "step": 91100 + }, + { + "epoch": 0.14591976652837355, + "grad_norm": 0.06304822117090225, + "learning_rate": 1.8981084444444445e-05, + "loss": 1.3623, + "step": 91200 + }, + { + "epoch": 0.14607976627237396, + "grad_norm": 0.0003378583351150155, + "learning_rate": 1.897752888888889e-05, + "loss": 1.2193, + "step": 91300 + }, + { + "epoch": 0.14623976601637437, + "grad_norm": 0.023890919983386993, + "learning_rate": 1.8973973333333334e-05, + "loss": 1.3003, + "step": 91400 + }, + { + "epoch": 0.14639976576037478, + "grad_norm": 0.20042432844638824, + "learning_rate": 1.8970453333333335e-05, + "loss": 1.2608, + "step": 91500 + }, + { + "epoch": 0.1465597655043752, + "grad_norm": 0.4310738742351532, + "learning_rate": 1.896689777777778e-05, + "loss": 1.2544, + "step": 91600 + }, + { + "epoch": 0.1467197652483756, + "grad_norm": 6.881536960601807, + "learning_rate": 1.8963342222222224e-05, + "loss": 1.332, + "step": 91700 + }, + { + "epoch": 0.146879764992376, + "grad_norm": 34.862266540527344, + "learning_rate": 1.895978666666667e-05, + "loss": 1.3548, + "step": 91800 + }, + { + "epoch": 0.14703976473637642, + "grad_norm": 41.60286331176758, + "learning_rate": 1.8956231111111114e-05, + "loss": 1.54, + "step": 91900 + }, + { + "epoch": 0.14719976448037683, + "grad_norm": 0.21723419427871704, + "learning_rate": 1.8952675555555558e-05, + "loss": 1.3125, + "step": 92000 + }, + { + "epoch": 0.14735976422437724, + "grad_norm": 12.313715934753418, + "learning_rate": 1.894912e-05, + "loss": 0.897, + "step": 92100 + }, + { + "epoch": 0.14751976396837765, + "grad_norm": 9.945670171873644e-05, + "learning_rate": 1.8945564444444447e-05, + "loss": 1.1594, + "step": 92200 + }, + { + "epoch": 0.14767976371237806, + "grad_norm": 0.00018985375936608762, + "learning_rate": 1.894200888888889e-05, + "loss": 0.9194, + "step": 92300 + }, + { + "epoch": 0.14783976345637848, + "grad_norm": 6.04590368270874, + "learning_rate": 1.8938453333333336e-05, + "loss": 1.2209, + "step": 92400 + }, + { + "epoch": 0.1479997632003789, + "grad_norm": 0.19547709822654724, + "learning_rate": 1.8934897777777777e-05, + "loss": 1.0027, + "step": 92500 + }, + { + "epoch": 0.1481597629443793, + "grad_norm": 0.00870482623577118, + "learning_rate": 1.8931342222222225e-05, + "loss": 1.4675, + "step": 92600 + }, + { + "epoch": 0.1483197626883797, + "grad_norm": 112.5121078491211, + "learning_rate": 1.8927786666666667e-05, + "loss": 1.3982, + "step": 92700 + }, + { + "epoch": 0.14847976243238012, + "grad_norm": 0.000352471019141376, + "learning_rate": 1.8924231111111115e-05, + "loss": 0.8595, + "step": 92800 + }, + { + "epoch": 0.14863976217638053, + "grad_norm": 0.20730045437812805, + "learning_rate": 1.8920675555555556e-05, + "loss": 1.572, + "step": 92900 + }, + { + "epoch": 0.14879976192038094, + "grad_norm": 5.9496917724609375, + "learning_rate": 1.8917120000000004e-05, + "loss": 1.2832, + "step": 93000 + }, + { + "epoch": 0.14895976166438132, + "grad_norm": 0.012922318652272224, + "learning_rate": 1.8913564444444445e-05, + "loss": 1.2838, + "step": 93100 + }, + { + "epoch": 0.14911976140838173, + 
"grad_norm": 40.53496170043945, + "learning_rate": 1.8910008888888893e-05, + "loss": 1.6535, + "step": 93200 + }, + { + "epoch": 0.14927976115238215, + "grad_norm": 106.17526245117188, + "learning_rate": 1.8906453333333334e-05, + "loss": 1.5996, + "step": 93300 + }, + { + "epoch": 0.14943976089638256, + "grad_norm": 93.25550079345703, + "learning_rate": 1.890289777777778e-05, + "loss": 1.058, + "step": 93400 + }, + { + "epoch": 0.14959976064038297, + "grad_norm": 83.99794006347656, + "learning_rate": 1.8899342222222223e-05, + "loss": 1.3316, + "step": 93500 + }, + { + "epoch": 0.14975976038438338, + "grad_norm": 0.0036302392836660147, + "learning_rate": 1.8895822222222224e-05, + "loss": 0.8627, + "step": 93600 + }, + { + "epoch": 0.1499197601283838, + "grad_norm": 4.188179969787598, + "learning_rate": 1.889226666666667e-05, + "loss": 1.4411, + "step": 93700 + }, + { + "epoch": 0.1500797598723842, + "grad_norm": 0.38291868567466736, + "learning_rate": 1.8888711111111113e-05, + "loss": 0.9331, + "step": 93800 + }, + { + "epoch": 0.1502397596163846, + "grad_norm": 2.517091751098633, + "learning_rate": 1.8885155555555558e-05, + "loss": 1.0032, + "step": 93900 + }, + { + "epoch": 0.15039975936038502, + "grad_norm": 128.78472900390625, + "learning_rate": 1.8881600000000002e-05, + "loss": 1.2341, + "step": 94000 + }, + { + "epoch": 0.15055975910438543, + "grad_norm": 0.0004920060164295137, + "learning_rate": 1.8878044444444447e-05, + "loss": 1.3369, + "step": 94100 + }, + { + "epoch": 0.15071975884838584, + "grad_norm": 36.320411682128906, + "learning_rate": 1.887448888888889e-05, + "loss": 1.2324, + "step": 94200 + }, + { + "epoch": 0.15087975859238625, + "grad_norm": 34.70246887207031, + "learning_rate": 1.8870933333333336e-05, + "loss": 1.6952, + "step": 94300 + }, + { + "epoch": 0.15103975833638666, + "grad_norm": 0.5894516110420227, + "learning_rate": 1.886737777777778e-05, + "loss": 1.2401, + "step": 94400 + }, + { + "epoch": 0.15119975808038708, + "grad_norm": 0.0024842778220772743, + "learning_rate": 1.8863822222222222e-05, + "loss": 1.2998, + "step": 94500 + }, + { + "epoch": 0.1513597578243875, + "grad_norm": 0.16776002943515778, + "learning_rate": 1.886026666666667e-05, + "loss": 1.1458, + "step": 94600 + }, + { + "epoch": 0.1515197575683879, + "grad_norm": 0.008235426619648933, + "learning_rate": 1.885671111111111e-05, + "loss": 1.0211, + "step": 94700 + }, + { + "epoch": 0.1516797573123883, + "grad_norm": 82.82616424560547, + "learning_rate": 1.8853191111111112e-05, + "loss": 0.9866, + "step": 94800 + }, + { + "epoch": 0.15183975705638872, + "grad_norm": 0.0073872278444468975, + "learning_rate": 1.8849635555555556e-05, + "loss": 1.3636, + "step": 94900 + }, + { + "epoch": 0.15199975680038913, + "grad_norm": 11.451104164123535, + "learning_rate": 1.884608e-05, + "loss": 1.1485, + "step": 95000 + }, + { + "epoch": 0.15215975654438954, + "grad_norm": 100.47103118896484, + "learning_rate": 1.8842524444444446e-05, + "loss": 0.7671, + "step": 95100 + }, + { + "epoch": 0.15231975628838995, + "grad_norm": 0.4101124703884125, + "learning_rate": 1.883896888888889e-05, + "loss": 1.0069, + "step": 95200 + }, + { + "epoch": 0.15247975603239033, + "grad_norm": 37.40227508544922, + "learning_rate": 1.8835413333333335e-05, + "loss": 1.1276, + "step": 95300 + }, + { + "epoch": 0.15263975577639075, + "grad_norm": 141.687744140625, + "learning_rate": 1.883185777777778e-05, + "loss": 1.4477, + "step": 95400 + }, + { + "epoch": 0.15279975552039116, + "grad_norm": 0.008716798387467861, + "learning_rate": 
1.8828302222222224e-05, + "loss": 0.9887, + "step": 95500 + }, + { + "epoch": 0.15295975526439157, + "grad_norm": 7.543517858721316e-05, + "learning_rate": 1.882474666666667e-05, + "loss": 1.065, + "step": 95600 + }, + { + "epoch": 0.15311975500839198, + "grad_norm": 6.7989821434021, + "learning_rate": 1.8821191111111113e-05, + "loss": 0.982, + "step": 95700 + }, + { + "epoch": 0.1532797547523924, + "grad_norm": 27.62921714782715, + "learning_rate": 1.8817635555555557e-05, + "loss": 1.1166, + "step": 95800 + }, + { + "epoch": 0.1534397544963928, + "grad_norm": 28.467132568359375, + "learning_rate": 1.8814080000000002e-05, + "loss": 1.3949, + "step": 95900 + }, + { + "epoch": 0.1535997542403932, + "grad_norm": 89.31570434570312, + "learning_rate": 1.8810524444444447e-05, + "loss": 1.4164, + "step": 96000 + }, + { + "epoch": 0.15375975398439362, + "grad_norm": 0.30763378739356995, + "learning_rate": 1.880696888888889e-05, + "loss": 1.7997, + "step": 96100 + }, + { + "epoch": 0.15391975372839403, + "grad_norm": 90.01514434814453, + "learning_rate": 1.8803413333333336e-05, + "loss": 1.3941, + "step": 96200 + }, + { + "epoch": 0.15407975347239444, + "grad_norm": 0.9498651027679443, + "learning_rate": 1.879985777777778e-05, + "loss": 1.0592, + "step": 96300 + }, + { + "epoch": 0.15423975321639485, + "grad_norm": 88.10832977294922, + "learning_rate": 1.8796302222222225e-05, + "loss": 1.1661, + "step": 96400 + }, + { + "epoch": 0.15439975296039526, + "grad_norm": 3.867802143096924, + "learning_rate": 1.879274666666667e-05, + "loss": 1.5968, + "step": 96500 + }, + { + "epoch": 0.15455975270439568, + "grad_norm": 114.89385986328125, + "learning_rate": 1.8789191111111114e-05, + "loss": 1.2586, + "step": 96600 + }, + { + "epoch": 0.1547197524483961, + "grad_norm": 0.009951179847121239, + "learning_rate": 1.878563555555556e-05, + "loss": 1.5164, + "step": 96700 + }, + { + "epoch": 0.1548797521923965, + "grad_norm": 1.848288893699646, + "learning_rate": 1.8782080000000003e-05, + "loss": 1.5942, + "step": 96800 + }, + { + "epoch": 0.1550397519363969, + "grad_norm": 56.26310348510742, + "learning_rate": 1.8778524444444448e-05, + "loss": 0.6635, + "step": 96900 + }, + { + "epoch": 0.15519975168039732, + "grad_norm": 0.2863824963569641, + "learning_rate": 1.8774968888888892e-05, + "loss": 1.3037, + "step": 97000 + }, + { + "epoch": 0.15535975142439773, + "grad_norm": 0.006738504860550165, + "learning_rate": 1.8771413333333333e-05, + "loss": 1.3557, + "step": 97100 + }, + { + "epoch": 0.15551975116839814, + "grad_norm": 0.24526001513004303, + "learning_rate": 1.876785777777778e-05, + "loss": 1.0864, + "step": 97200 + }, + { + "epoch": 0.15567975091239855, + "grad_norm": 70.70162963867188, + "learning_rate": 1.8764302222222222e-05, + "loss": 1.3139, + "step": 97300 + }, + { + "epoch": 0.15583975065639896, + "grad_norm": 0.7548888921737671, + "learning_rate": 1.8760746666666667e-05, + "loss": 0.7139, + "step": 97400 + }, + { + "epoch": 0.15599975040039937, + "grad_norm": 0.08793803304433823, + "learning_rate": 1.875719111111111e-05, + "loss": 1.1084, + "step": 97500 + }, + { + "epoch": 0.15615975014439976, + "grad_norm": 8.044859886169434, + "learning_rate": 1.8753635555555556e-05, + "loss": 1.2294, + "step": 97600 + }, + { + "epoch": 0.15631974988840017, + "grad_norm": 0.4635624587535858, + "learning_rate": 1.875008e-05, + "loss": 0.9581, + "step": 97700 + }, + { + "epoch": 0.15647974963240058, + "grad_norm": 0.0022484343498945236, + "learning_rate": 1.8746524444444445e-05, + "loss": 1.2983, + "step": 97800 + 
}, + { + "epoch": 0.156639749376401, + "grad_norm": 2.357697010040283, + "learning_rate": 1.874296888888889e-05, + "loss": 1.8281, + "step": 97900 + }, + { + "epoch": 0.1567997491204014, + "grad_norm": 78.0554428100586, + "learning_rate": 1.8739413333333334e-05, + "loss": 1.2914, + "step": 98000 + }, + { + "epoch": 0.1569597488644018, + "grad_norm": 0.6091700196266174, + "learning_rate": 1.873585777777778e-05, + "loss": 0.8656, + "step": 98100 + }, + { + "epoch": 0.15711974860840222, + "grad_norm": 0.20535144209861755, + "learning_rate": 1.8732302222222223e-05, + "loss": 1.3438, + "step": 98200 + }, + { + "epoch": 0.15727974835240263, + "grad_norm": 0.029342494904994965, + "learning_rate": 1.8728746666666668e-05, + "loss": 1.465, + "step": 98300 + }, + { + "epoch": 0.15743974809640304, + "grad_norm": 0.20423032343387604, + "learning_rate": 1.8725191111111112e-05, + "loss": 1.2253, + "step": 98400 + }, + { + "epoch": 0.15759974784040345, + "grad_norm": 0.020203936845064163, + "learning_rate": 1.8721635555555557e-05, + "loss": 1.3481, + "step": 98500 + }, + { + "epoch": 0.15775974758440386, + "grad_norm": 0.001091059297323227, + "learning_rate": 1.871808e-05, + "loss": 1.5131, + "step": 98600 + }, + { + "epoch": 0.15791974732840428, + "grad_norm": 42.3817253112793, + "learning_rate": 1.8714524444444446e-05, + "loss": 1.4852, + "step": 98700 + }, + { + "epoch": 0.15807974707240469, + "grad_norm": 11.986414909362793, + "learning_rate": 1.871096888888889e-05, + "loss": 1.1317, + "step": 98800 + }, + { + "epoch": 0.1582397468164051, + "grad_norm": 6.878232002258301, + "learning_rate": 1.8707413333333335e-05, + "loss": 1.0395, + "step": 98900 + }, + { + "epoch": 0.1583997465604055, + "grad_norm": 0.011188351549208164, + "learning_rate": 1.8703893333333333e-05, + "loss": 0.9256, + "step": 99000 + }, + { + "epoch": 0.15855974630440592, + "grad_norm": 0.03425045683979988, + "learning_rate": 1.870033777777778e-05, + "loss": 0.9774, + "step": 99100 + }, + { + "epoch": 0.15871974604840633, + "grad_norm": 26.11473846435547, + "learning_rate": 1.8696782222222222e-05, + "loss": 0.9756, + "step": 99200 + }, + { + "epoch": 0.15887974579240674, + "grad_norm": 0.0001582380209583789, + "learning_rate": 1.869322666666667e-05, + "loss": 1.4885, + "step": 99300 + }, + { + "epoch": 0.15903974553640715, + "grad_norm": 0.1462339162826538, + "learning_rate": 1.868967111111111e-05, + "loss": 1.2373, + "step": 99400 + }, + { + "epoch": 0.15919974528040756, + "grad_norm": 20.499425888061523, + "learning_rate": 1.8686115555555556e-05, + "loss": 1.3868, + "step": 99500 + }, + { + "epoch": 0.15935974502440797, + "grad_norm": 89.42505645751953, + "learning_rate": 1.868256e-05, + "loss": 0.9238, + "step": 99600 + }, + { + "epoch": 0.15951974476840838, + "grad_norm": 4.4118266105651855, + "learning_rate": 1.8679004444444445e-05, + "loss": 1.0793, + "step": 99700 + }, + { + "epoch": 0.15967974451240877, + "grad_norm": 0.08320512622594833, + "learning_rate": 1.867544888888889e-05, + "loss": 1.2405, + "step": 99800 + }, + { + "epoch": 0.15983974425640918, + "grad_norm": 1.5630072355270386, + "learning_rate": 1.8671893333333334e-05, + "loss": 1.2417, + "step": 99900 + }, + { + "epoch": 0.1599997440004096, + "grad_norm": 1.7299790382385254, + "learning_rate": 1.866833777777778e-05, + "loss": 1.1264, + "step": 100000 + }, + { + "epoch": 0.16015974374441, + "grad_norm": 0.939391553401947, + "learning_rate": 1.8664782222222223e-05, + "loss": 1.3042, + "step": 100100 + }, + { + "epoch": 0.1603197434884104, + "grad_norm": 
1.1316514015197754, + "learning_rate": 1.8661226666666668e-05, + "loss": 1.7169, + "step": 100200 + }, + { + "epoch": 0.16047974323241082, + "grad_norm": 103.74418640136719, + "learning_rate": 1.8657671111111112e-05, + "loss": 1.0939, + "step": 100300 + }, + { + "epoch": 0.16063974297641123, + "grad_norm": 42.52919006347656, + "learning_rate": 1.8654115555555557e-05, + "loss": 1.4, + "step": 100400 + }, + { + "epoch": 0.16079974272041164, + "grad_norm": 0.11312247812747955, + "learning_rate": 1.865056e-05, + "loss": 1.1289, + "step": 100500 + }, + { + "epoch": 0.16095974246441205, + "grad_norm": 5.693640232086182, + "learning_rate": 1.8647004444444446e-05, + "loss": 1.26, + "step": 100600 + }, + { + "epoch": 0.16111974220841246, + "grad_norm": 1.3500862121582031, + "learning_rate": 1.864344888888889e-05, + "loss": 0.815, + "step": 100700 + }, + { + "epoch": 0.16127974195241288, + "grad_norm": 0.7210176587104797, + "learning_rate": 1.8639928888888888e-05, + "loss": 0.9622, + "step": 100800 + }, + { + "epoch": 0.16143974169641329, + "grad_norm": 70.74454498291016, + "learning_rate": 1.8636373333333336e-05, + "loss": 1.0715, + "step": 100900 + }, + { + "epoch": 0.1615997414404137, + "grad_norm": 619.2721557617188, + "learning_rate": 1.8632817777777777e-05, + "loss": 1.4498, + "step": 101000 + }, + { + "epoch": 0.1617597411844141, + "grad_norm": 3.1772332191467285, + "learning_rate": 1.8629262222222225e-05, + "loss": 1.2484, + "step": 101100 + }, + { + "epoch": 0.16191974092841452, + "grad_norm": 0.00012408003385644406, + "learning_rate": 1.8625706666666666e-05, + "loss": 1.5755, + "step": 101200 + }, + { + "epoch": 0.16207974067241493, + "grad_norm": 84.21326446533203, + "learning_rate": 1.8622151111111114e-05, + "loss": 1.3742, + "step": 101300 + }, + { + "epoch": 0.16223974041641534, + "grad_norm": 7.633441925048828, + "learning_rate": 1.8618595555555555e-05, + "loss": 1.6062, + "step": 101400 + }, + { + "epoch": 0.16239974016041575, + "grad_norm": 0.0011624402832239866, + "learning_rate": 1.8615040000000003e-05, + "loss": 1.6763, + "step": 101500 + }, + { + "epoch": 0.16255973990441616, + "grad_norm": 0.10516340285539627, + "learning_rate": 1.8611484444444444e-05, + "loss": 1.5295, + "step": 101600 + }, + { + "epoch": 0.16271973964841657, + "grad_norm": 0.34978896379470825, + "learning_rate": 1.8607928888888892e-05, + "loss": 1.3866, + "step": 101700 + }, + { + "epoch": 0.16287973939241698, + "grad_norm": 96.7831039428711, + "learning_rate": 1.8604373333333334e-05, + "loss": 1.1005, + "step": 101800 + }, + { + "epoch": 0.1630397391364174, + "grad_norm": 0.05071321874856949, + "learning_rate": 1.860081777777778e-05, + "loss": 0.818, + "step": 101900 + }, + { + "epoch": 0.1631997388804178, + "grad_norm": 0.011139901354908943, + "learning_rate": 1.8597262222222223e-05, + "loss": 1.6994, + "step": 102000 + }, + { + "epoch": 0.1633597386244182, + "grad_norm": 3.774827241897583, + "learning_rate": 1.8593706666666667e-05, + "loss": 0.7468, + "step": 102100 + }, + { + "epoch": 0.1635197383684186, + "grad_norm": 31.253124237060547, + "learning_rate": 1.8590151111111112e-05, + "loss": 1.1504, + "step": 102200 + }, + { + "epoch": 0.163679738112419, + "grad_norm": 0.020283292979002, + "learning_rate": 1.8586595555555556e-05, + "loss": 1.023, + "step": 102300 + }, + { + "epoch": 0.16383973785641942, + "grad_norm": 0.5132649540901184, + "learning_rate": 1.858304e-05, + "loss": 1.1705, + "step": 102400 + }, + { + "epoch": 0.16399973760041983, + "grad_norm": 65.84090423583984, + "learning_rate": 
1.8579484444444445e-05, + "loss": 1.2671, + "step": 102500 + }, + { + "epoch": 0.16415973734442024, + "grad_norm": 10.329702377319336, + "learning_rate": 1.857592888888889e-05, + "loss": 1.1874, + "step": 102600 + }, + { + "epoch": 0.16431973708842065, + "grad_norm": 0.012285147793591022, + "learning_rate": 1.8572373333333335e-05, + "loss": 1.0913, + "step": 102700 + }, + { + "epoch": 0.16447973683242106, + "grad_norm": 0.5362551212310791, + "learning_rate": 1.856881777777778e-05, + "loss": 1.3353, + "step": 102800 + }, + { + "epoch": 0.16463973657642147, + "grad_norm": 164.44480895996094, + "learning_rate": 1.8565262222222224e-05, + "loss": 1.1726, + "step": 102900 + }, + { + "epoch": 0.16479973632042189, + "grad_norm": 7.289721488952637, + "learning_rate": 1.8561706666666668e-05, + "loss": 0.9484, + "step": 103000 + }, + { + "epoch": 0.1649597360644223, + "grad_norm": 1.6847283840179443, + "learning_rate": 1.8558151111111113e-05, + "loss": 1.1276, + "step": 103100 + }, + { + "epoch": 0.1651197358084227, + "grad_norm": 0.8367934226989746, + "learning_rate": 1.8554595555555557e-05, + "loss": 1.6352, + "step": 103200 + }, + { + "epoch": 0.16527973555242312, + "grad_norm": 106.39085388183594, + "learning_rate": 1.8551040000000002e-05, + "loss": 1.1789, + "step": 103300 + }, + { + "epoch": 0.16543973529642353, + "grad_norm": 1.374647617340088, + "learning_rate": 1.8547484444444446e-05, + "loss": 1.2853, + "step": 103400 + }, + { + "epoch": 0.16559973504042394, + "grad_norm": 15.723018646240234, + "learning_rate": 1.854392888888889e-05, + "loss": 1.3151, + "step": 103500 + }, + { + "epoch": 0.16575973478442435, + "grad_norm": 11.512675285339355, + "learning_rate": 1.8540373333333336e-05, + "loss": 1.1619, + "step": 103600 + }, + { + "epoch": 0.16591973452842476, + "grad_norm": 0.00038478337228298187, + "learning_rate": 1.853681777777778e-05, + "loss": 1.2232, + "step": 103700 + }, + { + "epoch": 0.16607973427242517, + "grad_norm": 0.0019370738882571459, + "learning_rate": 1.8533262222222225e-05, + "loss": 0.8593, + "step": 103800 + }, + { + "epoch": 0.16623973401642558, + "grad_norm": 26.676523208618164, + "learning_rate": 1.852970666666667e-05, + "loss": 0.8925, + "step": 103900 + }, + { + "epoch": 0.166399733760426, + "grad_norm": 19.97356605529785, + "learning_rate": 1.8526151111111114e-05, + "loss": 1.3056, + "step": 104000 + }, + { + "epoch": 0.1665597335044264, + "grad_norm": 0.2853270173072815, + "learning_rate": 1.8522631111111115e-05, + "loss": 1.7856, + "step": 104100 + }, + { + "epoch": 0.16671973324842682, + "grad_norm": 1.1277884244918823, + "learning_rate": 1.8519075555555556e-05, + "loss": 0.7826, + "step": 104200 + }, + { + "epoch": 0.1668797329924272, + "grad_norm": 0.0009695938788354397, + "learning_rate": 1.8515520000000004e-05, + "loss": 0.8696, + "step": 104300 + }, + { + "epoch": 0.1670397327364276, + "grad_norm": 57.40456008911133, + "learning_rate": 1.8511964444444445e-05, + "loss": 1.0999, + "step": 104400 + }, + { + "epoch": 0.16719973248042802, + "grad_norm": 55.53129959106445, + "learning_rate": 1.850840888888889e-05, + "loss": 0.9611, + "step": 104500 + }, + { + "epoch": 0.16735973222442843, + "grad_norm": 39.75944519042969, + "learning_rate": 1.8504853333333334e-05, + "loss": 1.2425, + "step": 104600 + }, + { + "epoch": 0.16751973196842884, + "grad_norm": 64.35377502441406, + "learning_rate": 1.850129777777778e-05, + "loss": 0.8884, + "step": 104700 + }, + { + "epoch": 0.16767973171242925, + "grad_norm": 35.91818618774414, + "learning_rate": 
1.8497742222222223e-05, + "loss": 1.0689, + "step": 104800 + }, + { + "epoch": 0.16783973145642966, + "grad_norm": 0.041738979518413544, + "learning_rate": 1.8494186666666668e-05, + "loss": 1.0076, + "step": 104900 + }, + { + "epoch": 0.16799973120043007, + "grad_norm": 64.14539337158203, + "learning_rate": 1.8490631111111112e-05, + "loss": 1.3108, + "step": 105000 + } + ], + "logging_steps": 100, + "max_steps": 625001, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}