{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1599997440004096,
"eval_steps": 200000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001599997440004096,
"grad_norm": 84.32501983642578,
"learning_rate": 3.103950336794611e-08,
"loss": 10.8792,
"step": 100
},
{
"epoch": 0.0003199994880008192,
"grad_norm": 60.63747024536133,
"learning_rate": 6.303899137613798e-08,
"loss": 10.9284,
"step": 200
},
{
"epoch": 0.00047999923200122877,
"grad_norm": 55.71075439453125,
"learning_rate": 9.503847938432986e-08,
"loss": 10.6466,
"step": 300
},
{
"epoch": 0.0006399989760016384,
"grad_norm": 57.63307189941406,
"learning_rate": 1.2703796739252173e-07,
"loss": 10.841,
"step": 400
},
{
"epoch": 0.000799998720002048,
"grad_norm": 89.1032485961914,
"learning_rate": 1.590374554007136e-07,
"loss": 10.8094,
"step": 500
},
{
"epoch": 0.0009599984640024575,
"grad_norm": 57.2479362487793,
"learning_rate": 1.9103694340890547e-07,
"loss": 10.4323,
"step": 600
},
{
"epoch": 0.0011199982080028672,
"grad_norm": 51.17530059814453,
"learning_rate": 2.2303643141709733e-07,
"loss": 10.3032,
"step": 700
},
{
"epoch": 0.0012799979520032767,
"grad_norm": 60.76409912109375,
"learning_rate": 2.550359194252892e-07,
"loss": 10.4006,
"step": 800
},
{
"epoch": 0.0014399976960036865,
"grad_norm": 67.00859069824219,
"learning_rate": 2.870354074334811e-07,
"loss": 10.4743,
"step": 900
},
{
"epoch": 0.001599997440004096,
"grad_norm": 68.4343032836914,
"learning_rate": 3.19034895441673e-07,
"loss": 10.2334,
"step": 1000
},
{
"epoch": 0.0017599971840045055,
"grad_norm": 48.704105377197266,
"learning_rate": 3.510343834498648e-07,
"loss": 10.0135,
"step": 1100
},
{
"epoch": 0.001919996928004915,
"grad_norm": 45.30134963989258,
"learning_rate": 3.830338714580567e-07,
"loss": 9.7874,
"step": 1200
},
{
"epoch": 0.002079996672005325,
"grad_norm": 84.56024169921875,
"learning_rate": 4.150333594662486e-07,
"loss": 9.7419,
"step": 1300
},
{
"epoch": 0.0022399964160057344,
"grad_norm": 45.73213195800781,
"learning_rate": 4.470328474744404e-07,
"loss": 9.7412,
"step": 1400
},
{
"epoch": 0.002399996160006144,
"grad_norm": 50.21996307373047,
"learning_rate": 4.790323354826324e-07,
"loss": 9.4585,
"step": 1500
},
{
"epoch": 0.0025599959040065534,
"grad_norm": 59.475799560546875,
"learning_rate": 5.110318234908241e-07,
"loss": 9.5339,
"step": 1600
},
{
"epoch": 0.002719995648006963,
"grad_norm": 82.53620910644531,
"learning_rate": 5.43031311499016e-07,
"loss": 9.4345,
"step": 1700
},
{
"epoch": 0.002879995392007373,
"grad_norm": 39.44235610961914,
"learning_rate": 5.750307995072079e-07,
"loss": 9.1733,
"step": 1800
},
{
"epoch": 0.0030399951360077825,
"grad_norm": 37.58698654174805,
"learning_rate": 6.070302875153998e-07,
"loss": 8.9952,
"step": 1900
},
{
"epoch": 0.003199994880008192,
"grad_norm": 40.35204315185547,
"learning_rate": 6.390297755235917e-07,
"loss": 8.9669,
"step": 2000
},
{
"epoch": 0.0033599946240086016,
"grad_norm": 57.84451675415039,
"learning_rate": 6.707092686517017e-07,
"loss": 8.8152,
"step": 2100
},
{
"epoch": 0.003519994368009011,
"grad_norm": 40.126953125,
"learning_rate": 7.027087566598935e-07,
"loss": 8.7936,
"step": 2200
},
{
"epoch": 0.0036799941120094206,
"grad_norm": 35.435707092285156,
"learning_rate": 7.347082446680854e-07,
"loss": 8.6771,
"step": 2300
},
{
"epoch": 0.00383999385600983,
"grad_norm": 42.3509635925293,
"learning_rate": 7.667077326762773e-07,
"loss": 8.4648,
"step": 2400
},
{
"epoch": 0.00399999360001024,
"grad_norm": 33.58556365966797,
"learning_rate": 7.987072206844691e-07,
"loss": 8.5764,
"step": 2500
},
{
"epoch": 0.00415999334401065,
"grad_norm": 34.014678955078125,
"learning_rate": 8.30706708692661e-07,
"loss": 8.4587,
"step": 2600
},
{
"epoch": 0.004319993088011059,
"grad_norm": 36.43831253051758,
"learning_rate": 8.627061967008528e-07,
"loss": 8.2966,
"step": 2700
},
{
"epoch": 0.004479992832011469,
"grad_norm": 31.411684036254883,
"learning_rate": 8.947056847090448e-07,
"loss": 8.2329,
"step": 2800
},
{
"epoch": 0.004639992576011879,
"grad_norm": 47.570125579833984,
"learning_rate": 9.267051727172366e-07,
"loss": 8.1415,
"step": 2900
},
{
"epoch": 0.004799992320012288,
"grad_norm": 30.771928787231445,
"learning_rate": 9.587046607254284e-07,
"loss": 8.0404,
"step": 3000
},
{
"epoch": 0.004959992064012698,
"grad_norm": 26.92803955078125,
"learning_rate": 9.907041487336204e-07,
"loss": 7.9698,
"step": 3100
},
{
"epoch": 0.005119991808013107,
"grad_norm": 31.121917724609375,
"learning_rate": 1.0227036367418122e-06,
"loss": 7.9205,
"step": 3200
},
{
"epoch": 0.005279991552013517,
"grad_norm": 33.991416931152344,
"learning_rate": 1.054703124750004e-06,
"loss": 7.8314,
"step": 3300
},
{
"epoch": 0.005439991296013926,
"grad_norm": 31.278030395507812,
"learning_rate": 1.086702612758196e-06,
"loss": 7.8369,
"step": 3400
},
{
"epoch": 0.005599991040014336,
"grad_norm": 28.116140365600586,
"learning_rate": 1.1187021007663878e-06,
"loss": 7.6403,
"step": 3500
},
{
"epoch": 0.005759990784014746,
"grad_norm": 30.954113006591797,
"learning_rate": 1.1507015887745798e-06,
"loss": 7.5842,
"step": 3600
},
{
"epoch": 0.005919990528015155,
"grad_norm": 36.53567886352539,
"learning_rate": 1.1827010767827715e-06,
"loss": 7.5812,
"step": 3700
},
{
"epoch": 0.006079990272015565,
"grad_norm": 36.81153106689453,
"learning_rate": 1.2147005647909635e-06,
"loss": 7.4335,
"step": 3800
},
{
"epoch": 0.006239990016015974,
"grad_norm": 22.556833267211914,
"learning_rate": 1.2467000527991553e-06,
"loss": 7.4917,
"step": 3900
},
{
"epoch": 0.006399989760016384,
"grad_norm": 40.195579528808594,
"learning_rate": 1.278699540807347e-06,
"loss": 7.3204,
"step": 4000
},
{
"epoch": 0.006559989504016793,
"grad_norm": 21.862642288208008,
"learning_rate": 1.310699028815539e-06,
"loss": 7.2971,
"step": 4100
},
{
"epoch": 0.006719989248017203,
"grad_norm": 29.61161231994629,
"learning_rate": 1.3426985168237308e-06,
"loss": 7.2233,
"step": 4200
},
{
"epoch": 0.006879988992017613,
"grad_norm": 22.342451095581055,
"learning_rate": 1.3746980048319228e-06,
"loss": 7.2081,
"step": 4300
},
{
"epoch": 0.007039988736018022,
"grad_norm": 36.36684799194336,
"learning_rate": 1.4066974928401148e-06,
"loss": 7.1364,
"step": 4400
},
{
"epoch": 0.007199988480018432,
"grad_norm": 25.563953399658203,
"learning_rate": 1.4386969808483064e-06,
"loss": 7.0663,
"step": 4500
},
{
"epoch": 0.007359988224018841,
"grad_norm": 22.50385856628418,
"learning_rate": 1.4706964688564984e-06,
"loss": 6.9601,
"step": 4600
},
{
"epoch": 0.007519987968019251,
"grad_norm": 31.61231231689453,
"learning_rate": 1.5026959568646904e-06,
"loss": 6.9546,
"step": 4700
},
{
"epoch": 0.00767998771201966,
"grad_norm": 18.862520217895508,
"learning_rate": 1.5346954448728822e-06,
"loss": 6.9019,
"step": 4800
},
{
"epoch": 0.00783998745602007,
"grad_norm": 32.594539642333984,
"learning_rate": 1.5666949328810741e-06,
"loss": 6.8801,
"step": 4900
},
{
"epoch": 0.00799998720002048,
"grad_norm": 21.06804084777832,
"learning_rate": 1.598694420889266e-06,
"loss": 6.7734,
"step": 5000
},
{
"epoch": 0.00815998694402089,
"grad_norm": 31.783803939819336,
"learning_rate": 1.6303739140173757e-06,
"loss": 6.7648,
"step": 5100
},
{
"epoch": 0.0083199866880213,
"grad_norm": 49.79084777832031,
"learning_rate": 1.6623734020255677e-06,
"loss": 6.7498,
"step": 5200
},
{
"epoch": 0.008479986432021708,
"grad_norm": 26.1977481842041,
"learning_rate": 1.6943728900337597e-06,
"loss": 6.6872,
"step": 5300
},
{
"epoch": 0.008639986176022118,
"grad_norm": 21.942001342773438,
"learning_rate": 1.7263723780419515e-06,
"loss": 6.6264,
"step": 5400
},
{
"epoch": 0.008799985920022528,
"grad_norm": 32.572959899902344,
"learning_rate": 1.7583718660501433e-06,
"loss": 6.579,
"step": 5500
},
{
"epoch": 0.008959985664022938,
"grad_norm": 20.728240966796875,
"learning_rate": 1.7903713540583353e-06,
"loss": 6.6001,
"step": 5600
},
{
"epoch": 0.009119985408023347,
"grad_norm": 24.334205627441406,
"learning_rate": 1.822370842066527e-06,
"loss": 6.5971,
"step": 5700
},
{
"epoch": 0.009279985152023757,
"grad_norm": 27.025753021240234,
"learning_rate": 1.854370330074719e-06,
"loss": 6.4694,
"step": 5800
},
{
"epoch": 0.009439984896024167,
"grad_norm": 23.506013870239258,
"learning_rate": 1.8863698180829106e-06,
"loss": 6.3983,
"step": 5900
},
{
"epoch": 0.009599984640024576,
"grad_norm": 35.65713882446289,
"learning_rate": 1.9183693060911026e-06,
"loss": 6.4477,
"step": 6000
},
{
"epoch": 0.009759984384024985,
"grad_norm": 22.977373123168945,
"learning_rate": 1.950368794099295e-06,
"loss": 6.4308,
"step": 6100
},
{
"epoch": 0.009919984128025396,
"grad_norm": 22.127635955810547,
"learning_rate": 1.982368282107486e-06,
"loss": 6.4248,
"step": 6200
},
{
"epoch": 0.010079983872025805,
"grad_norm": 33.53960418701172,
"learning_rate": 2.0143677701156784e-06,
"loss": 6.2642,
"step": 6300
},
{
"epoch": 0.010239983616026214,
"grad_norm": 24.39597511291504,
"learning_rate": 2.04636725812387e-06,
"loss": 6.2763,
"step": 6400
},
{
"epoch": 0.010399983360026625,
"grad_norm": 24.471288681030273,
"learning_rate": 2.078366746132062e-06,
"loss": 6.3878,
"step": 6500
},
{
"epoch": 0.010559983104027034,
"grad_norm": 34.05498123168945,
"learning_rate": 2.110366234140254e-06,
"loss": 6.2601,
"step": 6600
},
{
"epoch": 0.010719982848027443,
"grad_norm": 30.60455322265625,
"learning_rate": 2.142365722148446e-06,
"loss": 6.1789,
"step": 6700
},
{
"epoch": 0.010879982592027852,
"grad_norm": 27.737686157226562,
"learning_rate": 2.1743652101566377e-06,
"loss": 6.1773,
"step": 6800
},
{
"epoch": 0.011039982336028263,
"grad_norm": 24.246810913085938,
"learning_rate": 2.2063646981648294e-06,
"loss": 6.1439,
"step": 6900
},
{
"epoch": 0.011199982080028672,
"grad_norm": 27.53533363342285,
"learning_rate": 2.2383641861730217e-06,
"loss": 6.1863,
"step": 7000
},
{
"epoch": 0.011359981824029081,
"grad_norm": 27.81687355041504,
"learning_rate": 2.2703636741812134e-06,
"loss": 6.0513,
"step": 7100
},
{
"epoch": 0.011519981568029492,
"grad_norm": 28.00519371032715,
"learning_rate": 2.3020431673093234e-06,
"loss": 6.0671,
"step": 7200
},
{
"epoch": 0.011679981312029901,
"grad_norm": 29.347061157226562,
"learning_rate": 2.3340426553175152e-06,
"loss": 6.0212,
"step": 7300
},
{
"epoch": 0.01183998105603031,
"grad_norm": 29.621200561523438,
"learning_rate": 2.365722148445625e-06,
"loss": 6.0043,
"step": 7400
},
{
"epoch": 0.011999980800030719,
"grad_norm": 31.689117431640625,
"learning_rate": 2.397721636453817e-06,
"loss": 6.0166,
"step": 7500
},
{
"epoch": 0.01215998054403113,
"grad_norm": 46.79508972167969,
"learning_rate": 2.429721124462009e-06,
"loss": 5.9754,
"step": 7600
},
{
"epoch": 0.012319980288031539,
"grad_norm": 28.857833862304688,
"learning_rate": 2.4617206124702006e-06,
"loss": 5.9211,
"step": 7700
},
{
"epoch": 0.012479980032031948,
"grad_norm": 58.34132766723633,
"learning_rate": 2.4937201004783928e-06,
"loss": 5.7867,
"step": 7800
},
{
"epoch": 0.012639979776032359,
"grad_norm": 49.33425521850586,
"learning_rate": 2.525719588486584e-06,
"loss": 5.8534,
"step": 7900
},
{
"epoch": 0.012799979520032768,
"grad_norm": 39.17392349243164,
"learning_rate": 2.5577190764947763e-06,
"loss": 5.7708,
"step": 8000
},
{
"epoch": 0.012959979264033177,
"grad_norm": 45.94136428833008,
"learning_rate": 2.589718564502968e-06,
"loss": 5.8328,
"step": 8100
},
{
"epoch": 0.013119979008033586,
"grad_norm": 36.19196319580078,
"learning_rate": 2.6217180525111603e-06,
"loss": 5.7417,
"step": 8200
},
{
"epoch": 0.013279978752033997,
"grad_norm": 37.051658630371094,
"learning_rate": 2.653717540519352e-06,
"loss": 5.8097,
"step": 8300
},
{
"epoch": 0.013439978496034406,
"grad_norm": 90.0757064819336,
"learning_rate": 2.6857170285275435e-06,
"loss": 5.7578,
"step": 8400
},
{
"epoch": 0.013599978240034815,
"grad_norm": 92.7857894897461,
"learning_rate": 2.7177165165357357e-06,
"loss": 5.643,
"step": 8500
},
{
"epoch": 0.013759977984035226,
"grad_norm": 26.648149490356445,
"learning_rate": 2.7497160045439274e-06,
"loss": 5.6401,
"step": 8600
},
{
"epoch": 0.013919977728035635,
"grad_norm": 45.42919158935547,
"learning_rate": 2.7817154925521196e-06,
"loss": 5.6627,
"step": 8700
},
{
"epoch": 0.014079977472036044,
"grad_norm": 48.3182487487793,
"learning_rate": 2.8137149805603114e-06,
"loss": 5.6167,
"step": 8800
},
{
"epoch": 0.014239977216036454,
"grad_norm": 51.463653564453125,
"learning_rate": 2.8457144685685028e-06,
"loss": 5.6539,
"step": 8900
},
{
"epoch": 0.014399976960036864,
"grad_norm": 47.81680679321289,
"learning_rate": 2.877713956576695e-06,
"loss": 5.4513,
"step": 9000
},
{
"epoch": 0.014559976704037273,
"grad_norm": 42.410667419433594,
"learning_rate": 2.9097134445848868e-06,
"loss": 5.4132,
"step": 9100
},
{
"epoch": 0.014719976448037683,
"grad_norm": 55.33562088012695,
"learning_rate": 2.941712932593079e-06,
"loss": 5.4714,
"step": 9200
},
{
"epoch": 0.014879976192038093,
"grad_norm": 38.538246154785156,
"learning_rate": 2.9737124206012707e-06,
"loss": 5.4786,
"step": 9300
},
{
"epoch": 0.015039975936038502,
"grad_norm": 43.42023468017578,
"learning_rate": 3.0057119086094625e-06,
"loss": 5.3928,
"step": 9400
},
{
"epoch": 0.015199975680038912,
"grad_norm": 24.861467361450195,
"learning_rate": 3.037391401737572e-06,
"loss": 5.4774,
"step": 9500
},
{
"epoch": 0.01535997542403932,
"grad_norm": 98.92141723632812,
"learning_rate": 3.0693908897457643e-06,
"loss": 5.2881,
"step": 9600
},
{
"epoch": 0.015519975168039732,
"grad_norm": 62.839866638183594,
"learning_rate": 3.101390377753956e-06,
"loss": 5.3699,
"step": 9700
},
{
"epoch": 0.01567997491204014,
"grad_norm": 46.006065368652344,
"learning_rate": 3.133069870882066e-06,
"loss": 5.1483,
"step": 9800
},
{
"epoch": 0.01583997465604055,
"grad_norm": 89.62445068359375,
"learning_rate": 3.1650693588902583e-06,
"loss": 5.3051,
"step": 9900
},
{
"epoch": 0.01599997440004096,
"grad_norm": 41.113609313964844,
"learning_rate": 3.19706884689845e-06,
"loss": 5.2546,
"step": 10000
},
{
"epoch": 0.01615997414404137,
"grad_norm": 46.37376403808594,
"learning_rate": 3.2290683349066414e-06,
"loss": 5.2314,
"step": 10100
},
{
"epoch": 0.01631997388804178,
"grad_norm": 60.3846321105957,
"learning_rate": 3.2610678229148337e-06,
"loss": 5.1783,
"step": 10200
},
{
"epoch": 0.016479973632042188,
"grad_norm": 145.4359130859375,
"learning_rate": 3.2930673109230254e-06,
"loss": 5.2074,
"step": 10300
},
{
"epoch": 0.0166399733760426,
"grad_norm": 69.00183868408203,
"learning_rate": 3.325066798931217e-06,
"loss": 5.2825,
"step": 10400
},
{
"epoch": 0.01679997312004301,
"grad_norm": 48.03580093383789,
"learning_rate": 3.3570662869394094e-06,
"loss": 5.1715,
"step": 10500
},
{
"epoch": 0.016959972864043417,
"grad_norm": 58.56736755371094,
"learning_rate": 3.389065774947601e-06,
"loss": 5.087,
"step": 10600
},
{
"epoch": 0.017119972608043828,
"grad_norm": 54.484527587890625,
"learning_rate": 3.421065262955793e-06,
"loss": 5.082,
"step": 10700
},
{
"epoch": 0.017279972352044235,
"grad_norm": 74.30866241455078,
"learning_rate": 3.4530647509639847e-06,
"loss": 4.9111,
"step": 10800
},
{
"epoch": 0.017439972096044646,
"grad_norm": 60.489505767822266,
"learning_rate": 3.4850642389721765e-06,
"loss": 5.0213,
"step": 10900
},
{
"epoch": 0.017599971840045057,
"grad_norm": 61.25093460083008,
"learning_rate": 3.5170637269803687e-06,
"loss": 4.9898,
"step": 11000
},
{
"epoch": 0.017759971584045464,
"grad_norm": 51.98568344116211,
"learning_rate": 3.5490632149885605e-06,
"loss": 4.7734,
"step": 11100
},
{
"epoch": 0.017919971328045875,
"grad_norm": 64.08167266845703,
"learning_rate": 3.581062702996752e-06,
"loss": 4.9511,
"step": 11200
},
{
"epoch": 0.018079971072046286,
"grad_norm": 61.8354606628418,
"learning_rate": 3.613062191004944e-06,
"loss": 5.0481,
"step": 11300
},
{
"epoch": 0.018239970816046693,
"grad_norm": 97.53675842285156,
"learning_rate": 3.645061679013136e-06,
"loss": 4.8441,
"step": 11400
},
{
"epoch": 0.018399970560047104,
"grad_norm": 49.35017013549805,
"learning_rate": 3.677061167021328e-06,
"loss": 4.873,
"step": 11500
},
{
"epoch": 0.018559970304047515,
"grad_norm": 44.33409118652344,
"learning_rate": 3.70906065502952e-06,
"loss": 4.9988,
"step": 11600
},
{
"epoch": 0.018719970048047922,
"grad_norm": 140.5505828857422,
"learning_rate": 3.741060143037712e-06,
"loss": 4.7653,
"step": 11700
},
{
"epoch": 0.018879969792048333,
"grad_norm": 68.21163177490234,
"learning_rate": 3.7730596310459034e-06,
"loss": 4.804,
"step": 11800
},
{
"epoch": 0.019039969536048744,
"grad_norm": 48.678226470947266,
"learning_rate": 3.805059119054095e-06,
"loss": 4.8288,
"step": 11900
},
{
"epoch": 0.01919996928004915,
"grad_norm": 76.32611083984375,
"learning_rate": 3.837058607062287e-06,
"loss": 4.7053,
"step": 12000
},
{
"epoch": 0.019359969024049562,
"grad_norm": 70.85586547851562,
"learning_rate": 3.869058095070479e-06,
"loss": 4.6887,
"step": 12100
},
{
"epoch": 0.01951996876804997,
"grad_norm": 66.46036529541016,
"learning_rate": 3.901057583078671e-06,
"loss": 4.7832,
"step": 12200
},
{
"epoch": 0.01967996851205038,
"grad_norm": 165.13221740722656,
"learning_rate": 3.9330570710868636e-06,
"loss": 4.6817,
"step": 12300
},
{
"epoch": 0.01983996825605079,
"grad_norm": 118.48895263671875,
"learning_rate": 3.965056559095055e-06,
"loss": 4.6252,
"step": 12400
},
{
"epoch": 0.0199999680000512,
"grad_norm": 64.3436050415039,
"learning_rate": 3.997056047103246e-06,
"loss": 4.5936,
"step": 12500
},
{
"epoch": 0.02015996774405161,
"grad_norm": 42.27592468261719,
"learning_rate": 4.0290555351114385e-06,
"loss": 4.7452,
"step": 12600
},
{
"epoch": 0.02031996748805202,
"grad_norm": 60.829036712646484,
"learning_rate": 4.061055023119631e-06,
"loss": 4.5321,
"step": 12700
},
{
"epoch": 0.020479967232052428,
"grad_norm": 161.975830078125,
"learning_rate": 4.093054511127823e-06,
"loss": 4.4964,
"step": 12800
},
{
"epoch": 0.02063996697605284,
"grad_norm": 99.2963638305664,
"learning_rate": 4.125053999136014e-06,
"loss": 4.4421,
"step": 12900
},
{
"epoch": 0.02079996672005325,
"grad_norm": 68.78880310058594,
"learning_rate": 4.156733492264124e-06,
"loss": 4.3782,
"step": 13000
},
{
"epoch": 0.020959966464053657,
"grad_norm": 80.74951171875,
"learning_rate": 4.188732980272316e-06,
"loss": 4.5169,
"step": 13100
},
{
"epoch": 0.021119966208054067,
"grad_norm": 157.87254333496094,
"learning_rate": 4.220412473400426e-06,
"loss": 4.533,
"step": 13200
},
{
"epoch": 0.02127996595205448,
"grad_norm": 148.68331909179688,
"learning_rate": 4.252411961408618e-06,
"loss": 4.3725,
"step": 13300
},
{
"epoch": 0.021439965696054886,
"grad_norm": 72.9531021118164,
"learning_rate": 4.28441144941681e-06,
"loss": 4.2911,
"step": 13400
},
{
"epoch": 0.021599965440055297,
"grad_norm": 73.24847412109375,
"learning_rate": 4.316410937425001e-06,
"loss": 4.2261,
"step": 13500
},
{
"epoch": 0.021759965184055704,
"grad_norm": 94.57313537597656,
"learning_rate": 4.348410425433194e-06,
"loss": 4.2467,
"step": 13600
},
{
"epoch": 0.021919964928056115,
"grad_norm": 105.674560546875,
"learning_rate": 4.380409913441385e-06,
"loss": 4.1558,
"step": 13700
},
{
"epoch": 0.022079964672056526,
"grad_norm": 63.658287048339844,
"learning_rate": 4.412409401449577e-06,
"loss": 4.2794,
"step": 13800
},
{
"epoch": 0.022239964416056933,
"grad_norm": 77.69287109375,
"learning_rate": 4.444408889457769e-06,
"loss": 4.2383,
"step": 13900
},
{
"epoch": 0.022399964160057344,
"grad_norm": 82.83360290527344,
"learning_rate": 4.4764083774659615e-06,
"loss": 4.1654,
"step": 14000
},
{
"epoch": 0.022559963904057755,
"grad_norm": 47.373531341552734,
"learning_rate": 4.508407865474153e-06,
"loss": 4.158,
"step": 14100
},
{
"epoch": 0.022719963648058162,
"grad_norm": 97.64757537841797,
"learning_rate": 4.540407353482344e-06,
"loss": 4.1299,
"step": 14200
},
{
"epoch": 0.022879963392058573,
"grad_norm": 54.75618362426758,
"learning_rate": 4.5724068414905365e-06,
"loss": 4.1902,
"step": 14300
},
{
"epoch": 0.023039963136058984,
"grad_norm": 258.4887390136719,
"learning_rate": 4.604406329498729e-06,
"loss": 3.7853,
"step": 14400
},
{
"epoch": 0.02319996288005939,
"grad_norm": 104.63798522949219,
"learning_rate": 4.63640581750692e-06,
"loss": 4.0514,
"step": 14500
},
{
"epoch": 0.023359962624059802,
"grad_norm": 60.090843200683594,
"learning_rate": 4.668405305515112e-06,
"loss": 4.1655,
"step": 14600
},
{
"epoch": 0.023519962368060213,
"grad_norm": 44.36670684814453,
"learning_rate": 4.7004047935233036e-06,
"loss": 4.051,
"step": 14700
},
{
"epoch": 0.02367996211206062,
"grad_norm": 41.61213302612305,
"learning_rate": 4.732404281531496e-06,
"loss": 4.078,
"step": 14800
},
{
"epoch": 0.02383996185606103,
"grad_norm": 73.2448501586914,
"learning_rate": 4.764403769539688e-06,
"loss": 4.1193,
"step": 14900
},
{
"epoch": 0.023999961600061438,
"grad_norm": 77.30301666259766,
"learning_rate": 4.796403257547879e-06,
"loss": 4.1536,
"step": 15000
},
{
"epoch": 0.02415996134406185,
"grad_norm": 48.1458854675293,
"learning_rate": 4.8284027455560715e-06,
"loss": 3.935,
"step": 15100
},
{
"epoch": 0.02431996108806226,
"grad_norm": 129.59295654296875,
"learning_rate": 4.860402233564263e-06,
"loss": 3.9535,
"step": 15200
},
{
"epoch": 0.024479960832062667,
"grad_norm": 163.0813751220703,
"learning_rate": 4.892401721572455e-06,
"loss": 3.7051,
"step": 15300
},
{
"epoch": 0.024639960576063078,
"grad_norm": 102.2786865234375,
"learning_rate": 4.924401209580647e-06,
"loss": 3.8329,
"step": 15400
},
{
"epoch": 0.02479996032006349,
"grad_norm": 160.66392517089844,
"learning_rate": 4.956400697588839e-06,
"loss": 3.9412,
"step": 15500
},
{
"epoch": 0.024959960064063896,
"grad_norm": 136.77218627929688,
"learning_rate": 4.988400185597031e-06,
"loss": 3.6668,
"step": 15600
},
{
"epoch": 0.025119959808064307,
"grad_norm": 63.87991714477539,
"learning_rate": 5.0200796787251404e-06,
"loss": 3.7758,
"step": 15700
},
{
"epoch": 0.025279959552064718,
"grad_norm": 352.977294921875,
"learning_rate": 5.052079166733333e-06,
"loss": 3.8805,
"step": 15800
},
{
"epoch": 0.025439959296065125,
"grad_norm": 148.54776000976562,
"learning_rate": 5.084078654741524e-06,
"loss": 3.8848,
"step": 15900
},
{
"epoch": 0.025599959040065536,
"grad_norm": 105.01113891601562,
"learning_rate": 5.116078142749716e-06,
"loss": 3.75,
"step": 16000
},
{
"epoch": 0.025759958784065947,
"grad_norm": 170.62828063964844,
"learning_rate": 5.148077630757908e-06,
"loss": 3.5685,
"step": 16100
},
{
"epoch": 0.025919958528066354,
"grad_norm": 164.85324096679688,
"learning_rate": 5.180077118766101e-06,
"loss": 3.7016,
"step": 16200
},
{
"epoch": 0.026079958272066765,
"grad_norm": 79.85810852050781,
"learning_rate": 5.212076606774292e-06,
"loss": 4.0955,
"step": 16300
},
{
"epoch": 0.026239958016067173,
"grad_norm": 109.73529815673828,
"learning_rate": 5.244076094782484e-06,
"loss": 3.7577,
"step": 16400
},
{
"epoch": 0.026399957760067583,
"grad_norm": 105.98066711425781,
"learning_rate": 5.276075582790676e-06,
"loss": 3.7485,
"step": 16500
},
{
"epoch": 0.026559957504067994,
"grad_norm": 71.02545166015625,
"learning_rate": 5.3080750707988686e-06,
"loss": 3.8263,
"step": 16600
},
{
"epoch": 0.0267199572480684,
"grad_norm": 245.44224548339844,
"learning_rate": 5.340074558807059e-06,
"loss": 3.6922,
"step": 16700
},
{
"epoch": 0.026879956992068813,
"grad_norm": 42.178157806396484,
"learning_rate": 5.372074046815251e-06,
"loss": 3.6568,
"step": 16800
},
{
"epoch": 0.027039956736069223,
"grad_norm": 114.55894470214844,
"learning_rate": 5.404073534823443e-06,
"loss": 3.7317,
"step": 16900
},
{
"epoch": 0.02719995648006963,
"grad_norm": 86.70626831054688,
"learning_rate": 5.436073022831635e-06,
"loss": 3.5089,
"step": 17000
},
{
"epoch": 0.02735995622407004,
"grad_norm": 202.02505493164062,
"learning_rate": 5.468072510839827e-06,
"loss": 3.7377,
"step": 17100
},
{
"epoch": 0.027519955968070452,
"grad_norm": 114.00701141357422,
"learning_rate": 5.500071998848019e-06,
"loss": 3.6206,
"step": 17200
},
{
"epoch": 0.02767995571207086,
"grad_norm": 152.38311767578125,
"learning_rate": 5.532071486856211e-06,
"loss": 3.3702,
"step": 17300
},
{
"epoch": 0.02783995545607127,
"grad_norm": 156.1048126220703,
"learning_rate": 5.564070974864403e-06,
"loss": 3.5126,
"step": 17400
},
{
"epoch": 0.02799995520007168,
"grad_norm": 117.87386322021484,
"learning_rate": 5.596070462872595e-06,
"loss": 3.4841,
"step": 17500
},
{
"epoch": 0.02815995494407209,
"grad_norm": 616.7991333007812,
"learning_rate": 5.628069950880786e-06,
"loss": 3.1464,
"step": 17600
},
{
"epoch": 0.0283199546880725,
"grad_norm": 131.32760620117188,
"learning_rate": 5.6600694388889786e-06,
"loss": 3.7012,
"step": 17700
},
{
"epoch": 0.028479954432072907,
"grad_norm": 60.172969818115234,
"learning_rate": 5.69206892689717e-06,
"loss": 3.5802,
"step": 17800
},
{
"epoch": 0.028639954176073318,
"grad_norm": 169.24374389648438,
"learning_rate": 5.724068414905361e-06,
"loss": 3.4952,
"step": 17900
},
{
"epoch": 0.02879995392007373,
"grad_norm": 158.77391052246094,
"learning_rate": 5.7560679029135535e-06,
"loss": 3.1174,
"step": 18000
},
{
"epoch": 0.028959953664074136,
"grad_norm": 218.98867797851562,
"learning_rate": 5.787747396041664e-06,
"loss": 3.3134,
"step": 18100
},
{
"epoch": 0.029119953408074547,
"grad_norm": 185.3249053955078,
"learning_rate": 5.819746884049856e-06,
"loss": 3.3578,
"step": 18200
},
{
"epoch": 0.029279953152074958,
"grad_norm": 93.69242858886719,
"learning_rate": 5.851746372058048e-06,
"loss": 3.0209,
"step": 18300
},
{
"epoch": 0.029439952896075365,
"grad_norm": 85.82784271240234,
"learning_rate": 5.883745860066239e-06,
"loss": 3.3796,
"step": 18400
},
{
"epoch": 0.029599952640075776,
"grad_norm": 125.96697998046875,
"learning_rate": 5.915745348074431e-06,
"loss": 3.2287,
"step": 18500
},
{
"epoch": 0.029759952384076187,
"grad_norm": 235.71075439453125,
"learning_rate": 5.947744836082623e-06,
"loss": 3.1537,
"step": 18600
},
{
"epoch": 0.029919952128076594,
"grad_norm": 139.5558319091797,
"learning_rate": 5.979744324090815e-06,
"loss": 2.9073,
"step": 18700
},
{
"epoch": 0.030079951872077005,
"grad_norm": 204.2928924560547,
"learning_rate": 6.011743812099007e-06,
"loss": 3.3444,
"step": 18800
},
{
"epoch": 0.030239951616077416,
"grad_norm": 165.4457244873047,
"learning_rate": 6.043743300107199e-06,
"loss": 3.1341,
"step": 18900
},
{
"epoch": 0.030399951360077823,
"grad_norm": 66.5983657836914,
"learning_rate": 6.07574278811539e-06,
"loss": 2.8862,
"step": 19000
},
{
"epoch": 0.030559951104078234,
"grad_norm": 219.95774841308594,
"learning_rate": 6.1077422761235826e-06,
"loss": 3.2033,
"step": 19100
},
{
"epoch": 0.03071995084807864,
"grad_norm": 125.15766906738281,
"learning_rate": 6.139741764131775e-06,
"loss": 3.2764,
"step": 19200
},
{
"epoch": 0.030879950592079052,
"grad_norm": 207.95970153808594,
"learning_rate": 6.171741252139967e-06,
"loss": 3.0725,
"step": 19300
},
{
"epoch": 0.031039950336079463,
"grad_norm": 368.32781982421875,
"learning_rate": 6.203740740148158e-06,
"loss": 3.0436,
"step": 19400
},
{
"epoch": 0.03119995008007987,
"grad_norm": 412.2764587402344,
"learning_rate": 6.23574022815635e-06,
"loss": 3.3493,
"step": 19500
},
{
"epoch": 0.03135994982408028,
"grad_norm": 155.46766662597656,
"learning_rate": 6.267739716164542e-06,
"loss": 3.0141,
"step": 19600
},
{
"epoch": 0.03151994956808069,
"grad_norm": 89.32569885253906,
"learning_rate": 6.299739204172733e-06,
"loss": 2.779,
"step": 19700
},
{
"epoch": 0.0316799493120811,
"grad_norm": 241.4378204345703,
"learning_rate": 6.3317386921809254e-06,
"loss": 3.3543,
"step": 19800
},
{
"epoch": 0.03183994905608151,
"grad_norm": 13.20569133758545,
"learning_rate": 6.363738180189118e-06,
"loss": 3.1526,
"step": 19900
},
{
"epoch": 0.03199994880008192,
"grad_norm": 270.6402893066406,
"learning_rate": 6.395737668197309e-06,
"loss": 2.7896,
"step": 20000
},
{
"epoch": 0.03215994854408233,
"grad_norm": 106.38632202148438,
"learning_rate": 6.427737156205501e-06,
"loss": 2.9398,
"step": 20100
},
{
"epoch": 0.03231994828808274,
"grad_norm": 191.7210693359375,
"learning_rate": 6.459416649333611e-06,
"loss": 3.1254,
"step": 20200
},
{
"epoch": 0.03247994803208315,
"grad_norm": 143.96151733398438,
"learning_rate": 6.491416137341803e-06,
"loss": 2.8832,
"step": 20300
},
{
"epoch": 0.03263994777608356,
"grad_norm": 150.26368713378906,
"learning_rate": 6.523415625349994e-06,
"loss": 3.0542,
"step": 20400
},
{
"epoch": 0.032799947520083965,
"grad_norm": 178.11705017089844,
"learning_rate": 6.5554151133581865e-06,
"loss": 2.9722,
"step": 20500
},
{
"epoch": 0.032959947264084376,
"grad_norm": 222.4794921875,
"learning_rate": 6.587414601366379e-06,
"loss": 2.9321,
"step": 20600
},
{
"epoch": 0.03311994700808479,
"grad_norm": 155.37796020507812,
"learning_rate": 6.619414089374571e-06,
"loss": 2.6448,
"step": 20700
},
{
"epoch": 0.0332799467520852,
"grad_norm": 155.5786590576172,
"learning_rate": 6.651413577382762e-06,
"loss": 3.4006,
"step": 20800
},
{
"epoch": 0.03343994649608561,
"grad_norm": 684.525146484375,
"learning_rate": 6.6834130653909545e-06,
"loss": 3.0022,
"step": 20900
},
{
"epoch": 0.03359994624008602,
"grad_norm": 545.5623168945312,
"learning_rate": 6.715412553399147e-06,
"loss": 2.6366,
"step": 21000
},
{
"epoch": 0.03375994598408642,
"grad_norm": 292.9093017578125,
"learning_rate": 6.747412041407339e-06,
"loss": 3.0112,
"step": 21100
},
{
"epoch": 0.033919945728086834,
"grad_norm": 2.531680107116699,
"learning_rate": 6.7794115294155294e-06,
"loss": 2.7856,
"step": 21200
},
{
"epoch": 0.034079945472087245,
"grad_norm": 216.7860565185547,
"learning_rate": 6.811411017423722e-06,
"loss": 3.0967,
"step": 21300
},
{
"epoch": 0.034239945216087656,
"grad_norm": 138.73028564453125,
"learning_rate": 6.843410505431913e-06,
"loss": 2.8754,
"step": 21400
},
{
"epoch": 0.034399944960088066,
"grad_norm": 78.2362060546875,
"learning_rate": 6.875409993440105e-06,
"loss": 3.1269,
"step": 21500
},
{
"epoch": 0.03455994470408847,
"grad_norm": 144.1228790283203,
"learning_rate": 6.907409481448297e-06,
"loss": 2.8235,
"step": 21600
},
{
"epoch": 0.03471994444808888,
"grad_norm": 275.1159973144531,
"learning_rate": 6.93940896945649e-06,
"loss": 2.4912,
"step": 21700
},
{
"epoch": 0.03487994419208929,
"grad_norm": 216.12060546875,
"learning_rate": 6.971408457464681e-06,
"loss": 2.5079,
"step": 21800
},
{
"epoch": 0.0350399439360897,
"grad_norm": 398.5049133300781,
"learning_rate": 7.003407945472873e-06,
"loss": 3.2942,
"step": 21900
},
{
"epoch": 0.035199943680090114,
"grad_norm": 116.13761901855469,
"learning_rate": 7.035407433481065e-06,
"loss": 2.4184,
"step": 22000
},
{
"epoch": 0.035359943424090524,
"grad_norm": 425.1556091308594,
"learning_rate": 7.067406921489257e-06,
"loss": 2.782,
"step": 22100
},
{
"epoch": 0.03551994316809093,
"grad_norm": 17.029335021972656,
"learning_rate": 7.099086414617366e-06,
"loss": 2.7652,
"step": 22200
},
{
"epoch": 0.03567994291209134,
"grad_norm": 307.45343017578125,
"learning_rate": 7.1310859026255585e-06,
"loss": 3.113,
"step": 22300
},
{
"epoch": 0.03583994265609175,
"grad_norm": 69.89311981201172,
"learning_rate": 7.163085390633751e-06,
"loss": 2.7451,
"step": 22400
},
{
"epoch": 0.03599994240009216,
"grad_norm": 28.0865535736084,
"learning_rate": 7.195084878641943e-06,
"loss": 2.7473,
"step": 22500
},
{
"epoch": 0.03615994214409257,
"grad_norm": 108.03202056884766,
"learning_rate": 7.227084366650134e-06,
"loss": 2.5116,
"step": 22600
},
{
"epoch": 0.03631994188809298,
"grad_norm": 299.888427734375,
"learning_rate": 7.2590838546583265e-06,
"loss": 2.8531,
"step": 22700
},
{
"epoch": 0.036479941632093386,
"grad_norm": 87.79664611816406,
"learning_rate": 7.291083342666519e-06,
"loss": 2.9171,
"step": 22800
},
{
"epoch": 0.0366399413760938,
"grad_norm": 388.6871337890625,
"learning_rate": 7.323082830674709e-06,
"loss": 2.7954,
"step": 22900
},
{
"epoch": 0.03679994112009421,
"grad_norm": 87.27410888671875,
"learning_rate": 7.355082318682901e-06,
"loss": 2.5376,
"step": 23000
},
{
"epoch": 0.03695994086409462,
"grad_norm": 159.74534606933594,
"learning_rate": 7.387081806691094e-06,
"loss": 3.2488,
"step": 23100
},
{
"epoch": 0.03711994060809503,
"grad_norm": 169.96243286132812,
"learning_rate": 7.419081294699285e-06,
"loss": 2.6131,
"step": 23200
},
{
"epoch": 0.037279940352095434,
"grad_norm": 221.1896514892578,
"learning_rate": 7.451080782707477e-06,
"loss": 3.1343,
"step": 23300
},
{
"epoch": 0.037439940096095845,
"grad_norm": 67.28482818603516,
"learning_rate": 7.482760275835588e-06,
"loss": 2.3159,
"step": 23400
},
{
"epoch": 0.037599939840096255,
"grad_norm": 341.05975341796875,
"learning_rate": 7.514759763843779e-06,
"loss": 2.4225,
"step": 23500
},
{
"epoch": 0.037759939584096666,
"grad_norm": 250.44683837890625,
"learning_rate": 7.54675925185197e-06,
"loss": 2.5034,
"step": 23600
},
{
"epoch": 0.03791993932809708,
"grad_norm": 423.6518249511719,
"learning_rate": 7.5787587398601625e-06,
"loss": 3.0067,
"step": 23700
},
{
"epoch": 0.03807993907209749,
"grad_norm": 169.45944213867188,
"learning_rate": 7.610758227868355e-06,
"loss": 2.313,
"step": 23800
},
{
"epoch": 0.03823993881609789,
"grad_norm": 80.43399047851562,
"learning_rate": 7.642757715876546e-06,
"loss": 2.5363,
"step": 23900
},
{
"epoch": 0.0383999385600983,
"grad_norm": 248.08848571777344,
"learning_rate": 7.674757203884739e-06,
"loss": 2.7929,
"step": 24000
},
{
"epoch": 0.03855993830409871,
"grad_norm": 3.7647440433502197,
"learning_rate": 7.70675669189293e-06,
"loss": 2.617,
"step": 24100
},
{
"epoch": 0.038719938048099124,
"grad_norm": 3.100020170211792,
"learning_rate": 7.738756179901122e-06,
"loss": 2.9711,
"step": 24200
},
{
"epoch": 0.038879937792099535,
"grad_norm": 69.79640197753906,
"learning_rate": 7.770755667909315e-06,
"loss": 2.7726,
"step": 24300
},
{
"epoch": 0.03903993753609994,
"grad_norm": 190.2179412841797,
"learning_rate": 7.802755155917506e-06,
"loss": 2.5849,
"step": 24400
},
{
"epoch": 0.03919993728010035,
"grad_norm": 75.47491455078125,
"learning_rate": 7.834754643925698e-06,
"loss": 2.3231,
"step": 24500
},
{
"epoch": 0.03935993702410076,
"grad_norm": 13.3529691696167,
"learning_rate": 7.866754131933889e-06,
"loss": 2.2477,
"step": 24600
},
{
"epoch": 0.03951993676810117,
"grad_norm": 280.162109375,
"learning_rate": 7.89875361994208e-06,
"loss": 2.5487,
"step": 24700
},
{
"epoch": 0.03967993651210158,
"grad_norm": 376.9624938964844,
"learning_rate": 7.930753107950273e-06,
"loss": 2.5175,
"step": 24800
},
{
"epoch": 0.03983993625610199,
"grad_norm": 341.099609375,
"learning_rate": 7.962752595958465e-06,
"loss": 2.6758,
"step": 24900
},
{
"epoch": 0.0399999360001024,
"grad_norm": 436.5195007324219,
"learning_rate": 7.994752083966658e-06,
"loss": 2.7313,
"step": 25000
},
{
"epoch": 0.04015993574410281,
"grad_norm": 274.91363525390625,
"learning_rate": 8.026751571974849e-06,
"loss": 2.4846,
"step": 25100
},
{
"epoch": 0.04031993548810322,
"grad_norm": 183.5716094970703,
"learning_rate": 8.05875105998304e-06,
"loss": 2.8697,
"step": 25200
},
{
"epoch": 0.04047993523210363,
"grad_norm": 70.23844909667969,
"learning_rate": 8.090750547991234e-06,
"loss": 2.5289,
"step": 25300
},
{
"epoch": 0.04063993497610404,
"grad_norm": 139.3669891357422,
"learning_rate": 8.122750035999425e-06,
"loss": 2.235,
"step": 25400
},
{
"epoch": 0.04079993472010445,
"grad_norm": 242.79315185546875,
"learning_rate": 8.154429529127534e-06,
"loss": 2.5028,
"step": 25500
},
{
"epoch": 0.040959934464104855,
"grad_norm": 257.0070495605469,
"learning_rate": 8.186429017135727e-06,
"loss": 2.6295,
"step": 25600
},
{
"epoch": 0.041119934208105266,
"grad_norm": 314.8670959472656,
"learning_rate": 8.218428505143918e-06,
"loss": 2.6159,
"step": 25700
},
{
"epoch": 0.04127993395210568,
"grad_norm": 284.12762451171875,
"learning_rate": 8.250427993152111e-06,
"loss": 2.4447,
"step": 25800
},
{
"epoch": 0.04143993369610609,
"grad_norm": 5.427358627319336,
"learning_rate": 8.282427481160302e-06,
"loss": 2.7233,
"step": 25900
},
{
"epoch": 0.0415999334401065,
"grad_norm": 240.23260498046875,
"learning_rate": 8.314426969168494e-06,
"loss": 2.5651,
"step": 26000
},
{
"epoch": 0.0417599331841069,
"grad_norm": 15.093184471130371,
"learning_rate": 8.346426457176687e-06,
"loss": 2.1317,
"step": 26100
},
{
"epoch": 0.04191993292810731,
"grad_norm": 14.953177452087402,
"learning_rate": 8.378425945184878e-06,
"loss": 2.6157,
"step": 26200
},
{
"epoch": 0.042079932672107724,
"grad_norm": 242.84718322753906,
"learning_rate": 8.410105438312987e-06,
"loss": 2.7385,
"step": 26300
},
{
"epoch": 0.042239932416108135,
"grad_norm": 1.3409643173217773,
"learning_rate": 8.44210492632118e-06,
"loss": 2.4642,
"step": 26400
},
{
"epoch": 0.042399932160108546,
"grad_norm": 90.02801513671875,
"learning_rate": 8.474104414329371e-06,
"loss": 2.0621,
"step": 26500
},
{
"epoch": 0.04255993190410896,
"grad_norm": 11.879080772399902,
"learning_rate": 8.506103902337564e-06,
"loss": 2.3864,
"step": 26600
},
{
"epoch": 0.04271993164810936,
"grad_norm": 598.356689453125,
"learning_rate": 8.538103390345756e-06,
"loss": 2.6951,
"step": 26700
},
{
"epoch": 0.04287993139210977,
"grad_norm": 144.25924682617188,
"learning_rate": 8.570102878353947e-06,
"loss": 2.2628,
"step": 26800
},
{
"epoch": 0.04303993113611018,
"grad_norm": 521.1145629882812,
"learning_rate": 8.602102366362138e-06,
"loss": 2.7538,
"step": 26900
},
{
"epoch": 0.04319993088011059,
"grad_norm": 86.13031005859375,
"learning_rate": 8.63410185437033e-06,
"loss": 2.6871,
"step": 27000
},
{
"epoch": 0.043359930624111004,
"grad_norm": 268.4532775878906,
"learning_rate": 8.666101342378523e-06,
"loss": 2.2453,
"step": 27100
},
{
"epoch": 0.04351993036811141,
"grad_norm": 531.1592407226562,
"learning_rate": 8.698100830386714e-06,
"loss": 1.6334,
"step": 27200
},
{
"epoch": 0.04367993011211182,
"grad_norm": 166.83230590820312,
"learning_rate": 8.730100318394906e-06,
"loss": 2.666,
"step": 27300
},
{
"epoch": 0.04383992985611223,
"grad_norm": 208.4716033935547,
"learning_rate": 8.762099806403099e-06,
"loss": 2.128,
"step": 27400
},
{
"epoch": 0.04399992960011264,
"grad_norm": 257.9130859375,
"learning_rate": 8.79409929441129e-06,
"loss": 2.7573,
"step": 27500
},
{
"epoch": 0.04415992934411305,
"grad_norm": 85.08763885498047,
"learning_rate": 8.826098782419481e-06,
"loss": 2.5276,
"step": 27600
},
{
"epoch": 0.04431992908811346,
"grad_norm": 8.960221290588379,
"learning_rate": 8.858098270427674e-06,
"loss": 2.2438,
"step": 27700
},
{
"epoch": 0.044479928832113866,
"grad_norm": 404.66558837890625,
"learning_rate": 8.890097758435866e-06,
"loss": 2.3156,
"step": 27800
},
{
"epoch": 0.04463992857611428,
"grad_norm": 151.23495483398438,
"learning_rate": 8.922097246444059e-06,
"loss": 2.1735,
"step": 27900
},
{
"epoch": 0.04479992832011469,
"grad_norm": 151.7221221923828,
"learning_rate": 8.95409673445225e-06,
"loss": 2.1733,
"step": 28000
},
{
"epoch": 0.0449599280641151,
"grad_norm": 228.95974731445312,
"learning_rate": 8.986096222460441e-06,
"loss": 2.4094,
"step": 28100
},
{
"epoch": 0.04511992780811551,
"grad_norm": 522.6806640625,
"learning_rate": 9.018095710468633e-06,
"loss": 2.8484,
"step": 28200
},
{
"epoch": 0.04527992755211592,
"grad_norm": 16.065011978149414,
"learning_rate": 9.050095198476824e-06,
"loss": 2.4507,
"step": 28300
},
{
"epoch": 0.045439927296116324,
"grad_norm": 227.2984619140625,
"learning_rate": 9.082094686485017e-06,
"loss": 2.6822,
"step": 28400
},
{
"epoch": 0.045599927040116735,
"grad_norm": 430.3262634277344,
"learning_rate": 9.114094174493209e-06,
"loss": 2.1191,
"step": 28500
},
{
"epoch": 0.045759926784117146,
"grad_norm": 0.1830236166715622,
"learning_rate": 9.1460936625014e-06,
"loss": 2.0696,
"step": 28600
},
{
"epoch": 0.045919926528117556,
"grad_norm": 97.45941162109375,
"learning_rate": 9.178093150509593e-06,
"loss": 2.4027,
"step": 28700
},
{
"epoch": 0.04607992627211797,
"grad_norm": 22.469968795776367,
"learning_rate": 9.210092638517784e-06,
"loss": 1.7958,
"step": 28800
},
{
"epoch": 0.04623992601611837,
"grad_norm": 103.27215576171875,
"learning_rate": 9.242092126525977e-06,
"loss": 2.5874,
"step": 28900
},
{
"epoch": 0.04639992576011878,
"grad_norm": 578.951171875,
"learning_rate": 9.274091614534169e-06,
"loss": 2.2679,
"step": 29000
},
{
"epoch": 0.04655992550411919,
"grad_norm": 6.261137008666992,
"learning_rate": 9.30609110254236e-06,
"loss": 2.6394,
"step": 29100
},
{
"epoch": 0.046719925248119604,
"grad_norm": 113.35989379882812,
"learning_rate": 9.338090590550551e-06,
"loss": 1.7998,
"step": 29200
},
{
"epoch": 0.046879924992120015,
"grad_norm": 116.46363830566406,
"learning_rate": 9.370090078558743e-06,
"loss": 2.6834,
"step": 29300
},
{
"epoch": 0.047039924736120425,
"grad_norm": 84.5538101196289,
"learning_rate": 9.402089566566936e-06,
"loss": 2.1242,
"step": 29400
},
{
"epoch": 0.04719992448012083,
"grad_norm": 150.44454956054688,
"learning_rate": 9.434089054575127e-06,
"loss": 2.0039,
"step": 29500
},
{
"epoch": 0.04735992422412124,
"grad_norm": 12.482616424560547,
"learning_rate": 9.466088542583319e-06,
"loss": 2.018,
"step": 29600
},
{
"epoch": 0.04751992396812165,
"grad_norm": 1.1050609350204468,
"learning_rate": 9.498088030591512e-06,
"loss": 2.9357,
"step": 29700
},
{
"epoch": 0.04767992371212206,
"grad_norm": 256.4771423339844,
"learning_rate": 9.530087518599703e-06,
"loss": 2.1914,
"step": 29800
},
{
"epoch": 0.04783992345612247,
"grad_norm": 178.9422149658203,
"learning_rate": 9.562087006607896e-06,
"loss": 2.0968,
"step": 29900
},
{
"epoch": 0.047999923200122876,
"grad_norm": 160.1494140625,
"learning_rate": 9.594086494616087e-06,
"loss": 1.9762,
"step": 30000
},
{
"epoch": 0.04815992294412329,
"grad_norm": 86.46272277832031,
"learning_rate": 9.626085982624279e-06,
"loss": 2.1436,
"step": 30100
},
{
"epoch": 0.0483199226881237,
"grad_norm": 76.13285064697266,
"learning_rate": 9.658085470632472e-06,
"loss": 2.1919,
"step": 30200
},
{
"epoch": 0.04847992243212411,
"grad_norm": 2.952242374420166,
"learning_rate": 9.690084958640661e-06,
"loss": 1.9683,
"step": 30300
},
{
"epoch": 0.04863992217612452,
"grad_norm": 33.4036979675293,
"learning_rate": 9.722084446648855e-06,
"loss": 2.3543,
"step": 30400
},
{
"epoch": 0.04879992192012493,
"grad_norm": 173.6257781982422,
"learning_rate": 9.753763939776965e-06,
"loss": 2.0642,
"step": 30500
},
{
"epoch": 0.048959921664125335,
"grad_norm": 0.08548393100500107,
"learning_rate": 9.785763427785156e-06,
"loss": 1.8447,
"step": 30600
},
{
"epoch": 0.049119921408125745,
"grad_norm": 111.82203674316406,
"learning_rate": 9.817762915793348e-06,
"loss": 2.3467,
"step": 30700
},
{
"epoch": 0.049279921152126156,
"grad_norm": 142.97500610351562,
"learning_rate": 9.84976240380154e-06,
"loss": 2.6461,
"step": 30800
},
{
"epoch": 0.04943992089612657,
"grad_norm": 417.88677978515625,
"learning_rate": 9.881761891809732e-06,
"loss": 2.028,
"step": 30900
},
{
"epoch": 0.04959992064012698,
"grad_norm": 4.543129920959473,
"learning_rate": 9.913761379817923e-06,
"loss": 1.4188,
"step": 31000
},
{
"epoch": 0.04975992038412739,
"grad_norm": 205.02293395996094,
"learning_rate": 9.945760867826115e-06,
"loss": 2.7219,
"step": 31100
},
{
"epoch": 0.04991992012812779,
"grad_norm": 123.40583038330078,
"learning_rate": 9.977760355834308e-06,
"loss": 2.2345,
"step": 31200
},
{
"epoch": 0.050079919872128204,
"grad_norm": 0.9410820603370667,
"learning_rate": 1.00097598438425e-05,
"loss": 2.201,
"step": 31300
},
{
"epoch": 0.050239919616128614,
"grad_norm": 51.27448272705078,
"learning_rate": 1.004175933185069e-05,
"loss": 2.092,
"step": 31400
},
{
"epoch": 0.050399919360129025,
"grad_norm": 258.7269592285156,
"learning_rate": 1.0073758819858884e-05,
"loss": 2.2871,
"step": 31500
},
{
"epoch": 0.050559919104129436,
"grad_norm": 108.11058044433594,
"learning_rate": 1.0105758307867075e-05,
"loss": 2.0167,
"step": 31600
},
{
"epoch": 0.05071991884812984,
"grad_norm": 229.5725555419922,
"learning_rate": 1.0137757795875266e-05,
"loss": 1.9175,
"step": 31700
},
{
"epoch": 0.05087991859213025,
"grad_norm": 204.41357421875,
"learning_rate": 1.016975728388346e-05,
"loss": 2.2229,
"step": 31800
},
{
"epoch": 0.05103991833613066,
"grad_norm": 8.951689720153809,
"learning_rate": 1.020175677189165e-05,
"loss": 2.1196,
"step": 31900
},
{
"epoch": 0.05119991808013107,
"grad_norm": 275.85198974609375,
"learning_rate": 1.0233756259899844e-05,
"loss": 2.2192,
"step": 32000
},
{
"epoch": 0.05135991782413148,
"grad_norm": 359.066650390625,
"learning_rate": 1.0265755747908035e-05,
"loss": 1.6462,
"step": 32100
},
{
"epoch": 0.051519917568131894,
"grad_norm": 0.10183493793010712,
"learning_rate": 1.0297755235916226e-05,
"loss": 2.099,
"step": 32200
},
{
"epoch": 0.0516799173121323,
"grad_norm": 43.3016357421875,
"learning_rate": 1.0329434729044337e-05,
"loss": 2.0914,
"step": 32300
},
{
"epoch": 0.05183991705613271,
"grad_norm": 97.42915344238281,
"learning_rate": 1.0361434217052528e-05,
"loss": 2.3295,
"step": 32400
},
{
"epoch": 0.05199991680013312,
"grad_norm": 1.9172292947769165,
"learning_rate": 1.039343370506072e-05,
"loss": 2.256,
"step": 32500
},
{
"epoch": 0.05215991654413353,
"grad_norm": 157.83743286132812,
"learning_rate": 1.0425433193068913e-05,
"loss": 1.7662,
"step": 32600
},
{
"epoch": 0.05231991628813394,
"grad_norm": 1.3025041818618774,
"learning_rate": 1.0457432681077104e-05,
"loss": 1.7234,
"step": 32700
},
{
"epoch": 0.052479916032134345,
"grad_norm": 234.8426971435547,
"learning_rate": 1.0489432169085297e-05,
"loss": 1.984,
"step": 32800
},
{
"epoch": 0.052639915776134756,
"grad_norm": 9.249500274658203,
"learning_rate": 1.0521431657093488e-05,
"loss": 2.1815,
"step": 32900
},
{
"epoch": 0.05279991552013517,
"grad_norm": 164.6519012451172,
"learning_rate": 1.055343114510168e-05,
"loss": 1.4987,
"step": 33000
},
{
"epoch": 0.05295991526413558,
"grad_norm": 145.9049072265625,
"learning_rate": 1.0585430633109873e-05,
"loss": 2.0034,
"step": 33100
},
{
"epoch": 0.05311991500813599,
"grad_norm": 79.73159790039062,
"learning_rate": 1.0617430121118062e-05,
"loss": 2.6008,
"step": 33200
},
{
"epoch": 0.0532799147521364,
"grad_norm": 131.95318603515625,
"learning_rate": 1.0649429609126254e-05,
"loss": 2.4585,
"step": 33300
},
{
"epoch": 0.0534399144961368,
"grad_norm": 44.75098419189453,
"learning_rate": 1.0681429097134445e-05,
"loss": 1.881,
"step": 33400
},
{
"epoch": 0.053599914240137214,
"grad_norm": 0.9141740202903748,
"learning_rate": 1.0713428585142638e-05,
"loss": 1.8738,
"step": 33500
},
{
"epoch": 0.053759913984137625,
"grad_norm": 248.49734497070312,
"learning_rate": 1.074542807315083e-05,
"loss": 1.9726,
"step": 33600
},
{
"epoch": 0.053919913728138036,
"grad_norm": 167.88706970214844,
"learning_rate": 1.0777427561159021e-05,
"loss": 2.3734,
"step": 33700
},
{
"epoch": 0.05407991347213845,
"grad_norm": 0.43971773982048035,
"learning_rate": 1.0809427049167214e-05,
"loss": 1.6898,
"step": 33800
},
{
"epoch": 0.05423991321613886,
"grad_norm": 713.7942504882812,
"learning_rate": 1.0841426537175405e-05,
"loss": 2.2171,
"step": 33900
},
{
"epoch": 0.05439991296013926,
"grad_norm": 57.55624771118164,
"learning_rate": 1.0873426025183598e-05,
"loss": 1.4453,
"step": 34000
},
{
"epoch": 0.05455991270413967,
"grad_norm": 409.5030822753906,
"learning_rate": 1.090542551319179e-05,
"loss": 1.5057,
"step": 34100
},
{
"epoch": 0.05471991244814008,
"grad_norm": 60.115047454833984,
"learning_rate": 1.0937425001199981e-05,
"loss": 2.1497,
"step": 34200
},
{
"epoch": 0.054879912192140494,
"grad_norm": 0.7692262530326843,
"learning_rate": 1.0969424489208174e-05,
"loss": 1.8618,
"step": 34300
},
{
"epoch": 0.055039911936140905,
"grad_norm": 698.8638916015625,
"learning_rate": 1.1001423977216366e-05,
"loss": 1.7878,
"step": 34400
},
{
"epoch": 0.05519991168014131,
"grad_norm": 0.5103877186775208,
"learning_rate": 1.1033423465224557e-05,
"loss": 1.8199,
"step": 34500
},
{
"epoch": 0.05535991142414172,
"grad_norm": 347.1667175292969,
"learning_rate": 1.106542295323275e-05,
"loss": 2.1649,
"step": 34600
},
{
"epoch": 0.05551991116814213,
"grad_norm": 99.95459747314453,
"learning_rate": 1.1097422441240941e-05,
"loss": 1.7906,
"step": 34700
},
{
"epoch": 0.05567991091214254,
"grad_norm": 211.90087890625,
"learning_rate": 1.1129421929249133e-05,
"loss": 1.6816,
"step": 34800
},
{
"epoch": 0.05583991065614295,
"grad_norm": 60.790199279785156,
"learning_rate": 1.1161421417257326e-05,
"loss": 2.1464,
"step": 34900
},
{
"epoch": 0.05599991040014336,
"grad_norm": 585.09716796875,
"learning_rate": 1.1193420905265517e-05,
"loss": 2.0039,
"step": 35000
},
{
"epoch": 0.05615991014414377,
"grad_norm": 0.2061644047498703,
"learning_rate": 1.122542039327371e-05,
"loss": 1.735,
"step": 35100
},
{
"epoch": 0.05631990988814418,
"grad_norm": 204.5592498779297,
"learning_rate": 1.1257419881281901e-05,
"loss": 1.853,
"step": 35200
},
{
"epoch": 0.05647990963214459,
"grad_norm": 695.4961547851562,
"learning_rate": 1.1289419369290093e-05,
"loss": 1.6068,
"step": 35300
},
{
"epoch": 0.056639909376145,
"grad_norm": 220.02767944335938,
"learning_rate": 1.1321418857298282e-05,
"loss": 1.6349,
"step": 35400
},
{
"epoch": 0.05679990912014541,
"grad_norm": 0.07823936641216278,
"learning_rate": 1.1353418345306476e-05,
"loss": 1.9571,
"step": 35500
},
{
"epoch": 0.056959908864145814,
"grad_norm": 31.91838836669922,
"learning_rate": 1.1385417833314667e-05,
"loss": 1.5854,
"step": 35600
},
{
"epoch": 0.057119908608146225,
"grad_norm": 1040.179931640625,
"learning_rate": 1.1417417321322858e-05,
"loss": 1.9756,
"step": 35700
},
{
"epoch": 0.057279908352146636,
"grad_norm": 16.008800506591797,
"learning_rate": 1.1449416809331051e-05,
"loss": 1.9816,
"step": 35800
},
{
"epoch": 0.057439908096147047,
"grad_norm": 226.522705078125,
"learning_rate": 1.1481416297339243e-05,
"loss": 1.6758,
"step": 35900
},
{
"epoch": 0.05759990784014746,
"grad_norm": 85.04449462890625,
"learning_rate": 1.1513415785347436e-05,
"loss": 2.2583,
"step": 36000
},
{
"epoch": 0.05775990758414787,
"grad_norm": 3.989626884460449,
"learning_rate": 1.1545415273355627e-05,
"loss": 1.7584,
"step": 36100
},
{
"epoch": 0.05791990732814827,
"grad_norm": 63.272911071777344,
"learning_rate": 1.1577414761363818e-05,
"loss": 1.9894,
"step": 36200
},
{
"epoch": 0.05807990707214868,
"grad_norm": 175.4257049560547,
"learning_rate": 1.1609414249372011e-05,
"loss": 2.3922,
"step": 36300
},
{
"epoch": 0.058239906816149094,
"grad_norm": 160.36253356933594,
"learning_rate": 1.164109374250012e-05,
"loss": 2.0077,
"step": 36400
},
{
"epoch": 0.058399906560149505,
"grad_norm": 95.23787689208984,
"learning_rate": 1.1673093230508312e-05,
"loss": 2.3684,
"step": 36500
},
{
"epoch": 0.058559906304149915,
"grad_norm": 173.092041015625,
"learning_rate": 1.1705092718516505e-05,
"loss": 2.1103,
"step": 36600
},
{
"epoch": 0.058719906048150326,
"grad_norm": 719.3712768554688,
"learning_rate": 1.1736772211644613e-05,
"loss": 2.0728,
"step": 36700
},
{
"epoch": 0.05887990579215073,
"grad_norm": 1.7120122909545898,
"learning_rate": 1.1768771699652806e-05,
"loss": 1.9364,
"step": 36800
},
{
"epoch": 0.05903990553615114,
"grad_norm": 120.16387176513672,
"learning_rate": 1.1800771187660998e-05,
"loss": 2.5203,
"step": 36900
},
{
"epoch": 0.05919990528015155,
"grad_norm": 46.504329681396484,
"learning_rate": 1.1832770675669189e-05,
"loss": 1.8473,
"step": 37000
},
{
"epoch": 0.05935990502415196,
"grad_norm": 255.33987426757812,
"learning_rate": 1.1864770163677382e-05,
"loss": 1.8076,
"step": 37100
},
{
"epoch": 0.059519904768152374,
"grad_norm": 130.05715942382812,
"learning_rate": 1.1896769651685574e-05,
"loss": 2.0157,
"step": 37200
},
{
"epoch": 0.05967990451215278,
"grad_norm": 201.22866821289062,
"learning_rate": 1.1928769139693765e-05,
"loss": 2.1587,
"step": 37300
},
{
"epoch": 0.05983990425615319,
"grad_norm": 0.2600236237049103,
"learning_rate": 1.1960768627701958e-05,
"loss": 1.9825,
"step": 37400
},
{
"epoch": 0.0599999040001536,
"grad_norm": 0.20701654255390167,
"learning_rate": 1.199276811571015e-05,
"loss": 2.0693,
"step": 37500
},
{
"epoch": 0.06015990374415401,
"grad_norm": 247.0039520263672,
"learning_rate": 1.202476760371834e-05,
"loss": 1.5505,
"step": 37600
},
{
"epoch": 0.06031990348815442,
"grad_norm": 15.698258399963379,
"learning_rate": 1.2056767091726534e-05,
"loss": 1.5472,
"step": 37700
},
{
"epoch": 0.06047990323215483,
"grad_norm": 357.7092590332031,
"learning_rate": 1.2088766579734725e-05,
"loss": 2.0568,
"step": 37800
},
{
"epoch": 0.060639902976155236,
"grad_norm": 54.52446365356445,
"learning_rate": 1.2120766067742918e-05,
"loss": 1.9219,
"step": 37900
},
{
"epoch": 0.060799902720155646,
"grad_norm": 240.81784057617188,
"learning_rate": 1.215276555575111e-05,
"loss": 2.091,
"step": 38000
},
{
"epoch": 0.06095990246415606,
"grad_norm": 0.14063161611557007,
"learning_rate": 1.21847650437593e-05,
"loss": 2.0523,
"step": 38100
},
{
"epoch": 0.06111990220815647,
"grad_norm": 101.88555145263672,
"learning_rate": 1.2216764531767494e-05,
"loss": 1.7628,
"step": 38200
},
{
"epoch": 0.06127990195215688,
"grad_norm": 1.7761729955673218,
"learning_rate": 1.2248764019775685e-05,
"loss": 1.8753,
"step": 38300
},
{
"epoch": 0.06143990169615728,
"grad_norm": 183.46917724609375,
"learning_rate": 1.2280763507783877e-05,
"loss": 1.846,
"step": 38400
},
{
"epoch": 0.061599901440157694,
"grad_norm": 0.008245576173067093,
"learning_rate": 1.231276299579207e-05,
"loss": 1.803,
"step": 38500
},
{
"epoch": 0.061759901184158104,
"grad_norm": 389.3524169921875,
"learning_rate": 1.2344762483800261e-05,
"loss": 2.1226,
"step": 38600
},
{
"epoch": 0.061919900928158515,
"grad_norm": 457.38519287109375,
"learning_rate": 1.2376761971808452e-05,
"loss": 2.0906,
"step": 38700
},
{
"epoch": 0.062079900672158926,
"grad_norm": 95.94575500488281,
"learning_rate": 1.2408441464936563e-05,
"loss": 1.4321,
"step": 38800
},
{
"epoch": 0.06223990041615934,
"grad_norm": 0.09420862793922424,
"learning_rate": 1.2440440952944754e-05,
"loss": 2.5214,
"step": 38900
},
{
"epoch": 0.06239990016015974,
"grad_norm": 7.472883224487305,
"learning_rate": 1.2472440440952947e-05,
"loss": 1.5412,
"step": 39000
},
{
"epoch": 0.06255989990416015,
"grad_norm": 198.42828369140625,
"learning_rate": 1.2504439928961139e-05,
"loss": 1.4382,
"step": 39100
},
{
"epoch": 0.06271989964816056,
"grad_norm": 1.2646727561950684,
"learning_rate": 1.253643941696933e-05,
"loss": 1.8417,
"step": 39200
},
{
"epoch": 0.06287989939216097,
"grad_norm": 85.20125579833984,
"learning_rate": 1.2568438904977523e-05,
"loss": 2.1105,
"step": 39300
},
{
"epoch": 0.06303989913616138,
"grad_norm": 6.063973903656006,
"learning_rate": 1.2600438392985714e-05,
"loss": 1.6347,
"step": 39400
},
{
"epoch": 0.0631998988801618,
"grad_norm": 1.7712761163711548,
"learning_rate": 1.2632437880993904e-05,
"loss": 2.0372,
"step": 39500
},
{
"epoch": 0.0633598986241622,
"grad_norm": 105.22515106201172,
"learning_rate": 1.2664437369002095e-05,
"loss": 1.6222,
"step": 39600
},
{
"epoch": 0.06351989836816262,
"grad_norm": 152.34910583496094,
"learning_rate": 1.2696436857010288e-05,
"loss": 1.8033,
"step": 39700
},
{
"epoch": 0.06367989811216301,
"grad_norm": 0.4972204864025116,
"learning_rate": 1.272843634501848e-05,
"loss": 1.9847,
"step": 39800
},
{
"epoch": 0.06383989785616342,
"grad_norm": 145.8481903076172,
"learning_rate": 1.2760435833026673e-05,
"loss": 2.1354,
"step": 39900
},
{
"epoch": 0.06399989760016384,
"grad_norm": 0.4929490089416504,
"learning_rate": 1.2792435321034864e-05,
"loss": 1.6792,
"step": 40000
},
{
"epoch": 0.06415989734416425,
"grad_norm": 0.004757192451506853,
"learning_rate": 1.2824434809043055e-05,
"loss": 2.1055,
"step": 40100
},
{
"epoch": 0.06431989708816466,
"grad_norm": 133.86878967285156,
"learning_rate": 1.2856434297051249e-05,
"loss": 2.0657,
"step": 40200
},
{
"epoch": 0.06447989683216507,
"grad_norm": 75.14216613769531,
"learning_rate": 1.288843378505944e-05,
"loss": 1.9618,
"step": 40300
},
{
"epoch": 0.06463989657616548,
"grad_norm": 0.47655782103538513,
"learning_rate": 1.2920433273067631e-05,
"loss": 1.5807,
"step": 40400
},
{
"epoch": 0.06479989632016589,
"grad_norm": 0.25797244906425476,
"learning_rate": 1.2952432761075824e-05,
"loss": 1.6451,
"step": 40500
},
{
"epoch": 0.0649598960641663,
"grad_norm": 0.013840774074196815,
"learning_rate": 1.2984432249084016e-05,
"loss": 2.1299,
"step": 40600
},
{
"epoch": 0.06511989580816671,
"grad_norm": 0.016265127807855606,
"learning_rate": 1.3016431737092207e-05,
"loss": 1.9912,
"step": 40700
},
{
"epoch": 0.06527989555216712,
"grad_norm": 91.05821228027344,
"learning_rate": 1.30484312251004e-05,
"loss": 1.6392,
"step": 40800
},
{
"epoch": 0.06543989529616753,
"grad_norm": 0.5753430724143982,
"learning_rate": 1.3080430713108591e-05,
"loss": 1.8049,
"step": 40900
},
{
"epoch": 0.06559989504016793,
"grad_norm": 1.7056798934936523,
"learning_rate": 1.3112430201116784e-05,
"loss": 1.9832,
"step": 41000
},
{
"epoch": 0.06575989478416834,
"grad_norm": 115.96708679199219,
"learning_rate": 1.3144109694244893e-05,
"loss": 2.0309,
"step": 41100
},
{
"epoch": 0.06591989452816875,
"grad_norm": 128.86553955078125,
"learning_rate": 1.3176109182253085e-05,
"loss": 1.8362,
"step": 41200
},
{
"epoch": 0.06607989427216916,
"grad_norm": 8.644057273864746,
"learning_rate": 1.3208108670261278e-05,
"loss": 2.2709,
"step": 41300
},
{
"epoch": 0.06623989401616957,
"grad_norm": 105.3166732788086,
"learning_rate": 1.3240108158269469e-05,
"loss": 2.0785,
"step": 41400
},
{
"epoch": 0.06639989376016998,
"grad_norm": 66.77593231201172,
"learning_rate": 1.327210764627766e-05,
"loss": 1.5627,
"step": 41500
},
{
"epoch": 0.0665598935041704,
"grad_norm": 0.6800107359886169,
"learning_rate": 1.3304107134285853e-05,
"loss": 1.6058,
"step": 41600
},
{
"epoch": 0.0667198932481708,
"grad_norm": 138.5995330810547,
"learning_rate": 1.3336106622294045e-05,
"loss": 1.7099,
"step": 41700
},
{
"epoch": 0.06687989299217122,
"grad_norm": 0.2328547090291977,
"learning_rate": 1.3368106110302238e-05,
"loss": 1.7096,
"step": 41800
},
{
"epoch": 0.06703989273617163,
"grad_norm": 82.12950897216797,
"learning_rate": 1.3400105598310429e-05,
"loss": 1.6429,
"step": 41900
},
{
"epoch": 0.06719989248017204,
"grad_norm": 1.3431618213653564,
"learning_rate": 1.343210508631862e-05,
"loss": 1.2514,
"step": 42000
},
{
"epoch": 0.06735989222417244,
"grad_norm": 30.410139083862305,
"learning_rate": 1.3464104574326814e-05,
"loss": 1.5746,
"step": 42100
},
{
"epoch": 0.06751989196817285,
"grad_norm": 106.41495513916016,
"learning_rate": 1.3496104062335005e-05,
"loss": 1.7186,
"step": 42200
},
{
"epoch": 0.06767989171217326,
"grad_norm": 200.46978759765625,
"learning_rate": 1.3528103550343195e-05,
"loss": 1.8152,
"step": 42300
},
{
"epoch": 0.06783989145617367,
"grad_norm": 0.09822285175323486,
"learning_rate": 1.3560103038351386e-05,
"loss": 1.705,
"step": 42400
},
{
"epoch": 0.06799989120017408,
"grad_norm": 438.903564453125,
"learning_rate": 1.3592102526359579e-05,
"loss": 1.6779,
"step": 42500
},
{
"epoch": 0.06815989094417449,
"grad_norm": 0.03262553736567497,
"learning_rate": 1.362410201436777e-05,
"loss": 1.8157,
"step": 42600
},
{
"epoch": 0.0683198906881749,
"grad_norm": 376.0313720703125,
"learning_rate": 1.3656101502375962e-05,
"loss": 1.8464,
"step": 42700
},
{
"epoch": 0.06847989043217531,
"grad_norm": 29.421518325805664,
"learning_rate": 1.3688100990384155e-05,
"loss": 1.748,
"step": 42800
},
{
"epoch": 0.06863989017617572,
"grad_norm": 183.51832580566406,
"learning_rate": 1.3720100478392346e-05,
"loss": 1.6836,
"step": 42900
},
{
"epoch": 0.06879988992017613,
"grad_norm": 0.0013067092513665557,
"learning_rate": 1.3752099966400539e-05,
"loss": 1.65,
"step": 43000
},
{
"epoch": 0.06895988966417654,
"grad_norm": 0.006181403063237667,
"learning_rate": 1.378409945440873e-05,
"loss": 1.5632,
"step": 43100
},
{
"epoch": 0.06911988940817694,
"grad_norm": 0.134628027677536,
"learning_rate": 1.3816098942416922e-05,
"loss": 2.0987,
"step": 43200
},
{
"epoch": 0.06927988915217735,
"grad_norm": 235.39088439941406,
"learning_rate": 1.3848098430425115e-05,
"loss": 1.5783,
"step": 43300
},
{
"epoch": 0.06943988889617776,
"grad_norm": 89.28943634033203,
"learning_rate": 1.3880097918433306e-05,
"loss": 1.8029,
"step": 43400
},
{
"epoch": 0.06959988864017817,
"grad_norm": 197.04258728027344,
"learning_rate": 1.3911777411561415e-05,
"loss": 1.7154,
"step": 43500
},
{
"epoch": 0.06975988838417858,
"grad_norm": 96.05148315429688,
"learning_rate": 1.3943776899569608e-05,
"loss": 1.663,
"step": 43600
},
{
"epoch": 0.069919888128179,
"grad_norm": 8.378194808959961,
"learning_rate": 1.39757763875778e-05,
"loss": 1.4403,
"step": 43700
},
{
"epoch": 0.0700798878721794,
"grad_norm": 1932.4417724609375,
"learning_rate": 1.4007775875585992e-05,
"loss": 1.6513,
"step": 43800
},
{
"epoch": 0.07023988761617982,
"grad_norm": 185.06163024902344,
"learning_rate": 1.4039775363594184e-05,
"loss": 2.2041,
"step": 43900
},
{
"epoch": 0.07039988736018023,
"grad_norm": 1.7904412746429443,
"learning_rate": 1.4071774851602375e-05,
"loss": 2.3908,
"step": 44000
},
{
"epoch": 0.07055988710418064,
"grad_norm": 0.07365602254867554,
"learning_rate": 1.4103774339610568e-05,
"loss": 1.7153,
"step": 44100
},
{
"epoch": 0.07071988684818105,
"grad_norm": 117.01744842529297,
"learning_rate": 1.413577382761876e-05,
"loss": 2.2112,
"step": 44200
},
{
"epoch": 0.07087988659218145,
"grad_norm": 509.897216796875,
"learning_rate": 1.4167773315626951e-05,
"loss": 1.8663,
"step": 44300
},
{
"epoch": 0.07103988633618186,
"grad_norm": 188.78509521484375,
"learning_rate": 1.4199772803635144e-05,
"loss": 1.8206,
"step": 44400
},
{
"epoch": 0.07119988608018227,
"grad_norm": 122.20122528076172,
"learning_rate": 1.4231772291643335e-05,
"loss": 2.2269,
"step": 44500
},
{
"epoch": 0.07135988582418268,
"grad_norm": 894.0123901367188,
"learning_rate": 1.4263771779651527e-05,
"loss": 1.8159,
"step": 44600
},
{
"epoch": 0.07151988556818309,
"grad_norm": 217.02325439453125,
"learning_rate": 1.429577126765972e-05,
"loss": 1.9257,
"step": 44700
},
{
"epoch": 0.0716798853121835,
"grad_norm": 0.4048191010951996,
"learning_rate": 1.4327770755667911e-05,
"loss": 2.087,
"step": 44800
},
{
"epoch": 0.07183988505618391,
"grad_norm": 85.02055358886719,
"learning_rate": 1.4359770243676104e-05,
"loss": 1.3623,
"step": 44900
},
{
"epoch": 0.07199988480018432,
"grad_norm": 90.52297973632812,
"learning_rate": 1.4391769731684295e-05,
"loss": 1.5747,
"step": 45000
},
{
"epoch": 0.07215988454418473,
"grad_norm": 28.14681053161621,
"learning_rate": 1.4423769219692485e-05,
"loss": 1.8051,
"step": 45100
},
{
"epoch": 0.07231988428818514,
"grad_norm": 0.31391066312789917,
"learning_rate": 1.4455768707700676e-05,
"loss": 2.3691,
"step": 45200
},
{
"epoch": 0.07247988403218555,
"grad_norm": 0.08174788951873779,
"learning_rate": 1.448776819570887e-05,
"loss": 2.1125,
"step": 45300
},
{
"epoch": 0.07263988377618597,
"grad_norm": 107.4333267211914,
"learning_rate": 1.4519767683717061e-05,
"loss": 1.566,
"step": 45400
},
{
"epoch": 0.07279988352018636,
"grad_norm": 119.36717224121094,
"learning_rate": 1.4551767171725252e-05,
"loss": 1.5042,
"step": 45500
},
{
"epoch": 0.07295988326418677,
"grad_norm": 157.90757751464844,
"learning_rate": 1.4583446664853364e-05,
"loss": 1.9469,
"step": 45600
},
{
"epoch": 0.07311988300818718,
"grad_norm": 172.0279998779297,
"learning_rate": 1.4615446152861554e-05,
"loss": 1.9346,
"step": 45700
},
{
"epoch": 0.0732798827521876,
"grad_norm": 961.58935546875,
"learning_rate": 1.4647445640869747e-05,
"loss": 1.4362,
"step": 45800
},
{
"epoch": 0.073439882496188,
"grad_norm": 0.0017953349743038416,
"learning_rate": 1.4679445128877938e-05,
"loss": 1.9164,
"step": 45900
},
{
"epoch": 0.07359988224018842,
"grad_norm": 67.61031341552734,
"learning_rate": 1.471144461688613e-05,
"loss": 1.511,
"step": 46000
},
{
"epoch": 0.07375988198418883,
"grad_norm": 223.40682983398438,
"learning_rate": 1.4743444104894323e-05,
"loss": 1.4523,
"step": 46100
},
{
"epoch": 0.07391988172818924,
"grad_norm": 86.30171966552734,
"learning_rate": 1.4775443592902514e-05,
"loss": 1.1247,
"step": 46200
},
{
"epoch": 0.07407988147218965,
"grad_norm": 147.9749755859375,
"learning_rate": 1.4807443080910706e-05,
"loss": 1.9694,
"step": 46300
},
{
"epoch": 0.07423988121619006,
"grad_norm": 0.01015425007790327,
"learning_rate": 1.4839442568918899e-05,
"loss": 2.1909,
"step": 46400
},
{
"epoch": 0.07439988096019047,
"grad_norm": 259.0996398925781,
"learning_rate": 1.487144205692709e-05,
"loss": 2.0247,
"step": 46500
},
{
"epoch": 0.07455988070419087,
"grad_norm": 95.31226348876953,
"learning_rate": 1.4903441544935281e-05,
"loss": 1.2061,
"step": 46600
},
{
"epoch": 0.07471988044819128,
"grad_norm": 173.83978271484375,
"learning_rate": 1.4935441032943474e-05,
"loss": 1.6151,
"step": 46700
},
{
"epoch": 0.07487988019219169,
"grad_norm": 2.386795997619629,
"learning_rate": 1.4967440520951666e-05,
"loss": 1.6184,
"step": 46800
},
{
"epoch": 0.0750398799361921,
"grad_norm": 429.2137756347656,
"learning_rate": 1.4999440008959859e-05,
"loss": 2.0375,
"step": 46900
},
{
"epoch": 0.07519987968019251,
"grad_norm": 387.4931945800781,
"learning_rate": 1.503143949696805e-05,
"loss": 1.8357,
"step": 47000
},
{
"epoch": 0.07535987942419292,
"grad_norm": 412.69384765625,
"learning_rate": 1.5063438984976241e-05,
"loss": 1.7605,
"step": 47100
},
{
"epoch": 0.07551987916819333,
"grad_norm": 64.52519989013672,
"learning_rate": 1.5095438472984435e-05,
"loss": 2.1139,
"step": 47200
},
{
"epoch": 0.07567987891219374,
"grad_norm": 2.706088066101074,
"learning_rate": 1.5127437960992626e-05,
"loss": 1.2971,
"step": 47300
},
{
"epoch": 0.07583987865619415,
"grad_norm": 3.714489459991455,
"learning_rate": 1.5159437449000817e-05,
"loss": 1.7242,
"step": 47400
},
{
"epoch": 0.07599987840019456,
"grad_norm": 5.220536708831787,
"learning_rate": 1.519143693700901e-05,
"loss": 1.2726,
"step": 47500
},
{
"epoch": 0.07615987814419498,
"grad_norm": 133.04861450195312,
"learning_rate": 1.5223436425017202e-05,
"loss": 1.9947,
"step": 47600
},
{
"epoch": 0.07631987788819537,
"grad_norm": 0.19635449349880219,
"learning_rate": 1.5255435913025393e-05,
"loss": 2.2796,
"step": 47700
},
{
"epoch": 0.07647987763219578,
"grad_norm": 168.06861877441406,
"learning_rate": 1.5287115406153503e-05,
"loss": 1.6232,
"step": 47800
},
{
"epoch": 0.0766398773761962,
"grad_norm": 0.004633053671568632,
"learning_rate": 1.5319114894161697e-05,
"loss": 1.3513,
"step": 47900
},
{
"epoch": 0.0767998771201966,
"grad_norm": 0.0009558099554851651,
"learning_rate": 1.5351114382169886e-05,
"loss": 1.291,
"step": 48000
},
{
"epoch": 0.07695987686419702,
"grad_norm": 1.2679246664047241,
"learning_rate": 1.538311387017808e-05,
"loss": 1.5954,
"step": 48100
},
{
"epoch": 0.07711987660819743,
"grad_norm": 511.6206970214844,
"learning_rate": 1.5415113358186272e-05,
"loss": 1.6232,
"step": 48200
},
{
"epoch": 0.07727987635219784,
"grad_norm": 190.66940307617188,
"learning_rate": 1.5447112846194462e-05,
"loss": 1.8858,
"step": 48300
},
{
"epoch": 0.07743987609619825,
"grad_norm": 368.29022216796875,
"learning_rate": 1.5479112334202655e-05,
"loss": 1.6235,
"step": 48400
},
{
"epoch": 0.07759987584019866,
"grad_norm": 0.00299979280680418,
"learning_rate": 1.5511111822210848e-05,
"loss": 1.9061,
"step": 48500
},
{
"epoch": 0.07775987558419907,
"grad_norm": 96.37437438964844,
"learning_rate": 1.5543111310219038e-05,
"loss": 1.5919,
"step": 48600
},
{
"epoch": 0.07791987532819948,
"grad_norm": 141.4491729736328,
"learning_rate": 1.5575110798227227e-05,
"loss": 1.8474,
"step": 48700
},
{
"epoch": 0.07807987507219988,
"grad_norm": 9.810319900512695,
"learning_rate": 1.560711028623542e-05,
"loss": 1.7112,
"step": 48800
},
{
"epoch": 0.07823987481620029,
"grad_norm": 0.20426060259342194,
"learning_rate": 1.5639109774243613e-05,
"loss": 1.8007,
"step": 48900
},
{
"epoch": 0.0783998745602007,
"grad_norm": 2.3212544918060303,
"learning_rate": 1.5671109262251803e-05,
"loss": 1.7499,
"step": 49000
},
{
"epoch": 0.07855987430420111,
"grad_norm": 2386.313232421875,
"learning_rate": 1.5703108750259996e-05,
"loss": 1.4046,
"step": 49100
},
{
"epoch": 0.07871987404820152,
"grad_norm": 123.7901611328125,
"learning_rate": 1.573510823826819e-05,
"loss": 2.0843,
"step": 49200
},
{
"epoch": 0.07887987379220193,
"grad_norm": 190.3510284423828,
"learning_rate": 1.576710772627638e-05,
"loss": 1.52,
"step": 49300
},
{
"epoch": 0.07903987353620234,
"grad_norm": 0.0007205315632745624,
"learning_rate": 1.5799107214284572e-05,
"loss": 1.8708,
"step": 49400
},
{
"epoch": 0.07919987328020275,
"grad_norm": 0.013503137975931168,
"learning_rate": 1.5831106702292765e-05,
"loss": 1.673,
"step": 49500
},
{
"epoch": 0.07935987302420316,
"grad_norm": 65.62171936035156,
"learning_rate": 1.5863106190300958e-05,
"loss": 1.8457,
"step": 49600
},
{
"epoch": 0.07951987276820358,
"grad_norm": 92.97589874267578,
"learning_rate": 1.5895105678309148e-05,
"loss": 1.5627,
"step": 49700
},
{
"epoch": 0.07967987251220399,
"grad_norm": 21.424842834472656,
"learning_rate": 1.5926785171437256e-05,
"loss": 1.6497,
"step": 49800
},
{
"epoch": 0.07983987225620438,
"grad_norm": 0.0011259341845288873,
"learning_rate": 1.595878465944545e-05,
"loss": 1.5787,
"step": 49900
},
{
"epoch": 0.0799998720002048,
"grad_norm": 0.01448867842555046,
"learning_rate": 1.5990784147453643e-05,
"loss": 1.8507,
"step": 50000
},
{
"epoch": 0.0801598717442052,
"grad_norm": 66.43374633789062,
"learning_rate": 1.6022783635461832e-05,
"loss": 1.4336,
"step": 50100
},
{
"epoch": 0.08031987148820562,
"grad_norm": 27.450525283813477,
"learning_rate": 1.6054783123470025e-05,
"loss": 2.152,
"step": 50200
},
{
"epoch": 0.08047987123220603,
"grad_norm": 33.92656707763672,
"learning_rate": 1.6086782611478218e-05,
"loss": 1.6311,
"step": 50300
},
{
"epoch": 0.08063987097620644,
"grad_norm": 20.14742660522461,
"learning_rate": 1.611878209948641e-05,
"loss": 1.7442,
"step": 50400
},
{
"epoch": 0.08079987072020685,
"grad_norm": 190.49342346191406,
"learning_rate": 1.61507815874946e-05,
"loss": 1.8063,
"step": 50500
},
{
"epoch": 0.08095987046420726,
"grad_norm": 673.4315185546875,
"learning_rate": 1.6182781075502794e-05,
"loss": 1.4,
"step": 50600
},
{
"epoch": 0.08111987020820767,
"grad_norm": 87.93296813964844,
"learning_rate": 1.6214780563510987e-05,
"loss": 1.6401,
"step": 50700
},
{
"epoch": 0.08127986995220808,
"grad_norm": 10.013740539550781,
"learning_rate": 1.6246780051519177e-05,
"loss": 1.9426,
"step": 50800
},
{
"epoch": 0.08143986969620849,
"grad_norm": 84.70770263671875,
"learning_rate": 1.627877953952737e-05,
"loss": 2.0937,
"step": 50900
},
{
"epoch": 0.0815998694402089,
"grad_norm": 66.33674621582031,
"learning_rate": 1.6310779027535563e-05,
"loss": 1.8187,
"step": 51000
},
{
"epoch": 0.0817598691842093,
"grad_norm": 0.013598043471574783,
"learning_rate": 1.6342778515543753e-05,
"loss": 2.1751,
"step": 51100
},
{
"epoch": 0.08191986892820971,
"grad_norm": 0.8764291405677795,
"learning_rate": 1.6374778003551946e-05,
"loss": 2.1703,
"step": 51200
},
{
"epoch": 0.08207986867221012,
"grad_norm": 14.436594009399414,
"learning_rate": 1.640677749156014e-05,
"loss": 1.4443,
"step": 51300
},
{
"epoch": 0.08223986841621053,
"grad_norm": 0.27148157358169556,
"learning_rate": 1.6438776979568328e-05,
"loss": 1.9266,
"step": 51400
},
{
"epoch": 0.08239986816021094,
"grad_norm": 11.139505386352539,
"learning_rate": 1.6470776467576518e-05,
"loss": 1.8226,
"step": 51500
},
{
"epoch": 0.08255986790421135,
"grad_norm": 105.84121704101562,
"learning_rate": 1.650277595558471e-05,
"loss": 1.4394,
"step": 51600
},
{
"epoch": 0.08271986764821176,
"grad_norm": 161.04141235351562,
"learning_rate": 1.6534775443592904e-05,
"loss": 1.052,
"step": 51700
},
{
"epoch": 0.08287986739221218,
"grad_norm": 11.454148292541504,
"learning_rate": 1.6566774931601094e-05,
"loss": 1.0614,
"step": 51800
},
{
"epoch": 0.08303986713621259,
"grad_norm": 0.03253089264035225,
"learning_rate": 1.6598774419609287e-05,
"loss": 1.4591,
"step": 51900
},
{
"epoch": 0.083199866880213,
"grad_norm": 0.2750859558582306,
"learning_rate": 1.663077390761748e-05,
"loss": 1.6479,
"step": 52000
},
{
"epoch": 0.08335986662421341,
"grad_norm": 3.381882667541504,
"learning_rate": 1.666245340074559e-05,
"loss": 1.7548,
"step": 52100
},
{
"epoch": 0.0835198663682138,
"grad_norm": 102.45317840576172,
"learning_rate": 1.669445288875378e-05,
"loss": 1.6293,
"step": 52200
},
{
"epoch": 0.08367986611221422,
"grad_norm": 122.25707244873047,
"learning_rate": 1.672645237676197e-05,
"loss": 1.7183,
"step": 52300
},
{
"epoch": 0.08383986585621463,
"grad_norm": 1.0929410457611084,
"learning_rate": 1.6758451864770164e-05,
"loss": 1.2329,
"step": 52400
},
{
"epoch": 0.08399986560021504,
"grad_norm": 0.0009238706552423537,
"learning_rate": 1.6790451352778357e-05,
"loss": 1.5292,
"step": 52500
},
{
"epoch": 0.08415986534421545,
"grad_norm": 15.874957084655762,
"learning_rate": 1.6822450840786547e-05,
"loss": 1.6752,
"step": 52600
},
{
"epoch": 0.08431986508821586,
"grad_norm": 8.129535675048828,
"learning_rate": 1.685445032879474e-05,
"loss": 1.3228,
"step": 52700
},
{
"epoch": 0.08447986483221627,
"grad_norm": 196.6626434326172,
"learning_rate": 1.6886449816802933e-05,
"loss": 1.485,
"step": 52800
},
{
"epoch": 0.08463986457621668,
"grad_norm": 257.9208679199219,
"learning_rate": 1.6918449304811123e-05,
"loss": 1.4228,
"step": 52900
},
{
"epoch": 0.08479986432021709,
"grad_norm": 126.92493438720703,
"learning_rate": 1.6950448792819316e-05,
"loss": 1.1385,
"step": 53000
},
{
"epoch": 0.0849598640642175,
"grad_norm": 218.52455139160156,
"learning_rate": 1.698244828082751e-05,
"loss": 1.1812,
"step": 53100
},
{
"epoch": 0.08511986380821791,
"grad_norm": 0.32875338196754456,
"learning_rate": 1.70144477688357e-05,
"loss": 1.4763,
"step": 53200
},
{
"epoch": 0.08527986355221831,
"grad_norm": 6.30516242980957,
"learning_rate": 1.704644725684389e-05,
"loss": 1.9444,
"step": 53300
},
{
"epoch": 0.08543986329621872,
"grad_norm": 0.10023212432861328,
"learning_rate": 1.7078446744852085e-05,
"loss": 1.5316,
"step": 53400
},
{
"epoch": 0.08559986304021913,
"grad_norm": 0.16311447322368622,
"learning_rate": 1.7110446232860278e-05,
"loss": 1.6928,
"step": 53500
},
{
"epoch": 0.08575986278421954,
"grad_norm": 113.52496337890625,
"learning_rate": 1.7142445720868467e-05,
"loss": 1.4466,
"step": 53600
},
{
"epoch": 0.08591986252821995,
"grad_norm": 202.2323760986328,
"learning_rate": 1.717444520887666e-05,
"loss": 1.438,
"step": 53700
},
{
"epoch": 0.08607986227222036,
"grad_norm": 0.009465747512876987,
"learning_rate": 1.7206444696884853e-05,
"loss": 1.1629,
"step": 53800
},
{
"epoch": 0.08623986201622078,
"grad_norm": 36.7415771484375,
"learning_rate": 1.7238444184893043e-05,
"loss": 1.3017,
"step": 53900
},
{
"epoch": 0.08639986176022119,
"grad_norm": 238.30662536621094,
"learning_rate": 1.7270443672901236e-05,
"loss": 1.6614,
"step": 54000
},
{
"epoch": 0.0865598615042216,
"grad_norm": 92.3208236694336,
"learning_rate": 1.730244316090943e-05,
"loss": 1.4535,
"step": 54100
},
{
"epoch": 0.08671986124822201,
"grad_norm": 78.44776153564453,
"learning_rate": 1.733444264891762e-05,
"loss": 1.7061,
"step": 54200
},
{
"epoch": 0.08687986099222242,
"grad_norm": 0.22147144377231598,
"learning_rate": 1.736644213692581e-05,
"loss": 1.4681,
"step": 54300
},
{
"epoch": 0.08703986073622282,
"grad_norm": 11.244450569152832,
"learning_rate": 1.7398441624934e-05,
"loss": 1.3449,
"step": 54400
},
{
"epoch": 0.08719986048022323,
"grad_norm": 0.002357147866860032,
"learning_rate": 1.7430441112942195e-05,
"loss": 1.8814,
"step": 54500
},
{
"epoch": 0.08735986022422364,
"grad_norm": 0.008252524770796299,
"learning_rate": 1.7462440600950384e-05,
"loss": 1.5989,
"step": 54600
},
{
"epoch": 0.08751985996822405,
"grad_norm": 61.25815200805664,
"learning_rate": 1.7494440088958577e-05,
"loss": 1.3711,
"step": 54700
},
{
"epoch": 0.08767985971222446,
"grad_norm": 0.536085307598114,
"learning_rate": 1.752643957696677e-05,
"loss": 1.3199,
"step": 54800
},
{
"epoch": 0.08783985945622487,
"grad_norm": 261.3522033691406,
"learning_rate": 1.755811907009488e-05,
"loss": 1.3713,
"step": 54900
},
{
"epoch": 0.08799985920022528,
"grad_norm": 0.18219026923179626,
"learning_rate": 1.7590118558103072e-05,
"loss": 1.441,
"step": 55000
},
{
"epoch": 0.08815985894422569,
"grad_norm": 2.0045886039733887,
"learning_rate": 1.7622118046111262e-05,
"loss": 1.268,
"step": 55100
},
{
"epoch": 0.0883198586882261,
"grad_norm": 119.64205932617188,
"learning_rate": 1.7654117534119455e-05,
"loss": 1.1648,
"step": 55200
},
{
"epoch": 0.08847985843222651,
"grad_norm": 1316.0831298828125,
"learning_rate": 1.7686117022127648e-05,
"loss": 1.8108,
"step": 55300
},
{
"epoch": 0.08863985817622692,
"grad_norm": 0.016518862918019295,
"learning_rate": 1.7718116510135838e-05,
"loss": 1.4904,
"step": 55400
},
{
"epoch": 0.08879985792022732,
"grad_norm": 0.0020672741811722517,
"learning_rate": 1.774979600326395e-05,
"loss": 1.2555,
"step": 55500
},
{
"epoch": 0.08895985766422773,
"grad_norm": 0.0013748366618528962,
"learning_rate": 1.778179549127214e-05,
"loss": 1.2733,
"step": 55600
},
{
"epoch": 0.08911985740822814,
"grad_norm": 156.89892578125,
"learning_rate": 1.7813794979280332e-05,
"loss": 1.5194,
"step": 55700
},
{
"epoch": 0.08927985715222855,
"grad_norm": 0.034991975873708725,
"learning_rate": 1.7845794467288526e-05,
"loss": 1.7587,
"step": 55800
},
{
"epoch": 0.08943985689622896,
"grad_norm": 25.602022171020508,
"learning_rate": 1.7877793955296715e-05,
"loss": 1.6183,
"step": 55900
},
{
"epoch": 0.08959985664022938,
"grad_norm": 0.6393762230873108,
"learning_rate": 1.7909793443304908e-05,
"loss": 1.3596,
"step": 56000
},
{
"epoch": 0.08975985638422979,
"grad_norm": 48.19321823120117,
"learning_rate": 1.79417929313131e-05,
"loss": 1.5248,
"step": 56100
},
{
"epoch": 0.0899198561282302,
"grad_norm": 88.53876495361328,
"learning_rate": 1.797379241932129e-05,
"loss": 1.5177,
"step": 56200
},
{
"epoch": 0.09007985587223061,
"grad_norm": 4.195464611053467,
"learning_rate": 1.8005791907329484e-05,
"loss": 1.7579,
"step": 56300
},
{
"epoch": 0.09023985561623102,
"grad_norm": 4.309329986572266,
"learning_rate": 1.8037791395337677e-05,
"loss": 1.5508,
"step": 56400
},
{
"epoch": 0.09039985536023143,
"grad_norm": 781.6338500976562,
"learning_rate": 1.8069790883345867e-05,
"loss": 1.5965,
"step": 56500
},
{
"epoch": 0.09055985510423184,
"grad_norm": 0.2752499580383301,
"learning_rate": 1.810179037135406e-05,
"loss": 1.5762,
"step": 56600
},
{
"epoch": 0.09071985484823224,
"grad_norm": 107.10926818847656,
"learning_rate": 1.8133789859362253e-05,
"loss": 1.7441,
"step": 56700
},
{
"epoch": 0.09087985459223265,
"grad_norm": 97.79568481445312,
"learning_rate": 1.8165789347370442e-05,
"loss": 2.0257,
"step": 56800
},
{
"epoch": 0.09103985433623306,
"grad_norm": 0.0016732424264773726,
"learning_rate": 1.8197788835378635e-05,
"loss": 1.1371,
"step": 56900
},
{
"epoch": 0.09119985408023347,
"grad_norm": 7.662989139556885,
"learning_rate": 1.822978832338683e-05,
"loss": 1.8825,
"step": 57000
},
{
"epoch": 0.09135985382423388,
"grad_norm": 16.23094940185547,
"learning_rate": 1.8261787811395018e-05,
"loss": 1.0455,
"step": 57100
},
{
"epoch": 0.09151985356823429,
"grad_norm": 0.025669243186712265,
"learning_rate": 1.829378729940321e-05,
"loss": 1.5889,
"step": 57200
},
{
"epoch": 0.0916798533122347,
"grad_norm": 0.4638320505619049,
"learning_rate": 1.8325786787411404e-05,
"loss": 1.192,
"step": 57300
},
{
"epoch": 0.09183985305623511,
"grad_norm": 61.32036209106445,
"learning_rate": 1.8357786275419597e-05,
"loss": 1.5374,
"step": 57400
},
{
"epoch": 0.09199985280023552,
"grad_norm": 0.0012545910431072116,
"learning_rate": 1.8389785763427787e-05,
"loss": 1.6236,
"step": 57500
},
{
"epoch": 0.09215985254423593,
"grad_norm": 92.61576080322266,
"learning_rate": 1.842178525143598e-05,
"loss": 1.8945,
"step": 57600
},
{
"epoch": 0.09231985228823635,
"grad_norm": 22.349824905395508,
"learning_rate": 1.845378473944417e-05,
"loss": 1.607,
"step": 57700
},
{
"epoch": 0.09247985203223674,
"grad_norm": 126.0189208984375,
"learning_rate": 1.8485784227452363e-05,
"loss": 1.8133,
"step": 57800
},
{
"epoch": 0.09263985177623715,
"grad_norm": 78.0487060546875,
"learning_rate": 1.8517783715460552e-05,
"loss": 1.5777,
"step": 57900
},
{
"epoch": 0.09279985152023756,
"grad_norm": 0.0007238721009343863,
"learning_rate": 1.8549783203468745e-05,
"loss": 1.5043,
"step": 58000
},
{
"epoch": 0.09295985126423797,
"grad_norm": 123.57425689697266,
"learning_rate": 1.858178269147694e-05,
"loss": 1.7681,
"step": 58100
},
{
"epoch": 0.09311985100823839,
"grad_norm": 0.2985190153121948,
"learning_rate": 1.8613782179485128e-05,
"loss": 1.623,
"step": 58200
},
{
"epoch": 0.0932798507522388,
"grad_norm": 64.27520751953125,
"learning_rate": 1.864578166749332e-05,
"loss": 2.2137,
"step": 58300
},
{
"epoch": 0.09343985049623921,
"grad_norm": 207.88841247558594,
"learning_rate": 1.8677781155501514e-05,
"loss": 2.2447,
"step": 58400
},
{
"epoch": 0.09359985024023962,
"grad_norm": 94.58351135253906,
"learning_rate": 1.8709780643509704e-05,
"loss": 2.3013,
"step": 58500
},
{
"epoch": 0.09375984998424003,
"grad_norm": 0.29524990916252136,
"learning_rate": 1.8741780131517897e-05,
"loss": 1.3105,
"step": 58600
},
{
"epoch": 0.09391984972824044,
"grad_norm": 110.85052490234375,
"learning_rate": 1.877377961952609e-05,
"loss": 1.4461,
"step": 58700
},
{
"epoch": 0.09407984947224085,
"grad_norm": 44.61641311645508,
"learning_rate": 1.880577910753428e-05,
"loss": 2.1321,
"step": 58800
},
{
"epoch": 0.09423984921624125,
"grad_norm": 699.353759765625,
"learning_rate": 1.8837778595542473e-05,
"loss": 1.7541,
"step": 58900
},
{
"epoch": 0.09439984896024166,
"grad_norm": 88.22161865234375,
"learning_rate": 1.8869778083550666e-05,
"loss": 1.7894,
"step": 59000
},
{
"epoch": 0.09455984870424207,
"grad_norm": 25.957782745361328,
"learning_rate": 1.8901777571558855e-05,
"loss": 1.693,
"step": 59100
},
{
"epoch": 0.09471984844824248,
"grad_norm": 1.7580465078353882,
"learning_rate": 1.8933457064686968e-05,
"loss": 1.7073,
"step": 59200
},
{
"epoch": 0.09487984819224289,
"grad_norm": 5.568783283233643,
"learning_rate": 1.8965456552695157e-05,
"loss": 2.0305,
"step": 59300
},
{
"epoch": 0.0950398479362433,
"grad_norm": 0.21757324039936066,
"learning_rate": 1.899745604070335e-05,
"loss": 1.3684,
"step": 59400
},
{
"epoch": 0.09519984768024371,
"grad_norm": 123.5767593383789,
"learning_rate": 1.9029455528711543e-05,
"loss": 1.8754,
"step": 59500
},
{
"epoch": 0.09535984742424412,
"grad_norm": 66.91508483886719,
"learning_rate": 1.9061455016719733e-05,
"loss": 2.0225,
"step": 59600
},
{
"epoch": 0.09551984716824453,
"grad_norm": 0.00018894312961492687,
"learning_rate": 1.9093454504727926e-05,
"loss": 2.1975,
"step": 59700
},
{
"epoch": 0.09567984691224495,
"grad_norm": 84.60813903808594,
"learning_rate": 1.912545399273612e-05,
"loss": 1.7173,
"step": 59800
},
{
"epoch": 0.09583984665624536,
"grad_norm": 67.51477813720703,
"learning_rate": 1.915745348074431e-05,
"loss": 1.4302,
"step": 59900
},
{
"epoch": 0.09599984640024575,
"grad_norm": 59.47672653198242,
"learning_rate": 1.9189452968752502e-05,
"loss": 1.2497,
"step": 60000
},
{
"epoch": 0.09615984614424616,
"grad_norm": 19.75477409362793,
"learning_rate": 1.9221452456760695e-05,
"loss": 1.4058,
"step": 60100
},
{
"epoch": 0.09631984588824657,
"grad_norm": 91.08583068847656,
"learning_rate": 1.9253451944768885e-05,
"loss": 1.0956,
"step": 60200
},
{
"epoch": 0.09647984563224699,
"grad_norm": 133.5473175048828,
"learning_rate": 1.9285451432777078e-05,
"loss": 1.3731,
"step": 60300
},
{
"epoch": 0.0966398453762474,
"grad_norm": 0.010973370634019375,
"learning_rate": 1.931745092078527e-05,
"loss": 1.2953,
"step": 60400
},
{
"epoch": 0.09679984512024781,
"grad_norm": 0.08579988777637482,
"learning_rate": 1.934945040879346e-05,
"loss": 1.0987,
"step": 60500
},
{
"epoch": 0.09695984486424822,
"grad_norm": 1.1617801189422607,
"learning_rate": 1.9381449896801653e-05,
"loss": 1.5104,
"step": 60600
},
{
"epoch": 0.09711984460824863,
"grad_norm": 0.3544386029243469,
"learning_rate": 1.9413449384809843e-05,
"loss": 1.5224,
"step": 60700
},
{
"epoch": 0.09727984435224904,
"grad_norm": 44.28148651123047,
"learning_rate": 1.9445448872818036e-05,
"loss": 1.3982,
"step": 60800
},
{
"epoch": 0.09743984409624945,
"grad_norm": 167.9832305908203,
"learning_rate": 1.947744836082623e-05,
"loss": 1.2785,
"step": 60900
},
{
"epoch": 0.09759984384024986,
"grad_norm": 129.58119201660156,
"learning_rate": 1.950944784883442e-05,
"loss": 1.6018,
"step": 61000
},
{
"epoch": 0.09775984358425026,
"grad_norm": 0.048871856182813644,
"learning_rate": 1.9541447336842612e-05,
"loss": 1.4968,
"step": 61100
},
{
"epoch": 0.09791984332825067,
"grad_norm": 0.14592894911766052,
"learning_rate": 1.9573446824850805e-05,
"loss": 1.2423,
"step": 61200
},
{
"epoch": 0.09807984307225108,
"grad_norm": 0.548117458820343,
"learning_rate": 1.9605446312858995e-05,
"loss": 1.9973,
"step": 61300
},
{
"epoch": 0.09823984281625149,
"grad_norm": 52.4393424987793,
"learning_rate": 1.9637445800867188e-05,
"loss": 1.2149,
"step": 61400
},
{
"epoch": 0.0983998425602519,
"grad_norm": 101.79759216308594,
"learning_rate": 1.966944528887538e-05,
"loss": 1.731,
"step": 61500
},
{
"epoch": 0.09855984230425231,
"grad_norm": 0.04176723212003708,
"learning_rate": 1.970144477688357e-05,
"loss": 1.2889,
"step": 61600
},
{
"epoch": 0.09871984204825272,
"grad_norm": 8.380585670471191,
"learning_rate": 1.9733444264891763e-05,
"loss": 1.856,
"step": 61700
},
{
"epoch": 0.09887984179225313,
"grad_norm": 0.014852323569357395,
"learning_rate": 1.9765443752899956e-05,
"loss": 0.8942,
"step": 61800
},
{
"epoch": 0.09903984153625355,
"grad_norm": 0.3229600787162781,
"learning_rate": 1.9797443240908146e-05,
"loss": 1.3371,
"step": 61900
},
{
"epoch": 0.09919984128025396,
"grad_norm": 141.8211212158203,
"learning_rate": 1.982944272891634e-05,
"loss": 1.5222,
"step": 62000
},
{
"epoch": 0.09935984102425437,
"grad_norm": 0.025253353640437126,
"learning_rate": 1.9861442216924532e-05,
"loss": 1.5435,
"step": 62100
},
{
"epoch": 0.09951984076825478,
"grad_norm": 0.0009790909243747592,
"learning_rate": 1.9893441704932722e-05,
"loss": 1.1172,
"step": 62200
},
{
"epoch": 0.09967984051225517,
"grad_norm": 43.73761749267578,
"learning_rate": 1.9925441192940915e-05,
"loss": 1.6024,
"step": 62300
},
{
"epoch": 0.09983984025625559,
"grad_norm": 18.400936126708984,
"learning_rate": 1.9957440680949108e-05,
"loss": 1.3914,
"step": 62400
},
{
"epoch": 0.099999840000256,
"grad_norm": 937.4790649414062,
"learning_rate": 1.9989440168957298e-05,
"loss": 1.4714,
"step": 62500
},
{
"epoch": 0.10015983974425641,
"grad_norm": 0.14736099541187286,
"learning_rate": 1.999761777777778e-05,
"loss": 1.2922,
"step": 62600
},
{
"epoch": 0.10031983948825682,
"grad_norm": 0.606549084186554,
"learning_rate": 1.999409777777778e-05,
"loss": 1.4263,
"step": 62700
},
{
"epoch": 0.10047983923225723,
"grad_norm": 426.79400634765625,
"learning_rate": 1.9990542222222224e-05,
"loss": 1.4586,
"step": 62800
},
{
"epoch": 0.10063983897625764,
"grad_norm": 0.007887039333581924,
"learning_rate": 1.9986986666666668e-05,
"loss": 1.6312,
"step": 62900
},
{
"epoch": 0.10079983872025805,
"grad_norm": 0.9442864060401917,
"learning_rate": 1.9983431111111113e-05,
"loss": 1.9607,
"step": 63000
},
{
"epoch": 0.10095983846425846,
"grad_norm": 0.002317711478099227,
"learning_rate": 1.9979875555555557e-05,
"loss": 1.5771,
"step": 63100
},
{
"epoch": 0.10111983820825887,
"grad_norm": 75.09770965576172,
"learning_rate": 1.9976320000000002e-05,
"loss": 1.6721,
"step": 63200
},
{
"epoch": 0.10127983795225928,
"grad_norm": 135.64022827148438,
"learning_rate": 1.9972764444444446e-05,
"loss": 1.8461,
"step": 63300
},
{
"epoch": 0.10143983769625968,
"grad_norm": 0.1608121395111084,
"learning_rate": 1.996920888888889e-05,
"loss": 1.5256,
"step": 63400
},
{
"epoch": 0.10159983744026009,
"grad_norm": 266.8143615722656,
"learning_rate": 1.9965653333333336e-05,
"loss": 1.9736,
"step": 63500
},
{
"epoch": 0.1017598371842605,
"grad_norm": 125.29386138916016,
"learning_rate": 1.996209777777778e-05,
"loss": 1.4735,
"step": 63600
},
{
"epoch": 0.10191983692826091,
"grad_norm": 1.8401292562484741,
"learning_rate": 1.9958542222222225e-05,
"loss": 1.4619,
"step": 63700
},
{
"epoch": 0.10207983667226132,
"grad_norm": 352.0743103027344,
"learning_rate": 1.995498666666667e-05,
"loss": 1.6571,
"step": 63800
},
{
"epoch": 0.10223983641626173,
"grad_norm": 546.5570068359375,
"learning_rate": 1.9951431111111114e-05,
"loss": 1.5888,
"step": 63900
},
{
"epoch": 0.10239983616026214,
"grad_norm": 0.0009566646185703576,
"learning_rate": 1.994787555555556e-05,
"loss": 2.0457,
"step": 64000
},
{
"epoch": 0.10255983590426256,
"grad_norm": 717.4028930664062,
"learning_rate": 1.9944320000000003e-05,
"loss": 1.7843,
"step": 64100
},
{
"epoch": 0.10271983564826297,
"grad_norm": 0.16068622469902039,
"learning_rate": 1.9940764444444447e-05,
"loss": 1.5116,
"step": 64200
},
{
"epoch": 0.10287983539226338,
"grad_norm": 69.0772705078125,
"learning_rate": 1.9937208888888892e-05,
"loss": 1.6682,
"step": 64300
},
{
"epoch": 0.10303983513626379,
"grad_norm": 0.0007585228304378688,
"learning_rate": 1.9933653333333337e-05,
"loss": 1.2137,
"step": 64400
},
{
"epoch": 0.10319983488026419,
"grad_norm": 140.95750427246094,
"learning_rate": 1.9930097777777778e-05,
"loss": 1.1308,
"step": 64500
},
{
"epoch": 0.1033598346242646,
"grad_norm": 1.2280133962631226,
"learning_rate": 1.9926542222222226e-05,
"loss": 2.031,
"step": 64600
},
{
"epoch": 0.103519834368265,
"grad_norm": 51.01097106933594,
"learning_rate": 1.9922986666666667e-05,
"loss": 1.6903,
"step": 64700
},
{
"epoch": 0.10367983411226542,
"grad_norm": 354.4974365234375,
"learning_rate": 1.9919431111111115e-05,
"loss": 1.3365,
"step": 64800
},
{
"epoch": 0.10383983385626583,
"grad_norm": 98.43709564208984,
"learning_rate": 1.9915875555555556e-05,
"loss": 1.5736,
"step": 64900
},
{
"epoch": 0.10399983360026624,
"grad_norm": 176.26661682128906,
"learning_rate": 1.991232e-05,
"loss": 1.7264,
"step": 65000
},
{
"epoch": 0.10415983334426665,
"grad_norm": 55.52714920043945,
"learning_rate": 1.9908764444444445e-05,
"loss": 1.1781,
"step": 65100
},
{
"epoch": 0.10431983308826706,
"grad_norm": 0.0009407736943103373,
"learning_rate": 1.990520888888889e-05,
"loss": 1.2503,
"step": 65200
},
{
"epoch": 0.10447983283226747,
"grad_norm": 0.001376794883981347,
"learning_rate": 1.9901653333333334e-05,
"loss": 0.9432,
"step": 65300
},
{
"epoch": 0.10463983257626788,
"grad_norm": 52.175819396972656,
"learning_rate": 1.9898133333333335e-05,
"loss": 1.264,
"step": 65400
},
{
"epoch": 0.1047998323202683,
"grad_norm": 147.7506866455078,
"learning_rate": 1.989457777777778e-05,
"loss": 1.2086,
"step": 65500
},
{
"epoch": 0.10495983206426869,
"grad_norm": 31.214550018310547,
"learning_rate": 1.9891022222222224e-05,
"loss": 1.8692,
"step": 65600
},
{
"epoch": 0.1051198318082691,
"grad_norm": 168.40858459472656,
"learning_rate": 1.988746666666667e-05,
"loss": 1.2745,
"step": 65700
},
{
"epoch": 0.10527983155226951,
"grad_norm": 401.31842041015625,
"learning_rate": 1.9883911111111113e-05,
"loss": 1.6839,
"step": 65800
},
{
"epoch": 0.10543983129626992,
"grad_norm": 139.64588928222656,
"learning_rate": 1.9880355555555558e-05,
"loss": 1.4509,
"step": 65900
},
{
"epoch": 0.10559983104027033,
"grad_norm": 125.26469421386719,
"learning_rate": 1.98768e-05,
"loss": 1.1615,
"step": 66000
},
{
"epoch": 0.10575983078427074,
"grad_norm": 0.1609152853488922,
"learning_rate": 1.9873244444444447e-05,
"loss": 1.4458,
"step": 66100
},
{
"epoch": 0.10591983052827116,
"grad_norm": 0.000580300809815526,
"learning_rate": 1.9869688888888888e-05,
"loss": 1.8329,
"step": 66200
},
{
"epoch": 0.10607983027227157,
"grad_norm": 101.93132781982422,
"learning_rate": 1.9866133333333336e-05,
"loss": 1.567,
"step": 66300
},
{
"epoch": 0.10623983001627198,
"grad_norm": 99.72083282470703,
"learning_rate": 1.9862577777777777e-05,
"loss": 1.6746,
"step": 66400
},
{
"epoch": 0.10639982976027239,
"grad_norm": 1651.70263671875,
"learning_rate": 1.9859022222222225e-05,
"loss": 1.65,
"step": 66500
},
{
"epoch": 0.1065598295042728,
"grad_norm": 132.80343627929688,
"learning_rate": 1.9855466666666666e-05,
"loss": 1.5497,
"step": 66600
},
{
"epoch": 0.1067198292482732,
"grad_norm": 0.004364237189292908,
"learning_rate": 1.9851911111111114e-05,
"loss": 1.4009,
"step": 66700
},
{
"epoch": 0.1068798289922736,
"grad_norm": 4.1050825119018555,
"learning_rate": 1.9848355555555556e-05,
"loss": 2.058,
"step": 66800
},
{
"epoch": 0.10703982873627402,
"grad_norm": 0.047410767525434494,
"learning_rate": 1.9844800000000004e-05,
"loss": 1.6306,
"step": 66900
},
{
"epoch": 0.10719982848027443,
"grad_norm": 6.651243686676025,
"learning_rate": 1.9841244444444445e-05,
"loss": 1.4377,
"step": 67000
},
{
"epoch": 0.10735982822427484,
"grad_norm": 0.010524190030992031,
"learning_rate": 1.983768888888889e-05,
"loss": 1.4501,
"step": 67100
},
{
"epoch": 0.10751982796827525,
"grad_norm": 0.0009414692758582532,
"learning_rate": 1.9834133333333334e-05,
"loss": 1.2648,
"step": 67200
},
{
"epoch": 0.10767982771227566,
"grad_norm": 1019.7636108398438,
"learning_rate": 1.983057777777778e-05,
"loss": 1.3186,
"step": 67300
},
{
"epoch": 0.10783982745627607,
"grad_norm": 0.006541971582919359,
"learning_rate": 1.982705777777778e-05,
"loss": 1.1313,
"step": 67400
},
{
"epoch": 0.10799982720027648,
"grad_norm": 323.63226318359375,
"learning_rate": 1.9823502222222224e-05,
"loss": 2.2523,
"step": 67500
},
{
"epoch": 0.1081598269442769,
"grad_norm": 114.84791564941406,
"learning_rate": 1.981994666666667e-05,
"loss": 1.9146,
"step": 67600
},
{
"epoch": 0.1083198266882773,
"grad_norm": 4.059427738189697,
"learning_rate": 1.9816391111111113e-05,
"loss": 1.7334,
"step": 67700
},
{
"epoch": 0.10847982643227772,
"grad_norm": 3.274331569671631,
"learning_rate": 1.9812835555555558e-05,
"loss": 1.7195,
"step": 67800
},
{
"epoch": 0.10863982617627811,
"grad_norm": 0.0480005145072937,
"learning_rate": 1.9809280000000002e-05,
"loss": 1.4661,
"step": 67900
},
{
"epoch": 0.10879982592027852,
"grad_norm": 45.43354415893555,
"learning_rate": 1.9805724444444447e-05,
"loss": 1.3503,
"step": 68000
},
{
"epoch": 0.10895982566427893,
"grad_norm": 0.0006582220084965229,
"learning_rate": 1.980216888888889e-05,
"loss": 1.0129,
"step": 68100
},
{
"epoch": 0.10911982540827934,
"grad_norm": 111.87661743164062,
"learning_rate": 1.9798613333333332e-05,
"loss": 1.6036,
"step": 68200
},
{
"epoch": 0.10927982515227976,
"grad_norm": 122.35249328613281,
"learning_rate": 1.979505777777778e-05,
"loss": 0.9312,
"step": 68300
},
{
"epoch": 0.10943982489628017,
"grad_norm": 0.5635089874267578,
"learning_rate": 1.979150222222222e-05,
"loss": 1.5817,
"step": 68400
},
{
"epoch": 0.10959982464028058,
"grad_norm": 2.6275858879089355,
"learning_rate": 1.978794666666667e-05,
"loss": 1.2024,
"step": 68500
},
{
"epoch": 0.10975982438428099,
"grad_norm": 0.6521372199058533,
"learning_rate": 1.978439111111111e-05,
"loss": 0.985,
"step": 68600
},
{
"epoch": 0.1099198241282814,
"grad_norm": 2.0386836528778076,
"learning_rate": 1.978083555555556e-05,
"loss": 1.1712,
"step": 68700
},
{
"epoch": 0.11007982387228181,
"grad_norm": 132.18045043945312,
"learning_rate": 1.977728e-05,
"loss": 1.5874,
"step": 68800
},
{
"epoch": 0.11023982361628222,
"grad_norm": 0.0068659852258861065,
"learning_rate": 1.9773724444444448e-05,
"loss": 1.8551,
"step": 68900
},
{
"epoch": 0.11039982336028262,
"grad_norm": 84.89590454101562,
"learning_rate": 1.977016888888889e-05,
"loss": 1.232,
"step": 69000
},
{
"epoch": 0.11055982310428303,
"grad_norm": 191.7918243408203,
"learning_rate": 1.9766613333333337e-05,
"loss": 1.4688,
"step": 69100
},
{
"epoch": 0.11071982284828344,
"grad_norm": 10.109711647033691,
"learning_rate": 1.9763057777777778e-05,
"loss": 1.1107,
"step": 69200
},
{
"epoch": 0.11087982259228385,
"grad_norm": 63.09272766113281,
"learning_rate": 1.9759502222222226e-05,
"loss": 1.6495,
"step": 69300
},
{
"epoch": 0.11103982233628426,
"grad_norm": 66.24422454833984,
"learning_rate": 1.9755946666666667e-05,
"loss": 1.6278,
"step": 69400
},
{
"epoch": 0.11119982208028467,
"grad_norm": 95.56941223144531,
"learning_rate": 1.975239111111111e-05,
"loss": 1.7135,
"step": 69500
},
{
"epoch": 0.11135982182428508,
"grad_norm": 324.1082763671875,
"learning_rate": 1.9748835555555556e-05,
"loss": 1.5108,
"step": 69600
},
{
"epoch": 0.1115198215682855,
"grad_norm": 123.81194305419922,
"learning_rate": 1.974528e-05,
"loss": 1.4056,
"step": 69700
},
{
"epoch": 0.1116798213122859,
"grad_norm": 0.9657291769981384,
"learning_rate": 1.9741724444444445e-05,
"loss": 0.9324,
"step": 69800
},
{
"epoch": 0.11183982105628631,
"grad_norm": 21.34449005126953,
"learning_rate": 1.973816888888889e-05,
"loss": 1.3613,
"step": 69900
},
{
"epoch": 0.11199982080028673,
"grad_norm": 51.35042953491211,
"learning_rate": 1.9734613333333334e-05,
"loss": 1.5283,
"step": 70000
},
{
"epoch": 0.11215982054428712,
"grad_norm": 16.527353286743164,
"learning_rate": 1.973105777777778e-05,
"loss": 1.3809,
"step": 70100
},
{
"epoch": 0.11231982028828753,
"grad_norm": 0.032987259328365326,
"learning_rate": 1.9727502222222224e-05,
"loss": 1.5552,
"step": 70200
},
{
"epoch": 0.11247982003228794,
"grad_norm": 96.99321746826172,
"learning_rate": 1.9723946666666668e-05,
"loss": 1.4567,
"step": 70300
},
{
"epoch": 0.11263981977628836,
"grad_norm": 291.11444091796875,
"learning_rate": 1.9720391111111113e-05,
"loss": 1.4404,
"step": 70400
},
{
"epoch": 0.11279981952028877,
"grad_norm": 1.33843195438385,
"learning_rate": 1.9716835555555557e-05,
"loss": 1.1805,
"step": 70500
},
{
"epoch": 0.11295981926428918,
"grad_norm": 340.20355224609375,
"learning_rate": 1.9713280000000002e-05,
"loss": 2.514,
"step": 70600
},
{
"epoch": 0.11311981900828959,
"grad_norm": 0.20241181552410126,
"learning_rate": 1.9709724444444446e-05,
"loss": 1.4821,
"step": 70700
},
{
"epoch": 0.11327981875229,
"grad_norm": 0.044828109443187714,
"learning_rate": 1.970616888888889e-05,
"loss": 1.5156,
"step": 70800
},
{
"epoch": 0.11343981849629041,
"grad_norm": 121.0267105102539,
"learning_rate": 1.9702648888888892e-05,
"loss": 1.5925,
"step": 70900
},
{
"epoch": 0.11359981824029082,
"grad_norm": 1217.32373046875,
"learning_rate": 1.9699093333333333e-05,
"loss": 1.9517,
"step": 71000
},
{
"epoch": 0.11375981798429123,
"grad_norm": 102.1255111694336,
"learning_rate": 1.969553777777778e-05,
"loss": 1.2685,
"step": 71100
},
{
"epoch": 0.11391981772829163,
"grad_norm": 0.0009581278427504003,
"learning_rate": 1.9691982222222222e-05,
"loss": 1.6314,
"step": 71200
},
{
"epoch": 0.11407981747229204,
"grad_norm": 105.03948974609375,
"learning_rate": 1.968842666666667e-05,
"loss": 1.5252,
"step": 71300
},
{
"epoch": 0.11423981721629245,
"grad_norm": 2.6692819595336914,
"learning_rate": 1.968487111111111e-05,
"loss": 1.5176,
"step": 71400
},
{
"epoch": 0.11439981696029286,
"grad_norm": 93.15460205078125,
"learning_rate": 1.968131555555556e-05,
"loss": 1.3461,
"step": 71500
},
{
"epoch": 0.11455981670429327,
"grad_norm": 37.849117279052734,
"learning_rate": 1.967776e-05,
"loss": 1.3832,
"step": 71600
},
{
"epoch": 0.11471981644829368,
"grad_norm": 3.6809959411621094,
"learning_rate": 1.967420444444445e-05,
"loss": 1.2962,
"step": 71700
},
{
"epoch": 0.11487981619229409,
"grad_norm": 40.560264587402344,
"learning_rate": 1.967064888888889e-05,
"loss": 1.5179,
"step": 71800
},
{
"epoch": 0.1150398159362945,
"grad_norm": 0.17644475400447845,
"learning_rate": 1.9667093333333334e-05,
"loss": 1.1041,
"step": 71900
},
{
"epoch": 0.11519981568029491,
"grad_norm": 0.05514904111623764,
"learning_rate": 1.966353777777778e-05,
"loss": 1.5031,
"step": 72000
},
{
"epoch": 0.11535981542429533,
"grad_norm": 23.659364700317383,
"learning_rate": 1.9659982222222223e-05,
"loss": 1.5412,
"step": 72100
},
{
"epoch": 0.11551981516829574,
"grad_norm": 0.0025822233874350786,
"learning_rate": 1.9656426666666668e-05,
"loss": 1.2971,
"step": 72200
},
{
"epoch": 0.11567981491229615,
"grad_norm": 2.382300853729248,
"learning_rate": 1.9652871111111112e-05,
"loss": 1.0979,
"step": 72300
},
{
"epoch": 0.11583981465629654,
"grad_norm": 150.96646118164062,
"learning_rate": 1.9649315555555557e-05,
"loss": 1.307,
"step": 72400
},
{
"epoch": 0.11599981440029696,
"grad_norm": 0.022950541228055954,
"learning_rate": 1.964576e-05,
"loss": 1.3418,
"step": 72500
},
{
"epoch": 0.11615981414429737,
"grad_norm": 21.7007999420166,
"learning_rate": 1.9642204444444446e-05,
"loss": 1.7298,
"step": 72600
},
{
"epoch": 0.11631981388829778,
"grad_norm": 100.60992431640625,
"learning_rate": 1.963864888888889e-05,
"loss": 1.68,
"step": 72700
},
{
"epoch": 0.11647981363229819,
"grad_norm": 104.22400665283203,
"learning_rate": 1.9635093333333335e-05,
"loss": 1.3106,
"step": 72800
},
{
"epoch": 0.1166398133762986,
"grad_norm": 0.01572352461516857,
"learning_rate": 1.963153777777778e-05,
"loss": 1.0954,
"step": 72900
},
{
"epoch": 0.11679981312029901,
"grad_norm": 0.1720964014530182,
"learning_rate": 1.9627982222222224e-05,
"loss": 1.5994,
"step": 73000
},
{
"epoch": 0.11695981286429942,
"grad_norm": 90.89932250976562,
"learning_rate": 1.962442666666667e-05,
"loss": 1.5953,
"step": 73100
},
{
"epoch": 0.11711981260829983,
"grad_norm": 94.24946594238281,
"learning_rate": 1.9620871111111113e-05,
"loss": 1.9498,
"step": 73200
},
{
"epoch": 0.11727981235230024,
"grad_norm": 0.08061110228300095,
"learning_rate": 1.9617315555555554e-05,
"loss": 0.9937,
"step": 73300
},
{
"epoch": 0.11743981209630065,
"grad_norm": 9.990059852600098,
"learning_rate": 1.9613760000000002e-05,
"loss": 1.4753,
"step": 73400
},
{
"epoch": 0.11759981184030105,
"grad_norm": 1.572757601737976,
"learning_rate": 1.9610204444444444e-05,
"loss": 1.417,
"step": 73500
},
{
"epoch": 0.11775981158430146,
"grad_norm": 23.618915557861328,
"learning_rate": 1.960664888888889e-05,
"loss": 1.596,
"step": 73600
},
{
"epoch": 0.11791981132830187,
"grad_norm": 90.75736999511719,
"learning_rate": 1.9603093333333333e-05,
"loss": 1.8794,
"step": 73700
},
{
"epoch": 0.11807981107230228,
"grad_norm": 0.07401008158922195,
"learning_rate": 1.959953777777778e-05,
"loss": 1.3118,
"step": 73800
},
{
"epoch": 0.11823981081630269,
"grad_norm": 80.61852264404297,
"learning_rate": 1.9595982222222222e-05,
"loss": 1.732,
"step": 73900
},
{
"epoch": 0.1183998105603031,
"grad_norm": 0.18783807754516602,
"learning_rate": 1.959242666666667e-05,
"loss": 1.4504,
"step": 74000
},
{
"epoch": 0.11855981030430351,
"grad_norm": 0.0010747779160737991,
"learning_rate": 1.958887111111111e-05,
"loss": 1.0878,
"step": 74100
},
{
"epoch": 0.11871981004830393,
"grad_norm": 0.0007994744810275733,
"learning_rate": 1.958531555555556e-05,
"loss": 1.2488,
"step": 74200
},
{
"epoch": 0.11887980979230434,
"grad_norm": 16.047822952270508,
"learning_rate": 1.958176e-05,
"loss": 1.3887,
"step": 74300
},
{
"epoch": 0.11903980953630475,
"grad_norm": 105.44868469238281,
"learning_rate": 1.9578204444444448e-05,
"loss": 1.2265,
"step": 74400
},
{
"epoch": 0.11919980928030516,
"grad_norm": 0.0008991442155092955,
"learning_rate": 1.957464888888889e-05,
"loss": 1.4668,
"step": 74500
},
{
"epoch": 0.11935980902430555,
"grad_norm": 0.04507048800587654,
"learning_rate": 1.9571093333333334e-05,
"loss": 1.6258,
"step": 74600
},
{
"epoch": 0.11951980876830597,
"grad_norm": 0.00036178340087644756,
"learning_rate": 1.9567537777777778e-05,
"loss": 1.9551,
"step": 74700
},
{
"epoch": 0.11967980851230638,
"grad_norm": 0.0032805718947201967,
"learning_rate": 1.9563982222222223e-05,
"loss": 1.1811,
"step": 74800
},
{
"epoch": 0.11983980825630679,
"grad_norm": 126.1537857055664,
"learning_rate": 1.9560426666666667e-05,
"loss": 1.2119,
"step": 74900
},
{
"epoch": 0.1199998080003072,
"grad_norm": 1.2026222944259644,
"learning_rate": 1.9556871111111112e-05,
"loss": 1.4051,
"step": 75000
},
{
"epoch": 0.12015980774430761,
"grad_norm": 1.8911128044128418,
"learning_rate": 1.9553351111111113e-05,
"loss": 1.2587,
"step": 75100
},
{
"epoch": 0.12031980748830802,
"grad_norm": 0.4351516664028168,
"learning_rate": 1.9549795555555558e-05,
"loss": 1.4563,
"step": 75200
},
{
"epoch": 0.12047980723230843,
"grad_norm": 0.004506128840148449,
"learning_rate": 1.9546240000000002e-05,
"loss": 1.5581,
"step": 75300
},
{
"epoch": 0.12063980697630884,
"grad_norm": 0.0002510923077352345,
"learning_rate": 1.9542684444444447e-05,
"loss": 1.5457,
"step": 75400
},
{
"epoch": 0.12079980672030925,
"grad_norm": 0.15088102221488953,
"learning_rate": 1.953912888888889e-05,
"loss": 1.2675,
"step": 75500
},
{
"epoch": 0.12095980646430966,
"grad_norm": 0.05502159520983696,
"learning_rate": 1.9535573333333336e-05,
"loss": 1.0948,
"step": 75600
},
{
"epoch": 0.12111980620831006,
"grad_norm": 0.09219387173652649,
"learning_rate": 1.953201777777778e-05,
"loss": 1.2045,
"step": 75700
},
{
"epoch": 0.12127980595231047,
"grad_norm": 199.2202911376953,
"learning_rate": 1.9528462222222225e-05,
"loss": 1.5964,
"step": 75800
},
{
"epoch": 0.12143980569631088,
"grad_norm": 11.821746826171875,
"learning_rate": 1.9524906666666666e-05,
"loss": 1.0517,
"step": 75900
},
{
"epoch": 0.12159980544031129,
"grad_norm": 106.8429946899414,
"learning_rate": 1.9521351111111114e-05,
"loss": 1.2883,
"step": 76000
},
{
"epoch": 0.1217598051843117,
"grad_norm": 0.08651433885097504,
"learning_rate": 1.9517795555555555e-05,
"loss": 1.2276,
"step": 76100
},
{
"epoch": 0.12191980492831211,
"grad_norm": 79.67507934570312,
"learning_rate": 1.9514240000000003e-05,
"loss": 1.2463,
"step": 76200
},
{
"epoch": 0.12207980467231253,
"grad_norm": 0.6488481163978577,
"learning_rate": 1.9510684444444444e-05,
"loss": 1.241,
"step": 76300
},
{
"epoch": 0.12223980441631294,
"grad_norm": 6.853870391845703,
"learning_rate": 1.9507128888888892e-05,
"loss": 1.8648,
"step": 76400
},
{
"epoch": 0.12239980416031335,
"grad_norm": 0.0021333652548491955,
"learning_rate": 1.9503573333333333e-05,
"loss": 1.4848,
"step": 76500
},
{
"epoch": 0.12255980390431376,
"grad_norm": 0.0014837757917121053,
"learning_rate": 1.950001777777778e-05,
"loss": 1.413,
"step": 76600
},
{
"epoch": 0.12271980364831417,
"grad_norm": 196.25413513183594,
"learning_rate": 1.9496462222222222e-05,
"loss": 1.594,
"step": 76700
},
{
"epoch": 0.12287980339231457,
"grad_norm": 117.68331909179688,
"learning_rate": 1.949290666666667e-05,
"loss": 1.3682,
"step": 76800
},
{
"epoch": 0.12303980313631498,
"grad_norm": 3.5699806213378906,
"learning_rate": 1.948935111111111e-05,
"loss": 1.159,
"step": 76900
},
{
"epoch": 0.12319980288031539,
"grad_norm": 0.051335014402866364,
"learning_rate": 1.9485795555555556e-05,
"loss": 1.4702,
"step": 77000
},
{
"epoch": 0.1233598026243158,
"grad_norm": 0.012798790819942951,
"learning_rate": 1.948224e-05,
"loss": 1.3251,
"step": 77100
},
{
"epoch": 0.12351980236831621,
"grad_norm": 3.92969012260437,
"learning_rate": 1.9478684444444445e-05,
"loss": 1.0538,
"step": 77200
},
{
"epoch": 0.12367980211231662,
"grad_norm": 2.8302226066589355,
"learning_rate": 1.947512888888889e-05,
"loss": 1.1708,
"step": 77300
},
{
"epoch": 0.12383980185631703,
"grad_norm": 0.11278839409351349,
"learning_rate": 1.947160888888889e-05,
"loss": 1.2864,
"step": 77400
},
{
"epoch": 0.12399980160031744,
"grad_norm": 99.1993408203125,
"learning_rate": 1.9468053333333335e-05,
"loss": 1.6501,
"step": 77500
},
{
"epoch": 0.12415980134431785,
"grad_norm": 0.22377841174602509,
"learning_rate": 1.946449777777778e-05,
"loss": 1.0104,
"step": 77600
},
{
"epoch": 0.12431980108831826,
"grad_norm": 99.57634735107422,
"learning_rate": 1.9460942222222225e-05,
"loss": 1.7969,
"step": 77700
},
{
"epoch": 0.12447980083231867,
"grad_norm": 0.9174038767814636,
"learning_rate": 1.945738666666667e-05,
"loss": 1.0293,
"step": 77800
},
{
"epoch": 0.12463980057631908,
"grad_norm": 61.01045227050781,
"learning_rate": 1.9453831111111114e-05,
"loss": 1.5593,
"step": 77900
},
{
"epoch": 0.12479980032031948,
"grad_norm": 2412.276611328125,
"learning_rate": 1.9450275555555558e-05,
"loss": 0.9902,
"step": 78000
},
{
"epoch": 0.12495980006431989,
"grad_norm": 241.99363708496094,
"learning_rate": 1.9446720000000003e-05,
"loss": 1.058,
"step": 78100
},
{
"epoch": 0.1251197998083203,
"grad_norm": 96.88700866699219,
"learning_rate": 1.9443164444444447e-05,
"loss": 1.4039,
"step": 78200
},
{
"epoch": 0.12527979955232071,
"grad_norm": 56.962181091308594,
"learning_rate": 1.943960888888889e-05,
"loss": 1.008,
"step": 78300
},
{
"epoch": 0.12543979929632113,
"grad_norm": 23.272607803344727,
"learning_rate": 1.9436053333333336e-05,
"loss": 1.4593,
"step": 78400
},
{
"epoch": 0.12559979904032154,
"grad_norm": 97.3494644165039,
"learning_rate": 1.9432497777777778e-05,
"loss": 1.563,
"step": 78500
},
{
"epoch": 0.12575979878432195,
"grad_norm": 3.625567674636841,
"learning_rate": 1.9428942222222226e-05,
"loss": 1.1569,
"step": 78600
},
{
"epoch": 0.12591979852832236,
"grad_norm": 63.88728332519531,
"learning_rate": 1.9425386666666667e-05,
"loss": 1.3886,
"step": 78700
},
{
"epoch": 0.12607979827232277,
"grad_norm": 0.533359169960022,
"learning_rate": 1.9421831111111115e-05,
"loss": 1.061,
"step": 78800
},
{
"epoch": 0.12623979801632318,
"grad_norm": 0.0005107554607093334,
"learning_rate": 1.9418275555555556e-05,
"loss": 1.2085,
"step": 78900
},
{
"epoch": 0.1263997977603236,
"grad_norm": 66.7668685913086,
"learning_rate": 1.9414720000000004e-05,
"loss": 1.8553,
"step": 79000
},
{
"epoch": 0.126559797504324,
"grad_norm": 2.3458669185638428,
"learning_rate": 1.9411164444444445e-05,
"loss": 1.7144,
"step": 79100
},
{
"epoch": 0.1267197972483244,
"grad_norm": 101.0086669921875,
"learning_rate": 1.9407608888888893e-05,
"loss": 1.2216,
"step": 79200
},
{
"epoch": 0.12687979699232482,
"grad_norm": 14.662532806396484,
"learning_rate": 1.9404053333333334e-05,
"loss": 1.1646,
"step": 79300
},
{
"epoch": 0.12703979673632523,
"grad_norm": 70.46912384033203,
"learning_rate": 1.9400497777777782e-05,
"loss": 1.7768,
"step": 79400
},
{
"epoch": 0.12719979648032564,
"grad_norm": 3.7776920795440674,
"learning_rate": 1.9396942222222223e-05,
"loss": 1.1314,
"step": 79500
},
{
"epoch": 0.12735979622432603,
"grad_norm": 0.05991614609956741,
"learning_rate": 1.9393386666666668e-05,
"loss": 1.2374,
"step": 79600
},
{
"epoch": 0.12751979596832644,
"grad_norm": 1.138396978378296,
"learning_rate": 1.9389831111111112e-05,
"loss": 1.2681,
"step": 79700
},
{
"epoch": 0.12767979571232685,
"grad_norm": 117.04296875,
"learning_rate": 1.9386275555555557e-05,
"loss": 1.2624,
"step": 79800
},
{
"epoch": 0.12783979545632726,
"grad_norm": 165.1708984375,
"learning_rate": 1.9382755555555558e-05,
"loss": 1.6775,
"step": 79900
},
{
"epoch": 0.12799979520032767,
"grad_norm": 127.26524353027344,
"learning_rate": 1.9379200000000002e-05,
"loss": 1.3587,
"step": 80000
},
{
"epoch": 0.12815979494432808,
"grad_norm": 128.8250274658203,
"learning_rate": 1.9375644444444447e-05,
"loss": 1.7402,
"step": 80100
},
{
"epoch": 0.1283197946883285,
"grad_norm": 83.64952850341797,
"learning_rate": 1.937208888888889e-05,
"loss": 1.5349,
"step": 80200
},
{
"epoch": 0.1284797944323289,
"grad_norm": 2.9033825397491455,
"learning_rate": 1.9368533333333336e-05,
"loss": 0.8546,
"step": 80300
},
{
"epoch": 0.12863979417632931,
"grad_norm": 1.8563624620437622,
"learning_rate": 1.936497777777778e-05,
"loss": 1.3903,
"step": 80400
},
{
"epoch": 0.12879979392032972,
"grad_norm": 0.020641742274165154,
"learning_rate": 1.9361422222222225e-05,
"loss": 1.0712,
"step": 80500
},
{
"epoch": 0.12895979366433014,
"grad_norm": 0.030105268582701683,
"learning_rate": 1.935786666666667e-05,
"loss": 1.6633,
"step": 80600
},
{
"epoch": 0.12911979340833055,
"grad_norm": 7.39204216003418,
"learning_rate": 1.935431111111111e-05,
"loss": 1.4125,
"step": 80700
},
{
"epoch": 0.12927979315233096,
"grad_norm": 0.6996489763259888,
"learning_rate": 1.935075555555556e-05,
"loss": 0.6973,
"step": 80800
},
{
"epoch": 0.12943979289633137,
"grad_norm": 45.01316452026367,
"learning_rate": 1.93472e-05,
"loss": 1.1729,
"step": 80900
},
{
"epoch": 0.12959979264033178,
"grad_norm": 0.2586953938007355,
"learning_rate": 1.9343644444444448e-05,
"loss": 1.2217,
"step": 81000
},
{
"epoch": 0.1297597923843322,
"grad_norm": 0.02437330223619938,
"learning_rate": 1.934008888888889e-05,
"loss": 1.3184,
"step": 81100
},
{
"epoch": 0.1299197921283326,
"grad_norm": 86.13786315917969,
"learning_rate": 1.9336533333333334e-05,
"loss": 1.2718,
"step": 81200
},
{
"epoch": 0.130079791872333,
"grad_norm": 129.18377685546875,
"learning_rate": 1.9332977777777778e-05,
"loss": 1.1913,
"step": 81300
},
{
"epoch": 0.13023979161633342,
"grad_norm": 0.21126429736614227,
"learning_rate": 1.9329422222222223e-05,
"loss": 1.4728,
"step": 81400
},
{
"epoch": 0.13039979136033383,
"grad_norm": 17.239547729492188,
"learning_rate": 1.9325902222222224e-05,
"loss": 1.1221,
"step": 81500
},
{
"epoch": 0.13055979110433424,
"grad_norm": 3.2373907566070557,
"learning_rate": 1.932234666666667e-05,
"loss": 1.235,
"step": 81600
},
{
"epoch": 0.13071979084833465,
"grad_norm": 5.152343273162842,
"learning_rate": 1.9318791111111113e-05,
"loss": 1.3497,
"step": 81700
},
{
"epoch": 0.13087979059233507,
"grad_norm": 88.2583236694336,
"learning_rate": 1.9315235555555558e-05,
"loss": 1.2361,
"step": 81800
},
{
"epoch": 0.13103979033633545,
"grad_norm": 0.001005143509246409,
"learning_rate": 1.9311680000000002e-05,
"loss": 2.0015,
"step": 81900
},
{
"epoch": 0.13119979008033586,
"grad_norm": 0.3949466347694397,
"learning_rate": 1.9308124444444447e-05,
"loss": 1.2259,
"step": 82000
},
{
"epoch": 0.13135978982433627,
"grad_norm": 0.6978406310081482,
"learning_rate": 1.930456888888889e-05,
"loss": 0.9236,
"step": 82100
},
{
"epoch": 0.13151978956833668,
"grad_norm": 0.6740103363990784,
"learning_rate": 1.9301013333333332e-05,
"loss": 1.5339,
"step": 82200
},
{
"epoch": 0.1316797893123371,
"grad_norm": 0.007084805518388748,
"learning_rate": 1.929745777777778e-05,
"loss": 1.2036,
"step": 82300
},
{
"epoch": 0.1318397890563375,
"grad_norm": 88.51764678955078,
"learning_rate": 1.929390222222222e-05,
"loss": 1.2631,
"step": 82400
},
{
"epoch": 0.1319997888003379,
"grad_norm": 0.000969950866419822,
"learning_rate": 1.929034666666667e-05,
"loss": 1.0858,
"step": 82500
},
{
"epoch": 0.13215978854433832,
"grad_norm": 0.10399264842271805,
"learning_rate": 1.928679111111111e-05,
"loss": 1.635,
"step": 82600
},
{
"epoch": 0.13231978828833874,
"grad_norm": 39.86758804321289,
"learning_rate": 1.928323555555556e-05,
"loss": 1.285,
"step": 82700
},
{
"epoch": 0.13247978803233915,
"grad_norm": 2.6627957820892334,
"learning_rate": 1.927968e-05,
"loss": 1.1209,
"step": 82800
},
{
"epoch": 0.13263978777633956,
"grad_norm": 0.2310008406639099,
"learning_rate": 1.9276124444444448e-05,
"loss": 1.4032,
"step": 82900
},
{
"epoch": 0.13279978752033997,
"grad_norm": 76.39102935791016,
"learning_rate": 1.927256888888889e-05,
"loss": 1.1279,
"step": 83000
},
{
"epoch": 0.13295978726434038,
"grad_norm": 0.5016289949417114,
"learning_rate": 1.9269013333333337e-05,
"loss": 1.5145,
"step": 83100
},
{
"epoch": 0.1331197870083408,
"grad_norm": 0.2468506544828415,
"learning_rate": 1.9265457777777778e-05,
"loss": 1.4923,
"step": 83200
},
{
"epoch": 0.1332797867523412,
"grad_norm": 0.03473009541630745,
"learning_rate": 1.9261902222222222e-05,
"loss": 0.9845,
"step": 83300
},
{
"epoch": 0.1334397864963416,
"grad_norm": 15.979637145996094,
"learning_rate": 1.9258346666666667e-05,
"loss": 1.3847,
"step": 83400
},
{
"epoch": 0.13359978624034202,
"grad_norm": 0.13443760573863983,
"learning_rate": 1.925479111111111e-05,
"loss": 1.0149,
"step": 83500
},
{
"epoch": 0.13375978598434243,
"grad_norm": 0.09114881604909897,
"learning_rate": 1.9251235555555556e-05,
"loss": 1.2644,
"step": 83600
},
{
"epoch": 0.13391978572834284,
"grad_norm": 67.14397430419922,
"learning_rate": 1.924768e-05,
"loss": 1.2981,
"step": 83700
},
{
"epoch": 0.13407978547234325,
"grad_norm": 9.479357719421387,
"learning_rate": 1.9244124444444445e-05,
"loss": 1.6903,
"step": 83800
},
{
"epoch": 0.13423978521634367,
"grad_norm": 222.48973083496094,
"learning_rate": 1.924056888888889e-05,
"loss": 1.2846,
"step": 83900
},
{
"epoch": 0.13439978496034408,
"grad_norm": 0.24070465564727783,
"learning_rate": 1.9237013333333334e-05,
"loss": 1.4647,
"step": 84000
},
{
"epoch": 0.13455978470434446,
"grad_norm": 1444.743896484375,
"learning_rate": 1.923345777777778e-05,
"loss": 1.1213,
"step": 84100
},
{
"epoch": 0.13471978444834487,
"grad_norm": 1.174815058708191,
"learning_rate": 1.9229902222222223e-05,
"loss": 1.1379,
"step": 84200
},
{
"epoch": 0.13487978419234528,
"grad_norm": 120.12804412841797,
"learning_rate": 1.9226346666666668e-05,
"loss": 1.2793,
"step": 84300
},
{
"epoch": 0.1350397839363457,
"grad_norm": 89.50218200683594,
"learning_rate": 1.9222791111111113e-05,
"loss": 1.343,
"step": 84400
},
{
"epoch": 0.1351997836803461,
"grad_norm": 100.86327362060547,
"learning_rate": 1.9219235555555557e-05,
"loss": 1.8342,
"step": 84500
},
{
"epoch": 0.1353597834243465,
"grad_norm": 99.5421371459961,
"learning_rate": 1.921568e-05,
"loss": 1.0487,
"step": 84600
},
{
"epoch": 0.13551978316834692,
"grad_norm": 0.0015393303474411368,
"learning_rate": 1.9212124444444446e-05,
"loss": 1.1531,
"step": 84700
},
{
"epoch": 0.13567978291234734,
"grad_norm": 3.457564353942871,
"learning_rate": 1.920856888888889e-05,
"loss": 0.8552,
"step": 84800
},
{
"epoch": 0.13583978265634775,
"grad_norm": 0.00041561800753697753,
"learning_rate": 1.9205013333333335e-05,
"loss": 1.1422,
"step": 84900
},
{
"epoch": 0.13599978240034816,
"grad_norm": 0.0012223550584167242,
"learning_rate": 1.920145777777778e-05,
"loss": 1.0918,
"step": 85000
},
{
"epoch": 0.13615978214434857,
"grad_norm": 0.21084783971309662,
"learning_rate": 1.9197902222222224e-05,
"loss": 1.2873,
"step": 85100
},
{
"epoch": 0.13631978188834898,
"grad_norm": 24.241910934448242,
"learning_rate": 1.919434666666667e-05,
"loss": 1.547,
"step": 85200
},
{
"epoch": 0.1364797816323494,
"grad_norm": 47.714027404785156,
"learning_rate": 1.9190791111111114e-05,
"loss": 1.5094,
"step": 85300
},
{
"epoch": 0.1366397813763498,
"grad_norm": 6.054490089416504,
"learning_rate": 1.9187235555555558e-05,
"loss": 1.051,
"step": 85400
},
{
"epoch": 0.1367997811203502,
"grad_norm": 0.001112865749746561,
"learning_rate": 1.9183680000000003e-05,
"loss": 0.9952,
"step": 85500
},
{
"epoch": 0.13695978086435062,
"grad_norm": 0.015406353399157524,
"learning_rate": 1.9180124444444447e-05,
"loss": 1.1978,
"step": 85600
},
{
"epoch": 0.13711978060835103,
"grad_norm": 17.65171241760254,
"learning_rate": 1.9176604444444445e-05,
"loss": 1.5221,
"step": 85700
},
{
"epoch": 0.13727978035235144,
"grad_norm": 14.018333435058594,
"learning_rate": 1.917304888888889e-05,
"loss": 1.3841,
"step": 85800
},
{
"epoch": 0.13743978009635185,
"grad_norm": 0.0006000687135383487,
"learning_rate": 1.9169493333333334e-05,
"loss": 1.3999,
"step": 85900
},
{
"epoch": 0.13759977984035227,
"grad_norm": 1715.1507568359375,
"learning_rate": 1.916593777777778e-05,
"loss": 1.5574,
"step": 86000
},
{
"epoch": 0.13775977958435268,
"grad_norm": 4.6950907707214355,
"learning_rate": 1.9162382222222223e-05,
"loss": 1.3267,
"step": 86100
},
{
"epoch": 0.1379197793283531,
"grad_norm": 0.8909225463867188,
"learning_rate": 1.9158826666666668e-05,
"loss": 1.358,
"step": 86200
},
{
"epoch": 0.1380797790723535,
"grad_norm": 27.72040367126465,
"learning_rate": 1.9155271111111112e-05,
"loss": 1.5441,
"step": 86300
},
{
"epoch": 0.13823977881635388,
"grad_norm": 120.01333618164062,
"learning_rate": 1.9151715555555557e-05,
"loss": 1.4124,
"step": 86400
},
{
"epoch": 0.1383997785603543,
"grad_norm": 27.406797409057617,
"learning_rate": 1.914816e-05,
"loss": 0.8352,
"step": 86500
},
{
"epoch": 0.1385597783043547,
"grad_norm": 0.07549207657575607,
"learning_rate": 1.9144604444444446e-05,
"loss": 1.2549,
"step": 86600
},
{
"epoch": 0.1387197780483551,
"grad_norm": 1.3746123313903809,
"learning_rate": 1.914104888888889e-05,
"loss": 1.4328,
"step": 86700
},
{
"epoch": 0.13887977779235552,
"grad_norm": 0.002391360467299819,
"learning_rate": 1.9137493333333335e-05,
"loss": 1.2577,
"step": 86800
},
{
"epoch": 0.13903977753635594,
"grad_norm": 39.3692626953125,
"learning_rate": 1.913393777777778e-05,
"loss": 1.4417,
"step": 86900
},
{
"epoch": 0.13919977728035635,
"grad_norm": 0.0015022088773548603,
"learning_rate": 1.9130382222222224e-05,
"loss": 1.1927,
"step": 87000
},
{
"epoch": 0.13935977702435676,
"grad_norm": 3.776437520980835,
"learning_rate": 1.912682666666667e-05,
"loss": 1.4435,
"step": 87100
},
{
"epoch": 0.13951977676835717,
"grad_norm": 9.693673133850098,
"learning_rate": 1.9123271111111113e-05,
"loss": 1.3579,
"step": 87200
},
{
"epoch": 0.13967977651235758,
"grad_norm": 47.54679870605469,
"learning_rate": 1.9119751111111114e-05,
"loss": 1.3883,
"step": 87300
},
{
"epoch": 0.139839776256358,
"grad_norm": 57.24945068359375,
"learning_rate": 1.9116195555555555e-05,
"loss": 1.2645,
"step": 87400
},
{
"epoch": 0.1399997760003584,
"grad_norm": 0.0025031184777617455,
"learning_rate": 1.9112640000000003e-05,
"loss": 1.1366,
"step": 87500
},
{
"epoch": 0.1401597757443588,
"grad_norm": 0.015484058298170567,
"learning_rate": 1.9109084444444445e-05,
"loss": 1.4566,
"step": 87600
},
{
"epoch": 0.14031977548835922,
"grad_norm": 0.24919560551643372,
"learning_rate": 1.9105528888888893e-05,
"loss": 1.447,
"step": 87700
},
{
"epoch": 0.14047977523235963,
"grad_norm": 74.2865219116211,
"learning_rate": 1.9101973333333334e-05,
"loss": 1.0701,
"step": 87800
},
{
"epoch": 0.14063977497636004,
"grad_norm": 127.0066909790039,
"learning_rate": 1.909841777777778e-05,
"loss": 1.3449,
"step": 87900
},
{
"epoch": 0.14079977472036045,
"grad_norm": 87.54583740234375,
"learning_rate": 1.9094862222222223e-05,
"loss": 1.4331,
"step": 88000
},
{
"epoch": 0.14095977446436087,
"grad_norm": 0.001399531727656722,
"learning_rate": 1.909130666666667e-05,
"loss": 1.3965,
"step": 88100
},
{
"epoch": 0.14111977420836128,
"grad_norm": 69.87310028076172,
"learning_rate": 1.9087751111111112e-05,
"loss": 1.347,
"step": 88200
},
{
"epoch": 0.1412797739523617,
"grad_norm": 69.38946533203125,
"learning_rate": 1.9084195555555556e-05,
"loss": 1.0262,
"step": 88300
},
{
"epoch": 0.1414397736963621,
"grad_norm": 18.69589614868164,
"learning_rate": 1.908064e-05,
"loss": 1.0787,
"step": 88400
},
{
"epoch": 0.1415997734403625,
"grad_norm": 0.20538243651390076,
"learning_rate": 1.9077084444444446e-05,
"loss": 1.3829,
"step": 88500
},
{
"epoch": 0.1417597731843629,
"grad_norm": 0.0005450706230476499,
"learning_rate": 1.907352888888889e-05,
"loss": 1.2001,
"step": 88600
},
{
"epoch": 0.1419197729283633,
"grad_norm": 2.548616409301758,
"learning_rate": 1.9069973333333335e-05,
"loss": 1.2407,
"step": 88700
},
{
"epoch": 0.1420797726723637,
"grad_norm": 139.437744140625,
"learning_rate": 1.906641777777778e-05,
"loss": 1.6291,
"step": 88800
},
{
"epoch": 0.14223977241636412,
"grad_norm": 10.629435539245605,
"learning_rate": 1.9062862222222224e-05,
"loss": 1.1502,
"step": 88900
},
{
"epoch": 0.14239977216036454,
"grad_norm": 3.494685411453247,
"learning_rate": 1.905930666666667e-05,
"loss": 1.2155,
"step": 89000
},
{
"epoch": 0.14255977190436495,
"grad_norm": 107.18891143798828,
"learning_rate": 1.9055751111111113e-05,
"loss": 1.3381,
"step": 89100
},
{
"epoch": 0.14271977164836536,
"grad_norm": 2575.91796875,
"learning_rate": 1.9052195555555557e-05,
"loss": 0.819,
"step": 89200
},
{
"epoch": 0.14287977139236577,
"grad_norm": 302.19500732421875,
"learning_rate": 1.9048640000000002e-05,
"loss": 1.0402,
"step": 89300
},
{
"epoch": 0.14303977113636618,
"grad_norm": 87.07076263427734,
"learning_rate": 1.9045084444444447e-05,
"loss": 1.1062,
"step": 89400
},
{
"epoch": 0.1431997708803666,
"grad_norm": 5.228755950927734,
"learning_rate": 1.9041528888888888e-05,
"loss": 1.6693,
"step": 89500
},
{
"epoch": 0.143359770624367,
"grad_norm": 0.20638461410999298,
"learning_rate": 1.9037973333333336e-05,
"loss": 1.1991,
"step": 89600
},
{
"epoch": 0.1435197703683674,
"grad_norm": 114.9300308227539,
"learning_rate": 1.9034417777777777e-05,
"loss": 1.3535,
"step": 89700
},
{
"epoch": 0.14367977011236782,
"grad_norm": 86.26241302490234,
"learning_rate": 1.9030862222222225e-05,
"loss": 1.6776,
"step": 89800
},
{
"epoch": 0.14383976985636823,
"grad_norm": 78.20118713378906,
"learning_rate": 1.9027306666666666e-05,
"loss": 1.2221,
"step": 89900
},
{
"epoch": 0.14399976960036864,
"grad_norm": 7.4184088706970215,
"learning_rate": 1.9023751111111114e-05,
"loss": 1.0253,
"step": 90000
},
{
"epoch": 0.14415976934436905,
"grad_norm": 99.789794921875,
"learning_rate": 1.9020195555555555e-05,
"loss": 1.0469,
"step": 90100
},
{
"epoch": 0.14431976908836947,
"grad_norm": 0.003789502428844571,
"learning_rate": 1.9016640000000003e-05,
"loss": 1.2465,
"step": 90200
},
{
"epoch": 0.14447976883236988,
"grad_norm": 0.27373766899108887,
"learning_rate": 1.9013084444444444e-05,
"loss": 1.4068,
"step": 90300
},
{
"epoch": 0.1446397685763703,
"grad_norm": 101.30089569091797,
"learning_rate": 1.9009528888888892e-05,
"loss": 1.5961,
"step": 90400
},
{
"epoch": 0.1447997683203707,
"grad_norm": 0.24238981306552887,
"learning_rate": 1.9005973333333333e-05,
"loss": 1.0579,
"step": 90500
},
{
"epoch": 0.1449597680643711,
"grad_norm": 0.6612280011177063,
"learning_rate": 1.900241777777778e-05,
"loss": 0.941,
"step": 90600
},
{
"epoch": 0.14511976780837152,
"grad_norm": 3.1052684783935547,
"learning_rate": 1.8998862222222222e-05,
"loss": 1.1861,
"step": 90700
},
{
"epoch": 0.14527976755237193,
"grad_norm": 9.876090049743652,
"learning_rate": 1.8995306666666667e-05,
"loss": 1.4697,
"step": 90800
},
{
"epoch": 0.1454397672963723,
"grad_norm": 32.829795837402344,
"learning_rate": 1.899175111111111e-05,
"loss": 0.6486,
"step": 90900
},
{
"epoch": 0.14559976704037272,
"grad_norm": 2219.107177734375,
"learning_rate": 1.8988195555555556e-05,
"loss": 1.3865,
"step": 91000
},
{
"epoch": 0.14575976678437313,
"grad_norm": 0.2465362697839737,
"learning_rate": 1.898464e-05,
"loss": 1.1494,
"step": 91100
},
{
"epoch": 0.14591976652837355,
"grad_norm": 0.06304822117090225,
"learning_rate": 1.8981084444444445e-05,
"loss": 1.3623,
"step": 91200
},
{
"epoch": 0.14607976627237396,
"grad_norm": 0.0003378583351150155,
"learning_rate": 1.897752888888889e-05,
"loss": 1.2193,
"step": 91300
},
{
"epoch": 0.14623976601637437,
"grad_norm": 0.023890919983386993,
"learning_rate": 1.8973973333333334e-05,
"loss": 1.3003,
"step": 91400
},
{
"epoch": 0.14639976576037478,
"grad_norm": 0.20042432844638824,
"learning_rate": 1.8970453333333335e-05,
"loss": 1.2608,
"step": 91500
},
{
"epoch": 0.1465597655043752,
"grad_norm": 0.4310738742351532,
"learning_rate": 1.896689777777778e-05,
"loss": 1.2544,
"step": 91600
},
{
"epoch": 0.1467197652483756,
"grad_norm": 6.881536960601807,
"learning_rate": 1.8963342222222224e-05,
"loss": 1.332,
"step": 91700
},
{
"epoch": 0.146879764992376,
"grad_norm": 34.862266540527344,
"learning_rate": 1.895978666666667e-05,
"loss": 1.3548,
"step": 91800
},
{
"epoch": 0.14703976473637642,
"grad_norm": 41.60286331176758,
"learning_rate": 1.8956231111111114e-05,
"loss": 1.54,
"step": 91900
},
{
"epoch": 0.14719976448037683,
"grad_norm": 0.21723419427871704,
"learning_rate": 1.8952675555555558e-05,
"loss": 1.3125,
"step": 92000
},
{
"epoch": 0.14735976422437724,
"grad_norm": 12.313715934753418,
"learning_rate": 1.894912e-05,
"loss": 0.897,
"step": 92100
},
{
"epoch": 0.14751976396837765,
"grad_norm": 9.945670171873644e-05,
"learning_rate": 1.8945564444444447e-05,
"loss": 1.1594,
"step": 92200
},
{
"epoch": 0.14767976371237806,
"grad_norm": 0.00018985375936608762,
"learning_rate": 1.894200888888889e-05,
"loss": 0.9194,
"step": 92300
},
{
"epoch": 0.14783976345637848,
"grad_norm": 6.04590368270874,
"learning_rate": 1.8938453333333336e-05,
"loss": 1.2209,
"step": 92400
},
{
"epoch": 0.1479997632003789,
"grad_norm": 0.19547709822654724,
"learning_rate": 1.8934897777777777e-05,
"loss": 1.0027,
"step": 92500
},
{
"epoch": 0.1481597629443793,
"grad_norm": 0.00870482623577118,
"learning_rate": 1.8931342222222225e-05,
"loss": 1.4675,
"step": 92600
},
{
"epoch": 0.1483197626883797,
"grad_norm": 112.5121078491211,
"learning_rate": 1.8927786666666667e-05,
"loss": 1.3982,
"step": 92700
},
{
"epoch": 0.14847976243238012,
"grad_norm": 0.000352471019141376,
"learning_rate": 1.8924231111111115e-05,
"loss": 0.8595,
"step": 92800
},
{
"epoch": 0.14863976217638053,
"grad_norm": 0.20730045437812805,
"learning_rate": 1.8920675555555556e-05,
"loss": 1.572,
"step": 92900
},
{
"epoch": 0.14879976192038094,
"grad_norm": 5.9496917724609375,
"learning_rate": 1.8917120000000004e-05,
"loss": 1.2832,
"step": 93000
},
{
"epoch": 0.14895976166438132,
"grad_norm": 0.012922318652272224,
"learning_rate": 1.8913564444444445e-05,
"loss": 1.2838,
"step": 93100
},
{
"epoch": 0.14911976140838173,
"grad_norm": 40.53496170043945,
"learning_rate": 1.8910008888888893e-05,
"loss": 1.6535,
"step": 93200
},
{
"epoch": 0.14927976115238215,
"grad_norm": 106.17526245117188,
"learning_rate": 1.8906453333333334e-05,
"loss": 1.5996,
"step": 93300
},
{
"epoch": 0.14943976089638256,
"grad_norm": 93.25550079345703,
"learning_rate": 1.890289777777778e-05,
"loss": 1.058,
"step": 93400
},
{
"epoch": 0.14959976064038297,
"grad_norm": 83.99794006347656,
"learning_rate": 1.8899342222222223e-05,
"loss": 1.3316,
"step": 93500
},
{
"epoch": 0.14975976038438338,
"grad_norm": 0.0036302392836660147,
"learning_rate": 1.8895822222222224e-05,
"loss": 0.8627,
"step": 93600
},
{
"epoch": 0.1499197601283838,
"grad_norm": 4.188179969787598,
"learning_rate": 1.889226666666667e-05,
"loss": 1.4411,
"step": 93700
},
{
"epoch": 0.1500797598723842,
"grad_norm": 0.38291868567466736,
"learning_rate": 1.8888711111111113e-05,
"loss": 0.9331,
"step": 93800
},
{
"epoch": 0.1502397596163846,
"grad_norm": 2.517091751098633,
"learning_rate": 1.8885155555555558e-05,
"loss": 1.0032,
"step": 93900
},
{
"epoch": 0.15039975936038502,
"grad_norm": 128.78472900390625,
"learning_rate": 1.8881600000000002e-05,
"loss": 1.2341,
"step": 94000
},
{
"epoch": 0.15055975910438543,
"grad_norm": 0.0004920060164295137,
"learning_rate": 1.8878044444444447e-05,
"loss": 1.3369,
"step": 94100
},
{
"epoch": 0.15071975884838584,
"grad_norm": 36.320411682128906,
"learning_rate": 1.887448888888889e-05,
"loss": 1.2324,
"step": 94200
},
{
"epoch": 0.15087975859238625,
"grad_norm": 34.70246887207031,
"learning_rate": 1.8870933333333336e-05,
"loss": 1.6952,
"step": 94300
},
{
"epoch": 0.15103975833638666,
"grad_norm": 0.5894516110420227,
"learning_rate": 1.886737777777778e-05,
"loss": 1.2401,
"step": 94400
},
{
"epoch": 0.15119975808038708,
"grad_norm": 0.0024842778220772743,
"learning_rate": 1.8863822222222222e-05,
"loss": 1.2998,
"step": 94500
},
{
"epoch": 0.1513597578243875,
"grad_norm": 0.16776002943515778,
"learning_rate": 1.886026666666667e-05,
"loss": 1.1458,
"step": 94600
},
{
"epoch": 0.1515197575683879,
"grad_norm": 0.008235426619648933,
"learning_rate": 1.885671111111111e-05,
"loss": 1.0211,
"step": 94700
},
{
"epoch": 0.1516797573123883,
"grad_norm": 82.82616424560547,
"learning_rate": 1.8853191111111112e-05,
"loss": 0.9866,
"step": 94800
},
{
"epoch": 0.15183975705638872,
"grad_norm": 0.0073872278444468975,
"learning_rate": 1.8849635555555556e-05,
"loss": 1.3636,
"step": 94900
},
{
"epoch": 0.15199975680038913,
"grad_norm": 11.451104164123535,
"learning_rate": 1.884608e-05,
"loss": 1.1485,
"step": 95000
},
{
"epoch": 0.15215975654438954,
"grad_norm": 100.47103118896484,
"learning_rate": 1.8842524444444446e-05,
"loss": 0.7671,
"step": 95100
},
{
"epoch": 0.15231975628838995,
"grad_norm": 0.4101124703884125,
"learning_rate": 1.883896888888889e-05,
"loss": 1.0069,
"step": 95200
},
{
"epoch": 0.15247975603239033,
"grad_norm": 37.40227508544922,
"learning_rate": 1.8835413333333335e-05,
"loss": 1.1276,
"step": 95300
},
{
"epoch": 0.15263975577639075,
"grad_norm": 141.687744140625,
"learning_rate": 1.883185777777778e-05,
"loss": 1.4477,
"step": 95400
},
{
"epoch": 0.15279975552039116,
"grad_norm": 0.008716798387467861,
"learning_rate": 1.8828302222222224e-05,
"loss": 0.9887,
"step": 95500
},
{
"epoch": 0.15295975526439157,
"grad_norm": 7.543517858721316e-05,
"learning_rate": 1.882474666666667e-05,
"loss": 1.065,
"step": 95600
},
{
"epoch": 0.15311975500839198,
"grad_norm": 6.7989821434021,
"learning_rate": 1.8821191111111113e-05,
"loss": 0.982,
"step": 95700
},
{
"epoch": 0.1532797547523924,
"grad_norm": 27.62921714782715,
"learning_rate": 1.8817635555555557e-05,
"loss": 1.1166,
"step": 95800
},
{
"epoch": 0.1534397544963928,
"grad_norm": 28.467132568359375,
"learning_rate": 1.8814080000000002e-05,
"loss": 1.3949,
"step": 95900
},
{
"epoch": 0.1535997542403932,
"grad_norm": 89.31570434570312,
"learning_rate": 1.8810524444444447e-05,
"loss": 1.4164,
"step": 96000
},
{
"epoch": 0.15375975398439362,
"grad_norm": 0.30763378739356995,
"learning_rate": 1.880696888888889e-05,
"loss": 1.7997,
"step": 96100
},
{
"epoch": 0.15391975372839403,
"grad_norm": 90.01514434814453,
"learning_rate": 1.8803413333333336e-05,
"loss": 1.3941,
"step": 96200
},
{
"epoch": 0.15407975347239444,
"grad_norm": 0.9498651027679443,
"learning_rate": 1.879985777777778e-05,
"loss": 1.0592,
"step": 96300
},
{
"epoch": 0.15423975321639485,
"grad_norm": 88.10832977294922,
"learning_rate": 1.8796302222222225e-05,
"loss": 1.1661,
"step": 96400
},
{
"epoch": 0.15439975296039526,
"grad_norm": 3.867802143096924,
"learning_rate": 1.879274666666667e-05,
"loss": 1.5968,
"step": 96500
},
{
"epoch": 0.15455975270439568,
"grad_norm": 114.89385986328125,
"learning_rate": 1.8789191111111114e-05,
"loss": 1.2586,
"step": 96600
},
{
"epoch": 0.1547197524483961,
"grad_norm": 0.009951179847121239,
"learning_rate": 1.878563555555556e-05,
"loss": 1.5164,
"step": 96700
},
{
"epoch": 0.1548797521923965,
"grad_norm": 1.848288893699646,
"learning_rate": 1.8782080000000003e-05,
"loss": 1.5942,
"step": 96800
},
{
"epoch": 0.1550397519363969,
"grad_norm": 56.26310348510742,
"learning_rate": 1.8778524444444448e-05,
"loss": 0.6635,
"step": 96900
},
{
"epoch": 0.15519975168039732,
"grad_norm": 0.2863824963569641,
"learning_rate": 1.8774968888888892e-05,
"loss": 1.3037,
"step": 97000
},
{
"epoch": 0.15535975142439773,
"grad_norm": 0.006738504860550165,
"learning_rate": 1.8771413333333333e-05,
"loss": 1.3557,
"step": 97100
},
{
"epoch": 0.15551975116839814,
"grad_norm": 0.24526001513004303,
"learning_rate": 1.876785777777778e-05,
"loss": 1.0864,
"step": 97200
},
{
"epoch": 0.15567975091239855,
"grad_norm": 70.70162963867188,
"learning_rate": 1.8764302222222222e-05,
"loss": 1.3139,
"step": 97300
},
{
"epoch": 0.15583975065639896,
"grad_norm": 0.7548888921737671,
"learning_rate": 1.8760746666666667e-05,
"loss": 0.7139,
"step": 97400
},
{
"epoch": 0.15599975040039937,
"grad_norm": 0.08793803304433823,
"learning_rate": 1.875719111111111e-05,
"loss": 1.1084,
"step": 97500
},
{
"epoch": 0.15615975014439976,
"grad_norm": 8.044859886169434,
"learning_rate": 1.8753635555555556e-05,
"loss": 1.2294,
"step": 97600
},
{
"epoch": 0.15631974988840017,
"grad_norm": 0.4635624587535858,
"learning_rate": 1.875008e-05,
"loss": 0.9581,
"step": 97700
},
{
"epoch": 0.15647974963240058,
"grad_norm": 0.0022484343498945236,
"learning_rate": 1.8746524444444445e-05,
"loss": 1.2983,
"step": 97800
},
{
"epoch": 0.156639749376401,
"grad_norm": 2.357697010040283,
"learning_rate": 1.874296888888889e-05,
"loss": 1.8281,
"step": 97900
},
{
"epoch": 0.1567997491204014,
"grad_norm": 78.0554428100586,
"learning_rate": 1.8739413333333334e-05,
"loss": 1.2914,
"step": 98000
},
{
"epoch": 0.1569597488644018,
"grad_norm": 0.6091700196266174,
"learning_rate": 1.873585777777778e-05,
"loss": 0.8656,
"step": 98100
},
{
"epoch": 0.15711974860840222,
"grad_norm": 0.20535144209861755,
"learning_rate": 1.8732302222222223e-05,
"loss": 1.3438,
"step": 98200
},
{
"epoch": 0.15727974835240263,
"grad_norm": 0.029342494904994965,
"learning_rate": 1.8728746666666668e-05,
"loss": 1.465,
"step": 98300
},
{
"epoch": 0.15743974809640304,
"grad_norm": 0.20423032343387604,
"learning_rate": 1.8725191111111112e-05,
"loss": 1.2253,
"step": 98400
},
{
"epoch": 0.15759974784040345,
"grad_norm": 0.020203936845064163,
"learning_rate": 1.8721635555555557e-05,
"loss": 1.3481,
"step": 98500
},
{
"epoch": 0.15775974758440386,
"grad_norm": 0.001091059297323227,
"learning_rate": 1.871808e-05,
"loss": 1.5131,
"step": 98600
},
{
"epoch": 0.15791974732840428,
"grad_norm": 42.3817253112793,
"learning_rate": 1.8714524444444446e-05,
"loss": 1.4852,
"step": 98700
},
{
"epoch": 0.15807974707240469,
"grad_norm": 11.986414909362793,
"learning_rate": 1.871096888888889e-05,
"loss": 1.1317,
"step": 98800
},
{
"epoch": 0.1582397468164051,
"grad_norm": 6.878232002258301,
"learning_rate": 1.8707413333333335e-05,
"loss": 1.0395,
"step": 98900
},
{
"epoch": 0.1583997465604055,
"grad_norm": 0.011188351549208164,
"learning_rate": 1.8703893333333333e-05,
"loss": 0.9256,
"step": 99000
},
{
"epoch": 0.15855974630440592,
"grad_norm": 0.03425045683979988,
"learning_rate": 1.870033777777778e-05,
"loss": 0.9774,
"step": 99100
},
{
"epoch": 0.15871974604840633,
"grad_norm": 26.11473846435547,
"learning_rate": 1.8696782222222222e-05,
"loss": 0.9756,
"step": 99200
},
{
"epoch": 0.15887974579240674,
"grad_norm": 0.0001582380209583789,
"learning_rate": 1.869322666666667e-05,
"loss": 1.4885,
"step": 99300
},
{
"epoch": 0.15903974553640715,
"grad_norm": 0.1462339162826538,
"learning_rate": 1.868967111111111e-05,
"loss": 1.2373,
"step": 99400
},
{
"epoch": 0.15919974528040756,
"grad_norm": 20.499425888061523,
"learning_rate": 1.8686115555555556e-05,
"loss": 1.3868,
"step": 99500
},
{
"epoch": 0.15935974502440797,
"grad_norm": 89.42505645751953,
"learning_rate": 1.868256e-05,
"loss": 0.9238,
"step": 99600
},
{
"epoch": 0.15951974476840838,
"grad_norm": 4.4118266105651855,
"learning_rate": 1.8679004444444445e-05,
"loss": 1.0793,
"step": 99700
},
{
"epoch": 0.15967974451240877,
"grad_norm": 0.08320512622594833,
"learning_rate": 1.867544888888889e-05,
"loss": 1.2405,
"step": 99800
},
{
"epoch": 0.15983974425640918,
"grad_norm": 1.5630072355270386,
"learning_rate": 1.8671893333333334e-05,
"loss": 1.2417,
"step": 99900
},
{
"epoch": 0.1599997440004096,
"grad_norm": 1.7299790382385254,
"learning_rate": 1.866833777777778e-05,
"loss": 1.1264,
"step": 100000
}
],
"logging_steps": 100,
"max_steps": 625001,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}