smallcoder-303m / trainer_state.json
Beebey's picture
Upload folder using huggingface_hub
2b66fa5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25599901577555206,
"eval_steps": 500,
"global_step": 22889,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00011184368726268168,
"grad_norm": 0.6328383684158325,
"learning_rate": 4.5e-06,
"loss": 1.734,
"step": 10
},
{
"epoch": 0.00022368737452536336,
"grad_norm": 0.566952109336853,
"learning_rate": 9.5e-06,
"loss": 1.6903,
"step": 20
},
{
"epoch": 0.00033553106178804503,
"grad_norm": 0.5359939932823181,
"learning_rate": 1.4500000000000002e-05,
"loss": 1.6266,
"step": 30
},
{
"epoch": 0.0004473747490507267,
"grad_norm": 0.4729914367198944,
"learning_rate": 1.95e-05,
"loss": 1.5731,
"step": 40
},
{
"epoch": 0.0005592184363134084,
"grad_norm": 0.42020025849342346,
"learning_rate": 2.4500000000000003e-05,
"loss": 1.5335,
"step": 50
},
{
"epoch": 0.0006710621235760901,
"grad_norm": 0.4461672604084015,
"learning_rate": 2.95e-05,
"loss": 1.4851,
"step": 60
},
{
"epoch": 0.0007829058108387717,
"grad_norm": 0.4443751275539398,
"learning_rate": 3.4500000000000005e-05,
"loss": 1.4431,
"step": 70
},
{
"epoch": 0.0008947494981014534,
"grad_norm": 0.4204632639884949,
"learning_rate": 3.95e-05,
"loss": 1.4036,
"step": 80
},
{
"epoch": 0.0010065931853641351,
"grad_norm": 0.3985028862953186,
"learning_rate": 4.45e-05,
"loss": 1.3725,
"step": 90
},
{
"epoch": 0.0011184368726268167,
"grad_norm": 0.4111650586128235,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.3527,
"step": 100
},
{
"epoch": 0.0012302805598894985,
"grad_norm": 0.4175569713115692,
"learning_rate": 5.45e-05,
"loss": 1.3431,
"step": 110
},
{
"epoch": 0.0013421242471521801,
"grad_norm": 0.3871678411960602,
"learning_rate": 5.9499999999999996e-05,
"loss": 1.3322,
"step": 120
},
{
"epoch": 0.0014539679344148617,
"grad_norm": 0.39584827423095703,
"learning_rate": 6.450000000000001e-05,
"loss": 1.3075,
"step": 130
},
{
"epoch": 0.0015658116216775435,
"grad_norm": 0.4165605902671814,
"learning_rate": 6.950000000000001e-05,
"loss": 1.286,
"step": 140
},
{
"epoch": 0.001677655308940225,
"grad_norm": 0.3985513150691986,
"learning_rate": 7.45e-05,
"loss": 1.2567,
"step": 150
},
{
"epoch": 0.0017894989962029069,
"grad_norm": 0.39112743735313416,
"learning_rate": 7.950000000000001e-05,
"loss": 1.2448,
"step": 160
},
{
"epoch": 0.0019013426834655885,
"grad_norm": 0.3867124915122986,
"learning_rate": 8.450000000000001e-05,
"loss": 1.2405,
"step": 170
},
{
"epoch": 0.0020131863707282703,
"grad_norm": 0.3955863416194916,
"learning_rate": 8.95e-05,
"loss": 1.2123,
"step": 180
},
{
"epoch": 0.002125030057990952,
"grad_norm": 0.40293410420417786,
"learning_rate": 9.45e-05,
"loss": 1.2081,
"step": 190
},
{
"epoch": 0.0022368737452536334,
"grad_norm": 0.3828902542591095,
"learning_rate": 9.95e-05,
"loss": 1.2049,
"step": 200
},
{
"epoch": 0.002348717432516315,
"grad_norm": 0.3969178795814514,
"learning_rate": 0.00010449999999999999,
"loss": 1.1892,
"step": 210
},
{
"epoch": 0.002460561119778997,
"grad_norm": 0.4122287929058075,
"learning_rate": 0.0001095,
"loss": 1.184,
"step": 220
},
{
"epoch": 0.0025724048070416786,
"grad_norm": 0.3793940246105194,
"learning_rate": 0.0001145,
"loss": 1.1809,
"step": 230
},
{
"epoch": 0.0026842484943043602,
"grad_norm": 0.4132145643234253,
"learning_rate": 0.00011949999999999999,
"loss": 1.1883,
"step": 240
},
{
"epoch": 0.002796092181567042,
"grad_norm": 0.3900831639766693,
"learning_rate": 0.0001245,
"loss": 1.1818,
"step": 250
},
{
"epoch": 0.0029079358688297234,
"grad_norm": 0.3898029625415802,
"learning_rate": 0.0001295,
"loss": 1.1693,
"step": 260
},
{
"epoch": 0.0030197795560924054,
"grad_norm": 0.40828797221183777,
"learning_rate": 0.00013450000000000002,
"loss": 1.1869,
"step": 270
},
{
"epoch": 0.003131623243355087,
"grad_norm": 0.3976770341396332,
"learning_rate": 0.0001395,
"loss": 1.1841,
"step": 280
},
{
"epoch": 0.0032434669306177686,
"grad_norm": 0.3902062773704529,
"learning_rate": 0.0001445,
"loss": 1.1843,
"step": 290
},
{
"epoch": 0.00335531061788045,
"grad_norm": 0.38051125407218933,
"learning_rate": 0.0001495,
"loss": 1.1662,
"step": 300
},
{
"epoch": 0.0034671543051431318,
"grad_norm": 0.3628483712673187,
"learning_rate": 0.00015450000000000001,
"loss": 1.1638,
"step": 310
},
{
"epoch": 0.0035789979924058138,
"grad_norm": 0.3693360388278961,
"learning_rate": 0.0001595,
"loss": 1.1606,
"step": 320
},
{
"epoch": 0.0036908416796684954,
"grad_norm": 0.38896557688713074,
"learning_rate": 0.00016450000000000001,
"loss": 1.1448,
"step": 330
},
{
"epoch": 0.003802685366931177,
"grad_norm": 0.40257108211517334,
"learning_rate": 0.00016950000000000003,
"loss": 1.143,
"step": 340
},
{
"epoch": 0.0039145290541938585,
"grad_norm": 0.38656994700431824,
"learning_rate": 0.00017449999999999999,
"loss": 1.141,
"step": 350
},
{
"epoch": 0.0040263727414565405,
"grad_norm": 0.3700025677680969,
"learning_rate": 0.0001795,
"loss": 1.136,
"step": 360
},
{
"epoch": 0.004138216428719222,
"grad_norm": 0.37222161889076233,
"learning_rate": 0.0001845,
"loss": 1.1292,
"step": 370
},
{
"epoch": 0.004250060115981904,
"grad_norm": 0.39386317133903503,
"learning_rate": 0.0001895,
"loss": 1.1139,
"step": 380
},
{
"epoch": 0.004361903803244586,
"grad_norm": 0.3776305913925171,
"learning_rate": 0.0001945,
"loss": 1.1125,
"step": 390
},
{
"epoch": 0.004473747490507267,
"grad_norm": 0.40314197540283203,
"learning_rate": 0.00019950000000000002,
"loss": 1.0962,
"step": 400
},
{
"epoch": 0.004585591177769949,
"grad_norm": 0.37841472029685974,
"learning_rate": 0.00020449999999999998,
"loss": 1.0987,
"step": 410
},
{
"epoch": 0.00469743486503263,
"grad_norm": 0.3678649365901947,
"learning_rate": 0.0002095,
"loss": 1.0826,
"step": 420
},
{
"epoch": 0.004809278552295312,
"grad_norm": 0.37902751564979553,
"learning_rate": 0.0002145,
"loss": 1.0973,
"step": 430
},
{
"epoch": 0.004921122239557994,
"grad_norm": 0.3776302635669708,
"learning_rate": 0.0002195,
"loss": 1.112,
"step": 440
},
{
"epoch": 0.005032965926820675,
"grad_norm": 0.43771493434906006,
"learning_rate": 0.0002245,
"loss": 1.1005,
"step": 450
},
{
"epoch": 0.005144809614083357,
"grad_norm": 0.3662595748901367,
"learning_rate": 0.00022950000000000002,
"loss": 1.0899,
"step": 460
},
{
"epoch": 0.005256653301346038,
"grad_norm": 0.37473002076148987,
"learning_rate": 0.00023449999999999998,
"loss": 1.0982,
"step": 470
},
{
"epoch": 0.0053684969886087204,
"grad_norm": 0.35591790080070496,
"learning_rate": 0.0002395,
"loss": 1.1005,
"step": 480
},
{
"epoch": 0.0054803406758714025,
"grad_norm": 0.3825643062591553,
"learning_rate": 0.0002445,
"loss": 1.0896,
"step": 490
},
{
"epoch": 0.005592184363134084,
"grad_norm": 0.3784261643886566,
"learning_rate": 0.0002495,
"loss": 1.1039,
"step": 500
},
{
"epoch": 0.005704028050396766,
"grad_norm": 0.35387158393859863,
"learning_rate": 0.0002545,
"loss": 1.1038,
"step": 510
},
{
"epoch": 0.005815871737659447,
"grad_norm": 0.3992142975330353,
"learning_rate": 0.0002595,
"loss": 1.088,
"step": 520
},
{
"epoch": 0.005927715424922129,
"grad_norm": 0.36795270442962646,
"learning_rate": 0.00026450000000000003,
"loss": 1.0888,
"step": 530
},
{
"epoch": 0.006039559112184811,
"grad_norm": 0.4007701575756073,
"learning_rate": 0.00026950000000000005,
"loss": 1.0838,
"step": 540
},
{
"epoch": 0.006151402799447492,
"grad_norm": 0.34527722001075745,
"learning_rate": 0.0002745,
"loss": 1.0892,
"step": 550
},
{
"epoch": 0.006263246486710174,
"grad_norm": 0.37232115864753723,
"learning_rate": 0.0002795,
"loss": 1.0939,
"step": 560
},
{
"epoch": 0.006375090173972855,
"grad_norm": 0.4048405885696411,
"learning_rate": 0.0002845,
"loss": 1.0863,
"step": 570
},
{
"epoch": 0.006486933861235537,
"grad_norm": 0.37317511439323425,
"learning_rate": 0.0002895,
"loss": 1.0711,
"step": 580
},
{
"epoch": 0.006598777548498219,
"grad_norm": 0.38564008474349976,
"learning_rate": 0.0002945,
"loss": 1.091,
"step": 590
},
{
"epoch": 0.0067106212357609,
"grad_norm": 0.3639361262321472,
"learning_rate": 0.0002995,
"loss": 1.0682,
"step": 600
},
{
"epoch": 0.006822464923023582,
"grad_norm": 0.35907182097435,
"learning_rate": 0.0003045,
"loss": 1.0755,
"step": 610
},
{
"epoch": 0.0069343086102862635,
"grad_norm": 0.35199785232543945,
"learning_rate": 0.0003095,
"loss": 1.0581,
"step": 620
},
{
"epoch": 0.0070461522975489455,
"grad_norm": 0.35156381130218506,
"learning_rate": 0.0003145,
"loss": 1.0651,
"step": 630
},
{
"epoch": 0.0071579959848116275,
"grad_norm": 0.3742520213127136,
"learning_rate": 0.0003195,
"loss": 1.0555,
"step": 640
},
{
"epoch": 0.007269839672074309,
"grad_norm": 0.3587191700935364,
"learning_rate": 0.00032450000000000003,
"loss": 1.0548,
"step": 650
},
{
"epoch": 0.007381683359336991,
"grad_norm": 0.37587791681289673,
"learning_rate": 0.00032950000000000004,
"loss": 1.0437,
"step": 660
},
{
"epoch": 0.007493527046599672,
"grad_norm": 0.3410298526287079,
"learning_rate": 0.00033450000000000005,
"loss": 1.0426,
"step": 670
},
{
"epoch": 0.007605370733862354,
"grad_norm": 0.3450978696346283,
"learning_rate": 0.0003395,
"loss": 1.0487,
"step": 680
},
{
"epoch": 0.007717214421125036,
"grad_norm": 0.3445068299770355,
"learning_rate": 0.00034449999999999997,
"loss": 1.0411,
"step": 690
},
{
"epoch": 0.007829058108387717,
"grad_norm": 0.34611567854881287,
"learning_rate": 0.0003495,
"loss": 1.0404,
"step": 700
},
{
"epoch": 0.007940901795650398,
"grad_norm": 0.3339330852031708,
"learning_rate": 0.0003545,
"loss": 1.0361,
"step": 710
},
{
"epoch": 0.008052745482913081,
"grad_norm": 0.33232080936431885,
"learning_rate": 0.0003595,
"loss": 1.0271,
"step": 720
},
{
"epoch": 0.008164589170175762,
"grad_norm": 0.33050498366355896,
"learning_rate": 0.0003645,
"loss": 1.0316,
"step": 730
},
{
"epoch": 0.008276432857438443,
"grad_norm": 0.3449972867965698,
"learning_rate": 0.0003695,
"loss": 1.0426,
"step": 740
},
{
"epoch": 0.008388276544701126,
"grad_norm": 0.3543892502784729,
"learning_rate": 0.0003745,
"loss": 1.0475,
"step": 750
},
{
"epoch": 0.008500120231963807,
"grad_norm": 0.3447831869125366,
"learning_rate": 0.0003795,
"loss": 1.0482,
"step": 760
},
{
"epoch": 0.008611963919226489,
"grad_norm": 0.33845630288124084,
"learning_rate": 0.0003845,
"loss": 1.0533,
"step": 770
},
{
"epoch": 0.008723807606489171,
"grad_norm": 0.3394622802734375,
"learning_rate": 0.00038950000000000003,
"loss": 1.0803,
"step": 780
},
{
"epoch": 0.008835651293751853,
"grad_norm": 0.33649975061416626,
"learning_rate": 0.00039450000000000005,
"loss": 1.0461,
"step": 790
},
{
"epoch": 0.008947494981014534,
"grad_norm": 0.3265191912651062,
"learning_rate": 0.0003995,
"loss": 1.0714,
"step": 800
},
{
"epoch": 0.009059338668277215,
"grad_norm": 0.34960776567459106,
"learning_rate": 0.0004045,
"loss": 1.0542,
"step": 810
},
{
"epoch": 0.009171182355539898,
"grad_norm": 0.3353814482688904,
"learning_rate": 0.0004095,
"loss": 1.0625,
"step": 820
},
{
"epoch": 0.009283026042802579,
"grad_norm": 0.3499109148979187,
"learning_rate": 0.0004145,
"loss": 1.0679,
"step": 830
},
{
"epoch": 0.00939486973006526,
"grad_norm": 0.33906084299087524,
"learning_rate": 0.0004195,
"loss": 1.0659,
"step": 840
},
{
"epoch": 0.009506713417327943,
"grad_norm": 0.3245256543159485,
"learning_rate": 0.0004245,
"loss": 1.078,
"step": 850
},
{
"epoch": 0.009618557104590624,
"grad_norm": 0.3364386260509491,
"learning_rate": 0.0004295,
"loss": 1.0771,
"step": 860
},
{
"epoch": 0.009730400791853305,
"grad_norm": 0.348718523979187,
"learning_rate": 0.0004345,
"loss": 1.0751,
"step": 870
},
{
"epoch": 0.009842244479115988,
"grad_norm": 0.31124839186668396,
"learning_rate": 0.0004395,
"loss": 1.0693,
"step": 880
},
{
"epoch": 0.00995408816637867,
"grad_norm": 0.3478352129459381,
"learning_rate": 0.0004445,
"loss": 1.0682,
"step": 890
},
{
"epoch": 0.01006593185364135,
"grad_norm": 0.31189802289009094,
"learning_rate": 0.00044950000000000003,
"loss": 1.0608,
"step": 900
},
{
"epoch": 0.010177775540904033,
"grad_norm": 0.34715884923934937,
"learning_rate": 0.00045450000000000004,
"loss": 1.0698,
"step": 910
},
{
"epoch": 0.010289619228166715,
"grad_norm": 0.3279336988925934,
"learning_rate": 0.00045950000000000006,
"loss": 1.0728,
"step": 920
},
{
"epoch": 0.010401462915429396,
"grad_norm": 0.32010868191719055,
"learning_rate": 0.0004645,
"loss": 1.0765,
"step": 930
},
{
"epoch": 0.010513306602692077,
"grad_norm": 0.3618028163909912,
"learning_rate": 0.0004695,
"loss": 1.0815,
"step": 940
},
{
"epoch": 0.01062515028995476,
"grad_norm": 0.3403186798095703,
"learning_rate": 0.0004745,
"loss": 1.0713,
"step": 950
},
{
"epoch": 0.010736993977217441,
"grad_norm": 0.347687691450119,
"learning_rate": 0.0004795,
"loss": 1.0844,
"step": 960
},
{
"epoch": 0.010848837664480122,
"grad_norm": 0.3537987768650055,
"learning_rate": 0.0004845,
"loss": 1.0762,
"step": 970
},
{
"epoch": 0.010960681351742805,
"grad_norm": 0.42015892267227173,
"learning_rate": 0.0004895,
"loss": 1.0832,
"step": 980
},
{
"epoch": 0.011072525039005486,
"grad_norm": 0.35781368613243103,
"learning_rate": 0.0004945,
"loss": 1.0606,
"step": 990
},
{
"epoch": 0.011184368726268167,
"grad_norm": 0.3361358344554901,
"learning_rate": 0.0004995,
"loss": 1.0717,
"step": 1000
},
{
"epoch": 0.01129621241353085,
"grad_norm": 0.36569204926490784,
"learning_rate": 0.0004997944172872219,
"loss": 1.0602,
"step": 1010
},
{
"epoch": 0.011408056100793531,
"grad_norm": 0.31979477405548096,
"learning_rate": 0.0004995659920508017,
"loss": 1.0531,
"step": 1020
},
{
"epoch": 0.011519899788056212,
"grad_norm": 0.3295707404613495,
"learning_rate": 0.0004993375668143817,
"loss": 1.0346,
"step": 1030
},
{
"epoch": 0.011631743475318894,
"grad_norm": 0.3207838833332062,
"learning_rate": 0.0004991091415779616,
"loss": 1.059,
"step": 1040
},
{
"epoch": 0.011743587162581576,
"grad_norm": 0.33032119274139404,
"learning_rate": 0.0004988807163415415,
"loss": 1.0573,
"step": 1050
},
{
"epoch": 0.011855430849844258,
"grad_norm": 0.3566173017024994,
"learning_rate": 0.0004986522911051213,
"loss": 1.0501,
"step": 1060
},
{
"epoch": 0.011967274537106939,
"grad_norm": 0.31658655405044556,
"learning_rate": 0.0004984238658687012,
"loss": 1.0706,
"step": 1070
},
{
"epoch": 0.012079118224369622,
"grad_norm": 0.3438680171966553,
"learning_rate": 0.0004981954406322811,
"loss": 1.0765,
"step": 1080
},
{
"epoch": 0.012190961911632303,
"grad_norm": 0.3130144774913788,
"learning_rate": 0.0004979670153958609,
"loss": 1.0588,
"step": 1090
},
{
"epoch": 0.012302805598894984,
"grad_norm": 0.31765422224998474,
"learning_rate": 0.0004977385901594408,
"loss": 1.0703,
"step": 1100
},
{
"epoch": 0.012414649286157667,
"grad_norm": 0.36112868785858154,
"learning_rate": 0.0004975101649230207,
"loss": 1.0642,
"step": 1110
},
{
"epoch": 0.012526492973420348,
"grad_norm": 0.33418065309524536,
"learning_rate": 0.0004972817396866005,
"loss": 1.0572,
"step": 1120
},
{
"epoch": 0.01263833666068303,
"grad_norm": 0.34439629316329956,
"learning_rate": 0.0004970533144501805,
"loss": 1.0473,
"step": 1130
},
{
"epoch": 0.01275018034794571,
"grad_norm": 0.32954639196395874,
"learning_rate": 0.0004968248892137603,
"loss": 1.054,
"step": 1140
},
{
"epoch": 0.012862024035208393,
"grad_norm": 0.3351511061191559,
"learning_rate": 0.0004965964639773402,
"loss": 1.0444,
"step": 1150
},
{
"epoch": 0.012973867722471074,
"grad_norm": 0.3065156638622284,
"learning_rate": 0.0004963680387409202,
"loss": 1.0546,
"step": 1160
},
{
"epoch": 0.013085711409733755,
"grad_norm": 0.36450672149658203,
"learning_rate": 0.0004961396135045,
"loss": 1.0501,
"step": 1170
},
{
"epoch": 0.013197555096996438,
"grad_norm": 0.3020591735839844,
"learning_rate": 0.0004959111882680799,
"loss": 1.052,
"step": 1180
},
{
"epoch": 0.01330939878425912,
"grad_norm": 0.3097701966762543,
"learning_rate": 0.0004956827630316598,
"loss": 1.0695,
"step": 1190
},
{
"epoch": 0.0134212424715218,
"grad_norm": 0.3410932719707489,
"learning_rate": 0.0004954543377952396,
"loss": 1.0692,
"step": 1200
},
{
"epoch": 0.013533086158784484,
"grad_norm": 0.38478952646255493,
"learning_rate": 0.0004952259125588195,
"loss": 1.0592,
"step": 1210
},
{
"epoch": 0.013644929846047165,
"grad_norm": 0.3737089931964874,
"learning_rate": 0.0004949974873223994,
"loss": 1.0808,
"step": 1220
},
{
"epoch": 0.013756773533309846,
"grad_norm": 0.3264448940753937,
"learning_rate": 0.0004947690620859793,
"loss": 1.0759,
"step": 1230
},
{
"epoch": 0.013868617220572527,
"grad_norm": 0.3922732472419739,
"learning_rate": 0.0004945406368495591,
"loss": 1.0634,
"step": 1240
},
{
"epoch": 0.01398046090783521,
"grad_norm": 0.36068034172058105,
"learning_rate": 0.000494312211613139,
"loss": 1.0683,
"step": 1250
},
{
"epoch": 0.014092304595097891,
"grad_norm": 0.3544798791408539,
"learning_rate": 0.0004940837863767189,
"loss": 1.0687,
"step": 1260
},
{
"epoch": 0.014204148282360572,
"grad_norm": 0.31447795033454895,
"learning_rate": 0.0004938553611402987,
"loss": 1.0549,
"step": 1270
},
{
"epoch": 0.014315991969623255,
"grad_norm": 0.37639158964157104,
"learning_rate": 0.0004936269359038786,
"loss": 1.0698,
"step": 1280
},
{
"epoch": 0.014427835656885936,
"grad_norm": 0.32416418194770813,
"learning_rate": 0.0004933985106674586,
"loss": 1.0617,
"step": 1290
},
{
"epoch": 0.014539679344148617,
"grad_norm": 0.3122979998588562,
"learning_rate": 0.0004931700854310385,
"loss": 1.0553,
"step": 1300
},
{
"epoch": 0.0146515230314113,
"grad_norm": 0.3574884533882141,
"learning_rate": 0.0004929416601946184,
"loss": 1.0598,
"step": 1310
},
{
"epoch": 0.014763366718673981,
"grad_norm": 0.30762428045272827,
"learning_rate": 0.0004927132349581982,
"loss": 1.0642,
"step": 1320
},
{
"epoch": 0.014875210405936663,
"grad_norm": 0.34350454807281494,
"learning_rate": 0.0004924848097217781,
"loss": 1.0663,
"step": 1330
},
{
"epoch": 0.014987054093199344,
"grad_norm": 0.33486828207969666,
"learning_rate": 0.000492256384485358,
"loss": 1.0479,
"step": 1340
},
{
"epoch": 0.015098897780462027,
"grad_norm": 0.3025324046611786,
"learning_rate": 0.0004920279592489378,
"loss": 1.0705,
"step": 1350
},
{
"epoch": 0.015210741467724708,
"grad_norm": 0.35260385274887085,
"learning_rate": 0.0004917995340125177,
"loss": 1.0762,
"step": 1360
},
{
"epoch": 0.015322585154987389,
"grad_norm": 0.3188925087451935,
"learning_rate": 0.0004915711087760976,
"loss": 1.069,
"step": 1370
},
{
"epoch": 0.015434428842250072,
"grad_norm": 0.332660436630249,
"learning_rate": 0.0004913426835396775,
"loss": 1.0749,
"step": 1380
},
{
"epoch": 0.015546272529512753,
"grad_norm": 0.31745171546936035,
"learning_rate": 0.0004911142583032573,
"loss": 1.0811,
"step": 1390
},
{
"epoch": 0.015658116216775434,
"grad_norm": 0.3237819969654083,
"learning_rate": 0.0004908858330668372,
"loss": 1.0634,
"step": 1400
},
{
"epoch": 0.015769959904038115,
"grad_norm": 0.3300880789756775,
"learning_rate": 0.0004906574078304171,
"loss": 1.0554,
"step": 1410
},
{
"epoch": 0.015881803591300796,
"grad_norm": 0.32475635409355164,
"learning_rate": 0.0004904289825939969,
"loss": 1.0598,
"step": 1420
},
{
"epoch": 0.01599364727856348,
"grad_norm": 0.31278952956199646,
"learning_rate": 0.0004902005573575769,
"loss": 1.0498,
"step": 1430
},
{
"epoch": 0.016105490965826162,
"grad_norm": 0.308680921792984,
"learning_rate": 0.0004899721321211568,
"loss": 1.0586,
"step": 1440
},
{
"epoch": 0.016217334653088843,
"grad_norm": 0.34637314081192017,
"learning_rate": 0.0004897437068847367,
"loss": 1.0535,
"step": 1450
},
{
"epoch": 0.016329178340351524,
"grad_norm": 0.3220643401145935,
"learning_rate": 0.0004895152816483165,
"loss": 1.0624,
"step": 1460
},
{
"epoch": 0.016441022027614206,
"grad_norm": 0.31472912430763245,
"learning_rate": 0.0004892868564118964,
"loss": 1.0748,
"step": 1470
},
{
"epoch": 0.016552865714876887,
"grad_norm": 0.3416632115840912,
"learning_rate": 0.0004890584311754763,
"loss": 1.0715,
"step": 1480
},
{
"epoch": 0.01666470940213957,
"grad_norm": 0.3463667631149292,
"learning_rate": 0.0004888300059390561,
"loss": 1.0914,
"step": 1490
},
{
"epoch": 0.016776553089402253,
"grad_norm": 0.3322199881076813,
"learning_rate": 0.000488601580702636,
"loss": 1.0707,
"step": 1500
},
{
"epoch": 0.016888396776664934,
"grad_norm": 0.3899800479412079,
"learning_rate": 0.0004883731554662159,
"loss": 1.0883,
"step": 1510
},
{
"epoch": 0.017000240463927615,
"grad_norm": 0.3409605324268341,
"learning_rate": 0.0004881447302297958,
"loss": 1.0982,
"step": 1520
},
{
"epoch": 0.017112084151190296,
"grad_norm": 0.3720357120037079,
"learning_rate": 0.0004879163049933757,
"loss": 1.0674,
"step": 1530
},
{
"epoch": 0.017223927838452977,
"grad_norm": 0.326050728559494,
"learning_rate": 0.00048768787975695554,
"loss": 1.0764,
"step": 1540
},
{
"epoch": 0.01733577152571566,
"grad_norm": 0.3238283395767212,
"learning_rate": 0.0004874594545205354,
"loss": 1.0547,
"step": 1550
},
{
"epoch": 0.017447615212978343,
"grad_norm": 0.3324073553085327,
"learning_rate": 0.00048723102928411536,
"loss": 1.0608,
"step": 1560
},
{
"epoch": 0.017559458900241024,
"grad_norm": 0.3382217586040497,
"learning_rate": 0.0004870026040476952,
"loss": 1.0505,
"step": 1570
},
{
"epoch": 0.017671302587503705,
"grad_norm": 0.3409116566181183,
"learning_rate": 0.00048677417881127507,
"loss": 1.0673,
"step": 1580
},
{
"epoch": 0.017783146274766386,
"grad_norm": 0.3123399019241333,
"learning_rate": 0.000486545753574855,
"loss": 1.0461,
"step": 1590
},
{
"epoch": 0.017894989962029068,
"grad_norm": 0.3178008198738098,
"learning_rate": 0.00048631732833843484,
"loss": 1.0526,
"step": 1600
},
{
"epoch": 0.01800683364929175,
"grad_norm": 0.37002459168434143,
"learning_rate": 0.0004860889031020147,
"loss": 1.0483,
"step": 1610
},
{
"epoch": 0.01811867733655443,
"grad_norm": 0.31036287546157837,
"learning_rate": 0.0004858604778655946,
"loss": 1.0418,
"step": 1620
},
{
"epoch": 0.018230521023817114,
"grad_norm": 0.3027215600013733,
"learning_rate": 0.00048563205262917446,
"loss": 1.0467,
"step": 1630
},
{
"epoch": 0.018342364711079796,
"grad_norm": 0.32144612073898315,
"learning_rate": 0.00048540362739275437,
"loss": 1.0437,
"step": 1640
},
{
"epoch": 0.018454208398342477,
"grad_norm": 0.3156447410583496,
"learning_rate": 0.0004851752021563343,
"loss": 1.0447,
"step": 1650
},
{
"epoch": 0.018566052085605158,
"grad_norm": 0.3228546380996704,
"learning_rate": 0.00048494677691991413,
"loss": 1.056,
"step": 1660
},
{
"epoch": 0.01867789577286784,
"grad_norm": 0.3478510081768036,
"learning_rate": 0.000484718351683494,
"loss": 1.0523,
"step": 1670
},
{
"epoch": 0.01878973946013052,
"grad_norm": 0.3413507342338562,
"learning_rate": 0.0004844899264470739,
"loss": 1.049,
"step": 1680
},
{
"epoch": 0.018901583147393205,
"grad_norm": 0.3277221918106079,
"learning_rate": 0.00048426150121065375,
"loss": 1.0403,
"step": 1690
},
{
"epoch": 0.019013426834655886,
"grad_norm": 0.3044646382331848,
"learning_rate": 0.0004840330759742336,
"loss": 1.0518,
"step": 1700
},
{
"epoch": 0.019125270521918567,
"grad_norm": 0.31599846482276917,
"learning_rate": 0.0004838046507378135,
"loss": 1.0475,
"step": 1710
},
{
"epoch": 0.01923711420918125,
"grad_norm": 0.346741646528244,
"learning_rate": 0.00048357622550139343,
"loss": 1.0515,
"step": 1720
},
{
"epoch": 0.01934895789644393,
"grad_norm": 0.32756108045578003,
"learning_rate": 0.0004833478002649733,
"loss": 1.054,
"step": 1730
},
{
"epoch": 0.01946080158370661,
"grad_norm": 0.3318345546722412,
"learning_rate": 0.0004831193750285532,
"loss": 1.0575,
"step": 1740
},
{
"epoch": 0.019572645270969292,
"grad_norm": 0.3389560282230377,
"learning_rate": 0.00048289094979213305,
"loss": 1.0576,
"step": 1750
},
{
"epoch": 0.019684488958231976,
"grad_norm": 0.31532642245292664,
"learning_rate": 0.0004826625245557129,
"loss": 1.0554,
"step": 1760
},
{
"epoch": 0.019796332645494658,
"grad_norm": 0.3263496160507202,
"learning_rate": 0.0004824340993192928,
"loss": 1.0697,
"step": 1770
},
{
"epoch": 0.01990817633275734,
"grad_norm": 0.328225314617157,
"learning_rate": 0.00048220567408287267,
"loss": 1.0584,
"step": 1780
},
{
"epoch": 0.02002002002002002,
"grad_norm": 0.3030998706817627,
"learning_rate": 0.00048197724884645253,
"loss": 1.0555,
"step": 1790
},
{
"epoch": 0.0201318637072827,
"grad_norm": 0.32594701647758484,
"learning_rate": 0.0004817488236100325,
"loss": 1.0512,
"step": 1800
},
{
"epoch": 0.020243707394545382,
"grad_norm": 0.2882954776287079,
"learning_rate": 0.00048152039837361235,
"loss": 1.0441,
"step": 1810
},
{
"epoch": 0.020355551081808067,
"grad_norm": 0.33917129039764404,
"learning_rate": 0.0004812919731371922,
"loss": 1.048,
"step": 1820
},
{
"epoch": 0.020467394769070748,
"grad_norm": 0.32748523354530334,
"learning_rate": 0.0004810635479007721,
"loss": 1.042,
"step": 1830
},
{
"epoch": 0.02057923845633343,
"grad_norm": 0.32332462072372437,
"learning_rate": 0.00048083512266435197,
"loss": 1.0396,
"step": 1840
},
{
"epoch": 0.02069108214359611,
"grad_norm": 0.36977729201316833,
"learning_rate": 0.0004806066974279318,
"loss": 1.0337,
"step": 1850
},
{
"epoch": 0.02080292583085879,
"grad_norm": 0.33298948407173157,
"learning_rate": 0.00048037827219151174,
"loss": 1.045,
"step": 1860
},
{
"epoch": 0.020914769518121473,
"grad_norm": 0.328861802816391,
"learning_rate": 0.00048014984695509165,
"loss": 1.053,
"step": 1870
},
{
"epoch": 0.021026613205384154,
"grad_norm": 0.3438888490200043,
"learning_rate": 0.0004799214217186715,
"loss": 1.0385,
"step": 1880
},
{
"epoch": 0.02113845689264684,
"grad_norm": 0.3251883387565613,
"learning_rate": 0.00047969299648225136,
"loss": 1.0436,
"step": 1890
},
{
"epoch": 0.02125030057990952,
"grad_norm": 0.3300330340862274,
"learning_rate": 0.00047946457124583127,
"loss": 1.0627,
"step": 1900
},
{
"epoch": 0.0213621442671722,
"grad_norm": 0.31774377822875977,
"learning_rate": 0.0004792361460094111,
"loss": 1.0491,
"step": 1910
},
{
"epoch": 0.021473987954434882,
"grad_norm": 0.36171990633010864,
"learning_rate": 0.000479007720772991,
"loss": 1.0536,
"step": 1920
},
{
"epoch": 0.021585831641697563,
"grad_norm": 0.33032888174057007,
"learning_rate": 0.0004787792955365709,
"loss": 1.0327,
"step": 1930
},
{
"epoch": 0.021697675328960244,
"grad_norm": 0.34056538343429565,
"learning_rate": 0.00047855087030015074,
"loss": 1.0354,
"step": 1940
},
{
"epoch": 0.021809519016222925,
"grad_norm": 0.31768256425857544,
"learning_rate": 0.00047832244506373065,
"loss": 1.0278,
"step": 1950
},
{
"epoch": 0.02192136270348561,
"grad_norm": 0.33165955543518066,
"learning_rate": 0.00047809401982731056,
"loss": 1.057,
"step": 1960
},
{
"epoch": 0.02203320639074829,
"grad_norm": 0.34456339478492737,
"learning_rate": 0.0004778655945908904,
"loss": 1.0465,
"step": 1970
},
{
"epoch": 0.022145050078010972,
"grad_norm": 0.35331544280052185,
"learning_rate": 0.0004776371693544703,
"loss": 1.0509,
"step": 1980
},
{
"epoch": 0.022256893765273653,
"grad_norm": 0.3497447669506073,
"learning_rate": 0.0004774087441180502,
"loss": 1.0579,
"step": 1990
},
{
"epoch": 0.022368737452536334,
"grad_norm": 0.31631171703338623,
"learning_rate": 0.00047718031888163004,
"loss": 1.0747,
"step": 2000
},
{
"epoch": 0.022480581139799016,
"grad_norm": 0.34811535477638245,
"learning_rate": 0.0004769518936452099,
"loss": 1.0443,
"step": 2010
},
{
"epoch": 0.0225924248270617,
"grad_norm": 0.350975900888443,
"learning_rate": 0.0004767234684087898,
"loss": 1.0721,
"step": 2020
},
{
"epoch": 0.02270426851432438,
"grad_norm": 0.38026875257492065,
"learning_rate": 0.0004764950431723697,
"loss": 1.0502,
"step": 2030
},
{
"epoch": 0.022816112201587063,
"grad_norm": 0.3079335391521454,
"learning_rate": 0.00047626661793594957,
"loss": 1.0325,
"step": 2040
},
{
"epoch": 0.022927955888849744,
"grad_norm": 0.3412174582481384,
"learning_rate": 0.0004760381926995295,
"loss": 1.026,
"step": 2050
},
{
"epoch": 0.023039799576112425,
"grad_norm": 0.31905752420425415,
"learning_rate": 0.00047580976746310934,
"loss": 1.033,
"step": 2060
},
{
"epoch": 0.023151643263375106,
"grad_norm": 0.3110033869743347,
"learning_rate": 0.0004755813422266892,
"loss": 1.026,
"step": 2070
},
{
"epoch": 0.023263486950637787,
"grad_norm": 0.3087383210659027,
"learning_rate": 0.0004753529169902691,
"loss": 1.0285,
"step": 2080
},
{
"epoch": 0.023375330637900472,
"grad_norm": 0.310497522354126,
"learning_rate": 0.00047512449175384896,
"loss": 1.012,
"step": 2090
},
{
"epoch": 0.023487174325163153,
"grad_norm": 0.35822993516921997,
"learning_rate": 0.0004748960665174288,
"loss": 1.0124,
"step": 2100
},
{
"epoch": 0.023599018012425834,
"grad_norm": 0.3355759084224701,
"learning_rate": 0.0004746676412810088,
"loss": 1.0159,
"step": 2110
},
{
"epoch": 0.023710861699688515,
"grad_norm": 0.29633432626724243,
"learning_rate": 0.00047443921604458863,
"loss": 1.0068,
"step": 2120
},
{
"epoch": 0.023822705386951196,
"grad_norm": 0.3268597424030304,
"learning_rate": 0.0004742107908081685,
"loss": 1.0029,
"step": 2130
},
{
"epoch": 0.023934549074213878,
"grad_norm": 0.32010769844055176,
"learning_rate": 0.0004739823655717484,
"loss": 1.0081,
"step": 2140
},
{
"epoch": 0.02404639276147656,
"grad_norm": 0.30638498067855835,
"learning_rate": 0.00047375394033532826,
"loss": 0.9955,
"step": 2150
},
{
"epoch": 0.024158236448739243,
"grad_norm": 0.32299259305000305,
"learning_rate": 0.0004735255150989081,
"loss": 1.0028,
"step": 2160
},
{
"epoch": 0.024270080136001924,
"grad_norm": 0.30714213848114014,
"learning_rate": 0.000473297089862488,
"loss": 1.0163,
"step": 2170
},
{
"epoch": 0.024381923823264606,
"grad_norm": 0.3207940459251404,
"learning_rate": 0.0004730686646260679,
"loss": 1.0053,
"step": 2180
},
{
"epoch": 0.024493767510527287,
"grad_norm": 0.3073663115501404,
"learning_rate": 0.0004728402393896478,
"loss": 1.0007,
"step": 2190
},
{
"epoch": 0.024605611197789968,
"grad_norm": 0.3209913671016693,
"learning_rate": 0.0004726118141532277,
"loss": 1.0065,
"step": 2200
},
{
"epoch": 0.02471745488505265,
"grad_norm": 0.2987804114818573,
"learning_rate": 0.00047238338891680755,
"loss": 1.0015,
"step": 2210
},
{
"epoch": 0.024829298572315334,
"grad_norm": 0.31511807441711426,
"learning_rate": 0.0004721549636803874,
"loss": 0.9892,
"step": 2220
},
{
"epoch": 0.024941142259578015,
"grad_norm": 0.2840864956378937,
"learning_rate": 0.0004719265384439673,
"loss": 1.0084,
"step": 2230
},
{
"epoch": 0.025052985946840696,
"grad_norm": 0.3094743490219116,
"learning_rate": 0.0004716981132075472,
"loss": 1.0169,
"step": 2240
},
{
"epoch": 0.025164829634103377,
"grad_norm": 0.2905067205429077,
"learning_rate": 0.00047146968797112703,
"loss": 0.9991,
"step": 2250
},
{
"epoch": 0.02527667332136606,
"grad_norm": 0.31322264671325684,
"learning_rate": 0.00047124126273470694,
"loss": 1.0169,
"step": 2260
},
{
"epoch": 0.02538851700862874,
"grad_norm": 0.29053428769111633,
"learning_rate": 0.00047101283749828685,
"loss": 0.9942,
"step": 2270
},
{
"epoch": 0.02550036069589142,
"grad_norm": 0.2863853871822357,
"learning_rate": 0.0004707844122618667,
"loss": 1.002,
"step": 2280
},
{
"epoch": 0.025612204383154105,
"grad_norm": 0.3087761104106903,
"learning_rate": 0.0004705559870254466,
"loss": 1.0025,
"step": 2290
},
{
"epoch": 0.025724048070416786,
"grad_norm": 0.3308629095554352,
"learning_rate": 0.00047032756178902647,
"loss": 1.0078,
"step": 2300
},
{
"epoch": 0.025835891757679467,
"grad_norm": 0.29703134298324585,
"learning_rate": 0.0004700991365526063,
"loss": 1.006,
"step": 2310
},
{
"epoch": 0.02594773544494215,
"grad_norm": 0.27238258719444275,
"learning_rate": 0.0004698707113161862,
"loss": 0.9963,
"step": 2320
},
{
"epoch": 0.02605957913220483,
"grad_norm": 0.2795617878437042,
"learning_rate": 0.0004696422860797661,
"loss": 0.9876,
"step": 2330
},
{
"epoch": 0.02617142281946751,
"grad_norm": 0.2989327013492584,
"learning_rate": 0.000469413860843346,
"loss": 0.9864,
"step": 2340
},
{
"epoch": 0.026283266506730196,
"grad_norm": 0.3229614794254303,
"learning_rate": 0.00046918543560692586,
"loss": 0.9849,
"step": 2350
},
{
"epoch": 0.026395110193992877,
"grad_norm": 0.2921406328678131,
"learning_rate": 0.00046895701037050577,
"loss": 0.9764,
"step": 2360
},
{
"epoch": 0.026506953881255558,
"grad_norm": 0.2955220639705658,
"learning_rate": 0.0004687285851340856,
"loss": 0.9883,
"step": 2370
},
{
"epoch": 0.02661879756851824,
"grad_norm": 0.31378960609436035,
"learning_rate": 0.0004685001598976655,
"loss": 0.9978,
"step": 2380
},
{
"epoch": 0.02673064125578092,
"grad_norm": 0.30504587292671204,
"learning_rate": 0.0004682717346612454,
"loss": 0.9912,
"step": 2390
},
{
"epoch": 0.0268424849430436,
"grad_norm": 0.3066459000110626,
"learning_rate": 0.00046804330942482524,
"loss": 0.9877,
"step": 2400
},
{
"epoch": 0.026954328630306282,
"grad_norm": 0.3198714256286621,
"learning_rate": 0.0004678148841884051,
"loss": 0.98,
"step": 2410
},
{
"epoch": 0.027066172317568967,
"grad_norm": 0.27119094133377075,
"learning_rate": 0.00046758645895198506,
"loss": 1.001,
"step": 2420
},
{
"epoch": 0.027178016004831648,
"grad_norm": 0.28178098797798157,
"learning_rate": 0.0004673580337155649,
"loss": 0.9605,
"step": 2430
},
{
"epoch": 0.02728985969209433,
"grad_norm": 0.29373088479042053,
"learning_rate": 0.0004671296084791448,
"loss": 0.9834,
"step": 2440
},
{
"epoch": 0.02740170337935701,
"grad_norm": 0.2861827313899994,
"learning_rate": 0.0004669011832427247,
"loss": 0.9797,
"step": 2450
},
{
"epoch": 0.02751354706661969,
"grad_norm": 0.3488409221172333,
"learning_rate": 0.00046667275800630454,
"loss": 0.9682,
"step": 2460
},
{
"epoch": 0.027625390753882373,
"grad_norm": 0.29631665349006653,
"learning_rate": 0.0004664443327698844,
"loss": 0.9751,
"step": 2470
},
{
"epoch": 0.027737234441145054,
"grad_norm": 0.27299416065216064,
"learning_rate": 0.0004662159075334643,
"loss": 0.9571,
"step": 2480
},
{
"epoch": 0.02784907812840774,
"grad_norm": 0.30409684777259827,
"learning_rate": 0.00046598748229704416,
"loss": 0.968,
"step": 2490
},
{
"epoch": 0.02796092181567042,
"grad_norm": 0.2957991063594818,
"learning_rate": 0.00046575905706062407,
"loss": 0.9814,
"step": 2500
},
{
"epoch": 0.0280727655029331,
"grad_norm": 0.28328225016593933,
"learning_rate": 0.000465530631824204,
"loss": 0.9816,
"step": 2510
},
{
"epoch": 0.028184609190195782,
"grad_norm": 0.40670067071914673,
"learning_rate": 0.00046530220658778384,
"loss": 0.9737,
"step": 2520
},
{
"epoch": 0.028296452877458463,
"grad_norm": 0.2818649411201477,
"learning_rate": 0.0004650737813513637,
"loss": 0.9891,
"step": 2530
},
{
"epoch": 0.028408296564721144,
"grad_norm": 0.3054118752479553,
"learning_rate": 0.0004648453561149436,
"loss": 0.9976,
"step": 2540
},
{
"epoch": 0.02852014025198383,
"grad_norm": 0.31439468264579773,
"learning_rate": 0.00046461693087852346,
"loss": 0.9928,
"step": 2550
},
{
"epoch": 0.02863198393924651,
"grad_norm": 0.3173445761203766,
"learning_rate": 0.0004643885056421033,
"loss": 1.0002,
"step": 2560
},
{
"epoch": 0.02874382762650919,
"grad_norm": 0.32495757937431335,
"learning_rate": 0.0004641600804056832,
"loss": 0.9981,
"step": 2570
},
{
"epoch": 0.028855671313771872,
"grad_norm": 0.35957351326942444,
"learning_rate": 0.00046393165516926313,
"loss": 1.0112,
"step": 2580
},
{
"epoch": 0.028967515001034554,
"grad_norm": 0.3070557713508606,
"learning_rate": 0.000463703229932843,
"loss": 1.0047,
"step": 2590
},
{
"epoch": 0.029079358688297235,
"grad_norm": 0.3227770924568176,
"learning_rate": 0.0004634748046964229,
"loss": 1.0115,
"step": 2600
},
{
"epoch": 0.029191202375559916,
"grad_norm": 0.34345880150794983,
"learning_rate": 0.00046324637946000276,
"loss": 0.9984,
"step": 2610
},
{
"epoch": 0.0293030460628226,
"grad_norm": 0.34459254145622253,
"learning_rate": 0.0004630179542235826,
"loss": 0.9965,
"step": 2620
},
{
"epoch": 0.02941488975008528,
"grad_norm": 0.3396269679069519,
"learning_rate": 0.0004627895289871625,
"loss": 0.9986,
"step": 2630
},
{
"epoch": 0.029526733437347963,
"grad_norm": 0.3370846211910248,
"learning_rate": 0.0004625611037507424,
"loss": 0.9987,
"step": 2640
},
{
"epoch": 0.029638577124610644,
"grad_norm": 0.30689191818237305,
"learning_rate": 0.00046233267851432223,
"loss": 1.0081,
"step": 2650
},
{
"epoch": 0.029750420811873325,
"grad_norm": 0.35536935925483704,
"learning_rate": 0.0004621042532779022,
"loss": 0.9948,
"step": 2660
},
{
"epoch": 0.029862264499136006,
"grad_norm": 0.3295105993747711,
"learning_rate": 0.00046187582804148205,
"loss": 1.0115,
"step": 2670
},
{
"epoch": 0.029974108186398687,
"grad_norm": 0.34881895780563354,
"learning_rate": 0.0004616474028050619,
"loss": 1.0024,
"step": 2680
},
{
"epoch": 0.030085951873661372,
"grad_norm": 0.379261314868927,
"learning_rate": 0.0004614189775686418,
"loss": 0.9965,
"step": 2690
},
{
"epoch": 0.030197795560924053,
"grad_norm": 0.34729093313217163,
"learning_rate": 0.0004611905523322217,
"loss": 1.0026,
"step": 2700
},
{
"epoch": 0.030309639248186734,
"grad_norm": 0.34687525033950806,
"learning_rate": 0.00046096212709580153,
"loss": 0.9992,
"step": 2710
},
{
"epoch": 0.030421482935449416,
"grad_norm": 0.3564583659172058,
"learning_rate": 0.00046073370185938144,
"loss": 0.9859,
"step": 2720
},
{
"epoch": 0.030533326622712097,
"grad_norm": 0.3762670159339905,
"learning_rate": 0.0004605052766229613,
"loss": 1.0059,
"step": 2730
},
{
"epoch": 0.030645170309974778,
"grad_norm": 0.3470481038093567,
"learning_rate": 0.0004602768513865412,
"loss": 1.0044,
"step": 2740
},
{
"epoch": 0.030757013997237462,
"grad_norm": 0.3322189450263977,
"learning_rate": 0.0004600484261501211,
"loss": 0.9811,
"step": 2750
},
{
"epoch": 0.030868857684500144,
"grad_norm": 0.3248903751373291,
"learning_rate": 0.00045982000091370097,
"loss": 0.9721,
"step": 2760
},
{
"epoch": 0.030980701371762825,
"grad_norm": 0.32881951332092285,
"learning_rate": 0.0004595915756772808,
"loss": 0.9821,
"step": 2770
},
{
"epoch": 0.031092545059025506,
"grad_norm": 0.35410797595977783,
"learning_rate": 0.0004593631504408607,
"loss": 0.9786,
"step": 2780
},
{
"epoch": 0.031204388746288187,
"grad_norm": 0.3307279050350189,
"learning_rate": 0.0004591347252044406,
"loss": 0.9759,
"step": 2790
},
{
"epoch": 0.03131623243355087,
"grad_norm": 0.3207128643989563,
"learning_rate": 0.00045890629996802045,
"loss": 0.9812,
"step": 2800
},
{
"epoch": 0.03142807612081355,
"grad_norm": 0.3065459728240967,
"learning_rate": 0.0004586778747316003,
"loss": 0.9596,
"step": 2810
},
{
"epoch": 0.03153991980807623,
"grad_norm": 0.3115104138851166,
"learning_rate": 0.00045844944949518027,
"loss": 0.9732,
"step": 2820
},
{
"epoch": 0.031651763495338915,
"grad_norm": 0.3136879801750183,
"learning_rate": 0.0004582210242587601,
"loss": 0.9818,
"step": 2830
},
{
"epoch": 0.03176360718260159,
"grad_norm": 0.3240731656551361,
"learning_rate": 0.00045799259902234,
"loss": 0.9836,
"step": 2840
},
{
"epoch": 0.03187545086986428,
"grad_norm": 0.31390219926834106,
"learning_rate": 0.0004577641737859199,
"loss": 0.9837,
"step": 2850
},
{
"epoch": 0.03198729455712696,
"grad_norm": 0.3056069612503052,
"learning_rate": 0.00045753574854949975,
"loss": 0.995,
"step": 2860
},
{
"epoch": 0.03209913824438964,
"grad_norm": 0.29556363821029663,
"learning_rate": 0.0004573073233130796,
"loss": 1.0018,
"step": 2870
},
{
"epoch": 0.032210981931652324,
"grad_norm": 0.2931666374206543,
"learning_rate": 0.0004570788980766595,
"loss": 1.0124,
"step": 2880
},
{
"epoch": 0.032322825618915,
"grad_norm": 0.31029924750328064,
"learning_rate": 0.0004568504728402394,
"loss": 1.0115,
"step": 2890
},
{
"epoch": 0.03243466930617769,
"grad_norm": 0.3164144456386566,
"learning_rate": 0.0004566220476038193,
"loss": 0.9966,
"step": 2900
},
{
"epoch": 0.032546512993440364,
"grad_norm": 0.31638383865356445,
"learning_rate": 0.0004563936223673992,
"loss": 0.989,
"step": 2910
},
{
"epoch": 0.03265835668070305,
"grad_norm": 0.28559473156929016,
"learning_rate": 0.00045616519713097904,
"loss": 1.0038,
"step": 2920
},
{
"epoch": 0.032770200367965734,
"grad_norm": 0.285154789686203,
"learning_rate": 0.0004559367718945589,
"loss": 1.0009,
"step": 2930
},
{
"epoch": 0.03288204405522841,
"grad_norm": 0.2722555100917816,
"learning_rate": 0.0004557083466581388,
"loss": 0.9977,
"step": 2940
},
{
"epoch": 0.032993887742491096,
"grad_norm": 0.2854909896850586,
"learning_rate": 0.00045547992142171866,
"loss": 0.9996,
"step": 2950
},
{
"epoch": 0.033105731429753774,
"grad_norm": 0.2726607620716095,
"learning_rate": 0.0004552514961852985,
"loss": 0.9925,
"step": 2960
},
{
"epoch": 0.03321757511701646,
"grad_norm": 0.30692654848098755,
"learning_rate": 0.0004550230709488785,
"loss": 0.9776,
"step": 2970
},
{
"epoch": 0.03332941880427914,
"grad_norm": 0.2921067774295807,
"learning_rate": 0.00045479464571245834,
"loss": 0.9831,
"step": 2980
},
{
"epoch": 0.03344126249154182,
"grad_norm": 0.30490297079086304,
"learning_rate": 0.0004545662204760382,
"loss": 0.9835,
"step": 2990
},
{
"epoch": 0.033553106178804505,
"grad_norm": 0.2823980450630188,
"learning_rate": 0.0004543377952396181,
"loss": 0.9859,
"step": 3000
},
{
"epoch": 0.03366494986606718,
"grad_norm": 0.31844133138656616,
"learning_rate": 0.00045410937000319796,
"loss": 1.0007,
"step": 3010
},
{
"epoch": 0.03377679355332987,
"grad_norm": 0.30595019459724426,
"learning_rate": 0.0004538809447667778,
"loss": 1.0069,
"step": 3020
},
{
"epoch": 0.033888637240592545,
"grad_norm": 0.31177419424057007,
"learning_rate": 0.0004536525195303577,
"loss": 1.0068,
"step": 3030
},
{
"epoch": 0.03400048092785523,
"grad_norm": 0.33921870589256287,
"learning_rate": 0.0004534240942939376,
"loss": 1.0116,
"step": 3040
},
{
"epoch": 0.034112324615117914,
"grad_norm": 0.29299408197402954,
"learning_rate": 0.0004531956690575175,
"loss": 1.0014,
"step": 3050
},
{
"epoch": 0.03422416830238059,
"grad_norm": 0.28572002053260803,
"learning_rate": 0.0004529672438210974,
"loss": 0.9976,
"step": 3060
},
{
"epoch": 0.03433601198964328,
"grad_norm": 0.30842283368110657,
"learning_rate": 0.00045273881858467726,
"loss": 0.9994,
"step": 3070
},
{
"epoch": 0.034447855676905954,
"grad_norm": 0.29677408933639526,
"learning_rate": 0.0004525103933482571,
"loss": 1.0055,
"step": 3080
},
{
"epoch": 0.03455969936416864,
"grad_norm": 0.388823926448822,
"learning_rate": 0.000452281968111837,
"loss": 1.0062,
"step": 3090
},
{
"epoch": 0.03467154305143132,
"grad_norm": 0.2956707775592804,
"learning_rate": 0.0004520535428754169,
"loss": 0.9794,
"step": 3100
},
{
"epoch": 0.034783386738694,
"grad_norm": 0.3179475665092468,
"learning_rate": 0.00045182511763899673,
"loss": 0.9831,
"step": 3110
},
{
"epoch": 0.034895230425956686,
"grad_norm": 0.29509803652763367,
"learning_rate": 0.00045159669240257664,
"loss": 0.9851,
"step": 3120
},
{
"epoch": 0.035007074113219364,
"grad_norm": 0.31095758080482483,
"learning_rate": 0.00045136826716615655,
"loss": 0.9852,
"step": 3130
},
{
"epoch": 0.03511891780048205,
"grad_norm": 0.27768880128860474,
"learning_rate": 0.0004511398419297364,
"loss": 0.9741,
"step": 3140
},
{
"epoch": 0.035230761487744726,
"grad_norm": 0.3117106854915619,
"learning_rate": 0.0004509114166933163,
"loss": 0.9987,
"step": 3150
},
{
"epoch": 0.03534260517500741,
"grad_norm": 0.30113616585731506,
"learning_rate": 0.0004506829914568962,
"loss": 0.9855,
"step": 3160
},
{
"epoch": 0.03545444886227009,
"grad_norm": 0.2842777967453003,
"learning_rate": 0.00045045456622047603,
"loss": 0.9793,
"step": 3170
},
{
"epoch": 0.03556629254953277,
"grad_norm": 0.30115559697151184,
"learning_rate": 0.00045022614098405594,
"loss": 0.9854,
"step": 3180
},
{
"epoch": 0.03567813623679546,
"grad_norm": 0.3350517153739929,
"learning_rate": 0.0004499977157476358,
"loss": 0.9787,
"step": 3190
},
{
"epoch": 0.035789979924058135,
"grad_norm": 0.2736664414405823,
"learning_rate": 0.00044976929051121565,
"loss": 1.0067,
"step": 3200
},
{
"epoch": 0.03590182361132082,
"grad_norm": 0.2868112027645111,
"learning_rate": 0.0004495408652747956,
"loss": 1.0002,
"step": 3210
},
{
"epoch": 0.0360136672985835,
"grad_norm": 0.27296972274780273,
"learning_rate": 0.00044931244003837547,
"loss": 0.9939,
"step": 3220
},
{
"epoch": 0.03612551098584618,
"grad_norm": 0.2894013226032257,
"learning_rate": 0.00044908401480195533,
"loss": 1.0017,
"step": 3230
},
{
"epoch": 0.03623735467310886,
"grad_norm": 0.26549386978149414,
"learning_rate": 0.0004488555895655352,
"loss": 0.9953,
"step": 3240
},
{
"epoch": 0.036349198360371544,
"grad_norm": 0.27381303906440735,
"learning_rate": 0.0004486271643291151,
"loss": 1.0077,
"step": 3250
},
{
"epoch": 0.03646104204763423,
"grad_norm": 0.2829972505569458,
"learning_rate": 0.00044839873909269495,
"loss": 1.0008,
"step": 3260
},
{
"epoch": 0.03657288573489691,
"grad_norm": 0.29023584723472595,
"learning_rate": 0.0004481703138562748,
"loss": 0.9999,
"step": 3270
},
{
"epoch": 0.03668472942215959,
"grad_norm": 0.29526880383491516,
"learning_rate": 0.00044794188861985477,
"loss": 0.9982,
"step": 3280
},
{
"epoch": 0.03679657310942227,
"grad_norm": 0.27724817395210266,
"learning_rate": 0.0004477134633834346,
"loss": 1.0109,
"step": 3290
},
{
"epoch": 0.036908416796684954,
"grad_norm": 0.2780180275440216,
"learning_rate": 0.0004474850381470145,
"loss": 0.997,
"step": 3300
},
{
"epoch": 0.03702026048394764,
"grad_norm": 0.29814234375953674,
"learning_rate": 0.0004472566129105944,
"loss": 1.0056,
"step": 3310
},
{
"epoch": 0.037132104171210316,
"grad_norm": 0.3131207823753357,
"learning_rate": 0.00044702818767417425,
"loss": 0.999,
"step": 3320
},
{
"epoch": 0.037243947858473,
"grad_norm": 0.2865641415119171,
"learning_rate": 0.0004467997624377541,
"loss": 0.9938,
"step": 3330
},
{
"epoch": 0.03735579154573568,
"grad_norm": 0.31247007846832275,
"learning_rate": 0.000446571337201334,
"loss": 1.0029,
"step": 3340
},
{
"epoch": 0.03746763523299836,
"grad_norm": 0.3432846665382385,
"learning_rate": 0.00044634291196491387,
"loss": 0.9861,
"step": 3350
},
{
"epoch": 0.03757947892026104,
"grad_norm": 0.3200684189796448,
"learning_rate": 0.0004461144867284938,
"loss": 0.9958,
"step": 3360
},
{
"epoch": 0.037691322607523725,
"grad_norm": 0.3280775547027588,
"learning_rate": 0.0004458860614920737,
"loss": 0.9972,
"step": 3370
},
{
"epoch": 0.03780316629478641,
"grad_norm": 0.3129955232143402,
"learning_rate": 0.00044565763625565354,
"loss": 0.9947,
"step": 3380
},
{
"epoch": 0.03791500998204909,
"grad_norm": 0.27574583888053894,
"learning_rate": 0.0004454292110192334,
"loss": 1.0004,
"step": 3390
},
{
"epoch": 0.03802685366931177,
"grad_norm": 0.3088320791721344,
"learning_rate": 0.0004452007857828133,
"loss": 0.9907,
"step": 3400
},
{
"epoch": 0.03813869735657445,
"grad_norm": 0.3232235908508301,
"learning_rate": 0.00044497236054639316,
"loss": 0.9956,
"step": 3410
},
{
"epoch": 0.038250541043837134,
"grad_norm": 0.3009951114654541,
"learning_rate": 0.000444743935309973,
"loss": 0.9899,
"step": 3420
},
{
"epoch": 0.03836238473109981,
"grad_norm": 0.2987104058265686,
"learning_rate": 0.00044451551007355293,
"loss": 0.9852,
"step": 3430
},
{
"epoch": 0.0384742284183625,
"grad_norm": 0.2890870273113251,
"learning_rate": 0.00044428708483713284,
"loss": 0.9775,
"step": 3440
},
{
"epoch": 0.03858607210562518,
"grad_norm": 0.2704969048500061,
"learning_rate": 0.0004440586596007127,
"loss": 0.9745,
"step": 3450
},
{
"epoch": 0.03869791579288786,
"grad_norm": 0.3041844964027405,
"learning_rate": 0.0004438302343642926,
"loss": 0.977,
"step": 3460
},
{
"epoch": 0.038809759480150544,
"grad_norm": 0.2794378995895386,
"learning_rate": 0.00044360180912787246,
"loss": 0.9818,
"step": 3470
},
{
"epoch": 0.03892160316741322,
"grad_norm": 0.2784910798072815,
"learning_rate": 0.0004433733838914523,
"loss": 0.9655,
"step": 3480
},
{
"epoch": 0.039033446854675906,
"grad_norm": 0.2610478103160858,
"learning_rate": 0.0004431449586550322,
"loss": 0.975,
"step": 3490
},
{
"epoch": 0.039145290541938584,
"grad_norm": 0.2646799087524414,
"learning_rate": 0.0004429165334186121,
"loss": 0.9767,
"step": 3500
},
{
"epoch": 0.03925713422920127,
"grad_norm": 0.2622663676738739,
"learning_rate": 0.00044268810818219194,
"loss": 0.98,
"step": 3510
},
{
"epoch": 0.03936897791646395,
"grad_norm": 0.26897987723350525,
"learning_rate": 0.0004424596829457719,
"loss": 0.9718,
"step": 3520
},
{
"epoch": 0.03948082160372663,
"grad_norm": 0.29816752672195435,
"learning_rate": 0.00044223125770935176,
"loss": 1.0074,
"step": 3530
},
{
"epoch": 0.039592665290989315,
"grad_norm": 0.2652198076248169,
"learning_rate": 0.0004420028324729316,
"loss": 0.9789,
"step": 3540
},
{
"epoch": 0.03970450897825199,
"grad_norm": 0.2648336887359619,
"learning_rate": 0.0004417744072365115,
"loss": 0.9794,
"step": 3550
},
{
"epoch": 0.03981635266551468,
"grad_norm": 0.25409677624702454,
"learning_rate": 0.0004415459820000914,
"loss": 0.9868,
"step": 3560
},
{
"epoch": 0.039928196352777355,
"grad_norm": 0.25675469636917114,
"learning_rate": 0.00044131755676367123,
"loss": 0.9827,
"step": 3570
},
{
"epoch": 0.04004004004004004,
"grad_norm": 0.2915634214878082,
"learning_rate": 0.00044108913152725114,
"loss": 0.9833,
"step": 3580
},
{
"epoch": 0.040151883727302724,
"grad_norm": 0.29538393020629883,
"learning_rate": 0.000440860706290831,
"loss": 0.9848,
"step": 3590
},
{
"epoch": 0.0402637274145654,
"grad_norm": 0.3026215732097626,
"learning_rate": 0.0004406322810544109,
"loss": 0.9778,
"step": 3600
},
{
"epoch": 0.04037557110182809,
"grad_norm": 0.30865418910980225,
"learning_rate": 0.0004404038558179908,
"loss": 0.9743,
"step": 3610
},
{
"epoch": 0.040487414789090764,
"grad_norm": 0.28092265129089355,
"learning_rate": 0.0004401754305815707,
"loss": 0.9795,
"step": 3620
},
{
"epoch": 0.04059925847635345,
"grad_norm": 0.27747923135757446,
"learning_rate": 0.00043994700534515053,
"loss": 0.9642,
"step": 3630
},
{
"epoch": 0.040711102163616134,
"grad_norm": 0.28192010521888733,
"learning_rate": 0.00043971858010873044,
"loss": 0.9742,
"step": 3640
},
{
"epoch": 0.04082294585087881,
"grad_norm": 0.2670564651489258,
"learning_rate": 0.0004394901548723103,
"loss": 0.9544,
"step": 3650
},
{
"epoch": 0.040934789538141496,
"grad_norm": 0.3089617192745209,
"learning_rate": 0.00043926172963589015,
"loss": 0.9563,
"step": 3660
},
{
"epoch": 0.041046633225404174,
"grad_norm": 0.26768213510513306,
"learning_rate": 0.00043903330439947,
"loss": 0.9531,
"step": 3670
},
{
"epoch": 0.04115847691266686,
"grad_norm": 0.28865131735801697,
"learning_rate": 0.00043880487916305,
"loss": 0.9579,
"step": 3680
},
{
"epoch": 0.041270320599929536,
"grad_norm": 0.27369582653045654,
"learning_rate": 0.00043857645392662983,
"loss": 0.9679,
"step": 3690
},
{
"epoch": 0.04138216428719222,
"grad_norm": 0.2889108955860138,
"learning_rate": 0.0004383480286902097,
"loss": 0.9561,
"step": 3700
},
{
"epoch": 0.041494007974454905,
"grad_norm": 0.2701929211616516,
"learning_rate": 0.0004381196034537896,
"loss": 0.9642,
"step": 3710
},
{
"epoch": 0.04160585166171758,
"grad_norm": 0.2817586064338684,
"learning_rate": 0.00043789117821736945,
"loss": 0.9701,
"step": 3720
},
{
"epoch": 0.04171769534898027,
"grad_norm": 0.2924664318561554,
"learning_rate": 0.0004376627529809493,
"loss": 0.9617,
"step": 3730
},
{
"epoch": 0.041829539036242945,
"grad_norm": 0.28590497374534607,
"learning_rate": 0.0004374343277445292,
"loss": 0.9646,
"step": 3740
},
{
"epoch": 0.04194138272350563,
"grad_norm": 0.270046591758728,
"learning_rate": 0.0004372059025081091,
"loss": 0.95,
"step": 3750
},
{
"epoch": 0.04205322641076831,
"grad_norm": 0.2508755326271057,
"learning_rate": 0.000436977477271689,
"loss": 0.9525,
"step": 3760
},
{
"epoch": 0.04216507009803099,
"grad_norm": 0.26878127455711365,
"learning_rate": 0.0004367490520352689,
"loss": 0.9609,
"step": 3770
},
{
"epoch": 0.04227691378529368,
"grad_norm": 0.26882994174957275,
"learning_rate": 0.00043652062679884875,
"loss": 0.9671,
"step": 3780
},
{
"epoch": 0.042388757472556354,
"grad_norm": 0.28049325942993164,
"learning_rate": 0.0004362922015624286,
"loss": 0.9492,
"step": 3790
},
{
"epoch": 0.04250060115981904,
"grad_norm": 0.33502647280693054,
"learning_rate": 0.0004360637763260085,
"loss": 0.9537,
"step": 3800
},
{
"epoch": 0.04261244484708172,
"grad_norm": 0.321997731924057,
"learning_rate": 0.00043583535108958837,
"loss": 0.9646,
"step": 3810
},
{
"epoch": 0.0427242885343444,
"grad_norm": 0.29477357864379883,
"learning_rate": 0.0004356069258531682,
"loss": 0.9794,
"step": 3820
},
{
"epoch": 0.04283613222160708,
"grad_norm": 0.2989972233772278,
"learning_rate": 0.0004353785006167482,
"loss": 0.9645,
"step": 3830
},
{
"epoch": 0.042947975908869764,
"grad_norm": 0.33459851145744324,
"learning_rate": 0.00043515007538032804,
"loss": 0.9556,
"step": 3840
},
{
"epoch": 0.04305981959613245,
"grad_norm": 0.2941781282424927,
"learning_rate": 0.0004349216501439079,
"loss": 0.9507,
"step": 3850
},
{
"epoch": 0.043171663283395126,
"grad_norm": 0.27801111340522766,
"learning_rate": 0.0004346932249074878,
"loss": 0.9623,
"step": 3860
},
{
"epoch": 0.04328350697065781,
"grad_norm": 0.2765832841396332,
"learning_rate": 0.00043446479967106767,
"loss": 0.9815,
"step": 3870
},
{
"epoch": 0.04339535065792049,
"grad_norm": 0.303786039352417,
"learning_rate": 0.0004342363744346475,
"loss": 0.9575,
"step": 3880
},
{
"epoch": 0.04350719434518317,
"grad_norm": 0.29517048597335815,
"learning_rate": 0.00043400794919822743,
"loss": 0.9554,
"step": 3890
},
{
"epoch": 0.04361903803244585,
"grad_norm": 0.28657206892967224,
"learning_rate": 0.0004337795239618073,
"loss": 0.9631,
"step": 3900
},
{
"epoch": 0.043730881719708535,
"grad_norm": 0.2933245003223419,
"learning_rate": 0.0004335510987253872,
"loss": 0.987,
"step": 3910
},
{
"epoch": 0.04384272540697122,
"grad_norm": 0.31331002712249756,
"learning_rate": 0.0004333226734889671,
"loss": 0.971,
"step": 3920
},
{
"epoch": 0.0439545690942339,
"grad_norm": 0.32431700825691223,
"learning_rate": 0.00043309424825254696,
"loss": 0.9603,
"step": 3930
},
{
"epoch": 0.04406641278149658,
"grad_norm": 0.3346642851829529,
"learning_rate": 0.0004328658230161268,
"loss": 0.9721,
"step": 3940
},
{
"epoch": 0.04417825646875926,
"grad_norm": 0.33921241760253906,
"learning_rate": 0.00043263739777970673,
"loss": 0.9639,
"step": 3950
},
{
"epoch": 0.044290100156021944,
"grad_norm": 0.3068247139453888,
"learning_rate": 0.0004324089725432866,
"loss": 0.9756,
"step": 3960
},
{
"epoch": 0.04440194384328462,
"grad_norm": 0.3049049973487854,
"learning_rate": 0.00043218054730686644,
"loss": 0.9693,
"step": 3970
},
{
"epoch": 0.04451378753054731,
"grad_norm": 0.30104655027389526,
"learning_rate": 0.00043195212207044635,
"loss": 0.9704,
"step": 3980
},
{
"epoch": 0.04462563121780999,
"grad_norm": 0.36955609917640686,
"learning_rate": 0.00043172369683402626,
"loss": 0.9527,
"step": 3990
},
{
"epoch": 0.04473747490507267,
"grad_norm": 0.318854957818985,
"learning_rate": 0.0004314952715976061,
"loss": 0.9543,
"step": 4000
},
{
"epoch": 0.044849318592335354,
"grad_norm": 0.3166191875934601,
"learning_rate": 0.000431266846361186,
"loss": 0.968,
"step": 4010
},
{
"epoch": 0.04496116227959803,
"grad_norm": 0.2976950407028198,
"learning_rate": 0.0004310384211247659,
"loss": 0.9822,
"step": 4020
},
{
"epoch": 0.045073005966860716,
"grad_norm": 0.2912284731864929,
"learning_rate": 0.00043080999588834574,
"loss": 0.9759,
"step": 4030
},
{
"epoch": 0.0451848496541234,
"grad_norm": 0.31027549505233765,
"learning_rate": 0.00043058157065192565,
"loss": 0.9794,
"step": 4040
},
{
"epoch": 0.04529669334138608,
"grad_norm": 0.3182738721370697,
"learning_rate": 0.0004303531454155055,
"loss": 0.9654,
"step": 4050
},
{
"epoch": 0.04540853702864876,
"grad_norm": 0.3006060719490051,
"learning_rate": 0.00043012472017908536,
"loss": 0.9548,
"step": 4060
},
{
"epoch": 0.04552038071591144,
"grad_norm": 0.2828291654586792,
"learning_rate": 0.0004298962949426653,
"loss": 0.9611,
"step": 4070
},
{
"epoch": 0.045632224403174125,
"grad_norm": 0.30988603830337524,
"learning_rate": 0.0004296678697062452,
"loss": 0.9614,
"step": 4080
},
{
"epoch": 0.0457440680904368,
"grad_norm": 0.29344943165779114,
"learning_rate": 0.00042943944446982503,
"loss": 0.9522,
"step": 4090
},
{
"epoch": 0.04585591177769949,
"grad_norm": 0.29713529348373413,
"learning_rate": 0.00042921101923340494,
"loss": 0.9468,
"step": 4100
},
{
"epoch": 0.04596775546496217,
"grad_norm": 0.2815961539745331,
"learning_rate": 0.0004289825939969848,
"loss": 0.9546,
"step": 4110
},
{
"epoch": 0.04607959915222485,
"grad_norm": 0.25218480825424194,
"learning_rate": 0.00042875416876056465,
"loss": 0.9372,
"step": 4120
},
{
"epoch": 0.046191442839487534,
"grad_norm": 0.2735552191734314,
"learning_rate": 0.0004285257435241445,
"loss": 0.942,
"step": 4130
},
{
"epoch": 0.04630328652675021,
"grad_norm": 0.27451473474502563,
"learning_rate": 0.0004282973182877245,
"loss": 0.931,
"step": 4140
},
{
"epoch": 0.0464151302140129,
"grad_norm": 0.24361196160316467,
"learning_rate": 0.00042806889305130433,
"loss": 0.924,
"step": 4150
},
{
"epoch": 0.046526973901275574,
"grad_norm": 0.25817179679870605,
"learning_rate": 0.0004278404678148842,
"loss": 0.9373,
"step": 4160
},
{
"epoch": 0.04663881758853826,
"grad_norm": 0.28722450137138367,
"learning_rate": 0.0004276120425784641,
"loss": 0.9271,
"step": 4170
},
{
"epoch": 0.046750661275800943,
"grad_norm": 0.25202882289886475,
"learning_rate": 0.00042738361734204395,
"loss": 0.9187,
"step": 4180
},
{
"epoch": 0.04686250496306362,
"grad_norm": 0.2637481391429901,
"learning_rate": 0.0004271551921056238,
"loss": 0.9402,
"step": 4190
},
{
"epoch": 0.046974348650326306,
"grad_norm": 0.2684090733528137,
"learning_rate": 0.0004269267668692037,
"loss": 0.9574,
"step": 4200
},
{
"epoch": 0.047086192337588983,
"grad_norm": 0.28711873292922974,
"learning_rate": 0.00042669834163278357,
"loss": 0.9551,
"step": 4210
},
{
"epoch": 0.04719803602485167,
"grad_norm": 0.2933102250099182,
"learning_rate": 0.0004264699163963635,
"loss": 0.9457,
"step": 4220
},
{
"epoch": 0.047309879712114346,
"grad_norm": 0.2875578701496124,
"learning_rate": 0.0004262414911599434,
"loss": 0.9667,
"step": 4230
},
{
"epoch": 0.04742172339937703,
"grad_norm": 0.3007104694843292,
"learning_rate": 0.00042601306592352325,
"loss": 0.9672,
"step": 4240
},
{
"epoch": 0.047533567086639715,
"grad_norm": 0.30211201310157776,
"learning_rate": 0.0004257846406871031,
"loss": 0.9781,
"step": 4250
},
{
"epoch": 0.04764541077390239,
"grad_norm": 0.29263827204704285,
"learning_rate": 0.000425556215450683,
"loss": 0.9923,
"step": 4260
},
{
"epoch": 0.04775725446116508,
"grad_norm": 0.29569676518440247,
"learning_rate": 0.00042532779021426287,
"loss": 0.9913,
"step": 4270
},
{
"epoch": 0.047869098148427755,
"grad_norm": 0.28223690390586853,
"learning_rate": 0.0004250993649778427,
"loss": 0.9817,
"step": 4280
},
{
"epoch": 0.04798094183569044,
"grad_norm": 0.271419882774353,
"learning_rate": 0.00042487093974142263,
"loss": 0.9977,
"step": 4290
},
{
"epoch": 0.04809278552295312,
"grad_norm": 0.26362791657447815,
"learning_rate": 0.00042464251450500254,
"loss": 0.9859,
"step": 4300
},
{
"epoch": 0.0482046292102158,
"grad_norm": 0.31365934014320374,
"learning_rate": 0.0004244140892685824,
"loss": 0.9862,
"step": 4310
},
{
"epoch": 0.04831647289747849,
"grad_norm": 0.26915237307548523,
"learning_rate": 0.0004241856640321623,
"loss": 0.9693,
"step": 4320
},
{
"epoch": 0.048428316584741164,
"grad_norm": 0.2639203369617462,
"learning_rate": 0.00042395723879574217,
"loss": 0.9691,
"step": 4330
},
{
"epoch": 0.04854016027200385,
"grad_norm": 0.30106601119041443,
"learning_rate": 0.000423728813559322,
"loss": 0.9521,
"step": 4340
},
{
"epoch": 0.04865200395926653,
"grad_norm": 0.2807524800300598,
"learning_rate": 0.00042350038832290193,
"loss": 0.9616,
"step": 4350
},
{
"epoch": 0.04876384764652921,
"grad_norm": 0.27363407611846924,
"learning_rate": 0.0004232719630864818,
"loss": 0.9538,
"step": 4360
},
{
"epoch": 0.048875691333791896,
"grad_norm": 0.29041701555252075,
"learning_rate": 0.00042304353785006164,
"loss": 0.9455,
"step": 4370
},
{
"epoch": 0.048987535021054573,
"grad_norm": 0.28237226605415344,
"learning_rate": 0.0004228151126136416,
"loss": 0.9615,
"step": 4380
},
{
"epoch": 0.04909937870831726,
"grad_norm": 0.30885329842567444,
"learning_rate": 0.00042258668737722146,
"loss": 0.9691,
"step": 4390
},
{
"epoch": 0.049211222395579936,
"grad_norm": 0.2734643220901489,
"learning_rate": 0.0004223582621408013,
"loss": 0.9663,
"step": 4400
},
{
"epoch": 0.04932306608284262,
"grad_norm": 0.2652278244495392,
"learning_rate": 0.00042212983690438123,
"loss": 0.9439,
"step": 4410
},
{
"epoch": 0.0494349097701053,
"grad_norm": 0.27749761939048767,
"learning_rate": 0.0004219014116679611,
"loss": 0.9623,
"step": 4420
},
{
"epoch": 0.04954675345736798,
"grad_norm": 0.2812553942203522,
"learning_rate": 0.00042167298643154094,
"loss": 0.9557,
"step": 4430
},
{
"epoch": 0.04965859714463067,
"grad_norm": 0.2762252688407898,
"learning_rate": 0.00042144456119512085,
"loss": 0.945,
"step": 4440
},
{
"epoch": 0.049770440831893345,
"grad_norm": 0.277118980884552,
"learning_rate": 0.0004212161359587007,
"loss": 0.93,
"step": 4450
},
{
"epoch": 0.04988228451915603,
"grad_norm": 0.2723037004470825,
"learning_rate": 0.0004209877107222806,
"loss": 0.963,
"step": 4460
},
{
"epoch": 0.04999412820641871,
"grad_norm": 0.29789137840270996,
"learning_rate": 0.0004207592854858605,
"loss": 0.954,
"step": 4470
},
{
"epoch": 0.05010597189368139,
"grad_norm": 0.26940014958381653,
"learning_rate": 0.0004205308602494404,
"loss": 0.9443,
"step": 4480
},
{
"epoch": 0.05021781558094407,
"grad_norm": 0.263300359249115,
"learning_rate": 0.00042030243501302024,
"loss": 0.9403,
"step": 4490
},
{
"epoch": 0.050329659268206754,
"grad_norm": 0.27823972702026367,
"learning_rate": 0.00042007400977660015,
"loss": 0.95,
"step": 4500
},
{
"epoch": 0.05044150295546944,
"grad_norm": 0.2782444357872009,
"learning_rate": 0.00041984558454018,
"loss": 0.953,
"step": 4510
},
{
"epoch": 0.05055334664273212,
"grad_norm": 0.277182936668396,
"learning_rate": 0.00041961715930375986,
"loss": 0.9498,
"step": 4520
},
{
"epoch": 0.0506651903299948,
"grad_norm": 0.2942575514316559,
"learning_rate": 0.00041938873406733977,
"loss": 0.957,
"step": 4530
},
{
"epoch": 0.05077703401725748,
"grad_norm": 0.3258327543735504,
"learning_rate": 0.0004191603088309197,
"loss": 0.9626,
"step": 4540
},
{
"epoch": 0.05088887770452016,
"grad_norm": 0.27874353528022766,
"learning_rate": 0.00041893188359449953,
"loss": 0.971,
"step": 4550
},
{
"epoch": 0.05100072139178284,
"grad_norm": 0.2981313169002533,
"learning_rate": 0.00041870345835807944,
"loss": 0.965,
"step": 4560
},
{
"epoch": 0.051112565079045526,
"grad_norm": 0.30568984150886536,
"learning_rate": 0.0004184750331216593,
"loss": 0.9566,
"step": 4570
},
{
"epoch": 0.05122440876630821,
"grad_norm": 0.27867600321769714,
"learning_rate": 0.00041824660788523915,
"loss": 0.94,
"step": 4580
},
{
"epoch": 0.05133625245357089,
"grad_norm": 0.30877605080604553,
"learning_rate": 0.000418018182648819,
"loss": 0.9453,
"step": 4590
},
{
"epoch": 0.05144809614083357,
"grad_norm": 0.3018844425678253,
"learning_rate": 0.0004177897574123989,
"loss": 0.9511,
"step": 4600
},
{
"epoch": 0.05155993982809625,
"grad_norm": 0.27943944931030273,
"learning_rate": 0.0004175613321759788,
"loss": 0.9371,
"step": 4610
},
{
"epoch": 0.051671783515358935,
"grad_norm": 0.2654775381088257,
"learning_rate": 0.0004173329069395587,
"loss": 0.9366,
"step": 4620
},
{
"epoch": 0.05178362720262161,
"grad_norm": 0.27594050765037537,
"learning_rate": 0.0004171044817031386,
"loss": 0.9229,
"step": 4630
},
{
"epoch": 0.0518954708898843,
"grad_norm": 0.26856914162635803,
"learning_rate": 0.00041687605646671845,
"loss": 0.9357,
"step": 4640
},
{
"epoch": 0.05200731457714698,
"grad_norm": 0.2956237494945526,
"learning_rate": 0.0004166476312302983,
"loss": 0.9023,
"step": 4650
},
{
"epoch": 0.05211915826440966,
"grad_norm": 0.30004164576530457,
"learning_rate": 0.0004164192059938782,
"loss": 0.9273,
"step": 4660
},
{
"epoch": 0.052231001951672344,
"grad_norm": 0.2691096365451813,
"learning_rate": 0.0004161907807574581,
"loss": 0.9332,
"step": 4670
},
{
"epoch": 0.05234284563893502,
"grad_norm": 0.2551780641078949,
"learning_rate": 0.00041596235552103793,
"loss": 0.9327,
"step": 4680
},
{
"epoch": 0.052454689326197707,
"grad_norm": 0.2806546092033386,
"learning_rate": 0.0004157339302846179,
"loss": 0.9355,
"step": 4690
},
{
"epoch": 0.05256653301346039,
"grad_norm": 0.27648645639419556,
"learning_rate": 0.00041550550504819775,
"loss": 0.9348,
"step": 4700
},
{
"epoch": 0.05267837670072307,
"grad_norm": 0.2816336750984192,
"learning_rate": 0.0004152770798117776,
"loss": 0.9294,
"step": 4710
},
{
"epoch": 0.05279022038798575,
"grad_norm": 0.29570698738098145,
"learning_rate": 0.0004150486545753575,
"loss": 0.9317,
"step": 4720
},
{
"epoch": 0.05290206407524843,
"grad_norm": 0.26981687545776367,
"learning_rate": 0.00041482022933893737,
"loss": 0.9317,
"step": 4730
},
{
"epoch": 0.053013907762511116,
"grad_norm": 0.2586159110069275,
"learning_rate": 0.0004145918041025172,
"loss": 0.9162,
"step": 4740
},
{
"epoch": 0.05312575144977379,
"grad_norm": 0.24129503965377808,
"learning_rate": 0.00041436337886609714,
"loss": 0.934,
"step": 4750
},
{
"epoch": 0.05323759513703648,
"grad_norm": 0.28072717785835266,
"learning_rate": 0.000414134953629677,
"loss": 0.9089,
"step": 4760
},
{
"epoch": 0.05334943882429916,
"grad_norm": 0.2760024964809418,
"learning_rate": 0.0004139065283932569,
"loss": 0.9115,
"step": 4770
},
{
"epoch": 0.05346128251156184,
"grad_norm": 0.28894710540771484,
"learning_rate": 0.0004136781031568368,
"loss": 0.9108,
"step": 4780
},
{
"epoch": 0.053573126198824525,
"grad_norm": 0.27882319688796997,
"learning_rate": 0.00041344967792041667,
"loss": 0.9184,
"step": 4790
},
{
"epoch": 0.0536849698860872,
"grad_norm": 0.27242934703826904,
"learning_rate": 0.0004132212526839965,
"loss": 0.9498,
"step": 4800
},
{
"epoch": 0.05379681357334989,
"grad_norm": 0.2809596359729767,
"learning_rate": 0.00041299282744757643,
"loss": 0.9365,
"step": 4810
},
{
"epoch": 0.053908657260612565,
"grad_norm": 0.3026556074619293,
"learning_rate": 0.0004127644022111563,
"loss": 0.9433,
"step": 4820
},
{
"epoch": 0.05402050094787525,
"grad_norm": 0.2933846116065979,
"learning_rate": 0.00041253597697473614,
"loss": 0.9351,
"step": 4830
},
{
"epoch": 0.054132344635137934,
"grad_norm": 0.2774868309497833,
"learning_rate": 0.00041230755173831605,
"loss": 0.9285,
"step": 4840
},
{
"epoch": 0.05424418832240061,
"grad_norm": 0.2859903573989868,
"learning_rate": 0.00041207912650189596,
"loss": 0.9344,
"step": 4850
},
{
"epoch": 0.054356032009663297,
"grad_norm": 0.26687270402908325,
"learning_rate": 0.0004118507012654758,
"loss": 0.9281,
"step": 4860
},
{
"epoch": 0.054467875696925974,
"grad_norm": 0.31075340509414673,
"learning_rate": 0.00041162227602905573,
"loss": 0.9418,
"step": 4870
},
{
"epoch": 0.05457971938418866,
"grad_norm": 0.2569184899330139,
"learning_rate": 0.0004113938507926356,
"loss": 0.9394,
"step": 4880
},
{
"epoch": 0.054691563071451336,
"grad_norm": 0.26250478625297546,
"learning_rate": 0.00041116542555621544,
"loss": 0.9499,
"step": 4890
},
{
"epoch": 0.05480340675871402,
"grad_norm": 0.27604004740715027,
"learning_rate": 0.00041093700031979535,
"loss": 0.9268,
"step": 4900
},
{
"epoch": 0.054915250445976706,
"grad_norm": 0.26279163360595703,
"learning_rate": 0.0004107085750833752,
"loss": 0.9313,
"step": 4910
},
{
"epoch": 0.05502709413323938,
"grad_norm": 0.29265978932380676,
"learning_rate": 0.00041048014984695506,
"loss": 0.9498,
"step": 4920
},
{
"epoch": 0.05513893782050207,
"grad_norm": 0.32107868790626526,
"learning_rate": 0.000410251724610535,
"loss": 0.9708,
"step": 4930
},
{
"epoch": 0.055250781507764746,
"grad_norm": 0.32804161310195923,
"learning_rate": 0.0004100232993741149,
"loss": 0.9624,
"step": 4940
},
{
"epoch": 0.05536262519502743,
"grad_norm": 0.3207037150859833,
"learning_rate": 0.00040979487413769474,
"loss": 0.9538,
"step": 4950
},
{
"epoch": 0.05547446888229011,
"grad_norm": 0.29660555720329285,
"learning_rate": 0.00040956644890127465,
"loss": 0.9677,
"step": 4960
},
{
"epoch": 0.05558631256955279,
"grad_norm": 0.34930771589279175,
"learning_rate": 0.0004093380236648545,
"loss": 0.9777,
"step": 4970
},
{
"epoch": 0.05569815625681548,
"grad_norm": 0.3037464916706085,
"learning_rate": 0.00040910959842843436,
"loss": 0.9826,
"step": 4980
},
{
"epoch": 0.055809999944078155,
"grad_norm": 0.31435292959213257,
"learning_rate": 0.00040888117319201427,
"loss": 0.9677,
"step": 4990
},
{
"epoch": 0.05592184363134084,
"grad_norm": 0.29182785749435425,
"learning_rate": 0.0004086527479555941,
"loss": 0.9563,
"step": 5000
},
{
"epoch": 0.05603368731860352,
"grad_norm": 0.34796231985092163,
"learning_rate": 0.00040842432271917403,
"loss": 0.957,
"step": 5010
},
{
"epoch": 0.0561455310058662,
"grad_norm": 0.3027050495147705,
"learning_rate": 0.00040819589748275394,
"loss": 0.967,
"step": 5020
},
{
"epoch": 0.056257374693128887,
"grad_norm": 0.3419332802295685,
"learning_rate": 0.0004079674722463338,
"loss": 0.9654,
"step": 5030
},
{
"epoch": 0.056369218380391564,
"grad_norm": 0.29381224513053894,
"learning_rate": 0.00040773904700991366,
"loss": 0.9647,
"step": 5040
},
{
"epoch": 0.05648106206765425,
"grad_norm": 0.29206860065460205,
"learning_rate": 0.0004075106217734935,
"loss": 0.9637,
"step": 5050
},
{
"epoch": 0.056592905754916926,
"grad_norm": 0.3169795274734497,
"learning_rate": 0.0004072821965370734,
"loss": 0.963,
"step": 5060
},
{
"epoch": 0.05670474944217961,
"grad_norm": 0.30713772773742676,
"learning_rate": 0.0004070537713006533,
"loss": 0.9766,
"step": 5070
},
{
"epoch": 0.05681659312944229,
"grad_norm": 0.29805994033813477,
"learning_rate": 0.00040682534606423313,
"loss": 0.9597,
"step": 5080
},
{
"epoch": 0.05692843681670497,
"grad_norm": 0.33419644832611084,
"learning_rate": 0.0004065969208278131,
"loss": 0.9598,
"step": 5090
},
{
"epoch": 0.05704028050396766,
"grad_norm": 0.31769025325775146,
"learning_rate": 0.00040636849559139295,
"loss": 0.942,
"step": 5100
},
{
"epoch": 0.057152124191230336,
"grad_norm": 0.3017726242542267,
"learning_rate": 0.0004061400703549728,
"loss": 0.9627,
"step": 5110
},
{
"epoch": 0.05726396787849302,
"grad_norm": 0.32213470339775085,
"learning_rate": 0.0004059116451185527,
"loss": 0.9518,
"step": 5120
},
{
"epoch": 0.0573758115657557,
"grad_norm": 0.29069948196411133,
"learning_rate": 0.0004056832198821326,
"loss": 0.9337,
"step": 5130
},
{
"epoch": 0.05748765525301838,
"grad_norm": 0.32283100485801697,
"learning_rate": 0.00040545479464571243,
"loss": 0.959,
"step": 5140
},
{
"epoch": 0.05759949894028106,
"grad_norm": 0.3191847801208496,
"learning_rate": 0.00040522636940929234,
"loss": 0.9439,
"step": 5150
},
{
"epoch": 0.057711342627543745,
"grad_norm": 0.565864622592926,
"learning_rate": 0.00040499794417287225,
"loss": 0.9587,
"step": 5160
},
{
"epoch": 0.05782318631480643,
"grad_norm": 0.3419003188610077,
"learning_rate": 0.0004047695189364521,
"loss": 0.9466,
"step": 5170
},
{
"epoch": 0.05793503000206911,
"grad_norm": 0.28331097960472107,
"learning_rate": 0.000404541093700032,
"loss": 0.9472,
"step": 5180
},
{
"epoch": 0.05804687368933179,
"grad_norm": 0.2994554042816162,
"learning_rate": 0.00040431266846361187,
"loss": 0.9434,
"step": 5190
},
{
"epoch": 0.05815871737659447,
"grad_norm": 0.30070000886917114,
"learning_rate": 0.0004040842432271917,
"loss": 0.9408,
"step": 5200
},
{
"epoch": 0.058270561063857154,
"grad_norm": 0.29924333095550537,
"learning_rate": 0.00040385581799077164,
"loss": 0.9484,
"step": 5210
},
{
"epoch": 0.05838240475111983,
"grad_norm": 0.2905283272266388,
"learning_rate": 0.0004036273927543515,
"loss": 0.9636,
"step": 5220
},
{
"epoch": 0.058494248438382516,
"grad_norm": 0.3290540874004364,
"learning_rate": 0.00040339896751793135,
"loss": 0.9396,
"step": 5230
},
{
"epoch": 0.0586060921256452,
"grad_norm": 0.29686272144317627,
"learning_rate": 0.0004031705422815113,
"loss": 0.9408,
"step": 5240
},
{
"epoch": 0.05871793581290788,
"grad_norm": 0.2768057882785797,
"learning_rate": 0.00040294211704509117,
"loss": 0.9328,
"step": 5250
},
{
"epoch": 0.05882977950017056,
"grad_norm": 0.2614899277687073,
"learning_rate": 0.000402713691808671,
"loss": 0.9483,
"step": 5260
},
{
"epoch": 0.05894162318743324,
"grad_norm": 0.2692766487598419,
"learning_rate": 0.00040248526657225093,
"loss": 0.9479,
"step": 5270
},
{
"epoch": 0.059053466874695926,
"grad_norm": 0.3009514808654785,
"learning_rate": 0.0004022568413358308,
"loss": 0.9681,
"step": 5280
},
{
"epoch": 0.0591653105619586,
"grad_norm": 0.27767086029052734,
"learning_rate": 0.00040202841609941064,
"loss": 0.9685,
"step": 5290
},
{
"epoch": 0.05927715424922129,
"grad_norm": 0.2956901788711548,
"learning_rate": 0.00040179999086299055,
"loss": 0.9609,
"step": 5300
},
{
"epoch": 0.05938899793648397,
"grad_norm": 0.3046570420265198,
"learning_rate": 0.0004015715656265704,
"loss": 0.961,
"step": 5310
},
{
"epoch": 0.05950084162374665,
"grad_norm": 0.24477365612983704,
"learning_rate": 0.0004013431403901503,
"loss": 0.9501,
"step": 5320
},
{
"epoch": 0.059612685311009335,
"grad_norm": 0.25505194067955017,
"learning_rate": 0.00040111471515373023,
"loss": 0.946,
"step": 5330
},
{
"epoch": 0.05972452899827201,
"grad_norm": 0.26015251874923706,
"learning_rate": 0.0004008862899173101,
"loss": 0.9372,
"step": 5340
},
{
"epoch": 0.0598363726855347,
"grad_norm": 0.24911250174045563,
"learning_rate": 0.00040065786468088994,
"loss": 0.9487,
"step": 5350
},
{
"epoch": 0.059948216372797375,
"grad_norm": 0.2779735028743744,
"learning_rate": 0.00040042943944446985,
"loss": 0.9316,
"step": 5360
},
{
"epoch": 0.06006006006006006,
"grad_norm": 0.30663251876831055,
"learning_rate": 0.0004002010142080497,
"loss": 0.9461,
"step": 5370
},
{
"epoch": 0.060171903747322744,
"grad_norm": 0.2724740505218506,
"learning_rate": 0.00039997258897162956,
"loss": 0.9214,
"step": 5380
},
{
"epoch": 0.06028374743458542,
"grad_norm": 0.26819276809692383,
"learning_rate": 0.00039974416373520947,
"loss": 0.9368,
"step": 5390
},
{
"epoch": 0.060395591121848106,
"grad_norm": 0.26342320442199707,
"learning_rate": 0.0003995157384987894,
"loss": 0.9332,
"step": 5400
},
{
"epoch": 0.060507434809110784,
"grad_norm": 0.32590556144714355,
"learning_rate": 0.00039928731326236924,
"loss": 0.9286,
"step": 5410
},
{
"epoch": 0.06061927849637347,
"grad_norm": 0.2747272849082947,
"learning_rate": 0.00039905888802594915,
"loss": 0.932,
"step": 5420
},
{
"epoch": 0.06073112218363615,
"grad_norm": 0.23089702427387238,
"learning_rate": 0.000398830462789529,
"loss": 0.9216,
"step": 5430
},
{
"epoch": 0.06084296587089883,
"grad_norm": 0.24383346736431122,
"learning_rate": 0.00039860203755310886,
"loss": 0.9333,
"step": 5440
},
{
"epoch": 0.060954809558161516,
"grad_norm": 0.23999489843845367,
"learning_rate": 0.00039837361231668877,
"loss": 0.9134,
"step": 5450
},
{
"epoch": 0.06106665324542419,
"grad_norm": 0.3041435480117798,
"learning_rate": 0.0003981451870802686,
"loss": 0.9226,
"step": 5460
},
{
"epoch": 0.06117849693268688,
"grad_norm": 0.2667579650878906,
"learning_rate": 0.0003979167618438485,
"loss": 0.9148,
"step": 5470
},
{
"epoch": 0.061290340619949556,
"grad_norm": 0.2730364203453064,
"learning_rate": 0.0003976883366074284,
"loss": 0.9073,
"step": 5480
},
{
"epoch": 0.06140218430721224,
"grad_norm": 0.28175118565559387,
"learning_rate": 0.0003974599113710083,
"loss": 0.9097,
"step": 5490
},
{
"epoch": 0.061514027994474925,
"grad_norm": 0.2826266288757324,
"learning_rate": 0.00039723148613458816,
"loss": 0.8972,
"step": 5500
},
{
"epoch": 0.0616258716817376,
"grad_norm": 0.25821810960769653,
"learning_rate": 0.000397003060898168,
"loss": 0.8898,
"step": 5510
},
{
"epoch": 0.06173771536900029,
"grad_norm": 0.31401073932647705,
"learning_rate": 0.0003967746356617479,
"loss": 0.8986,
"step": 5520
},
{
"epoch": 0.061849559056262965,
"grad_norm": 0.2664715349674225,
"learning_rate": 0.0003965462104253278,
"loss": 0.9178,
"step": 5530
},
{
"epoch": 0.06196140274352565,
"grad_norm": 0.2725924253463745,
"learning_rate": 0.00039631778518890763,
"loss": 0.8941,
"step": 5540
},
{
"epoch": 0.06207324643078833,
"grad_norm": 0.2991993725299835,
"learning_rate": 0.0003960893599524876,
"loss": 0.899,
"step": 5550
},
{
"epoch": 0.06218509011805101,
"grad_norm": 0.2683865427970886,
"learning_rate": 0.00039586093471606745,
"loss": 0.9105,
"step": 5560
},
{
"epoch": 0.062296933805313696,
"grad_norm": 0.29127469658851624,
"learning_rate": 0.0003956325094796473,
"loss": 0.9091,
"step": 5570
},
{
"epoch": 0.062408777492576374,
"grad_norm": 0.28191229701042175,
"learning_rate": 0.0003954040842432272,
"loss": 0.9078,
"step": 5580
},
{
"epoch": 0.06252062117983906,
"grad_norm": 0.28319644927978516,
"learning_rate": 0.0003951756590068071,
"loss": 0.9134,
"step": 5590
},
{
"epoch": 0.06263246486710174,
"grad_norm": 0.2563108205795288,
"learning_rate": 0.00039494723377038693,
"loss": 0.9166,
"step": 5600
},
{
"epoch": 0.06274430855436441,
"grad_norm": 0.29730817675590515,
"learning_rate": 0.00039471880853396684,
"loss": 0.9101,
"step": 5610
},
{
"epoch": 0.0628561522416271,
"grad_norm": 0.25925830006599426,
"learning_rate": 0.0003944903832975467,
"loss": 0.9131,
"step": 5620
},
{
"epoch": 0.06296799592888978,
"grad_norm": 0.2645208537578583,
"learning_rate": 0.0003942619580611266,
"loss": 0.9203,
"step": 5630
},
{
"epoch": 0.06307983961615246,
"grad_norm": 0.2844574749469757,
"learning_rate": 0.0003940335328247065,
"loss": 0.914,
"step": 5640
},
{
"epoch": 0.06319168330341515,
"grad_norm": 0.2687402367591858,
"learning_rate": 0.00039380510758828637,
"loss": 0.9095,
"step": 5650
},
{
"epoch": 0.06330352699067783,
"grad_norm": 0.22893477976322174,
"learning_rate": 0.00039357668235186623,
"loss": 0.8993,
"step": 5660
},
{
"epoch": 0.06341537067794051,
"grad_norm": 0.27271768450737,
"learning_rate": 0.00039334825711544614,
"loss": 0.8989,
"step": 5670
},
{
"epoch": 0.06352721436520319,
"grad_norm": 0.27709853649139404,
"learning_rate": 0.000393119831879026,
"loss": 0.8998,
"step": 5680
},
{
"epoch": 0.06363905805246588,
"grad_norm": 0.24321520328521729,
"learning_rate": 0.00039289140664260585,
"loss": 0.887,
"step": 5690
},
{
"epoch": 0.06375090173972855,
"grad_norm": 0.26779887080192566,
"learning_rate": 0.00039266298140618576,
"loss": 0.9091,
"step": 5700
},
{
"epoch": 0.06386274542699123,
"grad_norm": 0.2612350881099701,
"learning_rate": 0.00039243455616976567,
"loss": 0.9043,
"step": 5710
},
{
"epoch": 0.06397458911425392,
"grad_norm": 0.26247987151145935,
"learning_rate": 0.0003922061309333455,
"loss": 0.9024,
"step": 5720
},
{
"epoch": 0.0640864328015166,
"grad_norm": 0.2605653703212738,
"learning_rate": 0.00039197770569692543,
"loss": 0.9311,
"step": 5730
},
{
"epoch": 0.06419827648877928,
"grad_norm": 0.28249841928482056,
"learning_rate": 0.0003917492804605053,
"loss": 0.9265,
"step": 5740
},
{
"epoch": 0.06431012017604196,
"grad_norm": 0.2880108654499054,
"learning_rate": 0.00039152085522408515,
"loss": 0.9331,
"step": 5750
},
{
"epoch": 0.06442196386330465,
"grad_norm": 0.31626009941101074,
"learning_rate": 0.00039129242998766506,
"loss": 0.9483,
"step": 5760
},
{
"epoch": 0.06453380755056733,
"grad_norm": 0.28972744941711426,
"learning_rate": 0.0003910640047512449,
"loss": 0.9239,
"step": 5770
},
{
"epoch": 0.06464565123783,
"grad_norm": 0.27140864729881287,
"learning_rate": 0.00039083557951482477,
"loss": 0.9259,
"step": 5780
},
{
"epoch": 0.0647574949250927,
"grad_norm": 0.26331818103790283,
"learning_rate": 0.00039060715427840473,
"loss": 0.9383,
"step": 5790
},
{
"epoch": 0.06486933861235537,
"grad_norm": 0.26927000284194946,
"learning_rate": 0.0003903787290419846,
"loss": 0.9236,
"step": 5800
},
{
"epoch": 0.06498118229961805,
"grad_norm": 0.2833601236343384,
"learning_rate": 0.00039015030380556444,
"loss": 0.9257,
"step": 5810
},
{
"epoch": 0.06509302598688073,
"grad_norm": 0.2970174551010132,
"learning_rate": 0.00038992187856914435,
"loss": 0.9164,
"step": 5820
},
{
"epoch": 0.06520486967414342,
"grad_norm": 0.27904263138771057,
"learning_rate": 0.0003896934533327242,
"loss": 0.9045,
"step": 5830
},
{
"epoch": 0.0653167133614061,
"grad_norm": 0.24879537522792816,
"learning_rate": 0.00038946502809630406,
"loss": 0.9,
"step": 5840
},
{
"epoch": 0.06542855704866878,
"grad_norm": 0.2897798717021942,
"learning_rate": 0.000389236602859884,
"loss": 0.919,
"step": 5850
},
{
"epoch": 0.06554040073593147,
"grad_norm": 0.26522865891456604,
"learning_rate": 0.00038900817762346383,
"loss": 0.9168,
"step": 5860
},
{
"epoch": 0.06565224442319414,
"grad_norm": 0.26405441761016846,
"learning_rate": 0.00038877975238704374,
"loss": 0.9169,
"step": 5870
},
{
"epoch": 0.06576408811045682,
"grad_norm": 0.2543514370918274,
"learning_rate": 0.00038855132715062365,
"loss": 0.917,
"step": 5880
},
{
"epoch": 0.06587593179771951,
"grad_norm": 0.2683538794517517,
"learning_rate": 0.0003883229019142035,
"loss": 0.9179,
"step": 5890
},
{
"epoch": 0.06598777548498219,
"grad_norm": 0.24559274315834045,
"learning_rate": 0.00038809447667778336,
"loss": 0.907,
"step": 5900
},
{
"epoch": 0.06609961917224487,
"grad_norm": 0.2604455351829529,
"learning_rate": 0.00038786605144136327,
"loss": 0.9172,
"step": 5910
},
{
"epoch": 0.06621146285950755,
"grad_norm": 0.24329319596290588,
"learning_rate": 0.0003876376262049431,
"loss": 0.9171,
"step": 5920
},
{
"epoch": 0.06632330654677024,
"grad_norm": 0.237509623169899,
"learning_rate": 0.000387409200968523,
"loss": 0.9272,
"step": 5930
},
{
"epoch": 0.06643515023403292,
"grad_norm": 0.2569025754928589,
"learning_rate": 0.00038718077573210284,
"loss": 0.9327,
"step": 5940
},
{
"epoch": 0.0665469939212956,
"grad_norm": 0.2908497750759125,
"learning_rate": 0.0003869523504956828,
"loss": 0.9299,
"step": 5950
},
{
"epoch": 0.06665883760855829,
"grad_norm": 0.24669544398784637,
"learning_rate": 0.00038672392525926266,
"loss": 0.9036,
"step": 5960
},
{
"epoch": 0.06677068129582096,
"grad_norm": 0.23906981945037842,
"learning_rate": 0.0003864955000228425,
"loss": 0.9266,
"step": 5970
},
{
"epoch": 0.06688252498308364,
"grad_norm": 0.2822079658508301,
"learning_rate": 0.0003862670747864224,
"loss": 0.9209,
"step": 5980
},
{
"epoch": 0.06699436867034632,
"grad_norm": 0.27469775080680847,
"learning_rate": 0.0003860386495500023,
"loss": 0.9385,
"step": 5990
},
{
"epoch": 0.06710621235760901,
"grad_norm": 0.24559862911701202,
"learning_rate": 0.00038581022431358213,
"loss": 0.9248,
"step": 6000
},
{
"epoch": 0.06721805604487169,
"grad_norm": 0.24427008628845215,
"learning_rate": 0.00038558179907716204,
"loss": 0.9358,
"step": 6010
},
{
"epoch": 0.06732989973213437,
"grad_norm": 0.2626965641975403,
"learning_rate": 0.00038535337384074195,
"loss": 0.9211,
"step": 6020
},
{
"epoch": 0.06744174341939706,
"grad_norm": 0.226990208029747,
"learning_rate": 0.0003851249486043218,
"loss": 0.9292,
"step": 6030
},
{
"epoch": 0.06755358710665973,
"grad_norm": 0.2762834131717682,
"learning_rate": 0.0003848965233679017,
"loss": 0.932,
"step": 6040
},
{
"epoch": 0.06766543079392241,
"grad_norm": 0.2799958884716034,
"learning_rate": 0.0003846680981314816,
"loss": 0.943,
"step": 6050
},
{
"epoch": 0.06777727448118509,
"grad_norm": 0.26224029064178467,
"learning_rate": 0.00038443967289506143,
"loss": 0.9236,
"step": 6060
},
{
"epoch": 0.06788911816844778,
"grad_norm": 0.2897866368293762,
"learning_rate": 0.00038421124765864134,
"loss": 0.95,
"step": 6070
},
{
"epoch": 0.06800096185571046,
"grad_norm": 0.2899113893508911,
"learning_rate": 0.0003839828224222212,
"loss": 0.9403,
"step": 6080
},
{
"epoch": 0.06811280554297314,
"grad_norm": 0.27765581011772156,
"learning_rate": 0.00038375439718580105,
"loss": 0.9447,
"step": 6090
},
{
"epoch": 0.06822464923023583,
"grad_norm": 0.27683207392692566,
"learning_rate": 0.000383525971949381,
"loss": 0.949,
"step": 6100
},
{
"epoch": 0.0683364929174985,
"grad_norm": 0.2815559506416321,
"learning_rate": 0.00038329754671296087,
"loss": 0.9627,
"step": 6110
},
{
"epoch": 0.06844833660476118,
"grad_norm": 0.2741657793521881,
"learning_rate": 0.00038306912147654073,
"loss": 0.9659,
"step": 6120
},
{
"epoch": 0.06856018029202386,
"grad_norm": 0.4103181064128876,
"learning_rate": 0.00038284069624012064,
"loss": 0.9612,
"step": 6130
},
{
"epoch": 0.06867202397928655,
"grad_norm": 0.2862701416015625,
"learning_rate": 0.0003826122710037005,
"loss": 0.9393,
"step": 6140
},
{
"epoch": 0.06878386766654923,
"grad_norm": 0.2789844274520874,
"learning_rate": 0.00038238384576728035,
"loss": 0.9447,
"step": 6150
},
{
"epoch": 0.06889571135381191,
"grad_norm": 0.590391993522644,
"learning_rate": 0.00038215542053086026,
"loss": 0.9525,
"step": 6160
},
{
"epoch": 0.0690075550410746,
"grad_norm": 0.2721211016178131,
"learning_rate": 0.0003819269952944401,
"loss": 0.9467,
"step": 6170
},
{
"epoch": 0.06911939872833728,
"grad_norm": 0.27576929330825806,
"learning_rate": 0.00038169857005802,
"loss": 0.9428,
"step": 6180
},
{
"epoch": 0.06923124241559996,
"grad_norm": 0.28229111433029175,
"learning_rate": 0.00038147014482159993,
"loss": 0.9418,
"step": 6190
},
{
"epoch": 0.06934308610286263,
"grad_norm": 0.29595518112182617,
"learning_rate": 0.0003812417195851798,
"loss": 0.9178,
"step": 6200
},
{
"epoch": 0.06945492979012532,
"grad_norm": 0.3055596351623535,
"learning_rate": 0.00038101329434875965,
"loss": 0.9464,
"step": 6210
},
{
"epoch": 0.069566773477388,
"grad_norm": 0.29212549328804016,
"learning_rate": 0.00038078486911233956,
"loss": 0.9491,
"step": 6220
},
{
"epoch": 0.06967861716465068,
"grad_norm": 0.288928359746933,
"learning_rate": 0.0003805564438759194,
"loss": 0.9285,
"step": 6230
},
{
"epoch": 0.06979046085191337,
"grad_norm": 0.2759207487106323,
"learning_rate": 0.00038032801863949927,
"loss": 0.9336,
"step": 6240
},
{
"epoch": 0.06990230453917605,
"grad_norm": 0.31041648983955383,
"learning_rate": 0.0003800995934030792,
"loss": 0.9317,
"step": 6250
},
{
"epoch": 0.07001414822643873,
"grad_norm": 0.29425299167633057,
"learning_rate": 0.0003798711681666591,
"loss": 0.9212,
"step": 6260
},
{
"epoch": 0.0701259919137014,
"grad_norm": 0.278062105178833,
"learning_rate": 0.00037964274293023894,
"loss": 0.9291,
"step": 6270
},
{
"epoch": 0.0702378356009641,
"grad_norm": 0.2983698546886444,
"learning_rate": 0.00037941431769381885,
"loss": 0.9169,
"step": 6280
},
{
"epoch": 0.07034967928822677,
"grad_norm": 0.29595527052879333,
"learning_rate": 0.0003791858924573987,
"loss": 0.9286,
"step": 6290
},
{
"epoch": 0.07046152297548945,
"grad_norm": 0.26365020871162415,
"learning_rate": 0.00037895746722097856,
"loss": 0.9312,
"step": 6300
},
{
"epoch": 0.07057336666275214,
"grad_norm": 0.27807778120040894,
"learning_rate": 0.0003787290419845585,
"loss": 0.9274,
"step": 6310
},
{
"epoch": 0.07068521035001482,
"grad_norm": 0.2585415840148926,
"learning_rate": 0.00037850061674813833,
"loss": 0.9513,
"step": 6320
},
{
"epoch": 0.0707970540372775,
"grad_norm": 0.2740543484687805,
"learning_rate": 0.0003782721915117182,
"loss": 0.922,
"step": 6330
},
{
"epoch": 0.07090889772454018,
"grad_norm": 0.28271788358688354,
"learning_rate": 0.00037804376627529815,
"loss": 0.94,
"step": 6340
},
{
"epoch": 0.07102074141180287,
"grad_norm": 0.28767603635787964,
"learning_rate": 0.000377815341038878,
"loss": 0.9295,
"step": 6350
},
{
"epoch": 0.07113258509906555,
"grad_norm": 0.25200092792510986,
"learning_rate": 0.00037758691580245786,
"loss": 0.9219,
"step": 6360
},
{
"epoch": 0.07124442878632822,
"grad_norm": 0.27449852228164673,
"learning_rate": 0.00037735849056603777,
"loss": 0.9227,
"step": 6370
},
{
"epoch": 0.07135627247359091,
"grad_norm": 0.27951040863990784,
"learning_rate": 0.0003771300653296176,
"loss": 0.9256,
"step": 6380
},
{
"epoch": 0.07146811616085359,
"grad_norm": 0.27883175015449524,
"learning_rate": 0.0003769016400931975,
"loss": 0.9244,
"step": 6390
},
{
"epoch": 0.07157995984811627,
"grad_norm": 0.27942216396331787,
"learning_rate": 0.00037667321485677734,
"loss": 0.9287,
"step": 6400
},
{
"epoch": 0.07169180353537895,
"grad_norm": 0.2605076730251312,
"learning_rate": 0.00037644478962035725,
"loss": 0.9213,
"step": 6410
},
{
"epoch": 0.07180364722264164,
"grad_norm": 0.25812190771102905,
"learning_rate": 0.00037621636438393716,
"loss": 0.9268,
"step": 6420
},
{
"epoch": 0.07191549090990432,
"grad_norm": 0.27478551864624023,
"learning_rate": 0.000375987939147517,
"loss": 0.9341,
"step": 6430
},
{
"epoch": 0.072027334597167,
"grad_norm": 0.2799810469150543,
"learning_rate": 0.0003757595139110969,
"loss": 0.9308,
"step": 6440
},
{
"epoch": 0.07213917828442969,
"grad_norm": 0.2494313269853592,
"learning_rate": 0.0003755310886746768,
"loss": 0.9389,
"step": 6450
},
{
"epoch": 0.07225102197169236,
"grad_norm": 0.3362772762775421,
"learning_rate": 0.00037530266343825664,
"loss": 0.9362,
"step": 6460
},
{
"epoch": 0.07236286565895504,
"grad_norm": 0.28501999378204346,
"learning_rate": 0.00037507423820183655,
"loss": 0.9262,
"step": 6470
},
{
"epoch": 0.07247470934621772,
"grad_norm": 0.24787545204162598,
"learning_rate": 0.0003748458129654164,
"loss": 0.9409,
"step": 6480
},
{
"epoch": 0.07258655303348041,
"grad_norm": 0.277665913105011,
"learning_rate": 0.0003746173877289963,
"loss": 0.9244,
"step": 6490
},
{
"epoch": 0.07269839672074309,
"grad_norm": 0.2613317370414734,
"learning_rate": 0.0003743889624925762,
"loss": 0.9429,
"step": 6500
},
{
"epoch": 0.07281024040800577,
"grad_norm": 0.2740306258201599,
"learning_rate": 0.0003741605372561561,
"loss": 0.9422,
"step": 6510
},
{
"epoch": 0.07292208409526846,
"grad_norm": 0.3052440881729126,
"learning_rate": 0.00037393211201973593,
"loss": 0.9346,
"step": 6520
},
{
"epoch": 0.07303392778253114,
"grad_norm": 0.27979132533073425,
"learning_rate": 0.00037370368678331584,
"loss": 0.9305,
"step": 6530
},
{
"epoch": 0.07314577146979381,
"grad_norm": 0.2834227979183197,
"learning_rate": 0.0003734752615468957,
"loss": 0.9305,
"step": 6540
},
{
"epoch": 0.07325761515705649,
"grad_norm": 0.28621387481689453,
"learning_rate": 0.00037324683631047555,
"loss": 0.9505,
"step": 6550
},
{
"epoch": 0.07336945884431918,
"grad_norm": 0.2539358139038086,
"learning_rate": 0.00037301841107405546,
"loss": 0.9491,
"step": 6560
},
{
"epoch": 0.07348130253158186,
"grad_norm": 0.29257437586784363,
"learning_rate": 0.0003727899858376354,
"loss": 0.9428,
"step": 6570
},
{
"epoch": 0.07359314621884454,
"grad_norm": 0.25158485770225525,
"learning_rate": 0.00037256156060121523,
"loss": 0.9471,
"step": 6580
},
{
"epoch": 0.07370498990610723,
"grad_norm": 0.26301345229148865,
"learning_rate": 0.00037233313536479514,
"loss": 0.928,
"step": 6590
},
{
"epoch": 0.07381683359336991,
"grad_norm": 0.2519192397594452,
"learning_rate": 0.000372104710128375,
"loss": 0.9189,
"step": 6600
},
{
"epoch": 0.07392867728063258,
"grad_norm": 0.29801836609840393,
"learning_rate": 0.00037187628489195485,
"loss": 0.9218,
"step": 6610
},
{
"epoch": 0.07404052096789528,
"grad_norm": 0.30779263377189636,
"learning_rate": 0.00037164785965553476,
"loss": 0.9263,
"step": 6620
},
{
"epoch": 0.07415236465515795,
"grad_norm": 0.2758638262748718,
"learning_rate": 0.0003714194344191146,
"loss": 0.904,
"step": 6630
},
{
"epoch": 0.07426420834242063,
"grad_norm": 0.26482871174812317,
"learning_rate": 0.00037119100918269447,
"loss": 0.9024,
"step": 6640
},
{
"epoch": 0.07437605202968331,
"grad_norm": 0.24001047015190125,
"learning_rate": 0.00037096258394627444,
"loss": 0.914,
"step": 6650
},
{
"epoch": 0.074487895716946,
"grad_norm": 0.2694549560546875,
"learning_rate": 0.0003707341587098543,
"loss": 0.921,
"step": 6660
},
{
"epoch": 0.07459973940420868,
"grad_norm": 0.25042393803596497,
"learning_rate": 0.00037050573347343415,
"loss": 0.9108,
"step": 6670
},
{
"epoch": 0.07471158309147136,
"grad_norm": 0.25945019721984863,
"learning_rate": 0.00037027730823701406,
"loss": 0.912,
"step": 6680
},
{
"epoch": 0.07482342677873405,
"grad_norm": 0.2624742090702057,
"learning_rate": 0.0003700488830005939,
"loss": 0.9108,
"step": 6690
},
{
"epoch": 0.07493527046599673,
"grad_norm": 0.27438145875930786,
"learning_rate": 0.00036982045776417377,
"loss": 0.9215,
"step": 6700
},
{
"epoch": 0.0750471141532594,
"grad_norm": 0.27610865235328674,
"learning_rate": 0.0003695920325277537,
"loss": 0.9053,
"step": 6710
},
{
"epoch": 0.07515895784052208,
"grad_norm": 0.2616426944732666,
"learning_rate": 0.00036936360729133353,
"loss": 0.9255,
"step": 6720
},
{
"epoch": 0.07527080152778477,
"grad_norm": 0.3146522641181946,
"learning_rate": 0.00036913518205491344,
"loss": 0.9105,
"step": 6730
},
{
"epoch": 0.07538264521504745,
"grad_norm": 0.29139819741249084,
"learning_rate": 0.00036890675681849335,
"loss": 0.9324,
"step": 6740
},
{
"epoch": 0.07549448890231013,
"grad_norm": 0.3176229000091553,
"learning_rate": 0.0003686783315820732,
"loss": 0.9434,
"step": 6750
},
{
"epoch": 0.07560633258957282,
"grad_norm": 0.2786601781845093,
"learning_rate": 0.00036844990634565307,
"loss": 0.9405,
"step": 6760
},
{
"epoch": 0.0757181762768355,
"grad_norm": 0.2988050580024719,
"learning_rate": 0.000368221481109233,
"loss": 0.9477,
"step": 6770
},
{
"epoch": 0.07583001996409817,
"grad_norm": 0.28120875358581543,
"learning_rate": 0.00036799305587281283,
"loss": 0.9521,
"step": 6780
},
{
"epoch": 0.07594186365136085,
"grad_norm": 0.27376359701156616,
"learning_rate": 0.0003677646306363927,
"loss": 0.9405,
"step": 6790
},
{
"epoch": 0.07605370733862354,
"grad_norm": 0.2721284329891205,
"learning_rate": 0.0003675362053999726,
"loss": 0.9392,
"step": 6800
},
{
"epoch": 0.07616555102588622,
"grad_norm": 0.31443721055984497,
"learning_rate": 0.0003673077801635525,
"loss": 0.939,
"step": 6810
},
{
"epoch": 0.0762773947131489,
"grad_norm": 0.27175766229629517,
"learning_rate": 0.00036707935492713236,
"loss": 0.9262,
"step": 6820
},
{
"epoch": 0.07638923840041159,
"grad_norm": 0.2984711527824402,
"learning_rate": 0.00036685092969071227,
"loss": 0.9381,
"step": 6830
},
{
"epoch": 0.07650108208767427,
"grad_norm": 0.2773591876029968,
"learning_rate": 0.00036662250445429213,
"loss": 0.9217,
"step": 6840
},
{
"epoch": 0.07661292577493695,
"grad_norm": 0.29338565468788147,
"learning_rate": 0.000366394079217872,
"loss": 0.9197,
"step": 6850
},
{
"epoch": 0.07672476946219962,
"grad_norm": 0.2456415593624115,
"learning_rate": 0.00036616565398145184,
"loss": 0.9191,
"step": 6860
},
{
"epoch": 0.07683661314946232,
"grad_norm": 0.324935644865036,
"learning_rate": 0.00036593722874503175,
"loss": 0.8975,
"step": 6870
},
{
"epoch": 0.076948456836725,
"grad_norm": 0.6967706680297852,
"learning_rate": 0.0003657088035086116,
"loss": 0.9053,
"step": 6880
},
{
"epoch": 0.07706030052398767,
"grad_norm": 0.8192552328109741,
"learning_rate": 0.0003654803782721915,
"loss": 0.9129,
"step": 6890
},
{
"epoch": 0.07717214421125036,
"grad_norm": 0.4698907136917114,
"learning_rate": 0.0003652519530357714,
"loss": 0.9128,
"step": 6900
},
{
"epoch": 0.07728398789851304,
"grad_norm": 0.3055092990398407,
"learning_rate": 0.0003650235277993513,
"loss": 0.9207,
"step": 6910
},
{
"epoch": 0.07739583158577572,
"grad_norm": 0.31879591941833496,
"learning_rate": 0.00036479510256293114,
"loss": 0.9101,
"step": 6920
},
{
"epoch": 0.0775076752730384,
"grad_norm": 0.2708083987236023,
"learning_rate": 0.00036456667732651105,
"loss": 0.9036,
"step": 6930
},
{
"epoch": 0.07761951896030109,
"grad_norm": 0.2801443040370941,
"learning_rate": 0.0003643382520900909,
"loss": 0.9031,
"step": 6940
},
{
"epoch": 0.07773136264756376,
"grad_norm": 0.2481400966644287,
"learning_rate": 0.00036410982685367076,
"loss": 0.8952,
"step": 6950
},
{
"epoch": 0.07784320633482644,
"grad_norm": 0.25424808263778687,
"learning_rate": 0.0003638814016172507,
"loss": 0.8846,
"step": 6960
},
{
"epoch": 0.07795505002208913,
"grad_norm": 0.2655096650123596,
"learning_rate": 0.0003636529763808306,
"loss": 0.8922,
"step": 6970
},
{
"epoch": 0.07806689370935181,
"grad_norm": 0.281180202960968,
"learning_rate": 0.00036342455114441043,
"loss": 0.8934,
"step": 6980
},
{
"epoch": 0.07817873739661449,
"grad_norm": 0.2850550413131714,
"learning_rate": 0.00036319612590799034,
"loss": 0.8856,
"step": 6990
},
{
"epoch": 0.07829058108387717,
"grad_norm": 0.24838604032993317,
"learning_rate": 0.0003629677006715702,
"loss": 0.905,
"step": 7000
},
{
"epoch": 0.07840242477113986,
"grad_norm": 0.2703045606613159,
"learning_rate": 0.00036273927543515005,
"loss": 0.8816,
"step": 7010
},
{
"epoch": 0.07851426845840254,
"grad_norm": 0.2786656320095062,
"learning_rate": 0.00036251085019872996,
"loss": 0.8997,
"step": 7020
},
{
"epoch": 0.07862611214566521,
"grad_norm": 0.2771463692188263,
"learning_rate": 0.0003622824249623098,
"loss": 0.9033,
"step": 7030
},
{
"epoch": 0.0787379558329279,
"grad_norm": 0.2721976339817047,
"learning_rate": 0.00036205399972588973,
"loss": 0.9109,
"step": 7040
},
{
"epoch": 0.07884979952019058,
"grad_norm": 0.2606031596660614,
"learning_rate": 0.00036182557448946964,
"loss": 0.9221,
"step": 7050
},
{
"epoch": 0.07896164320745326,
"grad_norm": 0.45895281434059143,
"learning_rate": 0.0003615971492530495,
"loss": 0.908,
"step": 7060
},
{
"epoch": 0.07907348689471594,
"grad_norm": 0.30524522066116333,
"learning_rate": 0.00036136872401662935,
"loss": 0.9234,
"step": 7070
},
{
"epoch": 0.07918533058197863,
"grad_norm": 0.2704319953918457,
"learning_rate": 0.00036114029878020926,
"loss": 0.9003,
"step": 7080
},
{
"epoch": 0.07929717426924131,
"grad_norm": 0.2770727872848511,
"learning_rate": 0.0003609118735437891,
"loss": 0.9253,
"step": 7090
},
{
"epoch": 0.07940901795650399,
"grad_norm": 0.25288262963294983,
"learning_rate": 0.00036068344830736897,
"loss": 0.9089,
"step": 7100
},
{
"epoch": 0.07952086164376668,
"grad_norm": 0.27105236053466797,
"learning_rate": 0.0003604550230709489,
"loss": 0.9138,
"step": 7110
},
{
"epoch": 0.07963270533102935,
"grad_norm": 0.2631518840789795,
"learning_rate": 0.0003602265978345288,
"loss": 0.9226,
"step": 7120
},
{
"epoch": 0.07974454901829203,
"grad_norm": 0.25269970297813416,
"learning_rate": 0.00035999817259810865,
"loss": 0.9102,
"step": 7130
},
{
"epoch": 0.07985639270555471,
"grad_norm": 0.2576468586921692,
"learning_rate": 0.00035976974736168856,
"loss": 0.9075,
"step": 7140
},
{
"epoch": 0.0799682363928174,
"grad_norm": 0.26297688484191895,
"learning_rate": 0.0003595413221252684,
"loss": 0.9004,
"step": 7150
},
{
"epoch": 0.08008008008008008,
"grad_norm": 0.3029099702835083,
"learning_rate": 0.00035931289688884827,
"loss": 0.9165,
"step": 7160
},
{
"epoch": 0.08019192376734276,
"grad_norm": 0.2699684798717499,
"learning_rate": 0.0003590844716524282,
"loss": 0.9232,
"step": 7170
},
{
"epoch": 0.08030376745460545,
"grad_norm": 0.26480093598365784,
"learning_rate": 0.00035885604641600804,
"loss": 0.9319,
"step": 7180
},
{
"epoch": 0.08041561114186813,
"grad_norm": 0.27503007650375366,
"learning_rate": 0.0003586276211795879,
"loss": 0.9398,
"step": 7190
},
{
"epoch": 0.0805274548291308,
"grad_norm": 0.2715147137641907,
"learning_rate": 0.00035839919594316785,
"loss": 0.9307,
"step": 7200
},
{
"epoch": 0.08063929851639348,
"grad_norm": 0.2697315812110901,
"learning_rate": 0.0003581707707067477,
"loss": 0.9342,
"step": 7210
},
{
"epoch": 0.08075114220365617,
"grad_norm": 0.2833189070224762,
"learning_rate": 0.00035794234547032757,
"loss": 0.9604,
"step": 7220
},
{
"epoch": 0.08086298589091885,
"grad_norm": 0.3069300353527069,
"learning_rate": 0.0003577139202339075,
"loss": 0.9397,
"step": 7230
},
{
"epoch": 0.08097482957818153,
"grad_norm": 0.28459593653678894,
"learning_rate": 0.00035748549499748733,
"loss": 0.925,
"step": 7240
},
{
"epoch": 0.08108667326544422,
"grad_norm": 0.28896769881248474,
"learning_rate": 0.0003572570697610672,
"loss": 0.9245,
"step": 7250
},
{
"epoch": 0.0811985169527069,
"grad_norm": 0.2574586272239685,
"learning_rate": 0.0003570286445246471,
"loss": 0.9326,
"step": 7260
},
{
"epoch": 0.08131036063996958,
"grad_norm": 0.2965002954006195,
"learning_rate": 0.00035680021928822695,
"loss": 0.9221,
"step": 7270
},
{
"epoch": 0.08142220432723227,
"grad_norm": 0.2657724618911743,
"learning_rate": 0.00035657179405180686,
"loss": 0.9143,
"step": 7280
},
{
"epoch": 0.08153404801449494,
"grad_norm": 0.2973329424858093,
"learning_rate": 0.0003563433688153867,
"loss": 0.9164,
"step": 7290
},
{
"epoch": 0.08164589170175762,
"grad_norm": 0.3032989501953125,
"learning_rate": 0.00035611494357896663,
"loss": 0.9254,
"step": 7300
},
{
"epoch": 0.0817577353890203,
"grad_norm": 0.28107839822769165,
"learning_rate": 0.0003558865183425465,
"loss": 0.9155,
"step": 7310
},
{
"epoch": 0.08186957907628299,
"grad_norm": 0.30296218395233154,
"learning_rate": 0.00035565809310612634,
"loss": 0.9218,
"step": 7320
},
{
"epoch": 0.08198142276354567,
"grad_norm": 0.28191155195236206,
"learning_rate": 0.00035542966786970625,
"loss": 0.9197,
"step": 7330
},
{
"epoch": 0.08209326645080835,
"grad_norm": 0.3113023638725281,
"learning_rate": 0.0003552012426332861,
"loss": 0.9228,
"step": 7340
},
{
"epoch": 0.08220511013807104,
"grad_norm": 0.3066212832927704,
"learning_rate": 0.00035497281739686596,
"loss": 0.9191,
"step": 7350
},
{
"epoch": 0.08231695382533372,
"grad_norm": 0.2658233940601349,
"learning_rate": 0.0003547443921604459,
"loss": 0.918,
"step": 7360
},
{
"epoch": 0.0824287975125964,
"grad_norm": 0.28222033381462097,
"learning_rate": 0.0003545159669240258,
"loss": 0.9253,
"step": 7370
},
{
"epoch": 0.08254064119985907,
"grad_norm": 0.2917843461036682,
"learning_rate": 0.00035428754168760564,
"loss": 0.9059,
"step": 7380
},
{
"epoch": 0.08265248488712176,
"grad_norm": 0.290404349565506,
"learning_rate": 0.00035405911645118555,
"loss": 0.9044,
"step": 7390
},
{
"epoch": 0.08276432857438444,
"grad_norm": 0.28990834951400757,
"learning_rate": 0.0003538306912147654,
"loss": 0.9078,
"step": 7400
},
{
"epoch": 0.08287617226164712,
"grad_norm": 0.27296292781829834,
"learning_rate": 0.00035360226597834526,
"loss": 0.9081,
"step": 7410
},
{
"epoch": 0.08298801594890981,
"grad_norm": 0.25443321466445923,
"learning_rate": 0.00035337384074192517,
"loss": 0.9019,
"step": 7420
},
{
"epoch": 0.08309985963617249,
"grad_norm": 0.25014832615852356,
"learning_rate": 0.0003531454155055051,
"loss": 0.8976,
"step": 7430
},
{
"epoch": 0.08321170332343517,
"grad_norm": 0.2844237983226776,
"learning_rate": 0.00035291699026908493,
"loss": 0.9039,
"step": 7440
},
{
"epoch": 0.08332354701069784,
"grad_norm": 0.26745542883872986,
"learning_rate": 0.00035268856503266484,
"loss": 0.8813,
"step": 7450
},
{
"epoch": 0.08343539069796053,
"grad_norm": 0.30750566720962524,
"learning_rate": 0.0003524601397962447,
"loss": 0.8988,
"step": 7460
},
{
"epoch": 0.08354723438522321,
"grad_norm": 0.2960536777973175,
"learning_rate": 0.00035223171455982456,
"loss": 0.8966,
"step": 7470
},
{
"epoch": 0.08365907807248589,
"grad_norm": 0.28923213481903076,
"learning_rate": 0.00035200328932340447,
"loss": 0.8872,
"step": 7480
},
{
"epoch": 0.08377092175974858,
"grad_norm": 0.2762465476989746,
"learning_rate": 0.0003517748640869843,
"loss": 0.8655,
"step": 7490
},
{
"epoch": 0.08388276544701126,
"grad_norm": 0.2870965301990509,
"learning_rate": 0.0003515464388505642,
"loss": 0.889,
"step": 7500
},
{
"epoch": 0.08399460913427394,
"grad_norm": 0.3135611116886139,
"learning_rate": 0.00035131801361414414,
"loss": 0.8898,
"step": 7510
},
{
"epoch": 0.08410645282153661,
"grad_norm": 0.29541128873825073,
"learning_rate": 0.000351089588377724,
"loss": 0.8884,
"step": 7520
},
{
"epoch": 0.0842182965087993,
"grad_norm": 0.2667001485824585,
"learning_rate": 0.00035086116314130385,
"loss": 0.8923,
"step": 7530
},
{
"epoch": 0.08433014019606198,
"grad_norm": 0.28677645325660706,
"learning_rate": 0.00035063273790488376,
"loss": 0.8862,
"step": 7540
},
{
"epoch": 0.08444198388332466,
"grad_norm": 0.26973757147789,
"learning_rate": 0.0003504043126684636,
"loss": 0.8739,
"step": 7550
},
{
"epoch": 0.08455382757058735,
"grad_norm": 0.2670735716819763,
"learning_rate": 0.0003501758874320435,
"loss": 0.8843,
"step": 7560
},
{
"epoch": 0.08466567125785003,
"grad_norm": 0.2678844928741455,
"learning_rate": 0.0003499474621956234,
"loss": 0.8855,
"step": 7570
},
{
"epoch": 0.08477751494511271,
"grad_norm": 0.26894411444664,
"learning_rate": 0.00034971903695920324,
"loss": 0.8828,
"step": 7580
},
{
"epoch": 0.08488935863237539,
"grad_norm": 0.28703927993774414,
"learning_rate": 0.00034949061172278315,
"loss": 0.885,
"step": 7590
},
{
"epoch": 0.08500120231963808,
"grad_norm": 0.2618086636066437,
"learning_rate": 0.00034926218648636306,
"loss": 0.8777,
"step": 7600
},
{
"epoch": 0.08511304600690076,
"grad_norm": 0.28816747665405273,
"learning_rate": 0.0003490337612499429,
"loss": 0.8836,
"step": 7610
},
{
"epoch": 0.08522488969416343,
"grad_norm": 0.29172763228416443,
"learning_rate": 0.00034880533601352277,
"loss": 0.8835,
"step": 7620
},
{
"epoch": 0.08533673338142612,
"grad_norm": 0.2613106667995453,
"learning_rate": 0.0003485769107771027,
"loss": 0.8736,
"step": 7630
},
{
"epoch": 0.0854485770686888,
"grad_norm": 0.2737283408641815,
"learning_rate": 0.00034834848554068254,
"loss": 0.8589,
"step": 7640
},
{
"epoch": 0.08556042075595148,
"grad_norm": 0.2709786295890808,
"learning_rate": 0.0003481200603042624,
"loss": 0.8675,
"step": 7650
},
{
"epoch": 0.08567226444321416,
"grad_norm": 0.2982759177684784,
"learning_rate": 0.0003478916350678423,
"loss": 0.8827,
"step": 7660
},
{
"epoch": 0.08578410813047685,
"grad_norm": 0.21551093459129333,
"learning_rate": 0.0003476632098314222,
"loss": 0.8663,
"step": 7670
},
{
"epoch": 0.08589595181773953,
"grad_norm": 0.26418018341064453,
"learning_rate": 0.00034743478459500207,
"loss": 0.8845,
"step": 7680
},
{
"epoch": 0.0860077955050022,
"grad_norm": 0.2310175597667694,
"learning_rate": 0.000347206359358582,
"loss": 0.8874,
"step": 7690
},
{
"epoch": 0.0861196391922649,
"grad_norm": 0.25112512707710266,
"learning_rate": 0.00034697793412216183,
"loss": 0.8896,
"step": 7700
},
{
"epoch": 0.08623148287952757,
"grad_norm": 0.33391082286834717,
"learning_rate": 0.0003467495088857417,
"loss": 0.8765,
"step": 7710
},
{
"epoch": 0.08634332656679025,
"grad_norm": 0.24641484022140503,
"learning_rate": 0.0003465210836493216,
"loss": 0.8572,
"step": 7720
},
{
"epoch": 0.08645517025405293,
"grad_norm": 0.26017534732818604,
"learning_rate": 0.00034629265841290145,
"loss": 0.8585,
"step": 7730
},
{
"epoch": 0.08656701394131562,
"grad_norm": 0.23500847816467285,
"learning_rate": 0.0003460642331764813,
"loss": 0.8797,
"step": 7740
},
{
"epoch": 0.0866788576285783,
"grad_norm": 0.25485488772392273,
"learning_rate": 0.0003458358079400612,
"loss": 0.8796,
"step": 7750
},
{
"epoch": 0.08679070131584098,
"grad_norm": 0.27644404768943787,
"learning_rate": 0.00034560738270364113,
"loss": 0.8708,
"step": 7760
},
{
"epoch": 0.08690254500310367,
"grad_norm": 0.233077734708786,
"learning_rate": 0.000345378957467221,
"loss": 0.8652,
"step": 7770
},
{
"epoch": 0.08701438869036635,
"grad_norm": 0.24039144814014435,
"learning_rate": 0.00034515053223080084,
"loss": 0.8723,
"step": 7780
},
{
"epoch": 0.08712623237762902,
"grad_norm": 0.23007874190807343,
"learning_rate": 0.00034492210699438075,
"loss": 0.8644,
"step": 7790
},
{
"epoch": 0.0872380760648917,
"grad_norm": 0.27570798993110657,
"learning_rate": 0.0003446936817579606,
"loss": 0.872,
"step": 7800
},
{
"epoch": 0.08734991975215439,
"grad_norm": 0.24157382547855377,
"learning_rate": 0.00034446525652154046,
"loss": 0.8846,
"step": 7810
},
{
"epoch": 0.08746176343941707,
"grad_norm": 0.2703733742237091,
"learning_rate": 0.0003442368312851204,
"loss": 0.889,
"step": 7820
},
{
"epoch": 0.08757360712667975,
"grad_norm": 0.26786255836486816,
"learning_rate": 0.0003440084060487003,
"loss": 0.8933,
"step": 7830
},
{
"epoch": 0.08768545081394244,
"grad_norm": 0.2595812976360321,
"learning_rate": 0.00034377998081228014,
"loss": 0.9156,
"step": 7840
},
{
"epoch": 0.08779729450120512,
"grad_norm": 0.24396800994873047,
"learning_rate": 0.00034355155557586005,
"loss": 0.8849,
"step": 7850
},
{
"epoch": 0.0879091381884678,
"grad_norm": 0.24363452196121216,
"learning_rate": 0.0003433231303394399,
"loss": 0.9011,
"step": 7860
},
{
"epoch": 0.08802098187573047,
"grad_norm": 0.2666647434234619,
"learning_rate": 0.00034309470510301976,
"loss": 0.8952,
"step": 7870
},
{
"epoch": 0.08813282556299316,
"grad_norm": 0.267863005399704,
"learning_rate": 0.00034286627986659967,
"loss": 0.9113,
"step": 7880
},
{
"epoch": 0.08824466925025584,
"grad_norm": 0.24397262930870056,
"learning_rate": 0.0003426378546301795,
"loss": 0.8762,
"step": 7890
},
{
"epoch": 0.08835651293751852,
"grad_norm": 0.23912496864795685,
"learning_rate": 0.00034240942939375943,
"loss": 0.8865,
"step": 7900
},
{
"epoch": 0.08846835662478121,
"grad_norm": 0.2737523913383484,
"learning_rate": 0.00034218100415733934,
"loss": 0.8732,
"step": 7910
},
{
"epoch": 0.08858020031204389,
"grad_norm": 0.24978673458099365,
"learning_rate": 0.0003419525789209192,
"loss": 0.8832,
"step": 7920
},
{
"epoch": 0.08869204399930657,
"grad_norm": 0.25200751423835754,
"learning_rate": 0.00034172415368449906,
"loss": 0.8952,
"step": 7930
},
{
"epoch": 0.08880388768656924,
"grad_norm": 0.7863819003105164,
"learning_rate": 0.00034149572844807897,
"loss": 0.8708,
"step": 7940
},
{
"epoch": 0.08891573137383194,
"grad_norm": 0.2560253441333771,
"learning_rate": 0.0003412673032116588,
"loss": 0.8681,
"step": 7950
},
{
"epoch": 0.08902757506109461,
"grad_norm": 0.2669181823730469,
"learning_rate": 0.0003410388779752387,
"loss": 0.9007,
"step": 7960
},
{
"epoch": 0.08913941874835729,
"grad_norm": 0.27906209230422974,
"learning_rate": 0.0003408104527388186,
"loss": 0.8988,
"step": 7970
},
{
"epoch": 0.08925126243561998,
"grad_norm": 0.2506297826766968,
"learning_rate": 0.0003405820275023985,
"loss": 0.8997,
"step": 7980
},
{
"epoch": 0.08936310612288266,
"grad_norm": 0.2513269782066345,
"learning_rate": 0.00034035360226597835,
"loss": 0.9215,
"step": 7990
},
{
"epoch": 0.08947494981014534,
"grad_norm": 0.2672421634197235,
"learning_rate": 0.00034012517702955826,
"loss": 0.9112,
"step": 8000
},
{
"epoch": 0.08958679349740803,
"grad_norm": 0.2553747296333313,
"learning_rate": 0.0003398967517931381,
"loss": 0.9255,
"step": 8010
},
{
"epoch": 0.08969863718467071,
"grad_norm": 0.2325398176908493,
"learning_rate": 0.000339668326556718,
"loss": 0.9173,
"step": 8020
},
{
"epoch": 0.08981048087193338,
"grad_norm": 0.23461295664310455,
"learning_rate": 0.0003394399013202979,
"loss": 0.9183,
"step": 8030
},
{
"epoch": 0.08992232455919606,
"grad_norm": 0.26092031598091125,
"learning_rate": 0.00033921147608387774,
"loss": 0.9106,
"step": 8040
},
{
"epoch": 0.09003416824645875,
"grad_norm": 0.26250872015953064,
"learning_rate": 0.0003389830508474576,
"loss": 0.8893,
"step": 8050
},
{
"epoch": 0.09014601193372143,
"grad_norm": 0.2501981556415558,
"learning_rate": 0.00033875462561103756,
"loss": 0.8934,
"step": 8060
},
{
"epoch": 0.09025785562098411,
"grad_norm": 0.26185476779937744,
"learning_rate": 0.0003385262003746174,
"loss": 0.8855,
"step": 8070
},
{
"epoch": 0.0903696993082468,
"grad_norm": 0.26889827847480774,
"learning_rate": 0.00033829777513819727,
"loss": 0.8944,
"step": 8080
},
{
"epoch": 0.09048154299550948,
"grad_norm": 0.2473451793193817,
"learning_rate": 0.0003380693499017772,
"loss": 0.8937,
"step": 8090
},
{
"epoch": 0.09059338668277216,
"grad_norm": 0.24157559871673584,
"learning_rate": 0.00033784092466535704,
"loss": 0.8903,
"step": 8100
},
{
"epoch": 0.09070523037003483,
"grad_norm": 0.2701563239097595,
"learning_rate": 0.0003376124994289369,
"loss": 0.9109,
"step": 8110
},
{
"epoch": 0.09081707405729753,
"grad_norm": 0.28706929087638855,
"learning_rate": 0.0003373840741925168,
"loss": 0.8956,
"step": 8120
},
{
"epoch": 0.0909289177445602,
"grad_norm": 0.27120909094810486,
"learning_rate": 0.00033715564895609666,
"loss": 0.8947,
"step": 8130
},
{
"epoch": 0.09104076143182288,
"grad_norm": 0.2504216432571411,
"learning_rate": 0.00033692722371967657,
"loss": 0.8814,
"step": 8140
},
{
"epoch": 0.09115260511908557,
"grad_norm": 0.2921849489212036,
"learning_rate": 0.0003366987984832565,
"loss": 0.8856,
"step": 8150
},
{
"epoch": 0.09126444880634825,
"grad_norm": 0.2587922513484955,
"learning_rate": 0.00033647037324683633,
"loss": 0.8778,
"step": 8160
},
{
"epoch": 0.09137629249361093,
"grad_norm": 0.2399989813566208,
"learning_rate": 0.0003362419480104162,
"loss": 0.883,
"step": 8170
},
{
"epoch": 0.0914881361808736,
"grad_norm": 0.24794407188892365,
"learning_rate": 0.0003360135227739961,
"loss": 0.8935,
"step": 8180
},
{
"epoch": 0.0915999798681363,
"grad_norm": 0.26669082045555115,
"learning_rate": 0.00033578509753757595,
"loss": 0.863,
"step": 8190
},
{
"epoch": 0.09171182355539897,
"grad_norm": 0.25162795186042786,
"learning_rate": 0.0003355566723011558,
"loss": 0.8887,
"step": 8200
},
{
"epoch": 0.09182366724266165,
"grad_norm": 0.28969621658325195,
"learning_rate": 0.00033532824706473567,
"loss": 0.9066,
"step": 8210
},
{
"epoch": 0.09193551092992434,
"grad_norm": 0.25944870710372925,
"learning_rate": 0.00033509982182831563,
"loss": 0.8875,
"step": 8220
},
{
"epoch": 0.09204735461718702,
"grad_norm": 0.27627986669540405,
"learning_rate": 0.0003348713965918955,
"loss": 0.8895,
"step": 8230
},
{
"epoch": 0.0921591983044497,
"grad_norm": 0.2673914134502411,
"learning_rate": 0.00033464297135547534,
"loss": 0.8937,
"step": 8240
},
{
"epoch": 0.09227104199171238,
"grad_norm": 0.2810732126235962,
"learning_rate": 0.00033441454611905525,
"loss": 0.9007,
"step": 8250
},
{
"epoch": 0.09238288567897507,
"grad_norm": 0.2671091556549072,
"learning_rate": 0.0003341861208826351,
"loss": 0.905,
"step": 8260
},
{
"epoch": 0.09249472936623775,
"grad_norm": 0.25006943941116333,
"learning_rate": 0.00033395769564621496,
"loss": 0.8981,
"step": 8270
},
{
"epoch": 0.09260657305350042,
"grad_norm": 0.2891542613506317,
"learning_rate": 0.0003337292704097949,
"loss": 0.8978,
"step": 8280
},
{
"epoch": 0.09271841674076312,
"grad_norm": 0.29497236013412476,
"learning_rate": 0.0003335008451733748,
"loss": 0.9044,
"step": 8290
},
{
"epoch": 0.0928302604280258,
"grad_norm": 0.29290974140167236,
"learning_rate": 0.00033327241993695464,
"loss": 0.9081,
"step": 8300
},
{
"epoch": 0.09294210411528847,
"grad_norm": 0.27077415585517883,
"learning_rate": 0.00033304399470053455,
"loss": 0.9184,
"step": 8310
},
{
"epoch": 0.09305394780255115,
"grad_norm": 0.26410186290740967,
"learning_rate": 0.0003328155694641144,
"loss": 0.8912,
"step": 8320
},
{
"epoch": 0.09316579148981384,
"grad_norm": 0.2818413972854614,
"learning_rate": 0.00033258714422769426,
"loss": 0.9096,
"step": 8330
},
{
"epoch": 0.09327763517707652,
"grad_norm": 0.265286386013031,
"learning_rate": 0.00033235871899127417,
"loss": 0.9192,
"step": 8340
},
{
"epoch": 0.0933894788643392,
"grad_norm": 0.2714836597442627,
"learning_rate": 0.000332130293754854,
"loss": 0.9122,
"step": 8350
},
{
"epoch": 0.09350132255160189,
"grad_norm": 0.2858263850212097,
"learning_rate": 0.0003319018685184339,
"loss": 0.9143,
"step": 8360
},
{
"epoch": 0.09361316623886456,
"grad_norm": 0.27788257598876953,
"learning_rate": 0.00033167344328201385,
"loss": 0.9116,
"step": 8370
},
{
"epoch": 0.09372500992612724,
"grad_norm": 0.27748674154281616,
"learning_rate": 0.0003314450180455937,
"loss": 0.8934,
"step": 8380
},
{
"epoch": 0.09383685361338992,
"grad_norm": 0.4757048785686493,
"learning_rate": 0.00033121659280917356,
"loss": 0.9097,
"step": 8390
},
{
"epoch": 0.09394869730065261,
"grad_norm": 0.3016970157623291,
"learning_rate": 0.00033098816757275347,
"loss": 0.8973,
"step": 8400
},
{
"epoch": 0.09406054098791529,
"grad_norm": 0.2640211880207062,
"learning_rate": 0.0003307597423363333,
"loss": 0.8914,
"step": 8410
},
{
"epoch": 0.09417238467517797,
"grad_norm": 0.2608022391796112,
"learning_rate": 0.0003305313170999132,
"loss": 0.9138,
"step": 8420
},
{
"epoch": 0.09428422836244066,
"grad_norm": 0.23691967129707336,
"learning_rate": 0.0003303028918634931,
"loss": 0.9149,
"step": 8430
},
{
"epoch": 0.09439607204970334,
"grad_norm": 0.28734761476516724,
"learning_rate": 0.00033007446662707294,
"loss": 0.9056,
"step": 8440
},
{
"epoch": 0.09450791573696601,
"grad_norm": 0.2846873700618744,
"learning_rate": 0.00032984604139065285,
"loss": 0.9052,
"step": 8450
},
{
"epoch": 0.09461975942422869,
"grad_norm": 0.2613682448863983,
"learning_rate": 0.00032961761615423276,
"loss": 0.9129,
"step": 8460
},
{
"epoch": 0.09473160311149138,
"grad_norm": 0.25336501002311707,
"learning_rate": 0.0003293891909178126,
"loss": 0.9048,
"step": 8470
},
{
"epoch": 0.09484344679875406,
"grad_norm": 0.2662324905395508,
"learning_rate": 0.0003291607656813925,
"loss": 0.9181,
"step": 8480
},
{
"epoch": 0.09495529048601674,
"grad_norm": 0.2482605278491974,
"learning_rate": 0.0003289323404449724,
"loss": 0.8978,
"step": 8490
},
{
"epoch": 0.09506713417327943,
"grad_norm": 0.24181032180786133,
"learning_rate": 0.00032870391520855224,
"loss": 0.9121,
"step": 8500
},
{
"epoch": 0.09517897786054211,
"grad_norm": 0.276621013879776,
"learning_rate": 0.0003284754899721321,
"loss": 0.9106,
"step": 8510
},
{
"epoch": 0.09529082154780479,
"grad_norm": 0.2788410186767578,
"learning_rate": 0.000328247064735712,
"loss": 0.9062,
"step": 8520
},
{
"epoch": 0.09540266523506746,
"grad_norm": 0.28387385606765747,
"learning_rate": 0.0003280186394992919,
"loss": 0.9309,
"step": 8530
},
{
"epoch": 0.09551450892233015,
"grad_norm": 0.2923261523246765,
"learning_rate": 0.00032779021426287177,
"loss": 0.9278,
"step": 8540
},
{
"epoch": 0.09562635260959283,
"grad_norm": 0.3008005917072296,
"learning_rate": 0.0003275617890264517,
"loss": 0.9196,
"step": 8550
},
{
"epoch": 0.09573819629685551,
"grad_norm": 0.2849402129650116,
"learning_rate": 0.00032733336379003154,
"loss": 0.9243,
"step": 8560
},
{
"epoch": 0.0958500399841182,
"grad_norm": 0.262134313583374,
"learning_rate": 0.0003271049385536114,
"loss": 0.9346,
"step": 8570
},
{
"epoch": 0.09596188367138088,
"grad_norm": 0.2891925573348999,
"learning_rate": 0.0003268765133171913,
"loss": 0.9176,
"step": 8580
},
{
"epoch": 0.09607372735864356,
"grad_norm": 0.26165837049484253,
"learning_rate": 0.00032664808808077116,
"loss": 0.9229,
"step": 8590
},
{
"epoch": 0.09618557104590623,
"grad_norm": 0.2683985233306885,
"learning_rate": 0.000326419662844351,
"loss": 0.9067,
"step": 8600
},
{
"epoch": 0.09629741473316893,
"grad_norm": 0.25300973653793335,
"learning_rate": 0.000326191237607931,
"loss": 0.9037,
"step": 8610
},
{
"epoch": 0.0964092584204316,
"grad_norm": 0.30520153045654297,
"learning_rate": 0.00032596281237151083,
"loss": 0.9038,
"step": 8620
},
{
"epoch": 0.09652110210769428,
"grad_norm": 0.2573854327201843,
"learning_rate": 0.0003257343871350907,
"loss": 0.9062,
"step": 8630
},
{
"epoch": 0.09663294579495697,
"grad_norm": 0.2664088308811188,
"learning_rate": 0.0003255059618986706,
"loss": 0.8864,
"step": 8640
},
{
"epoch": 0.09674478948221965,
"grad_norm": 0.26375049352645874,
"learning_rate": 0.00032527753666225046,
"loss": 0.8804,
"step": 8650
},
{
"epoch": 0.09685663316948233,
"grad_norm": 0.25367647409439087,
"learning_rate": 0.0003250491114258303,
"loss": 0.8987,
"step": 8660
},
{
"epoch": 0.09696847685674502,
"grad_norm": 0.2764420807361603,
"learning_rate": 0.00032482068618941017,
"loss": 0.9078,
"step": 8670
},
{
"epoch": 0.0970803205440077,
"grad_norm": 0.2663860023021698,
"learning_rate": 0.0003245922609529901,
"loss": 0.8838,
"step": 8680
},
{
"epoch": 0.09719216423127038,
"grad_norm": 0.25380998849868774,
"learning_rate": 0.00032436383571657,
"loss": 0.8949,
"step": 8690
},
{
"epoch": 0.09730400791853305,
"grad_norm": 0.29428210854530334,
"learning_rate": 0.00032413541048014984,
"loss": 0.883,
"step": 8700
},
{
"epoch": 0.09741585160579574,
"grad_norm": 0.25604331493377686,
"learning_rate": 0.00032390698524372975,
"loss": 0.8891,
"step": 8710
},
{
"epoch": 0.09752769529305842,
"grad_norm": 0.26663005352020264,
"learning_rate": 0.0003236785600073096,
"loss": 0.8763,
"step": 8720
},
{
"epoch": 0.0976395389803211,
"grad_norm": 0.27305158972740173,
"learning_rate": 0.00032345013477088946,
"loss": 0.8877,
"step": 8730
},
{
"epoch": 0.09775138266758379,
"grad_norm": 0.27395525574684143,
"learning_rate": 0.0003232217095344694,
"loss": 0.871,
"step": 8740
},
{
"epoch": 0.09786322635484647,
"grad_norm": 0.26152902841567993,
"learning_rate": 0.00032299328429804923,
"loss": 0.8714,
"step": 8750
},
{
"epoch": 0.09797507004210915,
"grad_norm": 0.2872631847858429,
"learning_rate": 0.0003227648590616291,
"loss": 0.8754,
"step": 8760
},
{
"epoch": 0.09808691372937182,
"grad_norm": 0.2681150436401367,
"learning_rate": 0.00032253643382520905,
"loss": 0.8699,
"step": 8770
},
{
"epoch": 0.09819875741663452,
"grad_norm": 0.27205002307891846,
"learning_rate": 0.0003223080085887889,
"loss": 0.8743,
"step": 8780
},
{
"epoch": 0.0983106011038972,
"grad_norm": 0.27747979760169983,
"learning_rate": 0.00032207958335236876,
"loss": 0.8607,
"step": 8790
},
{
"epoch": 0.09842244479115987,
"grad_norm": 0.2963927984237671,
"learning_rate": 0.00032185115811594867,
"loss": 0.8676,
"step": 8800
},
{
"epoch": 0.09853428847842256,
"grad_norm": 0.26414602994918823,
"learning_rate": 0.0003216227328795285,
"loss": 0.8556,
"step": 8810
},
{
"epoch": 0.09864613216568524,
"grad_norm": 0.3005480170249939,
"learning_rate": 0.0003213943076431084,
"loss": 0.8816,
"step": 8820
},
{
"epoch": 0.09875797585294792,
"grad_norm": 0.29625314474105835,
"learning_rate": 0.0003211658824066883,
"loss": 0.8747,
"step": 8830
},
{
"epoch": 0.0988698195402106,
"grad_norm": 0.2900589108467102,
"learning_rate": 0.0003209374571702682,
"loss": 0.8697,
"step": 8840
},
{
"epoch": 0.09898166322747329,
"grad_norm": 0.2951551675796509,
"learning_rate": 0.00032070903193384806,
"loss": 0.8756,
"step": 8850
},
{
"epoch": 0.09909350691473597,
"grad_norm": 0.3049459159374237,
"learning_rate": 0.00032048060669742797,
"loss": 0.8767,
"step": 8860
},
{
"epoch": 0.09920535060199864,
"grad_norm": 0.30216872692108154,
"learning_rate": 0.0003202521814610078,
"loss": 0.8687,
"step": 8870
},
{
"epoch": 0.09931719428926133,
"grad_norm": 0.2913934290409088,
"learning_rate": 0.0003200237562245877,
"loss": 0.8616,
"step": 8880
},
{
"epoch": 0.09942903797652401,
"grad_norm": 0.26879578828811646,
"learning_rate": 0.0003197953309881676,
"loss": 0.8681,
"step": 8890
},
{
"epoch": 0.09954088166378669,
"grad_norm": 0.28092971444129944,
"learning_rate": 0.00031956690575174744,
"loss": 0.8765,
"step": 8900
},
{
"epoch": 0.09965272535104937,
"grad_norm": 0.3074035048484802,
"learning_rate": 0.0003193384805153273,
"loss": 0.881,
"step": 8910
},
{
"epoch": 0.09976456903831206,
"grad_norm": 0.2945140600204468,
"learning_rate": 0.00031911005527890726,
"loss": 0.8913,
"step": 8920
},
{
"epoch": 0.09987641272557474,
"grad_norm": 0.2707176208496094,
"learning_rate": 0.0003188816300424871,
"loss": 0.8822,
"step": 8930
},
{
"epoch": 0.09998825641283741,
"grad_norm": 0.2639947235584259,
"learning_rate": 0.000318653204806067,
"loss": 0.8892,
"step": 8940
},
{
"epoch": 0.1001001001001001,
"grad_norm": 0.2709505558013916,
"learning_rate": 0.0003184247795696469,
"loss": 0.8654,
"step": 8950
},
{
"epoch": 0.10021194378736278,
"grad_norm": 0.27803289890289307,
"learning_rate": 0.00031819635433322674,
"loss": 0.8887,
"step": 8960
},
{
"epoch": 0.10032378747462546,
"grad_norm": 0.25851163268089294,
"learning_rate": 0.0003179679290968066,
"loss": 0.8662,
"step": 8970
},
{
"epoch": 0.10043563116188814,
"grad_norm": 0.261068731546402,
"learning_rate": 0.0003177395038603865,
"loss": 0.8641,
"step": 8980
},
{
"epoch": 0.10054747484915083,
"grad_norm": 0.25510483980178833,
"learning_rate": 0.00031751107862396636,
"loss": 0.8762,
"step": 8990
},
{
"epoch": 0.10065931853641351,
"grad_norm": 0.25765854120254517,
"learning_rate": 0.00031728265338754627,
"loss": 0.8837,
"step": 9000
},
{
"epoch": 0.10077116222367619,
"grad_norm": 0.24198535084724426,
"learning_rate": 0.0003170542281511262,
"loss": 0.8791,
"step": 9010
},
{
"epoch": 0.10088300591093888,
"grad_norm": 0.2673517167568207,
"learning_rate": 0.00031682580291470604,
"loss": 0.8795,
"step": 9020
},
{
"epoch": 0.10099484959820156,
"grad_norm": 0.26392221450805664,
"learning_rate": 0.0003165973776782859,
"loss": 0.8788,
"step": 9030
},
{
"epoch": 0.10110669328546423,
"grad_norm": 0.2698739171028137,
"learning_rate": 0.0003163689524418658,
"loss": 0.8959,
"step": 9040
},
{
"epoch": 0.10121853697272691,
"grad_norm": 0.2800233066082001,
"learning_rate": 0.00031614052720544566,
"loss": 0.8945,
"step": 9050
},
{
"epoch": 0.1013303806599896,
"grad_norm": 0.29603493213653564,
"learning_rate": 0.0003159121019690255,
"loss": 0.892,
"step": 9060
},
{
"epoch": 0.10144222434725228,
"grad_norm": 0.26462167501449585,
"learning_rate": 0.0003156836767326054,
"loss": 0.8849,
"step": 9070
},
{
"epoch": 0.10155406803451496,
"grad_norm": 0.27941739559173584,
"learning_rate": 0.00031545525149618534,
"loss": 0.8782,
"step": 9080
},
{
"epoch": 0.10166591172177765,
"grad_norm": 0.2777186334133148,
"learning_rate": 0.0003152268262597652,
"loss": 0.8787,
"step": 9090
},
{
"epoch": 0.10177775540904033,
"grad_norm": 0.25893428921699524,
"learning_rate": 0.00031499840102334505,
"loss": 0.8629,
"step": 9100
},
{
"epoch": 0.101889599096303,
"grad_norm": 0.27407601475715637,
"learning_rate": 0.00031476997578692496,
"loss": 0.8619,
"step": 9110
},
{
"epoch": 0.10200144278356568,
"grad_norm": 0.2663459777832031,
"learning_rate": 0.0003145415505505048,
"loss": 0.8474,
"step": 9120
},
{
"epoch": 0.10211328647082837,
"grad_norm": 0.2621177136898041,
"learning_rate": 0.00031431312531408467,
"loss": 0.8565,
"step": 9130
},
{
"epoch": 0.10222513015809105,
"grad_norm": 0.26687386631965637,
"learning_rate": 0.0003140847000776646,
"loss": 0.8438,
"step": 9140
},
{
"epoch": 0.10233697384535373,
"grad_norm": 0.24772432446479797,
"learning_rate": 0.00031385627484124443,
"loss": 0.8511,
"step": 9150
},
{
"epoch": 0.10244881753261642,
"grad_norm": 0.278730183839798,
"learning_rate": 0.00031362784960482434,
"loss": 0.8499,
"step": 9160
},
{
"epoch": 0.1025606612198791,
"grad_norm": 0.28657999634742737,
"learning_rate": 0.00031339942436840425,
"loss": 0.85,
"step": 9170
},
{
"epoch": 0.10267250490714178,
"grad_norm": 0.2848927676677704,
"learning_rate": 0.0003131709991319841,
"loss": 0.8411,
"step": 9180
},
{
"epoch": 0.10278434859440445,
"grad_norm": 0.28381872177124023,
"learning_rate": 0.00031294257389556396,
"loss": 0.8508,
"step": 9190
},
{
"epoch": 0.10289619228166715,
"grad_norm": 0.26624616980552673,
"learning_rate": 0.0003127141486591439,
"loss": 0.8658,
"step": 9200
},
{
"epoch": 0.10300803596892982,
"grad_norm": 0.2605401277542114,
"learning_rate": 0.00031248572342272373,
"loss": 0.8602,
"step": 9210
},
{
"epoch": 0.1031198796561925,
"grad_norm": 0.2819276750087738,
"learning_rate": 0.0003122572981863036,
"loss": 0.8614,
"step": 9220
},
{
"epoch": 0.10323172334345519,
"grad_norm": 0.27677878737449646,
"learning_rate": 0.00031202887294988355,
"loss": 0.8556,
"step": 9230
},
{
"epoch": 0.10334356703071787,
"grad_norm": 0.25589799880981445,
"learning_rate": 0.0003118004477134634,
"loss": 0.8704,
"step": 9240
},
{
"epoch": 0.10345541071798055,
"grad_norm": 0.2731853425502777,
"learning_rate": 0.00031157202247704326,
"loss": 0.8428,
"step": 9250
},
{
"epoch": 0.10356725440524323,
"grad_norm": 0.3047199547290802,
"learning_rate": 0.00031134359724062317,
"loss": 0.8508,
"step": 9260
},
{
"epoch": 0.10367909809250592,
"grad_norm": 0.28696686029434204,
"learning_rate": 0.00031111517200420303,
"loss": 0.8571,
"step": 9270
},
{
"epoch": 0.1037909417797686,
"grad_norm": 0.23354049026966095,
"learning_rate": 0.0003108867467677829,
"loss": 0.8518,
"step": 9280
},
{
"epoch": 0.10390278546703127,
"grad_norm": 0.27123787999153137,
"learning_rate": 0.0003106583215313628,
"loss": 0.8621,
"step": 9290
},
{
"epoch": 0.10401462915429396,
"grad_norm": 0.2509523332118988,
"learning_rate": 0.00031042989629494265,
"loss": 0.8568,
"step": 9300
},
{
"epoch": 0.10412647284155664,
"grad_norm": 0.2359481155872345,
"learning_rate": 0.00031020147105852256,
"loss": 0.8598,
"step": 9310
},
{
"epoch": 0.10423831652881932,
"grad_norm": 0.27097463607788086,
"learning_rate": 0.00030997304582210247,
"loss": 0.8615,
"step": 9320
},
{
"epoch": 0.104350160216082,
"grad_norm": 0.2616114020347595,
"learning_rate": 0.0003097446205856823,
"loss": 0.8462,
"step": 9330
},
{
"epoch": 0.10446200390334469,
"grad_norm": 0.30027398467063904,
"learning_rate": 0.0003095161953492622,
"loss": 0.8683,
"step": 9340
},
{
"epoch": 0.10457384759060737,
"grad_norm": 0.28468623757362366,
"learning_rate": 0.0003092877701128421,
"loss": 0.856,
"step": 9350
},
{
"epoch": 0.10468569127787004,
"grad_norm": 0.318521112203598,
"learning_rate": 0.00030905934487642195,
"loss": 0.8532,
"step": 9360
},
{
"epoch": 0.10479753496513274,
"grad_norm": 0.3118298351764679,
"learning_rate": 0.0003088309196400018,
"loss": 0.8546,
"step": 9370
},
{
"epoch": 0.10490937865239541,
"grad_norm": 0.28549399971961975,
"learning_rate": 0.0003086024944035817,
"loss": 0.8718,
"step": 9380
},
{
"epoch": 0.10502122233965809,
"grad_norm": 0.24803526699543,
"learning_rate": 0.0003083740691671616,
"loss": 0.8489,
"step": 9390
},
{
"epoch": 0.10513306602692078,
"grad_norm": 0.26765918731689453,
"learning_rate": 0.0003081456439307415,
"loss": 0.8617,
"step": 9400
},
{
"epoch": 0.10524490971418346,
"grad_norm": 0.26363757252693176,
"learning_rate": 0.0003079172186943214,
"loss": 0.8648,
"step": 9410
},
{
"epoch": 0.10535675340144614,
"grad_norm": 0.2734963595867157,
"learning_rate": 0.00030768879345790124,
"loss": 0.8556,
"step": 9420
},
{
"epoch": 0.10546859708870882,
"grad_norm": 0.2773530185222626,
"learning_rate": 0.0003074603682214811,
"loss": 0.8737,
"step": 9430
},
{
"epoch": 0.1055804407759715,
"grad_norm": 0.2684498429298401,
"learning_rate": 0.000307231942985061,
"loss": 0.8657,
"step": 9440
},
{
"epoch": 0.10569228446323418,
"grad_norm": 0.26110732555389404,
"learning_rate": 0.00030700351774864086,
"loss": 0.8618,
"step": 9450
},
{
"epoch": 0.10580412815049686,
"grad_norm": 0.27595090866088867,
"learning_rate": 0.0003067750925122207,
"loss": 0.8654,
"step": 9460
},
{
"epoch": 0.10591597183775955,
"grad_norm": 0.2799736559391022,
"learning_rate": 0.0003065466672758007,
"loss": 0.8583,
"step": 9470
},
{
"epoch": 0.10602781552502223,
"grad_norm": 0.2729387879371643,
"learning_rate": 0.00030631824203938054,
"loss": 0.8628,
"step": 9480
},
{
"epoch": 0.10613965921228491,
"grad_norm": 0.30332332849502563,
"learning_rate": 0.0003060898168029604,
"loss": 0.8512,
"step": 9490
},
{
"epoch": 0.10625150289954759,
"grad_norm": 0.276753306388855,
"learning_rate": 0.0003058613915665403,
"loss": 0.85,
"step": 9500
},
{
"epoch": 0.10636334658681028,
"grad_norm": 0.3190478980541229,
"learning_rate": 0.00030563296633012016,
"loss": 0.8534,
"step": 9510
},
{
"epoch": 0.10647519027407296,
"grad_norm": 0.2926968038082123,
"learning_rate": 0.0003054045410937,
"loss": 0.8309,
"step": 9520
},
{
"epoch": 0.10658703396133563,
"grad_norm": 0.29631507396698,
"learning_rate": 0.0003051761158572799,
"loss": 0.8406,
"step": 9530
},
{
"epoch": 0.10669887764859833,
"grad_norm": 0.2881840765476227,
"learning_rate": 0.0003049476906208598,
"loss": 0.8274,
"step": 9540
},
{
"epoch": 0.106810721335861,
"grad_norm": 0.2623940408229828,
"learning_rate": 0.0003047192653844397,
"loss": 0.8346,
"step": 9550
},
{
"epoch": 0.10692256502312368,
"grad_norm": 0.29798468947410583,
"learning_rate": 0.00030449084014801955,
"loss": 0.8362,
"step": 9560
},
{
"epoch": 0.10703440871038636,
"grad_norm": 0.2976382076740265,
"learning_rate": 0.00030426241491159946,
"loss": 0.8179,
"step": 9570
},
{
"epoch": 0.10714625239764905,
"grad_norm": 0.28637486696243286,
"learning_rate": 0.0003040339896751793,
"loss": 0.8363,
"step": 9580
},
{
"epoch": 0.10725809608491173,
"grad_norm": 0.3023325204849243,
"learning_rate": 0.00030380556443875917,
"loss": 0.8382,
"step": 9590
},
{
"epoch": 0.1073699397721744,
"grad_norm": 0.2889160215854645,
"learning_rate": 0.0003035771392023391,
"loss": 0.8476,
"step": 9600
},
{
"epoch": 0.1074817834594371,
"grad_norm": 0.2868768572807312,
"learning_rate": 0.00030334871396591893,
"loss": 0.8482,
"step": 9610
},
{
"epoch": 0.10759362714669977,
"grad_norm": 0.2773813307285309,
"learning_rate": 0.0003031202887294988,
"loss": 0.8577,
"step": 9620
},
{
"epoch": 0.10770547083396245,
"grad_norm": 0.28698423504829407,
"learning_rate": 0.00030289186349307875,
"loss": 0.8663,
"step": 9630
},
{
"epoch": 0.10781731452122513,
"grad_norm": 0.26839759945869446,
"learning_rate": 0.0003026634382566586,
"loss": 0.8649,
"step": 9640
},
{
"epoch": 0.10792915820848782,
"grad_norm": 0.2686857283115387,
"learning_rate": 0.00030243501302023847,
"loss": 0.8563,
"step": 9650
},
{
"epoch": 0.1080410018957505,
"grad_norm": 0.2815250754356384,
"learning_rate": 0.0003022065877838184,
"loss": 0.8538,
"step": 9660
},
{
"epoch": 0.10815284558301318,
"grad_norm": 0.24625800549983978,
"learning_rate": 0.00030197816254739823,
"loss": 0.87,
"step": 9670
},
{
"epoch": 0.10826468927027587,
"grad_norm": 0.27051877975463867,
"learning_rate": 0.0003017497373109781,
"loss": 0.8692,
"step": 9680
},
{
"epoch": 0.10837653295753855,
"grad_norm": 0.253892183303833,
"learning_rate": 0.000301521312074558,
"loss": 0.8583,
"step": 9690
},
{
"epoch": 0.10848837664480122,
"grad_norm": 0.26951879262924194,
"learning_rate": 0.0003012928868381379,
"loss": 0.8699,
"step": 9700
},
{
"epoch": 0.1086002203320639,
"grad_norm": 0.27741488814353943,
"learning_rate": 0.00030106446160171776,
"loss": 0.8673,
"step": 9710
},
{
"epoch": 0.10871206401932659,
"grad_norm": 0.2655075788497925,
"learning_rate": 0.00030083603636529767,
"loss": 0.8628,
"step": 9720
},
{
"epoch": 0.10882390770658927,
"grad_norm": 0.298532098531723,
"learning_rate": 0.00030060761112887753,
"loss": 0.8707,
"step": 9730
},
{
"epoch": 0.10893575139385195,
"grad_norm": 0.3105684816837311,
"learning_rate": 0.0003003791858924574,
"loss": 0.8661,
"step": 9740
},
{
"epoch": 0.10904759508111464,
"grad_norm": 0.27781355381011963,
"learning_rate": 0.0003001507606560373,
"loss": 0.8871,
"step": 9750
},
{
"epoch": 0.10915943876837732,
"grad_norm": 0.2966761589050293,
"learning_rate": 0.00029992233541961715,
"loss": 0.875,
"step": 9760
},
{
"epoch": 0.10927128245564,
"grad_norm": 0.3010736405849457,
"learning_rate": 0.000299693910183197,
"loss": 0.8746,
"step": 9770
},
{
"epoch": 0.10938312614290267,
"grad_norm": 0.31352171301841736,
"learning_rate": 0.00029946548494677697,
"loss": 0.8733,
"step": 9780
},
{
"epoch": 0.10949496983016536,
"grad_norm": 0.30627313256263733,
"learning_rate": 0.0002992370597103568,
"loss": 0.8675,
"step": 9790
},
{
"epoch": 0.10960681351742804,
"grad_norm": 0.23990577459335327,
"learning_rate": 0.0002990086344739367,
"loss": 0.8614,
"step": 9800
},
{
"epoch": 0.10971865720469072,
"grad_norm": 0.2856599688529968,
"learning_rate": 0.0002987802092375166,
"loss": 0.8454,
"step": 9810
},
{
"epoch": 0.10983050089195341,
"grad_norm": 0.26476389169692993,
"learning_rate": 0.00029855178400109645,
"loss": 0.8616,
"step": 9820
},
{
"epoch": 0.10994234457921609,
"grad_norm": 0.2871752381324768,
"learning_rate": 0.0002983233587646763,
"loss": 0.8444,
"step": 9830
},
{
"epoch": 0.11005418826647877,
"grad_norm": 0.27318039536476135,
"learning_rate": 0.0002980949335282562,
"loss": 0.8487,
"step": 9840
},
{
"epoch": 0.11016603195374144,
"grad_norm": 0.25630125403404236,
"learning_rate": 0.00029786650829183607,
"loss": 0.846,
"step": 9850
},
{
"epoch": 0.11027787564100414,
"grad_norm": 0.23908184468746185,
"learning_rate": 0.000297638083055416,
"loss": 0.8403,
"step": 9860
},
{
"epoch": 0.11038971932826681,
"grad_norm": 0.2978418469429016,
"learning_rate": 0.0002974096578189959,
"loss": 0.8652,
"step": 9870
},
{
"epoch": 0.11050156301552949,
"grad_norm": 0.2503781318664551,
"learning_rate": 0.00029718123258257574,
"loss": 0.8657,
"step": 9880
},
{
"epoch": 0.11061340670279218,
"grad_norm": 0.28556469082832336,
"learning_rate": 0.0002969528073461556,
"loss": 0.8501,
"step": 9890
},
{
"epoch": 0.11072525039005486,
"grad_norm": 0.2643977701663971,
"learning_rate": 0.0002967243821097355,
"loss": 0.8742,
"step": 9900
},
{
"epoch": 0.11083709407731754,
"grad_norm": 0.2757241725921631,
"learning_rate": 0.00029649595687331536,
"loss": 0.8837,
"step": 9910
},
{
"epoch": 0.11094893776458022,
"grad_norm": 0.28263452649116516,
"learning_rate": 0.0002962675316368952,
"loss": 0.8793,
"step": 9920
},
{
"epoch": 0.11106078145184291,
"grad_norm": 0.27624276280403137,
"learning_rate": 0.00029603910640047513,
"loss": 0.8669,
"step": 9930
},
{
"epoch": 0.11117262513910559,
"grad_norm": 0.2814600467681885,
"learning_rate": 0.00029581068116405504,
"loss": 0.8858,
"step": 9940
},
{
"epoch": 0.11128446882636826,
"grad_norm": 0.2871972918510437,
"learning_rate": 0.0002955822559276349,
"loss": 0.8714,
"step": 9950
},
{
"epoch": 0.11139631251363095,
"grad_norm": 0.2885976731777191,
"learning_rate": 0.0002953538306912148,
"loss": 0.8675,
"step": 9960
},
{
"epoch": 0.11150815620089363,
"grad_norm": 0.281021386384964,
"learning_rate": 0.00029512540545479466,
"loss": 0.8762,
"step": 9970
},
{
"epoch": 0.11161999988815631,
"grad_norm": 0.2923888862133026,
"learning_rate": 0.0002948969802183745,
"loss": 0.87,
"step": 9980
},
{
"epoch": 0.11173184357541899,
"grad_norm": 0.2596036195755005,
"learning_rate": 0.00029466855498195443,
"loss": 0.8696,
"step": 9990
},
{
"epoch": 0.11184368726268168,
"grad_norm": 0.2749873697757721,
"learning_rate": 0.0002944401297455343,
"loss": 0.8604,
"step": 10000
},
{
"epoch": 0.11195553094994436,
"grad_norm": 0.2696766257286072,
"learning_rate": 0.00029421170450911414,
"loss": 0.8743,
"step": 10010
},
{
"epoch": 0.11206737463720703,
"grad_norm": 0.2824450731277466,
"learning_rate": 0.00029398327927269405,
"loss": 0.8734,
"step": 10020
},
{
"epoch": 0.11217921832446973,
"grad_norm": 0.2795054614543915,
"learning_rate": 0.00029375485403627396,
"loss": 0.865,
"step": 10030
},
{
"epoch": 0.1122910620117324,
"grad_norm": 0.2974453866481781,
"learning_rate": 0.0002935264287998538,
"loss": 0.8762,
"step": 10040
},
{
"epoch": 0.11240290569899508,
"grad_norm": 0.27134743332862854,
"learning_rate": 0.00029329800356343367,
"loss": 0.8616,
"step": 10050
},
{
"epoch": 0.11251474938625777,
"grad_norm": 0.2651810348033905,
"learning_rate": 0.0002930695783270136,
"loss": 0.8653,
"step": 10060
},
{
"epoch": 0.11262659307352045,
"grad_norm": 0.29161420464515686,
"learning_rate": 0.00029284115309059344,
"loss": 0.8583,
"step": 10070
},
{
"epoch": 0.11273843676078313,
"grad_norm": 0.27624139189720154,
"learning_rate": 0.0002926127278541733,
"loss": 0.8447,
"step": 10080
},
{
"epoch": 0.1128502804480458,
"grad_norm": 0.290632039308548,
"learning_rate": 0.00029238430261775326,
"loss": 0.8568,
"step": 10090
},
{
"epoch": 0.1129621241353085,
"grad_norm": 0.2906644940376282,
"learning_rate": 0.0002921558773813331,
"loss": 0.8566,
"step": 10100
},
{
"epoch": 0.11307396782257118,
"grad_norm": 0.29284584522247314,
"learning_rate": 0.00029192745214491297,
"loss": 0.8679,
"step": 10110
},
{
"epoch": 0.11318581150983385,
"grad_norm": 0.29635393619537354,
"learning_rate": 0.0002916990269084929,
"loss": 0.8648,
"step": 10120
},
{
"epoch": 0.11329765519709654,
"grad_norm": 0.2560585141181946,
"learning_rate": 0.00029147060167207273,
"loss": 0.8565,
"step": 10130
},
{
"epoch": 0.11340949888435922,
"grad_norm": 0.2480679154396057,
"learning_rate": 0.0002912421764356526,
"loss": 0.8574,
"step": 10140
},
{
"epoch": 0.1135213425716219,
"grad_norm": 0.28708118200302124,
"learning_rate": 0.0002910137511992325,
"loss": 0.8658,
"step": 10150
},
{
"epoch": 0.11363318625888458,
"grad_norm": 0.2553873062133789,
"learning_rate": 0.00029078532596281235,
"loss": 0.8721,
"step": 10160
},
{
"epoch": 0.11374502994614727,
"grad_norm": 0.26742488145828247,
"learning_rate": 0.00029055690072639226,
"loss": 0.8608,
"step": 10170
},
{
"epoch": 0.11385687363340995,
"grad_norm": 0.2674279510974884,
"learning_rate": 0.0002903284754899722,
"loss": 0.8763,
"step": 10180
},
{
"epoch": 0.11396871732067262,
"grad_norm": 0.2484348863363266,
"learning_rate": 0.00029010005025355203,
"loss": 0.8799,
"step": 10190
},
{
"epoch": 0.11408056100793532,
"grad_norm": 0.2603932321071625,
"learning_rate": 0.0002898716250171319,
"loss": 0.8922,
"step": 10200
},
{
"epoch": 0.114192404695198,
"grad_norm": 0.2510204613208771,
"learning_rate": 0.0002896431997807118,
"loss": 0.8851,
"step": 10210
},
{
"epoch": 0.11430424838246067,
"grad_norm": 0.26795732975006104,
"learning_rate": 0.00028941477454429165,
"loss": 0.8917,
"step": 10220
},
{
"epoch": 0.11441609206972335,
"grad_norm": 0.2880701422691345,
"learning_rate": 0.0002891863493078715,
"loss": 0.8903,
"step": 10230
},
{
"epoch": 0.11452793575698604,
"grad_norm": 0.23970642685890198,
"learning_rate": 0.0002889579240714514,
"loss": 0.8882,
"step": 10240
},
{
"epoch": 0.11463977944424872,
"grad_norm": 0.2786742150783539,
"learning_rate": 0.0002887294988350313,
"loss": 0.8827,
"step": 10250
},
{
"epoch": 0.1147516231315114,
"grad_norm": 0.2780776619911194,
"learning_rate": 0.0002885010735986112,
"loss": 0.8879,
"step": 10260
},
{
"epoch": 0.11486346681877409,
"grad_norm": 0.26984742283821106,
"learning_rate": 0.0002882726483621911,
"loss": 0.8732,
"step": 10270
},
{
"epoch": 0.11497531050603677,
"grad_norm": 0.26902884244918823,
"learning_rate": 0.00028804422312577095,
"loss": 0.878,
"step": 10280
},
{
"epoch": 0.11508715419329944,
"grad_norm": 0.24787285923957825,
"learning_rate": 0.0002878157978893508,
"loss": 0.8573,
"step": 10290
},
{
"epoch": 0.11519899788056212,
"grad_norm": 0.22702965140342712,
"learning_rate": 0.0002875873726529307,
"loss": 0.8621,
"step": 10300
},
{
"epoch": 0.11531084156782481,
"grad_norm": 0.27474096417427063,
"learning_rate": 0.00028735894741651057,
"loss": 0.8763,
"step": 10310
},
{
"epoch": 0.11542268525508749,
"grad_norm": 0.2605912983417511,
"learning_rate": 0.0002871305221800904,
"loss": 0.8706,
"step": 10320
},
{
"epoch": 0.11553452894235017,
"grad_norm": 0.25281742215156555,
"learning_rate": 0.0002869020969436704,
"loss": 0.855,
"step": 10330
},
{
"epoch": 0.11564637262961286,
"grad_norm": 0.2559000849723816,
"learning_rate": 0.00028667367170725024,
"loss": 0.8549,
"step": 10340
},
{
"epoch": 0.11575821631687554,
"grad_norm": 0.2439345121383667,
"learning_rate": 0.0002864452464708301,
"loss": 0.8639,
"step": 10350
},
{
"epoch": 0.11587006000413821,
"grad_norm": 0.2690776288509369,
"learning_rate": 0.00028621682123441,
"loss": 0.8487,
"step": 10360
},
{
"epoch": 0.11598190369140089,
"grad_norm": 0.25111067295074463,
"learning_rate": 0.00028598839599798987,
"loss": 0.8558,
"step": 10370
},
{
"epoch": 0.11609374737866358,
"grad_norm": 0.26838451623916626,
"learning_rate": 0.0002857599707615697,
"loss": 0.8603,
"step": 10380
},
{
"epoch": 0.11620559106592626,
"grad_norm": 0.2401856780052185,
"learning_rate": 0.00028553154552514963,
"loss": 0.8286,
"step": 10390
},
{
"epoch": 0.11631743475318894,
"grad_norm": 0.26284924149513245,
"learning_rate": 0.0002853031202887295,
"loss": 0.8402,
"step": 10400
},
{
"epoch": 0.11642927844045163,
"grad_norm": 0.28734955191612244,
"learning_rate": 0.0002850746950523094,
"loss": 0.8358,
"step": 10410
},
{
"epoch": 0.11654112212771431,
"grad_norm": 0.2564549446105957,
"learning_rate": 0.0002848462698158893,
"loss": 0.8458,
"step": 10420
},
{
"epoch": 0.11665296581497699,
"grad_norm": 0.2507050633430481,
"learning_rate": 0.00028461784457946916,
"loss": 0.8371,
"step": 10430
},
{
"epoch": 0.11676480950223966,
"grad_norm": 0.25748834013938904,
"learning_rate": 0.000284389419343049,
"loss": 0.8527,
"step": 10440
},
{
"epoch": 0.11687665318950236,
"grad_norm": 0.24484454095363617,
"learning_rate": 0.00028416099410662893,
"loss": 0.8372,
"step": 10450
},
{
"epoch": 0.11698849687676503,
"grad_norm": 0.24171967804431915,
"learning_rate": 0.0002839325688702088,
"loss": 0.8327,
"step": 10460
},
{
"epoch": 0.11710034056402771,
"grad_norm": 0.30423420667648315,
"learning_rate": 0.00028370414363378864,
"loss": 0.8271,
"step": 10470
},
{
"epoch": 0.1172121842512904,
"grad_norm": 0.2598424553871155,
"learning_rate": 0.0002834757183973685,
"loss": 0.8169,
"step": 10480
},
{
"epoch": 0.11732402793855308,
"grad_norm": 0.2608656585216522,
"learning_rate": 0.00028324729316094846,
"loss": 0.8261,
"step": 10490
},
{
"epoch": 0.11743587162581576,
"grad_norm": 0.25370126962661743,
"learning_rate": 0.0002830188679245283,
"loss": 0.8227,
"step": 10500
},
{
"epoch": 0.11754771531307844,
"grad_norm": 0.2760542333126068,
"learning_rate": 0.00028279044268810817,
"loss": 0.8413,
"step": 10510
},
{
"epoch": 0.11765955900034113,
"grad_norm": 0.24994856119155884,
"learning_rate": 0.0002825620174516881,
"loss": 0.8288,
"step": 10520
},
{
"epoch": 0.1177714026876038,
"grad_norm": 0.25439032912254333,
"learning_rate": 0.00028233359221526794,
"loss": 0.8318,
"step": 10530
},
{
"epoch": 0.11788324637486648,
"grad_norm": 0.28182244300842285,
"learning_rate": 0.0002821051669788478,
"loss": 0.8437,
"step": 10540
},
{
"epoch": 0.11799509006212917,
"grad_norm": 0.2419012039899826,
"learning_rate": 0.0002818767417424277,
"loss": 0.8446,
"step": 10550
},
{
"epoch": 0.11810693374939185,
"grad_norm": 0.2598857581615448,
"learning_rate": 0.0002816483165060076,
"loss": 0.8428,
"step": 10560
},
{
"epoch": 0.11821877743665453,
"grad_norm": 0.25206229090690613,
"learning_rate": 0.00028141989126958747,
"loss": 0.8533,
"step": 10570
},
{
"epoch": 0.1183306211239172,
"grad_norm": 0.25155991315841675,
"learning_rate": 0.0002811914660331674,
"loss": 0.8538,
"step": 10580
},
{
"epoch": 0.1184424648111799,
"grad_norm": 0.2342199832201004,
"learning_rate": 0.00028096304079674723,
"loss": 0.8519,
"step": 10590
},
{
"epoch": 0.11855430849844258,
"grad_norm": 0.25823327898979187,
"learning_rate": 0.0002807346155603271,
"loss": 0.8483,
"step": 10600
},
{
"epoch": 0.11866615218570525,
"grad_norm": 0.26428598165512085,
"learning_rate": 0.000280506190323907,
"loss": 0.86,
"step": 10610
},
{
"epoch": 0.11877799587296795,
"grad_norm": 0.25176918506622314,
"learning_rate": 0.00028027776508748685,
"loss": 0.8589,
"step": 10620
},
{
"epoch": 0.11888983956023062,
"grad_norm": 0.28826919198036194,
"learning_rate": 0.0002800493398510667,
"loss": 0.8627,
"step": 10630
},
{
"epoch": 0.1190016832474933,
"grad_norm": 0.24679958820343018,
"learning_rate": 0.0002798209146146467,
"loss": 0.8563,
"step": 10640
},
{
"epoch": 0.11911352693475598,
"grad_norm": 0.2550687789916992,
"learning_rate": 0.00027959248937822653,
"loss": 0.8535,
"step": 10650
},
{
"epoch": 0.11922537062201867,
"grad_norm": 0.2506476640701294,
"learning_rate": 0.0002793640641418064,
"loss": 0.8553,
"step": 10660
},
{
"epoch": 0.11933721430928135,
"grad_norm": 0.24980700016021729,
"learning_rate": 0.0002791356389053863,
"loss": 0.854,
"step": 10670
},
{
"epoch": 0.11944905799654403,
"grad_norm": 0.2280970811843872,
"learning_rate": 0.00027890721366896615,
"loss": 0.8569,
"step": 10680
},
{
"epoch": 0.11956090168380672,
"grad_norm": 0.25191232562065125,
"learning_rate": 0.000278678788432546,
"loss": 0.8566,
"step": 10690
},
{
"epoch": 0.1196727453710694,
"grad_norm": 0.2748493552207947,
"learning_rate": 0.0002784503631961259,
"loss": 0.8573,
"step": 10700
},
{
"epoch": 0.11978458905833207,
"grad_norm": 0.25123515725135803,
"learning_rate": 0.00027822193795970577,
"loss": 0.8473,
"step": 10710
},
{
"epoch": 0.11989643274559475,
"grad_norm": 0.25573378801345825,
"learning_rate": 0.0002779935127232857,
"loss": 0.8469,
"step": 10720
},
{
"epoch": 0.12000827643285744,
"grad_norm": 0.23367713391780853,
"learning_rate": 0.0002777650874868656,
"loss": 0.8452,
"step": 10730
},
{
"epoch": 0.12012012012012012,
"grad_norm": 0.24593010544776917,
"learning_rate": 0.00027753666225044545,
"loss": 0.838,
"step": 10740
},
{
"epoch": 0.1202319638073828,
"grad_norm": 0.2422724962234497,
"learning_rate": 0.0002773082370140253,
"loss": 0.8398,
"step": 10750
},
{
"epoch": 0.12034380749464549,
"grad_norm": 0.24471783638000488,
"learning_rate": 0.0002770798117776052,
"loss": 0.8409,
"step": 10760
},
{
"epoch": 0.12045565118190817,
"grad_norm": 0.25523480772972107,
"learning_rate": 0.00027685138654118507,
"loss": 0.835,
"step": 10770
},
{
"epoch": 0.12056749486917084,
"grad_norm": 0.24846532940864563,
"learning_rate": 0.0002766229613047649,
"loss": 0.842,
"step": 10780
},
{
"epoch": 0.12067933855643354,
"grad_norm": 0.26955240964889526,
"learning_rate": 0.00027639453606834484,
"loss": 0.8525,
"step": 10790
},
{
"epoch": 0.12079118224369621,
"grad_norm": 0.2711884081363678,
"learning_rate": 0.00027616611083192475,
"loss": 0.8352,
"step": 10800
},
{
"epoch": 0.12090302593095889,
"grad_norm": 0.24954953789710999,
"learning_rate": 0.0002759376855955046,
"loss": 0.8257,
"step": 10810
},
{
"epoch": 0.12101486961822157,
"grad_norm": 0.27029111981391907,
"learning_rate": 0.0002757092603590845,
"loss": 0.8147,
"step": 10820
},
{
"epoch": 0.12112671330548426,
"grad_norm": 0.2440258413553238,
"learning_rate": 0.00027548083512266437,
"loss": 0.8239,
"step": 10830
},
{
"epoch": 0.12123855699274694,
"grad_norm": 0.27082934975624084,
"learning_rate": 0.0002752524098862442,
"loss": 0.8391,
"step": 10840
},
{
"epoch": 0.12135040068000962,
"grad_norm": 0.27641886472702026,
"learning_rate": 0.00027502398464982413,
"loss": 0.8276,
"step": 10850
},
{
"epoch": 0.1214622443672723,
"grad_norm": 0.24772177636623383,
"learning_rate": 0.000274795559413404,
"loss": 0.8226,
"step": 10860
},
{
"epoch": 0.12157408805453498,
"grad_norm": 0.2585364580154419,
"learning_rate": 0.00027456713417698384,
"loss": 0.8096,
"step": 10870
},
{
"epoch": 0.12168593174179766,
"grad_norm": 0.2730146050453186,
"learning_rate": 0.0002743387089405638,
"loss": 0.8156,
"step": 10880
},
{
"epoch": 0.12179777542906034,
"grad_norm": 0.2693599760532379,
"learning_rate": 0.00027411028370414366,
"loss": 0.8125,
"step": 10890
},
{
"epoch": 0.12190961911632303,
"grad_norm": 0.26071295142173767,
"learning_rate": 0.0002738818584677235,
"loss": 0.8106,
"step": 10900
},
{
"epoch": 0.12202146280358571,
"grad_norm": 0.2560258209705353,
"learning_rate": 0.0002736534332313034,
"loss": 0.8195,
"step": 10910
},
{
"epoch": 0.12213330649084839,
"grad_norm": 0.27529552578926086,
"learning_rate": 0.0002734250079948833,
"loss": 0.8104,
"step": 10920
},
{
"epoch": 0.12224515017811108,
"grad_norm": 0.2782133221626282,
"learning_rate": 0.00027319658275846314,
"loss": 0.8105,
"step": 10930
},
{
"epoch": 0.12235699386537376,
"grad_norm": 0.27981024980545044,
"learning_rate": 0.000272968157522043,
"loss": 0.8085,
"step": 10940
},
{
"epoch": 0.12246883755263643,
"grad_norm": 0.2741667926311493,
"learning_rate": 0.0002727397322856229,
"loss": 0.8042,
"step": 10950
},
{
"epoch": 0.12258068123989911,
"grad_norm": 0.2468159943819046,
"learning_rate": 0.0002725113070492028,
"loss": 0.8198,
"step": 10960
},
{
"epoch": 0.1226925249271618,
"grad_norm": 0.26167941093444824,
"learning_rate": 0.00027228288181278267,
"loss": 0.8176,
"step": 10970
},
{
"epoch": 0.12280436861442448,
"grad_norm": 0.26660802960395813,
"learning_rate": 0.0002720544565763626,
"loss": 0.8036,
"step": 10980
},
{
"epoch": 0.12291621230168716,
"grad_norm": 0.301575243473053,
"learning_rate": 0.00027182603133994244,
"loss": 0.8049,
"step": 10990
},
{
"epoch": 0.12302805598894985,
"grad_norm": 0.2759682834148407,
"learning_rate": 0.0002715976061035223,
"loss": 0.8024,
"step": 11000
},
{
"epoch": 0.12313989967621253,
"grad_norm": 0.25659626722335815,
"learning_rate": 0.0002713691808671022,
"loss": 0.8229,
"step": 11010
},
{
"epoch": 0.1232517433634752,
"grad_norm": 0.2672923505306244,
"learning_rate": 0.00027114075563068206,
"loss": 0.8018,
"step": 11020
},
{
"epoch": 0.12336358705073788,
"grad_norm": 0.25423988699913025,
"learning_rate": 0.0002709123303942619,
"loss": 0.836,
"step": 11030
},
{
"epoch": 0.12347543073800057,
"grad_norm": 0.28428804874420166,
"learning_rate": 0.0002706839051578419,
"loss": 0.8299,
"step": 11040
},
{
"epoch": 0.12358727442526325,
"grad_norm": 0.2924467921257019,
"learning_rate": 0.00027045547992142173,
"loss": 0.8236,
"step": 11050
},
{
"epoch": 0.12369911811252593,
"grad_norm": 0.25230658054351807,
"learning_rate": 0.0002702270546850016,
"loss": 0.8274,
"step": 11060
},
{
"epoch": 0.12381096179978862,
"grad_norm": 0.27876734733581543,
"learning_rate": 0.0002699986294485815,
"loss": 0.8244,
"step": 11070
},
{
"epoch": 0.1239228054870513,
"grad_norm": 0.29841694235801697,
"learning_rate": 0.00026977020421216136,
"loss": 0.8327,
"step": 11080
},
{
"epoch": 0.12403464917431398,
"grad_norm": 0.3055926263332367,
"learning_rate": 0.0002695417789757412,
"loss": 0.8247,
"step": 11090
},
{
"epoch": 0.12414649286157665,
"grad_norm": 0.275919109582901,
"learning_rate": 0.0002693133537393211,
"loss": 0.8263,
"step": 11100
},
{
"epoch": 0.12425833654883935,
"grad_norm": 0.3069559931755066,
"learning_rate": 0.00026908492850290103,
"loss": 0.8242,
"step": 11110
},
{
"epoch": 0.12437018023610202,
"grad_norm": 0.2574029564857483,
"learning_rate": 0.0002688565032664809,
"loss": 0.819,
"step": 11120
},
{
"epoch": 0.1244820239233647,
"grad_norm": 0.25053170323371887,
"learning_rate": 0.0002686280780300608,
"loss": 0.8022,
"step": 11130
},
{
"epoch": 0.12459386761062739,
"grad_norm": 0.27337634563446045,
"learning_rate": 0.00026839965279364065,
"loss": 0.8127,
"step": 11140
},
{
"epoch": 0.12470571129789007,
"grad_norm": 0.2531510889530182,
"learning_rate": 0.0002681712275572205,
"loss": 0.8138,
"step": 11150
},
{
"epoch": 0.12481755498515275,
"grad_norm": 0.27455076575279236,
"learning_rate": 0.0002679428023208004,
"loss": 0.7974,
"step": 11160
},
{
"epoch": 0.12492939867241543,
"grad_norm": 0.2515604496002197,
"learning_rate": 0.0002677143770843803,
"loss": 0.8077,
"step": 11170
},
{
"epoch": 0.12504124235967812,
"grad_norm": 0.27941974997520447,
"learning_rate": 0.00026748595184796013,
"loss": 0.8099,
"step": 11180
},
{
"epoch": 0.1251530860469408,
"grad_norm": 0.2508449852466583,
"learning_rate": 0.0002672575266115401,
"loss": 0.8077,
"step": 11190
},
{
"epoch": 0.12526492973420347,
"grad_norm": 0.24805410206317902,
"learning_rate": 0.00026702910137511995,
"loss": 0.8029,
"step": 11200
},
{
"epoch": 0.12537677342146616,
"grad_norm": 0.2730201184749603,
"learning_rate": 0.0002668006761386998,
"loss": 0.8383,
"step": 11210
},
{
"epoch": 0.12548861710872883,
"grad_norm": 0.24301932752132416,
"learning_rate": 0.0002665722509022797,
"loss": 0.8245,
"step": 11220
},
{
"epoch": 0.12560046079599152,
"grad_norm": 0.270059734582901,
"learning_rate": 0.00026634382566585957,
"loss": 0.8287,
"step": 11230
},
{
"epoch": 0.1257123044832542,
"grad_norm": 0.24491746723651886,
"learning_rate": 0.0002661154004294394,
"loss": 0.8283,
"step": 11240
},
{
"epoch": 0.12582414817051688,
"grad_norm": 0.2461182177066803,
"learning_rate": 0.00026588697519301934,
"loss": 0.8285,
"step": 11250
},
{
"epoch": 0.12593599185777957,
"grad_norm": 0.26306700706481934,
"learning_rate": 0.0002656585499565992,
"loss": 0.8366,
"step": 11260
},
{
"epoch": 0.12604783554504226,
"grad_norm": 0.2317613661289215,
"learning_rate": 0.0002654301247201791,
"loss": 0.8373,
"step": 11270
},
{
"epoch": 0.12615967923230492,
"grad_norm": 0.25218284130096436,
"learning_rate": 0.000265201699483759,
"loss": 0.8163,
"step": 11280
},
{
"epoch": 0.1262715229195676,
"grad_norm": 0.2527898848056793,
"learning_rate": 0.00026497327424733887,
"loss": 0.819,
"step": 11290
},
{
"epoch": 0.1263833666068303,
"grad_norm": 0.2344309389591217,
"learning_rate": 0.0002647448490109187,
"loss": 0.8335,
"step": 11300
},
{
"epoch": 0.12649521029409297,
"grad_norm": 0.23913320899009705,
"learning_rate": 0.00026451642377449863,
"loss": 0.8289,
"step": 11310
},
{
"epoch": 0.12660705398135566,
"grad_norm": 0.24901095032691956,
"learning_rate": 0.0002642879985380785,
"loss": 0.8159,
"step": 11320
},
{
"epoch": 0.12671889766861835,
"grad_norm": 0.2503173351287842,
"learning_rate": 0.00026405957330165834,
"loss": 0.8372,
"step": 11330
},
{
"epoch": 0.12683074135588102,
"grad_norm": 0.2341470569372177,
"learning_rate": 0.00026383114806523825,
"loss": 0.8264,
"step": 11340
},
{
"epoch": 0.1269425850431437,
"grad_norm": 0.23143555223941803,
"learning_rate": 0.00026360272282881816,
"loss": 0.824,
"step": 11350
},
{
"epoch": 0.12705442873040637,
"grad_norm": 0.24911652505397797,
"learning_rate": 0.000263374297592398,
"loss": 0.82,
"step": 11360
},
{
"epoch": 0.12716627241766906,
"grad_norm": 0.21931353211402893,
"learning_rate": 0.0002631458723559779,
"loss": 0.8194,
"step": 11370
},
{
"epoch": 0.12727811610493175,
"grad_norm": 0.2432345151901245,
"learning_rate": 0.0002629174471195578,
"loss": 0.8371,
"step": 11380
},
{
"epoch": 0.12738995979219442,
"grad_norm": 0.24188277125358582,
"learning_rate": 0.00026268902188313764,
"loss": 0.8096,
"step": 11390
},
{
"epoch": 0.1275018034794571,
"grad_norm": 0.2522214651107788,
"learning_rate": 0.0002624605966467175,
"loss": 0.8187,
"step": 11400
},
{
"epoch": 0.1276136471667198,
"grad_norm": 0.2596495449542999,
"learning_rate": 0.0002622321714102974,
"loss": 0.8138,
"step": 11410
},
{
"epoch": 0.12772549085398247,
"grad_norm": 0.2708049714565277,
"learning_rate": 0.00026200374617387726,
"loss": 0.8066,
"step": 11420
},
{
"epoch": 0.12783733454124516,
"grad_norm": 0.27820831537246704,
"learning_rate": 0.00026177532093745717,
"loss": 0.8112,
"step": 11430
},
{
"epoch": 0.12794917822850785,
"grad_norm": 0.23918400704860687,
"learning_rate": 0.0002615468957010371,
"loss": 0.8148,
"step": 11440
},
{
"epoch": 0.1280610219157705,
"grad_norm": 0.22054031491279602,
"learning_rate": 0.00026131847046461694,
"loss": 0.8183,
"step": 11450
},
{
"epoch": 0.1281728656030332,
"grad_norm": 0.25998455286026,
"learning_rate": 0.0002610900452281968,
"loss": 0.8242,
"step": 11460
},
{
"epoch": 0.1282847092902959,
"grad_norm": 0.26852914690971375,
"learning_rate": 0.0002608616199917767,
"loss": 0.8161,
"step": 11470
},
{
"epoch": 0.12839655297755856,
"grad_norm": 0.24028563499450684,
"learning_rate": 0.00026063319475535656,
"loss": 0.8083,
"step": 11480
},
{
"epoch": 0.12850839666482125,
"grad_norm": 0.24944745004177094,
"learning_rate": 0.0002604047695189364,
"loss": 0.8168,
"step": 11490
},
{
"epoch": 0.12862024035208391,
"grad_norm": 0.26595303416252136,
"learning_rate": 0.0002601763442825164,
"loss": 0.8178,
"step": 11500
},
{
"epoch": 0.1287320840393466,
"grad_norm": 0.24556541442871094,
"learning_rate": 0.00025994791904609623,
"loss": 0.8229,
"step": 11510
},
{
"epoch": 0.1288439277266093,
"grad_norm": 0.24716900289058685,
"learning_rate": 0.0002597194938096761,
"loss": 0.809,
"step": 11520
},
{
"epoch": 0.12895577141387196,
"grad_norm": 0.24745820462703705,
"learning_rate": 0.000259491068573256,
"loss": 0.8293,
"step": 11530
},
{
"epoch": 0.12906761510113465,
"grad_norm": 0.2732492983341217,
"learning_rate": 0.00025926264333683586,
"loss": 0.8,
"step": 11540
},
{
"epoch": 0.12917945878839734,
"grad_norm": 0.23239663243293762,
"learning_rate": 0.0002590342181004157,
"loss": 0.8175,
"step": 11550
},
{
"epoch": 0.12929130247566,
"grad_norm": 0.24953389167785645,
"learning_rate": 0.0002588057928639956,
"loss": 0.8152,
"step": 11560
},
{
"epoch": 0.1294031461629227,
"grad_norm": 0.25258156657218933,
"learning_rate": 0.0002585773676275755,
"loss": 0.8301,
"step": 11570
},
{
"epoch": 0.1295149898501854,
"grad_norm": 0.2609168291091919,
"learning_rate": 0.0002583489423911554,
"loss": 0.8197,
"step": 11580
},
{
"epoch": 0.12962683353744806,
"grad_norm": 0.2484872192144394,
"learning_rate": 0.0002581205171547353,
"loss": 0.8362,
"step": 11590
},
{
"epoch": 0.12973867722471075,
"grad_norm": 0.2833307385444641,
"learning_rate": 0.00025789209191831515,
"loss": 0.8338,
"step": 11600
},
{
"epoch": 0.12985052091197344,
"grad_norm": 0.24657459557056427,
"learning_rate": 0.000257663666681895,
"loss": 0.8205,
"step": 11610
},
{
"epoch": 0.1299623645992361,
"grad_norm": 0.2499598115682602,
"learning_rate": 0.0002574352414454749,
"loss": 0.8406,
"step": 11620
},
{
"epoch": 0.1300742082864988,
"grad_norm": 0.2757512629032135,
"learning_rate": 0.0002572068162090548,
"loss": 0.8247,
"step": 11630
},
{
"epoch": 0.13018605197376146,
"grad_norm": 0.25661805272102356,
"learning_rate": 0.00025697839097263463,
"loss": 0.8384,
"step": 11640
},
{
"epoch": 0.13029789566102415,
"grad_norm": 0.27651283144950867,
"learning_rate": 0.00025674996573621454,
"loss": 0.818,
"step": 11650
},
{
"epoch": 0.13040973934828684,
"grad_norm": 0.247050940990448,
"learning_rate": 0.00025652154049979445,
"loss": 0.8261,
"step": 11660
},
{
"epoch": 0.1305215830355495,
"grad_norm": 0.23124581575393677,
"learning_rate": 0.0002562931152633743,
"loss": 0.8259,
"step": 11670
},
{
"epoch": 0.1306334267228122,
"grad_norm": 0.2694045603275299,
"learning_rate": 0.0002560646900269542,
"loss": 0.8304,
"step": 11680
},
{
"epoch": 0.1307452704100749,
"grad_norm": 0.26821568608283997,
"learning_rate": 0.00025583626479053407,
"loss": 0.8441,
"step": 11690
},
{
"epoch": 0.13085711409733755,
"grad_norm": 0.2747989892959595,
"learning_rate": 0.0002556078395541139,
"loss": 0.841,
"step": 11700
},
{
"epoch": 0.13096895778460024,
"grad_norm": 0.28248855471611023,
"learning_rate": 0.00025537941431769384,
"loss": 0.857,
"step": 11710
},
{
"epoch": 0.13108080147186293,
"grad_norm": 0.25378182530403137,
"learning_rate": 0.0002551509890812737,
"loss": 0.8437,
"step": 11720
},
{
"epoch": 0.1311926451591256,
"grad_norm": 0.25950944423675537,
"learning_rate": 0.00025492256384485355,
"loss": 0.8497,
"step": 11730
},
{
"epoch": 0.1313044888463883,
"grad_norm": 0.26261699199676514,
"learning_rate": 0.0002546941386084335,
"loss": 0.8477,
"step": 11740
},
{
"epoch": 0.13141633253365098,
"grad_norm": 0.30151599645614624,
"learning_rate": 0.00025446571337201337,
"loss": 0.8405,
"step": 11750
},
{
"epoch": 0.13152817622091365,
"grad_norm": 0.2556060254573822,
"learning_rate": 0.0002542372881355932,
"loss": 0.831,
"step": 11760
},
{
"epoch": 0.13164001990817634,
"grad_norm": 0.26560309529304504,
"learning_rate": 0.00025400886289917313,
"loss": 0.8445,
"step": 11770
},
{
"epoch": 0.13175186359543903,
"grad_norm": 0.28504636883735657,
"learning_rate": 0.000253780437662753,
"loss": 0.8432,
"step": 11780
},
{
"epoch": 0.1318637072827017,
"grad_norm": 0.2985188663005829,
"learning_rate": 0.00025355201242633285,
"loss": 0.8584,
"step": 11790
},
{
"epoch": 0.13197555096996438,
"grad_norm": 0.28022414445877075,
"learning_rate": 0.00025332358718991276,
"loss": 0.8393,
"step": 11800
},
{
"epoch": 0.13208739465722705,
"grad_norm": 0.28535568714141846,
"learning_rate": 0.0002530951619534926,
"loss": 0.8369,
"step": 11810
},
{
"epoch": 0.13219923834448974,
"grad_norm": 0.27764952182769775,
"learning_rate": 0.0002528667367170725,
"loss": 0.8435,
"step": 11820
},
{
"epoch": 0.13231108203175243,
"grad_norm": 0.28943151235580444,
"learning_rate": 0.0002526383114806524,
"loss": 0.8334,
"step": 11830
},
{
"epoch": 0.1324229257190151,
"grad_norm": 0.28240668773651123,
"learning_rate": 0.0002524098862442323,
"loss": 0.8338,
"step": 11840
},
{
"epoch": 0.13253476940627779,
"grad_norm": 0.27650541067123413,
"learning_rate": 0.00025218146100781214,
"loss": 0.8275,
"step": 11850
},
{
"epoch": 0.13264661309354048,
"grad_norm": 0.27569788694381714,
"learning_rate": 0.000251953035771392,
"loss": 0.8323,
"step": 11860
},
{
"epoch": 0.13275845678080314,
"grad_norm": 0.29103782773017883,
"learning_rate": 0.0002517246105349719,
"loss": 0.8401,
"step": 11870
},
{
"epoch": 0.13287030046806583,
"grad_norm": 0.28769806027412415,
"learning_rate": 0.00025149618529855176,
"loss": 0.8369,
"step": 11880
},
{
"epoch": 0.13298214415532852,
"grad_norm": 0.2803378701210022,
"learning_rate": 0.0002512677600621316,
"loss": 0.8308,
"step": 11890
},
{
"epoch": 0.1330939878425912,
"grad_norm": 0.29264572262763977,
"learning_rate": 0.0002510393348257116,
"loss": 0.8314,
"step": 11900
},
{
"epoch": 0.13320583152985388,
"grad_norm": 0.27434802055358887,
"learning_rate": 0.00025081090958929144,
"loss": 0.8337,
"step": 11910
},
{
"epoch": 0.13331767521711657,
"grad_norm": 0.270589143037796,
"learning_rate": 0.0002505824843528713,
"loss": 0.8503,
"step": 11920
},
{
"epoch": 0.13342951890437924,
"grad_norm": 0.27260124683380127,
"learning_rate": 0.0002503540591164512,
"loss": 0.8293,
"step": 11930
},
{
"epoch": 0.13354136259164193,
"grad_norm": 0.2684808075428009,
"learning_rate": 0.00025012563388003106,
"loss": 0.8339,
"step": 11940
},
{
"epoch": 0.1336532062789046,
"grad_norm": 0.2510156035423279,
"learning_rate": 0.00024989720864361097,
"loss": 0.8464,
"step": 11950
},
{
"epoch": 0.13376504996616728,
"grad_norm": 0.24331960082054138,
"learning_rate": 0.0002496687834071908,
"loss": 0.8443,
"step": 11960
},
{
"epoch": 0.13387689365342997,
"grad_norm": 0.2688249349594116,
"learning_rate": 0.00024944035817077074,
"loss": 0.8483,
"step": 11970
},
{
"epoch": 0.13398873734069264,
"grad_norm": 0.2608729898929596,
"learning_rate": 0.0002492119329343506,
"loss": 0.852,
"step": 11980
},
{
"epoch": 0.13410058102795533,
"grad_norm": 0.28415507078170776,
"learning_rate": 0.00024898350769793045,
"loss": 0.8449,
"step": 11990
},
{
"epoch": 0.13421242471521802,
"grad_norm": 0.2920886278152466,
"learning_rate": 0.00024875508246151036,
"loss": 0.8281,
"step": 12000
},
{
"epoch": 0.13432426840248068,
"grad_norm": 0.2763430178165436,
"learning_rate": 0.00024852665722509027,
"loss": 0.8492,
"step": 12010
},
{
"epoch": 0.13443611208974338,
"grad_norm": 0.26460400223731995,
"learning_rate": 0.0002482982319886701,
"loss": 0.8409,
"step": 12020
},
{
"epoch": 0.13454795577700607,
"grad_norm": 0.2698183059692383,
"learning_rate": 0.00024806980675225,
"loss": 0.8295,
"step": 12030
},
{
"epoch": 0.13465979946426873,
"grad_norm": 0.2728478014469147,
"learning_rate": 0.0002478413815158299,
"loss": 0.837,
"step": 12040
},
{
"epoch": 0.13477164315153142,
"grad_norm": 0.282924085855484,
"learning_rate": 0.00024761295627940974,
"loss": 0.8482,
"step": 12050
},
{
"epoch": 0.13488348683879411,
"grad_norm": 0.264614999294281,
"learning_rate": 0.00024738453104298965,
"loss": 0.8432,
"step": 12060
},
{
"epoch": 0.13499533052605678,
"grad_norm": 0.2475707232952118,
"learning_rate": 0.0002471561058065695,
"loss": 0.8387,
"step": 12070
},
{
"epoch": 0.13510717421331947,
"grad_norm": 0.2620779573917389,
"learning_rate": 0.00024692768057014937,
"loss": 0.8559,
"step": 12080
},
{
"epoch": 0.13521901790058213,
"grad_norm": 0.2645311951637268,
"learning_rate": 0.0002466992553337293,
"loss": 0.8363,
"step": 12090
},
{
"epoch": 0.13533086158784483,
"grad_norm": 0.27586236596107483,
"learning_rate": 0.0002464708300973092,
"loss": 0.8365,
"step": 12100
},
{
"epoch": 0.13544270527510752,
"grad_norm": 0.2695125341415405,
"learning_rate": 0.00024624240486088904,
"loss": 0.8412,
"step": 12110
},
{
"epoch": 0.13555454896237018,
"grad_norm": 0.2473846971988678,
"learning_rate": 0.0002460139796244689,
"loss": 0.8362,
"step": 12120
},
{
"epoch": 0.13566639264963287,
"grad_norm": 0.28001588582992554,
"learning_rate": 0.0002457855543880488,
"loss": 0.8462,
"step": 12130
},
{
"epoch": 0.13577823633689556,
"grad_norm": 0.29486599564552307,
"learning_rate": 0.00024555712915162866,
"loss": 0.8607,
"step": 12140
},
{
"epoch": 0.13589008002415823,
"grad_norm": 0.2761843204498291,
"learning_rate": 0.00024532870391520857,
"loss": 0.8668,
"step": 12150
},
{
"epoch": 0.13600192371142092,
"grad_norm": 0.25779953598976135,
"learning_rate": 0.00024510027867878843,
"loss": 0.853,
"step": 12160
},
{
"epoch": 0.1361137673986836,
"grad_norm": 0.27593857049942017,
"learning_rate": 0.00024487185344236834,
"loss": 0.8506,
"step": 12170
},
{
"epoch": 0.13622561108594627,
"grad_norm": 0.24426791071891785,
"learning_rate": 0.0002446434282059482,
"loss": 0.8623,
"step": 12180
},
{
"epoch": 0.13633745477320897,
"grad_norm": 0.25555628538131714,
"learning_rate": 0.00024441500296952805,
"loss": 0.8493,
"step": 12190
},
{
"epoch": 0.13644929846047166,
"grad_norm": 0.2234913557767868,
"learning_rate": 0.00024418657773310796,
"loss": 0.8644,
"step": 12200
},
{
"epoch": 0.13656114214773432,
"grad_norm": 0.27130651473999023,
"learning_rate": 0.00024395815249668784,
"loss": 0.8791,
"step": 12210
},
{
"epoch": 0.136672985834997,
"grad_norm": 0.24734824895858765,
"learning_rate": 0.0002437297272602677,
"loss": 0.8719,
"step": 12220
},
{
"epoch": 0.13678482952225968,
"grad_norm": 0.24316945672035217,
"learning_rate": 0.0002435013020238476,
"loss": 0.8546,
"step": 12230
},
{
"epoch": 0.13689667320952237,
"grad_norm": 0.2349976748228073,
"learning_rate": 0.0002432728767874275,
"loss": 0.8458,
"step": 12240
},
{
"epoch": 0.13700851689678506,
"grad_norm": 0.26791033148765564,
"learning_rate": 0.00024304445155100735,
"loss": 0.8485,
"step": 12250
},
{
"epoch": 0.13712036058404772,
"grad_norm": 0.23598451912403107,
"learning_rate": 0.00024281602631458723,
"loss": 0.8451,
"step": 12260
},
{
"epoch": 0.13723220427131042,
"grad_norm": 0.23012129962444305,
"learning_rate": 0.00024258760107816714,
"loss": 0.8332,
"step": 12270
},
{
"epoch": 0.1373440479585731,
"grad_norm": 0.22834524512290955,
"learning_rate": 0.000242359175841747,
"loss": 0.8203,
"step": 12280
},
{
"epoch": 0.13745589164583577,
"grad_norm": 0.2247861921787262,
"learning_rate": 0.00024213075060532688,
"loss": 0.8303,
"step": 12290
},
{
"epoch": 0.13756773533309846,
"grad_norm": 0.2438284307718277,
"learning_rate": 0.00024190232536890676,
"loss": 0.8216,
"step": 12300
},
{
"epoch": 0.13767957902036115,
"grad_norm": 0.24075888097286224,
"learning_rate": 0.00024167390013248664,
"loss": 0.7964,
"step": 12310
},
{
"epoch": 0.13779142270762382,
"grad_norm": 0.24668976664543152,
"learning_rate": 0.00024144547489606653,
"loss": 0.8028,
"step": 12320
},
{
"epoch": 0.1379032663948865,
"grad_norm": 0.26727405190467834,
"learning_rate": 0.0002412170496596464,
"loss": 0.8081,
"step": 12330
},
{
"epoch": 0.1380151100821492,
"grad_norm": 0.2645564377307892,
"learning_rate": 0.00024098862442322626,
"loss": 0.8116,
"step": 12340
},
{
"epoch": 0.13812695376941186,
"grad_norm": 0.25368645787239075,
"learning_rate": 0.00024076019918680617,
"loss": 0.8105,
"step": 12350
},
{
"epoch": 0.13823879745667456,
"grad_norm": 0.26823967695236206,
"learning_rate": 0.00024053177395038606,
"loss": 0.8249,
"step": 12360
},
{
"epoch": 0.13835064114393722,
"grad_norm": 0.2827225625514984,
"learning_rate": 0.0002403033487139659,
"loss": 0.8191,
"step": 12370
},
{
"epoch": 0.1384624848311999,
"grad_norm": 0.23261433839797974,
"learning_rate": 0.00024007492347754582,
"loss": 0.8215,
"step": 12380
},
{
"epoch": 0.1385743285184626,
"grad_norm": 0.27331966161727905,
"learning_rate": 0.00023984649824112568,
"loss": 0.8232,
"step": 12390
},
{
"epoch": 0.13868617220572527,
"grad_norm": 0.2801966369152069,
"learning_rate": 0.00023961807300470556,
"loss": 0.8074,
"step": 12400
},
{
"epoch": 0.13879801589298796,
"grad_norm": 0.2379591315984726,
"learning_rate": 0.00023938964776828544,
"loss": 0.8209,
"step": 12410
},
{
"epoch": 0.13890985958025065,
"grad_norm": 0.27151694893836975,
"learning_rate": 0.00023916122253186533,
"loss": 0.8258,
"step": 12420
},
{
"epoch": 0.1390217032675133,
"grad_norm": 0.21429865062236786,
"learning_rate": 0.0002389327972954452,
"loss": 0.8178,
"step": 12430
},
{
"epoch": 0.139133546954776,
"grad_norm": 0.2777722477912903,
"learning_rate": 0.0002387043720590251,
"loss": 0.826,
"step": 12440
},
{
"epoch": 0.1392453906420387,
"grad_norm": 0.2514742910861969,
"learning_rate": 0.00023847594682260495,
"loss": 0.8362,
"step": 12450
},
{
"epoch": 0.13935723432930136,
"grad_norm": 0.23247656226158142,
"learning_rate": 0.00023824752158618486,
"loss": 0.8049,
"step": 12460
},
{
"epoch": 0.13946907801656405,
"grad_norm": 0.2391313910484314,
"learning_rate": 0.00023801909634976474,
"loss": 0.8082,
"step": 12470
},
{
"epoch": 0.13958092170382674,
"grad_norm": 0.2366340011358261,
"learning_rate": 0.0002377906711133446,
"loss": 0.8214,
"step": 12480
},
{
"epoch": 0.1396927653910894,
"grad_norm": 0.2570713758468628,
"learning_rate": 0.00023756224587692448,
"loss": 0.827,
"step": 12490
},
{
"epoch": 0.1398046090783521,
"grad_norm": 0.22823789715766907,
"learning_rate": 0.0002373338206405044,
"loss": 0.8314,
"step": 12500
},
{
"epoch": 0.1399164527656148,
"grad_norm": 0.24660278856754303,
"learning_rate": 0.00023710539540408424,
"loss": 0.838,
"step": 12510
},
{
"epoch": 0.14002829645287745,
"grad_norm": 0.25041723251342773,
"learning_rate": 0.00023687697016766413,
"loss": 0.8371,
"step": 12520
},
{
"epoch": 0.14014014014014015,
"grad_norm": 0.23942531645298004,
"learning_rate": 0.000236648544931244,
"loss": 0.8282,
"step": 12530
},
{
"epoch": 0.1402519838274028,
"grad_norm": 0.2445865273475647,
"learning_rate": 0.0002364201196948239,
"loss": 0.8307,
"step": 12540
},
{
"epoch": 0.1403638275146655,
"grad_norm": 0.25278452038764954,
"learning_rate": 0.00023619169445840378,
"loss": 0.8483,
"step": 12550
},
{
"epoch": 0.1404756712019282,
"grad_norm": 0.22890037298202515,
"learning_rate": 0.00023596326922198366,
"loss": 0.8328,
"step": 12560
},
{
"epoch": 0.14058751488919086,
"grad_norm": 0.2360977679491043,
"learning_rate": 0.00023573484398556351,
"loss": 0.8373,
"step": 12570
},
{
"epoch": 0.14069935857645355,
"grad_norm": 0.22873692214488983,
"learning_rate": 0.00023550641874914342,
"loss": 0.8399,
"step": 12580
},
{
"epoch": 0.14081120226371624,
"grad_norm": 0.228402242064476,
"learning_rate": 0.0002352779935127233,
"loss": 0.8272,
"step": 12590
},
{
"epoch": 0.1409230459509789,
"grad_norm": 0.2625369131565094,
"learning_rate": 0.00023504956827630316,
"loss": 0.8413,
"step": 12600
},
{
"epoch": 0.1410348896382416,
"grad_norm": 0.2744843363761902,
"learning_rate": 0.00023482114303988305,
"loss": 0.823,
"step": 12610
},
{
"epoch": 0.1411467333255043,
"grad_norm": 0.24845914542675018,
"learning_rate": 0.00023459271780346293,
"loss": 0.8089,
"step": 12620
},
{
"epoch": 0.14125857701276695,
"grad_norm": 0.2431713193655014,
"learning_rate": 0.0002343642925670428,
"loss": 0.8204,
"step": 12630
},
{
"epoch": 0.14137042070002964,
"grad_norm": 0.2636731266975403,
"learning_rate": 0.0002341358673306227,
"loss": 0.8241,
"step": 12640
},
{
"epoch": 0.14148226438729233,
"grad_norm": 0.24605631828308105,
"learning_rate": 0.00023390744209420255,
"loss": 0.837,
"step": 12650
},
{
"epoch": 0.141594108074555,
"grad_norm": 0.25722581148147583,
"learning_rate": 0.00023367901685778246,
"loss": 0.8338,
"step": 12660
},
{
"epoch": 0.1417059517618177,
"grad_norm": 0.2628157138824463,
"learning_rate": 0.00023345059162136234,
"loss": 0.8271,
"step": 12670
},
{
"epoch": 0.14181779544908035,
"grad_norm": 0.24534687399864197,
"learning_rate": 0.0002332221663849422,
"loss": 0.8281,
"step": 12680
},
{
"epoch": 0.14192963913634304,
"grad_norm": 0.24370639026165009,
"learning_rate": 0.00023299374114852208,
"loss": 0.8243,
"step": 12690
},
{
"epoch": 0.14204148282360574,
"grad_norm": 0.2993674576282501,
"learning_rate": 0.000232765315912102,
"loss": 0.8191,
"step": 12700
},
{
"epoch": 0.1421533265108684,
"grad_norm": 0.2372383326292038,
"learning_rate": 0.00023253689067568185,
"loss": 0.8115,
"step": 12710
},
{
"epoch": 0.1422651701981311,
"grad_norm": 0.2405237853527069,
"learning_rate": 0.00023230846543926173,
"loss": 0.8012,
"step": 12720
},
{
"epoch": 0.14237701388539378,
"grad_norm": 0.23501497507095337,
"learning_rate": 0.0002320800402028416,
"loss": 0.8272,
"step": 12730
},
{
"epoch": 0.14248885757265645,
"grad_norm": 0.2573966085910797,
"learning_rate": 0.0002318516149664215,
"loss": 0.8231,
"step": 12740
},
{
"epoch": 0.14260070125991914,
"grad_norm": 0.25884565711021423,
"learning_rate": 0.00023162318973000138,
"loss": 0.8293,
"step": 12750
},
{
"epoch": 0.14271254494718183,
"grad_norm": 0.24788953363895416,
"learning_rate": 0.00023139476449358126,
"loss": 0.8338,
"step": 12760
},
{
"epoch": 0.1428243886344445,
"grad_norm": 0.23874413967132568,
"learning_rate": 0.00023116633925716112,
"loss": 0.8184,
"step": 12770
},
{
"epoch": 0.14293623232170719,
"grad_norm": 0.2358027547597885,
"learning_rate": 0.00023093791402074103,
"loss": 0.8143,
"step": 12780
},
{
"epoch": 0.14304807600896988,
"grad_norm": 0.22447925806045532,
"learning_rate": 0.0002307094887843209,
"loss": 0.8093,
"step": 12790
},
{
"epoch": 0.14315991969623254,
"grad_norm": 0.25550246238708496,
"learning_rate": 0.00023048106354790077,
"loss": 0.8178,
"step": 12800
},
{
"epoch": 0.14327176338349523,
"grad_norm": 0.2370327264070511,
"learning_rate": 0.00023025263831148065,
"loss": 0.8035,
"step": 12810
},
{
"epoch": 0.1433836070707579,
"grad_norm": 0.24910229444503784,
"learning_rate": 0.00023002421307506056,
"loss": 0.7965,
"step": 12820
},
{
"epoch": 0.1434954507580206,
"grad_norm": 0.23592302203178406,
"learning_rate": 0.0002297957878386404,
"loss": 0.808,
"step": 12830
},
{
"epoch": 0.14360729444528328,
"grad_norm": 0.24010522663593292,
"learning_rate": 0.0002295673626022203,
"loss": 0.8047,
"step": 12840
},
{
"epoch": 0.14371913813254594,
"grad_norm": 0.26334619522094727,
"learning_rate": 0.00022933893736580015,
"loss": 0.8011,
"step": 12850
},
{
"epoch": 0.14383098181980863,
"grad_norm": 0.23162928223609924,
"learning_rate": 0.00022911051212938006,
"loss": 0.811,
"step": 12860
},
{
"epoch": 0.14394282550707133,
"grad_norm": 0.24273565411567688,
"learning_rate": 0.00022888208689295994,
"loss": 0.8249,
"step": 12870
},
{
"epoch": 0.144054669194334,
"grad_norm": 0.239716574549675,
"learning_rate": 0.0002286536616565398,
"loss": 0.8146,
"step": 12880
},
{
"epoch": 0.14416651288159668,
"grad_norm": 0.22947145998477936,
"learning_rate": 0.0002284252364201197,
"loss": 0.8037,
"step": 12890
},
{
"epoch": 0.14427835656885937,
"grad_norm": 0.2369975745677948,
"learning_rate": 0.0002281968111836996,
"loss": 0.7938,
"step": 12900
},
{
"epoch": 0.14439020025612204,
"grad_norm": 0.23150302469730377,
"learning_rate": 0.00022796838594727945,
"loss": 0.7971,
"step": 12910
},
{
"epoch": 0.14450204394338473,
"grad_norm": 0.25659120082855225,
"learning_rate": 0.00022773996071085933,
"loss": 0.7897,
"step": 12920
},
{
"epoch": 0.14461388763064742,
"grad_norm": 0.26838308572769165,
"learning_rate": 0.00022751153547443924,
"loss": 0.8025,
"step": 12930
},
{
"epoch": 0.14472573131791008,
"grad_norm": 0.2421617954969406,
"learning_rate": 0.0002272831102380191,
"loss": 0.7937,
"step": 12940
},
{
"epoch": 0.14483757500517278,
"grad_norm": 0.22780479490756989,
"learning_rate": 0.00022705468500159898,
"loss": 0.7861,
"step": 12950
},
{
"epoch": 0.14494941869243544,
"grad_norm": 0.2561044692993164,
"learning_rate": 0.00022682625976517886,
"loss": 0.7817,
"step": 12960
},
{
"epoch": 0.14506126237969813,
"grad_norm": 0.24073092639446259,
"learning_rate": 0.00022659783452875875,
"loss": 0.8024,
"step": 12970
},
{
"epoch": 0.14517310606696082,
"grad_norm": 0.24959658086299896,
"learning_rate": 0.00022636940929233863,
"loss": 0.7994,
"step": 12980
},
{
"epoch": 0.14528494975422349,
"grad_norm": 0.2711149752140045,
"learning_rate": 0.0002261409840559185,
"loss": 0.8011,
"step": 12990
},
{
"epoch": 0.14539679344148618,
"grad_norm": 0.2447725236415863,
"learning_rate": 0.00022591255881949837,
"loss": 0.7957,
"step": 13000
},
{
"epoch": 0.14550863712874887,
"grad_norm": 0.26505330204963684,
"learning_rate": 0.00022568413358307828,
"loss": 0.7932,
"step": 13010
},
{
"epoch": 0.14562048081601153,
"grad_norm": 0.256712943315506,
"learning_rate": 0.00022545570834665816,
"loss": 0.7919,
"step": 13020
},
{
"epoch": 0.14573232450327422,
"grad_norm": 0.23816627264022827,
"learning_rate": 0.00022522728311023802,
"loss": 0.7942,
"step": 13030
},
{
"epoch": 0.14584416819053692,
"grad_norm": 0.25607794523239136,
"learning_rate": 0.0002249988578738179,
"loss": 0.8058,
"step": 13040
},
{
"epoch": 0.14595601187779958,
"grad_norm": 0.2644692361354828,
"learning_rate": 0.0002247704326373978,
"loss": 0.8026,
"step": 13050
},
{
"epoch": 0.14606785556506227,
"grad_norm": 0.24160505831241608,
"learning_rate": 0.00022454200740097766,
"loss": 0.8013,
"step": 13060
},
{
"epoch": 0.14617969925232496,
"grad_norm": 0.25321200489997864,
"learning_rate": 0.00022431358216455755,
"loss": 0.802,
"step": 13070
},
{
"epoch": 0.14629154293958763,
"grad_norm": 0.38834208250045776,
"learning_rate": 0.0002240851569281374,
"loss": 0.8053,
"step": 13080
},
{
"epoch": 0.14640338662685032,
"grad_norm": 0.2638767957687378,
"learning_rate": 0.0002238567316917173,
"loss": 0.803,
"step": 13090
},
{
"epoch": 0.14651523031411298,
"grad_norm": 0.33412685990333557,
"learning_rate": 0.0002236283064552972,
"loss": 0.8091,
"step": 13100
},
{
"epoch": 0.14662707400137567,
"grad_norm": 0.27539852261543274,
"learning_rate": 0.00022339988121887705,
"loss": 0.8019,
"step": 13110
},
{
"epoch": 0.14673891768863837,
"grad_norm": 0.25128626823425293,
"learning_rate": 0.00022317145598245693,
"loss": 0.7961,
"step": 13120
},
{
"epoch": 0.14685076137590103,
"grad_norm": 0.27428579330444336,
"learning_rate": 0.00022294303074603684,
"loss": 0.792,
"step": 13130
},
{
"epoch": 0.14696260506316372,
"grad_norm": 0.25421425700187683,
"learning_rate": 0.0002227146055096167,
"loss": 0.8139,
"step": 13140
},
{
"epoch": 0.1470744487504264,
"grad_norm": 0.23709440231323242,
"learning_rate": 0.00022248618027319658,
"loss": 0.8147,
"step": 13150
},
{
"epoch": 0.14718629243768908,
"grad_norm": 0.2693617641925812,
"learning_rate": 0.00022225775503677646,
"loss": 0.8174,
"step": 13160
},
{
"epoch": 0.14729813612495177,
"grad_norm": 0.26674261689186096,
"learning_rate": 0.00022202932980035635,
"loss": 0.8105,
"step": 13170
},
{
"epoch": 0.14740997981221446,
"grad_norm": 0.2656268775463104,
"learning_rate": 0.00022180090456393623,
"loss": 0.8355,
"step": 13180
},
{
"epoch": 0.14752182349947712,
"grad_norm": 0.2587822377681732,
"learning_rate": 0.0002215724793275161,
"loss": 0.8311,
"step": 13190
},
{
"epoch": 0.14763366718673981,
"grad_norm": 0.29723209142684937,
"learning_rate": 0.00022134405409109597,
"loss": 0.8664,
"step": 13200
},
{
"epoch": 0.1477455108740025,
"grad_norm": 0.2579325735569,
"learning_rate": 0.00022111562885467588,
"loss": 0.8515,
"step": 13210
},
{
"epoch": 0.14785735456126517,
"grad_norm": 0.28357258439064026,
"learning_rate": 0.00022088720361825576,
"loss": 0.8562,
"step": 13220
},
{
"epoch": 0.14796919824852786,
"grad_norm": 0.26742318272590637,
"learning_rate": 0.00022065877838183562,
"loss": 0.8571,
"step": 13230
},
{
"epoch": 0.14808104193579055,
"grad_norm": 0.2750874161720276,
"learning_rate": 0.0002204303531454155,
"loss": 0.8449,
"step": 13240
},
{
"epoch": 0.14819288562305322,
"grad_norm": 0.3043031692504883,
"learning_rate": 0.0002202019279089954,
"loss": 0.8472,
"step": 13250
},
{
"epoch": 0.1483047293103159,
"grad_norm": 0.27216988801956177,
"learning_rate": 0.00021997350267257527,
"loss": 0.8732,
"step": 13260
},
{
"epoch": 0.14841657299757857,
"grad_norm": 0.2818603515625,
"learning_rate": 0.00021974507743615515,
"loss": 0.8333,
"step": 13270
},
{
"epoch": 0.14852841668484126,
"grad_norm": 0.2604407072067261,
"learning_rate": 0.000219516652199735,
"loss": 0.8467,
"step": 13280
},
{
"epoch": 0.14864026037210396,
"grad_norm": 0.28342294692993164,
"learning_rate": 0.00021928822696331491,
"loss": 0.8292,
"step": 13290
},
{
"epoch": 0.14875210405936662,
"grad_norm": 0.2564396262168884,
"learning_rate": 0.0002190598017268948,
"loss": 0.8355,
"step": 13300
},
{
"epoch": 0.1488639477466293,
"grad_norm": 0.2528108060359955,
"learning_rate": 0.00021883137649047465,
"loss": 0.8269,
"step": 13310
},
{
"epoch": 0.148975791433892,
"grad_norm": 0.26454785466194153,
"learning_rate": 0.00021860295125405456,
"loss": 0.8425,
"step": 13320
},
{
"epoch": 0.14908763512115467,
"grad_norm": 0.25204601883888245,
"learning_rate": 0.00021837452601763445,
"loss": 0.8251,
"step": 13330
},
{
"epoch": 0.14919947880841736,
"grad_norm": 0.24680152535438538,
"learning_rate": 0.0002181461007812143,
"loss": 0.8247,
"step": 13340
},
{
"epoch": 0.14931132249568005,
"grad_norm": 0.27356913685798645,
"learning_rate": 0.00021791767554479418,
"loss": 0.811,
"step": 13350
},
{
"epoch": 0.1494231661829427,
"grad_norm": 0.24703428149223328,
"learning_rate": 0.0002176892503083741,
"loss": 0.8145,
"step": 13360
},
{
"epoch": 0.1495350098702054,
"grad_norm": 0.27793166041374207,
"learning_rate": 0.00021746082507195395,
"loss": 0.8162,
"step": 13370
},
{
"epoch": 0.1496468535574681,
"grad_norm": 0.28826582431793213,
"learning_rate": 0.00021723239983553383,
"loss": 0.8258,
"step": 13380
},
{
"epoch": 0.14975869724473076,
"grad_norm": 0.24826544523239136,
"learning_rate": 0.00021700397459911372,
"loss": 0.8131,
"step": 13390
},
{
"epoch": 0.14987054093199345,
"grad_norm": 0.29015326499938965,
"learning_rate": 0.0002167755493626936,
"loss": 0.8241,
"step": 13400
},
{
"epoch": 0.14998238461925611,
"grad_norm": 0.2692265510559082,
"learning_rate": 0.00021654712412627348,
"loss": 0.8046,
"step": 13410
},
{
"epoch": 0.1500942283065188,
"grad_norm": 0.28277263045310974,
"learning_rate": 0.00021631869888985336,
"loss": 0.8075,
"step": 13420
},
{
"epoch": 0.1502060719937815,
"grad_norm": 0.25920721888542175,
"learning_rate": 0.00021609027365343322,
"loss": 0.8146,
"step": 13430
},
{
"epoch": 0.15031791568104416,
"grad_norm": 0.2548248767852783,
"learning_rate": 0.00021586184841701313,
"loss": 0.82,
"step": 13440
},
{
"epoch": 0.15042975936830685,
"grad_norm": 0.3121783435344696,
"learning_rate": 0.000215633423180593,
"loss": 0.796,
"step": 13450
},
{
"epoch": 0.15054160305556955,
"grad_norm": 0.2799825370311737,
"learning_rate": 0.00021540499794417287,
"loss": 0.8073,
"step": 13460
},
{
"epoch": 0.1506534467428322,
"grad_norm": 0.24525675177574158,
"learning_rate": 0.00021517657270775275,
"loss": 0.804,
"step": 13470
},
{
"epoch": 0.1507652904300949,
"grad_norm": 0.26799294352531433,
"learning_rate": 0.00021494814747133266,
"loss": 0.8086,
"step": 13480
},
{
"epoch": 0.1508771341173576,
"grad_norm": 0.24744056165218353,
"learning_rate": 0.00021471972223491252,
"loss": 0.7972,
"step": 13490
},
{
"epoch": 0.15098897780462026,
"grad_norm": 0.27284878492355347,
"learning_rate": 0.0002144912969984924,
"loss": 0.8048,
"step": 13500
},
{
"epoch": 0.15110082149188295,
"grad_norm": 0.2427281141281128,
"learning_rate": 0.00021426287176207225,
"loss": 0.8043,
"step": 13510
},
{
"epoch": 0.15121266517914564,
"grad_norm": 0.27432921528816223,
"learning_rate": 0.00021403444652565216,
"loss": 0.8198,
"step": 13520
},
{
"epoch": 0.1513245088664083,
"grad_norm": 0.26843661069869995,
"learning_rate": 0.00021380602128923205,
"loss": 0.8156,
"step": 13530
},
{
"epoch": 0.151436352553671,
"grad_norm": 0.2460176795721054,
"learning_rate": 0.0002135775960528119,
"loss": 0.806,
"step": 13540
},
{
"epoch": 0.15154819624093366,
"grad_norm": 0.24147658050060272,
"learning_rate": 0.00021334917081639179,
"loss": 0.8146,
"step": 13550
},
{
"epoch": 0.15166003992819635,
"grad_norm": 0.2715270221233368,
"learning_rate": 0.0002131207455799717,
"loss": 0.8065,
"step": 13560
},
{
"epoch": 0.15177188361545904,
"grad_norm": 0.2851991653442383,
"learning_rate": 0.00021289232034355155,
"loss": 0.8042,
"step": 13570
},
{
"epoch": 0.1518837273027217,
"grad_norm": 0.2779170870780945,
"learning_rate": 0.00021266389510713143,
"loss": 0.8163,
"step": 13580
},
{
"epoch": 0.1519955709899844,
"grad_norm": 0.2853197455406189,
"learning_rate": 0.00021243546987071132,
"loss": 0.8025,
"step": 13590
},
{
"epoch": 0.1521074146772471,
"grad_norm": 0.2753603160381317,
"learning_rate": 0.0002122070446342912,
"loss": 0.8187,
"step": 13600
},
{
"epoch": 0.15221925836450975,
"grad_norm": 0.29546552896499634,
"learning_rate": 0.00021197861939787108,
"loss": 0.8189,
"step": 13610
},
{
"epoch": 0.15233110205177244,
"grad_norm": 0.2799798250198364,
"learning_rate": 0.00021175019416145097,
"loss": 0.8098,
"step": 13620
},
{
"epoch": 0.15244294573903514,
"grad_norm": 0.23527085781097412,
"learning_rate": 0.00021152176892503082,
"loss": 0.8212,
"step": 13630
},
{
"epoch": 0.1525547894262978,
"grad_norm": 0.27207401394844055,
"learning_rate": 0.00021129334368861073,
"loss": 0.808,
"step": 13640
},
{
"epoch": 0.1526666331135605,
"grad_norm": 0.26520609855651855,
"learning_rate": 0.00021106491845219061,
"loss": 0.8133,
"step": 13650
},
{
"epoch": 0.15277847680082318,
"grad_norm": 0.2750151455402374,
"learning_rate": 0.00021083649321577047,
"loss": 0.8248,
"step": 13660
},
{
"epoch": 0.15289032048808585,
"grad_norm": 0.28339120745658875,
"learning_rate": 0.00021060806797935035,
"loss": 0.8175,
"step": 13670
},
{
"epoch": 0.15300216417534854,
"grad_norm": 0.27611440420150757,
"learning_rate": 0.00021037964274293026,
"loss": 0.8232,
"step": 13680
},
{
"epoch": 0.1531140078626112,
"grad_norm": 0.264113187789917,
"learning_rate": 0.00021015121750651012,
"loss": 0.8217,
"step": 13690
},
{
"epoch": 0.1532258515498739,
"grad_norm": 0.27031853795051575,
"learning_rate": 0.00020992279227009,
"loss": 0.8242,
"step": 13700
},
{
"epoch": 0.15333769523713658,
"grad_norm": 0.2753359079360962,
"learning_rate": 0.00020969436703366988,
"loss": 0.8311,
"step": 13710
},
{
"epoch": 0.15344953892439925,
"grad_norm": 0.24859648942947388,
"learning_rate": 0.00020946594179724977,
"loss": 0.8285,
"step": 13720
},
{
"epoch": 0.15356138261166194,
"grad_norm": 0.2773294448852539,
"learning_rate": 0.00020923751656082965,
"loss": 0.8201,
"step": 13730
},
{
"epoch": 0.15367322629892463,
"grad_norm": 0.23855488002300262,
"learning_rate": 0.0002090090913244095,
"loss": 0.8145,
"step": 13740
},
{
"epoch": 0.1537850699861873,
"grad_norm": 0.27641457319259644,
"learning_rate": 0.0002087806660879894,
"loss": 0.8233,
"step": 13750
},
{
"epoch": 0.15389691367345,
"grad_norm": 0.26556023955345154,
"learning_rate": 0.0002085522408515693,
"loss": 0.8309,
"step": 13760
},
{
"epoch": 0.15400875736071268,
"grad_norm": 0.2980164885520935,
"learning_rate": 0.00020832381561514915,
"loss": 0.8585,
"step": 13770
},
{
"epoch": 0.15412060104797534,
"grad_norm": 0.21802592277526855,
"learning_rate": 0.00020809539037872904,
"loss": 0.8385,
"step": 13780
},
{
"epoch": 0.15423244473523803,
"grad_norm": 0.3153620958328247,
"learning_rate": 0.00020786696514230895,
"loss": 0.8423,
"step": 13790
},
{
"epoch": 0.15434428842250072,
"grad_norm": 0.2928372621536255,
"learning_rate": 0.0002076385399058888,
"loss": 0.8399,
"step": 13800
},
{
"epoch": 0.1544561321097634,
"grad_norm": 0.3015557527542114,
"learning_rate": 0.00020741011466946868,
"loss": 0.843,
"step": 13810
},
{
"epoch": 0.15456797579702608,
"grad_norm": 0.2243575006723404,
"learning_rate": 0.00020718168943304857,
"loss": 0.8302,
"step": 13820
},
{
"epoch": 0.15467981948428874,
"grad_norm": 0.23281534016132355,
"learning_rate": 0.00020695326419662845,
"loss": 0.8268,
"step": 13830
},
{
"epoch": 0.15479166317155144,
"grad_norm": 0.2412877380847931,
"learning_rate": 0.00020672483896020833,
"loss": 0.849,
"step": 13840
},
{
"epoch": 0.15490350685881413,
"grad_norm": 0.2762492001056671,
"learning_rate": 0.00020649641372378822,
"loss": 0.8324,
"step": 13850
},
{
"epoch": 0.1550153505460768,
"grad_norm": 0.27976560592651367,
"learning_rate": 0.00020626798848736807,
"loss": 0.843,
"step": 13860
},
{
"epoch": 0.15512719423333948,
"grad_norm": 0.29076194763183594,
"learning_rate": 0.00020603956325094798,
"loss": 0.8575,
"step": 13870
},
{
"epoch": 0.15523903792060217,
"grad_norm": 0.2367868423461914,
"learning_rate": 0.00020581113801452786,
"loss": 0.8465,
"step": 13880
},
{
"epoch": 0.15535088160786484,
"grad_norm": 0.26191186904907227,
"learning_rate": 0.00020558271277810772,
"loss": 0.8291,
"step": 13890
},
{
"epoch": 0.15546272529512753,
"grad_norm": 0.27254414558410645,
"learning_rate": 0.0002053542875416876,
"loss": 0.8347,
"step": 13900
},
{
"epoch": 0.15557456898239022,
"grad_norm": 0.2718988060951233,
"learning_rate": 0.0002051258623052675,
"loss": 0.8319,
"step": 13910
},
{
"epoch": 0.15568641266965288,
"grad_norm": 0.24478264153003693,
"learning_rate": 0.00020489743706884737,
"loss": 0.8369,
"step": 13920
},
{
"epoch": 0.15579825635691558,
"grad_norm": 0.27791038155555725,
"learning_rate": 0.00020466901183242725,
"loss": 0.8486,
"step": 13930
},
{
"epoch": 0.15591010004417827,
"grad_norm": 0.27220630645751953,
"learning_rate": 0.00020444058659600713,
"loss": 0.8335,
"step": 13940
},
{
"epoch": 0.15602194373144093,
"grad_norm": 0.2945479154586792,
"learning_rate": 0.00020421216135958702,
"loss": 0.8234,
"step": 13950
},
{
"epoch": 0.15613378741870362,
"grad_norm": 0.2911258041858673,
"learning_rate": 0.0002039837361231669,
"loss": 0.8279,
"step": 13960
},
{
"epoch": 0.15624563110596631,
"grad_norm": 0.3039700984954834,
"learning_rate": 0.00020375531088674676,
"loss": 0.8409,
"step": 13970
},
{
"epoch": 0.15635747479322898,
"grad_norm": 0.27290788292884827,
"learning_rate": 0.00020352688565032664,
"loss": 0.8394,
"step": 13980
},
{
"epoch": 0.15646931848049167,
"grad_norm": 0.28534916043281555,
"learning_rate": 0.00020329846041390655,
"loss": 0.8431,
"step": 13990
},
{
"epoch": 0.15658116216775433,
"grad_norm": 0.304221510887146,
"learning_rate": 0.0002030700351774864,
"loss": 0.8476,
"step": 14000
},
{
"epoch": 0.15669300585501703,
"grad_norm": 0.3151461184024811,
"learning_rate": 0.0002028416099410663,
"loss": 0.852,
"step": 14010
},
{
"epoch": 0.15680484954227972,
"grad_norm": 0.2947019040584564,
"learning_rate": 0.00020261318470464617,
"loss": 0.8396,
"step": 14020
},
{
"epoch": 0.15691669322954238,
"grad_norm": 0.2737627625465393,
"learning_rate": 0.00020238475946822605,
"loss": 0.8337,
"step": 14030
},
{
"epoch": 0.15702853691680507,
"grad_norm": 0.28257089853286743,
"learning_rate": 0.00020215633423180594,
"loss": 0.8475,
"step": 14040
},
{
"epoch": 0.15714038060406776,
"grad_norm": 0.3102625608444214,
"learning_rate": 0.00020192790899538582,
"loss": 0.8451,
"step": 14050
},
{
"epoch": 0.15725222429133043,
"grad_norm": 0.2839931845664978,
"learning_rate": 0.00020169948375896567,
"loss": 0.8365,
"step": 14060
},
{
"epoch": 0.15736406797859312,
"grad_norm": 0.25566980242729187,
"learning_rate": 0.00020147105852254558,
"loss": 0.8287,
"step": 14070
},
{
"epoch": 0.1574759116658558,
"grad_norm": 0.267791211605072,
"learning_rate": 0.00020124263328612547,
"loss": 0.8289,
"step": 14080
},
{
"epoch": 0.15758775535311847,
"grad_norm": 0.267635703086853,
"learning_rate": 0.00020101420804970532,
"loss": 0.8357,
"step": 14090
},
{
"epoch": 0.15769959904038117,
"grad_norm": 0.28065699338912964,
"learning_rate": 0.0002007857828132852,
"loss": 0.8363,
"step": 14100
},
{
"epoch": 0.15781144272764386,
"grad_norm": 0.26585736870765686,
"learning_rate": 0.00020055735757686512,
"loss": 0.8409,
"step": 14110
},
{
"epoch": 0.15792328641490652,
"grad_norm": 0.2562732398509979,
"learning_rate": 0.00020032893234044497,
"loss": 0.8374,
"step": 14120
},
{
"epoch": 0.1580351301021692,
"grad_norm": 0.2572222650051117,
"learning_rate": 0.00020010050710402485,
"loss": 0.8405,
"step": 14130
},
{
"epoch": 0.15814697378943188,
"grad_norm": 0.3075050413608551,
"learning_rate": 0.00019987208186760474,
"loss": 0.825,
"step": 14140
},
{
"epoch": 0.15825881747669457,
"grad_norm": 0.2630293071269989,
"learning_rate": 0.00019964365663118462,
"loss": 0.8326,
"step": 14150
},
{
"epoch": 0.15837066116395726,
"grad_norm": 0.255015105009079,
"learning_rate": 0.0001994152313947645,
"loss": 0.8181,
"step": 14160
},
{
"epoch": 0.15848250485121992,
"grad_norm": 0.25929179787635803,
"learning_rate": 0.00019918680615834438,
"loss": 0.8067,
"step": 14170
},
{
"epoch": 0.15859434853848262,
"grad_norm": 0.27078965306282043,
"learning_rate": 0.00019895838092192424,
"loss": 0.8043,
"step": 14180
},
{
"epoch": 0.1587061922257453,
"grad_norm": 0.2618376612663269,
"learning_rate": 0.00019872995568550415,
"loss": 0.8191,
"step": 14190
},
{
"epoch": 0.15881803591300797,
"grad_norm": 0.246153324842453,
"learning_rate": 0.000198501530449084,
"loss": 0.8251,
"step": 14200
},
{
"epoch": 0.15892987960027066,
"grad_norm": 0.25498026609420776,
"learning_rate": 0.0001982731052126639,
"loss": 0.8319,
"step": 14210
},
{
"epoch": 0.15904172328753335,
"grad_norm": 0.2517942190170288,
"learning_rate": 0.0001980446799762438,
"loss": 0.8106,
"step": 14220
},
{
"epoch": 0.15915356697479602,
"grad_norm": 0.2659161388874054,
"learning_rate": 0.00019781625473982365,
"loss": 0.8163,
"step": 14230
},
{
"epoch": 0.1592654106620587,
"grad_norm": 0.24527288973331451,
"learning_rate": 0.00019758782950340354,
"loss": 0.8359,
"step": 14240
},
{
"epoch": 0.1593772543493214,
"grad_norm": 0.23943792283535004,
"learning_rate": 0.00019735940426698342,
"loss": 0.8253,
"step": 14250
},
{
"epoch": 0.15948909803658406,
"grad_norm": 0.30401650071144104,
"learning_rate": 0.0001971309790305633,
"loss": 0.8369,
"step": 14260
},
{
"epoch": 0.15960094172384676,
"grad_norm": 0.25001001358032227,
"learning_rate": 0.00019690255379414319,
"loss": 0.8354,
"step": 14270
},
{
"epoch": 0.15971278541110942,
"grad_norm": 0.2378586083650589,
"learning_rate": 0.00019667412855772307,
"loss": 0.8324,
"step": 14280
},
{
"epoch": 0.1598246290983721,
"grad_norm": 0.26216059923171997,
"learning_rate": 0.00019644570332130292,
"loss": 0.8227,
"step": 14290
},
{
"epoch": 0.1599364727856348,
"grad_norm": 0.24156969785690308,
"learning_rate": 0.00019621727808488283,
"loss": 0.8362,
"step": 14300
},
{
"epoch": 0.16004831647289747,
"grad_norm": 0.24192091822624207,
"learning_rate": 0.00019598885284846272,
"loss": 0.835,
"step": 14310
},
{
"epoch": 0.16016016016016016,
"grad_norm": 0.24861887097358704,
"learning_rate": 0.00019576042761204257,
"loss": 0.8232,
"step": 14320
},
{
"epoch": 0.16027200384742285,
"grad_norm": 0.27175864577293396,
"learning_rate": 0.00019553200237562246,
"loss": 0.8303,
"step": 14330
},
{
"epoch": 0.16038384753468551,
"grad_norm": 0.272334486246109,
"learning_rate": 0.00019530357713920237,
"loss": 0.8217,
"step": 14340
},
{
"epoch": 0.1604956912219482,
"grad_norm": 0.28357213735580444,
"learning_rate": 0.00019507515190278222,
"loss": 0.8343,
"step": 14350
},
{
"epoch": 0.1606075349092109,
"grad_norm": 0.272276371717453,
"learning_rate": 0.0001948467266663621,
"loss": 0.8235,
"step": 14360
},
{
"epoch": 0.16071937859647356,
"grad_norm": 0.26771044731140137,
"learning_rate": 0.000194618301429942,
"loss": 0.8292,
"step": 14370
},
{
"epoch": 0.16083122228373625,
"grad_norm": 0.27449774742126465,
"learning_rate": 0.00019438987619352187,
"loss": 0.8485,
"step": 14380
},
{
"epoch": 0.16094306597099894,
"grad_norm": 0.26026156544685364,
"learning_rate": 0.00019416145095710175,
"loss": 0.8458,
"step": 14390
},
{
"epoch": 0.1610549096582616,
"grad_norm": 0.2667345404624939,
"learning_rate": 0.00019393302572068164,
"loss": 0.8519,
"step": 14400
},
{
"epoch": 0.1611667533455243,
"grad_norm": 0.26302048563957214,
"learning_rate": 0.0001937046004842615,
"loss": 0.8353,
"step": 14410
},
{
"epoch": 0.16127859703278696,
"grad_norm": 0.24420003592967987,
"learning_rate": 0.0001934761752478414,
"loss": 0.8464,
"step": 14420
},
{
"epoch": 0.16139044072004965,
"grad_norm": 0.2739315629005432,
"learning_rate": 0.00019324775001142126,
"loss": 0.8257,
"step": 14430
},
{
"epoch": 0.16150228440731235,
"grad_norm": 0.2370629757642746,
"learning_rate": 0.00019301932477500114,
"loss": 0.8324,
"step": 14440
},
{
"epoch": 0.161614128094575,
"grad_norm": 0.2616153955459595,
"learning_rate": 0.00019279089953858102,
"loss": 0.8513,
"step": 14450
},
{
"epoch": 0.1617259717818377,
"grad_norm": 0.2527558207511902,
"learning_rate": 0.0001925624743021609,
"loss": 0.8435,
"step": 14460
},
{
"epoch": 0.1618378154691004,
"grad_norm": 0.28255122900009155,
"learning_rate": 0.0001923340490657408,
"loss": 0.8497,
"step": 14470
},
{
"epoch": 0.16194965915636306,
"grad_norm": 0.23198026418685913,
"learning_rate": 0.00019210562382932067,
"loss": 0.8357,
"step": 14480
},
{
"epoch": 0.16206150284362575,
"grad_norm": 0.2534460127353668,
"learning_rate": 0.00019187719859290053,
"loss": 0.8396,
"step": 14490
},
{
"epoch": 0.16217334653088844,
"grad_norm": 0.2693686783313751,
"learning_rate": 0.00019164877335648044,
"loss": 0.8438,
"step": 14500
},
{
"epoch": 0.1622851902181511,
"grad_norm": 0.26181599497795105,
"learning_rate": 0.00019142034812006032,
"loss": 0.8452,
"step": 14510
},
{
"epoch": 0.1623970339054138,
"grad_norm": 0.2268761545419693,
"learning_rate": 0.00019119192288364017,
"loss": 0.8496,
"step": 14520
},
{
"epoch": 0.1625088775926765,
"grad_norm": 0.27698907256126404,
"learning_rate": 0.00019096349764722006,
"loss": 0.8265,
"step": 14530
},
{
"epoch": 0.16262072127993915,
"grad_norm": 0.30570700764656067,
"learning_rate": 0.00019073507241079997,
"loss": 0.8399,
"step": 14540
},
{
"epoch": 0.16273256496720184,
"grad_norm": 0.2894477844238281,
"learning_rate": 0.00019050664717437982,
"loss": 0.8488,
"step": 14550
},
{
"epoch": 0.16284440865446453,
"grad_norm": 0.3094457685947418,
"learning_rate": 0.0001902782219379597,
"loss": 0.8243,
"step": 14560
},
{
"epoch": 0.1629562523417272,
"grad_norm": 0.2908037602901459,
"learning_rate": 0.0001900497967015396,
"loss": 0.835,
"step": 14570
},
{
"epoch": 0.1630680960289899,
"grad_norm": 0.27222102880477905,
"learning_rate": 0.00018982137146511947,
"loss": 0.8306,
"step": 14580
},
{
"epoch": 0.16317993971625255,
"grad_norm": 0.2542339563369751,
"learning_rate": 0.00018959294622869935,
"loss": 0.8259,
"step": 14590
},
{
"epoch": 0.16329178340351524,
"grad_norm": 0.28288012742996216,
"learning_rate": 0.00018936452099227924,
"loss": 0.8243,
"step": 14600
},
{
"epoch": 0.16340362709077794,
"grad_norm": 0.2584143877029419,
"learning_rate": 0.0001891360957558591,
"loss": 0.8224,
"step": 14610
},
{
"epoch": 0.1635154707780406,
"grad_norm": 0.26679450273513794,
"learning_rate": 0.000188907670519439,
"loss": 0.8142,
"step": 14620
},
{
"epoch": 0.1636273144653033,
"grad_norm": 0.24589306116104126,
"learning_rate": 0.00018867924528301889,
"loss": 0.81,
"step": 14630
},
{
"epoch": 0.16373915815256598,
"grad_norm": 0.28474611043930054,
"learning_rate": 0.00018845082004659874,
"loss": 0.7989,
"step": 14640
},
{
"epoch": 0.16385100183982865,
"grad_norm": 0.27567991614341736,
"learning_rate": 0.00018822239481017862,
"loss": 0.8049,
"step": 14650
},
{
"epoch": 0.16396284552709134,
"grad_norm": 0.2509905695915222,
"learning_rate": 0.0001879939695737585,
"loss": 0.8168,
"step": 14660
},
{
"epoch": 0.16407468921435403,
"grad_norm": 0.30284953117370605,
"learning_rate": 0.0001877655443373384,
"loss": 0.8055,
"step": 14670
},
{
"epoch": 0.1641865329016167,
"grad_norm": 0.27638325095176697,
"learning_rate": 0.00018753711910091827,
"loss": 0.8368,
"step": 14680
},
{
"epoch": 0.16429837658887939,
"grad_norm": 0.29546642303466797,
"learning_rate": 0.00018730869386449816,
"loss": 0.8161,
"step": 14690
},
{
"epoch": 0.16441022027614208,
"grad_norm": 0.2483370304107666,
"learning_rate": 0.00018708026862807804,
"loss": 0.8136,
"step": 14700
},
{
"epoch": 0.16452206396340474,
"grad_norm": 0.2862898111343384,
"learning_rate": 0.00018685184339165792,
"loss": 0.836,
"step": 14710
},
{
"epoch": 0.16463390765066743,
"grad_norm": 0.2730434238910675,
"learning_rate": 0.00018662341815523778,
"loss": 0.8279,
"step": 14720
},
{
"epoch": 0.1647457513379301,
"grad_norm": 0.2846275269985199,
"learning_rate": 0.0001863949929188177,
"loss": 0.7991,
"step": 14730
},
{
"epoch": 0.1648575950251928,
"grad_norm": 0.2455524355173111,
"learning_rate": 0.00018616656768239757,
"loss": 0.7931,
"step": 14740
},
{
"epoch": 0.16496943871245548,
"grad_norm": 0.25060829520225525,
"learning_rate": 0.00018593814244597743,
"loss": 0.8009,
"step": 14750
},
{
"epoch": 0.16508128239971814,
"grad_norm": 0.2687000334262848,
"learning_rate": 0.0001857097172095573,
"loss": 0.7968,
"step": 14760
},
{
"epoch": 0.16519312608698083,
"grad_norm": 0.28619691729545593,
"learning_rate": 0.00018548129197313722,
"loss": 0.7818,
"step": 14770
},
{
"epoch": 0.16530496977424353,
"grad_norm": 0.2549494206905365,
"learning_rate": 0.00018525286673671707,
"loss": 0.7877,
"step": 14780
},
{
"epoch": 0.1654168134615062,
"grad_norm": 0.2419700175523758,
"learning_rate": 0.00018502444150029696,
"loss": 0.7899,
"step": 14790
},
{
"epoch": 0.16552865714876888,
"grad_norm": 0.2636066675186157,
"learning_rate": 0.00018479601626387684,
"loss": 0.7893,
"step": 14800
},
{
"epoch": 0.16564050083603157,
"grad_norm": 0.264072984457016,
"learning_rate": 0.00018456759102745672,
"loss": 0.7984,
"step": 14810
},
{
"epoch": 0.16575234452329424,
"grad_norm": 0.2661677598953247,
"learning_rate": 0.0001843391657910366,
"loss": 0.8085,
"step": 14820
},
{
"epoch": 0.16586418821055693,
"grad_norm": 0.28324052691459656,
"learning_rate": 0.0001841107405546165,
"loss": 0.8066,
"step": 14830
},
{
"epoch": 0.16597603189781962,
"grad_norm": 0.277761310338974,
"learning_rate": 0.00018388231531819634,
"loss": 0.8008,
"step": 14840
},
{
"epoch": 0.16608787558508228,
"grad_norm": 0.2669602036476135,
"learning_rate": 0.00018365389008177625,
"loss": 0.8285,
"step": 14850
},
{
"epoch": 0.16619971927234498,
"grad_norm": 0.28757140040397644,
"learning_rate": 0.00018342546484535614,
"loss": 0.8121,
"step": 14860
},
{
"epoch": 0.16631156295960764,
"grad_norm": 0.2616439163684845,
"learning_rate": 0.000183197039608936,
"loss": 0.8185,
"step": 14870
},
{
"epoch": 0.16642340664687033,
"grad_norm": 0.28334370255470276,
"learning_rate": 0.00018296861437251587,
"loss": 0.8229,
"step": 14880
},
{
"epoch": 0.16653525033413302,
"grad_norm": 0.2659022808074951,
"learning_rate": 0.00018274018913609576,
"loss": 0.82,
"step": 14890
},
{
"epoch": 0.1666470940213957,
"grad_norm": 0.2544262111186981,
"learning_rate": 0.00018251176389967564,
"loss": 0.84,
"step": 14900
},
{
"epoch": 0.16675893770865838,
"grad_norm": 0.27492937445640564,
"learning_rate": 0.00018228333866325552,
"loss": 0.8411,
"step": 14910
},
{
"epoch": 0.16687078139592107,
"grad_norm": 0.2961216866970062,
"learning_rate": 0.00018205491342683538,
"loss": 0.8178,
"step": 14920
},
{
"epoch": 0.16698262508318373,
"grad_norm": 0.2704416811466217,
"learning_rate": 0.0001818264881904153,
"loss": 0.8264,
"step": 14930
},
{
"epoch": 0.16709446877044642,
"grad_norm": 0.261704683303833,
"learning_rate": 0.00018159806295399517,
"loss": 0.8307,
"step": 14940
},
{
"epoch": 0.16720631245770912,
"grad_norm": 0.26157405972480774,
"learning_rate": 0.00018136963771757503,
"loss": 0.8064,
"step": 14950
},
{
"epoch": 0.16731815614497178,
"grad_norm": 0.2589896023273468,
"learning_rate": 0.0001811412124811549,
"loss": 0.8195,
"step": 14960
},
{
"epoch": 0.16742999983223447,
"grad_norm": 0.24691319465637207,
"learning_rate": 0.00018091278724473482,
"loss": 0.8283,
"step": 14970
},
{
"epoch": 0.16754184351949716,
"grad_norm": 0.2527819871902466,
"learning_rate": 0.00018068436200831468,
"loss": 0.8229,
"step": 14980
},
{
"epoch": 0.16765368720675983,
"grad_norm": 0.2639094293117523,
"learning_rate": 0.00018045593677189456,
"loss": 0.8393,
"step": 14990
},
{
"epoch": 0.16776553089402252,
"grad_norm": 0.24417634308338165,
"learning_rate": 0.00018022751153547444,
"loss": 0.8204,
"step": 15000
},
{
"epoch": 0.16787737458128518,
"grad_norm": 0.25673115253448486,
"learning_rate": 0.00017999908629905432,
"loss": 0.8184,
"step": 15010
},
{
"epoch": 0.16798921826854787,
"grad_norm": 0.254077285528183,
"learning_rate": 0.0001797706610626342,
"loss": 0.8195,
"step": 15020
},
{
"epoch": 0.16810106195581057,
"grad_norm": 0.2455417662858963,
"learning_rate": 0.0001795422358262141,
"loss": 0.8255,
"step": 15030
},
{
"epoch": 0.16821290564307323,
"grad_norm": 0.27918189764022827,
"learning_rate": 0.00017931381058979395,
"loss": 0.8345,
"step": 15040
},
{
"epoch": 0.16832474933033592,
"grad_norm": 0.2272186279296875,
"learning_rate": 0.00017908538535337386,
"loss": 0.8178,
"step": 15050
},
{
"epoch": 0.1684365930175986,
"grad_norm": 0.269189715385437,
"learning_rate": 0.00017885696011695374,
"loss": 0.8343,
"step": 15060
},
{
"epoch": 0.16854843670486128,
"grad_norm": 0.2805529832839966,
"learning_rate": 0.0001786285348805336,
"loss": 0.8126,
"step": 15070
},
{
"epoch": 0.16866028039212397,
"grad_norm": 0.28788769245147705,
"learning_rate": 0.00017840010964411348,
"loss": 0.8278,
"step": 15080
},
{
"epoch": 0.16877212407938666,
"grad_norm": 0.2439277619123459,
"learning_rate": 0.00017817168440769336,
"loss": 0.8272,
"step": 15090
},
{
"epoch": 0.16888396776664932,
"grad_norm": 0.3151440918445587,
"learning_rate": 0.00017794325917127324,
"loss": 0.8201,
"step": 15100
},
{
"epoch": 0.16899581145391201,
"grad_norm": 0.2562885880470276,
"learning_rate": 0.00017771483393485313,
"loss": 0.8275,
"step": 15110
},
{
"epoch": 0.1691076551411747,
"grad_norm": 0.2718476355075836,
"learning_rate": 0.00017748640869843298,
"loss": 0.821,
"step": 15120
},
{
"epoch": 0.16921949882843737,
"grad_norm": 0.2699459493160248,
"learning_rate": 0.0001772579834620129,
"loss": 0.8352,
"step": 15130
},
{
"epoch": 0.16933134251570006,
"grad_norm": 0.29737600684165955,
"learning_rate": 0.00017702955822559277,
"loss": 0.8279,
"step": 15140
},
{
"epoch": 0.16944318620296273,
"grad_norm": 0.3075369894504547,
"learning_rate": 0.00017680113298917263,
"loss": 0.8037,
"step": 15150
},
{
"epoch": 0.16955502989022542,
"grad_norm": 0.27061593532562256,
"learning_rate": 0.00017657270775275254,
"loss": 0.8149,
"step": 15160
},
{
"epoch": 0.1696668735774881,
"grad_norm": 0.26719844341278076,
"learning_rate": 0.00017634428251633242,
"loss": 0.7896,
"step": 15170
},
{
"epoch": 0.16977871726475077,
"grad_norm": 0.2871409058570862,
"learning_rate": 0.00017611585727991228,
"loss": 0.7863,
"step": 15180
},
{
"epoch": 0.16989056095201346,
"grad_norm": 0.2502906620502472,
"learning_rate": 0.00017588743204349216,
"loss": 0.7817,
"step": 15190
},
{
"epoch": 0.17000240463927616,
"grad_norm": 0.2579248547554016,
"learning_rate": 0.00017565900680707207,
"loss": 0.796,
"step": 15200
},
{
"epoch": 0.17011424832653882,
"grad_norm": 0.2537415325641632,
"learning_rate": 0.00017543058157065193,
"loss": 0.78,
"step": 15210
},
{
"epoch": 0.1702260920138015,
"grad_norm": 0.2420157790184021,
"learning_rate": 0.0001752021563342318,
"loss": 0.7946,
"step": 15220
},
{
"epoch": 0.1703379357010642,
"grad_norm": 0.2423790544271469,
"learning_rate": 0.0001749737310978117,
"loss": 0.797,
"step": 15230
},
{
"epoch": 0.17044977938832687,
"grad_norm": 0.2521071434020996,
"learning_rate": 0.00017474530586139157,
"loss": 0.8073,
"step": 15240
},
{
"epoch": 0.17056162307558956,
"grad_norm": 0.22921273112297058,
"learning_rate": 0.00017451688062497146,
"loss": 0.7916,
"step": 15250
},
{
"epoch": 0.17067346676285225,
"grad_norm": 0.35150206089019775,
"learning_rate": 0.00017428845538855134,
"loss": 0.8001,
"step": 15260
},
{
"epoch": 0.1707853104501149,
"grad_norm": 0.27637869119644165,
"learning_rate": 0.0001740600301521312,
"loss": 0.7948,
"step": 15270
},
{
"epoch": 0.1708971541373776,
"grad_norm": 0.22480230033397675,
"learning_rate": 0.0001738316049157111,
"loss": 0.7932,
"step": 15280
},
{
"epoch": 0.1710089978246403,
"grad_norm": 0.27264508605003357,
"learning_rate": 0.000173603179679291,
"loss": 0.8083,
"step": 15290
},
{
"epoch": 0.17112084151190296,
"grad_norm": 0.2647417485713959,
"learning_rate": 0.00017337475444287084,
"loss": 0.8177,
"step": 15300
},
{
"epoch": 0.17123268519916565,
"grad_norm": 0.23619987070560455,
"learning_rate": 0.00017314632920645073,
"loss": 0.8068,
"step": 15310
},
{
"epoch": 0.17134452888642832,
"grad_norm": 0.22450131177902222,
"learning_rate": 0.0001729179039700306,
"loss": 0.8004,
"step": 15320
},
{
"epoch": 0.171456372573691,
"grad_norm": 0.2784859240055084,
"learning_rate": 0.0001726894787336105,
"loss": 0.7938,
"step": 15330
},
{
"epoch": 0.1715682162609537,
"grad_norm": 0.25513574481010437,
"learning_rate": 0.00017246105349719038,
"loss": 0.7844,
"step": 15340
},
{
"epoch": 0.17168005994821636,
"grad_norm": 0.27425146102905273,
"learning_rate": 0.00017223262826077023,
"loss": 0.7906,
"step": 15350
},
{
"epoch": 0.17179190363547905,
"grad_norm": 0.2500791847705841,
"learning_rate": 0.00017200420302435014,
"loss": 0.7834,
"step": 15360
},
{
"epoch": 0.17190374732274175,
"grad_norm": 0.2550630271434784,
"learning_rate": 0.00017177577778793002,
"loss": 0.7736,
"step": 15370
},
{
"epoch": 0.1720155910100044,
"grad_norm": 0.25209444761276245,
"learning_rate": 0.00017154735255150988,
"loss": 0.773,
"step": 15380
},
{
"epoch": 0.1721274346972671,
"grad_norm": 0.2347812056541443,
"learning_rate": 0.00017131892731508976,
"loss": 0.7745,
"step": 15390
},
{
"epoch": 0.1722392783845298,
"grad_norm": 0.2858305871486664,
"learning_rate": 0.00017109050207866967,
"loss": 0.7776,
"step": 15400
},
{
"epoch": 0.17235112207179246,
"grad_norm": 0.30414941906929016,
"learning_rate": 0.00017086207684224953,
"loss": 0.7701,
"step": 15410
},
{
"epoch": 0.17246296575905515,
"grad_norm": 0.2645011842250824,
"learning_rate": 0.0001706336516058294,
"loss": 0.7746,
"step": 15420
},
{
"epoch": 0.17257480944631784,
"grad_norm": 0.2984048128128052,
"learning_rate": 0.0001704052263694093,
"loss": 0.771,
"step": 15430
},
{
"epoch": 0.1726866531335805,
"grad_norm": 0.2734147906303406,
"learning_rate": 0.00017017680113298918,
"loss": 0.7769,
"step": 15440
},
{
"epoch": 0.1727984968208432,
"grad_norm": 0.2632124125957489,
"learning_rate": 0.00016994837589656906,
"loss": 0.7754,
"step": 15450
},
{
"epoch": 0.17291034050810586,
"grad_norm": 0.29384443163871765,
"learning_rate": 0.00016971995066014894,
"loss": 0.7833,
"step": 15460
},
{
"epoch": 0.17302218419536855,
"grad_norm": 0.3194182813167572,
"learning_rate": 0.0001694915254237288,
"loss": 0.7813,
"step": 15470
},
{
"epoch": 0.17313402788263124,
"grad_norm": 0.25995251536369324,
"learning_rate": 0.0001692631001873087,
"loss": 0.7796,
"step": 15480
},
{
"epoch": 0.1732458715698939,
"grad_norm": 0.272419810295105,
"learning_rate": 0.0001690346749508886,
"loss": 0.7839,
"step": 15490
},
{
"epoch": 0.1733577152571566,
"grad_norm": 0.26239413022994995,
"learning_rate": 0.00016880624971446845,
"loss": 0.7807,
"step": 15500
},
{
"epoch": 0.1734695589444193,
"grad_norm": 0.29991698265075684,
"learning_rate": 0.00016857782447804833,
"loss": 0.7941,
"step": 15510
},
{
"epoch": 0.17358140263168195,
"grad_norm": 0.2812528908252716,
"learning_rate": 0.00016834939924162824,
"loss": 0.7863,
"step": 15520
},
{
"epoch": 0.17369324631894464,
"grad_norm": 0.2557685077190399,
"learning_rate": 0.0001681209740052081,
"loss": 0.7953,
"step": 15530
},
{
"epoch": 0.17380509000620734,
"grad_norm": 0.28565913438796997,
"learning_rate": 0.00016789254876878798,
"loss": 0.7934,
"step": 15540
},
{
"epoch": 0.17391693369347,
"grad_norm": 0.25316086411476135,
"learning_rate": 0.00016766412353236783,
"loss": 0.7969,
"step": 15550
},
{
"epoch": 0.1740287773807327,
"grad_norm": 0.2636478543281555,
"learning_rate": 0.00016743569829594774,
"loss": 0.8021,
"step": 15560
},
{
"epoch": 0.17414062106799538,
"grad_norm": 0.28839442133903503,
"learning_rate": 0.00016720727305952763,
"loss": 0.8108,
"step": 15570
},
{
"epoch": 0.17425246475525805,
"grad_norm": 0.2453639954328537,
"learning_rate": 0.00016697884782310748,
"loss": 0.8034,
"step": 15580
},
{
"epoch": 0.17436430844252074,
"grad_norm": 0.2550848424434662,
"learning_rate": 0.0001667504225866874,
"loss": 0.8169,
"step": 15590
},
{
"epoch": 0.1744761521297834,
"grad_norm": 0.24949923157691956,
"learning_rate": 0.00016652199735026727,
"loss": 0.8167,
"step": 15600
},
{
"epoch": 0.1745879958170461,
"grad_norm": 0.24357125163078308,
"learning_rate": 0.00016629357211384713,
"loss": 0.821,
"step": 15610
},
{
"epoch": 0.17469983950430878,
"grad_norm": 0.2246461659669876,
"learning_rate": 0.000166065146877427,
"loss": 0.82,
"step": 15620
},
{
"epoch": 0.17481168319157145,
"grad_norm": 0.26160740852355957,
"learning_rate": 0.00016583672164100692,
"loss": 0.8167,
"step": 15630
},
{
"epoch": 0.17492352687883414,
"grad_norm": 0.25773337483406067,
"learning_rate": 0.00016560829640458678,
"loss": 0.8305,
"step": 15640
},
{
"epoch": 0.17503537056609683,
"grad_norm": 0.24051527678966522,
"learning_rate": 0.00016537987116816666,
"loss": 0.8201,
"step": 15650
},
{
"epoch": 0.1751472142533595,
"grad_norm": 0.2507860064506531,
"learning_rate": 0.00016515144593174654,
"loss": 0.8444,
"step": 15660
},
{
"epoch": 0.1752590579406222,
"grad_norm": 0.24071821570396423,
"learning_rate": 0.00016492302069532643,
"loss": 0.8071,
"step": 15670
},
{
"epoch": 0.17537090162788488,
"grad_norm": 0.2533905506134033,
"learning_rate": 0.0001646945954589063,
"loss": 0.8164,
"step": 15680
},
{
"epoch": 0.17548274531514754,
"grad_norm": 0.2546316683292389,
"learning_rate": 0.0001644661702224862,
"loss": 0.8237,
"step": 15690
},
{
"epoch": 0.17559458900241023,
"grad_norm": 0.25692155957221985,
"learning_rate": 0.00016423774498606605,
"loss": 0.8198,
"step": 15700
},
{
"epoch": 0.17570643268967293,
"grad_norm": 0.254535436630249,
"learning_rate": 0.00016400931974964596,
"loss": 0.8061,
"step": 15710
},
{
"epoch": 0.1758182763769356,
"grad_norm": 0.2557326555252075,
"learning_rate": 0.00016378089451322584,
"loss": 0.8194,
"step": 15720
},
{
"epoch": 0.17593012006419828,
"grad_norm": 0.24234241247177124,
"learning_rate": 0.0001635524692768057,
"loss": 0.8183,
"step": 15730
},
{
"epoch": 0.17604196375146094,
"grad_norm": 0.2597709596157074,
"learning_rate": 0.00016332404404038558,
"loss": 0.7957,
"step": 15740
},
{
"epoch": 0.17615380743872364,
"grad_norm": 0.2896418273448944,
"learning_rate": 0.0001630956188039655,
"loss": 0.8146,
"step": 15750
},
{
"epoch": 0.17626565112598633,
"grad_norm": 0.2686966061592102,
"learning_rate": 0.00016286719356754535,
"loss": 0.7988,
"step": 15760
},
{
"epoch": 0.176377494813249,
"grad_norm": 0.26220840215682983,
"learning_rate": 0.00016263876833112523,
"loss": 0.7936,
"step": 15770
},
{
"epoch": 0.17648933850051168,
"grad_norm": 0.260547012090683,
"learning_rate": 0.00016241034309470508,
"loss": 0.8002,
"step": 15780
},
{
"epoch": 0.17660118218777437,
"grad_norm": 0.22341471910476685,
"learning_rate": 0.000162181917858285,
"loss": 0.7935,
"step": 15790
},
{
"epoch": 0.17671302587503704,
"grad_norm": 0.24994009733200073,
"learning_rate": 0.00016195349262186488,
"loss": 0.7971,
"step": 15800
},
{
"epoch": 0.17682486956229973,
"grad_norm": 0.24070651829242706,
"learning_rate": 0.00016172506738544473,
"loss": 0.7844,
"step": 15810
},
{
"epoch": 0.17693671324956242,
"grad_norm": 0.23858696222305298,
"learning_rate": 0.00016149664214902461,
"loss": 0.7687,
"step": 15820
},
{
"epoch": 0.17704855693682509,
"grad_norm": 0.24684946238994598,
"learning_rate": 0.00016126821691260452,
"loss": 0.7848,
"step": 15830
},
{
"epoch": 0.17716040062408778,
"grad_norm": 0.2525545656681061,
"learning_rate": 0.00016103979167618438,
"loss": 0.773,
"step": 15840
},
{
"epoch": 0.17727224431135047,
"grad_norm": 0.2485392689704895,
"learning_rate": 0.00016081136643976426,
"loss": 0.7787,
"step": 15850
},
{
"epoch": 0.17738408799861313,
"grad_norm": 0.2384241223335266,
"learning_rate": 0.00016058294120334415,
"loss": 0.7732,
"step": 15860
},
{
"epoch": 0.17749593168587582,
"grad_norm": 0.25029659271240234,
"learning_rate": 0.00016035451596692403,
"loss": 0.7819,
"step": 15870
},
{
"epoch": 0.1776077753731385,
"grad_norm": 0.2988499701023102,
"learning_rate": 0.0001601260907305039,
"loss": 0.7815,
"step": 15880
},
{
"epoch": 0.17771961906040118,
"grad_norm": 0.25840380787849426,
"learning_rate": 0.0001598976654940838,
"loss": 0.7899,
"step": 15890
},
{
"epoch": 0.17783146274766387,
"grad_norm": 0.2870889902114868,
"learning_rate": 0.00015966924025766365,
"loss": 0.7964,
"step": 15900
},
{
"epoch": 0.17794330643492653,
"grad_norm": 0.270702987909317,
"learning_rate": 0.00015944081502124356,
"loss": 0.7907,
"step": 15910
},
{
"epoch": 0.17805515012218923,
"grad_norm": 0.24939289689064026,
"learning_rate": 0.00015921238978482344,
"loss": 0.7909,
"step": 15920
},
{
"epoch": 0.17816699380945192,
"grad_norm": 0.25692620873451233,
"learning_rate": 0.0001589839645484033,
"loss": 0.7864,
"step": 15930
},
{
"epoch": 0.17827883749671458,
"grad_norm": 0.25667235255241394,
"learning_rate": 0.00015875553931198318,
"loss": 0.7792,
"step": 15940
},
{
"epoch": 0.17839068118397727,
"grad_norm": 0.27988189458847046,
"learning_rate": 0.0001585271140755631,
"loss": 0.78,
"step": 15950
},
{
"epoch": 0.17850252487123996,
"grad_norm": 0.26706936955451965,
"learning_rate": 0.00015829868883914295,
"loss": 0.7764,
"step": 15960
},
{
"epoch": 0.17861436855850263,
"grad_norm": 0.25825801491737366,
"learning_rate": 0.00015807026360272283,
"loss": 0.7798,
"step": 15970
},
{
"epoch": 0.17872621224576532,
"grad_norm": 0.26630404591560364,
"learning_rate": 0.0001578418383663027,
"loss": 0.7877,
"step": 15980
},
{
"epoch": 0.178838055933028,
"grad_norm": 0.24562442302703857,
"learning_rate": 0.0001576134131298826,
"loss": 0.7761,
"step": 15990
},
{
"epoch": 0.17894989962029068,
"grad_norm": 0.2607520818710327,
"learning_rate": 0.00015738498789346248,
"loss": 0.7844,
"step": 16000
},
{
"epoch": 0.17906174330755337,
"grad_norm": 0.25256794691085815,
"learning_rate": 0.00015715656265704233,
"loss": 0.7712,
"step": 16010
},
{
"epoch": 0.17917358699481606,
"grad_norm": 0.24657808244228363,
"learning_rate": 0.00015692813742062222,
"loss": 0.7766,
"step": 16020
},
{
"epoch": 0.17928543068207872,
"grad_norm": 0.2546744644641876,
"learning_rate": 0.00015669971218420213,
"loss": 0.781,
"step": 16030
},
{
"epoch": 0.17939727436934141,
"grad_norm": 0.24849241971969604,
"learning_rate": 0.00015647128694778198,
"loss": 0.786,
"step": 16040
},
{
"epoch": 0.17950911805660408,
"grad_norm": 0.2447352409362793,
"learning_rate": 0.00015624286171136187,
"loss": 0.7805,
"step": 16050
},
{
"epoch": 0.17962096174386677,
"grad_norm": 0.3004114031791687,
"learning_rate": 0.00015601443647494178,
"loss": 0.7748,
"step": 16060
},
{
"epoch": 0.17973280543112946,
"grad_norm": 0.24974007904529572,
"learning_rate": 0.00015578601123852163,
"loss": 0.7823,
"step": 16070
},
{
"epoch": 0.17984464911839212,
"grad_norm": 0.2995624542236328,
"learning_rate": 0.00015555758600210151,
"loss": 0.7894,
"step": 16080
},
{
"epoch": 0.17995649280565482,
"grad_norm": 0.2560220956802368,
"learning_rate": 0.0001553291607656814,
"loss": 0.7849,
"step": 16090
},
{
"epoch": 0.1800683364929175,
"grad_norm": 0.24940122663974762,
"learning_rate": 0.00015510073552926128,
"loss": 0.7903,
"step": 16100
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.22082312405109406,
"learning_rate": 0.00015487231029284116,
"loss": 0.783,
"step": 16110
},
{
"epoch": 0.18029202386744286,
"grad_norm": 0.2670224606990814,
"learning_rate": 0.00015464388505642104,
"loss": 0.7919,
"step": 16120
},
{
"epoch": 0.18040386755470555,
"grad_norm": 0.2533135414123535,
"learning_rate": 0.0001544154598200009,
"loss": 0.8007,
"step": 16130
},
{
"epoch": 0.18051571124196822,
"grad_norm": 0.2660861909389496,
"learning_rate": 0.0001541870345835808,
"loss": 0.7913,
"step": 16140
},
{
"epoch": 0.1806275549292309,
"grad_norm": 0.2556677460670471,
"learning_rate": 0.0001539586093471607,
"loss": 0.7826,
"step": 16150
},
{
"epoch": 0.1807393986164936,
"grad_norm": 0.275900661945343,
"learning_rate": 0.00015373018411074055,
"loss": 0.8048,
"step": 16160
},
{
"epoch": 0.18085124230375627,
"grad_norm": 0.29176998138427734,
"learning_rate": 0.00015350175887432043,
"loss": 0.8241,
"step": 16170
},
{
"epoch": 0.18096308599101896,
"grad_norm": 0.2635776996612549,
"learning_rate": 0.00015327333363790034,
"loss": 0.8211,
"step": 16180
},
{
"epoch": 0.18107492967828162,
"grad_norm": 0.27744734287261963,
"learning_rate": 0.0001530449084014802,
"loss": 0.8254,
"step": 16190
},
{
"epoch": 0.1811867733655443,
"grad_norm": 0.28162074089050293,
"learning_rate": 0.00015281648316506008,
"loss": 0.8182,
"step": 16200
},
{
"epoch": 0.181298617052807,
"grad_norm": 0.29347339272499084,
"learning_rate": 0.00015258805792863996,
"loss": 0.812,
"step": 16210
},
{
"epoch": 0.18141046074006967,
"grad_norm": 0.26170992851257324,
"learning_rate": 0.00015235963269221985,
"loss": 0.8221,
"step": 16220
},
{
"epoch": 0.18152230442733236,
"grad_norm": 0.27848196029663086,
"learning_rate": 0.00015213120745579973,
"loss": 0.825,
"step": 16230
},
{
"epoch": 0.18163414811459505,
"grad_norm": 0.2994973659515381,
"learning_rate": 0.00015190278221937958,
"loss": 0.8158,
"step": 16240
},
{
"epoch": 0.18174599180185771,
"grad_norm": 0.27873843908309937,
"learning_rate": 0.00015167435698295947,
"loss": 0.816,
"step": 16250
},
{
"epoch": 0.1818578354891204,
"grad_norm": 0.3014775812625885,
"learning_rate": 0.00015144593174653938,
"loss": 0.8174,
"step": 16260
},
{
"epoch": 0.1819696791763831,
"grad_norm": 0.29963594675064087,
"learning_rate": 0.00015121750651011923,
"loss": 0.8104,
"step": 16270
},
{
"epoch": 0.18208152286364576,
"grad_norm": 0.3388141393661499,
"learning_rate": 0.00015098908127369912,
"loss": 0.826,
"step": 16280
},
{
"epoch": 0.18219336655090845,
"grad_norm": 0.29143062233924866,
"learning_rate": 0.000150760656037279,
"loss": 0.8222,
"step": 16290
},
{
"epoch": 0.18230521023817114,
"grad_norm": 0.327824205160141,
"learning_rate": 0.00015053223080085888,
"loss": 0.8186,
"step": 16300
},
{
"epoch": 0.1824170539254338,
"grad_norm": 0.3053797483444214,
"learning_rate": 0.00015030380556443876,
"loss": 0.8214,
"step": 16310
},
{
"epoch": 0.1825288976126965,
"grad_norm": 0.3030015230178833,
"learning_rate": 0.00015007538032801865,
"loss": 0.8198,
"step": 16320
},
{
"epoch": 0.18264074129995916,
"grad_norm": 0.3147192597389221,
"learning_rate": 0.0001498469550915985,
"loss": 0.8224,
"step": 16330
},
{
"epoch": 0.18275258498722186,
"grad_norm": 0.2838999927043915,
"learning_rate": 0.0001496185298551784,
"loss": 0.8142,
"step": 16340
},
{
"epoch": 0.18286442867448455,
"grad_norm": 0.27273476123809814,
"learning_rate": 0.0001493901046187583,
"loss": 0.8054,
"step": 16350
},
{
"epoch": 0.1829762723617472,
"grad_norm": 0.2754770517349243,
"learning_rate": 0.00014916167938233815,
"loss": 0.8131,
"step": 16360
},
{
"epoch": 0.1830881160490099,
"grad_norm": 0.29061514139175415,
"learning_rate": 0.00014893325414591803,
"loss": 0.7988,
"step": 16370
},
{
"epoch": 0.1831999597362726,
"grad_norm": 0.2525017559528351,
"learning_rate": 0.00014870482890949794,
"loss": 0.8023,
"step": 16380
},
{
"epoch": 0.18331180342353526,
"grad_norm": 0.3019058108329773,
"learning_rate": 0.0001484764036730778,
"loss": 0.8077,
"step": 16390
},
{
"epoch": 0.18342364711079795,
"grad_norm": 0.302090048789978,
"learning_rate": 0.00014824797843665768,
"loss": 0.812,
"step": 16400
},
{
"epoch": 0.18353549079806064,
"grad_norm": 0.29742154479026794,
"learning_rate": 0.00014801955320023757,
"loss": 0.7911,
"step": 16410
},
{
"epoch": 0.1836473344853233,
"grad_norm": 0.31950804591178894,
"learning_rate": 0.00014779112796381745,
"loss": 0.7875,
"step": 16420
},
{
"epoch": 0.183759178172586,
"grad_norm": 0.32971978187561035,
"learning_rate": 0.00014756270272739733,
"loss": 0.7788,
"step": 16430
},
{
"epoch": 0.1838710218598487,
"grad_norm": 0.2941220700740814,
"learning_rate": 0.00014733427749097721,
"loss": 0.7772,
"step": 16440
},
{
"epoch": 0.18398286554711135,
"grad_norm": 0.2639923393726349,
"learning_rate": 0.00014710585225455707,
"loss": 0.7708,
"step": 16450
},
{
"epoch": 0.18409470923437404,
"grad_norm": 0.2483467161655426,
"learning_rate": 0.00014687742701813698,
"loss": 0.7846,
"step": 16460
},
{
"epoch": 0.1842065529216367,
"grad_norm": 0.31150713562965393,
"learning_rate": 0.00014664900178171683,
"loss": 0.7853,
"step": 16470
},
{
"epoch": 0.1843183966088994,
"grad_norm": 0.30439406633377075,
"learning_rate": 0.00014642057654529672,
"loss": 0.7779,
"step": 16480
},
{
"epoch": 0.1844302402961621,
"grad_norm": 0.29318898916244507,
"learning_rate": 0.00014619215130887663,
"loss": 0.7911,
"step": 16490
},
{
"epoch": 0.18454208398342475,
"grad_norm": 0.2726874053478241,
"learning_rate": 0.00014596372607245648,
"loss": 0.7869,
"step": 16500
},
{
"epoch": 0.18465392767068745,
"grad_norm": 0.2978016436100006,
"learning_rate": 0.00014573530083603637,
"loss": 0.783,
"step": 16510
},
{
"epoch": 0.18476577135795014,
"grad_norm": 0.3107501268386841,
"learning_rate": 0.00014550687559961625,
"loss": 0.801,
"step": 16520
},
{
"epoch": 0.1848776150452128,
"grad_norm": 0.2848517894744873,
"learning_rate": 0.00014527845036319613,
"loss": 0.8063,
"step": 16530
},
{
"epoch": 0.1849894587324755,
"grad_norm": 0.2625429332256317,
"learning_rate": 0.00014505002512677601,
"loss": 0.8074,
"step": 16540
},
{
"epoch": 0.18510130241973818,
"grad_norm": 0.2805044949054718,
"learning_rate": 0.0001448215998903559,
"loss": 0.8013,
"step": 16550
},
{
"epoch": 0.18521314610700085,
"grad_norm": 0.27657589316368103,
"learning_rate": 0.00014459317465393575,
"loss": 0.8012,
"step": 16560
},
{
"epoch": 0.18532498979426354,
"grad_norm": 0.2780141532421112,
"learning_rate": 0.00014436474941751566,
"loss": 0.8161,
"step": 16570
},
{
"epoch": 0.18543683348152623,
"grad_norm": 0.2871207892894745,
"learning_rate": 0.00014413632418109555,
"loss": 0.7899,
"step": 16580
},
{
"epoch": 0.1855486771687889,
"grad_norm": 0.2656658887863159,
"learning_rate": 0.0001439078989446754,
"loss": 0.7985,
"step": 16590
},
{
"epoch": 0.1856605208560516,
"grad_norm": 0.2766350209712982,
"learning_rate": 0.00014367947370825528,
"loss": 0.7999,
"step": 16600
},
{
"epoch": 0.18577236454331428,
"grad_norm": 0.2616749107837677,
"learning_rate": 0.0001434510484718352,
"loss": 0.8002,
"step": 16610
},
{
"epoch": 0.18588420823057694,
"grad_norm": 0.25887414813041687,
"learning_rate": 0.00014322262323541505,
"loss": 0.8112,
"step": 16620
},
{
"epoch": 0.18599605191783963,
"grad_norm": 0.2594297528266907,
"learning_rate": 0.00014299419799899493,
"loss": 0.802,
"step": 16630
},
{
"epoch": 0.1861078956051023,
"grad_norm": 0.2535499036312103,
"learning_rate": 0.00014276577276257482,
"loss": 0.7867,
"step": 16640
},
{
"epoch": 0.186219739292365,
"grad_norm": 0.25161436200141907,
"learning_rate": 0.0001425373475261547,
"loss": 0.8059,
"step": 16650
},
{
"epoch": 0.18633158297962768,
"grad_norm": 0.22897444665431976,
"learning_rate": 0.00014230892228973458,
"loss": 0.7864,
"step": 16660
},
{
"epoch": 0.18644342666689034,
"grad_norm": 0.27164047956466675,
"learning_rate": 0.00014208049705331446,
"loss": 0.796,
"step": 16670
},
{
"epoch": 0.18655527035415304,
"grad_norm": 0.2717941701412201,
"learning_rate": 0.00014185207181689432,
"loss": 0.7801,
"step": 16680
},
{
"epoch": 0.18666711404141573,
"grad_norm": 0.27144837379455566,
"learning_rate": 0.00014162364658047423,
"loss": 0.7758,
"step": 16690
},
{
"epoch": 0.1867789577286784,
"grad_norm": 0.2357831746339798,
"learning_rate": 0.00014139522134405409,
"loss": 0.7674,
"step": 16700
},
{
"epoch": 0.18689080141594108,
"grad_norm": 0.23233544826507568,
"learning_rate": 0.00014116679610763397,
"loss": 0.7827,
"step": 16710
},
{
"epoch": 0.18700264510320377,
"grad_norm": 0.2399321347475052,
"learning_rate": 0.00014093837087121385,
"loss": 0.7811,
"step": 16720
},
{
"epoch": 0.18711448879046644,
"grad_norm": 0.2493642419576645,
"learning_rate": 0.00014070994563479373,
"loss": 0.7762,
"step": 16730
},
{
"epoch": 0.18722633247772913,
"grad_norm": 0.23383350670337677,
"learning_rate": 0.00014048152039837362,
"loss": 0.7754,
"step": 16740
},
{
"epoch": 0.18733817616499182,
"grad_norm": 0.2624364197254181,
"learning_rate": 0.0001402530951619535,
"loss": 0.7766,
"step": 16750
},
{
"epoch": 0.18745001985225448,
"grad_norm": 0.24138151109218597,
"learning_rate": 0.00014002466992553336,
"loss": 0.7869,
"step": 16760
},
{
"epoch": 0.18756186353951718,
"grad_norm": 0.2397204041481018,
"learning_rate": 0.00013979624468911326,
"loss": 0.7974,
"step": 16770
},
{
"epoch": 0.18767370722677984,
"grad_norm": 0.27491655945777893,
"learning_rate": 0.00013956781945269315,
"loss": 0.8011,
"step": 16780
},
{
"epoch": 0.18778555091404253,
"grad_norm": 0.2321402132511139,
"learning_rate": 0.000139339394216273,
"loss": 0.803,
"step": 16790
},
{
"epoch": 0.18789739460130522,
"grad_norm": 0.24487042427062988,
"learning_rate": 0.00013911096897985289,
"loss": 0.7975,
"step": 16800
},
{
"epoch": 0.1880092382885679,
"grad_norm": 0.23328396677970886,
"learning_rate": 0.0001388825437434328,
"loss": 0.795,
"step": 16810
},
{
"epoch": 0.18812108197583058,
"grad_norm": 0.22705566883087158,
"learning_rate": 0.00013865411850701265,
"loss": 0.7895,
"step": 16820
},
{
"epoch": 0.18823292566309327,
"grad_norm": 0.24339929223060608,
"learning_rate": 0.00013842569327059253,
"loss": 0.7931,
"step": 16830
},
{
"epoch": 0.18834476935035593,
"grad_norm": 0.2613057494163513,
"learning_rate": 0.00013819726803417242,
"loss": 0.7785,
"step": 16840
},
{
"epoch": 0.18845661303761863,
"grad_norm": 0.27011603116989136,
"learning_rate": 0.0001379688427977523,
"loss": 0.7853,
"step": 16850
},
{
"epoch": 0.18856845672488132,
"grad_norm": 0.26589342951774597,
"learning_rate": 0.00013774041756133218,
"loss": 0.7893,
"step": 16860
},
{
"epoch": 0.18868030041214398,
"grad_norm": 0.26286208629608154,
"learning_rate": 0.00013751199232491207,
"loss": 0.7707,
"step": 16870
},
{
"epoch": 0.18879214409940667,
"grad_norm": 0.3021993637084961,
"learning_rate": 0.00013728356708849192,
"loss": 0.7896,
"step": 16880
},
{
"epoch": 0.18890398778666936,
"grad_norm": 0.30742523074150085,
"learning_rate": 0.00013705514185207183,
"loss": 0.7895,
"step": 16890
},
{
"epoch": 0.18901583147393203,
"grad_norm": 0.3027999699115753,
"learning_rate": 0.0001368267166156517,
"loss": 0.7839,
"step": 16900
},
{
"epoch": 0.18912767516119472,
"grad_norm": 0.29199281334877014,
"learning_rate": 0.00013659829137923157,
"loss": 0.7771,
"step": 16910
},
{
"epoch": 0.18923951884845738,
"grad_norm": 0.2460477203130722,
"learning_rate": 0.00013636986614281145,
"loss": 0.7823,
"step": 16920
},
{
"epoch": 0.18935136253572007,
"grad_norm": 0.2608555853366852,
"learning_rate": 0.00013614144090639134,
"loss": 0.7664,
"step": 16930
},
{
"epoch": 0.18946320622298277,
"grad_norm": 0.2723162770271301,
"learning_rate": 0.00013591301566997122,
"loss": 0.7768,
"step": 16940
},
{
"epoch": 0.18957504991024543,
"grad_norm": 0.2690962255001068,
"learning_rate": 0.0001356845904335511,
"loss": 0.7697,
"step": 16950
},
{
"epoch": 0.18968689359750812,
"grad_norm": 0.2892717719078064,
"learning_rate": 0.00013545616519713096,
"loss": 0.769,
"step": 16960
},
{
"epoch": 0.1897987372847708,
"grad_norm": 0.2581406533718109,
"learning_rate": 0.00013522773996071087,
"loss": 0.7766,
"step": 16970
},
{
"epoch": 0.18991058097203348,
"grad_norm": 0.2944723963737488,
"learning_rate": 0.00013499931472429075,
"loss": 0.7638,
"step": 16980
},
{
"epoch": 0.19002242465929617,
"grad_norm": 0.2776504158973694,
"learning_rate": 0.0001347708894878706,
"loss": 0.7731,
"step": 16990
},
{
"epoch": 0.19013426834655886,
"grad_norm": 0.267098993062973,
"learning_rate": 0.00013454246425145052,
"loss": 0.7772,
"step": 17000
},
{
"epoch": 0.19024611203382152,
"grad_norm": 0.2806127071380615,
"learning_rate": 0.0001343140390150304,
"loss": 0.772,
"step": 17010
},
{
"epoch": 0.19035795572108422,
"grad_norm": 0.2872319519519806,
"learning_rate": 0.00013408561377861025,
"loss": 0.7695,
"step": 17020
},
{
"epoch": 0.1904697994083469,
"grad_norm": 0.24477818608283997,
"learning_rate": 0.00013385718854219014,
"loss": 0.7764,
"step": 17030
},
{
"epoch": 0.19058164309560957,
"grad_norm": 0.2637476623058319,
"learning_rate": 0.00013362876330577005,
"loss": 0.7712,
"step": 17040
},
{
"epoch": 0.19069348678287226,
"grad_norm": 0.2676442861557007,
"learning_rate": 0.0001334003380693499,
"loss": 0.7707,
"step": 17050
},
{
"epoch": 0.19080533047013493,
"grad_norm": 0.2592306435108185,
"learning_rate": 0.00013317191283292979,
"loss": 0.7808,
"step": 17060
},
{
"epoch": 0.19091717415739762,
"grad_norm": 0.3543199896812439,
"learning_rate": 0.00013294348759650967,
"loss": 0.7928,
"step": 17070
},
{
"epoch": 0.1910290178446603,
"grad_norm": 0.26262548565864563,
"learning_rate": 0.00013271506236008955,
"loss": 0.7677,
"step": 17080
},
{
"epoch": 0.19114086153192297,
"grad_norm": 0.2845424711704254,
"learning_rate": 0.00013248663712366943,
"loss": 0.7758,
"step": 17090
},
{
"epoch": 0.19125270521918566,
"grad_norm": 0.2694297730922699,
"learning_rate": 0.00013225821188724932,
"loss": 0.7857,
"step": 17100
},
{
"epoch": 0.19136454890644836,
"grad_norm": 0.2682325839996338,
"learning_rate": 0.00013202978665082917,
"loss": 0.782,
"step": 17110
},
{
"epoch": 0.19147639259371102,
"grad_norm": 0.26535049080848694,
"learning_rate": 0.00013180136141440908,
"loss": 0.7796,
"step": 17120
},
{
"epoch": 0.1915882362809737,
"grad_norm": 0.2759861946105957,
"learning_rate": 0.00013157293617798894,
"loss": 0.7732,
"step": 17130
},
{
"epoch": 0.1917000799682364,
"grad_norm": 0.24873244762420654,
"learning_rate": 0.00013134451094156882,
"loss": 0.7763,
"step": 17140
},
{
"epoch": 0.19181192365549907,
"grad_norm": 0.2826152443885803,
"learning_rate": 0.0001311160857051487,
"loss": 0.7748,
"step": 17150
},
{
"epoch": 0.19192376734276176,
"grad_norm": 0.2823798358440399,
"learning_rate": 0.00013088766046872859,
"loss": 0.768,
"step": 17160
},
{
"epoch": 0.19203561103002445,
"grad_norm": 0.2591745853424072,
"learning_rate": 0.00013065923523230847,
"loss": 0.7831,
"step": 17170
},
{
"epoch": 0.19214745471728711,
"grad_norm": 0.24773742258548737,
"learning_rate": 0.00013043080999588835,
"loss": 0.7799,
"step": 17180
},
{
"epoch": 0.1922592984045498,
"grad_norm": 0.28184765577316284,
"learning_rate": 0.0001302023847594682,
"loss": 0.787,
"step": 17190
},
{
"epoch": 0.19237114209181247,
"grad_norm": 0.24396668374538422,
"learning_rate": 0.00012997395952304812,
"loss": 0.7777,
"step": 17200
},
{
"epoch": 0.19248298577907516,
"grad_norm": 0.25493332743644714,
"learning_rate": 0.000129745534286628,
"loss": 0.7842,
"step": 17210
},
{
"epoch": 0.19259482946633785,
"grad_norm": 0.2615022361278534,
"learning_rate": 0.00012951710905020786,
"loss": 0.788,
"step": 17220
},
{
"epoch": 0.19270667315360052,
"grad_norm": 0.28270524740219116,
"learning_rate": 0.00012928868381378774,
"loss": 0.7788,
"step": 17230
},
{
"epoch": 0.1928185168408632,
"grad_norm": 0.24917210638523102,
"learning_rate": 0.00012906025857736765,
"loss": 0.7731,
"step": 17240
},
{
"epoch": 0.1929303605281259,
"grad_norm": 0.2589946985244751,
"learning_rate": 0.0001288318333409475,
"loss": 0.7781,
"step": 17250
},
{
"epoch": 0.19304220421538856,
"grad_norm": 0.23770585656166077,
"learning_rate": 0.0001286034081045274,
"loss": 0.7902,
"step": 17260
},
{
"epoch": 0.19315404790265125,
"grad_norm": 0.22782771289348602,
"learning_rate": 0.00012837498286810727,
"loss": 0.7875,
"step": 17270
},
{
"epoch": 0.19326589158991395,
"grad_norm": 0.2611001431941986,
"learning_rate": 0.00012814655763168715,
"loss": 0.794,
"step": 17280
},
{
"epoch": 0.1933777352771766,
"grad_norm": 0.2642746865749359,
"learning_rate": 0.00012791813239526704,
"loss": 0.8005,
"step": 17290
},
{
"epoch": 0.1934895789644393,
"grad_norm": 0.2470688372850418,
"learning_rate": 0.00012768970715884692,
"loss": 0.7854,
"step": 17300
},
{
"epoch": 0.193601422651702,
"grad_norm": 0.24735964834690094,
"learning_rate": 0.00012746128192242677,
"loss": 0.7918,
"step": 17310
},
{
"epoch": 0.19371326633896466,
"grad_norm": 0.2734208405017853,
"learning_rate": 0.00012723285668600668,
"loss": 0.7719,
"step": 17320
},
{
"epoch": 0.19382511002622735,
"grad_norm": 0.28373652696609497,
"learning_rate": 0.00012700443144958657,
"loss": 0.7743,
"step": 17330
},
{
"epoch": 0.19393695371349004,
"grad_norm": 0.25755295157432556,
"learning_rate": 0.00012677600621316642,
"loss": 0.7761,
"step": 17340
},
{
"epoch": 0.1940487974007527,
"grad_norm": 0.2918241322040558,
"learning_rate": 0.0001265475809767463,
"loss": 0.7885,
"step": 17350
},
{
"epoch": 0.1941606410880154,
"grad_norm": 0.2589518427848816,
"learning_rate": 0.0001263191557403262,
"loss": 0.7781,
"step": 17360
},
{
"epoch": 0.19427248477527806,
"grad_norm": 0.2941739857196808,
"learning_rate": 0.00012609073050390607,
"loss": 0.7896,
"step": 17370
},
{
"epoch": 0.19438432846254075,
"grad_norm": 0.2625831663608551,
"learning_rate": 0.00012586230526748595,
"loss": 0.7797,
"step": 17380
},
{
"epoch": 0.19449617214980344,
"grad_norm": 0.2731517255306244,
"learning_rate": 0.0001256338800310658,
"loss": 0.7861,
"step": 17390
},
{
"epoch": 0.1946080158370661,
"grad_norm": 0.2802453637123108,
"learning_rate": 0.00012540545479464572,
"loss": 0.8066,
"step": 17400
},
{
"epoch": 0.1947198595243288,
"grad_norm": 0.24151596426963806,
"learning_rate": 0.0001251770295582256,
"loss": 0.7746,
"step": 17410
},
{
"epoch": 0.1948317032115915,
"grad_norm": 0.27006617188453674,
"learning_rate": 0.00012494860432180549,
"loss": 0.7796,
"step": 17420
},
{
"epoch": 0.19494354689885415,
"grad_norm": 0.2574283480644226,
"learning_rate": 0.00012472017908538537,
"loss": 0.7809,
"step": 17430
},
{
"epoch": 0.19505539058611684,
"grad_norm": 0.25741514563560486,
"learning_rate": 0.00012449175384896522,
"loss": 0.7792,
"step": 17440
},
{
"epoch": 0.19516723427337954,
"grad_norm": 0.2619360685348511,
"learning_rate": 0.00012426332861254513,
"loss": 0.7768,
"step": 17450
},
{
"epoch": 0.1952790779606422,
"grad_norm": 0.28053224086761475,
"learning_rate": 0.000124034903376125,
"loss": 0.7841,
"step": 17460
},
{
"epoch": 0.1953909216479049,
"grad_norm": 0.24019859731197357,
"learning_rate": 0.00012380647813970487,
"loss": 0.783,
"step": 17470
},
{
"epoch": 0.19550276533516758,
"grad_norm": 0.2747540771961212,
"learning_rate": 0.00012357805290328475,
"loss": 0.7911,
"step": 17480
},
{
"epoch": 0.19561460902243025,
"grad_norm": 0.28044483065605164,
"learning_rate": 0.00012334962766686464,
"loss": 0.7986,
"step": 17490
},
{
"epoch": 0.19572645270969294,
"grad_norm": 0.24908137321472168,
"learning_rate": 0.00012312120243044452,
"loss": 0.8087,
"step": 17500
},
{
"epoch": 0.1958382963969556,
"grad_norm": 0.29041793942451477,
"learning_rate": 0.0001228927771940244,
"loss": 0.8063,
"step": 17510
},
{
"epoch": 0.1959501400842183,
"grad_norm": 0.3020537495613098,
"learning_rate": 0.00012266435195760429,
"loss": 0.8004,
"step": 17520
},
{
"epoch": 0.19606198377148099,
"grad_norm": 0.29414400458335876,
"learning_rate": 0.00012243592672118417,
"loss": 0.7846,
"step": 17530
},
{
"epoch": 0.19617382745874365,
"grad_norm": 0.2648397386074066,
"learning_rate": 0.00012220750148476402,
"loss": 0.7708,
"step": 17540
},
{
"epoch": 0.19628567114600634,
"grad_norm": 0.2834302484989166,
"learning_rate": 0.00012197907624834392,
"loss": 0.7818,
"step": 17550
},
{
"epoch": 0.19639751483326903,
"grad_norm": 0.2748505175113678,
"learning_rate": 0.0001217506510119238,
"loss": 0.7642,
"step": 17560
},
{
"epoch": 0.1965093585205317,
"grad_norm": 0.32425326108932495,
"learning_rate": 0.00012152222577550367,
"loss": 0.7765,
"step": 17570
},
{
"epoch": 0.1966212022077944,
"grad_norm": 0.27183324098587036,
"learning_rate": 0.00012129380053908357,
"loss": 0.7572,
"step": 17580
},
{
"epoch": 0.19673304589505708,
"grad_norm": 0.28190943598747253,
"learning_rate": 0.00012106537530266344,
"loss": 0.7571,
"step": 17590
},
{
"epoch": 0.19684488958231974,
"grad_norm": 0.5151196718215942,
"learning_rate": 0.00012083695006624332,
"loss": 0.7565,
"step": 17600
},
{
"epoch": 0.19695673326958243,
"grad_norm": 0.2523132264614105,
"learning_rate": 0.0001206085248298232,
"loss": 0.7597,
"step": 17610
},
{
"epoch": 0.19706857695684513,
"grad_norm": 0.27336063981056213,
"learning_rate": 0.00012038009959340309,
"loss": 0.7546,
"step": 17620
},
{
"epoch": 0.1971804206441078,
"grad_norm": 0.25119057297706604,
"learning_rate": 0.00012015167435698296,
"loss": 0.7519,
"step": 17630
},
{
"epoch": 0.19729226433137048,
"grad_norm": 0.281147301197052,
"learning_rate": 0.00011992324912056284,
"loss": 0.7623,
"step": 17640
},
{
"epoch": 0.19740410801863315,
"grad_norm": 0.2463361769914627,
"learning_rate": 0.00011969482388414272,
"loss": 0.754,
"step": 17650
},
{
"epoch": 0.19751595170589584,
"grad_norm": 0.2902059853076935,
"learning_rate": 0.0001194663986477226,
"loss": 0.7578,
"step": 17660
},
{
"epoch": 0.19762779539315853,
"grad_norm": 0.2590588629245758,
"learning_rate": 0.00011923797341130247,
"loss": 0.7427,
"step": 17670
},
{
"epoch": 0.1977396390804212,
"grad_norm": 0.24349506199359894,
"learning_rate": 0.00011900954817488237,
"loss": 0.7599,
"step": 17680
},
{
"epoch": 0.19785148276768388,
"grad_norm": 0.2568139135837555,
"learning_rate": 0.00011878112293846224,
"loss": 0.7673,
"step": 17690
},
{
"epoch": 0.19796332645494658,
"grad_norm": 0.2617419958114624,
"learning_rate": 0.00011855269770204212,
"loss": 0.7637,
"step": 17700
},
{
"epoch": 0.19807517014220924,
"grad_norm": 0.24309082329273224,
"learning_rate": 0.000118324272465622,
"loss": 0.7583,
"step": 17710
},
{
"epoch": 0.19818701382947193,
"grad_norm": 0.22027656435966492,
"learning_rate": 0.00011809584722920189,
"loss": 0.7479,
"step": 17720
},
{
"epoch": 0.19829885751673462,
"grad_norm": 0.27296265959739685,
"learning_rate": 0.00011786742199278176,
"loss": 0.765,
"step": 17730
},
{
"epoch": 0.1984107012039973,
"grad_norm": 0.2589128613471985,
"learning_rate": 0.00011763899675636165,
"loss": 0.777,
"step": 17740
},
{
"epoch": 0.19852254489125998,
"grad_norm": 0.27665242552757263,
"learning_rate": 0.00011741057151994152,
"loss": 0.7656,
"step": 17750
},
{
"epoch": 0.19863438857852267,
"grad_norm": 0.27103251218795776,
"learning_rate": 0.0001171821462835214,
"loss": 0.7716,
"step": 17760
},
{
"epoch": 0.19874623226578533,
"grad_norm": 0.2768172025680542,
"learning_rate": 0.00011695372104710127,
"loss": 0.7738,
"step": 17770
},
{
"epoch": 0.19885807595304802,
"grad_norm": 0.2424757182598114,
"learning_rate": 0.00011672529581068117,
"loss": 0.7793,
"step": 17780
},
{
"epoch": 0.1989699196403107,
"grad_norm": 0.2821860909461975,
"learning_rate": 0.00011649687057426104,
"loss": 0.7771,
"step": 17790
},
{
"epoch": 0.19908176332757338,
"grad_norm": 0.28263264894485474,
"learning_rate": 0.00011626844533784092,
"loss": 0.7812,
"step": 17800
},
{
"epoch": 0.19919360701483607,
"grad_norm": 0.24835869669914246,
"learning_rate": 0.0001160400201014208,
"loss": 0.7753,
"step": 17810
},
{
"epoch": 0.19930545070209874,
"grad_norm": 0.23325562477111816,
"learning_rate": 0.00011581159486500069,
"loss": 0.7763,
"step": 17820
},
{
"epoch": 0.19941729438936143,
"grad_norm": 0.2520182132720947,
"learning_rate": 0.00011558316962858056,
"loss": 0.791,
"step": 17830
},
{
"epoch": 0.19952913807662412,
"grad_norm": 0.2478768676519394,
"learning_rate": 0.00011535474439216045,
"loss": 0.7819,
"step": 17840
},
{
"epoch": 0.19964098176388678,
"grad_norm": 0.2749478220939636,
"learning_rate": 0.00011512631915574032,
"loss": 0.7805,
"step": 17850
},
{
"epoch": 0.19975282545114947,
"grad_norm": 0.2417723685503006,
"learning_rate": 0.0001148978939193202,
"loss": 0.766,
"step": 17860
},
{
"epoch": 0.19986466913841217,
"grad_norm": 0.25219354033470154,
"learning_rate": 0.00011466946868290008,
"loss": 0.758,
"step": 17870
},
{
"epoch": 0.19997651282567483,
"grad_norm": 0.24644000828266144,
"learning_rate": 0.00011444104344647997,
"loss": 0.7569,
"step": 17880
},
{
"epoch": 0.20008835651293752,
"grad_norm": 0.2683338224887848,
"learning_rate": 0.00011421261821005986,
"loss": 0.7509,
"step": 17890
},
{
"epoch": 0.2002002002002002,
"grad_norm": 0.29149681329727173,
"learning_rate": 0.00011398419297363972,
"loss": 0.7611,
"step": 17900
},
{
"epoch": 0.20031204388746288,
"grad_norm": 0.2651118338108063,
"learning_rate": 0.00011375576773721962,
"loss": 0.756,
"step": 17910
},
{
"epoch": 0.20042388757472557,
"grad_norm": 0.26990607380867004,
"learning_rate": 0.00011352734250079949,
"loss": 0.7726,
"step": 17920
},
{
"epoch": 0.20053573126198823,
"grad_norm": 0.23897935450077057,
"learning_rate": 0.00011329891726437937,
"loss": 0.7875,
"step": 17930
},
{
"epoch": 0.20064757494925092,
"grad_norm": 0.2300727218389511,
"learning_rate": 0.00011307049202795926,
"loss": 0.7697,
"step": 17940
},
{
"epoch": 0.20075941863651361,
"grad_norm": 0.2873596251010895,
"learning_rate": 0.00011284206679153914,
"loss": 0.7776,
"step": 17950
},
{
"epoch": 0.20087126232377628,
"grad_norm": 0.29036712646484375,
"learning_rate": 0.00011261364155511901,
"loss": 0.7794,
"step": 17960
},
{
"epoch": 0.20098310601103897,
"grad_norm": 0.2837420701980591,
"learning_rate": 0.0001123852163186989,
"loss": 0.7818,
"step": 17970
},
{
"epoch": 0.20109494969830166,
"grad_norm": 0.2920686602592468,
"learning_rate": 0.00011215679108227877,
"loss": 0.7851,
"step": 17980
},
{
"epoch": 0.20120679338556433,
"grad_norm": 0.27664583921432495,
"learning_rate": 0.00011192836584585866,
"loss": 0.7601,
"step": 17990
},
{
"epoch": 0.20131863707282702,
"grad_norm": 0.26870399713516235,
"learning_rate": 0.00011169994060943853,
"loss": 0.7961,
"step": 18000
},
{
"epoch": 0.2014304807600897,
"grad_norm": 0.2502228021621704,
"learning_rate": 0.00011147151537301842,
"loss": 0.7827,
"step": 18010
},
{
"epoch": 0.20154232444735237,
"grad_norm": 0.2473440319299698,
"learning_rate": 0.00011124309013659829,
"loss": 0.7815,
"step": 18020
},
{
"epoch": 0.20165416813461506,
"grad_norm": 0.2513076663017273,
"learning_rate": 0.00011101466490017817,
"loss": 0.7675,
"step": 18030
},
{
"epoch": 0.20176601182187776,
"grad_norm": 0.2829226851463318,
"learning_rate": 0.00011078623966375806,
"loss": 0.7669,
"step": 18040
},
{
"epoch": 0.20187785550914042,
"grad_norm": 0.25758418440818787,
"learning_rate": 0.00011055781442733794,
"loss": 0.7707,
"step": 18050
},
{
"epoch": 0.2019896991964031,
"grad_norm": 0.27185285091400146,
"learning_rate": 0.00011032938919091781,
"loss": 0.7742,
"step": 18060
},
{
"epoch": 0.2021015428836658,
"grad_norm": 0.2802230417728424,
"learning_rate": 0.0001101009639544977,
"loss": 0.7821,
"step": 18070
},
{
"epoch": 0.20221338657092847,
"grad_norm": 0.2882921099662781,
"learning_rate": 0.00010987253871807757,
"loss": 0.779,
"step": 18080
},
{
"epoch": 0.20232523025819116,
"grad_norm": 0.2569839358329773,
"learning_rate": 0.00010964411348165746,
"loss": 0.7694,
"step": 18090
},
{
"epoch": 0.20243707394545382,
"grad_norm": 0.2600938379764557,
"learning_rate": 0.00010941568824523733,
"loss": 0.7781,
"step": 18100
},
{
"epoch": 0.2025489176327165,
"grad_norm": 0.28083154559135437,
"learning_rate": 0.00010918726300881722,
"loss": 0.7799,
"step": 18110
},
{
"epoch": 0.2026607613199792,
"grad_norm": 0.22990182042121887,
"learning_rate": 0.00010895883777239709,
"loss": 0.7883,
"step": 18120
},
{
"epoch": 0.20277260500724187,
"grad_norm": 0.27432581782341003,
"learning_rate": 0.00010873041253597697,
"loss": 0.7942,
"step": 18130
},
{
"epoch": 0.20288444869450456,
"grad_norm": 0.2607738971710205,
"learning_rate": 0.00010850198729955686,
"loss": 0.7877,
"step": 18140
},
{
"epoch": 0.20299629238176725,
"grad_norm": 0.2818219065666199,
"learning_rate": 0.00010827356206313674,
"loss": 0.7948,
"step": 18150
},
{
"epoch": 0.20310813606902992,
"grad_norm": 0.2751563489437103,
"learning_rate": 0.00010804513682671661,
"loss": 0.7836,
"step": 18160
},
{
"epoch": 0.2032199797562926,
"grad_norm": 0.2746957242488861,
"learning_rate": 0.0001078167115902965,
"loss": 0.7693,
"step": 18170
},
{
"epoch": 0.2033318234435553,
"grad_norm": 0.24990054965019226,
"learning_rate": 0.00010758828635387638,
"loss": 0.7869,
"step": 18180
},
{
"epoch": 0.20344366713081796,
"grad_norm": 0.24581623077392578,
"learning_rate": 0.00010735986111745626,
"loss": 0.768,
"step": 18190
},
{
"epoch": 0.20355551081808065,
"grad_norm": 0.26637768745422363,
"learning_rate": 0.00010713143588103613,
"loss": 0.7711,
"step": 18200
},
{
"epoch": 0.20366735450534335,
"grad_norm": 0.2510250508785248,
"learning_rate": 0.00010690301064461602,
"loss": 0.7748,
"step": 18210
},
{
"epoch": 0.203779198192606,
"grad_norm": 0.2378496378660202,
"learning_rate": 0.00010667458540819589,
"loss": 0.7622,
"step": 18220
},
{
"epoch": 0.2038910418798687,
"grad_norm": 0.2507869601249695,
"learning_rate": 0.00010644616017177578,
"loss": 0.7739,
"step": 18230
},
{
"epoch": 0.20400288556713136,
"grad_norm": 0.24733096361160278,
"learning_rate": 0.00010621773493535566,
"loss": 0.7508,
"step": 18240
},
{
"epoch": 0.20411472925439406,
"grad_norm": 0.23383109271526337,
"learning_rate": 0.00010598930969893554,
"loss": 0.7507,
"step": 18250
},
{
"epoch": 0.20422657294165675,
"grad_norm": 0.2543237805366516,
"learning_rate": 0.00010576088446251541,
"loss": 0.7578,
"step": 18260
},
{
"epoch": 0.2043384166289194,
"grad_norm": 0.25807520747184753,
"learning_rate": 0.00010553245922609531,
"loss": 0.7513,
"step": 18270
},
{
"epoch": 0.2044502603161821,
"grad_norm": 0.23354406654834747,
"learning_rate": 0.00010530403398967518,
"loss": 0.7566,
"step": 18280
},
{
"epoch": 0.2045621040034448,
"grad_norm": 0.2685154676437378,
"learning_rate": 0.00010507560875325506,
"loss": 0.758,
"step": 18290
},
{
"epoch": 0.20467394769070746,
"grad_norm": 0.24349918961524963,
"learning_rate": 0.00010484718351683494,
"loss": 0.7686,
"step": 18300
},
{
"epoch": 0.20478579137797015,
"grad_norm": 0.24823498725891113,
"learning_rate": 0.00010461875828041482,
"loss": 0.7659,
"step": 18310
},
{
"epoch": 0.20489763506523284,
"grad_norm": 0.2511804401874542,
"learning_rate": 0.0001043903330439947,
"loss": 0.77,
"step": 18320
},
{
"epoch": 0.2050094787524955,
"grad_norm": 0.24065516889095306,
"learning_rate": 0.00010416190780757458,
"loss": 0.7677,
"step": 18330
},
{
"epoch": 0.2051213224397582,
"grad_norm": 0.2819323241710663,
"learning_rate": 0.00010393348257115447,
"loss": 0.753,
"step": 18340
},
{
"epoch": 0.2052331661270209,
"grad_norm": 0.26467952132225037,
"learning_rate": 0.00010370505733473434,
"loss": 0.7826,
"step": 18350
},
{
"epoch": 0.20534500981428355,
"grad_norm": 0.22962163388729095,
"learning_rate": 0.00010347663209831423,
"loss": 0.7683,
"step": 18360
},
{
"epoch": 0.20545685350154624,
"grad_norm": 0.2582736611366272,
"learning_rate": 0.00010324820686189411,
"loss": 0.7951,
"step": 18370
},
{
"epoch": 0.2055686971888089,
"grad_norm": 0.2352149486541748,
"learning_rate": 0.00010301978162547399,
"loss": 0.7577,
"step": 18380
},
{
"epoch": 0.2056805408760716,
"grad_norm": 0.25687554478645325,
"learning_rate": 0.00010279135638905386,
"loss": 0.7696,
"step": 18390
},
{
"epoch": 0.2057923845633343,
"grad_norm": 0.2579772472381592,
"learning_rate": 0.00010256293115263376,
"loss": 0.7837,
"step": 18400
},
{
"epoch": 0.20590422825059695,
"grad_norm": 0.24537009000778198,
"learning_rate": 0.00010233450591621363,
"loss": 0.7799,
"step": 18410
},
{
"epoch": 0.20601607193785965,
"grad_norm": 0.2636966109275818,
"learning_rate": 0.00010210608067979351,
"loss": 0.7588,
"step": 18420
},
{
"epoch": 0.20612791562512234,
"grad_norm": 0.30670562386512756,
"learning_rate": 0.00010187765544337338,
"loss": 0.771,
"step": 18430
},
{
"epoch": 0.206239759312385,
"grad_norm": 0.28400668501853943,
"learning_rate": 0.00010164923020695327,
"loss": 0.7686,
"step": 18440
},
{
"epoch": 0.2063516029996477,
"grad_norm": 0.27395951747894287,
"learning_rate": 0.00010142080497053314,
"loss": 0.776,
"step": 18450
},
{
"epoch": 0.20646344668691038,
"grad_norm": 0.284868061542511,
"learning_rate": 0.00010119237973411303,
"loss": 0.7864,
"step": 18460
},
{
"epoch": 0.20657529037417305,
"grad_norm": 0.2859087586402893,
"learning_rate": 0.00010096395449769291,
"loss": 0.7749,
"step": 18470
},
{
"epoch": 0.20668713406143574,
"grad_norm": 0.28758034110069275,
"learning_rate": 0.00010073552926127279,
"loss": 0.7919,
"step": 18480
},
{
"epoch": 0.20679897774869843,
"grad_norm": 0.2752404510974884,
"learning_rate": 0.00010050710402485266,
"loss": 0.7808,
"step": 18490
},
{
"epoch": 0.2069108214359611,
"grad_norm": 0.30756843090057373,
"learning_rate": 0.00010027867878843256,
"loss": 0.7734,
"step": 18500
},
{
"epoch": 0.2070226651232238,
"grad_norm": 0.2694368064403534,
"learning_rate": 0.00010005025355201243,
"loss": 0.7751,
"step": 18510
},
{
"epoch": 0.20713450881048645,
"grad_norm": 0.25838834047317505,
"learning_rate": 9.982182831559231e-05,
"loss": 0.7686,
"step": 18520
},
{
"epoch": 0.20724635249774914,
"grad_norm": 0.257729709148407,
"learning_rate": 9.959340307917219e-05,
"loss": 0.7827,
"step": 18530
},
{
"epoch": 0.20735819618501183,
"grad_norm": 0.2938844859600067,
"learning_rate": 9.936497784275208e-05,
"loss": 0.7685,
"step": 18540
},
{
"epoch": 0.2074700398722745,
"grad_norm": 0.25894027948379517,
"learning_rate": 9.913655260633194e-05,
"loss": 0.7738,
"step": 18550
},
{
"epoch": 0.2075818835595372,
"grad_norm": 0.2751148045063019,
"learning_rate": 9.890812736991183e-05,
"loss": 0.7594,
"step": 18560
},
{
"epoch": 0.20769372724679988,
"grad_norm": 0.28643253445625305,
"learning_rate": 9.867970213349171e-05,
"loss": 0.7737,
"step": 18570
},
{
"epoch": 0.20780557093406254,
"grad_norm": 0.2575749158859253,
"learning_rate": 9.845127689707159e-05,
"loss": 0.7778,
"step": 18580
},
{
"epoch": 0.20791741462132524,
"grad_norm": 0.27625295519828796,
"learning_rate": 9.822285166065146e-05,
"loss": 0.7716,
"step": 18590
},
{
"epoch": 0.20802925830858793,
"grad_norm": 0.2803322672843933,
"learning_rate": 9.799442642423136e-05,
"loss": 0.7805,
"step": 18600
},
{
"epoch": 0.2081411019958506,
"grad_norm": 0.2567484676837921,
"learning_rate": 9.776600118781123e-05,
"loss": 0.7633,
"step": 18610
},
{
"epoch": 0.20825294568311328,
"grad_norm": 0.28193768858909607,
"learning_rate": 9.753757595139111e-05,
"loss": 0.7895,
"step": 18620
},
{
"epoch": 0.20836478937037597,
"grad_norm": 0.28459542989730835,
"learning_rate": 9.7309150714971e-05,
"loss": 0.7741,
"step": 18630
},
{
"epoch": 0.20847663305763864,
"grad_norm": 0.28346261382102966,
"learning_rate": 9.708072547855088e-05,
"loss": 0.7813,
"step": 18640
},
{
"epoch": 0.20858847674490133,
"grad_norm": 0.2818828523159027,
"learning_rate": 9.685230024213075e-05,
"loss": 0.7755,
"step": 18650
},
{
"epoch": 0.208700320432164,
"grad_norm": 0.28914326429367065,
"learning_rate": 9.662387500571063e-05,
"loss": 0.7798,
"step": 18660
},
{
"epoch": 0.20881216411942669,
"grad_norm": 0.2600755989551544,
"learning_rate": 9.639544976929051e-05,
"loss": 0.7758,
"step": 18670
},
{
"epoch": 0.20892400780668938,
"grad_norm": 0.2726733088493347,
"learning_rate": 9.61670245328704e-05,
"loss": 0.7769,
"step": 18680
},
{
"epoch": 0.20903585149395204,
"grad_norm": 0.23421594500541687,
"learning_rate": 9.593859929645026e-05,
"loss": 0.758,
"step": 18690
},
{
"epoch": 0.20914769518121473,
"grad_norm": 0.29468339681625366,
"learning_rate": 9.571017406003016e-05,
"loss": 0.7746,
"step": 18700
},
{
"epoch": 0.20925953886847742,
"grad_norm": 0.29477235674858093,
"learning_rate": 9.548174882361003e-05,
"loss": 0.7633,
"step": 18710
},
{
"epoch": 0.2093713825557401,
"grad_norm": 0.2564197778701782,
"learning_rate": 9.525332358718991e-05,
"loss": 0.7541,
"step": 18720
},
{
"epoch": 0.20948322624300278,
"grad_norm": 0.2745250165462494,
"learning_rate": 9.50248983507698e-05,
"loss": 0.7887,
"step": 18730
},
{
"epoch": 0.20959506993026547,
"grad_norm": 0.2572060525417328,
"learning_rate": 9.479647311434968e-05,
"loss": 0.774,
"step": 18740
},
{
"epoch": 0.20970691361752813,
"grad_norm": 0.28513193130493164,
"learning_rate": 9.456804787792955e-05,
"loss": 0.7871,
"step": 18750
},
{
"epoch": 0.20981875730479083,
"grad_norm": 0.2643887400627136,
"learning_rate": 9.433962264150944e-05,
"loss": 0.77,
"step": 18760
},
{
"epoch": 0.20993060099205352,
"grad_norm": 0.27534207701683044,
"learning_rate": 9.411119740508931e-05,
"loss": 0.7775,
"step": 18770
},
{
"epoch": 0.21004244467931618,
"grad_norm": 0.2620585858821869,
"learning_rate": 9.38827721686692e-05,
"loss": 0.7808,
"step": 18780
},
{
"epoch": 0.21015428836657887,
"grad_norm": 0.2759549915790558,
"learning_rate": 9.365434693224908e-05,
"loss": 0.7642,
"step": 18790
},
{
"epoch": 0.21026613205384156,
"grad_norm": 0.2919774353504181,
"learning_rate": 9.342592169582896e-05,
"loss": 0.7828,
"step": 18800
},
{
"epoch": 0.21037797574110423,
"grad_norm": 0.2717173099517822,
"learning_rate": 9.319749645940884e-05,
"loss": 0.7513,
"step": 18810
},
{
"epoch": 0.21048981942836692,
"grad_norm": 0.2662122845649719,
"learning_rate": 9.296907122298871e-05,
"loss": 0.7668,
"step": 18820
},
{
"epoch": 0.21060166311562958,
"grad_norm": 0.26051005721092224,
"learning_rate": 9.274064598656861e-05,
"loss": 0.7676,
"step": 18830
},
{
"epoch": 0.21071350680289228,
"grad_norm": 0.27510005235671997,
"learning_rate": 9.251222075014848e-05,
"loss": 0.7507,
"step": 18840
},
{
"epoch": 0.21082535049015497,
"grad_norm": 0.23877868056297302,
"learning_rate": 9.228379551372836e-05,
"loss": 0.7535,
"step": 18850
},
{
"epoch": 0.21093719417741763,
"grad_norm": 0.256104439496994,
"learning_rate": 9.205537027730824e-05,
"loss": 0.7546,
"step": 18860
},
{
"epoch": 0.21104903786468032,
"grad_norm": 0.2829015552997589,
"learning_rate": 9.182694504088813e-05,
"loss": 0.7588,
"step": 18870
},
{
"epoch": 0.211160881551943,
"grad_norm": 0.22898368537425995,
"learning_rate": 9.1598519804468e-05,
"loss": 0.7551,
"step": 18880
},
{
"epoch": 0.21127272523920568,
"grad_norm": 0.23679418861865997,
"learning_rate": 9.137009456804788e-05,
"loss": 0.7718,
"step": 18890
},
{
"epoch": 0.21138456892646837,
"grad_norm": 0.2878457009792328,
"learning_rate": 9.114166933162776e-05,
"loss": 0.7593,
"step": 18900
},
{
"epoch": 0.21149641261373106,
"grad_norm": 0.2936013638973236,
"learning_rate": 9.091324409520764e-05,
"loss": 0.7713,
"step": 18910
},
{
"epoch": 0.21160825630099372,
"grad_norm": 0.26062774658203125,
"learning_rate": 9.068481885878751e-05,
"loss": 0.7763,
"step": 18920
},
{
"epoch": 0.21172009998825642,
"grad_norm": 0.3092271685600281,
"learning_rate": 9.045639362236741e-05,
"loss": 0.7807,
"step": 18930
},
{
"epoch": 0.2118319436755191,
"grad_norm": 0.23566113412380219,
"learning_rate": 9.022796838594728e-05,
"loss": 0.7779,
"step": 18940
},
{
"epoch": 0.21194378736278177,
"grad_norm": 0.27366477251052856,
"learning_rate": 8.999954314952716e-05,
"loss": 0.77,
"step": 18950
},
{
"epoch": 0.21205563105004446,
"grad_norm": 0.23270778357982635,
"learning_rate": 8.977111791310704e-05,
"loss": 0.7549,
"step": 18960
},
{
"epoch": 0.21216747473730713,
"grad_norm": 0.28785306215286255,
"learning_rate": 8.954269267668693e-05,
"loss": 0.7677,
"step": 18970
},
{
"epoch": 0.21227931842456982,
"grad_norm": 0.2588510811328888,
"learning_rate": 8.93142674402668e-05,
"loss": 0.7715,
"step": 18980
},
{
"epoch": 0.2123911621118325,
"grad_norm": 0.248029887676239,
"learning_rate": 8.908584220384668e-05,
"loss": 0.7749,
"step": 18990
},
{
"epoch": 0.21250300579909517,
"grad_norm": 0.2579936981201172,
"learning_rate": 8.885741696742656e-05,
"loss": 0.7552,
"step": 19000
},
{
"epoch": 0.21261484948635787,
"grad_norm": 0.26293206214904785,
"learning_rate": 8.862899173100645e-05,
"loss": 0.7657,
"step": 19010
},
{
"epoch": 0.21272669317362056,
"grad_norm": 0.24589793384075165,
"learning_rate": 8.840056649458631e-05,
"loss": 0.7598,
"step": 19020
},
{
"epoch": 0.21283853686088322,
"grad_norm": 0.2315252274274826,
"learning_rate": 8.817214125816621e-05,
"loss": 0.7637,
"step": 19030
},
{
"epoch": 0.2129503805481459,
"grad_norm": 0.2538358271121979,
"learning_rate": 8.794371602174608e-05,
"loss": 0.7587,
"step": 19040
},
{
"epoch": 0.2130622242354086,
"grad_norm": 0.2626616060733795,
"learning_rate": 8.771529078532596e-05,
"loss": 0.7597,
"step": 19050
},
{
"epoch": 0.21317406792267127,
"grad_norm": 0.2557279169559479,
"learning_rate": 8.748686554890585e-05,
"loss": 0.7499,
"step": 19060
},
{
"epoch": 0.21328591160993396,
"grad_norm": 0.25008153915405273,
"learning_rate": 8.725844031248573e-05,
"loss": 0.7466,
"step": 19070
},
{
"epoch": 0.21339775529719665,
"grad_norm": 0.2647120952606201,
"learning_rate": 8.70300150760656e-05,
"loss": 0.7574,
"step": 19080
},
{
"epoch": 0.21350959898445931,
"grad_norm": 0.2535738945007324,
"learning_rate": 8.68015898396455e-05,
"loss": 0.7672,
"step": 19090
},
{
"epoch": 0.213621442671722,
"grad_norm": 0.28925755620002747,
"learning_rate": 8.657316460322536e-05,
"loss": 0.7692,
"step": 19100
},
{
"epoch": 0.21373328635898467,
"grad_norm": 0.26770591735839844,
"learning_rate": 8.634473936680525e-05,
"loss": 0.7511,
"step": 19110
},
{
"epoch": 0.21384513004624736,
"grad_norm": 0.25162947177886963,
"learning_rate": 8.611631413038512e-05,
"loss": 0.7573,
"step": 19120
},
{
"epoch": 0.21395697373351005,
"grad_norm": 0.253324031829834,
"learning_rate": 8.588788889396501e-05,
"loss": 0.7516,
"step": 19130
},
{
"epoch": 0.21406881742077272,
"grad_norm": 0.2784843146800995,
"learning_rate": 8.565946365754488e-05,
"loss": 0.7522,
"step": 19140
},
{
"epoch": 0.2141806611080354,
"grad_norm": 0.2869722247123718,
"learning_rate": 8.543103842112476e-05,
"loss": 0.7525,
"step": 19150
},
{
"epoch": 0.2142925047952981,
"grad_norm": 0.2467101663351059,
"learning_rate": 8.520261318470465e-05,
"loss": 0.7336,
"step": 19160
},
{
"epoch": 0.21440434848256076,
"grad_norm": 0.26108691096305847,
"learning_rate": 8.497418794828453e-05,
"loss": 0.751,
"step": 19170
},
{
"epoch": 0.21451619216982346,
"grad_norm": 0.2992580533027649,
"learning_rate": 8.47457627118644e-05,
"loss": 0.7599,
"step": 19180
},
{
"epoch": 0.21462803585708615,
"grad_norm": 0.2573351562023163,
"learning_rate": 8.45173374754443e-05,
"loss": 0.752,
"step": 19190
},
{
"epoch": 0.2147398795443488,
"grad_norm": 0.30148234963417053,
"learning_rate": 8.428891223902416e-05,
"loss": 0.7536,
"step": 19200
},
{
"epoch": 0.2148517232316115,
"grad_norm": 0.2811321020126343,
"learning_rate": 8.406048700260405e-05,
"loss": 0.761,
"step": 19210
},
{
"epoch": 0.2149635669188742,
"grad_norm": 0.2792038321495056,
"learning_rate": 8.383206176618392e-05,
"loss": 0.7558,
"step": 19220
},
{
"epoch": 0.21507541060613686,
"grad_norm": 0.30432426929473877,
"learning_rate": 8.360363652976381e-05,
"loss": 0.7541,
"step": 19230
},
{
"epoch": 0.21518725429339955,
"grad_norm": 0.28335481882095337,
"learning_rate": 8.33752112933437e-05,
"loss": 0.7628,
"step": 19240
},
{
"epoch": 0.2152990979806622,
"grad_norm": 0.28402864933013916,
"learning_rate": 8.314678605692357e-05,
"loss": 0.7835,
"step": 19250
},
{
"epoch": 0.2154109416679249,
"grad_norm": 0.2914164662361145,
"learning_rate": 8.291836082050346e-05,
"loss": 0.7705,
"step": 19260
},
{
"epoch": 0.2155227853551876,
"grad_norm": 0.27296769618988037,
"learning_rate": 8.268993558408333e-05,
"loss": 0.7791,
"step": 19270
},
{
"epoch": 0.21563462904245026,
"grad_norm": 0.2987435460090637,
"learning_rate": 8.246151034766321e-05,
"loss": 0.7918,
"step": 19280
},
{
"epoch": 0.21574647272971295,
"grad_norm": 0.2743736207485199,
"learning_rate": 8.22330851112431e-05,
"loss": 0.7777,
"step": 19290
},
{
"epoch": 0.21585831641697564,
"grad_norm": 0.2775188982486725,
"learning_rate": 8.200465987482298e-05,
"loss": 0.7811,
"step": 19300
},
{
"epoch": 0.2159701601042383,
"grad_norm": 0.2942585349082947,
"learning_rate": 8.177623463840285e-05,
"loss": 0.7748,
"step": 19310
},
{
"epoch": 0.216082003791501,
"grad_norm": 0.2545025050640106,
"learning_rate": 8.154780940198274e-05,
"loss": 0.77,
"step": 19320
},
{
"epoch": 0.2161938474787637,
"grad_norm": 0.2571526765823364,
"learning_rate": 8.131938416556261e-05,
"loss": 0.7735,
"step": 19330
},
{
"epoch": 0.21630569116602635,
"grad_norm": 0.2687735855579376,
"learning_rate": 8.10909589291425e-05,
"loss": 0.7703,
"step": 19340
},
{
"epoch": 0.21641753485328905,
"grad_norm": 0.27332374453544617,
"learning_rate": 8.086253369272237e-05,
"loss": 0.7645,
"step": 19350
},
{
"epoch": 0.21652937854055174,
"grad_norm": 0.25585636496543884,
"learning_rate": 8.063410845630226e-05,
"loss": 0.7651,
"step": 19360
},
{
"epoch": 0.2166412222278144,
"grad_norm": 0.25861334800720215,
"learning_rate": 8.040568321988213e-05,
"loss": 0.7788,
"step": 19370
},
{
"epoch": 0.2167530659150771,
"grad_norm": 0.26126453280448914,
"learning_rate": 8.017725798346201e-05,
"loss": 0.7631,
"step": 19380
},
{
"epoch": 0.21686490960233978,
"grad_norm": 0.27623289823532104,
"learning_rate": 7.99488327470419e-05,
"loss": 0.7555,
"step": 19390
},
{
"epoch": 0.21697675328960245,
"grad_norm": 0.256489634513855,
"learning_rate": 7.972040751062178e-05,
"loss": 0.7565,
"step": 19400
},
{
"epoch": 0.21708859697686514,
"grad_norm": 0.26825475692749023,
"learning_rate": 7.949198227420165e-05,
"loss": 0.7619,
"step": 19410
},
{
"epoch": 0.2172004406641278,
"grad_norm": 0.2633214294910431,
"learning_rate": 7.926355703778155e-05,
"loss": 0.7576,
"step": 19420
},
{
"epoch": 0.2173122843513905,
"grad_norm": 0.24602185189723969,
"learning_rate": 7.903513180136141e-05,
"loss": 0.748,
"step": 19430
},
{
"epoch": 0.21742412803865319,
"grad_norm": 0.24769659340381622,
"learning_rate": 7.88067065649413e-05,
"loss": 0.749,
"step": 19440
},
{
"epoch": 0.21753597172591585,
"grad_norm": 0.22824670374393463,
"learning_rate": 7.857828132852117e-05,
"loss": 0.7439,
"step": 19450
},
{
"epoch": 0.21764781541317854,
"grad_norm": 0.24848710000514984,
"learning_rate": 7.834985609210106e-05,
"loss": 0.7422,
"step": 19460
},
{
"epoch": 0.21775965910044123,
"grad_norm": 0.25875037908554077,
"learning_rate": 7.812143085568093e-05,
"loss": 0.7411,
"step": 19470
},
{
"epoch": 0.2178715027877039,
"grad_norm": 0.24616488814353943,
"learning_rate": 7.789300561926082e-05,
"loss": 0.723,
"step": 19480
},
{
"epoch": 0.2179833464749666,
"grad_norm": 0.26018476486206055,
"learning_rate": 7.76645803828407e-05,
"loss": 0.7388,
"step": 19490
},
{
"epoch": 0.21809519016222928,
"grad_norm": 0.24355724453926086,
"learning_rate": 7.743615514642058e-05,
"loss": 0.7337,
"step": 19500
},
{
"epoch": 0.21820703384949194,
"grad_norm": 0.24908235669136047,
"learning_rate": 7.720772991000045e-05,
"loss": 0.7378,
"step": 19510
},
{
"epoch": 0.21831887753675464,
"grad_norm": 0.2710162401199341,
"learning_rate": 7.697930467358035e-05,
"loss": 0.7336,
"step": 19520
},
{
"epoch": 0.21843072122401733,
"grad_norm": 0.24222905933856964,
"learning_rate": 7.675087943716022e-05,
"loss": 0.7386,
"step": 19530
},
{
"epoch": 0.21854256491128,
"grad_norm": 0.23762881755828857,
"learning_rate": 7.65224542007401e-05,
"loss": 0.7354,
"step": 19540
},
{
"epoch": 0.21865440859854268,
"grad_norm": 0.25905948877334595,
"learning_rate": 7.629402896431998e-05,
"loss": 0.7453,
"step": 19550
},
{
"epoch": 0.21876625228580535,
"grad_norm": 0.24563716351985931,
"learning_rate": 7.606560372789986e-05,
"loss": 0.7422,
"step": 19560
},
{
"epoch": 0.21887809597306804,
"grad_norm": 0.2649664878845215,
"learning_rate": 7.583717849147973e-05,
"loss": 0.7301,
"step": 19570
},
{
"epoch": 0.21898993966033073,
"grad_norm": 0.24720273911952972,
"learning_rate": 7.560875325505962e-05,
"loss": 0.7321,
"step": 19580
},
{
"epoch": 0.2191017833475934,
"grad_norm": 0.23652884364128113,
"learning_rate": 7.53803280186395e-05,
"loss": 0.7296,
"step": 19590
},
{
"epoch": 0.21921362703485608,
"grad_norm": 0.23715312778949738,
"learning_rate": 7.515190278221938e-05,
"loss": 0.7237,
"step": 19600
},
{
"epoch": 0.21932547072211878,
"grad_norm": 0.2500048577785492,
"learning_rate": 7.492347754579925e-05,
"loss": 0.7372,
"step": 19610
},
{
"epoch": 0.21943731440938144,
"grad_norm": 0.2575337886810303,
"learning_rate": 7.469505230937915e-05,
"loss": 0.7393,
"step": 19620
},
{
"epoch": 0.21954915809664413,
"grad_norm": 0.255375474691391,
"learning_rate": 7.446662707295902e-05,
"loss": 0.75,
"step": 19630
},
{
"epoch": 0.21966100178390682,
"grad_norm": 0.2793714106082916,
"learning_rate": 7.42382018365389e-05,
"loss": 0.7585,
"step": 19640
},
{
"epoch": 0.2197728454711695,
"grad_norm": 0.2588786482810974,
"learning_rate": 7.400977660011878e-05,
"loss": 0.7661,
"step": 19650
},
{
"epoch": 0.21988468915843218,
"grad_norm": 0.27130866050720215,
"learning_rate": 7.378135136369867e-05,
"loss": 0.7579,
"step": 19660
},
{
"epoch": 0.21999653284569487,
"grad_norm": 0.2730309069156647,
"learning_rate": 7.355292612727853e-05,
"loss": 0.7463,
"step": 19670
},
{
"epoch": 0.22010837653295753,
"grad_norm": 0.24330918490886688,
"learning_rate": 7.332450089085842e-05,
"loss": 0.7388,
"step": 19680
},
{
"epoch": 0.22022022022022023,
"grad_norm": 0.30004703998565674,
"learning_rate": 7.309607565443831e-05,
"loss": 0.7633,
"step": 19690
},
{
"epoch": 0.2203320639074829,
"grad_norm": 0.2754705548286438,
"learning_rate": 7.286765041801818e-05,
"loss": 0.7587,
"step": 19700
},
{
"epoch": 0.22044390759474558,
"grad_norm": 0.27601394057273865,
"learning_rate": 7.263922518159807e-05,
"loss": 0.7468,
"step": 19710
},
{
"epoch": 0.22055575128200827,
"grad_norm": 0.2328653633594513,
"learning_rate": 7.241079994517795e-05,
"loss": 0.7432,
"step": 19720
},
{
"epoch": 0.22066759496927094,
"grad_norm": 0.23960436880588531,
"learning_rate": 7.218237470875783e-05,
"loss": 0.7384,
"step": 19730
},
{
"epoch": 0.22077943865653363,
"grad_norm": 0.2687484323978424,
"learning_rate": 7.19539494723377e-05,
"loss": 0.738,
"step": 19740
},
{
"epoch": 0.22089128234379632,
"grad_norm": 0.2243189811706543,
"learning_rate": 7.17255242359176e-05,
"loss": 0.7467,
"step": 19750
},
{
"epoch": 0.22100312603105898,
"grad_norm": 0.26094529032707214,
"learning_rate": 7.149709899949747e-05,
"loss": 0.7579,
"step": 19760
},
{
"epoch": 0.22111496971832167,
"grad_norm": 0.2761390507221222,
"learning_rate": 7.126867376307735e-05,
"loss": 0.7491,
"step": 19770
},
{
"epoch": 0.22122681340558437,
"grad_norm": 0.2523578405380249,
"learning_rate": 7.104024852665723e-05,
"loss": 0.7358,
"step": 19780
},
{
"epoch": 0.22133865709284703,
"grad_norm": 0.25612056255340576,
"learning_rate": 7.081182329023711e-05,
"loss": 0.7322,
"step": 19790
},
{
"epoch": 0.22145050078010972,
"grad_norm": 0.24379362165927887,
"learning_rate": 7.058339805381698e-05,
"loss": 0.7438,
"step": 19800
},
{
"epoch": 0.2215623444673724,
"grad_norm": 0.2315502017736435,
"learning_rate": 7.035497281739687e-05,
"loss": 0.7349,
"step": 19810
},
{
"epoch": 0.22167418815463508,
"grad_norm": 0.41941365599632263,
"learning_rate": 7.012654758097675e-05,
"loss": 0.743,
"step": 19820
},
{
"epoch": 0.22178603184189777,
"grad_norm": 0.23147599399089813,
"learning_rate": 6.989812234455663e-05,
"loss": 0.7381,
"step": 19830
},
{
"epoch": 0.22189787552916043,
"grad_norm": 0.25920864939689636,
"learning_rate": 6.96696971081365e-05,
"loss": 0.7469,
"step": 19840
},
{
"epoch": 0.22200971921642312,
"grad_norm": 0.23870904743671417,
"learning_rate": 6.94412718717164e-05,
"loss": 0.7476,
"step": 19850
},
{
"epoch": 0.22212156290368582,
"grad_norm": 0.2372673749923706,
"learning_rate": 6.921284663529627e-05,
"loss": 0.7468,
"step": 19860
},
{
"epoch": 0.22223340659094848,
"grad_norm": 0.2703365683555603,
"learning_rate": 6.898442139887615e-05,
"loss": 0.742,
"step": 19870
},
{
"epoch": 0.22234525027821117,
"grad_norm": 0.24437329173088074,
"learning_rate": 6.875599616245603e-05,
"loss": 0.7217,
"step": 19880
},
{
"epoch": 0.22245709396547386,
"grad_norm": 0.21680840849876404,
"learning_rate": 6.852757092603592e-05,
"loss": 0.7547,
"step": 19890
},
{
"epoch": 0.22256893765273653,
"grad_norm": 0.29101526737213135,
"learning_rate": 6.829914568961579e-05,
"loss": 0.7389,
"step": 19900
},
{
"epoch": 0.22268078133999922,
"grad_norm": 0.2821531891822815,
"learning_rate": 6.807072045319567e-05,
"loss": 0.731,
"step": 19910
},
{
"epoch": 0.2227926250272619,
"grad_norm": 0.2773050367832184,
"learning_rate": 6.784229521677555e-05,
"loss": 0.7369,
"step": 19920
},
{
"epoch": 0.22290446871452457,
"grad_norm": 0.2531367838382721,
"learning_rate": 6.761386998035543e-05,
"loss": 0.7399,
"step": 19930
},
{
"epoch": 0.22301631240178726,
"grad_norm": 0.28158465027809143,
"learning_rate": 6.73854447439353e-05,
"loss": 0.7523,
"step": 19940
},
{
"epoch": 0.22312815608904996,
"grad_norm": 0.25612935423851013,
"learning_rate": 6.71570195075152e-05,
"loss": 0.7725,
"step": 19950
},
{
"epoch": 0.22323999977631262,
"grad_norm": 0.26996153593063354,
"learning_rate": 6.692859427109507e-05,
"loss": 0.7823,
"step": 19960
},
{
"epoch": 0.2233518434635753,
"grad_norm": 0.28008782863616943,
"learning_rate": 6.670016903467495e-05,
"loss": 0.7679,
"step": 19970
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.27016493678092957,
"learning_rate": 6.647174379825483e-05,
"loss": 0.7617,
"step": 19980
},
{
"epoch": 0.22357553083810067,
"grad_norm": 0.2679850459098816,
"learning_rate": 6.624331856183472e-05,
"loss": 0.7737,
"step": 19990
},
{
"epoch": 0.22368737452536336,
"grad_norm": 0.2570480406284332,
"learning_rate": 6.601489332541459e-05,
"loss": 0.758,
"step": 20000
},
{
"epoch": 0.22379921821262602,
"grad_norm": 0.2503785490989685,
"learning_rate": 6.578646808899447e-05,
"loss": 0.761,
"step": 20010
},
{
"epoch": 0.2239110618998887,
"grad_norm": 0.2648092210292816,
"learning_rate": 6.555804285257435e-05,
"loss": 0.7532,
"step": 20020
},
{
"epoch": 0.2240229055871514,
"grad_norm": 0.26829221844673157,
"learning_rate": 6.532961761615423e-05,
"loss": 0.7542,
"step": 20030
},
{
"epoch": 0.22413474927441407,
"grad_norm": 0.27535539865493774,
"learning_rate": 6.51011923797341e-05,
"loss": 0.7578,
"step": 20040
},
{
"epoch": 0.22424659296167676,
"grad_norm": 0.28674209117889404,
"learning_rate": 6.4872767143314e-05,
"loss": 0.756,
"step": 20050
},
{
"epoch": 0.22435843664893945,
"grad_norm": 0.2523026466369629,
"learning_rate": 6.464434190689387e-05,
"loss": 0.7514,
"step": 20060
},
{
"epoch": 0.22447028033620212,
"grad_norm": 0.24213305115699768,
"learning_rate": 6.441591667047375e-05,
"loss": 0.7546,
"step": 20070
},
{
"epoch": 0.2245821240234648,
"grad_norm": 0.2779023349285126,
"learning_rate": 6.418749143405363e-05,
"loss": 0.7654,
"step": 20080
},
{
"epoch": 0.2246939677107275,
"grad_norm": 0.28806111216545105,
"learning_rate": 6.395906619763352e-05,
"loss": 0.7612,
"step": 20090
},
{
"epoch": 0.22480581139799016,
"grad_norm": 0.2637580931186676,
"learning_rate": 6.373064096121339e-05,
"loss": 0.7659,
"step": 20100
},
{
"epoch": 0.22491765508525285,
"grad_norm": 0.2683275043964386,
"learning_rate": 6.350221572479328e-05,
"loss": 0.753,
"step": 20110
},
{
"epoch": 0.22502949877251555,
"grad_norm": 0.2693597078323364,
"learning_rate": 6.327379048837315e-05,
"loss": 0.7697,
"step": 20120
},
{
"epoch": 0.2251413424597782,
"grad_norm": 0.26335635781288147,
"learning_rate": 6.304536525195304e-05,
"loss": 0.7644,
"step": 20130
},
{
"epoch": 0.2252531861470409,
"grad_norm": 0.29237446188926697,
"learning_rate": 6.28169400155329e-05,
"loss": 0.7721,
"step": 20140
},
{
"epoch": 0.22536502983430357,
"grad_norm": 0.3080182373523712,
"learning_rate": 6.25885147791128e-05,
"loss": 0.7666,
"step": 20150
},
{
"epoch": 0.22547687352156626,
"grad_norm": 0.2831542193889618,
"learning_rate": 6.236008954269268e-05,
"loss": 0.7805,
"step": 20160
},
{
"epoch": 0.22558871720882895,
"grad_norm": 0.2860835790634155,
"learning_rate": 6.213166430627257e-05,
"loss": 0.7816,
"step": 20170
},
{
"epoch": 0.2257005608960916,
"grad_norm": 0.28273066878318787,
"learning_rate": 6.190323906985244e-05,
"loss": 0.7812,
"step": 20180
},
{
"epoch": 0.2258124045833543,
"grad_norm": 0.29203614592552185,
"learning_rate": 6.167481383343232e-05,
"loss": 0.7699,
"step": 20190
},
{
"epoch": 0.225924248270617,
"grad_norm": 0.2811570167541504,
"learning_rate": 6.14463885970122e-05,
"loss": 0.7833,
"step": 20200
},
{
"epoch": 0.22603609195787966,
"grad_norm": 0.30047500133514404,
"learning_rate": 6.121796336059208e-05,
"loss": 0.7594,
"step": 20210
},
{
"epoch": 0.22614793564514235,
"grad_norm": 0.2838903069496155,
"learning_rate": 6.098953812417196e-05,
"loss": 0.7678,
"step": 20220
},
{
"epoch": 0.22625977933240504,
"grad_norm": 0.2840651273727417,
"learning_rate": 6.0761112887751836e-05,
"loss": 0.7546,
"step": 20230
},
{
"epoch": 0.2263716230196677,
"grad_norm": 0.31575652956962585,
"learning_rate": 6.053268765133172e-05,
"loss": 0.7533,
"step": 20240
},
{
"epoch": 0.2264834667069304,
"grad_norm": 0.2692145109176636,
"learning_rate": 6.03042624149116e-05,
"loss": 0.744,
"step": 20250
},
{
"epoch": 0.2265953103941931,
"grad_norm": 0.3094116449356079,
"learning_rate": 6.007583717849148e-05,
"loss": 0.7708,
"step": 20260
},
{
"epoch": 0.22670715408145575,
"grad_norm": 0.3123047947883606,
"learning_rate": 5.984741194207136e-05,
"loss": 0.7431,
"step": 20270
},
{
"epoch": 0.22681899776871844,
"grad_norm": 0.2733646631240845,
"learning_rate": 5.961898670565124e-05,
"loss": 0.762,
"step": 20280
},
{
"epoch": 0.2269308414559811,
"grad_norm": 0.23944342136383057,
"learning_rate": 5.939056146923112e-05,
"loss": 0.7488,
"step": 20290
},
{
"epoch": 0.2270426851432438,
"grad_norm": 0.2459600865840912,
"learning_rate": 5.9162136232811e-05,
"loss": 0.7443,
"step": 20300
},
{
"epoch": 0.2271545288305065,
"grad_norm": 0.2502724826335907,
"learning_rate": 5.893371099639088e-05,
"loss": 0.7417,
"step": 20310
},
{
"epoch": 0.22726637251776916,
"grad_norm": 0.23721522092819214,
"learning_rate": 5.870528575997076e-05,
"loss": 0.7393,
"step": 20320
},
{
"epoch": 0.22737821620503185,
"grad_norm": 0.2526785135269165,
"learning_rate": 5.847686052355064e-05,
"loss": 0.7346,
"step": 20330
},
{
"epoch": 0.22749005989229454,
"grad_norm": 0.2573647201061249,
"learning_rate": 5.824843528713052e-05,
"loss": 0.7192,
"step": 20340
},
{
"epoch": 0.2276019035795572,
"grad_norm": 0.2632768750190735,
"learning_rate": 5.80200100507104e-05,
"loss": 0.7234,
"step": 20350
},
{
"epoch": 0.2277137472668199,
"grad_norm": 0.2589345872402191,
"learning_rate": 5.779158481429028e-05,
"loss": 0.7165,
"step": 20360
},
{
"epoch": 0.22782559095408259,
"grad_norm": 0.2480648308992386,
"learning_rate": 5.756315957787016e-05,
"loss": 0.7099,
"step": 20370
},
{
"epoch": 0.22793743464134525,
"grad_norm": 0.24949654936790466,
"learning_rate": 5.733473434145004e-05,
"loss": 0.7187,
"step": 20380
},
{
"epoch": 0.22804927832860794,
"grad_norm": 0.25637611746788025,
"learning_rate": 5.710630910502993e-05,
"loss": 0.7098,
"step": 20390
},
{
"epoch": 0.22816112201587063,
"grad_norm": 0.28809231519699097,
"learning_rate": 5.687788386860981e-05,
"loss": 0.7315,
"step": 20400
},
{
"epoch": 0.2282729657031333,
"grad_norm": 0.25564566254615784,
"learning_rate": 5.6649458632189686e-05,
"loss": 0.7319,
"step": 20410
},
{
"epoch": 0.228384809390396,
"grad_norm": 0.2693794369697571,
"learning_rate": 5.642103339576957e-05,
"loss": 0.7173,
"step": 20420
},
{
"epoch": 0.22849665307765865,
"grad_norm": 0.24680989980697632,
"learning_rate": 5.619260815934945e-05,
"loss": 0.708,
"step": 20430
},
{
"epoch": 0.22860849676492134,
"grad_norm": 0.2790026068687439,
"learning_rate": 5.596418292292933e-05,
"loss": 0.7023,
"step": 20440
},
{
"epoch": 0.22872034045218403,
"grad_norm": 0.2656199038028717,
"learning_rate": 5.573575768650921e-05,
"loss": 0.7113,
"step": 20450
},
{
"epoch": 0.2288321841394467,
"grad_norm": 0.30832743644714355,
"learning_rate": 5.550733245008909e-05,
"loss": 0.7161,
"step": 20460
},
{
"epoch": 0.2289440278267094,
"grad_norm": 0.27060794830322266,
"learning_rate": 5.527890721366897e-05,
"loss": 0.7208,
"step": 20470
},
{
"epoch": 0.22905587151397208,
"grad_norm": 0.26036307215690613,
"learning_rate": 5.505048197724885e-05,
"loss": 0.7004,
"step": 20480
},
{
"epoch": 0.22916771520123475,
"grad_norm": 0.2758086919784546,
"learning_rate": 5.482205674082873e-05,
"loss": 0.7179,
"step": 20490
},
{
"epoch": 0.22927955888849744,
"grad_norm": 0.2821243107318878,
"learning_rate": 5.459363150440861e-05,
"loss": 0.7255,
"step": 20500
},
{
"epoch": 0.22939140257576013,
"grad_norm": 0.2782810628414154,
"learning_rate": 5.436520626798849e-05,
"loss": 0.7149,
"step": 20510
},
{
"epoch": 0.2295032462630228,
"grad_norm": 0.2755940854549408,
"learning_rate": 5.413678103156837e-05,
"loss": 0.7117,
"step": 20520
},
{
"epoch": 0.22961508995028548,
"grad_norm": 0.29176777601242065,
"learning_rate": 5.390835579514825e-05,
"loss": 0.7188,
"step": 20530
},
{
"epoch": 0.22972693363754818,
"grad_norm": 0.27739444375038147,
"learning_rate": 5.367993055872813e-05,
"loss": 0.7196,
"step": 20540
},
{
"epoch": 0.22983877732481084,
"grad_norm": 0.27187204360961914,
"learning_rate": 5.345150532230801e-05,
"loss": 0.722,
"step": 20550
},
{
"epoch": 0.22995062101207353,
"grad_norm": 0.2951996624469757,
"learning_rate": 5.322308008588789e-05,
"loss": 0.7325,
"step": 20560
},
{
"epoch": 0.2300624646993362,
"grad_norm": 0.2677932381629944,
"learning_rate": 5.299465484946777e-05,
"loss": 0.7263,
"step": 20570
},
{
"epoch": 0.23017430838659889,
"grad_norm": 0.29231807589530945,
"learning_rate": 5.2766229613047654e-05,
"loss": 0.7284,
"step": 20580
},
{
"epoch": 0.23028615207386158,
"grad_norm": 0.30211326479911804,
"learning_rate": 5.253780437662753e-05,
"loss": 0.7222,
"step": 20590
},
{
"epoch": 0.23039799576112424,
"grad_norm": 0.29821720719337463,
"learning_rate": 5.230937914020741e-05,
"loss": 0.7316,
"step": 20600
},
{
"epoch": 0.23050983944838693,
"grad_norm": 0.3019379675388336,
"learning_rate": 5.208095390378729e-05,
"loss": 0.7328,
"step": 20610
},
{
"epoch": 0.23062168313564962,
"grad_norm": 0.2569403052330017,
"learning_rate": 5.185252866736717e-05,
"loss": 0.7215,
"step": 20620
},
{
"epoch": 0.2307335268229123,
"grad_norm": 0.3151782155036926,
"learning_rate": 5.1624103430947054e-05,
"loss": 0.7326,
"step": 20630
},
{
"epoch": 0.23084537051017498,
"grad_norm": 0.2748591899871826,
"learning_rate": 5.139567819452693e-05,
"loss": 0.7359,
"step": 20640
},
{
"epoch": 0.23095721419743767,
"grad_norm": 0.27494433522224426,
"learning_rate": 5.116725295810681e-05,
"loss": 0.7351,
"step": 20650
},
{
"epoch": 0.23106905788470034,
"grad_norm": 0.29428452253341675,
"learning_rate": 5.093882772168669e-05,
"loss": 0.7361,
"step": 20660
},
{
"epoch": 0.23118090157196303,
"grad_norm": 0.2924981117248535,
"learning_rate": 5.071040248526657e-05,
"loss": 0.7539,
"step": 20670
},
{
"epoch": 0.23129274525922572,
"grad_norm": 0.28647035360336304,
"learning_rate": 5.0481977248846455e-05,
"loss": 0.7576,
"step": 20680
},
{
"epoch": 0.23140458894648838,
"grad_norm": 0.3107542097568512,
"learning_rate": 5.025355201242633e-05,
"loss": 0.7615,
"step": 20690
},
{
"epoch": 0.23151643263375107,
"grad_norm": 0.27186501026153564,
"learning_rate": 5.0025126776006213e-05,
"loss": 0.7641,
"step": 20700
},
{
"epoch": 0.23162827632101374,
"grad_norm": 0.2838156819343567,
"learning_rate": 4.9796701539586096e-05,
"loss": 0.7695,
"step": 20710
},
{
"epoch": 0.23174012000827643,
"grad_norm": 0.3377101421356201,
"learning_rate": 4.956827630316597e-05,
"loss": 0.7696,
"step": 20720
},
{
"epoch": 0.23185196369553912,
"grad_norm": 0.3177778422832489,
"learning_rate": 4.9339851066745855e-05,
"loss": 0.7677,
"step": 20730
},
{
"epoch": 0.23196380738280178,
"grad_norm": 0.3157583773136139,
"learning_rate": 4.911142583032573e-05,
"loss": 0.7653,
"step": 20740
},
{
"epoch": 0.23207565107006448,
"grad_norm": 0.3123907148838043,
"learning_rate": 4.8883000593905614e-05,
"loss": 0.7677,
"step": 20750
},
{
"epoch": 0.23218749475732717,
"grad_norm": 0.30460426211357117,
"learning_rate": 4.86545753574855e-05,
"loss": 0.7743,
"step": 20760
},
{
"epoch": 0.23229933844458983,
"grad_norm": 0.27507251501083374,
"learning_rate": 4.842615012106537e-05,
"loss": 0.767,
"step": 20770
},
{
"epoch": 0.23241118213185252,
"grad_norm": 0.3233499228954315,
"learning_rate": 4.8197724884645256e-05,
"loss": 0.7717,
"step": 20780
},
{
"epoch": 0.23252302581911521,
"grad_norm": 0.30144819617271423,
"learning_rate": 4.796929964822513e-05,
"loss": 0.7609,
"step": 20790
},
{
"epoch": 0.23263486950637788,
"grad_norm": 0.29588454961776733,
"learning_rate": 4.7740874411805014e-05,
"loss": 0.7682,
"step": 20800
},
{
"epoch": 0.23274671319364057,
"grad_norm": 0.3111203610897064,
"learning_rate": 4.75124491753849e-05,
"loss": 0.7652,
"step": 20810
},
{
"epoch": 0.23285855688090326,
"grad_norm": 0.28917646408081055,
"learning_rate": 4.728402393896477e-05,
"loss": 0.7584,
"step": 20820
},
{
"epoch": 0.23297040056816593,
"grad_norm": 0.3156343698501587,
"learning_rate": 4.7055598702544656e-05,
"loss": 0.7643,
"step": 20830
},
{
"epoch": 0.23308224425542862,
"grad_norm": 0.2909680902957916,
"learning_rate": 4.682717346612454e-05,
"loss": 0.7613,
"step": 20840
},
{
"epoch": 0.2331940879426913,
"grad_norm": 0.3006870746612549,
"learning_rate": 4.659874822970442e-05,
"loss": 0.7603,
"step": 20850
},
{
"epoch": 0.23330593162995397,
"grad_norm": 0.2844945192337036,
"learning_rate": 4.6370322993284304e-05,
"loss": 0.7589,
"step": 20860
},
{
"epoch": 0.23341777531721666,
"grad_norm": 0.26857924461364746,
"learning_rate": 4.614189775686418e-05,
"loss": 0.7401,
"step": 20870
},
{
"epoch": 0.23352961900447933,
"grad_norm": 0.31332314014434814,
"learning_rate": 4.591347252044406e-05,
"loss": 0.7468,
"step": 20880
},
{
"epoch": 0.23364146269174202,
"grad_norm": 0.28083765506744385,
"learning_rate": 4.568504728402394e-05,
"loss": 0.7451,
"step": 20890
},
{
"epoch": 0.2337533063790047,
"grad_norm": 0.29185009002685547,
"learning_rate": 4.545662204760382e-05,
"loss": 0.7478,
"step": 20900
},
{
"epoch": 0.23386515006626737,
"grad_norm": 0.30532801151275635,
"learning_rate": 4.5228196811183705e-05,
"loss": 0.7404,
"step": 20910
},
{
"epoch": 0.23397699375353007,
"grad_norm": 0.2724134922027588,
"learning_rate": 4.499977157476358e-05,
"loss": 0.732,
"step": 20920
},
{
"epoch": 0.23408883744079276,
"grad_norm": 0.29753822088241577,
"learning_rate": 4.4771346338343464e-05,
"loss": 0.7236,
"step": 20930
},
{
"epoch": 0.23420068112805542,
"grad_norm": 0.31980055570602417,
"learning_rate": 4.454292110192334e-05,
"loss": 0.7407,
"step": 20940
},
{
"epoch": 0.2343125248153181,
"grad_norm": 0.29578351974487305,
"learning_rate": 4.431449586550322e-05,
"loss": 0.7166,
"step": 20950
},
{
"epoch": 0.2344243685025808,
"grad_norm": 0.25261184573173523,
"learning_rate": 4.4086070629083105e-05,
"loss": 0.7195,
"step": 20960
},
{
"epoch": 0.23453621218984347,
"grad_norm": 0.2669534385204315,
"learning_rate": 4.385764539266298e-05,
"loss": 0.7224,
"step": 20970
},
{
"epoch": 0.23464805587710616,
"grad_norm": 0.2817215919494629,
"learning_rate": 4.3629220156242864e-05,
"loss": 0.7405,
"step": 20980
},
{
"epoch": 0.23475989956436885,
"grad_norm": 0.27033400535583496,
"learning_rate": 4.340079491982275e-05,
"loss": 0.7292,
"step": 20990
},
{
"epoch": 0.23487174325163152,
"grad_norm": 0.3083013594150543,
"learning_rate": 4.317236968340262e-05,
"loss": 0.7271,
"step": 21000
},
{
"epoch": 0.2349835869388942,
"grad_norm": 0.27074989676475525,
"learning_rate": 4.2943944446982506e-05,
"loss": 0.7346,
"step": 21010
},
{
"epoch": 0.23509543062615687,
"grad_norm": 0.31609755754470825,
"learning_rate": 4.271551921056238e-05,
"loss": 0.7285,
"step": 21020
},
{
"epoch": 0.23520727431341956,
"grad_norm": 0.27084672451019287,
"learning_rate": 4.2487093974142265e-05,
"loss": 0.7411,
"step": 21030
},
{
"epoch": 0.23531911800068225,
"grad_norm": 0.26669842004776,
"learning_rate": 4.225866873772215e-05,
"loss": 0.7423,
"step": 21040
},
{
"epoch": 0.23543096168794492,
"grad_norm": 0.2873358428478241,
"learning_rate": 4.2030243501302024e-05,
"loss": 0.7345,
"step": 21050
},
{
"epoch": 0.2355428053752076,
"grad_norm": 0.2831687033176422,
"learning_rate": 4.1801818264881906e-05,
"loss": 0.7537,
"step": 21060
},
{
"epoch": 0.2356546490624703,
"grad_norm": 0.2781788110733032,
"learning_rate": 4.157339302846178e-05,
"loss": 0.7494,
"step": 21070
},
{
"epoch": 0.23576649274973296,
"grad_norm": 0.27109071612358093,
"learning_rate": 4.1344967792041665e-05,
"loss": 0.7493,
"step": 21080
},
{
"epoch": 0.23587833643699566,
"grad_norm": 0.25398164987564087,
"learning_rate": 4.111654255562155e-05,
"loss": 0.7369,
"step": 21090
},
{
"epoch": 0.23599018012425835,
"grad_norm": 0.3150353729724884,
"learning_rate": 4.0888117319201424e-05,
"loss": 0.754,
"step": 21100
},
{
"epoch": 0.236102023811521,
"grad_norm": 0.27384257316589355,
"learning_rate": 4.065969208278131e-05,
"loss": 0.7439,
"step": 21110
},
{
"epoch": 0.2362138674987837,
"grad_norm": 0.2770559787750244,
"learning_rate": 4.043126684636118e-05,
"loss": 0.7391,
"step": 21120
},
{
"epoch": 0.2363257111860464,
"grad_norm": 0.29367002844810486,
"learning_rate": 4.0202841609941066e-05,
"loss": 0.746,
"step": 21130
},
{
"epoch": 0.23643755487330906,
"grad_norm": 0.2554051876068115,
"learning_rate": 3.997441637352095e-05,
"loss": 0.7386,
"step": 21140
},
{
"epoch": 0.23654939856057175,
"grad_norm": 0.2943428158760071,
"learning_rate": 3.9745991137100825e-05,
"loss": 0.7437,
"step": 21150
},
{
"epoch": 0.2366612422478344,
"grad_norm": 0.24465301632881165,
"learning_rate": 3.951756590068071e-05,
"loss": 0.7331,
"step": 21160
},
{
"epoch": 0.2367730859350971,
"grad_norm": 0.2545934021472931,
"learning_rate": 3.9289140664260584e-05,
"loss": 0.7361,
"step": 21170
},
{
"epoch": 0.2368849296223598,
"grad_norm": 0.2792121469974518,
"learning_rate": 3.9060715427840466e-05,
"loss": 0.7238,
"step": 21180
},
{
"epoch": 0.23699677330962246,
"grad_norm": 0.27943745255470276,
"learning_rate": 3.883229019142035e-05,
"loss": 0.726,
"step": 21190
},
{
"epoch": 0.23710861699688515,
"grad_norm": 0.2514471411705017,
"learning_rate": 3.8603864955000225e-05,
"loss": 0.7214,
"step": 21200
},
{
"epoch": 0.23722046068414784,
"grad_norm": 0.2698551416397095,
"learning_rate": 3.837543971858011e-05,
"loss": 0.7318,
"step": 21210
},
{
"epoch": 0.2373323043714105,
"grad_norm": 0.29603877663612366,
"learning_rate": 3.814701448215999e-05,
"loss": 0.742,
"step": 21220
},
{
"epoch": 0.2374441480586732,
"grad_norm": 0.26655495166778564,
"learning_rate": 3.791858924573987e-05,
"loss": 0.7331,
"step": 21230
},
{
"epoch": 0.2375559917459359,
"grad_norm": 0.29367104172706604,
"learning_rate": 3.769016400931975e-05,
"loss": 0.7233,
"step": 21240
},
{
"epoch": 0.23766783543319855,
"grad_norm": 0.2680334746837616,
"learning_rate": 3.7461738772899626e-05,
"loss": 0.732,
"step": 21250
},
{
"epoch": 0.23777967912046125,
"grad_norm": 0.2748298943042755,
"learning_rate": 3.723331353647951e-05,
"loss": 0.7453,
"step": 21260
},
{
"epoch": 0.23789152280772394,
"grad_norm": 0.28276947140693665,
"learning_rate": 3.700488830005939e-05,
"loss": 0.7524,
"step": 21270
},
{
"epoch": 0.2380033664949866,
"grad_norm": 0.2645372450351715,
"learning_rate": 3.677646306363927e-05,
"loss": 0.7542,
"step": 21280
},
{
"epoch": 0.2381152101822493,
"grad_norm": 0.2866505980491638,
"learning_rate": 3.654803782721916e-05,
"loss": 0.7447,
"step": 21290
},
{
"epoch": 0.23822705386951196,
"grad_norm": 0.29611489176750183,
"learning_rate": 3.631961259079903e-05,
"loss": 0.7662,
"step": 21300
},
{
"epoch": 0.23833889755677465,
"grad_norm": 0.29184749722480774,
"learning_rate": 3.6091187354378916e-05,
"loss": 0.7558,
"step": 21310
},
{
"epoch": 0.23845074124403734,
"grad_norm": 0.27304571866989136,
"learning_rate": 3.58627621179588e-05,
"loss": 0.7578,
"step": 21320
},
{
"epoch": 0.2385625849313,
"grad_norm": 0.2700962424278259,
"learning_rate": 3.5634336881538675e-05,
"loss": 0.7411,
"step": 21330
},
{
"epoch": 0.2386744286185627,
"grad_norm": 0.2845793664455414,
"learning_rate": 3.540591164511856e-05,
"loss": 0.7392,
"step": 21340
},
{
"epoch": 0.2387862723058254,
"grad_norm": 0.32136180996894836,
"learning_rate": 3.5177486408698433e-05,
"loss": 0.7431,
"step": 21350
},
{
"epoch": 0.23889811599308805,
"grad_norm": 0.26846998929977417,
"learning_rate": 3.4949061172278316e-05,
"loss": 0.737,
"step": 21360
},
{
"epoch": 0.23900995968035074,
"grad_norm": 0.26363828778266907,
"learning_rate": 3.47206359358582e-05,
"loss": 0.7416,
"step": 21370
},
{
"epoch": 0.23912180336761343,
"grad_norm": 0.2900106906890869,
"learning_rate": 3.4492210699438075e-05,
"loss": 0.7373,
"step": 21380
},
{
"epoch": 0.2392336470548761,
"grad_norm": 0.2762589156627655,
"learning_rate": 3.426378546301796e-05,
"loss": 0.7379,
"step": 21390
},
{
"epoch": 0.2393454907421388,
"grad_norm": 0.2697104513645172,
"learning_rate": 3.4035360226597834e-05,
"loss": 0.7448,
"step": 21400
},
{
"epoch": 0.23945733442940148,
"grad_norm": 0.2901761829853058,
"learning_rate": 3.380693499017772e-05,
"loss": 0.7394,
"step": 21410
},
{
"epoch": 0.23956917811666414,
"grad_norm": 0.245674267411232,
"learning_rate": 3.35785097537576e-05,
"loss": 0.7387,
"step": 21420
},
{
"epoch": 0.23968102180392684,
"grad_norm": 0.2713403105735779,
"learning_rate": 3.3350084517337476e-05,
"loss": 0.7604,
"step": 21430
},
{
"epoch": 0.2397928654911895,
"grad_norm": 0.27368244528770447,
"learning_rate": 3.312165928091736e-05,
"loss": 0.7489,
"step": 21440
},
{
"epoch": 0.2399047091784522,
"grad_norm": 0.3079991340637207,
"learning_rate": 3.2893234044497234e-05,
"loss": 0.7653,
"step": 21450
},
{
"epoch": 0.24001655286571488,
"grad_norm": 0.2920658588409424,
"learning_rate": 3.266480880807712e-05,
"loss": 0.7588,
"step": 21460
},
{
"epoch": 0.24012839655297755,
"grad_norm": 0.27589842677116394,
"learning_rate": 3.2436383571657e-05,
"loss": 0.7607,
"step": 21470
},
{
"epoch": 0.24024024024024024,
"grad_norm": 0.2592112720012665,
"learning_rate": 3.2207958335236876e-05,
"loss": 0.745,
"step": 21480
},
{
"epoch": 0.24035208392750293,
"grad_norm": 0.27625855803489685,
"learning_rate": 3.197953309881676e-05,
"loss": 0.7488,
"step": 21490
},
{
"epoch": 0.2404639276147656,
"grad_norm": 0.2769569456577301,
"learning_rate": 3.175110786239664e-05,
"loss": 0.7326,
"step": 21500
},
{
"epoch": 0.24057577130202829,
"grad_norm": 0.2705914080142975,
"learning_rate": 3.152268262597652e-05,
"loss": 0.7512,
"step": 21510
},
{
"epoch": 0.24068761498929098,
"grad_norm": 0.2655676603317261,
"learning_rate": 3.12942573895564e-05,
"loss": 0.7366,
"step": 21520
},
{
"epoch": 0.24079945867655364,
"grad_norm": 0.2606657147407532,
"learning_rate": 3.106583215313628e-05,
"loss": 0.7436,
"step": 21530
},
{
"epoch": 0.24091130236381633,
"grad_norm": 0.27843552827835083,
"learning_rate": 3.083740691671616e-05,
"loss": 0.7342,
"step": 21540
},
{
"epoch": 0.24102314605107902,
"grad_norm": 0.27866050601005554,
"learning_rate": 3.060898168029604e-05,
"loss": 0.7305,
"step": 21550
},
{
"epoch": 0.2411349897383417,
"grad_norm": 0.2803070545196533,
"learning_rate": 3.0380556443875918e-05,
"loss": 0.727,
"step": 21560
},
{
"epoch": 0.24124683342560438,
"grad_norm": 0.27220121026039124,
"learning_rate": 3.01521312074558e-05,
"loss": 0.7195,
"step": 21570
},
{
"epoch": 0.24135867711286707,
"grad_norm": 0.26060426235198975,
"learning_rate": 2.992370597103568e-05,
"loss": 0.7013,
"step": 21580
},
{
"epoch": 0.24147052080012973,
"grad_norm": 0.24253526329994202,
"learning_rate": 2.969528073461556e-05,
"loss": 0.6925,
"step": 21590
},
{
"epoch": 0.24158236448739243,
"grad_norm": 0.26293566823005676,
"learning_rate": 2.946685549819544e-05,
"loss": 0.7028,
"step": 21600
},
{
"epoch": 0.2416942081746551,
"grad_norm": 0.26427412033081055,
"learning_rate": 2.923843026177532e-05,
"loss": 0.6993,
"step": 21610
},
{
"epoch": 0.24180605186191778,
"grad_norm": 0.26823869347572327,
"learning_rate": 2.90100050253552e-05,
"loss": 0.6999,
"step": 21620
},
{
"epoch": 0.24191789554918047,
"grad_norm": 0.24203690886497498,
"learning_rate": 2.878157978893508e-05,
"loss": 0.6906,
"step": 21630
},
{
"epoch": 0.24202973923644314,
"grad_norm": 0.2612786889076233,
"learning_rate": 2.8553154552514964e-05,
"loss": 0.6952,
"step": 21640
},
{
"epoch": 0.24214158292370583,
"grad_norm": 0.27152737975120544,
"learning_rate": 2.8324729316094843e-05,
"loss": 0.692,
"step": 21650
},
{
"epoch": 0.24225342661096852,
"grad_norm": 0.2592925727367401,
"learning_rate": 2.8096304079674726e-05,
"loss": 0.6995,
"step": 21660
},
{
"epoch": 0.24236527029823118,
"grad_norm": 0.2419063299894333,
"learning_rate": 2.7867878843254605e-05,
"loss": 0.7067,
"step": 21670
},
{
"epoch": 0.24247711398549388,
"grad_norm": 0.24731135368347168,
"learning_rate": 2.7639453606834485e-05,
"loss": 0.734,
"step": 21680
},
{
"epoch": 0.24258895767275657,
"grad_norm": 0.25746017694473267,
"learning_rate": 2.7411028370414364e-05,
"loss": 0.7075,
"step": 21690
},
{
"epoch": 0.24270080136001923,
"grad_norm": 0.2521972060203552,
"learning_rate": 2.7182603133994244e-05,
"loss": 0.7137,
"step": 21700
},
{
"epoch": 0.24281264504728192,
"grad_norm": 0.26796218752861023,
"learning_rate": 2.6954177897574127e-05,
"loss": 0.7227,
"step": 21710
},
{
"epoch": 0.2429244887345446,
"grad_norm": 0.30404597520828247,
"learning_rate": 2.6725752661154006e-05,
"loss": 0.7243,
"step": 21720
},
{
"epoch": 0.24303633242180728,
"grad_norm": 0.29561156034469604,
"learning_rate": 2.6497327424733885e-05,
"loss": 0.7357,
"step": 21730
},
{
"epoch": 0.24314817610906997,
"grad_norm": 0.28066596388816833,
"learning_rate": 2.6268902188313765e-05,
"loss": 0.7224,
"step": 21740
},
{
"epoch": 0.24326001979633263,
"grad_norm": 0.29235216975212097,
"learning_rate": 2.6040476951893644e-05,
"loss": 0.7288,
"step": 21750
},
{
"epoch": 0.24337186348359532,
"grad_norm": 0.26750460267066956,
"learning_rate": 2.5812051715473527e-05,
"loss": 0.7414,
"step": 21760
},
{
"epoch": 0.24348370717085802,
"grad_norm": 0.2707473039627075,
"learning_rate": 2.5583626479053406e-05,
"loss": 0.7478,
"step": 21770
},
{
"epoch": 0.24359555085812068,
"grad_norm": 0.26526397466659546,
"learning_rate": 2.5355201242633286e-05,
"loss": 0.7513,
"step": 21780
},
{
"epoch": 0.24370739454538337,
"grad_norm": 0.2362915724515915,
"learning_rate": 2.5126776006213165e-05,
"loss": 0.7507,
"step": 21790
},
{
"epoch": 0.24381923823264606,
"grad_norm": 0.2512950599193573,
"learning_rate": 2.4898350769793048e-05,
"loss": 0.7417,
"step": 21800
},
{
"epoch": 0.24393108191990873,
"grad_norm": 0.2366458922624588,
"learning_rate": 2.4669925533372928e-05,
"loss": 0.7402,
"step": 21810
},
{
"epoch": 0.24404292560717142,
"grad_norm": 0.24888353049755096,
"learning_rate": 2.4441500296952807e-05,
"loss": 0.7456,
"step": 21820
},
{
"epoch": 0.2441547692944341,
"grad_norm": 0.24143491685390472,
"learning_rate": 2.4213075060532686e-05,
"loss": 0.7405,
"step": 21830
},
{
"epoch": 0.24426661298169677,
"grad_norm": 0.2669823169708252,
"learning_rate": 2.3984649824112566e-05,
"loss": 0.7544,
"step": 21840
},
{
"epoch": 0.24437845666895947,
"grad_norm": 0.24328452348709106,
"learning_rate": 2.375622458769245e-05,
"loss": 0.7347,
"step": 21850
},
{
"epoch": 0.24449030035622216,
"grad_norm": 0.26204219460487366,
"learning_rate": 2.3527799351272328e-05,
"loss": 0.7397,
"step": 21860
},
{
"epoch": 0.24460214404348482,
"grad_norm": 0.2631550431251526,
"learning_rate": 2.329937411485221e-05,
"loss": 0.7413,
"step": 21870
},
{
"epoch": 0.2447139877307475,
"grad_norm": 0.2729988694190979,
"learning_rate": 2.307094887843209e-05,
"loss": 0.7336,
"step": 21880
},
{
"epoch": 0.24482583141801018,
"grad_norm": 0.2702917754650116,
"learning_rate": 2.284252364201197e-05,
"loss": 0.7294,
"step": 21890
},
{
"epoch": 0.24493767510527287,
"grad_norm": 0.22882196307182312,
"learning_rate": 2.2614098405591852e-05,
"loss": 0.7164,
"step": 21900
},
{
"epoch": 0.24504951879253556,
"grad_norm": 0.2660382390022278,
"learning_rate": 2.2385673169171732e-05,
"loss": 0.7231,
"step": 21910
},
{
"epoch": 0.24516136247979822,
"grad_norm": 0.2580036222934723,
"learning_rate": 2.215724793275161e-05,
"loss": 0.7243,
"step": 21920
},
{
"epoch": 0.24527320616706091,
"grad_norm": 0.25490158796310425,
"learning_rate": 2.192882269633149e-05,
"loss": 0.7129,
"step": 21930
},
{
"epoch": 0.2453850498543236,
"grad_norm": 0.2626509368419647,
"learning_rate": 2.1700397459911374e-05,
"loss": 0.7177,
"step": 21940
},
{
"epoch": 0.24549689354158627,
"grad_norm": 0.2642146646976471,
"learning_rate": 2.1471972223491253e-05,
"loss": 0.7119,
"step": 21950
},
{
"epoch": 0.24560873722884896,
"grad_norm": 0.2683079242706299,
"learning_rate": 2.1243546987071132e-05,
"loss": 0.7226,
"step": 21960
},
{
"epoch": 0.24572058091611165,
"grad_norm": 0.26513761281967163,
"learning_rate": 2.1015121750651012e-05,
"loss": 0.7276,
"step": 21970
},
{
"epoch": 0.24583242460337432,
"grad_norm": 0.25856319069862366,
"learning_rate": 2.078669651423089e-05,
"loss": 0.7168,
"step": 21980
},
{
"epoch": 0.245944268290637,
"grad_norm": 0.29048866033554077,
"learning_rate": 2.0558271277810774e-05,
"loss": 0.7189,
"step": 21990
},
{
"epoch": 0.2460561119778997,
"grad_norm": 0.2775687575340271,
"learning_rate": 2.0329846041390653e-05,
"loss": 0.7276,
"step": 22000
},
{
"epoch": 0.24616795566516236,
"grad_norm": 0.30157843232154846,
"learning_rate": 2.0101420804970533e-05,
"loss": 0.7435,
"step": 22010
},
{
"epoch": 0.24627979935242506,
"grad_norm": 0.2602044939994812,
"learning_rate": 1.9872995568550412e-05,
"loss": 0.7365,
"step": 22020
},
{
"epoch": 0.24639164303968772,
"grad_norm": 0.29975757002830505,
"learning_rate": 1.9644570332130292e-05,
"loss": 0.7484,
"step": 22030
},
{
"epoch": 0.2465034867269504,
"grad_norm": 0.26586923003196716,
"learning_rate": 1.9416145095710175e-05,
"loss": 0.7499,
"step": 22040
},
{
"epoch": 0.2466153304142131,
"grad_norm": 0.25447341799736023,
"learning_rate": 1.9187719859290054e-05,
"loss": 0.7523,
"step": 22050
},
{
"epoch": 0.24672717410147577,
"grad_norm": 0.2876524031162262,
"learning_rate": 1.8959294622869933e-05,
"loss": 0.7532,
"step": 22060
},
{
"epoch": 0.24683901778873846,
"grad_norm": 0.29897189140319824,
"learning_rate": 1.8730869386449813e-05,
"loss": 0.7339,
"step": 22070
},
{
"epoch": 0.24695086147600115,
"grad_norm": 0.24629873037338257,
"learning_rate": 1.8502444150029696e-05,
"loss": 0.7253,
"step": 22080
},
{
"epoch": 0.2470627051632638,
"grad_norm": 0.2844459116458893,
"learning_rate": 1.827401891360958e-05,
"loss": 0.7247,
"step": 22090
},
{
"epoch": 0.2471745488505265,
"grad_norm": 0.2798469662666321,
"learning_rate": 1.8045593677189458e-05,
"loss": 0.7334,
"step": 22100
},
{
"epoch": 0.2472863925377892,
"grad_norm": 0.26282501220703125,
"learning_rate": 1.7817168440769337e-05,
"loss": 0.735,
"step": 22110
},
{
"epoch": 0.24739823622505186,
"grad_norm": 0.25192755460739136,
"learning_rate": 1.7588743204349217e-05,
"loss": 0.733,
"step": 22120
},
{
"epoch": 0.24751007991231455,
"grad_norm": 0.2808292508125305,
"learning_rate": 1.73603179679291e-05,
"loss": 0.7403,
"step": 22130
},
{
"epoch": 0.24762192359957724,
"grad_norm": 0.28252866864204407,
"learning_rate": 1.713189273150898e-05,
"loss": 0.7296,
"step": 22140
},
{
"epoch": 0.2477337672868399,
"grad_norm": 0.2730456590652466,
"learning_rate": 1.690346749508886e-05,
"loss": 0.7321,
"step": 22150
},
{
"epoch": 0.2478456109741026,
"grad_norm": 0.2562378942966461,
"learning_rate": 1.6675042258668738e-05,
"loss": 0.7195,
"step": 22160
},
{
"epoch": 0.2479574546613653,
"grad_norm": 0.2450082004070282,
"learning_rate": 1.6446617022248617e-05,
"loss": 0.7277,
"step": 22170
},
{
"epoch": 0.24806929834862795,
"grad_norm": 0.25871893763542175,
"learning_rate": 1.62181917858285e-05,
"loss": 0.7143,
"step": 22180
},
{
"epoch": 0.24818114203589065,
"grad_norm": 0.2587449848651886,
"learning_rate": 1.598976654940838e-05,
"loss": 0.708,
"step": 22190
},
{
"epoch": 0.2482929857231533,
"grad_norm": 0.25496092438697815,
"learning_rate": 1.576134131298826e-05,
"loss": 0.7123,
"step": 22200
},
{
"epoch": 0.248404829410416,
"grad_norm": 0.2394058257341385,
"learning_rate": 1.553291607656814e-05,
"loss": 0.714,
"step": 22210
},
{
"epoch": 0.2485166730976787,
"grad_norm": 0.2560165524482727,
"learning_rate": 1.530449084014802e-05,
"loss": 0.7162,
"step": 22220
},
{
"epoch": 0.24862851678494136,
"grad_norm": 0.24602052569389343,
"learning_rate": 1.50760656037279e-05,
"loss": 0.7408,
"step": 22230
},
{
"epoch": 0.24874036047220405,
"grad_norm": 0.27800559997558594,
"learning_rate": 1.484764036730778e-05,
"loss": 0.7247,
"step": 22240
},
{
"epoch": 0.24885220415946674,
"grad_norm": 0.24703536927700043,
"learning_rate": 1.461921513088766e-05,
"loss": 0.7352,
"step": 22250
},
{
"epoch": 0.2489640478467294,
"grad_norm": 0.27936097979545593,
"learning_rate": 1.439078989446754e-05,
"loss": 0.7421,
"step": 22260
},
{
"epoch": 0.2490758915339921,
"grad_norm": 0.265828400850296,
"learning_rate": 1.4162364658047422e-05,
"loss": 0.7234,
"step": 22270
},
{
"epoch": 0.24918773522125479,
"grad_norm": 0.26921194791793823,
"learning_rate": 1.3933939421627303e-05,
"loss": 0.7414,
"step": 22280
},
{
"epoch": 0.24929957890851745,
"grad_norm": 0.2829255163669586,
"learning_rate": 1.3705514185207182e-05,
"loss": 0.7378,
"step": 22290
},
{
"epoch": 0.24941142259578014,
"grad_norm": 0.25702667236328125,
"learning_rate": 1.3477088948787063e-05,
"loss": 0.7475,
"step": 22300
},
{
"epoch": 0.24952326628304283,
"grad_norm": 0.28925350308418274,
"learning_rate": 1.3248663712366943e-05,
"loss": 0.738,
"step": 22310
},
{
"epoch": 0.2496351099703055,
"grad_norm": 0.2792825698852539,
"learning_rate": 1.3020238475946822e-05,
"loss": 0.7315,
"step": 22320
},
{
"epoch": 0.2497469536575682,
"grad_norm": 0.246215358376503,
"learning_rate": 1.2791813239526703e-05,
"loss": 0.7391,
"step": 22330
},
{
"epoch": 0.24985879734483085,
"grad_norm": 0.26492443680763245,
"learning_rate": 1.2563388003106583e-05,
"loss": 0.7478,
"step": 22340
},
{
"epoch": 0.24997064103209354,
"grad_norm": 0.27402445673942566,
"learning_rate": 1.2334962766686464e-05,
"loss": 0.7528,
"step": 22350
},
{
"epoch": 0.25008248471935624,
"grad_norm": 0.2757234573364258,
"learning_rate": 1.2106537530266343e-05,
"loss": 0.7306,
"step": 22360
},
{
"epoch": 0.2501943284066189,
"grad_norm": 0.2723679840564728,
"learning_rate": 1.1878112293846224e-05,
"loss": 0.7472,
"step": 22370
},
{
"epoch": 0.2503061720938816,
"grad_norm": 0.22666431963443756,
"learning_rate": 1.1649687057426105e-05,
"loss": 0.7443,
"step": 22380
},
{
"epoch": 0.25041801578114425,
"grad_norm": 0.24548636376857758,
"learning_rate": 1.1421261821005985e-05,
"loss": 0.7525,
"step": 22390
},
{
"epoch": 0.25052985946840695,
"grad_norm": 0.26941460371017456,
"learning_rate": 1.1192836584585866e-05,
"loss": 0.7482,
"step": 22400
},
{
"epoch": 0.25064170315566964,
"grad_norm": 0.2741219997406006,
"learning_rate": 1.0964411348165745e-05,
"loss": 0.7404,
"step": 22410
},
{
"epoch": 0.25075354684293233,
"grad_norm": 0.2622029483318329,
"learning_rate": 1.0735986111745626e-05,
"loss": 0.7463,
"step": 22420
},
{
"epoch": 0.250865390530195,
"grad_norm": 0.25730788707733154,
"learning_rate": 1.0507560875325506e-05,
"loss": 0.7596,
"step": 22430
},
{
"epoch": 0.25097723421745766,
"grad_norm": 0.24054691195487976,
"learning_rate": 1.0279135638905387e-05,
"loss": 0.7397,
"step": 22440
},
{
"epoch": 0.25108907790472035,
"grad_norm": 0.23557224869728088,
"learning_rate": 1.0050710402485266e-05,
"loss": 0.7426,
"step": 22450
},
{
"epoch": 0.25120092159198304,
"grad_norm": 0.25929298996925354,
"learning_rate": 9.822285166065146e-06,
"loss": 0.7402,
"step": 22460
},
{
"epoch": 0.25131276527924573,
"grad_norm": 0.26300865411758423,
"learning_rate": 9.593859929645027e-06,
"loss": 0.755,
"step": 22470
},
{
"epoch": 0.2514246089665084,
"grad_norm": 0.25753623247146606,
"learning_rate": 9.365434693224906e-06,
"loss": 0.7536,
"step": 22480
},
{
"epoch": 0.2515364526537711,
"grad_norm": 0.2438272088766098,
"learning_rate": 9.13700945680479e-06,
"loss": 0.7528,
"step": 22490
},
{
"epoch": 0.25164829634103375,
"grad_norm": 0.2870919406414032,
"learning_rate": 8.908584220384669e-06,
"loss": 0.772,
"step": 22500
},
{
"epoch": 0.25176014002829644,
"grad_norm": 0.2551197111606598,
"learning_rate": 8.68015898396455e-06,
"loss": 0.7571,
"step": 22510
},
{
"epoch": 0.25187198371555913,
"grad_norm": 0.24423009157180786,
"learning_rate": 8.45173374754443e-06,
"loss": 0.7548,
"step": 22520
},
{
"epoch": 0.2519838274028218,
"grad_norm": 0.2683405578136444,
"learning_rate": 8.223308511124309e-06,
"loss": 0.7631,
"step": 22530
},
{
"epoch": 0.2520956710900845,
"grad_norm": 0.25919967889785767,
"learning_rate": 7.99488327470419e-06,
"loss": 0.7556,
"step": 22540
},
{
"epoch": 0.25220751477734715,
"grad_norm": 0.25076591968536377,
"learning_rate": 7.76645803828407e-06,
"loss": 0.7528,
"step": 22550
},
{
"epoch": 0.25231935846460984,
"grad_norm": 0.2598860561847687,
"learning_rate": 7.53803280186395e-06,
"loss": 0.7565,
"step": 22560
},
{
"epoch": 0.25243120215187254,
"grad_norm": 0.30933788418769836,
"learning_rate": 7.30960756544383e-06,
"loss": 0.7645,
"step": 22570
},
{
"epoch": 0.2525430458391352,
"grad_norm": 0.26472121477127075,
"learning_rate": 7.081182329023711e-06,
"loss": 0.7559,
"step": 22580
},
{
"epoch": 0.2526548895263979,
"grad_norm": 0.28362420201301575,
"learning_rate": 6.852757092603591e-06,
"loss": 0.7618,
"step": 22590
},
{
"epoch": 0.2527667332136606,
"grad_norm": 0.27758538722991943,
"learning_rate": 6.624331856183471e-06,
"loss": 0.7656,
"step": 22600
},
{
"epoch": 0.25287857690092325,
"grad_norm": 0.28303948044776917,
"learning_rate": 6.395906619763352e-06,
"loss": 0.7672,
"step": 22610
},
{
"epoch": 0.25299042058818594,
"grad_norm": 0.2938460409641266,
"learning_rate": 6.167481383343232e-06,
"loss": 0.7662,
"step": 22620
},
{
"epoch": 0.25310226427544863,
"grad_norm": 0.25707969069480896,
"learning_rate": 5.939056146923112e-06,
"loss": 0.7667,
"step": 22630
},
{
"epoch": 0.2532141079627113,
"grad_norm": 0.2813314199447632,
"learning_rate": 5.710630910502992e-06,
"loss": 0.7645,
"step": 22640
},
{
"epoch": 0.253325951649974,
"grad_norm": 0.2911704480648041,
"learning_rate": 5.482205674082873e-06,
"loss": 0.763,
"step": 22650
},
{
"epoch": 0.2534377953372367,
"grad_norm": 0.2982921600341797,
"learning_rate": 5.253780437662753e-06,
"loss": 0.7606,
"step": 22660
},
{
"epoch": 0.25354963902449934,
"grad_norm": 0.2803521156311035,
"learning_rate": 5.025355201242633e-06,
"loss": 0.7617,
"step": 22670
},
{
"epoch": 0.25366148271176203,
"grad_norm": 0.26502448320388794,
"learning_rate": 4.7969299648225135e-06,
"loss": 0.7802,
"step": 22680
},
{
"epoch": 0.2537733263990247,
"grad_norm": 0.27778494358062744,
"learning_rate": 4.568504728402395e-06,
"loss": 0.7776,
"step": 22690
},
{
"epoch": 0.2538851700862874,
"grad_norm": 0.27522069215774536,
"learning_rate": 4.340079491982275e-06,
"loss": 0.7712,
"step": 22700
},
{
"epoch": 0.2539970137735501,
"grad_norm": 0.2718433141708374,
"learning_rate": 4.111654255562154e-06,
"loss": 0.7696,
"step": 22710
},
{
"epoch": 0.25410885746081274,
"grad_norm": 0.35057663917541504,
"learning_rate": 3.883229019142035e-06,
"loss": 0.7648,
"step": 22720
},
{
"epoch": 0.25422070114807543,
"grad_norm": 0.274494469165802,
"learning_rate": 3.654803782721915e-06,
"loss": 0.7578,
"step": 22730
},
{
"epoch": 0.2543325448353381,
"grad_norm": 0.2570250928401947,
"learning_rate": 3.4263785463017955e-06,
"loss": 0.7502,
"step": 22740
},
{
"epoch": 0.2544443885226008,
"grad_norm": 0.290217787027359,
"learning_rate": 3.197953309881676e-06,
"loss": 0.7607,
"step": 22750
},
{
"epoch": 0.2545562322098635,
"grad_norm": 0.25752514600753784,
"learning_rate": 2.969528073461556e-06,
"loss": 0.7612,
"step": 22760
},
{
"epoch": 0.2546680758971262,
"grad_norm": 0.23857931792736053,
"learning_rate": 2.7411028370414363e-06,
"loss": 0.7495,
"step": 22770
},
{
"epoch": 0.25477991958438884,
"grad_norm": 0.26004472374916077,
"learning_rate": 2.5126776006213166e-06,
"loss": 0.7477,
"step": 22780
},
{
"epoch": 0.25489176327165153,
"grad_norm": 0.25449565052986145,
"learning_rate": 2.2842523642011973e-06,
"loss": 0.7379,
"step": 22790
},
{
"epoch": 0.2550036069589142,
"grad_norm": 0.2568104565143585,
"learning_rate": 2.055827127781077e-06,
"loss": 0.7407,
"step": 22800
},
{
"epoch": 0.2551154506461769,
"grad_norm": 0.253451406955719,
"learning_rate": 1.8274018913609574e-06,
"loss": 0.7241,
"step": 22810
},
{
"epoch": 0.2552272943334396,
"grad_norm": 0.25928062200546265,
"learning_rate": 1.598976654940838e-06,
"loss": 0.7502,
"step": 22820
},
{
"epoch": 0.2553391380207023,
"grad_norm": 0.24965140223503113,
"learning_rate": 1.3705514185207182e-06,
"loss": 0.7417,
"step": 22830
},
{
"epoch": 0.25545098170796493,
"grad_norm": 0.2660306394100189,
"learning_rate": 1.1421261821005987e-06,
"loss": 0.7463,
"step": 22840
},
{
"epoch": 0.2555628253952276,
"grad_norm": 0.25784334540367126,
"learning_rate": 9.137009456804787e-07,
"loss": 0.7379,
"step": 22850
},
{
"epoch": 0.2556746690824903,
"grad_norm": 0.27776214480400085,
"learning_rate": 6.852757092603591e-07,
"loss": 0.7562,
"step": 22860
},
{
"epoch": 0.255786512769753,
"grad_norm": 0.24403463304042816,
"learning_rate": 4.5685047284023936e-07,
"loss": 0.7427,
"step": 22870
},
{
"epoch": 0.2558983564570157,
"grad_norm": 0.24544622004032135,
"learning_rate": 2.2842523642011968e-07,
"loss": 0.748,
"step": 22880
}
],
"logging_steps": 10,
"max_steps": 22889,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.946484739580887e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}