{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 987,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010139416983523447,
"grad_norm": 5.121658802032471,
"learning_rate": 0.0,
"loss": 3.0637,
"step": 1
},
{
"epoch": 0.0020278833967046894,
"grad_norm": 4.709918022155762,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.9503,
"step": 2
},
{
"epoch": 0.003041825095057034,
"grad_norm": 4.30893611907959,
"learning_rate": 8.000000000000001e-07,
"loss": 2.925,
"step": 3
},
{
"epoch": 0.004055766793409379,
"grad_norm": 4.957603931427002,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.6827,
"step": 4
},
{
"epoch": 0.005069708491761723,
"grad_norm": 3.963380813598633,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.8648,
"step": 5
},
{
"epoch": 0.006083650190114068,
"grad_norm": 3.815199136734009,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.8512,
"step": 6
},
{
"epoch": 0.007097591888466414,
"grad_norm": 3.235830783843994,
"learning_rate": 2.4000000000000003e-06,
"loss": 3.0078,
"step": 7
},
{
"epoch": 0.008111533586818757,
"grad_norm": 3.193497896194458,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.9759,
"step": 8
},
{
"epoch": 0.009125475285171103,
"grad_norm": 3.743866205215454,
"learning_rate": 3.2000000000000003e-06,
"loss": 2.6373,
"step": 9
},
{
"epoch": 0.010139416983523447,
"grad_norm": 3.758028745651245,
"learning_rate": 3.6000000000000003e-06,
"loss": 3.0804,
"step": 10
},
{
"epoch": 0.011153358681875792,
"grad_norm": 4.36395788192749,
"learning_rate": 4.000000000000001e-06,
"loss": 2.7003,
"step": 11
},
{
"epoch": 0.012167300380228136,
"grad_norm": 3.525747776031494,
"learning_rate": 4.4e-06,
"loss": 2.9755,
"step": 12
},
{
"epoch": 0.013181242078580482,
"grad_norm": 4.282515525817871,
"learning_rate": 4.800000000000001e-06,
"loss": 2.6715,
"step": 13
},
{
"epoch": 0.014195183776932827,
"grad_norm": 3.668238639831543,
"learning_rate": 5.2e-06,
"loss": 2.8512,
"step": 14
},
{
"epoch": 0.015209125475285171,
"grad_norm": 3.9769890308380127,
"learning_rate": 5.600000000000001e-06,
"loss": 2.8992,
"step": 15
},
{
"epoch": 0.016223067173637515,
"grad_norm": 3.9157707691192627,
"learning_rate": 6e-06,
"loss": 2.7106,
"step": 16
},
{
"epoch": 0.01723700887198986,
"grad_norm": 3.3258984088897705,
"learning_rate": 6.4000000000000006e-06,
"loss": 2.6878,
"step": 17
},
{
"epoch": 0.018250950570342206,
"grad_norm": 3.4669008255004883,
"learning_rate": 6.800000000000001e-06,
"loss": 2.8272,
"step": 18
},
{
"epoch": 0.01926489226869455,
"grad_norm": 3.363856077194214,
"learning_rate": 7.2000000000000005e-06,
"loss": 2.9048,
"step": 19
},
{
"epoch": 0.020278833967046894,
"grad_norm": 3.460812568664551,
"learning_rate": 7.600000000000001e-06,
"loss": 2.8649,
"step": 20
},
{
"epoch": 0.02129277566539924,
"grad_norm": 3.656041383743286,
"learning_rate": 8.000000000000001e-06,
"loss": 2.8138,
"step": 21
},
{
"epoch": 0.022306717363751585,
"grad_norm": 3.098552942276001,
"learning_rate": 8.400000000000001e-06,
"loss": 2.9701,
"step": 22
},
{
"epoch": 0.02332065906210393,
"grad_norm": 3.047813892364502,
"learning_rate": 8.8e-06,
"loss": 2.9061,
"step": 23
},
{
"epoch": 0.024334600760456272,
"grad_norm": 3.0990796089172363,
"learning_rate": 9.200000000000002e-06,
"loss": 2.8978,
"step": 24
},
{
"epoch": 0.025348542458808618,
"grad_norm": 3.116218090057373,
"learning_rate": 9.600000000000001e-06,
"loss": 2.7761,
"step": 25
},
{
"epoch": 0.026362484157160963,
"grad_norm": 3.4951255321502686,
"learning_rate": 1e-05,
"loss": 3.0193,
"step": 26
},
{
"epoch": 0.02737642585551331,
"grad_norm": 3.5713748931884766,
"learning_rate": 1.04e-05,
"loss": 2.6956,
"step": 27
},
{
"epoch": 0.028390367553865654,
"grad_norm": 3.284576654434204,
"learning_rate": 1.0800000000000002e-05,
"loss": 2.8543,
"step": 28
},
{
"epoch": 0.029404309252217996,
"grad_norm": 3.5328757762908936,
"learning_rate": 1.1200000000000001e-05,
"loss": 2.7354,
"step": 29
},
{
"epoch": 0.030418250950570342,
"grad_norm": 3.407898187637329,
"learning_rate": 1.16e-05,
"loss": 2.8508,
"step": 30
},
{
"epoch": 0.031432192648922684,
"grad_norm": 3.326301097869873,
"learning_rate": 1.2e-05,
"loss": 2.6679,
"step": 31
},
{
"epoch": 0.03244613434727503,
"grad_norm": 3.08138370513916,
"learning_rate": 1.2400000000000002e-05,
"loss": 2.7425,
"step": 32
},
{
"epoch": 0.033460076045627375,
"grad_norm": 3.3986353874206543,
"learning_rate": 1.2800000000000001e-05,
"loss": 3.036,
"step": 33
},
{
"epoch": 0.03447401774397972,
"grad_norm": 3.1248598098754883,
"learning_rate": 1.3200000000000002e-05,
"loss": 2.6935,
"step": 34
},
{
"epoch": 0.035487959442332066,
"grad_norm": 3.4256432056427,
"learning_rate": 1.3600000000000002e-05,
"loss": 2.9109,
"step": 35
},
{
"epoch": 0.03650190114068441,
"grad_norm": 3.3504834175109863,
"learning_rate": 1.4e-05,
"loss": 2.8338,
"step": 36
},
{
"epoch": 0.03751584283903676,
"grad_norm": 3.3927106857299805,
"learning_rate": 1.4400000000000001e-05,
"loss": 2.7091,
"step": 37
},
{
"epoch": 0.0385297845373891,
"grad_norm": 3.278294086456299,
"learning_rate": 1.48e-05,
"loss": 2.8115,
"step": 38
},
{
"epoch": 0.03954372623574144,
"grad_norm": 3.1749770641326904,
"learning_rate": 1.5200000000000002e-05,
"loss": 2.9376,
"step": 39
},
{
"epoch": 0.04055766793409379,
"grad_norm": 3.4760537147521973,
"learning_rate": 1.5600000000000003e-05,
"loss": 2.626,
"step": 40
},
{
"epoch": 0.04157160963244613,
"grad_norm": 3.464676856994629,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.6685,
"step": 41
},
{
"epoch": 0.04258555133079848,
"grad_norm": 3.4770150184631348,
"learning_rate": 1.64e-05,
"loss": 2.6069,
"step": 42
},
{
"epoch": 0.043599493029150824,
"grad_norm": 3.5418553352355957,
"learning_rate": 1.6800000000000002e-05,
"loss": 2.7534,
"step": 43
},
{
"epoch": 0.04461343472750317,
"grad_norm": 3.355006456375122,
"learning_rate": 1.72e-05,
"loss": 2.8957,
"step": 44
},
{
"epoch": 0.045627376425855515,
"grad_norm": 3.518998861312866,
"learning_rate": 1.76e-05,
"loss": 2.9025,
"step": 45
},
{
"epoch": 0.04664131812420786,
"grad_norm": 3.455519199371338,
"learning_rate": 1.8e-05,
"loss": 2.9387,
"step": 46
},
{
"epoch": 0.047655259822560206,
"grad_norm": 3.3320729732513428,
"learning_rate": 1.8400000000000003e-05,
"loss": 2.7342,
"step": 47
},
{
"epoch": 0.048669201520912544,
"grad_norm": 3.492351531982422,
"learning_rate": 1.88e-05,
"loss": 2.6889,
"step": 48
},
{
"epoch": 0.04968314321926489,
"grad_norm": 3.527967929840088,
"learning_rate": 1.9200000000000003e-05,
"loss": 2.9035,
"step": 49
},
{
"epoch": 0.050697084917617236,
"grad_norm": 3.1041579246520996,
"learning_rate": 1.9600000000000002e-05,
"loss": 3.0125,
"step": 50
},
{
"epoch": 0.05171102661596958,
"grad_norm": 3.240447998046875,
"learning_rate": 2e-05,
"loss": 2.9446,
"step": 51
},
{
"epoch": 0.05272496831432193,
"grad_norm": 3.3307602405548096,
"learning_rate": 1.9999943793031672e-05,
"loss": 2.8274,
"step": 52
},
{
"epoch": 0.05373891001267427,
"grad_norm": 3.235234498977661,
"learning_rate": 1.999977517275853e-05,
"loss": 2.9267,
"step": 53
},
{
"epoch": 0.05475285171102662,
"grad_norm": 3.756901502609253,
"learning_rate": 1.99994941410761e-05,
"loss": 2.9217,
"step": 54
},
{
"epoch": 0.05576679340937896,
"grad_norm": 3.2789499759674072,
"learning_rate": 1.999910070114357e-05,
"loss": 2.8089,
"step": 55
},
{
"epoch": 0.05678073510773131,
"grad_norm": 3.3086624145507812,
"learning_rate": 1.9998594857383756e-05,
"loss": 2.7262,
"step": 56
},
{
"epoch": 0.05779467680608365,
"grad_norm": 3.289076089859009,
"learning_rate": 1.9997976615483042e-05,
"loss": 2.7543,
"step": 57
},
{
"epoch": 0.05880861850443599,
"grad_norm": 3.6002161502838135,
"learning_rate": 1.9997245982391335e-05,
"loss": 3.1664,
"step": 58
},
{
"epoch": 0.05982256020278834,
"grad_norm": 3.1306004524230957,
"learning_rate": 1.9996402966321962e-05,
"loss": 2.7802,
"step": 59
},
{
"epoch": 0.060836501901140684,
"grad_norm": 3.0634193420410156,
"learning_rate": 1.9995447576751605e-05,
"loss": 2.9049,
"step": 60
},
{
"epoch": 0.06185044359949303,
"grad_norm": 3.2394661903381348,
"learning_rate": 1.999437982442017e-05,
"loss": 2.6699,
"step": 61
},
{
"epoch": 0.06286438529784537,
"grad_norm": 3.115633487701416,
"learning_rate": 1.9993199721330684e-05,
"loss": 2.8081,
"step": 62
},
{
"epoch": 0.06387832699619772,
"grad_norm": 3.0398242473602295,
"learning_rate": 1.9991907280749148e-05,
"loss": 2.7188,
"step": 63
},
{
"epoch": 0.06489226869455006,
"grad_norm": 3.1383628845214844,
"learning_rate": 1.99905025172044e-05,
"loss": 2.8215,
"step": 64
},
{
"epoch": 0.06590621039290241,
"grad_norm": 2.999265670776367,
"learning_rate": 1.998898544648793e-05,
"loss": 2.7608,
"step": 65
},
{
"epoch": 0.06692015209125475,
"grad_norm": 3.1938254833221436,
"learning_rate": 1.9987356085653738e-05,
"loss": 2.9058,
"step": 66
},
{
"epoch": 0.0679340937896071,
"grad_norm": 3.3138041496276855,
"learning_rate": 1.99856144530181e-05,
"loss": 2.7962,
"step": 67
},
{
"epoch": 0.06894803548795944,
"grad_norm": 3.443120002746582,
"learning_rate": 1.998376056815941e-05,
"loss": 2.8264,
"step": 68
},
{
"epoch": 0.0699619771863118,
"grad_norm": 3.0764691829681396,
"learning_rate": 1.99817944519179e-05,
"loss": 3.0068,
"step": 69
},
{
"epoch": 0.07097591888466413,
"grad_norm": 3.1897008419036865,
"learning_rate": 1.997971612639547e-05,
"loss": 2.8515,
"step": 70
},
{
"epoch": 0.07198986058301647,
"grad_norm": 3.391533613204956,
"learning_rate": 1.9977525614955388e-05,
"loss": 3.0471,
"step": 71
},
{
"epoch": 0.07300380228136882,
"grad_norm": 3.4642486572265625,
"learning_rate": 1.9975222942222054e-05,
"loss": 2.6579,
"step": 72
},
{
"epoch": 0.07401774397972116,
"grad_norm": 3.1187565326690674,
"learning_rate": 1.9972808134080726e-05,
"loss": 2.6668,
"step": 73
},
{
"epoch": 0.07503168567807351,
"grad_norm": 3.5440473556518555,
"learning_rate": 1.9970281217677207e-05,
"loss": 2.6528,
"step": 74
},
{
"epoch": 0.07604562737642585,
"grad_norm": 3.1511693000793457,
"learning_rate": 1.996764222141756e-05,
"loss": 2.809,
"step": 75
},
{
"epoch": 0.0770595690747782,
"grad_norm": 3.2542364597320557,
"learning_rate": 1.9964891174967786e-05,
"loss": 2.8991,
"step": 76
},
{
"epoch": 0.07807351077313054,
"grad_norm": 3.112870693206787,
"learning_rate": 1.9962028109253474e-05,
"loss": 2.8619,
"step": 77
},
{
"epoch": 0.07908745247148288,
"grad_norm": 3.0254783630371094,
"learning_rate": 1.9959053056459474e-05,
"loss": 2.8254,
"step": 78
},
{
"epoch": 0.08010139416983524,
"grad_norm": 3.1325619220733643,
"learning_rate": 1.995596605002953e-05,
"loss": 3.1019,
"step": 79
},
{
"epoch": 0.08111533586818757,
"grad_norm": 3.5266387462615967,
"learning_rate": 1.9952767124665892e-05,
"loss": 2.7379,
"step": 80
},
{
"epoch": 0.08212927756653993,
"grad_norm": 3.245556116104126,
"learning_rate": 1.9949456316328942e-05,
"loss": 2.8923,
"step": 81
},
{
"epoch": 0.08314321926489227,
"grad_norm": 3.3015191555023193,
"learning_rate": 1.9946033662236778e-05,
"loss": 2.7714,
"step": 82
},
{
"epoch": 0.08415716096324462,
"grad_norm": 3.5391364097595215,
"learning_rate": 1.9942499200864805e-05,
"loss": 2.7163,
"step": 83
},
{
"epoch": 0.08517110266159696,
"grad_norm": 3.1607401371002197,
"learning_rate": 1.993885297194529e-05,
"loss": 2.8144,
"step": 84
},
{
"epoch": 0.08618504435994931,
"grad_norm": 3.0026676654815674,
"learning_rate": 1.993509501646693e-05,
"loss": 2.9125,
"step": 85
},
{
"epoch": 0.08719898605830165,
"grad_norm": 3.2028753757476807,
"learning_rate": 1.9931225376674388e-05,
"loss": 2.8126,
"step": 86
},
{
"epoch": 0.08821292775665399,
"grad_norm": 3.233186960220337,
"learning_rate": 1.99272440960678e-05,
"loss": 2.6726,
"step": 87
},
{
"epoch": 0.08922686945500634,
"grad_norm": 3.0833544731140137,
"learning_rate": 1.9923151219402308e-05,
"loss": 2.6719,
"step": 88
},
{
"epoch": 0.09024081115335868,
"grad_norm": 3.059297561645508,
"learning_rate": 1.9918946792687553e-05,
"loss": 2.7005,
"step": 89
},
{
"epoch": 0.09125475285171103,
"grad_norm": 3.261103868484497,
"learning_rate": 1.9914630863187156e-05,
"loss": 2.5295,
"step": 90
},
{
"epoch": 0.09226869455006337,
"grad_norm": 3.4985969066619873,
"learning_rate": 1.991020347941817e-05,
"loss": 2.7838,
"step": 91
},
{
"epoch": 0.09328263624841572,
"grad_norm": 3.427159309387207,
"learning_rate": 1.9905664691150567e-05,
"loss": 2.6065,
"step": 92
},
{
"epoch": 0.09429657794676806,
"grad_norm": 3.205653429031372,
"learning_rate": 1.9901014549406647e-05,
"loss": 2.854,
"step": 93
},
{
"epoch": 0.09531051964512041,
"grad_norm": 3.1949386596679688,
"learning_rate": 1.9896253106460484e-05,
"loss": 2.7194,
"step": 94
},
{
"epoch": 0.09632446134347275,
"grad_norm": 3.4181830883026123,
"learning_rate": 1.9891380415837333e-05,
"loss": 2.916,
"step": 95
},
{
"epoch": 0.09733840304182509,
"grad_norm": 3.4856419563293457,
"learning_rate": 1.9886396532313033e-05,
"loss": 2.7623,
"step": 96
},
{
"epoch": 0.09835234474017744,
"grad_norm": 3.247903823852539,
"learning_rate": 1.9881301511913372e-05,
"loss": 2.9527,
"step": 97
},
{
"epoch": 0.09936628643852978,
"grad_norm": 3.5031447410583496,
"learning_rate": 1.9876095411913492e-05,
"loss": 2.7794,
"step": 98
},
{
"epoch": 0.10038022813688213,
"grad_norm": 3.3715412616729736,
"learning_rate": 1.9870778290837198e-05,
"loss": 2.9483,
"step": 99
},
{
"epoch": 0.10139416983523447,
"grad_norm": 3.354491949081421,
"learning_rate": 1.9865350208456354e-05,
"loss": 2.7876,
"step": 100
},
{
"epoch": 0.10240811153358682,
"grad_norm": 3.0715112686157227,
"learning_rate": 1.9859811225790164e-05,
"loss": 2.7182,
"step": 101
},
{
"epoch": 0.10342205323193916,
"grad_norm": 3.059152364730835,
"learning_rate": 1.9854161405104512e-05,
"loss": 2.6625,
"step": 102
},
{
"epoch": 0.10443599493029151,
"grad_norm": 3.0856645107269287,
"learning_rate": 1.9848400809911255e-05,
"loss": 2.7493,
"step": 103
},
{
"epoch": 0.10544993662864385,
"grad_norm": 3.523483991622925,
"learning_rate": 1.9842529504967522e-05,
"loss": 2.696,
"step": 104
},
{
"epoch": 0.10646387832699619,
"grad_norm": 3.5456182956695557,
"learning_rate": 1.9836547556274954e-05,
"loss": 2.7098,
"step": 105
},
{
"epoch": 0.10747782002534854,
"grad_norm": 3.539569616317749,
"learning_rate": 1.9830455031078994e-05,
"loss": 2.7044,
"step": 106
},
{
"epoch": 0.10849176172370088,
"grad_norm": 3.270253896713257,
"learning_rate": 1.982425199786811e-05,
"loss": 2.7607,
"step": 107
},
{
"epoch": 0.10950570342205324,
"grad_norm": 3.0167229175567627,
"learning_rate": 1.981793852637305e-05,
"loss": 2.6388,
"step": 108
},
{
"epoch": 0.11051964512040557,
"grad_norm": 3.2504289150238037,
"learning_rate": 1.981151468756603e-05,
"loss": 2.7782,
"step": 109
},
{
"epoch": 0.11153358681875793,
"grad_norm": 3.174086809158325,
"learning_rate": 1.980498055365994e-05,
"loss": 2.875,
"step": 110
},
{
"epoch": 0.11254752851711027,
"grad_norm": 3.3104934692382812,
"learning_rate": 1.9798336198107567e-05,
"loss": 2.8241,
"step": 111
},
{
"epoch": 0.11356147021546262,
"grad_norm": 3.000248908996582,
"learning_rate": 1.9791581695600722e-05,
"loss": 2.8913,
"step": 112
},
{
"epoch": 0.11457541191381496,
"grad_norm": 3.089625597000122,
"learning_rate": 1.9784717122069425e-05,
"loss": 2.7071,
"step": 113
},
{
"epoch": 0.1155893536121673,
"grad_norm": 3.0423169136047363,
"learning_rate": 1.9777742554681044e-05,
"loss": 2.7411,
"step": 114
},
{
"epoch": 0.11660329531051965,
"grad_norm": 3.0240306854248047,
"learning_rate": 1.9770658071839448e-05,
"loss": 2.9403,
"step": 115
},
{
"epoch": 0.11761723700887199,
"grad_norm": 3.2751612663269043,
"learning_rate": 1.9763463753184092e-05,
"loss": 2.8083,
"step": 116
},
{
"epoch": 0.11863117870722434,
"grad_norm": 3.2764761447906494,
"learning_rate": 1.9756159679589143e-05,
"loss": 2.7055,
"step": 117
},
{
"epoch": 0.11964512040557668,
"grad_norm": 3.3007071018218994,
"learning_rate": 1.974874593316257e-05,
"loss": 2.8231,
"step": 118
},
{
"epoch": 0.12065906210392903,
"grad_norm": 3.1403753757476807,
"learning_rate": 1.974122259724521e-05,
"loss": 2.7642,
"step": 119
},
{
"epoch": 0.12167300380228137,
"grad_norm": 3.297636032104492,
"learning_rate": 1.973358975640985e-05,
"loss": 2.7253,
"step": 120
},
{
"epoch": 0.1226869455006337,
"grad_norm": 3.160066604614258,
"learning_rate": 1.9725847496460256e-05,
"loss": 2.7361,
"step": 121
},
{
"epoch": 0.12370088719898606,
"grad_norm": 3.448756217956543,
"learning_rate": 1.9717995904430224e-05,
"loss": 2.6938,
"step": 122
},
{
"epoch": 0.1247148288973384,
"grad_norm": 3.196122884750366,
"learning_rate": 1.9710035068582586e-05,
"loss": 2.881,
"step": 123
},
{
"epoch": 0.12572877059569074,
"grad_norm": 2.8908872604370117,
"learning_rate": 1.970196507840823e-05,
"loss": 2.764,
"step": 124
},
{
"epoch": 0.1267427122940431,
"grad_norm": 3.0789546966552734,
"learning_rate": 1.9693786024625097e-05,
"loss": 2.8602,
"step": 125
},
{
"epoch": 0.12775665399239544,
"grad_norm": 3.1777265071868896,
"learning_rate": 1.968549799917715e-05,
"loss": 2.7955,
"step": 126
},
{
"epoch": 0.12877059569074778,
"grad_norm": 3.3578367233276367,
"learning_rate": 1.9677101095233342e-05,
"loss": 2.8042,
"step": 127
},
{
"epoch": 0.12978453738910012,
"grad_norm": 3.195127487182617,
"learning_rate": 1.966859540718658e-05,
"loss": 2.6936,
"step": 128
},
{
"epoch": 0.13079847908745248,
"grad_norm": 3.3365399837493896,
"learning_rate": 1.9659981030652648e-05,
"loss": 2.8029,
"step": 129
},
{
"epoch": 0.13181242078580482,
"grad_norm": 2.9787654876708984,
"learning_rate": 1.965125806246915e-05,
"loss": 2.6843,
"step": 130
},
{
"epoch": 0.13282636248415716,
"grad_norm": 3.062211751937866,
"learning_rate": 1.9642426600694395e-05,
"loss": 2.8227,
"step": 131
},
{
"epoch": 0.1338403041825095,
"grad_norm": 3.1210687160491943,
"learning_rate": 1.963348674460633e-05,
"loss": 2.9978,
"step": 132
},
{
"epoch": 0.13485424588086184,
"grad_norm": 3.349961757659912,
"learning_rate": 1.9624438594701397e-05,
"loss": 2.6609,
"step": 133
},
{
"epoch": 0.1358681875792142,
"grad_norm": 2.9804890155792236,
"learning_rate": 1.9615282252693407e-05,
"loss": 2.7305,
"step": 134
},
{
"epoch": 0.13688212927756654,
"grad_norm": 3.0483453273773193,
"learning_rate": 1.9606017821512405e-05,
"loss": 2.8606,
"step": 135
},
{
"epoch": 0.13789607097591888,
"grad_norm": 2.8871631622314453,
"learning_rate": 1.9596645405303508e-05,
"loss": 2.959,
"step": 136
},
{
"epoch": 0.13891001267427122,
"grad_norm": 2.925555944442749,
"learning_rate": 1.9587165109425746e-05,
"loss": 2.7799,
"step": 137
},
{
"epoch": 0.1399239543726236,
"grad_norm": 3.3795876502990723,
"learning_rate": 1.9577577040450842e-05,
"loss": 2.4838,
"step": 138
},
{
"epoch": 0.14093789607097593,
"grad_norm": 2.9723339080810547,
"learning_rate": 1.9567881306162065e-05,
"loss": 2.5472,
"step": 139
},
{
"epoch": 0.14195183776932827,
"grad_norm": 3.181236505508423,
"learning_rate": 1.9558078015552973e-05,
"loss": 2.7748,
"step": 140
},
{
"epoch": 0.1429657794676806,
"grad_norm": 3.5478413105010986,
"learning_rate": 1.9548167278826224e-05,
"loss": 2.5927,
"step": 141
},
{
"epoch": 0.14397972116603294,
"grad_norm": 3.1357247829437256,
"learning_rate": 1.9538149207392306e-05,
"loss": 2.8367,
"step": 142
},
{
"epoch": 0.1449936628643853,
"grad_norm": 2.7414956092834473,
"learning_rate": 1.9528023913868305e-05,
"loss": 2.7574,
"step": 143
},
{
"epoch": 0.14600760456273765,
"grad_norm": 2.9295525550842285,
"learning_rate": 1.9517791512076628e-05,
"loss": 2.7733,
"step": 144
},
{
"epoch": 0.14702154626108999,
"grad_norm": 3.1006598472595215,
"learning_rate": 1.9507452117043736e-05,
"loss": 2.7275,
"step": 145
},
{
"epoch": 0.14803548795944232,
"grad_norm": 2.8998913764953613,
"learning_rate": 1.9497005844998835e-05,
"loss": 2.9219,
"step": 146
},
{
"epoch": 0.14904942965779466,
"grad_norm": 3.2651214599609375,
"learning_rate": 1.9486452813372586e-05,
"loss": 2.7337,
"step": 147
},
{
"epoch": 0.15006337135614703,
"grad_norm": 3.1806275844573975,
"learning_rate": 1.947579314079577e-05,
"loss": 2.7466,
"step": 148
},
{
"epoch": 0.15107731305449937,
"grad_norm": 3.137009859085083,
"learning_rate": 1.946502694709796e-05,
"loss": 2.8247,
"step": 149
},
{
"epoch": 0.1520912547528517,
"grad_norm": 3.00809645652771,
"learning_rate": 1.945415435330618e-05,
"loss": 2.6825,
"step": 150
},
{
"epoch": 0.15310519645120405,
"grad_norm": 3.1236531734466553,
"learning_rate": 1.9443175481643536e-05,
"loss": 2.85,
"step": 151
},
{
"epoch": 0.1541191381495564,
"grad_norm": 2.8082268238067627,
"learning_rate": 1.9432090455527847e-05,
"loss": 2.8106,
"step": 152
},
{
"epoch": 0.15513307984790875,
"grad_norm": 3.262784957885742,
"learning_rate": 1.942089939957026e-05,
"loss": 2.5627,
"step": 153
},
{
"epoch": 0.1561470215462611,
"grad_norm": 3.1046485900878906,
"learning_rate": 1.940960243957383e-05,
"loss": 2.8634,
"step": 154
},
{
"epoch": 0.15716096324461343,
"grad_norm": 3.412945508956909,
"learning_rate": 1.9398199702532143e-05,
"loss": 2.6015,
"step": 155
},
{
"epoch": 0.15817490494296577,
"grad_norm": 3.2352962493896484,
"learning_rate": 1.9386691316627845e-05,
"loss": 2.5903,
"step": 156
},
{
"epoch": 0.15918884664131813,
"grad_norm": 3.3233420848846436,
"learning_rate": 1.937507741123124e-05,
"loss": 2.7309,
"step": 157
},
{
"epoch": 0.16020278833967047,
"grad_norm": 3.314918279647827,
"learning_rate": 1.9363358116898804e-05,
"loss": 2.4656,
"step": 158
},
{
"epoch": 0.1612167300380228,
"grad_norm": 3.487234354019165,
"learning_rate": 1.9351533565371747e-05,
"loss": 2.7646,
"step": 159
},
{
"epoch": 0.16223067173637515,
"grad_norm": 3.012712001800537,
"learning_rate": 1.9339603889574498e-05,
"loss": 2.6756,
"step": 160
},
{
"epoch": 0.16324461343472751,
"grad_norm": 3.2188124656677246,
"learning_rate": 1.932756922361325e-05,
"loss": 2.6434,
"step": 161
},
{
"epoch": 0.16425855513307985,
"grad_norm": 3.085340738296509,
"learning_rate": 1.9315429702774408e-05,
"loss": 2.7135,
"step": 162
},
{
"epoch": 0.1652724968314322,
"grad_norm": 3.1618096828460693,
"learning_rate": 1.9303185463523108e-05,
"loss": 2.916,
"step": 163
},
{
"epoch": 0.16628643852978453,
"grad_norm": 2.805600881576538,
"learning_rate": 1.929083664350167e-05,
"loss": 2.8468,
"step": 164
},
{
"epoch": 0.16730038022813687,
"grad_norm": 3.0501015186309814,
"learning_rate": 1.9278383381528036e-05,
"loss": 2.5067,
"step": 165
},
{
"epoch": 0.16831432192648924,
"grad_norm": 3.0693836212158203,
"learning_rate": 1.9265825817594232e-05,
"loss": 2.7927,
"step": 166
},
{
"epoch": 0.16932826362484157,
"grad_norm": 3.132962226867676,
"learning_rate": 1.9253164092864768e-05,
"loss": 2.7414,
"step": 167
},
{
"epoch": 0.1703422053231939,
"grad_norm": 2.9955735206604004,
"learning_rate": 1.9240398349675083e-05,
"loss": 2.8164,
"step": 168
},
{
"epoch": 0.17135614702154625,
"grad_norm": 2.998502254486084,
"learning_rate": 1.922752873152992e-05,
"loss": 2.895,
"step": 169
},
{
"epoch": 0.17237008871989862,
"grad_norm": 2.969083786010742,
"learning_rate": 1.9214555383101724e-05,
"loss": 2.9225,
"step": 170
},
{
"epoch": 0.17338403041825096,
"grad_norm": 3.307370901107788,
"learning_rate": 1.9201478450229012e-05,
"loss": 2.4807,
"step": 171
},
{
"epoch": 0.1743979721166033,
"grad_norm": 2.9617295265197754,
"learning_rate": 1.918829807991473e-05,
"loss": 2.6784,
"step": 172
},
{
"epoch": 0.17541191381495563,
"grad_norm": 3.0904929637908936,
"learning_rate": 1.9175014420324613e-05,
"loss": 2.644,
"step": 173
},
{
"epoch": 0.17642585551330797,
"grad_norm": 3.1362547874450684,
"learning_rate": 1.916162762078551e-05,
"loss": 2.9906,
"step": 174
},
{
"epoch": 0.17743979721166034,
"grad_norm": 3.120621681213379,
"learning_rate": 1.91481378317837e-05,
"loss": 2.7463,
"step": 175
},
{
"epoch": 0.17845373891001268,
"grad_norm": 2.8338675498962402,
"learning_rate": 1.9134545204963214e-05,
"loss": 2.663,
"step": 176
},
{
"epoch": 0.17946768060836502,
"grad_norm": 3.007894515991211,
"learning_rate": 1.912084989312412e-05,
"loss": 2.5924,
"step": 177
},
{
"epoch": 0.18048162230671735,
"grad_norm": 3.334587812423706,
"learning_rate": 1.9107052050220808e-05,
"loss": 2.668,
"step": 178
},
{
"epoch": 0.18149556400506972,
"grad_norm": 3.349618911743164,
"learning_rate": 1.9093151831360268e-05,
"loss": 2.3898,
"step": 179
},
{
"epoch": 0.18250950570342206,
"grad_norm": 3.2005631923675537,
"learning_rate": 1.907914939280033e-05,
"loss": 2.7596,
"step": 180
},
{
"epoch": 0.1835234474017744,
"grad_norm": 2.9870338439941406,
"learning_rate": 1.906504489194791e-05,
"loss": 2.6952,
"step": 181
},
{
"epoch": 0.18453738910012674,
"grad_norm": 3.391935110092163,
"learning_rate": 1.9050838487357267e-05,
"loss": 2.8396,
"step": 182
},
{
"epoch": 0.18555133079847907,
"grad_norm": 3.0437772274017334,
"learning_rate": 1.903653033872818e-05,
"loss": 2.7672,
"step": 183
},
{
"epoch": 0.18656527249683144,
"grad_norm": 3.137467384338379,
"learning_rate": 1.902212060690418e-05,
"loss": 2.5997,
"step": 184
},
{
"epoch": 0.18757921419518378,
"grad_norm": 3.1098976135253906,
"learning_rate": 1.9007609453870738e-05,
"loss": 2.5968,
"step": 185
},
{
"epoch": 0.18859315589353612,
"grad_norm": 2.889174461364746,
"learning_rate": 1.8992997042753437e-05,
"loss": 2.8723,
"step": 186
},
{
"epoch": 0.18960709759188846,
"grad_norm": 2.9620108604431152,
"learning_rate": 1.897828353781614e-05,
"loss": 2.8452,
"step": 187
},
{
"epoch": 0.19062103929024082,
"grad_norm": 3.097540855407715,
"learning_rate": 1.8963469104459157e-05,
"loss": 2.6636,
"step": 188
},
{
"epoch": 0.19163498098859316,
"grad_norm": 3.580857753753662,
"learning_rate": 1.8948553909217354e-05,
"loss": 2.9052,
"step": 189
},
{
"epoch": 0.1926489226869455,
"grad_norm": 3.1843173503875732,
"learning_rate": 1.893353811975832e-05,
"loss": 2.6518,
"step": 190
},
{
"epoch": 0.19366286438529784,
"grad_norm": 3.3880133628845215,
"learning_rate": 1.891842190488045e-05,
"loss": 2.6159,
"step": 191
},
{
"epoch": 0.19467680608365018,
"grad_norm": 3.032352924346924,
"learning_rate": 1.8903205434511072e-05,
"loss": 2.7976,
"step": 192
},
{
"epoch": 0.19569074778200254,
"grad_norm": 3.150266647338867,
"learning_rate": 1.888788887970452e-05,
"loss": 2.6576,
"step": 193
},
{
"epoch": 0.19670468948035488,
"grad_norm": 3.5360684394836426,
"learning_rate": 1.8872472412640207e-05,
"loss": 2.6702,
"step": 194
},
{
"epoch": 0.19771863117870722,
"grad_norm": 3.3476462364196777,
"learning_rate": 1.8856956206620717e-05,
"loss": 2.7005,
"step": 195
},
{
"epoch": 0.19873257287705956,
"grad_norm": 3.220618963241577,
"learning_rate": 1.8841340436069825e-05,
"loss": 2.9062,
"step": 196
},
{
"epoch": 0.19974651457541193,
"grad_norm": 3.2296738624572754,
"learning_rate": 1.8825625276530558e-05,
"loss": 2.6451,
"step": 197
},
{
"epoch": 0.20076045627376427,
"grad_norm": 2.9868884086608887,
"learning_rate": 1.880981090466321e-05,
"loss": 2.9176,
"step": 198
},
{
"epoch": 0.2017743979721166,
"grad_norm": 2.976165771484375,
"learning_rate": 1.879389749824336e-05,
"loss": 2.7571,
"step": 199
},
{
"epoch": 0.20278833967046894,
"grad_norm": 3.1209869384765625,
"learning_rate": 1.877788523615988e-05,
"loss": 2.7242,
"step": 200
},
{
"epoch": 0.20380228136882128,
"grad_norm": 2.7696938514709473,
"learning_rate": 1.8761774298412905e-05,
"loss": 2.8417,
"step": 201
},
{
"epoch": 0.20481622306717365,
"grad_norm": 3.3098506927490234,
"learning_rate": 1.874556486611183e-05,
"loss": 2.72,
"step": 202
},
{
"epoch": 0.20583016476552599,
"grad_norm": 3.050853967666626,
"learning_rate": 1.8729257121473262e-05,
"loss": 2.6551,
"step": 203
},
{
"epoch": 0.20684410646387832,
"grad_norm": 2.8224573135375977,
"learning_rate": 1.8712851247818985e-05,
"loss": 2.7885,
"step": 204
},
{
"epoch": 0.20785804816223066,
"grad_norm": 3.180731773376465,
"learning_rate": 1.869634742957388e-05,
"loss": 2.6939,
"step": 205
},
{
"epoch": 0.20887198986058303,
"grad_norm": 3.3162038326263428,
"learning_rate": 1.867974585226386e-05,
"loss": 2.687,
"step": 206
},
{
"epoch": 0.20988593155893537,
"grad_norm": 2.7698941230773926,
"learning_rate": 1.8663046702513795e-05,
"loss": 2.6386,
"step": 207
},
{
"epoch": 0.2108998732572877,
"grad_norm": 2.8454737663269043,
"learning_rate": 1.8646250168045402e-05,
"loss": 2.6816,
"step": 208
},
{
"epoch": 0.21191381495564005,
"grad_norm": 3.039781332015991,
"learning_rate": 1.862935643767514e-05,
"loss": 2.6857,
"step": 209
},
{
"epoch": 0.21292775665399238,
"grad_norm": 2.972322463989258,
"learning_rate": 1.8612365701312075e-05,
"loss": 2.7863,
"step": 210
},
{
"epoch": 0.21394169835234475,
"grad_norm": 3.0482544898986816,
"learning_rate": 1.859527814995577e-05,
"loss": 2.5569,
"step": 211
},
{
"epoch": 0.2149556400506971,
"grad_norm": 3.240053653717041,
"learning_rate": 1.8578093975694116e-05,
"loss": 2.6481,
"step": 212
},
{
"epoch": 0.21596958174904943,
"grad_norm": 3.193847179412842,
"learning_rate": 1.8560813371701174e-05,
"loss": 2.7145,
"step": 213
},
{
"epoch": 0.21698352344740177,
"grad_norm": 3.1215016841888428,
"learning_rate": 1.8543436532235024e-05,
"loss": 2.8692,
"step": 214
},
{
"epoch": 0.21799746514575413,
"grad_norm": 2.8203959465026855,
"learning_rate": 1.8525963652635556e-05,
"loss": 2.8702,
"step": 215
},
{
"epoch": 0.21901140684410647,
"grad_norm": 3.0731961727142334,
"learning_rate": 1.8508394929322287e-05,
"loss": 2.6172,
"step": 216
},
{
"epoch": 0.2200253485424588,
"grad_norm": 2.7819225788116455,
"learning_rate": 1.8490730559792153e-05,
"loss": 2.6681,
"step": 217
},
{
"epoch": 0.22103929024081115,
"grad_norm": 2.939059019088745,
"learning_rate": 1.8472970742617284e-05,
"loss": 2.538,
"step": 218
},
{
"epoch": 0.2220532319391635,
"grad_norm": 3.051414728164673,
"learning_rate": 1.8455115677442782e-05,
"loss": 2.969,
"step": 219
},
{
"epoch": 0.22306717363751585,
"grad_norm": 3.1464333534240723,
"learning_rate": 1.8437165564984455e-05,
"loss": 2.8688,
"step": 220
},
{
"epoch": 0.2240811153358682,
"grad_norm": 2.8206515312194824,
"learning_rate": 1.841912060702659e-05,
"loss": 2.662,
"step": 221
},
{
"epoch": 0.22509505703422053,
"grad_norm": 3.0998098850250244,
"learning_rate": 1.8400981006419663e-05,
"loss": 2.657,
"step": 222
},
{
"epoch": 0.22610899873257287,
"grad_norm": 3.293029308319092,
"learning_rate": 1.8382746967078063e-05,
"loss": 2.5864,
"step": 223
},
{
"epoch": 0.22712294043092524,
"grad_norm": 3.0117993354797363,
"learning_rate": 1.8364418693977803e-05,
"loss": 2.7637,
"step": 224
},
{
"epoch": 0.22813688212927757,
"grad_norm": 3.1829309463500977,
"learning_rate": 1.834599639315422e-05,
"loss": 2.7799,
"step": 225
},
{
"epoch": 0.2291508238276299,
"grad_norm": 2.91520619392395,
"learning_rate": 1.8327480271699647e-05,
"loss": 2.678,
"step": 226
},
{
"epoch": 0.23016476552598225,
"grad_norm": 2.9226183891296387,
"learning_rate": 1.8308870537761094e-05,
"loss": 2.5984,
"step": 227
},
{
"epoch": 0.2311787072243346,
"grad_norm": 3.276668071746826,
"learning_rate": 1.829016740053791e-05,
"loss": 2.7147,
"step": 228
},
{
"epoch": 0.23219264892268696,
"grad_norm": 2.882828950881958,
"learning_rate": 1.8271371070279418e-05,
"loss": 2.7388,
"step": 229
},
{
"epoch": 0.2332065906210393,
"grad_norm": 2.8925859928131104,
"learning_rate": 1.8252481758282573e-05,
"loss": 2.9969,
"step": 230
},
{
"epoch": 0.23422053231939163,
"grad_norm": 2.904247760772705,
"learning_rate": 1.8233499676889556e-05,
"loss": 2.7041,
"step": 231
},
{
"epoch": 0.23523447401774397,
"grad_norm": 2.9626731872558594,
"learning_rate": 1.8214425039485428e-05,
"loss": 2.7114,
"step": 232
},
{
"epoch": 0.23624841571609634,
"grad_norm": 3.2437496185302734,
"learning_rate": 1.8195258060495693e-05,
"loss": 2.6311,
"step": 233
},
{
"epoch": 0.23726235741444868,
"grad_norm": 2.8737614154815674,
"learning_rate": 1.8175998955383906e-05,
"loss": 2.7573,
"step": 234
},
{
"epoch": 0.23827629911280102,
"grad_norm": 2.9143168926239014,
"learning_rate": 1.815664794064925e-05,
"loss": 2.4661,
"step": 235
},
{
"epoch": 0.23929024081115335,
"grad_norm": 3.0798699855804443,
"learning_rate": 1.81372052338241e-05,
"loss": 2.9082,
"step": 236
},
{
"epoch": 0.2403041825095057,
"grad_norm": 2.7913010120391846,
"learning_rate": 1.8117671053471576e-05,
"loss": 2.7929,
"step": 237
},
{
"epoch": 0.24131812420785806,
"grad_norm": 3.1236677169799805,
"learning_rate": 1.8098045619183092e-05,
"loss": 2.7495,
"step": 238
},
{
"epoch": 0.2423320659062104,
"grad_norm": 2.8255903720855713,
"learning_rate": 1.8078329151575874e-05,
"loss": 2.7466,
"step": 239
},
{
"epoch": 0.24334600760456274,
"grad_norm": 2.974113702774048,
"learning_rate": 1.8058521872290505e-05,
"loss": 2.904,
"step": 240
},
{
"epoch": 0.24435994930291507,
"grad_norm": 3.112579107284546,
"learning_rate": 1.8038624003988406e-05,
"loss": 2.4985,
"step": 241
},
{
"epoch": 0.2453738910012674,
"grad_norm": 2.8516876697540283,
"learning_rate": 1.8018635770349343e-05,
"loss": 2.6785,
"step": 242
},
{
"epoch": 0.24638783269961978,
"grad_norm": 2.862663745880127,
"learning_rate": 1.7998557396068923e-05,
"loss": 2.7665,
"step": 243
},
{
"epoch": 0.24740177439797212,
"grad_norm": 2.899121046066284,
"learning_rate": 1.7978389106856056e-05,
"loss": 2.5681,
"step": 244
},
{
"epoch": 0.24841571609632446,
"grad_norm": 2.860578775405884,
"learning_rate": 1.7958131129430417e-05,
"loss": 2.5012,
"step": 245
},
{
"epoch": 0.2494296577946768,
"grad_norm": 2.837778091430664,
"learning_rate": 1.793778369151991e-05,
"loss": 2.4009,
"step": 246
},
{
"epoch": 0.25044359949302913,
"grad_norm": 2.9025585651397705,
"learning_rate": 1.7917347021858092e-05,
"loss": 2.7712,
"step": 247
},
{
"epoch": 0.2514575411913815,
"grad_norm": 2.781691074371338,
"learning_rate": 1.7896821350181613e-05,
"loss": 2.7241,
"step": 248
},
{
"epoch": 0.25247148288973387,
"grad_norm": 3.0579018592834473,
"learning_rate": 1.7876206907227628e-05,
"loss": 2.7345,
"step": 249
},
{
"epoch": 0.2534854245880862,
"grad_norm": 3.0960402488708496,
"learning_rate": 1.7855503924731205e-05,
"loss": 2.8208,
"step": 250
},
{
"epoch": 0.25449936628643854,
"grad_norm": 2.78804349899292,
"learning_rate": 1.7834712635422718e-05,
"loss": 2.8634,
"step": 251
},
{
"epoch": 0.2555133079847909,
"grad_norm": 3.100924491882324,
"learning_rate": 1.7813833273025237e-05,
"loss": 2.6371,
"step": 252
},
{
"epoch": 0.2565272496831432,
"grad_norm": 2.7627742290496826,
"learning_rate": 1.77928660722519e-05,
"loss": 2.5151,
"step": 253
},
{
"epoch": 0.25754119138149556,
"grad_norm": 3.0632212162017822,
"learning_rate": 1.7771811268803258e-05,
"loss": 2.8475,
"step": 254
},
{
"epoch": 0.2585551330798479,
"grad_norm": 3.1408166885375977,
"learning_rate": 1.7750669099364643e-05,
"loss": 2.7642,
"step": 255
},
{
"epoch": 0.25956907477820024,
"grad_norm": 2.8695948123931885,
"learning_rate": 1.772943980160351e-05,
"loss": 2.6326,
"step": 256
},
{
"epoch": 0.2605830164765526,
"grad_norm": 2.7842013835906982,
"learning_rate": 1.770812361416675e-05,
"loss": 2.6616,
"step": 257
},
{
"epoch": 0.26159695817490497,
"grad_norm": 2.9320297241210938,
"learning_rate": 1.768672077667802e-05,
"loss": 2.559,
"step": 258
},
{
"epoch": 0.2626108998732573,
"grad_norm": 3.1390798091888428,
"learning_rate": 1.7665231529735042e-05,
"loss": 2.6741,
"step": 259
},
{
"epoch": 0.26362484157160965,
"grad_norm": 2.9545533657073975,
"learning_rate": 1.7643656114906895e-05,
"loss": 2.9671,
"step": 260
},
{
"epoch": 0.264638783269962,
"grad_norm": 2.8401505947113037,
"learning_rate": 1.762199477473131e-05,
"loss": 3.0152,
"step": 261
},
{
"epoch": 0.2656527249683143,
"grad_norm": 3.3156533241271973,
"learning_rate": 1.7600247752711952e-05,
"loss": 2.6879,
"step": 262
},
{
"epoch": 0.26666666666666666,
"grad_norm": 3.1197516918182373,
"learning_rate": 1.7578415293315646e-05,
"loss": 2.6275,
"step": 263
},
{
"epoch": 0.267680608365019,
"grad_norm": 3.0614259243011475,
"learning_rate": 1.7556497641969658e-05,
"loss": 2.52,
"step": 264
},
{
"epoch": 0.26869455006337134,
"grad_norm": 3.004460573196411,
"learning_rate": 1.7534495045058947e-05,
"loss": 2.5844,
"step": 265
},
{
"epoch": 0.2697084917617237,
"grad_norm": 2.928093433380127,
"learning_rate": 1.751240774992336e-05,
"loss": 2.6644,
"step": 266
},
{
"epoch": 0.2707224334600761,
"grad_norm": 2.934621572494507,
"learning_rate": 1.749023600485488e-05,
"loss": 2.4901,
"step": 267
},
{
"epoch": 0.2717363751584284,
"grad_norm": 2.828896999359131,
"learning_rate": 1.7467980059094817e-05,
"loss": 2.5597,
"step": 268
},
{
"epoch": 0.27275031685678075,
"grad_norm": 3.0283548831939697,
"learning_rate": 1.744564016283102e-05,
"loss": 2.7064,
"step": 269
},
{
"epoch": 0.2737642585551331,
"grad_norm": 2.7558844089508057,
"learning_rate": 1.742321656719506e-05,
"loss": 2.7417,
"step": 270
},
{
"epoch": 0.2747782002534854,
"grad_norm": 2.749242067337036,
"learning_rate": 1.74007095242594e-05,
"loss": 2.7896,
"step": 271
},
{
"epoch": 0.27579214195183777,
"grad_norm": 2.8161332607269287,
"learning_rate": 1.737811928703457e-05,
"loss": 2.6507,
"step": 272
},
{
"epoch": 0.2768060836501901,
"grad_norm": 3.237185001373291,
"learning_rate": 1.7355446109466326e-05,
"loss": 2.4539,
"step": 273
},
{
"epoch": 0.27782002534854244,
"grad_norm": 2.7998554706573486,
"learning_rate": 1.7332690246432774e-05,
"loss": 2.6713,
"step": 274
},
{
"epoch": 0.2788339670468948,
"grad_norm": 3.0803017616271973,
"learning_rate": 1.7309851953741532e-05,
"loss": 2.816,
"step": 275
},
{
"epoch": 0.2798479087452472,
"grad_norm": 2.995108127593994,
"learning_rate": 1.728693148812684e-05,
"loss": 2.6192,
"step": 276
},
{
"epoch": 0.2808618504435995,
"grad_norm": 3.189038038253784,
"learning_rate": 1.7263929107246672e-05,
"loss": 2.7477,
"step": 277
},
{
"epoch": 0.28187579214195185,
"grad_norm": 3.0140798091888428,
"learning_rate": 1.724084506967985e-05,
"loss": 2.5284,
"step": 278
},
{
"epoch": 0.2828897338403042,
"grad_norm": 3.156503438949585,
"learning_rate": 1.721767963492313e-05,
"loss": 2.8121,
"step": 279
},
{
"epoch": 0.28390367553865653,
"grad_norm": 2.9829587936401367,
"learning_rate": 1.7194433063388273e-05,
"loss": 2.833,
"step": 280
},
{
"epoch": 0.28491761723700887,
"grad_norm": 2.734154224395752,
"learning_rate": 1.7171105616399153e-05,
"loss": 2.934,
"step": 281
},
{
"epoch": 0.2859315589353612,
"grad_norm": 3.1216416358947754,
"learning_rate": 1.714769755618878e-05,
"loss": 2.5699,
"step": 282
},
{
"epoch": 0.28694550063371355,
"grad_norm": 2.832775592803955,
"learning_rate": 1.712420914589637e-05,
"loss": 2.6746,
"step": 283
},
{
"epoch": 0.2879594423320659,
"grad_norm": 2.7044105529785156,
"learning_rate": 1.7100640649564396e-05,
"loss": 2.8415,
"step": 284
},
{
"epoch": 0.2889733840304182,
"grad_norm": 2.996109962463379,
"learning_rate": 1.7076992332135595e-05,
"loss": 2.6205,
"step": 285
},
{
"epoch": 0.2899873257287706,
"grad_norm": 2.9617865085601807,
"learning_rate": 1.7053264459450023e-05,
"loss": 2.4976,
"step": 286
},
{
"epoch": 0.29100126742712296,
"grad_norm": 2.9564125537872314,
"learning_rate": 1.7029457298242035e-05,
"loss": 2.6155,
"step": 287
},
{
"epoch": 0.2920152091254753,
"grad_norm": 3.1350152492523193,
"learning_rate": 1.70055711161373e-05,
"loss": 2.829,
"step": 288
},
{
"epoch": 0.29302915082382763,
"grad_norm": 3.3603250980377197,
"learning_rate": 1.698160618164979e-05,
"loss": 2.7119,
"step": 289
},
{
"epoch": 0.29404309252217997,
"grad_norm": 2.923590898513794,
"learning_rate": 1.6957562764178774e-05,
"loss": 2.6255,
"step": 290
},
{
"epoch": 0.2950570342205323,
"grad_norm": 3.131751537322998,
"learning_rate": 1.6933441134005774e-05,
"loss": 2.7487,
"step": 291
},
{
"epoch": 0.29607097591888465,
"grad_norm": 3.136584758758545,
"learning_rate": 1.6909241562291522e-05,
"loss": 2.6563,
"step": 292
},
{
"epoch": 0.297084917617237,
"grad_norm": 2.8328192234039307,
"learning_rate": 1.6884964321072938e-05,
"loss": 2.801,
"step": 293
},
{
"epoch": 0.2980988593155893,
"grad_norm": 3.432396173477173,
"learning_rate": 1.686060968326005e-05,
"loss": 2.5774,
"step": 294
},
{
"epoch": 0.2991128010139417,
"grad_norm": 3.122170925140381,
"learning_rate": 1.6836177922632918e-05,
"loss": 2.6086,
"step": 295
},
{
"epoch": 0.30012674271229406,
"grad_norm": 2.932734489440918,
"learning_rate": 1.681166931383859e-05,
"loss": 2.6581,
"step": 296
},
{
"epoch": 0.3011406844106464,
"grad_norm": 3.0569913387298584,
"learning_rate": 1.6787084132387987e-05,
"loss": 2.7375,
"step": 297
},
{
"epoch": 0.30215462610899874,
"grad_norm": 3.207404613494873,
"learning_rate": 1.6762422654652806e-05,
"loss": 2.9635,
"step": 298
},
{
"epoch": 0.3031685678073511,
"grad_norm": 3.0299222469329834,
"learning_rate": 1.6737685157862428e-05,
"loss": 2.7321,
"step": 299
},
{
"epoch": 0.3041825095057034,
"grad_norm": 3.0222506523132324,
"learning_rate": 1.6712871920100796e-05,
"loss": 2.8171,
"step": 300
},
{
"epoch": 0.30519645120405575,
"grad_norm": 2.854688882827759,
"learning_rate": 1.668798322030328e-05,
"loss": 2.5802,
"step": 301
},
{
"epoch": 0.3062103929024081,
"grad_norm": 2.8383164405822754,
"learning_rate": 1.6663019338253556e-05,
"loss": 2.7979,
"step": 302
},
{
"epoch": 0.30722433460076043,
"grad_norm": 3.0571515560150146,
"learning_rate": 1.6637980554580447e-05,
"loss": 2.7113,
"step": 303
},
{
"epoch": 0.3082382762991128,
"grad_norm": 3.032212018966675,
"learning_rate": 1.6612867150754776e-05,
"loss": 2.7551,
"step": 304
},
{
"epoch": 0.30925221799746516,
"grad_norm": 3.250274896621704,
"learning_rate": 1.6587679409086207e-05,
"loss": 2.6444,
"step": 305
},
{
"epoch": 0.3102661596958175,
"grad_norm": 2.826704740524292,
"learning_rate": 1.6562417612720055e-05,
"loss": 2.6877,
"step": 306
},
{
"epoch": 0.31128010139416984,
"grad_norm": 3.150624990463257,
"learning_rate": 1.6537082045634116e-05,
"loss": 2.7872,
"step": 307
},
{
"epoch": 0.3122940430925222,
"grad_norm": 3.249372959136963,
"learning_rate": 1.6511672992635478e-05,
"loss": 2.8034,
"step": 308
},
{
"epoch": 0.3133079847908745,
"grad_norm": 2.992123603820801,
"learning_rate": 1.6486190739357307e-05,
"loss": 2.7451,
"step": 309
},
{
"epoch": 0.31432192648922685,
"grad_norm": 2.8580946922302246,
"learning_rate": 1.6460635572255644e-05,
"loss": 2.5405,
"step": 310
},
{
"epoch": 0.3153358681875792,
"grad_norm": 2.7302498817443848,
"learning_rate": 1.6435007778606177e-05,
"loss": 2.5465,
"step": 311
},
{
"epoch": 0.31634980988593153,
"grad_norm": 2.9869136810302734,
"learning_rate": 1.6409307646501032e-05,
"loss": 2.852,
"step": 312
},
{
"epoch": 0.3173637515842839,
"grad_norm": 2.967942237854004,
"learning_rate": 1.6383535464845507e-05,
"loss": 2.4507,
"step": 313
},
{
"epoch": 0.31837769328263626,
"grad_norm": 3.0977964401245117,
"learning_rate": 1.635769152335484e-05,
"loss": 2.5726,
"step": 314
},
{
"epoch": 0.3193916349809886,
"grad_norm": 2.805619955062866,
"learning_rate": 1.6331776112550956e-05,
"loss": 2.6916,
"step": 315
},
{
"epoch": 0.32040557667934094,
"grad_norm": 2.807981014251709,
"learning_rate": 1.6305789523759186e-05,
"loss": 2.6778,
"step": 316
},
{
"epoch": 0.3214195183776933,
"grad_norm": 3.1002237796783447,
"learning_rate": 1.6279732049105e-05,
"loss": 2.5728,
"step": 317
},
{
"epoch": 0.3224334600760456,
"grad_norm": 3.0552313327789307,
"learning_rate": 1.6253603981510742e-05,
"loss": 2.7891,
"step": 318
},
{
"epoch": 0.32344740177439796,
"grad_norm": 3.123044013977051,
"learning_rate": 1.6227405614692295e-05,
"loss": 2.5147,
"step": 319
},
{
"epoch": 0.3244613434727503,
"grad_norm": 3.1019504070281982,
"learning_rate": 1.6201137243155815e-05,
"loss": 2.6905,
"step": 320
},
{
"epoch": 0.32547528517110264,
"grad_norm": 2.9532058238983154,
"learning_rate": 1.617479916219441e-05,
"loss": 2.9354,
"step": 321
},
{
"epoch": 0.32648922686945503,
"grad_norm": 2.9724395275115967,
"learning_rate": 1.614839166788481e-05,
"loss": 2.3712,
"step": 322
},
{
"epoch": 0.32750316856780737,
"grad_norm": 3.3524832725524902,
"learning_rate": 1.6121915057084064e-05,
"loss": 2.8975,
"step": 323
},
{
"epoch": 0.3285171102661597,
"grad_norm": 3.259046792984009,
"learning_rate": 1.609536962742617e-05,
"loss": 2.6001,
"step": 324
},
{
"epoch": 0.32953105196451205,
"grad_norm": 2.8253602981567383,
"learning_rate": 1.606875567731876e-05,
"loss": 2.794,
"step": 325
},
{
"epoch": 0.3305449936628644,
"grad_norm": 2.986966609954834,
"learning_rate": 1.6042073505939718e-05,
"loss": 2.6207,
"step": 326
},
{
"epoch": 0.3315589353612167,
"grad_norm": 2.9867730140686035,
"learning_rate": 1.6015323413233838e-05,
"loss": 2.8266,
"step": 327
},
{
"epoch": 0.33257287705956906,
"grad_norm": 2.8082540035247803,
"learning_rate": 1.598850569990944e-05,
"loss": 2.5946,
"step": 328
},
{
"epoch": 0.3335868187579214,
"grad_norm": 2.93464732170105,
"learning_rate": 1.5961620667434997e-05,
"loss": 2.6015,
"step": 329
},
{
"epoch": 0.33460076045627374,
"grad_norm": 2.888319969177246,
"learning_rate": 1.593466861803575e-05,
"loss": 2.6121,
"step": 330
},
{
"epoch": 0.33561470215462613,
"grad_norm": 2.75567889213562,
"learning_rate": 1.5907649854690292e-05,
"loss": 2.716,
"step": 331
},
{
"epoch": 0.33662864385297847,
"grad_norm": 2.6718239784240723,
"learning_rate": 1.5880564681127172e-05,
"loss": 2.6195,
"step": 332
},
{
"epoch": 0.3376425855513308,
"grad_norm": 2.933192253112793,
"learning_rate": 1.58534134018215e-05,
"loss": 2.8557,
"step": 333
},
{
"epoch": 0.33865652724968315,
"grad_norm": 4.560988903045654,
"learning_rate": 1.5826196321991484e-05,
"loss": 2.7728,
"step": 334
},
{
"epoch": 0.3396704689480355,
"grad_norm": 2.9383349418640137,
"learning_rate": 1.5798913747595038e-05,
"loss": 2.5062,
"step": 335
},
{
"epoch": 0.3406844106463878,
"grad_norm": 3.134274482727051,
"learning_rate": 1.5771565985326323e-05,
"loss": 2.5433,
"step": 336
},
{
"epoch": 0.34169835234474016,
"grad_norm": 2.9582736492156982,
"learning_rate": 1.57441533426123e-05,
"loss": 2.777,
"step": 337
},
{
"epoch": 0.3427122940430925,
"grad_norm": 3.3106932640075684,
"learning_rate": 1.5716676127609277e-05,
"loss": 2.5834,
"step": 338
},
{
"epoch": 0.34372623574144484,
"grad_norm": 2.8983476161956787,
"learning_rate": 1.568913464919944e-05,
"loss": 2.5572,
"step": 339
},
{
"epoch": 0.34474017743979724,
"grad_norm": 3.2694075107574463,
"learning_rate": 1.5661529216987393e-05,
"loss": 2.4801,
"step": 340
},
{
"epoch": 0.3457541191381496,
"grad_norm": 3.032731056213379,
"learning_rate": 1.563386014129667e-05,
"loss": 2.8144,
"step": 341
},
{
"epoch": 0.3467680608365019,
"grad_norm": 2.8788082599639893,
"learning_rate": 1.5606127733166237e-05,
"loss": 2.7998,
"step": 342
},
{
"epoch": 0.34778200253485425,
"grad_norm": 2.8466122150421143,
"learning_rate": 1.5578332304347016e-05,
"loss": 2.7319,
"step": 343
},
{
"epoch": 0.3487959442332066,
"grad_norm": 3.006892442703247,
"learning_rate": 1.5550474167298364e-05,
"loss": 2.7248,
"step": 344
},
{
"epoch": 0.34980988593155893,
"grad_norm": 2.8438527584075928,
"learning_rate": 1.5522553635184567e-05,
"loss": 2.7117,
"step": 345
},
{
"epoch": 0.35082382762991127,
"grad_norm": 3.038827896118164,
"learning_rate": 1.549457102187131e-05,
"loss": 2.6968,
"step": 346
},
{
"epoch": 0.3518377693282636,
"grad_norm": 3.1003973484039307,
"learning_rate": 1.5466526641922174e-05,
"loss": 2.5117,
"step": 347
},
{
"epoch": 0.35285171102661594,
"grad_norm": 2.9051380157470703,
"learning_rate": 1.5438420810595073e-05,
"loss": 2.5873,
"step": 348
},
{
"epoch": 0.35386565272496834,
"grad_norm": 2.885514259338379,
"learning_rate": 1.5410253843838717e-05,
"loss": 3.1226,
"step": 349
},
{
"epoch": 0.3548795944233207,
"grad_norm": 2.6996588706970215,
"learning_rate": 1.538202605828907e-05,
"loss": 2.7712,
"step": 350
},
{
"epoch": 0.355893536121673,
"grad_norm": 2.927683115005493,
"learning_rate": 1.5353737771265785e-05,
"loss": 2.5532,
"step": 351
},
{
"epoch": 0.35690747782002535,
"grad_norm": 3.2832837104797363,
"learning_rate": 1.532538930076863e-05,
"loss": 2.753,
"step": 352
},
{
"epoch": 0.3579214195183777,
"grad_norm": 3.019700765609741,
"learning_rate": 1.5296980965473918e-05,
"loss": 2.7967,
"step": 353
},
{
"epoch": 0.35893536121673003,
"grad_norm": 2.9347846508026123,
"learning_rate": 1.5268513084730935e-05,
"loss": 2.7381,
"step": 354
},
{
"epoch": 0.35994930291508237,
"grad_norm": 3.088667392730713,
"learning_rate": 1.5239985978558333e-05,
"loss": 2.5695,
"step": 355
},
{
"epoch": 0.3609632446134347,
"grad_norm": 2.810241460800171,
"learning_rate": 1.521139996764054e-05,
"loss": 2.7083,
"step": 356
},
{
"epoch": 0.36197718631178705,
"grad_norm": 2.9103872776031494,
"learning_rate": 1.5182755373324162e-05,
"loss": 2.7304,
"step": 357
},
{
"epoch": 0.36299112801013944,
"grad_norm": 2.860936403274536,
"learning_rate": 1.5154052517614361e-05,
"loss": 2.7048,
"step": 358
},
{
"epoch": 0.3640050697084918,
"grad_norm": 2.7018895149230957,
"learning_rate": 1.512529172317123e-05,
"loss": 2.7032,
"step": 359
},
{
"epoch": 0.3650190114068441,
"grad_norm": 2.985334873199463,
"learning_rate": 1.509647331330619e-05,
"loss": 2.6939,
"step": 360
},
{
"epoch": 0.36603295310519646,
"grad_norm": 2.89749813079834,
"learning_rate": 1.506759761197833e-05,
"loss": 2.6779,
"step": 361
},
{
"epoch": 0.3670468948035488,
"grad_norm": 3.2000341415405273,
"learning_rate": 1.5038664943790768e-05,
"loss": 2.6007,
"step": 362
},
{
"epoch": 0.36806083650190113,
"grad_norm": 2.913217544555664,
"learning_rate": 1.5009675633987027e-05,
"loss": 2.3883,
"step": 363
},
{
"epoch": 0.3690747782002535,
"grad_norm": 3.0969278812408447,
"learning_rate": 1.4980630008447343e-05,
"loss": 2.7225,
"step": 364
},
{
"epoch": 0.3700887198986058,
"grad_norm": 2.8502883911132812,
"learning_rate": 1.4951528393685033e-05,
"loss": 2.7935,
"step": 365
},
{
"epoch": 0.37110266159695815,
"grad_norm": 3.0395283699035645,
"learning_rate": 1.49223711168428e-05,
"loss": 2.6212,
"step": 366
},
{
"epoch": 0.37211660329531054,
"grad_norm": 2.8926784992218018,
"learning_rate": 1.4893158505689071e-05,
"loss": 2.7873,
"step": 367
},
{
"epoch": 0.3731305449936629,
"grad_norm": 3.2163937091827393,
"learning_rate": 1.4863890888614314e-05,
"loss": 2.7154,
"step": 368
},
{
"epoch": 0.3741444866920152,
"grad_norm": 2.7381045818328857,
"learning_rate": 1.483456859462733e-05,
"loss": 2.6874,
"step": 369
},
{
"epoch": 0.37515842839036756,
"grad_norm": 2.733754873275757,
"learning_rate": 1.480519195335157e-05,
"loss": 2.5197,
"step": 370
},
{
"epoch": 0.3761723700887199,
"grad_norm": 2.765723705291748,
"learning_rate": 1.4775761295021418e-05,
"loss": 2.6769,
"step": 371
},
{
"epoch": 0.37718631178707224,
"grad_norm": 2.723877191543579,
"learning_rate": 1.47462769504785e-05,
"loss": 2.9282,
"step": 372
},
{
"epoch": 0.3782002534854246,
"grad_norm": 3.054982900619507,
"learning_rate": 1.4716739251167931e-05,
"loss": 2.9797,
"step": 373
},
{
"epoch": 0.3792141951837769,
"grad_norm": 2.886594295501709,
"learning_rate": 1.4687148529134621e-05,
"loss": 2.806,
"step": 374
},
{
"epoch": 0.38022813688212925,
"grad_norm": 3.0350804328918457,
"learning_rate": 1.4657505117019523e-05,
"loss": 2.6211,
"step": 375
},
{
"epoch": 0.38124207858048165,
"grad_norm": 2.7764170169830322,
"learning_rate": 1.4627809348055908e-05,
"loss": 2.7099,
"step": 376
},
{
"epoch": 0.382256020278834,
"grad_norm": 2.9754507541656494,
"learning_rate": 1.4598061556065598e-05,
"loss": 2.6209,
"step": 377
},
{
"epoch": 0.3832699619771863,
"grad_norm": 3.0619449615478516,
"learning_rate": 1.4568262075455237e-05,
"loss": 2.6551,
"step": 378
},
{
"epoch": 0.38428390367553866,
"grad_norm": 2.7044217586517334,
"learning_rate": 1.4538411241212518e-05,
"loss": 2.529,
"step": 379
},
{
"epoch": 0.385297845373891,
"grad_norm": 2.4933485984802246,
"learning_rate": 1.4508509388902421e-05,
"loss": 2.6247,
"step": 380
},
{
"epoch": 0.38631178707224334,
"grad_norm": 2.8276021480560303,
"learning_rate": 1.4478556854663435e-05,
"loss": 2.6108,
"step": 381
},
{
"epoch": 0.3873257287705957,
"grad_norm": 2.8554747104644775,
"learning_rate": 1.444855397520379e-05,
"loss": 2.7067,
"step": 382
},
{
"epoch": 0.388339670468948,
"grad_norm": 2.901643753051758,
"learning_rate": 1.4418501087797667e-05,
"loss": 2.5073,
"step": 383
},
{
"epoch": 0.38935361216730036,
"grad_norm": 2.9580893516540527,
"learning_rate": 1.4388398530281403e-05,
"loss": 2.7195,
"step": 384
},
{
"epoch": 0.39036755386565275,
"grad_norm": 2.950740337371826,
"learning_rate": 1.4358246641049696e-05,
"loss": 2.5772,
"step": 385
},
{
"epoch": 0.3913814955640051,
"grad_norm": 2.9490175247192383,
"learning_rate": 1.4328045759051805e-05,
"loss": 2.7675,
"step": 386
},
{
"epoch": 0.3923954372623574,
"grad_norm": 2.7582757472991943,
"learning_rate": 1.4297796223787734e-05,
"loss": 2.8246,
"step": 387
},
{
"epoch": 0.39340937896070977,
"grad_norm": 3.29480242729187,
"learning_rate": 1.4267498375304417e-05,
"loss": 2.5724,
"step": 388
},
{
"epoch": 0.3944233206590621,
"grad_norm": 2.785078287124634,
"learning_rate": 1.4237152554191889e-05,
"loss": 2.7085,
"step": 389
},
{
"epoch": 0.39543726235741444,
"grad_norm": 2.72640323638916,
"learning_rate": 1.4206759101579481e-05,
"loss": 2.8608,
"step": 390
},
{
"epoch": 0.3964512040557668,
"grad_norm": 2.696939706802368,
"learning_rate": 1.4176318359131955e-05,
"loss": 2.7745,
"step": 391
},
{
"epoch": 0.3974651457541191,
"grad_norm": 3.107379674911499,
"learning_rate": 1.414583066904568e-05,
"loss": 2.529,
"step": 392
},
{
"epoch": 0.39847908745247146,
"grad_norm": 2.836038589477539,
"learning_rate": 1.411529637404478e-05,
"loss": 2.5382,
"step": 393
},
{
"epoch": 0.39949302915082385,
"grad_norm": 2.7665674686431885,
"learning_rate": 1.4084715817377292e-05,
"loss": 2.7435,
"step": 394
},
{
"epoch": 0.4005069708491762,
"grad_norm": 3.098053455352783,
"learning_rate": 1.4054089342811286e-05,
"loss": 2.728,
"step": 395
},
{
"epoch": 0.40152091254752853,
"grad_norm": 3.359023332595825,
"learning_rate": 1.4023417294631019e-05,
"loss": 2.4446,
"step": 396
},
{
"epoch": 0.40253485424588087,
"grad_norm": 2.905782461166382,
"learning_rate": 1.3992700017633063e-05,
"loss": 2.5065,
"step": 397
},
{
"epoch": 0.4035487959442332,
"grad_norm": 3.0287387371063232,
"learning_rate": 1.3961937857122418e-05,
"loss": 2.7894,
"step": 398
},
{
"epoch": 0.40456273764258555,
"grad_norm": 2.878178358078003,
"learning_rate": 1.3931131158908644e-05,
"loss": 2.758,
"step": 399
},
{
"epoch": 0.4055766793409379,
"grad_norm": 2.683164596557617,
"learning_rate": 1.3900280269301957e-05,
"loss": 2.7725,
"step": 400
},
{
"epoch": 0.4065906210392902,
"grad_norm": 2.802175521850586,
"learning_rate": 1.3869385535109358e-05,
"loss": 2.8893,
"step": 401
},
{
"epoch": 0.40760456273764256,
"grad_norm": 2.7916531562805176,
"learning_rate": 1.3838447303630713e-05,
"loss": 2.9633,
"step": 402
},
{
"epoch": 0.40861850443599496,
"grad_norm": 3.0303285121917725,
"learning_rate": 1.3807465922654863e-05,
"loss": 2.5742,
"step": 403
},
{
"epoch": 0.4096324461343473,
"grad_norm": 2.9529471397399902,
"learning_rate": 1.3776441740455706e-05,
"loss": 2.6557,
"step": 404
},
{
"epoch": 0.41064638783269963,
"grad_norm": 2.719038486480713,
"learning_rate": 1.374537510578829e-05,
"loss": 2.6693,
"step": 405
},
{
"epoch": 0.41166032953105197,
"grad_norm": 2.768009901046753,
"learning_rate": 1.3714266367884883e-05,
"loss": 2.8518,
"step": 406
},
{
"epoch": 0.4126742712294043,
"grad_norm": 2.946180582046509,
"learning_rate": 1.3683115876451054e-05,
"loss": 2.5067,
"step": 407
},
{
"epoch": 0.41368821292775665,
"grad_norm": 3.0021729469299316,
"learning_rate": 1.3651923981661741e-05,
"loss": 2.8727,
"step": 408
},
{
"epoch": 0.414702154626109,
"grad_norm": 3.17314076423645,
"learning_rate": 1.3620691034157314e-05,
"loss": 2.4562,
"step": 409
},
{
"epoch": 0.4157160963244613,
"grad_norm": 2.908395528793335,
"learning_rate": 1.358941738503963e-05,
"loss": 2.7654,
"step": 410
},
{
"epoch": 0.41673003802281366,
"grad_norm": 3.0189144611358643,
"learning_rate": 1.3558103385868087e-05,
"loss": 2.8481,
"step": 411
},
{
"epoch": 0.41774397972116606,
"grad_norm": 2.8879098892211914,
"learning_rate": 1.352674938865568e-05,
"loss": 2.629,
"step": 412
},
{
"epoch": 0.4187579214195184,
"grad_norm": 2.8246898651123047,
"learning_rate": 1.3495355745865038e-05,
"loss": 2.6649,
"step": 413
},
{
"epoch": 0.41977186311787074,
"grad_norm": 2.962284564971924,
"learning_rate": 1.3463922810404448e-05,
"loss": 2.5616,
"step": 414
},
{
"epoch": 0.4207858048162231,
"grad_norm": 2.845741033554077,
"learning_rate": 1.3432450935623922e-05,
"loss": 2.7799,
"step": 415
},
{
"epoch": 0.4217997465145754,
"grad_norm": 2.6955134868621826,
"learning_rate": 1.3400940475311193e-05,
"loss": 2.7114,
"step": 416
},
{
"epoch": 0.42281368821292775,
"grad_norm": 2.7694082260131836,
"learning_rate": 1.3369391783687742e-05,
"loss": 2.6646,
"step": 417
},
{
"epoch": 0.4238276299112801,
"grad_norm": 3.0028157234191895,
"learning_rate": 1.3337805215404837e-05,
"loss": 2.7271,
"step": 418
},
{
"epoch": 0.42484157160963243,
"grad_norm": 3.1340060234069824,
"learning_rate": 1.3306181125539528e-05,
"loss": 2.7531,
"step": 419
},
{
"epoch": 0.42585551330798477,
"grad_norm": 3.0588529109954834,
"learning_rate": 1.3274519869590656e-05,
"loss": 2.629,
"step": 420
},
{
"epoch": 0.42686945500633716,
"grad_norm": 2.9157087802886963,
"learning_rate": 1.3242821803474861e-05,
"loss": 2.5446,
"step": 421
},
{
"epoch": 0.4278833967046895,
"grad_norm": 3.051769256591797,
"learning_rate": 1.3211087283522586e-05,
"loss": 2.7211,
"step": 422
},
{
"epoch": 0.42889733840304184,
"grad_norm": 2.8915231227874756,
"learning_rate": 1.3179316666474063e-05,
"loss": 2.5781,
"step": 423
},
{
"epoch": 0.4299112801013942,
"grad_norm": 3.18194842338562,
"learning_rate": 1.3147510309475301e-05,
"loss": 2.8725,
"step": 424
},
{
"epoch": 0.4309252217997465,
"grad_norm": 3.0383052825927734,
"learning_rate": 1.3115668570074083e-05,
"loss": 2.6157,
"step": 425
},
{
"epoch": 0.43193916349809885,
"grad_norm": 3.316793203353882,
"learning_rate": 1.308379180621594e-05,
"loss": 2.5316,
"step": 426
},
{
"epoch": 0.4329531051964512,
"grad_norm": 2.822645664215088,
"learning_rate": 1.3051880376240117e-05,
"loss": 2.471,
"step": 427
},
{
"epoch": 0.43396704689480353,
"grad_norm": 2.7731168270111084,
"learning_rate": 1.3019934638875565e-05,
"loss": 2.4718,
"step": 428
},
{
"epoch": 0.43498098859315587,
"grad_norm": 2.7981114387512207,
"learning_rate": 1.298795495323689e-05,
"loss": 2.6722,
"step": 429
},
{
"epoch": 0.43599493029150826,
"grad_norm": 2.8708651065826416,
"learning_rate": 1.2955941678820332e-05,
"loss": 2.8799,
"step": 430
},
{
"epoch": 0.4370088719898606,
"grad_norm": 2.806706666946411,
"learning_rate": 1.292389517549971e-05,
"loss": 2.6245,
"step": 431
},
{
"epoch": 0.43802281368821294,
"grad_norm": 2.9992716312408447,
"learning_rate": 1.2891815803522378e-05,
"loss": 2.6867,
"step": 432
},
{
"epoch": 0.4390367553865653,
"grad_norm": 2.9060275554656982,
"learning_rate": 1.2859703923505194e-05,
"loss": 2.5628,
"step": 433
},
{
"epoch": 0.4400506970849176,
"grad_norm": 2.9179909229278564,
"learning_rate": 1.2827559896430437e-05,
"loss": 2.7091,
"step": 434
},
{
"epoch": 0.44106463878326996,
"grad_norm": 2.7098734378814697,
"learning_rate": 1.279538408364177e-05,
"loss": 2.6439,
"step": 435
},
{
"epoch": 0.4420785804816223,
"grad_norm": 2.9812943935394287,
"learning_rate": 1.276317684684017e-05,
"loss": 2.8656,
"step": 436
},
{
"epoch": 0.44309252217997463,
"grad_norm": 2.730379581451416,
"learning_rate": 1.2730938548079873e-05,
"loss": 2.7245,
"step": 437
},
{
"epoch": 0.444106463878327,
"grad_norm": 2.8220674991607666,
"learning_rate": 1.2698669549764272e-05,
"loss": 2.7681,
"step": 438
},
{
"epoch": 0.44512040557667937,
"grad_norm": 2.7935733795166016,
"learning_rate": 1.266637021464189e-05,
"loss": 2.581,
"step": 439
},
{
"epoch": 0.4461343472750317,
"grad_norm": 2.727238178253174,
"learning_rate": 1.2634040905802267e-05,
"loss": 2.4567,
"step": 440
},
{
"epoch": 0.44714828897338404,
"grad_norm": 3.0209293365478516,
"learning_rate": 1.260168198667189e-05,
"loss": 2.5923,
"step": 441
},
{
"epoch": 0.4481622306717364,
"grad_norm": 3.109890937805176,
"learning_rate": 1.2569293821010109e-05,
"loss": 2.546,
"step": 442
},
{
"epoch": 0.4491761723700887,
"grad_norm": 2.8236355781555176,
"learning_rate": 1.253687677290504e-05,
"loss": 2.7402,
"step": 443
},
{
"epoch": 0.45019011406844106,
"grad_norm": 2.7868268489837646,
"learning_rate": 1.2504431206769487e-05,
"loss": 2.561,
"step": 444
},
{
"epoch": 0.4512040557667934,
"grad_norm": 3.127213954925537,
"learning_rate": 1.247195748733683e-05,
"loss": 2.6075,
"step": 445
},
{
"epoch": 0.45221799746514574,
"grad_norm": 2.916252374649048,
"learning_rate": 1.2439455979656931e-05,
"loss": 2.6565,
"step": 446
},
{
"epoch": 0.4532319391634981,
"grad_norm": 2.737009286880493,
"learning_rate": 1.2406927049092034e-05,
"loss": 2.565,
"step": 447
},
{
"epoch": 0.45424588086185047,
"grad_norm": 3.0931363105773926,
"learning_rate": 1.2374371061312655e-05,
"loss": 2.6302,
"step": 448
},
{
"epoch": 0.4552598225602028,
"grad_norm": 2.862377166748047,
"learning_rate": 1.2341788382293467e-05,
"loss": 2.8832,
"step": 449
},
{
"epoch": 0.45627376425855515,
"grad_norm": 2.9104745388031006,
"learning_rate": 1.2309179378309188e-05,
"loss": 2.4398,
"step": 450
},
{
"epoch": 0.4572877059569075,
"grad_norm": 2.879220485687256,
"learning_rate": 1.2276544415930476e-05,
"loss": 2.733,
"step": 451
},
{
"epoch": 0.4583016476552598,
"grad_norm": 2.7797646522521973,
"learning_rate": 1.2243883862019787e-05,
"loss": 2.7244,
"step": 452
},
{
"epoch": 0.45931558935361216,
"grad_norm": 2.9330852031707764,
"learning_rate": 1.2211198083727262e-05,
"loss": 2.9154,
"step": 453
},
{
"epoch": 0.4603295310519645,
"grad_norm": 3.2787609100341797,
"learning_rate": 1.2178487448486607e-05,
"loss": 2.7193,
"step": 454
},
{
"epoch": 0.46134347275031684,
"grad_norm": 3.0352675914764404,
"learning_rate": 1.2145752324010948e-05,
"loss": 2.4823,
"step": 455
},
{
"epoch": 0.4623574144486692,
"grad_norm": 2.8447976112365723,
"learning_rate": 1.2112993078288702e-05,
"loss": 2.5812,
"step": 456
},
{
"epoch": 0.4633713561470216,
"grad_norm": 2.8668317794799805,
"learning_rate": 1.2080210079579452e-05,
"loss": 2.5691,
"step": 457
},
{
"epoch": 0.4643852978453739,
"grad_norm": 2.8658692836761475,
"learning_rate": 1.2047403696409787e-05,
"loss": 2.4761,
"step": 458
},
{
"epoch": 0.46539923954372625,
"grad_norm": 2.8215885162353516,
"learning_rate": 1.2014574297569182e-05,
"loss": 2.7344,
"step": 459
},
{
"epoch": 0.4664131812420786,
"grad_norm": 3.016486644744873,
"learning_rate": 1.1981722252105827e-05,
"loss": 2.7488,
"step": 460
},
{
"epoch": 0.46742712294043093,
"grad_norm": 2.955415964126587,
"learning_rate": 1.1948847929322498e-05,
"loss": 2.9456,
"step": 461
},
{
"epoch": 0.46844106463878327,
"grad_norm": 2.714385986328125,
"learning_rate": 1.1915951698772403e-05,
"loss": 2.6273,
"step": 462
},
{
"epoch": 0.4694550063371356,
"grad_norm": 2.84726881980896,
"learning_rate": 1.1883033930255018e-05,
"loss": 2.6909,
"step": 463
},
{
"epoch": 0.47046894803548794,
"grad_norm": 2.9898338317871094,
"learning_rate": 1.1850094993811936e-05,
"loss": 2.5776,
"step": 464
},
{
"epoch": 0.4714828897338403,
"grad_norm": 2.896883964538574,
"learning_rate": 1.1817135259722707e-05,
"loss": 2.7808,
"step": 465
},
{
"epoch": 0.4724968314321927,
"grad_norm": 3.1968889236450195,
"learning_rate": 1.1784155098500682e-05,
"loss": 2.3759,
"step": 466
},
{
"epoch": 0.473510773130545,
"grad_norm": 2.7873728275299072,
"learning_rate": 1.1751154880888835e-05,
"loss": 2.4607,
"step": 467
},
{
"epoch": 0.47452471482889735,
"grad_norm": 3.0125274658203125,
"learning_rate": 1.17181349778556e-05,
"loss": 2.6283,
"step": 468
},
{
"epoch": 0.4755386565272497,
"grad_norm": 2.9340808391571045,
"learning_rate": 1.1685095760590706e-05,
"loss": 2.8088,
"step": 469
},
{
"epoch": 0.47655259822560203,
"grad_norm": 2.9202420711517334,
"learning_rate": 1.1652037600501007e-05,
"loss": 2.7714,
"step": 470
},
{
"epoch": 0.47756653992395437,
"grad_norm": 3.133967638015747,
"learning_rate": 1.1618960869206287e-05,
"loss": 2.247,
"step": 471
},
{
"epoch": 0.4785804816223067,
"grad_norm": 2.8348443508148193,
"learning_rate": 1.1585865938535106e-05,
"loss": 2.7938,
"step": 472
},
{
"epoch": 0.47959442332065905,
"grad_norm": 2.7735393047332764,
"learning_rate": 1.1552753180520612e-05,
"loss": 2.8799,
"step": 473
},
{
"epoch": 0.4806083650190114,
"grad_norm": 2.82151198387146,
"learning_rate": 1.1519622967396347e-05,
"loss": 2.4982,
"step": 474
},
{
"epoch": 0.4816223067173637,
"grad_norm": 2.670712471008301,
"learning_rate": 1.1486475671592084e-05,
"loss": 2.5624,
"step": 475
},
{
"epoch": 0.4826362484157161,
"grad_norm": 2.903679370880127,
"learning_rate": 1.1453311665729618e-05,
"loss": 2.6059,
"step": 476
},
{
"epoch": 0.48365019011406846,
"grad_norm": 2.8284971714019775,
"learning_rate": 1.1420131322618601e-05,
"loss": 2.5285,
"step": 477
},
{
"epoch": 0.4846641318124208,
"grad_norm": 3.5497357845306396,
"learning_rate": 1.138693501525233e-05,
"loss": 2.5756,
"step": 478
},
{
"epoch": 0.48567807351077313,
"grad_norm": 3.3905792236328125,
"learning_rate": 1.135372311680356e-05,
"loss": 2.584,
"step": 479
},
{
"epoch": 0.4866920152091255,
"grad_norm": 3.1427557468414307,
"learning_rate": 1.1320496000620325e-05,
"loss": 2.677,
"step": 480
},
{
"epoch": 0.4877059569074778,
"grad_norm": 2.9492239952087402,
"learning_rate": 1.128725404022171e-05,
"loss": 2.7053,
"step": 481
},
{
"epoch": 0.48871989860583015,
"grad_norm": 2.99006986618042,
"learning_rate": 1.1253997609293684e-05,
"loss": 2.7686,
"step": 482
},
{
"epoch": 0.4897338403041825,
"grad_norm": 2.840709924697876,
"learning_rate": 1.122072708168487e-05,
"loss": 2.8237,
"step": 483
},
{
"epoch": 0.4907477820025348,
"grad_norm": 2.6475729942321777,
"learning_rate": 1.1187442831402378e-05,
"loss": 2.7403,
"step": 484
},
{
"epoch": 0.4917617237008872,
"grad_norm": 2.9385244846343994,
"learning_rate": 1.1154145232607558e-05,
"loss": 2.487,
"step": 485
},
{
"epoch": 0.49277566539923956,
"grad_norm": 3.061269760131836,
"learning_rate": 1.1120834659611832e-05,
"loss": 2.3668,
"step": 486
},
{
"epoch": 0.4937896070975919,
"grad_norm": 2.8122615814208984,
"learning_rate": 1.1087511486872461e-05,
"loss": 2.4546,
"step": 487
},
{
"epoch": 0.49480354879594424,
"grad_norm": 2.9320120811462402,
"learning_rate": 1.1054176088988352e-05,
"loss": 2.6993,
"step": 488
},
{
"epoch": 0.4958174904942966,
"grad_norm": 3.0139100551605225,
"learning_rate": 1.1020828840695836e-05,
"loss": 2.7321,
"step": 489
},
{
"epoch": 0.4968314321926489,
"grad_norm": 2.842785120010376,
"learning_rate": 1.0987470116864454e-05,
"loss": 2.7172,
"step": 490
},
{
"epoch": 0.49784537389100125,
"grad_norm": 2.6393532752990723,
"learning_rate": 1.0954100292492758e-05,
"loss": 2.816,
"step": 491
},
{
"epoch": 0.4988593155893536,
"grad_norm": 2.8561477661132812,
"learning_rate": 1.0920719742704071e-05,
"loss": 2.8228,
"step": 492
},
{
"epoch": 0.49987325728770593,
"grad_norm": 2.7492289543151855,
"learning_rate": 1.0887328842742307e-05,
"loss": 2.6292,
"step": 493
},
{
"epoch": 0.5008871989860583,
"grad_norm": 2.8030145168304443,
"learning_rate": 1.0853927967967705e-05,
"loss": 2.7269,
"step": 494
},
{
"epoch": 0.5019011406844106,
"grad_norm": 2.7845194339752197,
"learning_rate": 1.0820517493852655e-05,
"loss": 2.5418,
"step": 495
},
{
"epoch": 0.502915082382763,
"grad_norm": 2.990659475326538,
"learning_rate": 1.0787097795977447e-05,
"loss": 2.8466,
"step": 496
},
{
"epoch": 0.5039290240811153,
"grad_norm": 2.703700304031372,
"learning_rate": 1.0753669250026062e-05,
"loss": 2.7314,
"step": 497
},
{
"epoch": 0.5049429657794677,
"grad_norm": 2.9919204711914062,
"learning_rate": 1.0720232231781944e-05,
"loss": 2.6237,
"step": 498
},
{
"epoch": 0.5059569074778201,
"grad_norm": 2.7852721214294434,
"learning_rate": 1.0686787117123776e-05,
"loss": 2.7113,
"step": 499
},
{
"epoch": 0.5069708491761724,
"grad_norm": 3.1660399436950684,
"learning_rate": 1.0653334282021261e-05,
"loss": 2.5095,
"step": 500
},
{
"epoch": 0.5079847908745247,
"grad_norm": 2.977863073348999,
"learning_rate": 1.0619874102530886e-05,
"loss": 2.6218,
"step": 501
},
{
"epoch": 0.5089987325728771,
"grad_norm": 3.039355993270874,
"learning_rate": 1.0586406954791702e-05,
"loss": 2.5317,
"step": 502
},
{
"epoch": 0.5100126742712294,
"grad_norm": 2.98754620552063,
"learning_rate": 1.0552933215021088e-05,
"loss": 2.6559,
"step": 503
},
{
"epoch": 0.5110266159695818,
"grad_norm": 3.1123383045196533,
"learning_rate": 1.0519453259510535e-05,
"loss": 2.6153,
"step": 504
},
{
"epoch": 0.5120405576679341,
"grad_norm": 2.846510887145996,
"learning_rate": 1.0485967464621401e-05,
"loss": 2.7191,
"step": 505
},
{
"epoch": 0.5130544993662864,
"grad_norm": 2.6948962211608887,
"learning_rate": 1.0452476206780686e-05,
"loss": 2.6106,
"step": 506
},
{
"epoch": 0.5140684410646388,
"grad_norm": 2.7842648029327393,
"learning_rate": 1.041897986247681e-05,
"loss": 2.871,
"step": 507
},
{
"epoch": 0.5150823827629911,
"grad_norm": 2.8970916271209717,
"learning_rate": 1.0385478808255358e-05,
"loss": 2.5389,
"step": 508
},
{
"epoch": 0.5160963244613435,
"grad_norm": 3.109102249145508,
"learning_rate": 1.0351973420714878e-05,
"loss": 2.4805,
"step": 509
},
{
"epoch": 0.5171102661596958,
"grad_norm": 2.8477885723114014,
"learning_rate": 1.031846407650261e-05,
"loss": 2.7004,
"step": 510
},
{
"epoch": 0.5181242078580481,
"grad_norm": 3.3832929134368896,
"learning_rate": 1.0284951152310292e-05,
"loss": 2.409,
"step": 511
},
{
"epoch": 0.5191381495564005,
"grad_norm": 2.941394329071045,
"learning_rate": 1.0251435024869894e-05,
"loss": 2.5179,
"step": 512
},
{
"epoch": 0.5201520912547528,
"grad_norm": 2.7577157020568848,
"learning_rate": 1.0217916070949405e-05,
"loss": 2.6069,
"step": 513
},
{
"epoch": 0.5211660329531052,
"grad_norm": 3.035717487335205,
"learning_rate": 1.0184394667348572e-05,
"loss": 2.2385,
"step": 514
},
{
"epoch": 0.5221799746514575,
"grad_norm": 2.8246779441833496,
"learning_rate": 1.0150871190894693e-05,
"loss": 2.8377,
"step": 515
},
{
"epoch": 0.5231939163498099,
"grad_norm": 2.863114356994629,
"learning_rate": 1.0117346018438367e-05,
"loss": 2.8131,
"step": 516
},
{
"epoch": 0.5242078580481623,
"grad_norm": 3.119429588317871,
"learning_rate": 1.008381952684925e-05,
"loss": 2.5764,
"step": 517
},
{
"epoch": 0.5252217997465146,
"grad_norm": 2.976614236831665,
"learning_rate": 1.0050292093011835e-05,
"loss": 2.4699,
"step": 518
},
{
"epoch": 0.526235741444867,
"grad_norm": 2.798595666885376,
"learning_rate": 1.0016764093821203e-05,
"loss": 2.6019,
"step": 519
},
{
"epoch": 0.5272496831432193,
"grad_norm": 2.6253113746643066,
"learning_rate": 9.983235906178798e-06,
"loss": 2.6515,
"step": 520
},
{
"epoch": 0.5282636248415716,
"grad_norm": 2.9238436222076416,
"learning_rate": 9.949707906988165e-06,
"loss": 2.6297,
"step": 521
},
{
"epoch": 0.529277566539924,
"grad_norm": 2.8237082958221436,
"learning_rate": 9.916180473150753e-06,
"loss": 2.7979,
"step": 522
},
{
"epoch": 0.5302915082382763,
"grad_norm": 3.0830626487731934,
"learning_rate": 9.882653981561638e-06,
"loss": 2.5663,
"step": 523
},
{
"epoch": 0.5313054499366286,
"grad_norm": 2.914485216140747,
"learning_rate": 9.849128809105309e-06,
"loss": 2.6301,
"step": 524
},
{
"epoch": 0.532319391634981,
"grad_norm": 3.0977110862731934,
"learning_rate": 9.815605332651433e-06,
"loss": 2.5711,
"step": 525
},
{
"epoch": 0.5333333333333333,
"grad_norm": 3.0849609375,
"learning_rate": 9.782083929050601e-06,
"loss": 2.6879,
"step": 526
},
{
"epoch": 0.5343472750316857,
"grad_norm": 3.0231521129608154,
"learning_rate": 9.748564975130106e-06,
"loss": 2.7279,
"step": 527
},
{
"epoch": 0.535361216730038,
"grad_norm": 2.895653247833252,
"learning_rate": 9.71504884768971e-06,
"loss": 2.436,
"step": 528
},
{
"epoch": 0.5363751584283903,
"grad_norm": 3.167012929916382,
"learning_rate": 9.681535923497394e-06,
"loss": 2.4616,
"step": 529
},
{
"epoch": 0.5373891001267427,
"grad_norm": 3.1906981468200684,
"learning_rate": 9.648026579285125e-06,
"loss": 2.7383,
"step": 530
},
{
"epoch": 0.538403041825095,
"grad_norm": 3.151561975479126,
"learning_rate": 9.614521191744644e-06,
"loss": 2.7495,
"step": 531
},
{
"epoch": 0.5394169835234474,
"grad_norm": 2.9482996463775635,
"learning_rate": 9.581020137523192e-06,
"loss": 2.6012,
"step": 532
},
{
"epoch": 0.5404309252217997,
"grad_norm": 2.8822813034057617,
"learning_rate": 9.547523793219315e-06,
"loss": 2.5973,
"step": 533
},
{
"epoch": 0.5414448669201521,
"grad_norm": 3.1009838581085205,
"learning_rate": 9.514032535378604e-06,
"loss": 2.6613,
"step": 534
},
{
"epoch": 0.5424588086185045,
"grad_norm": 3.2803237438201904,
"learning_rate": 9.480546740489468e-06,
"loss": 2.5809,
"step": 535
},
{
"epoch": 0.5434727503168568,
"grad_norm": 2.8916573524475098,
"learning_rate": 9.447066784978914e-06,
"loss": 2.7318,
"step": 536
},
{
"epoch": 0.5444866920152092,
"grad_norm": 2.8549203872680664,
"learning_rate": 9.413593045208303e-06,
"loss": 2.6984,
"step": 537
},
{
"epoch": 0.5455006337135615,
"grad_norm": 2.9765243530273438,
"learning_rate": 9.380125897469116e-06,
"loss": 2.6687,
"step": 538
},
{
"epoch": 0.5465145754119138,
"grad_norm": 3.034773588180542,
"learning_rate": 9.346665717978742e-06,
"loss": 2.4404,
"step": 539
},
{
"epoch": 0.5475285171102662,
"grad_norm": 3.125175714492798,
"learning_rate": 9.313212882876228e-06,
"loss": 2.3855,
"step": 540
},
{
"epoch": 0.5485424588086185,
"grad_norm": 2.9196765422821045,
"learning_rate": 9.279767768218058e-06,
"loss": 2.7998,
"step": 541
},
{
"epoch": 0.5495564005069709,
"grad_norm": 2.6196236610412598,
"learning_rate": 9.246330749973943e-06,
"loss": 2.7261,
"step": 542
},
{
"epoch": 0.5505703422053232,
"grad_norm": 2.931138038635254,
"learning_rate": 9.212902204022556e-06,
"loss": 2.7242,
"step": 543
},
{
"epoch": 0.5515842839036755,
"grad_norm": 3.1061699390411377,
"learning_rate": 9.179482506147346e-06,
"loss": 2.5819,
"step": 544
},
{
"epoch": 0.5525982256020279,
"grad_norm": 2.7744109630584717,
"learning_rate": 9.146072032032298e-06,
"loss": 2.6329,
"step": 545
},
{
"epoch": 0.5536121673003802,
"grad_norm": 3.1657395362854004,
"learning_rate": 9.112671157257698e-06,
"loss": 2.8361,
"step": 546
},
{
"epoch": 0.5546261089987325,
"grad_norm": 2.9443604946136475,
"learning_rate": 9.07928025729593e-06,
"loss": 2.5906,
"step": 547
},
{
"epoch": 0.5556400506970849,
"grad_norm": 2.9069535732269287,
"learning_rate": 9.045899707507247e-06,
"loss": 2.5274,
"step": 548
},
{
"epoch": 0.5566539923954372,
"grad_norm": 3.0094900131225586,
"learning_rate": 9.012529883135548e-06,
"loss": 2.6779,
"step": 549
},
{
"epoch": 0.5576679340937896,
"grad_norm": 2.8510031700134277,
"learning_rate": 8.979171159304166e-06,
"loss": 2.6337,
"step": 550
},
{
"epoch": 0.5586818757921419,
"grad_norm": 2.801830768585205,
"learning_rate": 8.94582391101165e-06,
"loss": 2.6673,
"step": 551
},
{
"epoch": 0.5596958174904944,
"grad_norm": 2.9249250888824463,
"learning_rate": 8.912488513127539e-06,
"loss": 2.5263,
"step": 552
},
{
"epoch": 0.5607097591888467,
"grad_norm": 2.9476025104522705,
"learning_rate": 8.879165340388171e-06,
"loss": 2.517,
"step": 553
},
{
"epoch": 0.561723700887199,
"grad_norm": 3.0242457389831543,
"learning_rate": 8.845854767392448e-06,
"loss": 2.657,
"step": 554
},
{
"epoch": 0.5627376425855514,
"grad_norm": 2.858503580093384,
"learning_rate": 8.812557168597626e-06,
"loss": 2.684,
"step": 555
},
{
"epoch": 0.5637515842839037,
"grad_norm": 2.6426961421966553,
"learning_rate": 8.779272918315135e-06,
"loss": 2.6881,
"step": 556
},
{
"epoch": 0.564765525982256,
"grad_norm": 2.9629528522491455,
"learning_rate": 8.746002390706318e-06,
"loss": 2.5888,
"step": 557
},
{
"epoch": 0.5657794676806084,
"grad_norm": 2.738772392272949,
"learning_rate": 8.712745959778293e-06,
"loss": 2.6913,
"step": 558
},
{
"epoch": 0.5667934093789607,
"grad_norm": 2.958136796951294,
"learning_rate": 8.679503999379679e-06,
"loss": 2.6486,
"step": 559
},
{
"epoch": 0.5678073510773131,
"grad_norm": 2.908803701400757,
"learning_rate": 8.646276883196438e-06,
"loss": 2.7243,
"step": 560
},
{
"epoch": 0.5688212927756654,
"grad_norm": 2.9704983234405518,
"learning_rate": 8.613064984747672e-06,
"loss": 2.689,
"step": 561
},
{
"epoch": 0.5698352344740177,
"grad_norm": 2.7091917991638184,
"learning_rate": 8.5798686773814e-06,
"loss": 2.6094,
"step": 562
},
{
"epoch": 0.5708491761723701,
"grad_norm": 2.8450615406036377,
"learning_rate": 8.546688334270381e-06,
"loss": 2.5753,
"step": 563
},
{
"epoch": 0.5718631178707224,
"grad_norm": 2.8870737552642822,
"learning_rate": 8.51352432840792e-06,
"loss": 2.76,
"step": 564
},
{
"epoch": 0.5728770595690748,
"grad_norm": 2.8496322631835938,
"learning_rate": 8.480377032603658e-06,
"loss": 2.4644,
"step": 565
},
{
"epoch": 0.5738910012674271,
"grad_norm": 2.944342851638794,
"learning_rate": 8.44724681947939e-06,
"loss": 2.5489,
"step": 566
},
{
"epoch": 0.5749049429657794,
"grad_norm": 2.785560369491577,
"learning_rate": 8.414134061464898e-06,
"loss": 2.7094,
"step": 567
},
{
"epoch": 0.5759188846641318,
"grad_norm": 2.931671619415283,
"learning_rate": 8.381039130793718e-06,
"loss": 2.527,
"step": 568
},
{
"epoch": 0.5769328263624841,
"grad_norm": 2.7206380367279053,
"learning_rate": 8.347962399498996e-06,
"loss": 2.7661,
"step": 569
},
{
"epoch": 0.5779467680608364,
"grad_norm": 3.110283613204956,
"learning_rate": 8.314904239409295e-06,
"loss": 2.6634,
"step": 570
},
{
"epoch": 0.5789607097591889,
"grad_norm": 2.9628753662109375,
"learning_rate": 8.281865022144403e-06,
"loss": 2.5929,
"step": 571
},
{
"epoch": 0.5799746514575412,
"grad_norm": 2.844921112060547,
"learning_rate": 8.248845119111168e-06,
"loss": 2.5214,
"step": 572
},
{
"epoch": 0.5809885931558936,
"grad_norm": 2.7899045944213867,
"learning_rate": 8.21584490149932e-06,
"loss": 2.6966,
"step": 573
},
{
"epoch": 0.5820025348542459,
"grad_norm": 2.985085964202881,
"learning_rate": 8.182864740277293e-06,
"loss": 2.5197,
"step": 574
},
{
"epoch": 0.5830164765525983,
"grad_norm": 2.994450092315674,
"learning_rate": 8.149905006188067e-06,
"loss": 2.5945,
"step": 575
},
{
"epoch": 0.5840304182509506,
"grad_norm": 3.165738582611084,
"learning_rate": 8.116966069744987e-06,
"loss": 2.4739,
"step": 576
},
{
"epoch": 0.5850443599493029,
"grad_norm": 2.733795404434204,
"learning_rate": 8.084048301227597e-06,
"loss": 2.5658,
"step": 577
},
{
"epoch": 0.5860583016476553,
"grad_norm": 3.101283311843872,
"learning_rate": 8.051152070677504e-06,
"loss": 2.5252,
"step": 578
},
{
"epoch": 0.5870722433460076,
"grad_norm": 3.125256299972534,
"learning_rate": 8.018277747894178e-06,
"loss": 2.6929,
"step": 579
},
{
"epoch": 0.5880861850443599,
"grad_norm": 2.952458620071411,
"learning_rate": 7.985425702430821e-06,
"loss": 2.7738,
"step": 580
},
{
"epoch": 0.5891001267427123,
"grad_norm": 2.94323468208313,
"learning_rate": 7.952596303590215e-06,
"loss": 2.5122,
"step": 581
},
{
"epoch": 0.5901140684410646,
"grad_norm": 3.326063394546509,
"learning_rate": 7.91978992042055e-06,
"loss": 2.492,
"step": 582
},
{
"epoch": 0.591128010139417,
"grad_norm": 2.7418370246887207,
"learning_rate": 7.887006921711301e-06,
"loss": 2.5467,
"step": 583
},
{
"epoch": 0.5921419518377693,
"grad_norm": 2.820239543914795,
"learning_rate": 7.854247675989057e-06,
"loss": 2.5913,
"step": 584
},
{
"epoch": 0.5931558935361216,
"grad_norm": 3.0596511363983154,
"learning_rate": 7.821512551513395e-06,
"loss": 2.3895,
"step": 585
},
{
"epoch": 0.594169835234474,
"grad_norm": 3.111802577972412,
"learning_rate": 7.788801916272739e-06,
"loss": 2.3851,
"step": 586
},
{
"epoch": 0.5951837769328263,
"grad_norm": 2.8186888694763184,
"learning_rate": 7.75611613798022e-06,
"loss": 2.4647,
"step": 587
},
{
"epoch": 0.5961977186311787,
"grad_norm": 2.870859146118164,
"learning_rate": 7.723455584069524e-06,
"loss": 2.8919,
"step": 588
},
{
"epoch": 0.5972116603295311,
"grad_norm": 2.9670660495758057,
"learning_rate": 7.690820621690815e-06,
"loss": 2.3913,
"step": 589
},
{
"epoch": 0.5982256020278834,
"grad_norm": 2.862382650375366,
"learning_rate": 7.65821161770654e-06,
"loss": 2.5883,
"step": 590
},
{
"epoch": 0.5992395437262358,
"grad_norm": 2.7800912857055664,
"learning_rate": 7.625628938687349e-06,
"loss": 2.4849,
"step": 591
},
{
"epoch": 0.6002534854245881,
"grad_norm": 3.122760534286499,
"learning_rate": 7.593072950907969e-06,
"loss": 2.5025,
"step": 592
},
{
"epoch": 0.6012674271229405,
"grad_norm": 2.840243339538574,
"learning_rate": 7.560544020343071e-06,
"loss": 2.7803,
"step": 593
},
{
"epoch": 0.6022813688212928,
"grad_norm": 2.8622286319732666,
"learning_rate": 7.528042512663174e-06,
"loss": 2.7921,
"step": 594
},
{
"epoch": 0.6032953105196451,
"grad_norm": 3.1672146320343018,
"learning_rate": 7.495568793230516e-06,
"loss": 2.3436,
"step": 595
},
{
"epoch": 0.6043092522179975,
"grad_norm": 2.5706326961517334,
"learning_rate": 7.463123227094962e-06,
"loss": 2.6355,
"step": 596
},
{
"epoch": 0.6053231939163498,
"grad_norm": 5.168808937072754,
"learning_rate": 7.430706178989895e-06,
"loss": 2.5555,
"step": 597
},
{
"epoch": 0.6063371356147021,
"grad_norm": 2.9276273250579834,
"learning_rate": 7.398318013328112e-06,
"loss": 2.4021,
"step": 598
},
{
"epoch": 0.6073510773130545,
"grad_norm": 2.889470338821411,
"learning_rate": 7.365959094197734e-06,
"loss": 2.5384,
"step": 599
},
{
"epoch": 0.6083650190114068,
"grad_norm": 3.013488292694092,
"learning_rate": 7.3336297853581115e-06,
"loss": 2.4627,
"step": 600
},
{
"epoch": 0.6093789607097592,
"grad_norm": 3.0487990379333496,
"learning_rate": 7.301330450235733e-06,
"loss": 2.5182,
"step": 601
},
{
"epoch": 0.6103929024081115,
"grad_norm": 3.199193000793457,
"learning_rate": 7.2690614519201315e-06,
"loss": 2.7172,
"step": 602
},
{
"epoch": 0.6114068441064638,
"grad_norm": 2.6934726238250732,
"learning_rate": 7.236823153159832e-06,
"loss": 2.446,
"step": 603
},
{
"epoch": 0.6124207858048162,
"grad_norm": 3.0447793006896973,
"learning_rate": 7.204615916358234e-06,
"loss": 2.5658,
"step": 604
},
{
"epoch": 0.6134347275031685,
"grad_norm": 2.838165521621704,
"learning_rate": 7.172440103569566e-06,
"loss": 2.9585,
"step": 605
},
{
"epoch": 0.6144486692015209,
"grad_norm": 2.9292893409729004,
"learning_rate": 7.140296076494809e-06,
"loss": 2.884,
"step": 606
},
{
"epoch": 0.6154626108998733,
"grad_norm": 2.599578857421875,
"learning_rate": 7.108184196477622e-06,
"loss": 2.5872,
"step": 607
},
{
"epoch": 0.6164765525982256,
"grad_norm": 2.8179094791412354,
"learning_rate": 7.076104824500294e-06,
"loss": 2.6202,
"step": 608
},
{
"epoch": 0.617490494296578,
"grad_norm": 3.000666379928589,
"learning_rate": 7.044058321179671e-06,
"loss": 2.7415,
"step": 609
},
{
"epoch": 0.6185044359949303,
"grad_norm": 3.1160929203033447,
"learning_rate": 7.012045046763111e-06,
"loss": 2.3608,
"step": 610
},
{
"epoch": 0.6195183776932827,
"grad_norm": 2.8095080852508545,
"learning_rate": 6.980065361124437e-06,
"loss": 2.4156,
"step": 611
},
{
"epoch": 0.620532319391635,
"grad_norm": 3.549515962600708,
"learning_rate": 6.948119623759888e-06,
"loss": 2.7265,
"step": 612
},
{
"epoch": 0.6215462610899873,
"grad_norm": 2.868363618850708,
"learning_rate": 6.916208193784062e-06,
"loss": 2.7277,
"step": 613
},
{
"epoch": 0.6225602027883397,
"grad_norm": 2.762838125228882,
"learning_rate": 6.884331429925919e-06,
"loss": 2.8547,
"step": 614
},
{
"epoch": 0.623574144486692,
"grad_norm": 2.8724491596221924,
"learning_rate": 6.852489690524703e-06,
"loss": 2.7768,
"step": 615
},
{
"epoch": 0.6245880861850444,
"grad_norm": 2.692626476287842,
"learning_rate": 6.820683333525942e-06,
"loss": 2.7537,
"step": 616
},
{
"epoch": 0.6256020278833967,
"grad_norm": 3.1846024990081787,
"learning_rate": 6.788912716477417e-06,
"loss": 2.432,
"step": 617
},
{
"epoch": 0.626615969581749,
"grad_norm": 3.2031455039978027,
"learning_rate": 6.7571781965251405e-06,
"loss": 2.5432,
"step": 618
},
{
"epoch": 0.6276299112801014,
"grad_norm": 2.8481509685516357,
"learning_rate": 6.725480130409347e-06,
"loss": 2.3483,
"step": 619
},
{
"epoch": 0.6286438529784537,
"grad_norm": 2.80601167678833,
"learning_rate": 6.693818874460475e-06,
"loss": 2.6803,
"step": 620
},
{
"epoch": 0.629657794676806,
"grad_norm": 2.8626856803894043,
"learning_rate": 6.662194784595164e-06,
"loss": 2.9226,
"step": 621
},
{
"epoch": 0.6306717363751584,
"grad_norm": 2.815453290939331,
"learning_rate": 6.63060821631226e-06,
"loss": 2.4712,
"step": 622
},
{
"epoch": 0.6316856780735107,
"grad_norm": 3.127218723297119,
"learning_rate": 6.599059524688813e-06,
"loss": 2.4857,
"step": 623
},
{
"epoch": 0.6326996197718631,
"grad_norm": 2.635438919067383,
"learning_rate": 6.567549064376078e-06,
"loss": 2.5239,
"step": 624
},
{
"epoch": 0.6337135614702155,
"grad_norm": 2.744525909423828,
"learning_rate": 6.536077189595554e-06,
"loss": 2.5372,
"step": 625
},
{
"epoch": 0.6347275031685679,
"grad_norm": 2.62393856048584,
"learning_rate": 6.504644254134969e-06,
"loss": 2.5232,
"step": 626
},
{
"epoch": 0.6357414448669202,
"grad_norm": 2.9516711235046387,
"learning_rate": 6.4732506113443215e-06,
"loss": 2.6682,
"step": 627
},
{
"epoch": 0.6367553865652725,
"grad_norm": 2.90000319480896,
"learning_rate": 6.441896614131918e-06,
"loss": 2.6046,
"step": 628
},
{
"epoch": 0.6377693282636249,
"grad_norm": 3.0262489318847656,
"learning_rate": 6.410582614960375e-06,
"loss": 2.861,
"step": 629
},
{
"epoch": 0.6387832699619772,
"grad_norm": 2.911041498184204,
"learning_rate": 6.379308965842689e-06,
"loss": 2.6306,
"step": 630
},
{
"epoch": 0.6397972116603295,
"grad_norm": 2.819647789001465,
"learning_rate": 6.34807601833826e-06,
"loss": 2.4897,
"step": 631
},
{
"epoch": 0.6408111533586819,
"grad_norm": 2.881241798400879,
"learning_rate": 6.316884123548947e-06,
"loss": 2.6941,
"step": 632
},
{
"epoch": 0.6418250950570342,
"grad_norm": 2.7107315063476562,
"learning_rate": 6.285733632115118e-06,
"loss": 2.8886,
"step": 633
},
{
"epoch": 0.6428390367553866,
"grad_norm": 2.76619553565979,
"learning_rate": 6.2546248942117134e-06,
"loss": 2.6311,
"step": 634
},
{
"epoch": 0.6438529784537389,
"grad_norm": 3.2412569522857666,
"learning_rate": 6.2235582595442935e-06,
"loss": 2.4515,
"step": 635
},
{
"epoch": 0.6448669201520912,
"grad_norm": 2.7237205505371094,
"learning_rate": 6.19253407734514e-06,
"loss": 2.6107,
"step": 636
},
{
"epoch": 0.6458808618504436,
"grad_norm": 2.886448383331299,
"learning_rate": 6.161552696369291e-06,
"loss": 2.56,
"step": 637
},
{
"epoch": 0.6468948035487959,
"grad_norm": 2.9349300861358643,
"learning_rate": 6.130614464890645e-06,
"loss": 2.6451,
"step": 638
},
{
"epoch": 0.6479087452471483,
"grad_norm": 2.790083646774292,
"learning_rate": 6.099719730698046e-06,
"loss": 2.6844,
"step": 639
},
{
"epoch": 0.6489226869455006,
"grad_norm": 2.8875372409820557,
"learning_rate": 6.068868841091361e-06,
"loss": 2.5449,
"step": 640
},
{
"epoch": 0.6499366286438529,
"grad_norm": 2.927098512649536,
"learning_rate": 6.038062142877583e-06,
"loss": 2.4283,
"step": 641
},
{
"epoch": 0.6509505703422053,
"grad_norm": 2.8668813705444336,
"learning_rate": 6.00729998236694e-06,
"loss": 2.4133,
"step": 642
},
{
"epoch": 0.6519645120405577,
"grad_norm": 2.6912641525268555,
"learning_rate": 5.976582705368982e-06,
"loss": 2.6628,
"step": 643
},
{
"epoch": 0.6529784537389101,
"grad_norm": 2.862189292907715,
"learning_rate": 5.945910657188717e-06,
"loss": 2.6867,
"step": 644
},
{
"epoch": 0.6539923954372624,
"grad_norm": 2.767120122909546,
"learning_rate": 5.9152841826227136e-06,
"loss": 2.6689,
"step": 645
},
{
"epoch": 0.6550063371356147,
"grad_norm": 2.7868754863739014,
"learning_rate": 5.884703625955219e-06,
"loss": 2.6241,
"step": 646
},
{
"epoch": 0.6560202788339671,
"grad_norm": 2.879535436630249,
"learning_rate": 5.854169330954324e-06,
"loss": 2.5689,
"step": 647
},
{
"epoch": 0.6570342205323194,
"grad_norm": 2.824422836303711,
"learning_rate": 5.823681640868049e-06,
"loss": 2.4534,
"step": 648
},
{
"epoch": 0.6580481622306718,
"grad_norm": 2.8804638385772705,
"learning_rate": 5.793240898420521e-06,
"loss": 2.4938,
"step": 649
},
{
"epoch": 0.6590621039290241,
"grad_norm": 2.7216782569885254,
"learning_rate": 5.762847445808111e-06,
"loss": 2.5811,
"step": 650
},
{
"epoch": 0.6600760456273764,
"grad_norm": 2.975800037384033,
"learning_rate": 5.73250162469559e-06,
"loss": 2.4781,
"step": 651
},
{
"epoch": 0.6610899873257288,
"grad_norm": 2.927669048309326,
"learning_rate": 5.702203776212269e-06,
"loss": 2.6511,
"step": 652
},
{
"epoch": 0.6621039290240811,
"grad_norm": 2.6778831481933594,
"learning_rate": 5.6719542409482e-06,
"loss": 2.5375,
"step": 653
},
{
"epoch": 0.6631178707224334,
"grad_norm": 2.6449270248413086,
"learning_rate": 5.6417533589503036e-06,
"loss": 2.7249,
"step": 654
},
{
"epoch": 0.6641318124207858,
"grad_norm": 2.714900016784668,
"learning_rate": 5.611601469718601e-06,
"loss": 2.7848,
"step": 655
},
{
"epoch": 0.6651457541191381,
"grad_norm": 3.157137393951416,
"learning_rate": 5.5814989122023385e-06,
"loss": 2.2345,
"step": 656
},
{
"epoch": 0.6661596958174905,
"grad_norm": 2.847174882888794,
"learning_rate": 5.551446024796214e-06,
"loss": 2.6439,
"step": 657
},
{
"epoch": 0.6671736375158428,
"grad_norm": 2.6003525257110596,
"learning_rate": 5.521443145336568e-06,
"loss": 2.6906,
"step": 658
},
{
"epoch": 0.6681875792141951,
"grad_norm": 2.806028366088867,
"learning_rate": 5.491490611097586e-06,
"loss": 2.8077,
"step": 659
},
{
"epoch": 0.6692015209125475,
"grad_norm": 2.8221116065979004,
"learning_rate": 5.461588758787484e-06,
"loss": 2.6715,
"step": 660
},
{
"epoch": 0.6702154626108999,
"grad_norm": 2.9865975379943848,
"learning_rate": 5.431737924544763e-06,
"loss": 2.7053,
"step": 661
},
{
"epoch": 0.6712294043092523,
"grad_norm": 2.7677266597747803,
"learning_rate": 5.401938443934405e-06,
"loss": 2.6684,
"step": 662
},
{
"epoch": 0.6722433460076046,
"grad_norm": 3.146745443344116,
"learning_rate": 5.3721906519440945e-06,
"loss": 2.2221,
"step": 663
},
{
"epoch": 0.6732572877059569,
"grad_norm": 2.9099998474121094,
"learning_rate": 5.34249488298048e-06,
"loss": 2.5466,
"step": 664
},
{
"epoch": 0.6742712294043093,
"grad_norm": 3.0901663303375244,
"learning_rate": 5.312851470865383e-06,
"loss": 2.367,
"step": 665
},
{
"epoch": 0.6752851711026616,
"grad_norm": 2.6712629795074463,
"learning_rate": 5.283260748832072e-06,
"loss": 2.6655,
"step": 666
},
{
"epoch": 0.676299112801014,
"grad_norm": 2.7886345386505127,
"learning_rate": 5.253723049521507e-06,
"loss": 2.7724,
"step": 667
},
{
"epoch": 0.6773130544993663,
"grad_norm": 2.9444360733032227,
"learning_rate": 5.224238704978584e-06,
"loss": 2.5407,
"step": 668
},
{
"epoch": 0.6783269961977186,
"grad_norm": 2.850425958633423,
"learning_rate": 5.194808046648434e-06,
"loss": 2.8026,
"step": 669
},
{
"epoch": 0.679340937896071,
"grad_norm": 3.44856333732605,
"learning_rate": 5.165431405372674e-06,
"loss": 2.7905,
"step": 670
},
{
"epoch": 0.6803548795944233,
"grad_norm": 2.809282064437866,
"learning_rate": 5.1361091113856875e-06,
"loss": 2.6393,
"step": 671
},
{
"epoch": 0.6813688212927757,
"grad_norm": 2.956043004989624,
"learning_rate": 5.106841494310929e-06,
"loss": 2.3758,
"step": 672
},
{
"epoch": 0.682382762991128,
"grad_norm": 2.8450045585632324,
"learning_rate": 5.077628883157205e-06,
"loss": 2.666,
"step": 673
},
{
"epoch": 0.6833967046894803,
"grad_norm": 2.9813477993011475,
"learning_rate": 5.048471606314971e-06,
"loss": 2.3843,
"step": 674
},
{
"epoch": 0.6844106463878327,
"grad_norm": 2.9916036128997803,
"learning_rate": 5.019369991552658e-06,
"loss": 2.4761,
"step": 675
},
{
"epoch": 0.685424588086185,
"grad_norm": 2.921954393386841,
"learning_rate": 4.990324366012977e-06,
"loss": 2.4414,
"step": 676
},
{
"epoch": 0.6864385297845373,
"grad_norm": 3.0995535850524902,
"learning_rate": 4.961335056209234e-06,
"loss": 2.6297,
"step": 677
},
{
"epoch": 0.6874524714828897,
"grad_norm": 2.7778313159942627,
"learning_rate": 4.932402388021677e-06,
"loss": 2.6954,
"step": 678
},
{
"epoch": 0.6884664131812421,
"grad_norm": 3.039264678955078,
"learning_rate": 4.9035266866938125e-06,
"loss": 2.731,
"step": 679
},
{
"epoch": 0.6894803548795945,
"grad_norm": 2.6352975368499756,
"learning_rate": 4.87470827682877e-06,
"loss": 2.7248,
"step": 680
},
{
"epoch": 0.6904942965779468,
"grad_norm": 3.0940101146698,
"learning_rate": 4.8459474823856445e-06,
"loss": 2.4792,
"step": 681
},
{
"epoch": 0.6915082382762991,
"grad_norm": 2.724853038787842,
"learning_rate": 4.81724462667584e-06,
"loss": 2.5454,
"step": 682
},
{
"epoch": 0.6925221799746515,
"grad_norm": 2.9925169944763184,
"learning_rate": 4.788600032359461e-06,
"loss": 2.5472,
"step": 683
},
{
"epoch": 0.6935361216730038,
"grad_norm": 2.9303464889526367,
"learning_rate": 4.760014021441671e-06,
"loss": 2.731,
"step": 684
},
{
"epoch": 0.6945500633713562,
"grad_norm": 2.8822903633117676,
"learning_rate": 4.731486915269066e-06,
"loss": 2.7233,
"step": 685
},
{
"epoch": 0.6955640050697085,
"grad_norm": 2.7755796909332275,
"learning_rate": 4.703019034526082e-06,
"loss": 2.7116,
"step": 686
},
{
"epoch": 0.6965779467680608,
"grad_norm": 2.6300008296966553,
"learning_rate": 4.674610699231373e-06,
"loss": 2.5184,
"step": 687
},
{
"epoch": 0.6975918884664132,
"grad_norm": 2.8265411853790283,
"learning_rate": 4.6462622287342154e-06,
"loss": 2.5711,
"step": 688
},
{
"epoch": 0.6986058301647655,
"grad_norm": 2.8336949348449707,
"learning_rate": 4.617973941710932e-06,
"loss": 2.66,
"step": 689
},
{
"epoch": 0.6996197718631179,
"grad_norm": 2.909071683883667,
"learning_rate": 4.5897461561612814e-06,
"loss": 2.8292,
"step": 690
},
{
"epoch": 0.7006337135614702,
"grad_norm": 2.6716184616088867,
"learning_rate": 4.561579189404929e-06,
"loss": 2.7653,
"step": 691
},
{
"epoch": 0.7016476552598225,
"grad_norm": 3.014017105102539,
"learning_rate": 4.5334733580778305e-06,
"loss": 2.6507,
"step": 692
},
{
"epoch": 0.7026615969581749,
"grad_norm": 2.738978624343872,
"learning_rate": 4.505428978128693e-06,
"loss": 2.5285,
"step": 693
},
{
"epoch": 0.7036755386565272,
"grad_norm": 2.7485463619232178,
"learning_rate": 4.4774463648154375e-06,
"loss": 2.854,
"step": 694
},
{
"epoch": 0.7046894803548795,
"grad_norm": 2.6583411693573,
"learning_rate": 4.4495258327016415e-06,
"loss": 2.6731,
"step": 695
},
{
"epoch": 0.7057034220532319,
"grad_norm": 2.986009359359741,
"learning_rate": 4.421667695652987e-06,
"loss": 2.6313,
"step": 696
},
{
"epoch": 0.7067173637515843,
"grad_norm": 2.9571311473846436,
"learning_rate": 4.393872266833764e-06,
"loss": 2.4516,
"step": 697
},
{
"epoch": 0.7077313054499367,
"grad_norm": 2.93039870262146,
"learning_rate": 4.3661398587033355e-06,
"loss": 2.5464,
"step": 698
},
{
"epoch": 0.708745247148289,
"grad_norm": 3.0327370166778564,
"learning_rate": 4.338470783012609e-06,
"loss": 2.3182,
"step": 699
},
{
"epoch": 0.7097591888466414,
"grad_norm": 2.8209986686706543,
"learning_rate": 4.310865350800566e-06,
"loss": 2.6407,
"step": 700
},
{
"epoch": 0.7107731305449937,
"grad_norm": 2.630613327026367,
"learning_rate": 4.283323872390728e-06,
"loss": 2.3327,
"step": 701
},
{
"epoch": 0.711787072243346,
"grad_norm": 3.7512900829315186,
"learning_rate": 4.255846657387701e-06,
"loss": 2.5038,
"step": 702
},
{
"epoch": 0.7128010139416984,
"grad_norm": 2.8261454105377197,
"learning_rate": 4.228434014673679e-06,
"loss": 2.5155,
"step": 703
},
{
"epoch": 0.7138149556400507,
"grad_norm": 2.7256920337677,
"learning_rate": 4.201086252404962e-06,
"loss": 2.7317,
"step": 704
},
{
"epoch": 0.714828897338403,
"grad_norm": 2.859297513961792,
"learning_rate": 4.1738036780085175e-06,
"loss": 2.4831,
"step": 705
},
{
"epoch": 0.7158428390367554,
"grad_norm": 3.3157968521118164,
"learning_rate": 4.1465865981785055e-06,
"loss": 2.8,
"step": 706
},
{
"epoch": 0.7168567807351077,
"grad_norm": 3.097968816757202,
"learning_rate": 4.11943531887283e-06,
"loss": 2.7645,
"step": 707
},
{
"epoch": 0.7178707224334601,
"grad_norm": 2.6948904991149902,
"learning_rate": 4.0923501453097115e-06,
"loss": 2.7608,
"step": 708
},
{
"epoch": 0.7188846641318124,
"grad_norm": 2.7945899963378906,
"learning_rate": 4.065331381964252e-06,
"loss": 2.6215,
"step": 709
},
{
"epoch": 0.7198986058301647,
"grad_norm": 3.100442409515381,
"learning_rate": 4.0383793325650025e-06,
"loss": 2.5183,
"step": 710
},
{
"epoch": 0.7209125475285171,
"grad_norm": 2.81402850151062,
"learning_rate": 4.0114943000905645e-06,
"loss": 2.6106,
"step": 711
},
{
"epoch": 0.7219264892268694,
"grad_norm": 2.5642473697662354,
"learning_rate": 3.984676586766167e-06,
"loss": 2.6009,
"step": 712
},
{
"epoch": 0.7229404309252218,
"grad_norm": 3.010683536529541,
"learning_rate": 3.957926494060285e-06,
"loss": 2.5473,
"step": 713
},
{
"epoch": 0.7239543726235741,
"grad_norm": 3.6138861179351807,
"learning_rate": 3.931244322681243e-06,
"loss": 2.5092,
"step": 714
},
{
"epoch": 0.7249683143219265,
"grad_norm": 2.7241833209991455,
"learning_rate": 3.90463037257383e-06,
"loss": 2.5499,
"step": 715
},
{
"epoch": 0.7259822560202789,
"grad_norm": 2.8698906898498535,
"learning_rate": 3.8780849429159365e-06,
"loss": 2.5449,
"step": 716
},
{
"epoch": 0.7269961977186312,
"grad_norm": 2.8715462684631348,
"learning_rate": 3.851608332115192e-06,
"loss": 2.6355,
"step": 717
},
{
"epoch": 0.7280101394169836,
"grad_norm": 3.0901811122894287,
"learning_rate": 3.825200837805595e-06,
"loss": 2.5549,
"step": 718
},
{
"epoch": 0.7290240811153359,
"grad_norm": 2.8652830123901367,
"learning_rate": 3.7988627568441884e-06,
"loss": 2.7197,
"step": 719
},
{
"epoch": 0.7300380228136882,
"grad_norm": 2.704918146133423,
"learning_rate": 3.7725943853077105e-06,
"loss": 2.666,
"step": 720
},
{
"epoch": 0.7310519645120406,
"grad_norm": 3.070158004760742,
"learning_rate": 3.746396018489261e-06,
"loss": 2.679,
"step": 721
},
{
"epoch": 0.7320659062103929,
"grad_norm": 3.097076416015625,
"learning_rate": 3.7202679508950015e-06,
"loss": 2.4041,
"step": 722
},
{
"epoch": 0.7330798479087453,
"grad_norm": 3.099027395248413,
"learning_rate": 3.6942104762408183e-06,
"loss": 2.3216,
"step": 723
},
{
"epoch": 0.7340937896070976,
"grad_norm": 3.0055911540985107,
"learning_rate": 3.6682238874490463e-06,
"loss": 2.5474,
"step": 724
},
{
"epoch": 0.7351077313054499,
"grad_norm": 2.9727060794830322,
"learning_rate": 3.6423084766451622e-06,
"loss": 2.3275,
"step": 725
},
{
"epoch": 0.7361216730038023,
"grad_norm": 3.111468553543091,
"learning_rate": 3.6164645351544956e-06,
"loss": 2.6528,
"step": 726
},
{
"epoch": 0.7371356147021546,
"grad_norm": 3.101276397705078,
"learning_rate": 3.590692353498968e-06,
"loss": 2.5109,
"step": 727
},
{
"epoch": 0.738149556400507,
"grad_norm": 2.7668955326080322,
"learning_rate": 3.564992221393825e-06,
"loss": 2.6107,
"step": 728
},
{
"epoch": 0.7391634980988593,
"grad_norm": 2.7561287879943848,
"learning_rate": 3.5393644277443596e-06,
"loss": 2.4118,
"step": 729
},
{
"epoch": 0.7401774397972116,
"grad_norm": 3.077685594558716,
"learning_rate": 3.513809260642694e-06,
"loss": 2.5809,
"step": 730
},
{
"epoch": 0.741191381495564,
"grad_norm": 3.1228139400482178,
"learning_rate": 3.488327007364525e-06,
"loss": 2.3581,
"step": 731
},
{
"epoch": 0.7422053231939163,
"grad_norm": 3.054448127746582,
"learning_rate": 3.4629179543658852e-06,
"loss": 2.3218,
"step": 732
},
{
"epoch": 0.7432192648922687,
"grad_norm": 3.002232313156128,
"learning_rate": 3.437582387279946e-06,
"loss": 2.3987,
"step": 733
},
{
"epoch": 0.7442332065906211,
"grad_norm": 2.848593235015869,
"learning_rate": 3.412320590913796e-06,
"loss": 2.3704,
"step": 734
},
{
"epoch": 0.7452471482889734,
"grad_norm": 2.9372968673706055,
"learning_rate": 3.387132849245224e-06,
"loss": 2.7844,
"step": 735
},
{
"epoch": 0.7462610899873258,
"grad_norm": 3.0542829036712646,
"learning_rate": 3.3620194454195565e-06,
"loss": 2.3873,
"step": 736
},
{
"epoch": 0.7472750316856781,
"grad_norm": 3.165482759475708,
"learning_rate": 3.336980661746446e-06,
"loss": 2.6344,
"step": 737
},
{
"epoch": 0.7482889733840304,
"grad_norm": 2.833448648452759,
"learning_rate": 3.3120167796967195e-06,
"loss": 2.3989,
"step": 738
},
{
"epoch": 0.7493029150823828,
"grad_norm": 2.797510862350464,
"learning_rate": 3.2871280798992065e-06,
"loss": 2.5353,
"step": 739
},
{
"epoch": 0.7503168567807351,
"grad_norm": 2.9084537029266357,
"learning_rate": 3.262314842137573e-06,
"loss": 2.3216,
"step": 740
},
{
"epoch": 0.7513307984790875,
"grad_norm": 3.0350236892700195,
"learning_rate": 3.237577345347196e-06,
"loss": 2.5678,
"step": 741
},
{
"epoch": 0.7523447401774398,
"grad_norm": 2.864340305328369,
"learning_rate": 3.2129158676120176e-06,
"loss": 2.3784,
"step": 742
},
{
"epoch": 0.7533586818757921,
"grad_norm": 3.111661672592163,
"learning_rate": 3.1883306861614104e-06,
"loss": 2.5248,
"step": 743
},
{
"epoch": 0.7543726235741445,
"grad_norm": 3.0593674182891846,
"learning_rate": 3.1638220773670825e-06,
"loss": 2.5014,
"step": 744
},
{
"epoch": 0.7553865652724968,
"grad_norm": 3.0310568809509277,
"learning_rate": 3.1393903167399553e-06,
"loss": 2.9185,
"step": 745
},
{
"epoch": 0.7564005069708492,
"grad_norm": 2.886401891708374,
"learning_rate": 3.115035678927063e-06,
"loss": 2.5067,
"step": 746
},
{
"epoch": 0.7574144486692015,
"grad_norm": 2.6182854175567627,
"learning_rate": 3.090758437708482e-06,
"loss": 2.7723,
"step": 747
},
{
"epoch": 0.7584283903675538,
"grad_norm": 2.6697745323181152,
"learning_rate": 3.0665588659942314e-06,
"loss": 2.5685,
"step": 748
},
{
"epoch": 0.7594423320659062,
"grad_norm": 2.8407485485076904,
"learning_rate": 3.0424372358212285e-06,
"loss": 2.5759,
"step": 749
},
{
"epoch": 0.7604562737642585,
"grad_norm": 2.7550413608551025,
"learning_rate": 3.0183938183502147e-06,
"loss": 2.8251,
"step": 750
},
{
"epoch": 0.761470215462611,
"grad_norm": 2.9071035385131836,
"learning_rate": 2.9944288838627055e-06,
"loss": 2.4514,
"step": 751
},
{
"epoch": 0.7624841571609633,
"grad_norm": 2.9431047439575195,
"learning_rate": 2.970542701757967e-06,
"loss": 2.3782,
"step": 752
},
{
"epoch": 0.7634980988593156,
"grad_norm": 2.7645556926727295,
"learning_rate": 2.9467355405499788e-06,
"loss": 2.6667,
"step": 753
},
{
"epoch": 0.764512040557668,
"grad_norm": 2.9341866970062256,
"learning_rate": 2.923007667864405e-06,
"loss": 2.4445,
"step": 754
},
{
"epoch": 0.7655259822560203,
"grad_norm": 3.016434907913208,
"learning_rate": 2.8993593504356065e-06,
"loss": 2.6087,
"step": 755
},
{
"epoch": 0.7665399239543726,
"grad_norm": 2.9525911808013916,
"learning_rate": 2.8757908541036338e-06,
"loss": 2.4655,
"step": 756
},
{
"epoch": 0.767553865652725,
"grad_norm": 2.9173786640167236,
"learning_rate": 2.8523024438112236e-06,
"loss": 2.4067,
"step": 757
},
{
"epoch": 0.7685678073510773,
"grad_norm": 2.665409803390503,
"learning_rate": 2.828894383600851e-06,
"loss": 2.4272,
"step": 758
},
{
"epoch": 0.7695817490494297,
"grad_norm": 2.8478453159332275,
"learning_rate": 2.805566936611728e-06,
"loss": 2.8092,
"step": 759
},
{
"epoch": 0.770595690747782,
"grad_norm": 2.8794307708740234,
"learning_rate": 2.782320365076874e-06,
"loss": 2.5053,
"step": 760
},
{
"epoch": 0.7716096324461343,
"grad_norm": 3.1319518089294434,
"learning_rate": 2.7591549303201513e-06,
"loss": 2.6314,
"step": 761
},
{
"epoch": 0.7726235741444867,
"grad_norm": 3.0643534660339355,
"learning_rate": 2.7360708927533285e-06,
"loss": 2.588,
"step": 762
},
{
"epoch": 0.773637515842839,
"grad_norm": 2.7867236137390137,
"learning_rate": 2.7130685118731615e-06,
"loss": 2.6412,
"step": 763
},
{
"epoch": 0.7746514575411914,
"grad_norm": 2.8751060962677,
"learning_rate": 2.6901480462584707e-06,
"loss": 2.6974,
"step": 764
},
{
"epoch": 0.7756653992395437,
"grad_norm": 2.7692108154296875,
"learning_rate": 2.6673097535672287e-06,
"loss": 2.5394,
"step": 765
},
{
"epoch": 0.776679340937896,
"grad_norm": 2.860275983810425,
"learning_rate": 2.6445538905336764e-06,
"loss": 2.5796,
"step": 766
},
{
"epoch": 0.7776932826362484,
"grad_norm": 3.0435452461242676,
"learning_rate": 2.621880712965431e-06,
"loss": 2.4691,
"step": 767
},
{
"epoch": 0.7787072243346007,
"grad_norm": 2.8065273761749268,
"learning_rate": 2.5992904757406025e-06,
"loss": 2.4452,
"step": 768
},
{
"epoch": 0.779721166032953,
"grad_norm": 2.893676996231079,
"learning_rate": 2.5767834328049444e-06,
"loss": 2.5815,
"step": 769
},
{
"epoch": 0.7807351077313055,
"grad_norm": 2.7362284660339355,
"learning_rate": 2.5543598371689826e-06,
"loss": 2.4685,
"step": 770
},
{
"epoch": 0.7817490494296578,
"grad_norm": 3.473865509033203,
"learning_rate": 2.532019940905186e-06,
"loss": 2.8166,
"step": 771
},
{
"epoch": 0.7827629911280102,
"grad_norm": 2.8457305431365967,
"learning_rate": 2.5097639951451247e-06,
"loss": 2.6752,
"step": 772
},
{
"epoch": 0.7837769328263625,
"grad_norm": 2.654867172241211,
"learning_rate": 2.4875922500766414e-06,
"loss": 2.3966,
"step": 773
},
{
"epoch": 0.7847908745247149,
"grad_norm": 2.7692761421203613,
"learning_rate": 2.4655049549410535e-06,
"loss": 2.5635,
"step": 774
},
{
"epoch": 0.7858048162230672,
"grad_norm": 3.1793344020843506,
"learning_rate": 2.443502358030344e-06,
"loss": 2.7287,
"step": 775
},
{
"epoch": 0.7868187579214195,
"grad_norm": 2.9192662239074707,
"learning_rate": 2.421584706684359e-06,
"loss": 2.4833,
"step": 776
},
{
"epoch": 0.7878326996197719,
"grad_norm": 3.0066676139831543,
"learning_rate": 2.3997522472880496e-06,
"loss": 2.5308,
"step": 777
},
{
"epoch": 0.7888466413181242,
"grad_norm": 2.8126232624053955,
"learning_rate": 2.378005225268689e-06,
"loss": 2.4983,
"step": 778
},
{
"epoch": 0.7898605830164765,
"grad_norm": 2.7183611392974854,
"learning_rate": 2.3563438850931076e-06,
"loss": 2.6066,
"step": 779
},
{
"epoch": 0.7908745247148289,
"grad_norm": 2.824969530105591,
"learning_rate": 2.334768470264963e-06,
"loss": 2.4692,
"step": 780
},
{
"epoch": 0.7918884664131812,
"grad_norm": 2.9735774993896484,
"learning_rate": 2.3132792233219814e-06,
"loss": 2.7286,
"step": 781
},
{
"epoch": 0.7929024081115336,
"grad_norm": 2.792377471923828,
"learning_rate": 2.2918763858332503e-06,
"loss": 2.4081,
"step": 782
},
{
"epoch": 0.7939163498098859,
"grad_norm": 2.925187826156616,
"learning_rate": 2.2705601983964933e-06,
"loss": 2.5134,
"step": 783
},
{
"epoch": 0.7949302915082382,
"grad_norm": 2.921086549758911,
"learning_rate": 2.249330900635359e-06,
"loss": 2.4154,
"step": 784
},
{
"epoch": 0.7959442332065906,
"grad_norm": 3.2176079750061035,
"learning_rate": 2.2281887311967454e-06,
"loss": 2.5693,
"step": 785
},
{
"epoch": 0.7969581749049429,
"grad_norm": 2.746103048324585,
"learning_rate": 2.207133927748104e-06,
"loss": 2.5852,
"step": 786
},
{
"epoch": 0.7979721166032953,
"grad_norm": 2.905566453933716,
"learning_rate": 2.1861667269747623e-06,
"loss": 2.6516,
"step": 787
},
{
"epoch": 0.7989860583016477,
"grad_norm": 2.67879581451416,
"learning_rate": 2.165287364577282e-06,
"loss": 2.5244,
"step": 788
},
{
"epoch": 0.8,
"grad_norm": 2.5994820594787598,
"learning_rate": 2.1444960752687994e-06,
"loss": 2.7985,
"step": 789
},
{
"epoch": 0.8010139416983524,
"grad_norm": 2.890216112136841,
"learning_rate": 2.1237930927723736e-06,
"loss": 2.5104,
"step": 790
},
{
"epoch": 0.8020278833967047,
"grad_norm": 2.9422292709350586,
"learning_rate": 2.103178649818387e-06,
"loss": 2.6468,
"step": 791
},
{
"epoch": 0.8030418250950571,
"grad_norm": 2.9980218410491943,
"learning_rate": 2.0826529781419092e-06,
"loss": 2.6556,
"step": 792
},
{
"epoch": 0.8040557667934094,
"grad_norm": 2.815119981765747,
"learning_rate": 2.0622163084800904e-06,
"loss": 2.5436,
"step": 793
},
{
"epoch": 0.8050697084917617,
"grad_norm": 2.761706590652466,
"learning_rate": 2.0418688705695846e-06,
"loss": 2.6906,
"step": 794
},
{
"epoch": 0.8060836501901141,
"grad_norm": 2.60274338722229,
"learning_rate": 2.021610893143947e-06,
"loss": 2.7891,
"step": 795
},
{
"epoch": 0.8070975918884664,
"grad_norm": 2.892871618270874,
"learning_rate": 2.0014426039310786e-06,
"loss": 2.5267,
"step": 796
},
{
"epoch": 0.8081115335868188,
"grad_norm": 2.916266679763794,
"learning_rate": 1.9813642296506606e-06,
"loss": 2.689,
"step": 797
},
{
"epoch": 0.8091254752851711,
"grad_norm": 2.925898551940918,
"learning_rate": 1.9613759960115986e-06,
"loss": 2.6084,
"step": 798
},
{
"epoch": 0.8101394169835234,
"grad_norm": 2.8048884868621826,
"learning_rate": 1.9414781277094963e-06,
"loss": 2.6537,
"step": 799
},
{
"epoch": 0.8111533586818758,
"grad_norm": 3.465637445449829,
"learning_rate": 1.9216708484241275e-06,
"loss": 2.4329,
"step": 800
},
{
"epoch": 0.8121673003802281,
"grad_norm": 3.048293113708496,
"learning_rate": 1.9019543808169117e-06,
"loss": 2.3845,
"step": 801
},
{
"epoch": 0.8131812420785804,
"grad_norm": 2.635202169418335,
"learning_rate": 1.8823289465284244e-06,
"loss": 2.73,
"step": 802
},
{
"epoch": 0.8141951837769328,
"grad_norm": 3.521111249923706,
"learning_rate": 1.8627947661759027e-06,
"loss": 2.5559,
"step": 803
},
{
"epoch": 0.8152091254752851,
"grad_norm": 3.151444673538208,
"learning_rate": 1.8433520593507515e-06,
"loss": 2.6452,
"step": 804
},
{
"epoch": 0.8162230671736375,
"grad_norm": 2.964160680770874,
"learning_rate": 1.8240010446160973e-06,
"loss": 2.731,
"step": 805
},
{
"epoch": 0.8172370088719899,
"grad_norm": 3.4860939979553223,
"learning_rate": 1.8047419395043086e-06,
"loss": 2.437,
"step": 806
},
{
"epoch": 0.8182509505703423,
"grad_norm": 2.681875705718994,
"learning_rate": 1.7855749605145722e-06,
"loss": 2.7035,
"step": 807
},
{
"epoch": 0.8192648922686946,
"grad_norm": 2.733168601989746,
"learning_rate": 1.766500323110445e-06,
"loss": 2.7602,
"step": 808
},
{
"epoch": 0.8202788339670469,
"grad_norm": 3.0019381046295166,
"learning_rate": 1.7475182417174318e-06,
"loss": 2.561,
"step": 809
},
{
"epoch": 0.8212927756653993,
"grad_norm": 2.899308443069458,
"learning_rate": 1.7286289297205826e-06,
"loss": 2.5541,
"step": 810
},
{
"epoch": 0.8223067173637516,
"grad_norm": 3.142498254776001,
"learning_rate": 1.7098325994620934e-06,
"loss": 2.4551,
"step": 811
},
{
"epoch": 0.8233206590621039,
"grad_norm": 2.99397873878479,
"learning_rate": 1.6911294622389075e-06,
"loss": 2.8406,
"step": 812
},
{
"epoch": 0.8243346007604563,
"grad_norm": 2.874096155166626,
"learning_rate": 1.6725197283003548e-06,
"loss": 2.3478,
"step": 813
},
{
"epoch": 0.8253485424588086,
"grad_norm": 2.8119425773620605,
"learning_rate": 1.6540036068457833e-06,
"loss": 2.4847,
"step": 814
},
{
"epoch": 0.826362484157161,
"grad_norm": 2.8022053241729736,
"learning_rate": 1.6355813060221993e-06,
"loss": 2.4975,
"step": 815
},
{
"epoch": 0.8273764258555133,
"grad_norm": 2.844733715057373,
"learning_rate": 1.6172530329219416e-06,
"loss": 2.5104,
"step": 816
},
{
"epoch": 0.8283903675538656,
"grad_norm": 2.806781530380249,
"learning_rate": 1.5990189935803402e-06,
"loss": 2.7239,
"step": 817
},
{
"epoch": 0.829404309252218,
"grad_norm": 2.7376742362976074,
"learning_rate": 1.58087939297341e-06,
"loss": 2.6657,
"step": 818
},
{
"epoch": 0.8304182509505703,
"grad_norm": 2.7552506923675537,
"learning_rate": 1.5628344350155477e-06,
"loss": 2.8755,
"step": 819
},
{
"epoch": 0.8314321926489227,
"grad_norm": 2.751073122024536,
"learning_rate": 1.5448843225572218e-06,
"loss": 2.4813,
"step": 820
},
{
"epoch": 0.832446134347275,
"grad_norm": 2.7169299125671387,
"learning_rate": 1.5270292573827173e-06,
"loss": 2.473,
"step": 821
},
{
"epoch": 0.8334600760456273,
"grad_norm": 2.8066329956054688,
"learning_rate": 1.509269440207851e-06,
"loss": 2.6954,
"step": 822
},
{
"epoch": 0.8344740177439797,
"grad_norm": 2.620018720626831,
"learning_rate": 1.4916050706777185e-06,
"loss": 2.577,
"step": 823
},
{
"epoch": 0.8354879594423321,
"grad_norm": 2.9988808631896973,
"learning_rate": 1.474036347364446e-06,
"loss": 2.5485,
"step": 824
},
{
"epoch": 0.8365019011406845,
"grad_norm": 2.9113547801971436,
"learning_rate": 1.4565634677649786e-06,
"loss": 2.4276,
"step": 825
},
{
"epoch": 0.8375158428390368,
"grad_norm": 2.95076060295105,
"learning_rate": 1.4391866282988266e-06,
"loss": 2.5785,
"step": 826
},
{
"epoch": 0.8385297845373891,
"grad_norm": 2.605067253112793,
"learning_rate": 1.4219060243058879e-06,
"loss": 2.8107,
"step": 827
},
{
"epoch": 0.8395437262357415,
"grad_norm": 2.895616292953491,
"learning_rate": 1.4047218500442305e-06,
"loss": 2.9408,
"step": 828
},
{
"epoch": 0.8405576679340938,
"grad_norm": 3.193804979324341,
"learning_rate": 1.3876342986879243e-06,
"loss": 2.3704,
"step": 829
},
{
"epoch": 0.8415716096324461,
"grad_norm": 3.0509471893310547,
"learning_rate": 1.3706435623248627e-06,
"loss": 2.6771,
"step": 830
},
{
"epoch": 0.8425855513307985,
"grad_norm": 2.8694870471954346,
"learning_rate": 1.3537498319545984e-06,
"loss": 2.8275,
"step": 831
},
{
"epoch": 0.8435994930291508,
"grad_norm": 2.6654863357543945,
"learning_rate": 1.3369532974862053e-06,
"loss": 2.4856,
"step": 832
},
{
"epoch": 0.8446134347275032,
"grad_norm": 2.8032288551330566,
"learning_rate": 1.3202541477361441e-06,
"loss": 2.7882,
"step": 833
},
{
"epoch": 0.8456273764258555,
"grad_norm": 2.8845303058624268,
"learning_rate": 1.303652570426125e-06,
"loss": 2.6482,
"step": 834
},
{
"epoch": 0.8466413181242078,
"grad_norm": 3.2056705951690674,
"learning_rate": 1.2871487521810166e-06,
"loss": 2.821,
"step": 835
},
{
"epoch": 0.8476552598225602,
"grad_norm": 2.8983800411224365,
"learning_rate": 1.2707428785267396e-06,
"loss": 2.6287,
"step": 836
},
{
"epoch": 0.8486692015209125,
"grad_norm": 2.935938596725464,
"learning_rate": 1.2544351338881721e-06,
"loss": 2.8519,
"step": 837
},
{
"epoch": 0.8496831432192649,
"grad_norm": 2.7424423694610596,
"learning_rate": 1.2382257015870957e-06,
"loss": 2.8495,
"step": 838
},
{
"epoch": 0.8506970849176172,
"grad_norm": 2.661113739013672,
"learning_rate": 1.222114763840121e-06,
"loss": 2.6281,
"step": 839
},
{
"epoch": 0.8517110266159695,
"grad_norm": 2.766042947769165,
"learning_rate": 1.2061025017566374e-06,
"loss": 2.821,
"step": 840
},
{
"epoch": 0.8527249683143219,
"grad_norm": 2.742476224899292,
"learning_rate": 1.190189095336791e-06,
"loss": 2.8773,
"step": 841
},
{
"epoch": 0.8537389100126743,
"grad_norm": 2.753896474838257,
"learning_rate": 1.1743747234694437e-06,
"loss": 2.3899,
"step": 842
},
{
"epoch": 0.8547528517110267,
"grad_norm": 2.794480562210083,
"learning_rate": 1.1586595639301768e-06,
"loss": 2.4172,
"step": 843
},
{
"epoch": 0.855766793409379,
"grad_norm": 3.079993486404419,
"learning_rate": 1.143043793379287e-06,
"loss": 2.4708,
"step": 844
},
{
"epoch": 0.8567807351077313,
"grad_norm": 2.8267061710357666,
"learning_rate": 1.1275275873597957e-06,
"loss": 2.8664,
"step": 845
},
{
"epoch": 0.8577946768060837,
"grad_norm": 2.92879056930542,
"learning_rate": 1.1121111202954836e-06,
"loss": 2.5214,
"step": 846
},
{
"epoch": 0.858808618504436,
"grad_norm": 3.016521692276001,
"learning_rate": 1.096794565488929e-06,
"loss": 2.4585,
"step": 847
},
{
"epoch": 0.8598225602027884,
"grad_norm": 2.8173906803131104,
"learning_rate": 1.0815780951195521e-06,
"loss": 2.5214,
"step": 848
},
{
"epoch": 0.8608365019011407,
"grad_norm": 2.774216651916504,
"learning_rate": 1.0664618802416814e-06,
"loss": 2.3993,
"step": 849
},
{
"epoch": 0.861850443599493,
"grad_norm": 2.5246875286102295,
"learning_rate": 1.0514460907826473e-06,
"loss": 2.6165,
"step": 850
},
{
"epoch": 0.8628643852978454,
"grad_norm": 2.8525874614715576,
"learning_rate": 1.0365308955408459e-06,
"loss": 2.5919,
"step": 851
},
{
"epoch": 0.8638783269961977,
"grad_norm": 2.689223527908325,
"learning_rate": 1.0217164621838605e-06,
"loss": 2.7843,
"step": 852
},
{
"epoch": 0.86489226869455,
"grad_norm": 2.990201473236084,
"learning_rate": 1.0070029572465657e-06,
"loss": 2.7435,
"step": 853
},
{
"epoch": 0.8659062103929024,
"grad_norm": 2.704960346221924,
"learning_rate": 9.923905461292638e-07,
"loss": 2.7226,
"step": 854
},
{
"epoch": 0.8669201520912547,
"grad_norm": 2.8485541343688965,
"learning_rate": 9.77879393095823e-07,
"loss": 2.4598,
"step": 855
},
{
"epoch": 0.8679340937896071,
"grad_norm": 2.8298146724700928,
"learning_rate": 9.634696612718242e-07,
"loss": 2.6507,
"step": 856
},
{
"epoch": 0.8689480354879594,
"grad_norm": 2.8180055618286133,
"learning_rate": 9.491615126427356e-07,
"loss": 2.3754,
"step": 857
},
{
"epoch": 0.8699619771863117,
"grad_norm": 2.896484136581421,
"learning_rate": 9.349551080520913e-07,
"loss": 2.6731,
"step": 858
},
{
"epoch": 0.8709759188846641,
"grad_norm": 2.691772699356079,
"learning_rate": 9.208506071996759e-07,
"loss": 2.7255,
"step": 859
},
{
"epoch": 0.8719898605830165,
"grad_norm": 2.80303692817688,
"learning_rate": 9.068481686397324e-07,
"loss": 2.7703,
"step": 860
},
{
"epoch": 0.8730038022813689,
"grad_norm": 2.9904346466064453,
"learning_rate": 8.929479497791926e-07,
"loss": 2.6612,
"step": 861
},
{
"epoch": 0.8740177439797212,
"grad_norm": 2.7394704818725586,
"learning_rate": 8.791501068758823e-07,
"loss": 2.4495,
"step": 862
},
{
"epoch": 0.8750316856780735,
"grad_norm": 2.5872786045074463,
"learning_rate": 8.654547950367898e-07,
"loss": 2.432,
"step": 863
},
{
"epoch": 0.8760456273764259,
"grad_norm": 2.8211286067962646,
"learning_rate": 8.51862168216303e-07,
"loss": 2.6065,
"step": 864
},
{
"epoch": 0.8770595690747782,
"grad_norm": 2.8662006855010986,
"learning_rate": 8.383723792144916e-07,
"loss": 2.5171,
"step": 865
},
{
"epoch": 0.8780735107731306,
"grad_norm": 3.0293424129486084,
"learning_rate": 8.249855796753881e-07,
"loss": 2.4755,
"step": 866
},
{
"epoch": 0.8790874524714829,
"grad_norm": 3.1426124572753906,
"learning_rate": 8.117019200852716e-07,
"loss": 2.635,
"step": 867
},
{
"epoch": 0.8801013941698352,
"grad_norm": 2.7930748462677,
"learning_rate": 7.985215497709909e-07,
"loss": 2.4437,
"step": 868
},
{
"epoch": 0.8811153358681876,
"grad_norm": 2.8567628860473633,
"learning_rate": 7.854446168982777e-07,
"loss": 2.602,
"step": 869
},
{
"epoch": 0.8821292775665399,
"grad_norm": 2.7896361351013184,
"learning_rate": 7.724712684700819e-07,
"loss": 2.4404,
"step": 870
},
{
"epoch": 0.8831432192648923,
"grad_norm": 2.7019593715667725,
"learning_rate": 7.59601650324917e-07,
"loss": 2.6032,
"step": 871
},
{
"epoch": 0.8841571609632446,
"grad_norm": 3.116281747817993,
"learning_rate": 7.468359071352338e-07,
"loss": 2.4003,
"step": 872
},
{
"epoch": 0.8851711026615969,
"grad_norm": 3.2974696159362793,
"learning_rate": 7.341741824057713e-07,
"loss": 2.3171,
"step": 873
},
{
"epoch": 0.8861850443599493,
"grad_norm": 2.7963881492614746,
"learning_rate": 7.216166184719653e-07,
"loss": 2.4472,
"step": 874
},
{
"epoch": 0.8871989860583016,
"grad_norm": 3.155557155609131,
"learning_rate": 7.091633564983314e-07,
"loss": 2.3749,
"step": 875
},
{
"epoch": 0.888212927756654,
"grad_norm": 2.9222681522369385,
"learning_rate": 6.96814536476893e-07,
"loss": 2.8385,
"step": 876
},
{
"epoch": 0.8892268694550063,
"grad_norm": 2.622359275817871,
"learning_rate": 6.845702972255974e-07,
"loss": 2.6269,
"step": 877
},
{
"epoch": 0.8902408111533587,
"grad_norm": 2.6436614990234375,
"learning_rate": 6.724307763867555e-07,
"loss": 2.6646,
"step": 878
},
{
"epoch": 0.8912547528517111,
"grad_norm": 2.8023335933685303,
"learning_rate": 6.603961104255018e-07,
"loss": 2.5532,
"step": 879
},
{
"epoch": 0.8922686945500634,
"grad_norm": 3.015249729156494,
"learning_rate": 6.484664346282555e-07,
"loss": 2.6042,
"step": 880
},
{
"epoch": 0.8932826362484158,
"grad_norm": 2.866190195083618,
"learning_rate": 6.366418831011955e-07,
"loss": 2.4562,
"step": 881
},
{
"epoch": 0.8942965779467681,
"grad_norm": 2.755678653717041,
"learning_rate": 6.249225887687615e-07,
"loss": 2.4521,
"step": 882
},
{
"epoch": 0.8953105196451204,
"grad_norm": 3.0547993183135986,
"learning_rate": 6.133086833721569e-07,
"loss": 2.5823,
"step": 883
},
{
"epoch": 0.8963244613434728,
"grad_norm": 2.689230442047119,
"learning_rate": 6.018002974678616e-07,
"loss": 2.8689,
"step": 884
},
{
"epoch": 0.8973384030418251,
"grad_norm": 2.649055004119873,
"learning_rate": 5.903975604261725e-07,
"loss": 2.3195,
"step": 885
},
{
"epoch": 0.8983523447401774,
"grad_norm": 2.8953123092651367,
"learning_rate": 5.791006004297451e-07,
"loss": 2.5405,
"step": 886
},
{
"epoch": 0.8993662864385298,
"grad_norm": 2.914782762527466,
"learning_rate": 5.679095444721538e-07,
"loss": 2.6512,
"step": 887
},
{
"epoch": 0.9003802281368821,
"grad_norm": 2.9468483924865723,
"learning_rate": 5.568245183564669e-07,
"loss": 2.5355,
"step": 888
},
{
"epoch": 0.9013941698352345,
"grad_norm": 3.1716208457946777,
"learning_rate": 5.458456466938233e-07,
"loss": 2.7402,
"step": 889
},
{
"epoch": 0.9024081115335868,
"grad_norm": 3.0763444900512695,
"learning_rate": 5.349730529020436e-07,
"loss": 2.5703,
"step": 890
},
{
"epoch": 0.9034220532319391,
"grad_norm": 2.981691360473633,
"learning_rate": 5.242068592042349e-07,
"loss": 2.4712,
"step": 891
},
{
"epoch": 0.9044359949302915,
"grad_norm": 2.7758986949920654,
"learning_rate": 5.135471866274167e-07,
"loss": 2.8763,
"step": 892
},
{
"epoch": 0.9054499366286438,
"grad_norm": 2.766244411468506,
"learning_rate": 5.029941550011663e-07,
"loss": 2.5244,
"step": 893
},
{
"epoch": 0.9064638783269962,
"grad_norm": 2.747422218322754,
"learning_rate": 4.925478829562668e-07,
"loss": 2.7606,
"step": 894
},
{
"epoch": 0.9074778200253485,
"grad_norm": 2.6289525032043457,
"learning_rate": 4.822084879233746e-07,
"loss": 2.7443,
"step": 895
},
{
"epoch": 0.9084917617237009,
"grad_norm": 2.7913079261779785,
"learning_rate": 4.7197608613169685e-07,
"loss": 2.6684,
"step": 896
},
{
"epoch": 0.9095057034220533,
"grad_norm": 2.585127353668213,
"learning_rate": 4.618507926076954e-07,
"loss": 2.6322,
"step": 897
},
{
"epoch": 0.9105196451204056,
"grad_norm": 2.8578853607177734,
"learning_rate": 4.518327211737761e-07,
"loss": 2.7563,
"step": 898
},
{
"epoch": 0.911533586818758,
"grad_norm": 2.9707372188568115,
"learning_rate": 4.4192198444702685e-07,
"loss": 2.4981,
"step": 899
},
{
"epoch": 0.9125475285171103,
"grad_norm": 2.8918051719665527,
"learning_rate": 4.3211869383793735e-07,
"loss": 2.4355,
"step": 900
},
{
"epoch": 0.9135614702154626,
"grad_norm": 2.5744242668151855,
"learning_rate": 4.2242295954915913e-07,
"loss": 2.8608,
"step": 901
},
{
"epoch": 0.914575411913815,
"grad_norm": 3.1196258068084717,
"learning_rate": 4.128348905742585e-07,
"loss": 2.6406,
"step": 902
},
{
"epoch": 0.9155893536121673,
"grad_norm": 2.755964994430542,
"learning_rate": 4.0335459469649117e-07,
"loss": 2.4686,
"step": 903
},
{
"epoch": 0.9166032953105197,
"grad_norm": 2.6820852756500244,
"learning_rate": 3.9398217848759637e-07,
"loss": 2.6021,
"step": 904
},
{
"epoch": 0.917617237008872,
"grad_norm": 2.6925339698791504,
"learning_rate": 3.847177473065955e-07,
"loss": 2.3577,
"step": 905
},
{
"epoch": 0.9186311787072243,
"grad_norm": 3.050182819366455,
"learning_rate": 3.755614052986056e-07,
"loss": 2.7241,
"step": 906
},
{
"epoch": 0.9196451204055767,
"grad_norm": 2.7628440856933594,
"learning_rate": 3.66513255393669e-07,
"loss": 2.2088,
"step": 907
},
{
"epoch": 0.920659062103929,
"grad_norm": 2.9826819896698,
"learning_rate": 3.575733993056063e-07,
"loss": 2.2886,
"step": 908
},
{
"epoch": 0.9216730038022813,
"grad_norm": 2.6961429119110107,
"learning_rate": 3.4874193753085426e-07,
"loss": 2.5012,
"step": 909
},
{
"epoch": 0.9226869455006337,
"grad_norm": 2.7817845344543457,
"learning_rate": 3.4001896934735436e-07,
"loss": 2.6966,
"step": 910
},
{
"epoch": 0.923700887198986,
"grad_norm": 2.5238335132598877,
"learning_rate": 3.314045928134224e-07,
"loss": 2.5631,
"step": 911
},
{
"epoch": 0.9247148288973384,
"grad_norm": 2.87608003616333,
"learning_rate": 3.2289890476665975e-07,
"loss": 2.4905,
"step": 912
},
{
"epoch": 0.9257287705956907,
"grad_norm": 2.7551474571228027,
"learning_rate": 3.145020008228539e-07,
"loss": 2.2909,
"step": 913
},
{
"epoch": 0.9267427122940431,
"grad_norm": 2.788245677947998,
"learning_rate": 3.0621397537490494e-07,
"loss": 2.7361,
"step": 914
},
{
"epoch": 0.9277566539923955,
"grad_norm": 2.9203968048095703,
"learning_rate": 2.9803492159177103e-07,
"loss": 2.3171,
"step": 915
},
{
"epoch": 0.9287705956907478,
"grad_norm": 2.6838016510009766,
"learning_rate": 2.8996493141741686e-07,
"loss": 2.5996,
"step": 916
},
{
"epoch": 0.9297845373891002,
"grad_norm": 2.8150994777679443,
"learning_rate": 2.8200409556977894e-07,
"loss": 2.5204,
"step": 917
},
{
"epoch": 0.9307984790874525,
"grad_norm": 2.642317533493042,
"learning_rate": 2.74152503539743e-07,
"loss": 2.7703,
"step": 918
},
{
"epoch": 0.9318124207858048,
"grad_norm": 2.8072235584259033,
"learning_rate": 2.6641024359015056e-07,
"loss": 2.6409,
"step": 919
},
{
"epoch": 0.9328263624841572,
"grad_norm": 3.1344387531280518,
"learning_rate": 2.587774027547918e-07,
"loss": 2.3684,
"step": 920
},
{
"epoch": 0.9338403041825095,
"grad_norm": 2.733198642730713,
"learning_rate": 2.5125406683743417e-07,
"loss": 2.5659,
"step": 921
},
{
"epoch": 0.9348542458808619,
"grad_norm": 2.8597402572631836,
"learning_rate": 2.438403204108597e-07,
"loss": 2.6037,
"step": 922
},
{
"epoch": 0.9358681875792142,
"grad_norm": 2.898003578186035,
"learning_rate": 2.3653624681591048e-07,
"loss": 2.5481,
"step": 923
},
{
"epoch": 0.9368821292775665,
"grad_norm": 2.923548698425293,
"learning_rate": 2.2934192816055355e-07,
"loss": 2.6664,
"step": 924
},
{
"epoch": 0.9378960709759189,
"grad_norm": 2.659433364868164,
"learning_rate": 2.2225744531895632e-07,
"loss": 2.6727,
"step": 925
},
{
"epoch": 0.9389100126742712,
"grad_norm": 3.0512163639068604,
"learning_rate": 2.1528287793057934e-07,
"loss": 2.3878,
"step": 926
},
{
"epoch": 0.9399239543726235,
"grad_norm": 2.760099411010742,
"learning_rate": 2.0841830439928045e-07,
"loss": 2.5098,
"step": 927
},
{
"epoch": 0.9409378960709759,
"grad_norm": 3.138976812362671,
"learning_rate": 2.016638018924344e-07,
"loss": 2.5367,
"step": 928
},
{
"epoch": 0.9419518377693282,
"grad_norm": 2.7796897888183594,
"learning_rate": 1.95019446340059e-07,
"loss": 2.4234,
"step": 929
},
{
"epoch": 0.9429657794676806,
"grad_norm": 2.895256519317627,
"learning_rate": 1.8848531243397471e-07,
"loss": 2.3556,
"step": 930
},
{
"epoch": 0.9439797211660329,
"grad_norm": 2.9773645401000977,
"learning_rate": 1.8206147362695214e-07,
"loss": 2.341,
"step": 931
},
{
"epoch": 0.9449936628643854,
"grad_norm": 2.657684564590454,
"learning_rate": 1.7574800213189137e-07,
"loss": 2.5773,
"step": 932
},
{
"epoch": 0.9460076045627377,
"grad_norm": 2.844395160675049,
"learning_rate": 1.6954496892101047e-07,
"loss": 2.6748,
"step": 933
},
{
"epoch": 0.94702154626109,
"grad_norm": 3.145855188369751,
"learning_rate": 1.6345244372504842e-07,
"loss": 2.597,
"step": 934
},
{
"epoch": 0.9480354879594424,
"grad_norm": 3.008000373840332,
"learning_rate": 1.5747049503248013e-07,
"loss": 2.4269,
"step": 935
},
{
"epoch": 0.9490494296577947,
"grad_norm": 2.8397281169891357,
"learning_rate": 1.5159919008874368e-07,
"loss": 2.5656,
"step": 936
},
{
"epoch": 0.950063371356147,
"grad_norm": 2.601118803024292,
"learning_rate": 1.458385948954899e-07,
"loss": 2.7157,
"step": 937
},
{
"epoch": 0.9510773130544994,
"grad_norm": 2.9411909580230713,
"learning_rate": 1.4018877420983956e-07,
"loss": 2.3314,
"step": 938
},
{
"epoch": 0.9520912547528517,
"grad_norm": 2.7778210639953613,
"learning_rate": 1.3464979154364844e-07,
"loss": 2.558,
"step": 939
},
{
"epoch": 0.9531051964512041,
"grad_norm": 2.8902809619903564,
"learning_rate": 1.2922170916280118e-07,
"loss": 2.7303,
"step": 940
},
{
"epoch": 0.9541191381495564,
"grad_norm": 2.9819915294647217,
"learning_rate": 1.2390458808651085e-07,
"loss": 2.2839,
"step": 941
},
{
"epoch": 0.9551330798479087,
"grad_norm": 2.7157628536224365,
"learning_rate": 1.186984880866271e-07,
"loss": 2.5223,
"step": 942
},
{
"epoch": 0.9561470215462611,
"grad_norm": 2.838313579559326,
"learning_rate": 1.1360346768696907e-07,
"loss": 2.4969,
"step": 943
},
{
"epoch": 0.9571609632446134,
"grad_norm": 2.63149094581604,
"learning_rate": 1.0861958416266805e-07,
"loss": 2.8127,
"step": 944
},
{
"epoch": 0.9581749049429658,
"grad_norm": 2.867000102996826,
"learning_rate": 1.0374689353952027e-07,
"loss": 2.5835,
"step": 945
},
{
"epoch": 0.9591888466413181,
"grad_norm": 3.0157129764556885,
"learning_rate": 9.898545059335852e-08,
"loss": 2.4226,
"step": 946
},
{
"epoch": 0.9602027883396704,
"grad_norm": 3.223989486694336,
"learning_rate": 9.433530884943698e-08,
"loss": 2.5887,
"step": 947
},
{
"epoch": 0.9612167300380228,
"grad_norm": 2.9932541847229004,
"learning_rate": 8.979652058183185e-08,
"loss": 2.7176,
"step": 948
},
{
"epoch": 0.9622306717363751,
"grad_norm": 3.0052859783172607,
"learning_rate": 8.536913681284731e-08,
"loss": 2.3166,
"step": 949
},
{
"epoch": 0.9632446134347274,
"grad_norm": 2.8431527614593506,
"learning_rate": 8.105320731244703e-08,
"loss": 2.2966,
"step": 950
},
{
"epoch": 0.9642585551330799,
"grad_norm": 2.9385998249053955,
"learning_rate": 7.684878059769363e-08,
"loss": 2.7675,
"step": 951
},
{
"epoch": 0.9652724968314322,
"grad_norm": 2.7830967903137207,
"learning_rate": 7.275590393220456e-08,
"loss": 2.5262,
"step": 952
},
{
"epoch": 0.9662864385297846,
"grad_norm": 2.4871981143951416,
"learning_rate": 6.877462332561479e-08,
"loss": 2.4801,
"step": 953
},
{
"epoch": 0.9673003802281369,
"grad_norm": 3.1035871505737305,
"learning_rate": 6.49049835330684e-08,
"loss": 2.6659,
"step": 954
},
{
"epoch": 0.9683143219264893,
"grad_norm": 2.684152603149414,
"learning_rate": 6.114702805471107e-08,
"loss": 2.6706,
"step": 955
},
{
"epoch": 0.9693282636248416,
"grad_norm": 2.5241572856903076,
"learning_rate": 5.750079913519835e-08,
"loss": 2.6851,
"step": 956
},
{
"epoch": 0.9703422053231939,
"grad_norm": 2.860586166381836,
"learning_rate": 5.3966337763223795e-08,
"loss": 2.8105,
"step": 957
},
{
"epoch": 0.9713561470215463,
"grad_norm": 2.7309417724609375,
"learning_rate": 5.054368367106044e-08,
"loss": 2.4778,
"step": 958
},
{
"epoch": 0.9723700887198986,
"grad_norm": 2.763823986053467,
"learning_rate": 4.723287533411003e-08,
"loss": 2.6899,
"step": 959
},
{
"epoch": 0.973384030418251,
"grad_norm": 3.758967399597168,
"learning_rate": 4.403394997047339e-08,
"loss": 2.6874,
"step": 960
},
{
"epoch": 0.9743979721166033,
"grad_norm": 2.725534200668335,
"learning_rate": 4.094694354052742e-08,
"loss": 2.4807,
"step": 961
},
{
"epoch": 0.9754119138149556,
"grad_norm": 2.951953172683716,
"learning_rate": 3.797189074652874e-08,
"loss": 2.7076,
"step": 962
},
{
"epoch": 0.976425855513308,
"grad_norm": 2.8677074909210205,
"learning_rate": 3.5108825032217355e-08,
"loss": 2.6133,
"step": 963
},
{
"epoch": 0.9774397972116603,
"grad_norm": 2.856584310531616,
"learning_rate": 3.235777858244027e-08,
"loss": 2.733,
"step": 964
},
{
"epoch": 0.9784537389100126,
"grad_norm": 2.748422622680664,
"learning_rate": 2.9718782322794015e-08,
"loss": 2.5125,
"step": 965
},
{
"epoch": 0.979467680608365,
"grad_norm": 2.9938743114471436,
"learning_rate": 2.719186591927603e-08,
"loss": 2.5371,
"step": 966
},
{
"epoch": 0.9804816223067173,
"grad_norm": 3.208367109298706,
"learning_rate": 2.4777057777946034e-08,
"loss": 2.5367,
"step": 967
},
{
"epoch": 0.9814955640050697,
"grad_norm": 2.9483325481414795,
"learning_rate": 2.2474385044615188e-08,
"loss": 2.4301,
"step": 968
},
{
"epoch": 0.9825095057034221,
"grad_norm": 2.8662376403808594,
"learning_rate": 2.028387360453188e-08,
"loss": 2.4986,
"step": 969
},
{
"epoch": 0.9835234474017744,
"grad_norm": 2.7627525329589844,
"learning_rate": 1.8205548082099733e-08,
"loss": 2.353,
"step": 970
},
{
"epoch": 0.9845373891001268,
"grad_norm": 2.735853910446167,
"learning_rate": 1.623943184059229e-08,
"loss": 2.9126,
"step": 971
},
{
"epoch": 0.9855513307984791,
"grad_norm": 2.9360198974609375,
"learning_rate": 1.4385546981897647e-08,
"loss": 2.3977,
"step": 972
},
{
"epoch": 0.9865652724968315,
"grad_norm": 3.8521199226379395,
"learning_rate": 1.264391434626533e-08,
"loss": 2.6582,
"step": 973
},
{
"epoch": 0.9875792141951838,
"grad_norm": 2.916172981262207,
"learning_rate": 1.1014553512072036e-08,
"loss": 2.4163,
"step": 974
},
{
"epoch": 0.9885931558935361,
"grad_norm": 2.8792824745178223,
"learning_rate": 9.49748279560514e-09,
"loss": 2.369,
"step": 975
},
{
"epoch": 0.9896070975918885,
"grad_norm": 2.9305737018585205,
"learning_rate": 8.092719250853975e-09,
"loss": 2.5004,
"step": 976
},
{
"epoch": 0.9906210392902408,
"grad_norm": 2.9403040409088135,
"learning_rate": 6.800278669317762e-09,
"loss": 2.4728,
"step": 977
},
{
"epoch": 0.9916349809885932,
"grad_norm": 2.7486960887908936,
"learning_rate": 5.6201755798313e-09,
"loss": 2.6734,
"step": 978
},
{
"epoch": 0.9926489226869455,
"grad_norm": 2.7036991119384766,
"learning_rate": 4.55242324839622e-09,
"loss": 2.7176,
"step": 979
},
{
"epoch": 0.9936628643852978,
"grad_norm": 2.9134089946746826,
"learning_rate": 3.597033678038875e-09,
"loss": 2.6166,
"step": 980
},
{
"epoch": 0.9946768060836502,
"grad_norm": 3.0970189571380615,
"learning_rate": 2.7540176086671145e-09,
"loss": 2.4814,
"step": 981
},
{
"epoch": 0.9956907477820025,
"grad_norm": 2.951559066772461,
"learning_rate": 2.02338451695816e-09,
"loss": 2.4173,
"step": 982
},
{
"epoch": 0.9967046894803548,
"grad_norm": 2.8410286903381348,
"learning_rate": 1.4051426162464687e-09,
"loss": 2.7181,
"step": 983
},
{
"epoch": 0.9977186311787072,
"grad_norm": 2.9073750972747803,
"learning_rate": 8.992988564315852e-10,
"loss": 2.7888,
"step": 984
},
{
"epoch": 0.9987325728770595,
"grad_norm": 2.890777826309204,
"learning_rate": 5.058589239026468e-10,
"loss": 2.5691,
"step": 985
},
{
"epoch": 0.9997465145754119,
"grad_norm": 2.7914986610412598,
"learning_rate": 2.2482724147177005e-10,
"loss": 2.6085,
"step": 986
},
{
"epoch": 1.0,
"grad_norm": 7.341601371765137,
"learning_rate": 5.620696832964179e-11,
"loss": 1.953,
"step": 987
}
],
"logging_steps": 1,
"max_steps": 987,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.404770150455706e+16,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}