llama-3B-diffusion / last-checkpoint / trainer_state.json
Training in progress, step 27301, checkpoint
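What follows is the raw Hugging Face Trainer state for this checkpoint: the best checkpoint and best metric at the top, then the per-step log_history (training entries carry "loss", evaluation entries carry "eval_loss"). As a quick orientation, the minimal Python sketch below shows one way to load the file and pull out the best checkpoint and the logged losses; it is not part of the checkpoint, and the local file path is an assumption.

# Minimal sketch (standard library only) for inspecting this trainer_state.json.
# The path below is an assumption; point it at your local copy of the checkpoint.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print("best checkpoint:", state["best_model_checkpoint"])
print("best eval loss: ", state["best_metric"])

# log_history mixes training entries (with "loss") and eval entries (with "eval_loss").
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_points)} training log points, {len(eval_points)} eval points")
print("last logged eval:", eval_points[-1] if eval_points else "none")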
{
"best_global_step": 24500,
"best_metric": 1.4431298971176147,
"best_model_checkpoint": "./ar-diffusion-checkpoints/checkpoint-24500",
"epoch": 2.09991539112376,
"eval_steps": 250,
"global_step": 27301,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003845858010922237,
"grad_norm": 8.077690124511719,
"learning_rate": 6.579999999999999e-05,
"loss": 10.7559,
"step": 50
},
{
"epoch": 0.007691716021844474,
"grad_norm": 7.270859241485596,
"learning_rate": 0.00013299999999999998,
"loss": 6.4993,
"step": 100
},
{
"epoch": 0.01153757403276671,
"grad_norm": 6.350255012512207,
"learning_rate": 0.00013976839086798278,
"loss": 5.8214,
"step": 150
},
{
"epoch": 0.015383432043688947,
"grad_norm": 5.809306621551514,
"learning_rate": 0.00013951104738796366,
"loss": 5.3663,
"step": 200
},
{
"epoch": 0.019229290054611183,
"grad_norm": 3.9576303958892822,
"learning_rate": 0.00013925370390794456,
"loss": 5.3697,
"step": 250
},
{
"epoch": 0.019229290054611183,
"eval_loss": 5.2464423179626465,
"eval_runtime": 18.6939,
"eval_samples_per_second": 53.493,
"eval_steps_per_second": 13.373,
"step": 250
},
{
"epoch": 0.02307514806553342,
"grad_norm": 5.081186771392822,
"learning_rate": 0.00013899636042792544,
"loss": 5.0419,
"step": 300
},
{
"epoch": 0.02692100607645566,
"grad_norm": 5.957707405090332,
"learning_rate": 0.0001387390169479063,
"loss": 4.8305,
"step": 350
},
{
"epoch": 0.030766864087377895,
"grad_norm": 3.9519667625427246,
"learning_rate": 0.0001384816734678872,
"loss": 5.1118,
"step": 400
},
{
"epoch": 0.03461272209830013,
"grad_norm": 2.498075485229492,
"learning_rate": 0.00013822432998786808,
"loss": 5.0262,
"step": 450
},
{
"epoch": 0.038458580109222366,
"grad_norm": 4.084473609924316,
"learning_rate": 0.00013796698650784896,
"loss": 5.0738,
"step": 500
},
{
"epoch": 0.038458580109222366,
"eval_loss": 4.9789862632751465,
"eval_runtime": 18.8768,
"eval_samples_per_second": 52.975,
"eval_steps_per_second": 13.244,
"step": 500
},
{
"epoch": 0.0423044381201446,
"grad_norm": 6.3689374923706055,
"learning_rate": 0.00013771478989743022,
"loss": 4.9228,
"step": 550
},
{
"epoch": 0.04615029613106684,
"grad_norm": 3.9407873153686523,
"learning_rate": 0.0001374574464174111,
"loss": 4.976,
"step": 600
},
{
"epoch": 0.04999615414198908,
"grad_norm": 4.298041343688965,
"learning_rate": 0.00013720010293739198,
"loss": 4.6802,
"step": 650
},
{
"epoch": 0.05384201215291132,
"grad_norm": 3.756016492843628,
"learning_rate": 0.0001369427594573729,
"loss": 4.7095,
"step": 700
},
{
"epoch": 0.05768787016383355,
"grad_norm": 4.344913959503174,
"learning_rate": 0.00013668541597735377,
"loss": 4.8664,
"step": 750
},
{
"epoch": 0.05768787016383355,
"eval_loss": 4.762838363647461,
"eval_runtime": 18.772,
"eval_samples_per_second": 53.271,
"eval_steps_per_second": 13.318,
"step": 750
},
{
"epoch": 0.06153372817475579,
"grad_norm": 4.1537275314331055,
"learning_rate": 0.00013642807249733465,
"loss": 4.9688,
"step": 800
},
{
"epoch": 0.06537958618567802,
"grad_norm": 4.85400915145874,
"learning_rate": 0.00013617072901731553,
"loss": 4.8658,
"step": 850
},
{
"epoch": 0.06922544419660026,
"grad_norm": 4.026614189147949,
"learning_rate": 0.0001359133855372964,
"loss": 4.893,
"step": 900
},
{
"epoch": 0.0730713022075225,
"grad_norm": 3.84721040725708,
"learning_rate": 0.0001356560420572773,
"loss": 4.6926,
"step": 950
},
{
"epoch": 0.07691716021844473,
"grad_norm": 9.182045936584473,
"learning_rate": 0.00013539869857725817,
"loss": 4.881,
"step": 1000
},
{
"epoch": 0.07691716021844473,
"eval_loss": 4.709664344787598,
"eval_runtime": 18.8053,
"eval_samples_per_second": 53.177,
"eval_steps_per_second": 13.294,
"step": 1000
},
{
"epoch": 0.08076301822936698,
"grad_norm": 5.442048072814941,
"learning_rate": 0.00013514135509723907,
"loss": 4.6134,
"step": 1050
},
{
"epoch": 0.0846088762402892,
"grad_norm": 4.779583930969238,
"learning_rate": 0.00013488401161721995,
"loss": 4.7226,
"step": 1100
},
{
"epoch": 0.08845473425121145,
"grad_norm": 3.221238851547241,
"learning_rate": 0.0001346266681372008,
"loss": 4.6837,
"step": 1150
},
{
"epoch": 0.09230059226213368,
"grad_norm": 5.55983304977417,
"learning_rate": 0.0001343693246571817,
"loss": 4.672,
"step": 1200
},
{
"epoch": 0.09614645027305592,
"grad_norm": 6.964417934417725,
"learning_rate": 0.0001341119811771626,
"loss": 4.9043,
"step": 1250
},
{
"epoch": 0.09614645027305592,
"eval_loss": 4.7052001953125,
"eval_runtime": 18.9307,
"eval_samples_per_second": 52.824,
"eval_steps_per_second": 13.206,
"step": 1250
},
{
"epoch": 0.09999230828397816,
"grad_norm": 7.476005554199219,
"learning_rate": 0.00013385463769714347,
"loss": 4.7776,
"step": 1300
},
{
"epoch": 0.10383816629490039,
"grad_norm": 3.4916040897369385,
"learning_rate": 0.00013359729421712435,
"loss": 4.7738,
"step": 1350
},
{
"epoch": 0.10768402430582263,
"grad_norm": 4.028671741485596,
"learning_rate": 0.00013333995073710526,
"loss": 4.6459,
"step": 1400
},
{
"epoch": 0.11152988231674486,
"grad_norm": 4.597095489501953,
"learning_rate": 0.0001330826072570861,
"loss": 4.6778,
"step": 1450
},
{
"epoch": 0.1153757403276671,
"grad_norm": 5.779391288757324,
"learning_rate": 0.000132825263777067,
"loss": 4.7938,
"step": 1500
},
{
"epoch": 0.1153757403276671,
"eval_loss": 4.696172714233398,
"eval_runtime": 18.8705,
"eval_samples_per_second": 52.993,
"eval_steps_per_second": 13.248,
"step": 1500
},
{
"epoch": 0.11922159833858934,
"grad_norm": 3.801748752593994,
"learning_rate": 0.0001325679202970479,
"loss": 4.7912,
"step": 1550
},
{
"epoch": 0.12306745634951158,
"grad_norm": 8.367344856262207,
"learning_rate": 0.00013231057681702878,
"loss": 4.7281,
"step": 1600
},
{
"epoch": 0.12691331436043382,
"grad_norm": 4.299734592437744,
"learning_rate": 0.00013205323333700966,
"loss": 4.7263,
"step": 1650
},
{
"epoch": 0.13075917237135604,
"grad_norm": 6.152933597564697,
"learning_rate": 0.00013179588985699054,
"loss": 4.8519,
"step": 1700
},
{
"epoch": 0.13460503038227828,
"grad_norm": 4.300355434417725,
"learning_rate": 0.00013153854637697142,
"loss": 4.8359,
"step": 1750
},
{
"epoch": 0.13460503038227828,
"eval_loss": 4.635708808898926,
"eval_runtime": 18.5455,
"eval_samples_per_second": 53.922,
"eval_steps_per_second": 13.48,
"step": 1750
},
{
"epoch": 0.13845088839320052,
"grad_norm": 2.1330080032348633,
"learning_rate": 0.0001312812028969523,
"loss": 4.807,
"step": 1800
},
{
"epoch": 0.14229674640412276,
"grad_norm": 4.667717456817627,
"learning_rate": 0.00013102385941693318,
"loss": 4.6633,
"step": 1850
},
{
"epoch": 0.146142604415045,
"grad_norm": 6.904145240783691,
"learning_rate": 0.00013076651593691408,
"loss": 4.7899,
"step": 1900
},
{
"epoch": 0.14998846242596722,
"grad_norm": 2.930926561355591,
"learning_rate": 0.00013050917245689496,
"loss": 4.6692,
"step": 1950
},
{
"epoch": 0.15383432043688947,
"grad_norm": 3.6246345043182373,
"learning_rate": 0.00013025182897687584,
"loss": 4.781,
"step": 2000
},
{
"epoch": 0.15383432043688947,
"eval_loss": 4.620576858520508,
"eval_runtime": 18.7692,
"eval_samples_per_second": 53.279,
"eval_steps_per_second": 13.32,
"step": 2000
},
{
"epoch": 0.1576801784478117,
"grad_norm": 3.5292210578918457,
"learning_rate": 0.00012999448549685672,
"loss": 4.7815,
"step": 2050
},
{
"epoch": 0.16152603645873395,
"grad_norm": 4.665738105773926,
"learning_rate": 0.0001297371420168376,
"loss": 4.6789,
"step": 2100
},
{
"epoch": 0.16537189446965617,
"grad_norm": 4.332949161529541,
"learning_rate": 0.00012947979853681848,
"loss": 4.5991,
"step": 2150
},
{
"epoch": 0.1692177524805784,
"grad_norm": 3.8279120922088623,
"learning_rate": 0.00012922245505679936,
"loss": 4.5791,
"step": 2200
},
{
"epoch": 0.17306361049150065,
"grad_norm": 1.9522042274475098,
"learning_rate": 0.00012896511157678027,
"loss": 4.5643,
"step": 2250
},
{
"epoch": 0.17306361049150065,
"eval_loss": 4.609655857086182,
"eval_runtime": 18.946,
"eval_samples_per_second": 52.782,
"eval_steps_per_second": 13.195,
"step": 2250
},
{
"epoch": 0.1769094685024229,
"grad_norm": 4.264033794403076,
"learning_rate": 0.00012870776809676115,
"loss": 4.6666,
"step": 2300
},
{
"epoch": 0.18075532651334514,
"grad_norm": 4.572433948516846,
"learning_rate": 0.000128450424616742,
"loss": 4.6096,
"step": 2350
},
{
"epoch": 0.18460118452426735,
"grad_norm": 3.8559391498565674,
"learning_rate": 0.0001281930811367229,
"loss": 4.6425,
"step": 2400
},
{
"epoch": 0.1884470425351896,
"grad_norm": 2.9414010047912598,
"learning_rate": 0.0001279357376567038,
"loss": 4.6336,
"step": 2450
},
{
"epoch": 0.19229290054611184,
"grad_norm": 4.745160102844238,
"learning_rate": 0.00012767839417668467,
"loss": 4.6792,
"step": 2500
},
{
"epoch": 0.19229290054611184,
"eval_loss": 4.558788776397705,
"eval_runtime": 18.9882,
"eval_samples_per_second": 52.664,
"eval_steps_per_second": 13.166,
"step": 2500
},
{
"epoch": 0.19613875855703408,
"grad_norm": 2.456908702850342,
"learning_rate": 0.00012742105069666555,
"loss": 4.3847,
"step": 2550
},
{
"epoch": 0.19998461656795632,
"grad_norm": 5.154629707336426,
"learning_rate": 0.00012716370721664645,
"loss": 4.6019,
"step": 2600
},
{
"epoch": 0.20383047457887854,
"grad_norm": 3.0423479080200195,
"learning_rate": 0.0001269063637366273,
"loss": 4.4796,
"step": 2650
},
{
"epoch": 0.20767633258980078,
"grad_norm": 4.218437194824219,
"learning_rate": 0.00012664902025660819,
"loss": 4.5566,
"step": 2700
},
{
"epoch": 0.21152219060072303,
"grad_norm": 5.20380163192749,
"learning_rate": 0.0001263916767765891,
"loss": 4.3311,
"step": 2750
},
{
"epoch": 0.21152219060072303,
"eval_loss": 4.574987888336182,
"eval_runtime": 18.8565,
"eval_samples_per_second": 53.032,
"eval_steps_per_second": 13.258,
"step": 2750
},
{
"epoch": 0.21536804861164527,
"grad_norm": 4.369246482849121,
"learning_rate": 0.00012613433329656997,
"loss": 4.4131,
"step": 2800
},
{
"epoch": 0.21921390662256748,
"grad_norm": 5.0442376136779785,
"learning_rate": 0.00012587698981655085,
"loss": 4.5027,
"step": 2850
},
{
"epoch": 0.22305976463348973,
"grad_norm": 3.6387200355529785,
"learning_rate": 0.00012561964633653173,
"loss": 4.6659,
"step": 2900
},
{
"epoch": 0.22690562264441197,
"grad_norm": 3.7960562705993652,
"learning_rate": 0.0001253623028565126,
"loss": 4.4826,
"step": 2950
},
{
"epoch": 0.2307514806553342,
"grad_norm": 4.273965835571289,
"learning_rate": 0.0001251049593764935,
"loss": 4.5869,
"step": 3000
},
{
"epoch": 0.2307514806553342,
"eval_loss": 4.55267858505249,
"eval_runtime": 18.9735,
"eval_samples_per_second": 52.705,
"eval_steps_per_second": 13.176,
"step": 3000
},
{
"epoch": 0.23459733866625646,
"grad_norm": 4.74845027923584,
"learning_rate": 0.00012484761589647437,
"loss": 4.6248,
"step": 3050
},
{
"epoch": 0.23844319667717867,
"grad_norm": 6.299524784088135,
"learning_rate": 0.00012459027241645528,
"loss": 4.5457,
"step": 3100
},
{
"epoch": 0.2422890546881009,
"grad_norm": 5.853606700897217,
"learning_rate": 0.00012433292893643616,
"loss": 4.5135,
"step": 3150
},
{
"epoch": 0.24613491269902316,
"grad_norm": 3.1514365673065186,
"learning_rate": 0.00012407558545641704,
"loss": 4.672,
"step": 3200
},
{
"epoch": 0.2499807707099454,
"grad_norm": 8.455827713012695,
"learning_rate": 0.00012381824197639792,
"loss": 4.6545,
"step": 3250
},
{
"epoch": 0.2499807707099454,
"eval_loss": 4.550297737121582,
"eval_runtime": 18.9801,
"eval_samples_per_second": 52.687,
"eval_steps_per_second": 13.172,
"step": 3250
},
{
"epoch": 0.25382662872086764,
"grad_norm": 2.8094310760498047,
"learning_rate": 0.0001235608984963788,
"loss": 4.5392,
"step": 3300
},
{
"epoch": 0.2576724867317899,
"grad_norm": 3.2565436363220215,
"learning_rate": 0.00012330355501635968,
"loss": 4.481,
"step": 3350
},
{
"epoch": 0.26151834474271207,
"grad_norm": 3.5588488578796387,
"learning_rate": 0.00012304621153634056,
"loss": 4.5543,
"step": 3400
},
{
"epoch": 0.2653642027536343,
"grad_norm": 3.0696310997009277,
"learning_rate": 0.00012278886805632146,
"loss": 4.5858,
"step": 3450
},
{
"epoch": 0.26921006076455656,
"grad_norm": 3.886117935180664,
"learning_rate": 0.00012253152457630234,
"loss": 4.4694,
"step": 3500
},
{
"epoch": 0.26921006076455656,
"eval_loss": 4.488556861877441,
"eval_runtime": 18.9212,
"eval_samples_per_second": 52.851,
"eval_steps_per_second": 13.213,
"step": 3500
},
{
"epoch": 0.2730559187754788,
"grad_norm": 3.794307231903076,
"learning_rate": 0.00012227418109628322,
"loss": 4.4994,
"step": 3550
},
{
"epoch": 0.27690177678640104,
"grad_norm": 3.5770812034606934,
"learning_rate": 0.00012201683761626409,
"loss": 4.5888,
"step": 3600
},
{
"epoch": 0.2807476347973233,
"grad_norm": 4.770874500274658,
"learning_rate": 0.00012175949413624498,
"loss": 4.5644,
"step": 3650
},
{
"epoch": 0.28459349280824553,
"grad_norm": 3.4447147846221924,
"learning_rate": 0.00012150215065622586,
"loss": 4.5301,
"step": 3700
},
{
"epoch": 0.2884393508191678,
"grad_norm": 4.76978063583374,
"learning_rate": 0.00012124480717620675,
"loss": 4.5563,
"step": 3750
},
{
"epoch": 0.2884393508191678,
"eval_loss": 4.53049898147583,
"eval_runtime": 18.9074,
"eval_samples_per_second": 52.889,
"eval_steps_per_second": 13.222,
"step": 3750
},
{
"epoch": 0.29228520883009,
"grad_norm": 5.7456512451171875,
"learning_rate": 0.00012098746369618763,
"loss": 4.5612,
"step": 3800
},
{
"epoch": 0.2961310668410122,
"grad_norm": 5.577849864959717,
"learning_rate": 0.00012073012021616851,
"loss": 4.4629,
"step": 3850
},
{
"epoch": 0.29997692485193445,
"grad_norm": 4.432284832000732,
"learning_rate": 0.00012047277673614939,
"loss": 4.6661,
"step": 3900
},
{
"epoch": 0.3038227828628567,
"grad_norm": 5.174475193023682,
"learning_rate": 0.00012021543325613027,
"loss": 4.4835,
"step": 3950
},
{
"epoch": 0.30766864087377893,
"grad_norm": 3.5657413005828857,
"learning_rate": 0.00011995808977611117,
"loss": 4.4894,
"step": 4000
},
{
"epoch": 0.30766864087377893,
"eval_loss": 4.5068535804748535,
"eval_runtime": 18.7951,
"eval_samples_per_second": 53.205,
"eval_steps_per_second": 13.301,
"step": 4000
},
{
"epoch": 0.3115144988847012,
"grad_norm": 3.854024648666382,
"learning_rate": 0.00011970074629609205,
"loss": 4.4989,
"step": 4050
},
{
"epoch": 0.3153603568956234,
"grad_norm": 4.0870490074157715,
"learning_rate": 0.00011944340281607294,
"loss": 4.3779,
"step": 4100
},
{
"epoch": 0.31920621490654566,
"grad_norm": 4.4627251625061035,
"learning_rate": 0.0001191860593360538,
"loss": 4.5526,
"step": 4150
},
{
"epoch": 0.3230520729174679,
"grad_norm": 7.568991184234619,
"learning_rate": 0.00011892871585603468,
"loss": 4.6285,
"step": 4200
},
{
"epoch": 0.32689793092839015,
"grad_norm": 4.214425086975098,
"learning_rate": 0.00011867137237601558,
"loss": 4.5328,
"step": 4250
},
{
"epoch": 0.32689793092839015,
"eval_loss": 4.511099815368652,
"eval_runtime": 18.7154,
"eval_samples_per_second": 53.432,
"eval_steps_per_second": 13.358,
"step": 4250
},
{
"epoch": 0.33074378893931233,
"grad_norm": 2.3888497352600098,
"learning_rate": 0.00011841402889599646,
"loss": 4.5408,
"step": 4300
},
{
"epoch": 0.3345896469502346,
"grad_norm": 3.128143548965454,
"learning_rate": 0.00011815668541597735,
"loss": 4.3879,
"step": 4350
},
{
"epoch": 0.3384355049611568,
"grad_norm": 4.353067874908447,
"learning_rate": 0.00011789934193595823,
"loss": 4.5091,
"step": 4400
},
{
"epoch": 0.34228136297207906,
"grad_norm": 4.771759986877441,
"learning_rate": 0.00011764199845593911,
"loss": 4.407,
"step": 4450
},
{
"epoch": 0.3461272209830013,
"grad_norm": 2.9524829387664795,
"learning_rate": 0.00011738465497591999,
"loss": 4.3798,
"step": 4500
},
{
"epoch": 0.3461272209830013,
"eval_loss": 4.479401588439941,
"eval_runtime": 18.8172,
"eval_samples_per_second": 53.143,
"eval_steps_per_second": 13.286,
"step": 4500
},
{
"epoch": 0.34997307899392355,
"grad_norm": 4.825377941131592,
"learning_rate": 0.00011712731149590087,
"loss": 4.5321,
"step": 4550
},
{
"epoch": 0.3538189370048458,
"grad_norm": 3.5786240100860596,
"learning_rate": 0.00011686996801588176,
"loss": 4.5819,
"step": 4600
},
{
"epoch": 0.35766479501576803,
"grad_norm": 4.445742130279541,
"learning_rate": 0.00011661262453586264,
"loss": 4.5954,
"step": 4650
},
{
"epoch": 0.3615106530266903,
"grad_norm": 4.670301914215088,
"learning_rate": 0.00011635528105584354,
"loss": 4.3381,
"step": 4700
},
{
"epoch": 0.36535651103761246,
"grad_norm": 3.0563037395477295,
"learning_rate": 0.0001160979375758244,
"loss": 4.4451,
"step": 4750
},
{
"epoch": 0.36535651103761246,
"eval_loss": 4.503940582275391,
"eval_runtime": 19.0274,
"eval_samples_per_second": 52.556,
"eval_steps_per_second": 13.139,
"step": 4750
},
{
"epoch": 0.3692023690485347,
"grad_norm": 4.921920299530029,
"learning_rate": 0.00011584059409580528,
"loss": 4.5505,
"step": 4800
},
{
"epoch": 0.37304822705945695,
"grad_norm": 4.440188407897949,
"learning_rate": 0.00011558325061578617,
"loss": 4.5339,
"step": 4850
},
{
"epoch": 0.3768940850703792,
"grad_norm": 4.123379707336426,
"learning_rate": 0.00011532590713576705,
"loss": 4.5001,
"step": 4900
},
{
"epoch": 0.38073994308130144,
"grad_norm": 3.6461265087127686,
"learning_rate": 0.00011506856365574795,
"loss": 4.4704,
"step": 4950
},
{
"epoch": 0.3845858010922237,
"grad_norm": 4.586422443389893,
"learning_rate": 0.00011481122017572883,
"loss": 4.5607,
"step": 5000
},
{
"epoch": 0.3845858010922237,
"eval_loss": 4.414160251617432,
"eval_runtime": 18.6554,
"eval_samples_per_second": 53.604,
"eval_steps_per_second": 13.401,
"step": 5000
},
{
"epoch": 0.3884316591031459,
"grad_norm": 2.658412456512451,
"learning_rate": 0.00011455387669570971,
"loss": 4.5453,
"step": 5050
},
{
"epoch": 0.39227751711406816,
"grad_norm": 2.231886148452759,
"learning_rate": 0.00011429653321569059,
"loss": 4.5524,
"step": 5100
},
{
"epoch": 0.3961233751249904,
"grad_norm": 4.202503204345703,
"learning_rate": 0.00011403918973567147,
"loss": 4.5274,
"step": 5150
},
{
"epoch": 0.39996923313591265,
"grad_norm": 2.8525800704956055,
"learning_rate": 0.00011378184625565236,
"loss": 4.5095,
"step": 5200
},
{
"epoch": 0.40381509114683484,
"grad_norm": 3.2517142295837402,
"learning_rate": 0.00011352964964523362,
"loss": 4.5043,
"step": 5250
},
{
"epoch": 0.40381509114683484,
"eval_loss": 4.595612525939941,
"eval_runtime": 18.9024,
"eval_samples_per_second": 52.903,
"eval_steps_per_second": 13.226,
"step": 5250
},
{
"epoch": 0.4076609491577571,
"grad_norm": 5.091184616088867,
"learning_rate": 0.00011327745303481488,
"loss": 4.4768,
"step": 5300
},
{
"epoch": 0.4115068071686793,
"grad_norm": 6.631587028503418,
"learning_rate": 0.00011302010955479578,
"loss": 4.5572,
"step": 5350
},
{
"epoch": 0.41535266517960157,
"grad_norm": 3.529118299484253,
"learning_rate": 0.00011276276607477666,
"loss": 4.6685,
"step": 5400
},
{
"epoch": 0.4191985231905238,
"grad_norm": 3.1017537117004395,
"learning_rate": 0.00011250542259475754,
"loss": 4.4271,
"step": 5450
},
{
"epoch": 0.42304438120144605,
"grad_norm": 3.930664300918579,
"learning_rate": 0.00011224807911473842,
"loss": 4.5501,
"step": 5500
},
{
"epoch": 0.42304438120144605,
"eval_loss": 4.486245632171631,
"eval_runtime": 18.9209,
"eval_samples_per_second": 52.851,
"eval_steps_per_second": 13.213,
"step": 5500
},
{
"epoch": 0.4268902392123683,
"grad_norm": 4.470078945159912,
"learning_rate": 0.00011199073563471931,
"loss": 4.543,
"step": 5550
},
{
"epoch": 0.43073609722329054,
"grad_norm": 5.099395751953125,
"learning_rate": 0.00011173339215470019,
"loss": 4.4515,
"step": 5600
},
{
"epoch": 0.4345819552342128,
"grad_norm": 3.210951805114746,
"learning_rate": 0.00011147604867468107,
"loss": 4.4605,
"step": 5650
},
{
"epoch": 0.43842781324513497,
"grad_norm": 4.092874050140381,
"learning_rate": 0.00011121870519466196,
"loss": 4.6267,
"step": 5700
},
{
"epoch": 0.4422736712560572,
"grad_norm": 2.756460666656494,
"learning_rate": 0.00011096136171464283,
"loss": 4.4338,
"step": 5750
},
{
"epoch": 0.4422736712560572,
"eval_loss": 4.457804203033447,
"eval_runtime": 18.7914,
"eval_samples_per_second": 53.216,
"eval_steps_per_second": 13.304,
"step": 5750
},
{
"epoch": 0.44611952926697945,
"grad_norm": 5.140827178955078,
"learning_rate": 0.00011070401823462372,
"loss": 4.5102,
"step": 5800
},
{
"epoch": 0.4499653872779017,
"grad_norm": 6.364997863769531,
"learning_rate": 0.0001104466747546046,
"loss": 4.5594,
"step": 5850
},
{
"epoch": 0.45381124528882394,
"grad_norm": 5.3479695320129395,
"learning_rate": 0.00011018933127458548,
"loss": 4.4067,
"step": 5900
},
{
"epoch": 0.4576571032997462,
"grad_norm": 3.728893518447876,
"learning_rate": 0.00010993198779456637,
"loss": 4.4689,
"step": 5950
},
{
"epoch": 0.4615029613106684,
"grad_norm": 6.3881611824035645,
"learning_rate": 0.00010967464431454724,
"loss": 4.5641,
"step": 6000
},
{
"epoch": 0.4615029613106684,
"eval_loss": 4.457447052001953,
"eval_runtime": 18.8382,
"eval_samples_per_second": 53.084,
"eval_steps_per_second": 13.271,
"step": 6000
},
{
"epoch": 0.46534881932159067,
"grad_norm": 3.6767919063568115,
"learning_rate": 0.00010941730083452813,
"loss": 4.5798,
"step": 6050
},
{
"epoch": 0.4691946773325129,
"grad_norm": 3.8597254753112793,
"learning_rate": 0.00010915995735450901,
"loss": 4.5867,
"step": 6100
},
{
"epoch": 0.4730405353434351,
"grad_norm": 2.8041980266571045,
"learning_rate": 0.0001089026138744899,
"loss": 4.4825,
"step": 6150
},
{
"epoch": 0.47688639335435734,
"grad_norm": 3.3872950077056885,
"learning_rate": 0.00010864527039447078,
"loss": 4.5624,
"step": 6200
},
{
"epoch": 0.4807322513652796,
"grad_norm": 3.698118209838867,
"learning_rate": 0.00010838792691445166,
"loss": 4.4889,
"step": 6250
},
{
"epoch": 0.4807322513652796,
"eval_loss": 4.451441287994385,
"eval_runtime": 19.2349,
"eval_samples_per_second": 51.989,
"eval_steps_per_second": 12.997,
"step": 6250
},
{
"epoch": 0.4845781093762018,
"grad_norm": 3.7140421867370605,
"learning_rate": 0.00010813058343443254,
"loss": 4.4654,
"step": 6300
},
{
"epoch": 0.48842396738712407,
"grad_norm": 3.095348834991455,
"learning_rate": 0.00010787323995441342,
"loss": 4.4761,
"step": 6350
},
{
"epoch": 0.4922698253980463,
"grad_norm": 3.289018392562866,
"learning_rate": 0.00010761589647439432,
"loss": 4.5459,
"step": 6400
},
{
"epoch": 0.49611568340896856,
"grad_norm": 3.9891817569732666,
"learning_rate": 0.0001073585529943752,
"loss": 4.3685,
"step": 6450
},
{
"epoch": 0.4999615414198908,
"grad_norm": 4.315449237823486,
"learning_rate": 0.00010710120951435608,
"loss": 4.4197,
"step": 6500
},
{
"epoch": 0.4999615414198908,
"eval_loss": 4.4507598876953125,
"eval_runtime": 18.8652,
"eval_samples_per_second": 53.008,
"eval_steps_per_second": 13.252,
"step": 6500
},
{
"epoch": 0.503807399430813,
"grad_norm": 4.299264430999756,
"learning_rate": 0.00010684386603433697,
"loss": 4.6103,
"step": 6550
},
{
"epoch": 0.5076532574417353,
"grad_norm": 4.186795234680176,
"learning_rate": 0.00010659166942391823,
"loss": 4.5303,
"step": 6600
},
{
"epoch": 0.5114991154526575,
"grad_norm": 2.925708293914795,
"learning_rate": 0.00010633432594389911,
"loss": 4.4265,
"step": 6650
},
{
"epoch": 0.5153449734635798,
"grad_norm": 6.368393421173096,
"learning_rate": 0.00010607698246388,
"loss": 4.3358,
"step": 6700
},
{
"epoch": 0.519190831474502,
"grad_norm": 4.947482585906982,
"learning_rate": 0.00010581963898386088,
"loss": 4.5812,
"step": 6750
},
{
"epoch": 0.519190831474502,
"eval_loss": 4.466405868530273,
"eval_runtime": 18.8333,
"eval_samples_per_second": 53.097,
"eval_steps_per_second": 13.274,
"step": 6750
},
{
"epoch": 0.5230366894854241,
"grad_norm": 2.469914674758911,
"learning_rate": 0.00010556229550384175,
"loss": 4.3623,
"step": 6800
},
{
"epoch": 0.5268825474963464,
"grad_norm": 5.027404308319092,
"learning_rate": 0.00010530495202382264,
"loss": 4.5466,
"step": 6850
},
{
"epoch": 0.5307284055072686,
"grad_norm": 4.797220706939697,
"learning_rate": 0.00010504760854380352,
"loss": 4.4486,
"step": 6900
},
{
"epoch": 0.5345742635181909,
"grad_norm": 5.403319358825684,
"learning_rate": 0.00010479026506378442,
"loss": 4.4919,
"step": 6950
},
{
"epoch": 0.5384201215291131,
"grad_norm": 4.601899147033691,
"learning_rate": 0.0001045329215837653,
"loss": 4.4703,
"step": 7000
},
{
"epoch": 0.5384201215291131,
"eval_loss": 4.411437034606934,
"eval_runtime": 18.8691,
"eval_samples_per_second": 52.997,
"eval_steps_per_second": 13.249,
"step": 7000
},
{
"epoch": 0.5422659795400354,
"grad_norm": 5.943952560424805,
"learning_rate": 0.00010427557810374618,
"loss": 4.3737,
"step": 7050
},
{
"epoch": 0.5461118375509576,
"grad_norm": 4.010414123535156,
"learning_rate": 0.00010401823462372706,
"loss": 4.5472,
"step": 7100
},
{
"epoch": 0.5499576955618799,
"grad_norm": 3.5218944549560547,
"learning_rate": 0.00010376089114370794,
"loss": 4.5854,
"step": 7150
},
{
"epoch": 0.5538035535728021,
"grad_norm": 9.44631290435791,
"learning_rate": 0.00010350354766368883,
"loss": 4.3883,
"step": 7200
},
{
"epoch": 0.5576494115837243,
"grad_norm": 4.5443434715271,
"learning_rate": 0.00010324620418366971,
"loss": 4.6685,
"step": 7250
},
{
"epoch": 0.5576494115837243,
"eval_loss": 4.4039154052734375,
"eval_runtime": 18.856,
"eval_samples_per_second": 53.034,
"eval_steps_per_second": 13.258,
"step": 7250
},
{
"epoch": 0.5614952695946466,
"grad_norm": 3.646768569946289,
"learning_rate": 0.0001029888607036506,
"loss": 4.5259,
"step": 7300
},
{
"epoch": 0.5653411276055688,
"grad_norm": 3.510744571685791,
"learning_rate": 0.00010273151722363148,
"loss": 4.4461,
"step": 7350
},
{
"epoch": 0.5691869856164911,
"grad_norm": 3.874558687210083,
"learning_rate": 0.00010247417374361235,
"loss": 4.3743,
"step": 7400
},
{
"epoch": 0.5730328436274132,
"grad_norm": 2.755722761154175,
"learning_rate": 0.00010221683026359324,
"loss": 4.4979,
"step": 7450
},
{
"epoch": 0.5768787016383355,
"grad_norm": 3.5653252601623535,
"learning_rate": 0.00010195948678357412,
"loss": 4.5442,
"step": 7500
},
{
"epoch": 0.5768787016383355,
"eval_loss": 4.44308614730835,
"eval_runtime": 18.8004,
"eval_samples_per_second": 53.19,
"eval_steps_per_second": 13.298,
"step": 7500
},
{
"epoch": 0.5807245596492577,
"grad_norm": 3.4961936473846436,
"learning_rate": 0.00010170214330355501,
"loss": 4.5194,
"step": 7550
},
{
"epoch": 0.58457041766018,
"grad_norm": 2.529500961303711,
"learning_rate": 0.00010144479982353589,
"loss": 4.3337,
"step": 7600
},
{
"epoch": 0.5884162756711022,
"grad_norm": 3.346160888671875,
"learning_rate": 0.00010118745634351679,
"loss": 4.5422,
"step": 7650
},
{
"epoch": 0.5922621336820244,
"grad_norm": 3.8311049938201904,
"learning_rate": 0.00010093011286349765,
"loss": 4.4191,
"step": 7700
},
{
"epoch": 0.5961079916929467,
"grad_norm": 4.324901580810547,
"learning_rate": 0.00010067276938347853,
"loss": 4.4613,
"step": 7750
},
{
"epoch": 0.5961079916929467,
"eval_loss": 4.4118547439575195,
"eval_runtime": 18.9517,
"eval_samples_per_second": 52.766,
"eval_steps_per_second": 13.191,
"step": 7750
},
{
"epoch": 0.5999538497038689,
"grad_norm": 3.888192653656006,
"learning_rate": 0.00010041542590345943,
"loss": 4.5492,
"step": 7800
},
{
"epoch": 0.6037997077147912,
"grad_norm": 2.718320608139038,
"learning_rate": 0.0001001580824234403,
"loss": 4.5371,
"step": 7850
},
{
"epoch": 0.6076455657257134,
"grad_norm": 3.5970869064331055,
"learning_rate": 9.99007389434212e-05,
"loss": 4.4835,
"step": 7900
},
{
"epoch": 0.6114914237366357,
"grad_norm": 4.563399314880371,
"learning_rate": 9.964339546340208e-05,
"loss": 4.4494,
"step": 7950
},
{
"epoch": 0.6153372817475579,
"grad_norm": 5.080177307128906,
"learning_rate": 9.938605198338294e-05,
"loss": 4.6072,
"step": 8000
},
{
"epoch": 0.6153372817475579,
"eval_loss": 4.428142547607422,
"eval_runtime": 18.8815,
"eval_samples_per_second": 52.962,
"eval_steps_per_second": 13.241,
"step": 8000
},
{
"epoch": 0.6191831397584802,
"grad_norm": 4.333257675170898,
"learning_rate": 9.912870850336384e-05,
"loss": 4.3148,
"step": 8050
},
{
"epoch": 0.6230289977694023,
"grad_norm": 5.497674465179443,
"learning_rate": 9.887136502334472e-05,
"loss": 4.5952,
"step": 8100
},
{
"epoch": 0.6268748557803245,
"grad_norm": 4.110482215881348,
"learning_rate": 9.861402154332561e-05,
"loss": 4.5036,
"step": 8150
},
{
"epoch": 0.6307207137912468,
"grad_norm": 3.9359841346740723,
"learning_rate": 9.835667806330649e-05,
"loss": 4.409,
"step": 8200
},
{
"epoch": 0.634566571802169,
"grad_norm": 4.095981597900391,
"learning_rate": 9.809933458328738e-05,
"loss": 4.3515,
"step": 8250
},
{
"epoch": 0.634566571802169,
"eval_loss": 4.438499927520752,
"eval_runtime": 18.9189,
"eval_samples_per_second": 52.857,
"eval_steps_per_second": 13.214,
"step": 8250
},
{
"epoch": 0.6384124298130913,
"grad_norm": 4.357822895050049,
"learning_rate": 9.784199110326825e-05,
"loss": 4.3767,
"step": 8300
},
{
"epoch": 0.6422582878240135,
"grad_norm": 3.039700508117676,
"learning_rate": 9.758979449284952e-05,
"loss": 4.4542,
"step": 8350
},
{
"epoch": 0.6461041458349358,
"grad_norm": 6.7661919593811035,
"learning_rate": 9.73324510128304e-05,
"loss": 4.4073,
"step": 8400
},
{
"epoch": 0.649950003845858,
"grad_norm": 4.223692893981934,
"learning_rate": 9.70751075328113e-05,
"loss": 4.4904,
"step": 8450
},
{
"epoch": 0.6537958618567803,
"grad_norm": 4.621217250823975,
"learning_rate": 9.681776405279216e-05,
"loss": 4.7717,
"step": 8500
},
{
"epoch": 0.6537958618567803,
"eval_loss": 4.392988204956055,
"eval_runtime": 18.8399,
"eval_samples_per_second": 53.079,
"eval_steps_per_second": 13.27,
"step": 8500
},
{
"epoch": 0.6576417198677025,
"grad_norm": 2.6913883686065674,
"learning_rate": 9.656042057277304e-05,
"loss": 4.4409,
"step": 8550
},
{
"epoch": 0.6614875778786247,
"grad_norm": 3.749894618988037,
"learning_rate": 9.630307709275394e-05,
"loss": 4.5101,
"step": 8600
},
{
"epoch": 0.665333435889547,
"grad_norm": 4.93977165222168,
"learning_rate": 9.604573361273482e-05,
"loss": 4.4504,
"step": 8650
},
{
"epoch": 0.6691792939004692,
"grad_norm": 4.311313152313232,
"learning_rate": 9.578839013271571e-05,
"loss": 4.4857,
"step": 8700
},
{
"epoch": 0.6730251519113915,
"grad_norm": 3.646656036376953,
"learning_rate": 9.553104665269659e-05,
"loss": 4.387,
"step": 8750
},
{
"epoch": 0.6730251519113915,
"eval_loss": 4.401506423950195,
"eval_runtime": 18.7931,
"eval_samples_per_second": 53.211,
"eval_steps_per_second": 13.303,
"step": 8750
},
{
"epoch": 0.6768710099223136,
"grad_norm": 4.352843284606934,
"learning_rate": 9.527370317267746e-05,
"loss": 4.5279,
"step": 8800
},
{
"epoch": 0.6807168679332359,
"grad_norm": 3.890216827392578,
"learning_rate": 9.501635969265835e-05,
"loss": 4.4485,
"step": 8850
},
{
"epoch": 0.6845627259441581,
"grad_norm": 3.4119713306427,
"learning_rate": 9.475901621263923e-05,
"loss": 4.4428,
"step": 8900
},
{
"epoch": 0.6884085839550804,
"grad_norm": 7.813595294952393,
"learning_rate": 9.450167273262012e-05,
"loss": 4.3308,
"step": 8950
},
{
"epoch": 0.6922544419660026,
"grad_norm": 3.079829692840576,
"learning_rate": 9.4244329252601e-05,
"loss": 4.368,
"step": 9000
},
{
"epoch": 0.6922544419660026,
"eval_loss": 4.393312931060791,
"eval_runtime": 18.7727,
"eval_samples_per_second": 53.269,
"eval_steps_per_second": 13.317,
"step": 9000
},
{
"epoch": 0.6961002999769248,
"grad_norm": 9.26623821258545,
"learning_rate": 9.39869857725819e-05,
"loss": 4.3073,
"step": 9050
},
{
"epoch": 0.6999461579878471,
"grad_norm": 3.5981953144073486,
"learning_rate": 9.372964229256276e-05,
"loss": 4.3923,
"step": 9100
},
{
"epoch": 0.7037920159987693,
"grad_norm": 3.734813690185547,
"learning_rate": 9.347229881254364e-05,
"loss": 4.2449,
"step": 9150
},
{
"epoch": 0.7076378740096916,
"grad_norm": 5.646871566772461,
"learning_rate": 9.321495533252453e-05,
"loss": 4.3953,
"step": 9200
},
{
"epoch": 0.7114837320206138,
"grad_norm": 4.284733295440674,
"learning_rate": 9.295761185250541e-05,
"loss": 4.475,
"step": 9250
},
{
"epoch": 0.7114837320206138,
"eval_loss": 4.348310470581055,
"eval_runtime": 19.0285,
"eval_samples_per_second": 52.553,
"eval_steps_per_second": 13.138,
"step": 9250
},
{
"epoch": 0.7153295900315361,
"grad_norm": 5.92791223526001,
"learning_rate": 9.27002683724863e-05,
"loss": 4.5493,
"step": 9300
},
{
"epoch": 0.7191754480424583,
"grad_norm": 4.768808841705322,
"learning_rate": 9.244292489246719e-05,
"loss": 4.2508,
"step": 9350
},
{
"epoch": 0.7230213060533806,
"grad_norm": 3.473097562789917,
"learning_rate": 9.218558141244805e-05,
"loss": 4.4534,
"step": 9400
},
{
"epoch": 0.7268671640643027,
"grad_norm": 10.189091682434082,
"learning_rate": 9.192823793242895e-05,
"loss": 4.3883,
"step": 9450
},
{
"epoch": 0.7307130220752249,
"grad_norm": 1.9577853679656982,
"learning_rate": 9.167089445240982e-05,
"loss": 4.3191,
"step": 9500
},
{
"epoch": 0.7307130220752249,
"eval_loss": 4.328299045562744,
"eval_runtime": 18.8631,
"eval_samples_per_second": 53.014,
"eval_steps_per_second": 13.253,
"step": 9500
},
{
"epoch": 0.7345588800861472,
"grad_norm": 3.9685990810394287,
"learning_rate": 9.141355097239072e-05,
"loss": 4.325,
"step": 9550
},
{
"epoch": 0.7384047380970694,
"grad_norm": 5.303285121917725,
"learning_rate": 9.11562074923716e-05,
"loss": 4.4277,
"step": 9600
},
{
"epoch": 0.7422505961079917,
"grad_norm": 2.70599627494812,
"learning_rate": 9.089886401235249e-05,
"loss": 4.4329,
"step": 9650
},
{
"epoch": 0.7460964541189139,
"grad_norm": 4.711449146270752,
"learning_rate": 9.064152053233336e-05,
"loss": 4.251,
"step": 9700
},
{
"epoch": 0.7499423121298362,
"grad_norm": 3.0169851779937744,
"learning_rate": 9.038417705231424e-05,
"loss": 4.3483,
"step": 9750
},
{
"epoch": 0.7499423121298362,
"eval_loss": 4.341108322143555,
"eval_runtime": 18.9063,
"eval_samples_per_second": 52.893,
"eval_steps_per_second": 13.223,
"step": 9750
},
{
"epoch": 0.7537881701407584,
"grad_norm": 3.375880002975464,
"learning_rate": 9.012683357229513e-05,
"loss": 4.313,
"step": 9800
},
{
"epoch": 0.7576340281516807,
"grad_norm": 1.707850456237793,
"learning_rate": 8.986949009227601e-05,
"loss": 4.3062,
"step": 9850
},
{
"epoch": 0.7614798861626029,
"grad_norm": 3.6718738079071045,
"learning_rate": 8.96121466122569e-05,
"loss": 4.4415,
"step": 9900
},
{
"epoch": 0.7653257441735252,
"grad_norm": 3.5382699966430664,
"learning_rate": 8.935480313223778e-05,
"loss": 4.3754,
"step": 9950
},
{
"epoch": 0.7691716021844474,
"grad_norm": 4.678229808807373,
"learning_rate": 8.909745965221865e-05,
"loss": 4.4404,
"step": 10000
},
{
"epoch": 0.7691716021844474,
"eval_loss": 4.3746819496154785,
"eval_runtime": 18.7221,
"eval_samples_per_second": 53.413,
"eval_steps_per_second": 13.353,
"step": 10000
},
{
"epoch": 0.7730174601953695,
"grad_norm": 3.490699529647827,
"learning_rate": 8.884011617219954e-05,
"loss": 4.5294,
"step": 10050
},
{
"epoch": 0.7768633182062918,
"grad_norm": 4.614148139953613,
"learning_rate": 8.858277269218042e-05,
"loss": 4.2371,
"step": 10100
},
{
"epoch": 0.780709176217214,
"grad_norm": 5.6906962394714355,
"learning_rate": 8.832542921216132e-05,
"loss": 4.5472,
"step": 10150
},
{
"epoch": 0.7845550342281363,
"grad_norm": 4.382456302642822,
"learning_rate": 8.80680857321422e-05,
"loss": 4.4282,
"step": 10200
},
{
"epoch": 0.7884008922390585,
"grad_norm": 4.546772003173828,
"learning_rate": 8.781074225212309e-05,
"loss": 4.4004,
"step": 10250
},
{
"epoch": 0.7884008922390585,
"eval_loss": 4.373971462249756,
"eval_runtime": 18.9303,
"eval_samples_per_second": 52.825,
"eval_steps_per_second": 13.206,
"step": 10250
},
{
"epoch": 0.7922467502499808,
"grad_norm": 3.784317970275879,
"learning_rate": 8.755339877210395e-05,
"loss": 4.4422,
"step": 10300
},
{
"epoch": 0.796092608260903,
"grad_norm": 3.11979341506958,
"learning_rate": 8.729605529208483e-05,
"loss": 4.4909,
"step": 10350
},
{
"epoch": 0.7999384662718253,
"grad_norm": 4.9711012840271,
"learning_rate": 8.703871181206573e-05,
"loss": 4.2955,
"step": 10400
},
{
"epoch": 0.8037843242827475,
"grad_norm": 3.7663426399230957,
"learning_rate": 8.678136833204661e-05,
"loss": 4.5105,
"step": 10450
},
{
"epoch": 0.8076301822936697,
"grad_norm": 4.679628372192383,
"learning_rate": 8.65240248520275e-05,
"loss": 4.5038,
"step": 10500
},
{
"epoch": 0.8076301822936697,
"eval_loss": 4.3565592765808105,
"eval_runtime": 18.9119,
"eval_samples_per_second": 52.877,
"eval_steps_per_second": 13.219,
"step": 10500
},
{
"epoch": 0.811476040304592,
"grad_norm": 4.561670303344727,
"learning_rate": 8.626668137200838e-05,
"loss": 4.6428,
"step": 10550
},
{
"epoch": 0.8153218983155142,
"grad_norm": 3.155518054962158,
"learning_rate": 8.600933789198925e-05,
"loss": 4.4605,
"step": 10600
},
{
"epoch": 0.8191677563264365,
"grad_norm": 4.021768093109131,
"learning_rate": 8.575199441197014e-05,
"loss": 4.2982,
"step": 10650
},
{
"epoch": 0.8230136143373586,
"grad_norm": 4.348796844482422,
"learning_rate": 8.549465093195102e-05,
"loss": 4.649,
"step": 10700
},
{
"epoch": 0.8268594723482809,
"grad_norm": 4.647562503814697,
"learning_rate": 8.523730745193191e-05,
"loss": 4.2873,
"step": 10750
},
{
"epoch": 0.8268594723482809,
"eval_loss": 4.3483662605285645,
"eval_runtime": 18.9227,
"eval_samples_per_second": 52.847,
"eval_steps_per_second": 13.212,
"step": 10750
},
{
"epoch": 0.8307053303592031,
"grad_norm": 3.9260427951812744,
"learning_rate": 8.497996397191279e-05,
"loss": 4.3823,
"step": 10800
},
{
"epoch": 0.8345511883701254,
"grad_norm": 3.7108564376831055,
"learning_rate": 8.472262049189368e-05,
"loss": 4.42,
"step": 10850
},
{
"epoch": 0.8383970463810476,
"grad_norm": 4.9123663902282715,
"learning_rate": 8.446527701187455e-05,
"loss": 4.5828,
"step": 10900
},
{
"epoch": 0.8422429043919698,
"grad_norm": 3.7289183139801025,
"learning_rate": 8.420793353185543e-05,
"loss": 4.3134,
"step": 10950
},
{
"epoch": 0.8460887624028921,
"grad_norm": 4.0350542068481445,
"learning_rate": 8.395059005183632e-05,
"loss": 4.3768,
"step": 11000
},
{
"epoch": 0.8460887624028921,
"eval_loss": 4.307990074157715,
"eval_runtime": 18.7713,
"eval_samples_per_second": 53.273,
"eval_steps_per_second": 13.318,
"step": 11000
},
{
"epoch": 0.8499346204138143,
"grad_norm": 5.336431503295898,
"learning_rate": 8.36983934414176e-05,
"loss": 4.3977,
"step": 11050
},
{
"epoch": 0.8537804784247366,
"grad_norm": 4.175157070159912,
"learning_rate": 8.344104996139847e-05,
"loss": 4.4053,
"step": 11100
},
{
"epoch": 0.8576263364356588,
"grad_norm": 4.384688377380371,
"learning_rate": 8.318370648137934e-05,
"loss": 4.26,
"step": 11150
},
{
"epoch": 0.8614721944465811,
"grad_norm": 3.6022467613220215,
"learning_rate": 8.292636300136024e-05,
"loss": 4.2993,
"step": 11200
},
{
"epoch": 0.8653180524575033,
"grad_norm": 4.252429485321045,
"learning_rate": 8.266901952134112e-05,
"loss": 4.299,
"step": 11250
},
{
"epoch": 0.8653180524575033,
"eval_loss": 4.334308624267578,
"eval_runtime": 18.9071,
"eval_samples_per_second": 52.89,
"eval_steps_per_second": 13.223,
"step": 11250
},
{
"epoch": 0.8691639104684256,
"grad_norm": 3.4003775119781494,
"learning_rate": 8.241167604132201e-05,
"loss": 4.2806,
"step": 11300
},
{
"epoch": 0.8730097684793477,
"grad_norm": 3.7436835765838623,
"learning_rate": 8.215433256130289e-05,
"loss": 4.2694,
"step": 11350
},
{
"epoch": 0.8768556264902699,
"grad_norm": 2.8963701725006104,
"learning_rate": 8.189698908128376e-05,
"loss": 4.362,
"step": 11400
},
{
"epoch": 0.8807014845011922,
"grad_norm": 3.3496339321136475,
"learning_rate": 8.163964560126465e-05,
"loss": 4.3698,
"step": 11450
},
{
"epoch": 0.8845473425121144,
"grad_norm": 4.4007487297058105,
"learning_rate": 8.138230212124553e-05,
"loss": 4.2994,
"step": 11500
},
{
"epoch": 0.8845473425121144,
"eval_loss": 4.315768241882324,
"eval_runtime": 18.8056,
"eval_samples_per_second": 53.176,
"eval_steps_per_second": 13.294,
"step": 11500
},
{
"epoch": 0.8883932005230367,
"grad_norm": 5.072123050689697,
"learning_rate": 8.112495864122642e-05,
"loss": 4.5564,
"step": 11550
},
{
"epoch": 0.8922390585339589,
"grad_norm": 3.130788564682007,
"learning_rate": 8.08676151612073e-05,
"loss": 4.427,
"step": 11600
},
{
"epoch": 0.8960849165448812,
"grad_norm": 2.615147352218628,
"learning_rate": 8.06102716811882e-05,
"loss": 4.3831,
"step": 11650
},
{
"epoch": 0.8999307745558034,
"grad_norm": 8.039403915405273,
"learning_rate": 8.035292820116906e-05,
"loss": 4.3388,
"step": 11700
},
{
"epoch": 0.9037766325667257,
"grad_norm": 2.6177854537963867,
"learning_rate": 8.009558472114994e-05,
"loss": 1.4931,
"step": 11750
},
{
"epoch": 0.9037766325667257,
"eval_loss": 1.534182071685791,
"eval_runtime": 18.0719,
"eval_samples_per_second": 55.335,
"eval_steps_per_second": 13.834,
"step": 11750
},
{
"epoch": 0.9076224905776479,
"grad_norm": 1.4090014696121216,
"learning_rate": 7.983824124113084e-05,
"loss": 1.5524,
"step": 11800
},
{
"epoch": 0.9114683485885701,
"grad_norm": 1.4773452281951904,
"learning_rate": 7.958089776111171e-05,
"loss": 1.4703,
"step": 11850
},
{
"epoch": 0.9153142065994924,
"grad_norm": 1.7350648641586304,
"learning_rate": 7.932355428109261e-05,
"loss": 1.4752,
"step": 11900
},
{
"epoch": 0.9191600646104146,
"grad_norm": 1.9704972505569458,
"learning_rate": 7.906621080107349e-05,
"loss": 1.5257,
"step": 11950
},
{
"epoch": 0.9230059226213368,
"grad_norm": 1.6183151006698608,
"learning_rate": 7.880886732105437e-05,
"loss": 1.4704,
"step": 12000
},
{
"epoch": 0.9230059226213368,
"eval_loss": 1.5159597396850586,
"eval_runtime": 17.891,
"eval_samples_per_second": 55.894,
"eval_steps_per_second": 13.974,
"step": 12000
},
{
"epoch": 0.926851780632259,
"grad_norm": 1.736138939857483,
"learning_rate": 7.855152384103525e-05,
"loss": 1.5304,
"step": 12050
},
{
"epoch": 0.9306976386431813,
"grad_norm": 1.807916283607483,
"learning_rate": 7.829418036101613e-05,
"loss": 1.4984,
"step": 12100
},
{
"epoch": 0.9345434966541035,
"grad_norm": 1.1977109909057617,
"learning_rate": 7.803683688099702e-05,
"loss": 1.4307,
"step": 12150
},
{
"epoch": 0.9383893546650258,
"grad_norm": 0.8386535048484802,
"learning_rate": 7.77794934009779e-05,
"loss": 1.444,
"step": 12200
},
{
"epoch": 0.942235212675948,
"grad_norm": 1.395053744316101,
"learning_rate": 7.752214992095878e-05,
"loss": 1.4866,
"step": 12250
},
{
"epoch": 0.942235212675948,
"eval_loss": 1.5108226537704468,
"eval_runtime": 18.0888,
"eval_samples_per_second": 55.283,
"eval_steps_per_second": 13.821,
"step": 12250
},
{
"epoch": 0.9460810706868702,
"grad_norm": 1.5271111726760864,
"learning_rate": 7.726480644093966e-05,
"loss": 1.4849,
"step": 12300
},
{
"epoch": 0.9499269286977925,
"grad_norm": 3.0610506534576416,
"learning_rate": 7.700746296092054e-05,
"loss": 1.4613,
"step": 12350
},
{
"epoch": 0.9537727867087147,
"grad_norm": 1.8968026638031006,
"learning_rate": 7.675011948090143e-05,
"loss": 1.591,
"step": 12400
},
{
"epoch": 0.957618644719637,
"grad_norm": 1.748979926109314,
"learning_rate": 7.649277600088231e-05,
"loss": 1.4781,
"step": 12450
},
{
"epoch": 0.9614645027305592,
"grad_norm": 1.6586661338806152,
"learning_rate": 7.62354325208632e-05,
"loss": 1.4668,
"step": 12500
},
{
"epoch": 0.9614645027305592,
"eval_loss": 1.5503162145614624,
"eval_runtime": 17.9222,
"eval_samples_per_second": 55.797,
"eval_steps_per_second": 13.949,
"step": 12500
},
{
"epoch": 0.9653103607414815,
"grad_norm": 7.388810634613037,
"learning_rate": 7.597808904084407e-05,
"loss": 1.5683,
"step": 12550
},
{
"epoch": 0.9691562187524037,
"grad_norm": 1.5548075437545776,
"learning_rate": 7.572074556082496e-05,
"loss": 1.4956,
"step": 12600
},
{
"epoch": 0.973002076763326,
"grad_norm": 1.5935887098312378,
"learning_rate": 7.546340208080584e-05,
"loss": 1.5363,
"step": 12650
},
{
"epoch": 0.9768479347742481,
"grad_norm": 1.985238790512085,
"learning_rate": 7.520605860078672e-05,
"loss": 1.5314,
"step": 12700
},
{
"epoch": 0.9806937927851703,
"grad_norm": 1.5040565729141235,
"learning_rate": 7.494871512076762e-05,
"loss": 1.5108,
"step": 12750
},
{
"epoch": 0.9806937927851703,
"eval_loss": 1.5085468292236328,
"eval_runtime": 18.0531,
"eval_samples_per_second": 55.392,
"eval_steps_per_second": 13.848,
"step": 12750
},
{
"epoch": 0.9845396507960926,
"grad_norm": 1.2956914901733398,
"learning_rate": 7.46913716407485e-05,
"loss": 1.4287,
"step": 12800
},
{
"epoch": 0.9883855088070148,
"grad_norm": 1.1903409957885742,
"learning_rate": 7.443402816072938e-05,
"loss": 1.5583,
"step": 12850
},
{
"epoch": 0.9922313668179371,
"grad_norm": 1.9069184064865112,
"learning_rate": 7.417668468071026e-05,
"loss": 1.5214,
"step": 12900
},
{
"epoch": 0.9960772248288593,
"grad_norm": 1.7362926006317139,
"learning_rate": 7.391934120069114e-05,
"loss": 1.55,
"step": 12950
},
{
"epoch": 0.9999230828397816,
"grad_norm": 1.2136348485946655,
"learning_rate": 7.366199772067203e-05,
"loss": 1.5035,
"step": 13000
},
{
"epoch": 0.9999230828397816,
"eval_loss": 1.5033278465270996,
"eval_runtime": 18.189,
"eval_samples_per_second": 54.978,
"eval_steps_per_second": 13.745,
"step": 13000
},
{
"epoch": 1.0037689408507038,
"grad_norm": 1.291033387184143,
"learning_rate": 7.340465424065291e-05,
"loss": 1.4455,
"step": 13050
},
{
"epoch": 1.007614798861626,
"grad_norm": 1.247129201889038,
"learning_rate": 7.31473107606338e-05,
"loss": 1.4629,
"step": 13100
},
{
"epoch": 1.0114606568725482,
"grad_norm": 1.2177772521972656,
"learning_rate": 7.288996728061467e-05,
"loss": 1.5715,
"step": 13150
},
{
"epoch": 1.0153065148834706,
"grad_norm": 1.2471716403961182,
"learning_rate": 7.263262380059556e-05,
"loss": 1.4244,
"step": 13200
},
{
"epoch": 1.0191523728943928,
"grad_norm": 0.8932450413703918,
"learning_rate": 7.237528032057644e-05,
"loss": 1.4278,
"step": 13250
},
{
"epoch": 1.0191523728943928,
"eval_loss": 1.5201970338821411,
"eval_runtime": 17.9356,
"eval_samples_per_second": 55.755,
"eval_steps_per_second": 13.939,
"step": 13250
},
{
"epoch": 1.022998230905315,
"grad_norm": 1.9957834482192993,
"learning_rate": 7.211793684055732e-05,
"loss": 1.5017,
"step": 13300
},
{
"epoch": 1.0268440889162371,
"grad_norm": 1.432619571685791,
"learning_rate": 7.186059336053821e-05,
"loss": 1.4271,
"step": 13350
},
{
"epoch": 1.0306899469271595,
"grad_norm": 1.3298619985580444,
"learning_rate": 7.16032498805191e-05,
"loss": 1.5726,
"step": 13400
},
{
"epoch": 1.0345358049380817,
"grad_norm": 10.102746963500977,
"learning_rate": 7.134590640049997e-05,
"loss": 1.3938,
"step": 13450
},
{
"epoch": 1.038381662949004,
"grad_norm": 1.9288721084594727,
"learning_rate": 7.108856292048085e-05,
"loss": 1.4264,
"step": 13500
},
{
"epoch": 1.038381662949004,
"eval_loss": 1.5168194770812988,
"eval_runtime": 18.139,
"eval_samples_per_second": 55.13,
"eval_steps_per_second": 13.782,
"step": 13500
},
{
"epoch": 1.042227520959926,
"grad_norm": 2.8053858280181885,
"learning_rate": 7.083121944046175e-05,
"loss": 1.5338,
"step": 13550
},
{
"epoch": 1.0460733789708483,
"grad_norm": 1.2761131525039673,
"learning_rate": 7.057387596044263e-05,
"loss": 1.4137,
"step": 13600
},
{
"epoch": 1.0499192369817707,
"grad_norm": 1.614910364151001,
"learning_rate": 7.03165324804235e-05,
"loss": 1.4634,
"step": 13650
},
{
"epoch": 1.0537650949926929,
"grad_norm": 1.8560376167297363,
"learning_rate": 7.00591890004044e-05,
"loss": 1.5173,
"step": 13700
},
{
"epoch": 1.057610953003615,
"grad_norm": 1.3471609354019165,
"learning_rate": 6.980184552038528e-05,
"loss": 1.4887,
"step": 13750
},
{
"epoch": 1.057610953003615,
"eval_loss": 1.5006794929504395,
"eval_runtime": 18.2151,
"eval_samples_per_second": 54.9,
"eval_steps_per_second": 13.725,
"step": 13750
},
{
"epoch": 1.0614568110145373,
"grad_norm": 1.661996841430664,
"learning_rate": 6.954450204036616e-05,
"loss": 1.4428,
"step": 13800
},
{
"epoch": 1.0653026690254597,
"grad_norm": 1.2982336282730103,
"learning_rate": 6.928715856034704e-05,
"loss": 1.4565,
"step": 13850
},
{
"epoch": 1.0691485270363819,
"grad_norm": 0.9250918626785278,
"learning_rate": 6.902981508032792e-05,
"loss": 1.5353,
"step": 13900
},
{
"epoch": 1.072994385047304,
"grad_norm": 1.8084945678710938,
"learning_rate": 6.877247160030881e-05,
"loss": 1.5047,
"step": 13950
},
{
"epoch": 1.0768402430582262,
"grad_norm": 1.1049927473068237,
"learning_rate": 6.851512812028969e-05,
"loss": 1.5058,
"step": 14000
},
{
"epoch": 1.0768402430582262,
"eval_loss": 1.5043680667877197,
"eval_runtime": 18.1464,
"eval_samples_per_second": 55.107,
"eval_steps_per_second": 13.777,
"step": 14000
},
{
"epoch": 1.0806861010691486,
"grad_norm": 1.7406409978866577,
"learning_rate": 6.825778464027057e-05,
"loss": 1.3945,
"step": 14050
},
{
"epoch": 1.0845319590800708,
"grad_norm": 1.1657389402389526,
"learning_rate": 6.800044116025146e-05,
"loss": 1.4528,
"step": 14100
},
{
"epoch": 1.088377817090993,
"grad_norm": 1.380635380744934,
"learning_rate": 6.774309768023234e-05,
"loss": 1.442,
"step": 14150
},
{
"epoch": 1.0922236751019152,
"grad_norm": 1.7555848360061646,
"learning_rate": 6.748575420021322e-05,
"loss": 1.5061,
"step": 14200
},
{
"epoch": 1.0960695331128374,
"grad_norm": 1.6465975046157837,
"learning_rate": 6.72284107201941e-05,
"loss": 1.5004,
"step": 14250
},
{
"epoch": 1.0960695331128374,
"eval_loss": 1.5090863704681396,
"eval_runtime": 18.0174,
"eval_samples_per_second": 55.502,
"eval_steps_per_second": 13.876,
"step": 14250
},
{
"epoch": 1.0999153911237598,
"grad_norm": 2.0214383602142334,
"learning_rate": 6.697106724017498e-05,
"loss": 1.5436,
"step": 14300
},
{
"epoch": 1.103761249134682,
"grad_norm": 1.399170160293579,
"learning_rate": 6.671372376015588e-05,
"loss": 1.5242,
"step": 14350
},
{
"epoch": 1.1076071071456042,
"grad_norm": 2.1806626319885254,
"learning_rate": 6.645638028013676e-05,
"loss": 1.4609,
"step": 14400
},
{
"epoch": 1.1114529651565264,
"grad_norm": 1.1671562194824219,
"learning_rate": 6.619903680011763e-05,
"loss": 1.3789,
"step": 14450
},
{
"epoch": 1.1152988231674485,
"grad_norm": 1.0041520595550537,
"learning_rate": 6.594169332009851e-05,
"loss": 1.4909,
"step": 14500
},
{
"epoch": 1.1152988231674485,
"eval_loss": 1.509366750717163,
"eval_runtime": 18.0148,
"eval_samples_per_second": 55.51,
"eval_steps_per_second": 13.877,
"step": 14500
},
{
"epoch": 1.119144681178371,
"grad_norm": 1.9716360569000244,
"learning_rate": 6.568434984007941e-05,
"loss": 1.5349,
"step": 14550
},
{
"epoch": 1.1229905391892931,
"grad_norm": 0.710033655166626,
"learning_rate": 6.542700636006029e-05,
"loss": 1.4107,
"step": 14600
},
{
"epoch": 1.1268363972002153,
"grad_norm": 1.4398375749588013,
"learning_rate": 6.516966288004117e-05,
"loss": 1.4185,
"step": 14650
},
{
"epoch": 1.1306822552111375,
"grad_norm": 2.5566532611846924,
"learning_rate": 6.491231940002206e-05,
"loss": 1.5758,
"step": 14700
},
{
"epoch": 1.13452811322206,
"grad_norm": 1.2500799894332886,
"learning_rate": 6.465497592000294e-05,
"loss": 1.4751,
"step": 14750
},
{
"epoch": 1.13452811322206,
"eval_loss": 1.4990500211715698,
"eval_runtime": 17.9979,
"eval_samples_per_second": 55.562,
"eval_steps_per_second": 13.891,
"step": 14750
},
{
"epoch": 1.1383739712329821,
"grad_norm": 1.5937495231628418,
"learning_rate": 6.439763243998382e-05,
"loss": 1.5215,
"step": 14800
},
{
"epoch": 1.1422198292439043,
"grad_norm": 1.362358570098877,
"learning_rate": 6.41402889599647e-05,
"loss": 1.5125,
"step": 14850
},
{
"epoch": 1.1460656872548265,
"grad_norm": 2.1192502975463867,
"learning_rate": 6.388294547994558e-05,
"loss": 1.4485,
"step": 14900
},
{
"epoch": 1.149911545265749,
"grad_norm": 1.4089174270629883,
"learning_rate": 6.362560199992647e-05,
"loss": 1.5331,
"step": 14950
},
{
"epoch": 1.153757403276671,
"grad_norm": 1.3750373125076294,
"learning_rate": 6.336825851990735e-05,
"loss": 1.5177,
"step": 15000
},
{
"epoch": 1.153757403276671,
"eval_loss": 1.5118192434310913,
"eval_runtime": 17.9213,
"eval_samples_per_second": 55.799,
"eval_steps_per_second": 13.95,
"step": 15000
},
{
"epoch": 1.1576032612875933,
"grad_norm": 1.5460007190704346,
"learning_rate": 6.311091503988823e-05,
"loss": 1.442,
"step": 15050
},
{
"epoch": 1.1614491192985155,
"grad_norm": 1.001439094543457,
"learning_rate": 6.285357155986911e-05,
"loss": 1.5308,
"step": 15100
},
{
"epoch": 1.1652949773094377,
"grad_norm": 0.8740602731704712,
"learning_rate": 6.259622807985e-05,
"loss": 1.455,
"step": 15150
},
{
"epoch": 1.16914083532036,
"grad_norm": 2.034207820892334,
"learning_rate": 6.233888459983088e-05,
"loss": 1.5089,
"step": 15200
},
{
"epoch": 1.1729866933312822,
"grad_norm": 1.8656599521636963,
"learning_rate": 6.208154111981176e-05,
"loss": 1.5368,
"step": 15250
},
{
"epoch": 1.1729866933312822,
"eval_loss": 1.4986381530761719,
"eval_runtime": 18.1736,
"eval_samples_per_second": 55.025,
"eval_steps_per_second": 13.756,
"step": 15250
},
{
"epoch": 1.1768325513422044,
"grad_norm": 1.2697277069091797,
"learning_rate": 6.182419763979266e-05,
"loss": 1.4239,
"step": 15300
},
{
"epoch": 1.1806784093531266,
"grad_norm": 1.1131771802902222,
"learning_rate": 6.156685415977354e-05,
"loss": 1.4309,
"step": 15350
},
{
"epoch": 1.1845242673640488,
"grad_norm": 1.5322145223617554,
"learning_rate": 6.130951067975442e-05,
"loss": 1.4793,
"step": 15400
},
{
"epoch": 1.1883701253749712,
"grad_norm": 1.1703407764434814,
"learning_rate": 6.10521671997353e-05,
"loss": 1.4761,
"step": 15450
},
{
"epoch": 1.1922159833858934,
"grad_norm": 1.4056655168533325,
"learning_rate": 6.079482371971618e-05,
"loss": 1.5311,
"step": 15500
},
{
"epoch": 1.1922159833858934,
"eval_loss": 1.4925825595855713,
"eval_runtime": 18.2116,
"eval_samples_per_second": 54.91,
"eval_steps_per_second": 13.727,
"step": 15500
},
{
"epoch": 1.1960618413968156,
"grad_norm": 2.7062911987304688,
"learning_rate": 6.053748023969707e-05,
"loss": 1.4145,
"step": 15550
},
{
"epoch": 1.1999076994077378,
"grad_norm": 1.5163620710372925,
"learning_rate": 6.028013675967794e-05,
"loss": 1.4322,
"step": 15600
},
{
"epoch": 1.2037535574186602,
"grad_norm": 1.342063546180725,
"learning_rate": 6.002279327965883e-05,
"loss": 1.4696,
"step": 15650
},
{
"epoch": 1.2075994154295824,
"grad_norm": 1.8180099725723267,
"learning_rate": 5.9765449799639715e-05,
"loss": 1.4647,
"step": 15700
},
{
"epoch": 1.2114452734405046,
"grad_norm": 1.951982855796814,
"learning_rate": 5.9508106319620595e-05,
"loss": 1.5141,
"step": 15750
},
{
"epoch": 1.2114452734405046,
"eval_loss": 1.4893933534622192,
"eval_runtime": 18.1951,
"eval_samples_per_second": 54.96,
"eval_steps_per_second": 13.74,
"step": 15750
},
{
"epoch": 1.2152911314514268,
"grad_norm": 1.7536894083023071,
"learning_rate": 5.925076283960148e-05,
"loss": 1.514,
"step": 15800
},
{
"epoch": 1.2191369894623492,
"grad_norm": 1.1857939958572388,
"learning_rate": 5.899341935958237e-05,
"loss": 1.4745,
"step": 15850
},
{
"epoch": 1.2229828474732714,
"grad_norm": 1.2500842809677124,
"learning_rate": 5.873607587956324e-05,
"loss": 1.4325,
"step": 15900
},
{
"epoch": 1.2268287054841935,
"grad_norm": 2.025336742401123,
"learning_rate": 5.847873239954413e-05,
"loss": 1.4913,
"step": 15950
},
{
"epoch": 1.2306745634951157,
"grad_norm": 1.1440426111221313,
"learning_rate": 5.8221388919525014e-05,
"loss": 1.451,
"step": 16000
},
{
"epoch": 1.2306745634951157,
"eval_loss": 1.492313265800476,
"eval_runtime": 18.0024,
"eval_samples_per_second": 55.548,
"eval_steps_per_second": 13.887,
"step": 16000
},
{
"epoch": 1.234520421506038,
"grad_norm": 1.1019631624221802,
"learning_rate": 5.796404543950589e-05,
"loss": 1.3918,
"step": 16050
},
{
"epoch": 1.2383662795169603,
"grad_norm": 1.7206593751907349,
"learning_rate": 5.770670195948678e-05,
"loss": 1.4726,
"step": 16100
},
{
"epoch": 1.2422121375278825,
"grad_norm": 1.9747880697250366,
"learning_rate": 5.7449358479467666e-05,
"loss": 1.4829,
"step": 16150
},
{
"epoch": 1.2460579955388047,
"grad_norm": 1.605573058128357,
"learning_rate": 5.719201499944854e-05,
"loss": 1.4476,
"step": 16200
},
{
"epoch": 1.2499038535497269,
"grad_norm": 1.180405616760254,
"learning_rate": 5.6934671519429426e-05,
"loss": 1.3904,
"step": 16250
},
{
"epoch": 1.2499038535497269,
"eval_loss": 1.4850120544433594,
"eval_runtime": 18.0422,
"eval_samples_per_second": 55.426,
"eval_steps_per_second": 13.856,
"step": 16250
},
{
"epoch": 1.253749711560649,
"grad_norm": 1.9959101676940918,
"learning_rate": 5.667732803941031e-05,
"loss": 1.4512,
"step": 16300
},
{
"epoch": 1.2575955695715715,
"grad_norm": 1.8853541612625122,
"learning_rate": 5.641998455939119e-05,
"loss": 1.458,
"step": 16350
},
{
"epoch": 1.2614414275824937,
"grad_norm": 1.4618902206420898,
"learning_rate": 5.616264107937208e-05,
"loss": 1.4968,
"step": 16400
},
{
"epoch": 1.2652872855934159,
"grad_norm": 1.4913650751113892,
"learning_rate": 5.5905297599352965e-05,
"loss": 1.3966,
"step": 16450
},
{
"epoch": 1.2691331436043383,
"grad_norm": 1.3095403909683228,
"learning_rate": 5.564795411933384e-05,
"loss": 1.4484,
"step": 16500
},
{
"epoch": 1.2691331436043383,
"eval_loss": 1.4897910356521606,
"eval_runtime": 18.0248,
"eval_samples_per_second": 55.479,
"eval_steps_per_second": 13.87,
"step": 16500
},
{
"epoch": 1.2729790016152602,
"grad_norm": 1.4080452919006348,
"learning_rate": 5.5390610639314724e-05,
"loss": 1.4667,
"step": 16550
},
{
"epoch": 1.2768248596261826,
"grad_norm": 1.6634443998336792,
"learning_rate": 5.513326715929561e-05,
"loss": 1.4619,
"step": 16600
},
{
"epoch": 1.2806707176371048,
"grad_norm": 2.0469400882720947,
"learning_rate": 5.487592367927649e-05,
"loss": 1.4105,
"step": 16650
},
{
"epoch": 1.284516575648027,
"grad_norm": 1.5735753774642944,
"learning_rate": 5.461858019925738e-05,
"loss": 1.4002,
"step": 16700
},
{
"epoch": 1.2883624336589494,
"grad_norm": 1.43183434009552,
"learning_rate": 5.436123671923826e-05,
"loss": 1.4586,
"step": 16750
},
{
"epoch": 1.2883624336589494,
"eval_loss": 1.4708431959152222,
"eval_runtime": 18.2152,
"eval_samples_per_second": 54.899,
"eval_steps_per_second": 13.725,
"step": 16750
},
{
"epoch": 1.2922082916698716,
"grad_norm": 1.6342015266418457,
"learning_rate": 5.4103893239219136e-05,
"loss": 1.4113,
"step": 16800
},
{
"epoch": 1.2960541496807938,
"grad_norm": 3.80155873298645,
"learning_rate": 5.384654975920002e-05,
"loss": 1.4793,
"step": 16850
},
{
"epoch": 1.299900007691716,
"grad_norm": 1.4240097999572754,
"learning_rate": 5.358920627918091e-05,
"loss": 1.4072,
"step": 16900
},
{
"epoch": 1.3037458657026382,
"grad_norm": 1.4548074007034302,
"learning_rate": 5.333186279916179e-05,
"loss": 1.4275,
"step": 16950
},
{
"epoch": 1.3075917237135606,
"grad_norm": 1.7287901639938354,
"learning_rate": 5.3074519319142675e-05,
"loss": 1.4741,
"step": 17000
},
{
"epoch": 1.3075917237135606,
"eval_loss": 1.4836150407791138,
"eval_runtime": 18.0219,
"eval_samples_per_second": 55.488,
"eval_steps_per_second": 13.872,
"step": 17000
},
{
"epoch": 1.3114375817244828,
"grad_norm": 1.732088327407837,
"learning_rate": 5.281717583912356e-05,
"loss": 1.5014,
"step": 17050
},
{
"epoch": 1.315283439735405,
"grad_norm": 2.144697427749634,
"learning_rate": 5.2559832359104435e-05,
"loss": 1.4436,
"step": 17100
},
{
"epoch": 1.3191292977463271,
"grad_norm": 1.649965763092041,
"learning_rate": 5.230248887908532e-05,
"loss": 1.4334,
"step": 17150
},
{
"epoch": 1.3229751557572493,
"grad_norm": 0.8667518496513367,
"learning_rate": 5.204514539906621e-05,
"loss": 1.487,
"step": 17200
},
{
"epoch": 1.3268210137681717,
"grad_norm": 1.4567649364471436,
"learning_rate": 5.178780191904709e-05,
"loss": 1.4714,
"step": 17250
},
{
"epoch": 1.3268210137681717,
"eval_loss": 1.479749321937561,
"eval_runtime": 17.9466,
"eval_samples_per_second": 55.721,
"eval_steps_per_second": 13.93,
"step": 17250
},
{
"epoch": 1.330666871779094,
"grad_norm": 1.8523489236831665,
"learning_rate": 5.1530458439027974e-05,
"loss": 1.4718,
"step": 17300
},
{
"epoch": 1.3345127297900161,
"grad_norm": 1.091204047203064,
"learning_rate": 5.127311495900886e-05,
"loss": 1.4012,
"step": 17350
},
{
"epoch": 1.3383585878009385,
"grad_norm": 1.8271427154541016,
"learning_rate": 5.101577147898973e-05,
"loss": 1.4547,
"step": 17400
},
{
"epoch": 1.3422044458118605,
"grad_norm": 1.8682465553283691,
"learning_rate": 5.075842799897062e-05,
"loss": 1.4373,
"step": 17450
},
{
"epoch": 1.346050303822783,
"grad_norm": 2.1932857036590576,
"learning_rate": 5.0501084518951506e-05,
"loss": 1.4628,
"step": 17500
},
{
"epoch": 1.346050303822783,
"eval_loss": 1.4871113300323486,
"eval_runtime": 17.9165,
"eval_samples_per_second": 55.814,
"eval_steps_per_second": 13.954,
"step": 17500
},
{
"epoch": 1.349896161833705,
"grad_norm": 1.6970813274383545,
"learning_rate": 5.0243741038932386e-05,
"loss": 1.4442,
"step": 17550
},
{
"epoch": 1.3537420198446273,
"grad_norm": 1.0942292213439941,
"learning_rate": 4.998639755891327e-05,
"loss": 1.4769,
"step": 17600
},
{
"epoch": 1.3575878778555497,
"grad_norm": 1.720035195350647,
"learning_rate": 4.972905407889416e-05,
"loss": 1.4519,
"step": 17650
},
{
"epoch": 1.3614337358664719,
"grad_norm": 0.8887185454368591,
"learning_rate": 4.947171059887503e-05,
"loss": 1.4201,
"step": 17700
},
{
"epoch": 1.365279593877394,
"grad_norm": 1.9557030200958252,
"learning_rate": 4.921436711885592e-05,
"loss": 1.4848,
"step": 17750
},
{
"epoch": 1.365279593877394,
"eval_loss": 1.476893424987793,
"eval_runtime": 17.9988,
"eval_samples_per_second": 55.559,
"eval_steps_per_second": 13.89,
"step": 17750
},
{
"epoch": 1.3691254518883162,
"grad_norm": 1.471414566040039,
"learning_rate": 4.8957023638836804e-05,
"loss": 1.4541,
"step": 17800
},
{
"epoch": 1.3729713098992384,
"grad_norm": 1.350690484046936,
"learning_rate": 4.8699680158817684e-05,
"loss": 1.3954,
"step": 17850
},
{
"epoch": 1.3768171679101608,
"grad_norm": 0.7363431453704834,
"learning_rate": 4.844233667879857e-05,
"loss": 1.4919,
"step": 17900
},
{
"epoch": 1.380663025921083,
"grad_norm": 1.8820909261703491,
"learning_rate": 4.818499319877946e-05,
"loss": 1.4177,
"step": 17950
},
{
"epoch": 1.3845088839320052,
"grad_norm": 0.8440986275672913,
"learning_rate": 4.792764971876033e-05,
"loss": 1.3995,
"step": 18000
},
{
"epoch": 1.3845088839320052,
"eval_loss": 1.4794726371765137,
"eval_runtime": 17.9989,
"eval_samples_per_second": 55.559,
"eval_steps_per_second": 13.89,
"step": 18000
},
{
"epoch": 1.3883547419429274,
"grad_norm": 1.6790105104446411,
"learning_rate": 4.7670306238741216e-05,
"loss": 1.4791,
"step": 18050
},
{
"epoch": 1.3922005999538496,
"grad_norm": 1.1840436458587646,
"learning_rate": 4.74129627587221e-05,
"loss": 1.4021,
"step": 18100
},
{
"epoch": 1.396046457964772,
"grad_norm": 1.7883968353271484,
"learning_rate": 4.715561927870298e-05,
"loss": 1.4637,
"step": 18150
},
{
"epoch": 1.3998923159756942,
"grad_norm": 1.2177505493164062,
"learning_rate": 4.689827579868387e-05,
"loss": 1.5123,
"step": 18200
},
{
"epoch": 1.4037381739866164,
"grad_norm": 1.439232349395752,
"learning_rate": 4.6640932318664756e-05,
"loss": 1.4579,
"step": 18250
},
{
"epoch": 1.4037381739866164,
"eval_loss": 1.4953014850616455,
"eval_runtime": 17.9127,
"eval_samples_per_second": 55.826,
"eval_steps_per_second": 13.957,
"step": 18250
},
{
"epoch": 1.4075840319975388,
"grad_norm": 2.0796408653259277,
"learning_rate": 4.638358883864563e-05,
"loss": 1.4295,
"step": 18300
},
{
"epoch": 1.4114298900084608,
"grad_norm": 1.3032926321029663,
"learning_rate": 4.6126245358626515e-05,
"loss": 1.4733,
"step": 18350
},
{
"epoch": 1.4152757480193832,
"grad_norm": 0.9058660864830017,
"learning_rate": 4.58689018786074e-05,
"loss": 1.4446,
"step": 18400
},
{
"epoch": 1.4191216060303053,
"grad_norm": 2.05460786819458,
"learning_rate": 4.561155839858828e-05,
"loss": 1.4133,
"step": 18450
},
{
"epoch": 1.4229674640412275,
"grad_norm": 0.8309249877929688,
"learning_rate": 4.535421491856917e-05,
"loss": 1.456,
"step": 18500
},
{
"epoch": 1.4229674640412275,
"eval_loss": 1.480312466621399,
"eval_runtime": 18.2137,
"eval_samples_per_second": 54.904,
"eval_steps_per_second": 13.726,
"step": 18500
},
{
"epoch": 1.42681332205215,
"grad_norm": 1.0496591329574585,
"learning_rate": 4.5096871438550054e-05,
"loss": 1.3723,
"step": 18550
},
{
"epoch": 1.4306591800630721,
"grad_norm": 1.273758053779602,
"learning_rate": 4.483952795853093e-05,
"loss": 1.4747,
"step": 18600
},
{
"epoch": 1.4345050380739943,
"grad_norm": 1.3594483137130737,
"learning_rate": 4.458218447851181e-05,
"loss": 1.564,
"step": 18650
},
{
"epoch": 1.4383508960849165,
"grad_norm": 1.773634672164917,
"learning_rate": 4.43248409984927e-05,
"loss": 1.4344,
"step": 18700
},
{
"epoch": 1.4421967540958387,
"grad_norm": 0.7939924001693726,
"learning_rate": 4.406749751847358e-05,
"loss": 1.3798,
"step": 18750
},
{
"epoch": 1.4421967540958387,
"eval_loss": 1.4680087566375732,
"eval_runtime": 18.0287,
"eval_samples_per_second": 55.467,
"eval_steps_per_second": 13.867,
"step": 18750
},
{
"epoch": 1.446042612106761,
"grad_norm": 1.4785016775131226,
"learning_rate": 4.3810154038454466e-05,
"loss": 1.5316,
"step": 18800
},
{
"epoch": 1.4498884701176833,
"grad_norm": 2.1929142475128174,
"learning_rate": 4.355281055843535e-05,
"loss": 1.4498,
"step": 18850
},
{
"epoch": 1.4537343281286055,
"grad_norm": 1.816432237625122,
"learning_rate": 4.3295467078416225e-05,
"loss": 1.5089,
"step": 18900
},
{
"epoch": 1.4575801861395277,
"grad_norm": 2.589778423309326,
"learning_rate": 4.303812359839711e-05,
"loss": 1.4011,
"step": 18950
},
{
"epoch": 1.4614260441504499,
"grad_norm": 1.6828664541244507,
"learning_rate": 4.2780780118378e-05,
"loss": 1.3803,
"step": 19000
},
{
"epoch": 1.4614260441504499,
"eval_loss": 1.4737956523895264,
"eval_runtime": 17.9628,
"eval_samples_per_second": 55.67,
"eval_steps_per_second": 13.918,
"step": 19000
},
{
"epoch": 1.4652719021613723,
"grad_norm": 1.3094508647918701,
"learning_rate": 4.252343663835888e-05,
"loss": 1.4726,
"step": 19050
},
{
"epoch": 1.4691177601722945,
"grad_norm": 2.1354212760925293,
"learning_rate": 4.2266093158339764e-05,
"loss": 1.4343,
"step": 19100
},
{
"epoch": 1.4729636181832166,
"grad_norm": 1.395593523979187,
"learning_rate": 4.200874967832065e-05,
"loss": 1.4834,
"step": 19150
},
{
"epoch": 1.476809476194139,
"grad_norm": 0.8917800784111023,
"learning_rate": 4.1751406198301524e-05,
"loss": 1.4625,
"step": 19200
},
{
"epoch": 1.480655334205061,
"grad_norm": 2.179772138595581,
"learning_rate": 4.149406271828241e-05,
"loss": 1.4832,
"step": 19250
},
{
"epoch": 1.480655334205061,
"eval_loss": 1.480191946029663,
"eval_runtime": 17.952,
"eval_samples_per_second": 55.704,
"eval_steps_per_second": 13.926,
"step": 19250
},
{
"epoch": 1.4845011922159834,
"grad_norm": 1.3308861255645752,
"learning_rate": 4.12367192382633e-05,
"loss": 1.4555,
"step": 19300
},
{
"epoch": 1.4883470502269056,
"grad_norm": 1.6867352724075317,
"learning_rate": 4.0979375758244176e-05,
"loss": 1.4116,
"step": 19350
},
{
"epoch": 1.4921929082378278,
"grad_norm": 2.161247491836548,
"learning_rate": 4.072203227822506e-05,
"loss": 1.4262,
"step": 19400
},
{
"epoch": 1.4960387662487502,
"grad_norm": 1.717690110206604,
"learning_rate": 4.046468879820595e-05,
"loss": 1.3896,
"step": 19450
},
{
"epoch": 1.4998846242596724,
"grad_norm": 1.0118234157562256,
"learning_rate": 4.020734531818682e-05,
"loss": 1.4503,
"step": 19500
},
{
"epoch": 1.4998846242596724,
"eval_loss": 1.478628396987915,
"eval_runtime": 18.0209,
"eval_samples_per_second": 55.491,
"eval_steps_per_second": 13.873,
"step": 19500
},
{
"epoch": 1.5037304822705946,
"grad_norm": 0.8779070377349854,
"learning_rate": 3.995000183816771e-05,
"loss": 1.3728,
"step": 19550
},
{
"epoch": 1.5075763402815168,
"grad_norm": 1.6068123579025269,
"learning_rate": 3.9692658358148595e-05,
"loss": 1.5204,
"step": 19600
},
{
"epoch": 1.511422198292439,
"grad_norm": 1.7712832689285278,
"learning_rate": 3.9435314878129475e-05,
"loss": 1.514,
"step": 19650
},
{
"epoch": 1.5152680563033614,
"grad_norm": 1.2519572973251343,
"learning_rate": 3.917797139811036e-05,
"loss": 1.3953,
"step": 19700
},
{
"epoch": 1.5191139143142836,
"grad_norm": 1.5644786357879639,
"learning_rate": 3.892062791809125e-05,
"loss": 1.4772,
"step": 19750
},
{
"epoch": 1.5191139143142836,
"eval_loss": 1.4710900783538818,
"eval_runtime": 18.4205,
"eval_samples_per_second": 54.287,
"eval_steps_per_second": 13.572,
"step": 19750
},
{
"epoch": 1.5229597723252057,
"grad_norm": 1.6755670309066772,
"learning_rate": 3.866328443807212e-05,
"loss": 1.4148,
"step": 19800
},
{
"epoch": 1.5268056303361282,
"grad_norm": 1.7168843746185303,
"learning_rate": 3.840594095805301e-05,
"loss": 1.4211,
"step": 19850
},
{
"epoch": 1.5306514883470501,
"grad_norm": 1.5205817222595215,
"learning_rate": 3.8148597478033894e-05,
"loss": 1.4663,
"step": 19900
},
{
"epoch": 1.5344973463579725,
"grad_norm": 1.608231544494629,
"learning_rate": 3.789125399801477e-05,
"loss": 1.3634,
"step": 19950
},
{
"epoch": 1.5383432043688947,
"grad_norm": 1.5260729789733887,
"learning_rate": 3.763391051799566e-05,
"loss": 1.4114,
"step": 20000
},
{
"epoch": 1.5383432043688947,
"eval_loss": 1.4733539819717407,
"eval_runtime": 18.105,
"eval_samples_per_second": 55.233,
"eval_steps_per_second": 13.808,
"step": 20000
},
{
"epoch": 1.542189062379817,
"grad_norm": 1.4523636102676392,
"learning_rate": 3.7376567037976546e-05,
"loss": 1.4538,
"step": 20050
},
{
"epoch": 1.5460349203907393,
"grad_norm": 1.854066252708435,
"learning_rate": 3.7119223557957426e-05,
"loss": 1.4532,
"step": 20100
},
{
"epoch": 1.5498807784016613,
"grad_norm": 1.8892920017242432,
"learning_rate": 3.6861880077938306e-05,
"loss": 1.4301,
"step": 20150
},
{
"epoch": 1.5537266364125837,
"grad_norm": 1.2957504987716675,
"learning_rate": 3.6609683467519574e-05,
"loss": 1.4613,
"step": 20200
},
{
"epoch": 1.5575724944235059,
"grad_norm": 1.9040348529815674,
"learning_rate": 3.635233998750046e-05,
"loss": 1.3847,
"step": 20250
},
{
"epoch": 1.5575724944235059,
"eval_loss": 1.4672300815582275,
"eval_runtime": 17.9888,
"eval_samples_per_second": 55.59,
"eval_steps_per_second": 13.898,
"step": 20250
},
{
"epoch": 1.561418352434428,
"grad_norm": 1.4990596771240234,
"learning_rate": 3.609499650748134e-05,
"loss": 1.4243,
"step": 20300
},
{
"epoch": 1.5652642104453505,
"grad_norm": 2.344515562057495,
"learning_rate": 3.583765302746222e-05,
"loss": 1.4971,
"step": 20350
},
{
"epoch": 1.5691100684562724,
"grad_norm": 2.2836570739746094,
"learning_rate": 3.5580309547443106e-05,
"loss": 1.4641,
"step": 20400
},
{
"epoch": 1.5729559264671948,
"grad_norm": 1.0165778398513794,
"learning_rate": 3.5322966067423986e-05,
"loss": 1.4268,
"step": 20450
},
{
"epoch": 1.576801784478117,
"grad_norm": 0.5663600564002991,
"learning_rate": 3.506562258740487e-05,
"loss": 1.3487,
"step": 20500
},
{
"epoch": 1.576801784478117,
"eval_loss": 1.4733059406280518,
"eval_runtime": 18.0399,
"eval_samples_per_second": 55.433,
"eval_steps_per_second": 13.858,
"step": 20500
},
{
"epoch": 1.5806476424890392,
"grad_norm": 1.36208176612854,
"learning_rate": 3.480827910738575e-05,
"loss": 1.3615,
"step": 20550
},
{
"epoch": 1.5844935004999616,
"grad_norm": 1.6889315843582153,
"learning_rate": 3.455093562736664e-05,
"loss": 1.4174,
"step": 20600
},
{
"epoch": 1.5883393585108838,
"grad_norm": 1.2735401391983032,
"learning_rate": 3.429359214734752e-05,
"loss": 1.4482,
"step": 20650
},
{
"epoch": 1.592185216521806,
"grad_norm": 1.668188452720642,
"learning_rate": 3.4036248667328405e-05,
"loss": 1.4193,
"step": 20700
},
{
"epoch": 1.5960310745327284,
"grad_norm": 1.8626503944396973,
"learning_rate": 3.3778905187309284e-05,
"loss": 1.4477,
"step": 20750
},
{
"epoch": 1.5960310745327284,
"eval_loss": 1.4779850244522095,
"eval_runtime": 18.0373,
"eval_samples_per_second": 55.441,
"eval_steps_per_second": 13.86,
"step": 20750
},
{
"epoch": 1.5998769325436504,
"grad_norm": 1.2189550399780273,
"learning_rate": 3.352156170729017e-05,
"loss": 1.5325,
"step": 20800
},
{
"epoch": 1.6037227905545728,
"grad_norm": 2.126854658126831,
"learning_rate": 3.326421822727105e-05,
"loss": 1.5096,
"step": 20850
},
{
"epoch": 1.607568648565495,
"grad_norm": 1.7529182434082031,
"learning_rate": 3.300687474725194e-05,
"loss": 1.4629,
"step": 20900
},
{
"epoch": 1.6114145065764172,
"grad_norm": 2.2533035278320312,
"learning_rate": 3.2749531267232824e-05,
"loss": 1.4266,
"step": 20950
},
{
"epoch": 1.6152603645873396,
"grad_norm": 1.6632803678512573,
"learning_rate": 3.24921877872137e-05,
"loss": 1.5018,
"step": 21000
},
{
"epoch": 1.6152603645873396,
"eval_loss": 1.467063307762146,
"eval_runtime": 18.0767,
"eval_samples_per_second": 55.32,
"eval_steps_per_second": 13.83,
"step": 21000
},
{
"epoch": 1.6191062225982615,
"grad_norm": 2.016814708709717,
"learning_rate": 3.223484430719458e-05,
"loss": 1.434,
"step": 21050
},
{
"epoch": 1.622952080609184,
"grad_norm": 1.5766371488571167,
"learning_rate": 3.197750082717547e-05,
"loss": 1.4249,
"step": 21100
},
{
"epoch": 1.6267979386201061,
"grad_norm": 2.3865230083465576,
"learning_rate": 3.172015734715635e-05,
"loss": 1.6,
"step": 21150
},
{
"epoch": 1.6306437966310283,
"grad_norm": 1.193731427192688,
"learning_rate": 3.1462813867137236e-05,
"loss": 1.5674,
"step": 21200
},
{
"epoch": 1.6344896546419507,
"grad_norm": 1.4854563474655151,
"learning_rate": 3.120547038711812e-05,
"loss": 1.4788,
"step": 21250
},
{
"epoch": 1.6344896546419507,
"eval_loss": 1.4725981950759888,
"eval_runtime": 18.2185,
"eval_samples_per_second": 54.889,
"eval_steps_per_second": 13.722,
"step": 21250
},
{
"epoch": 1.6383355126528727,
"grad_norm": 1.3907707929611206,
"learning_rate": 3.0948126907099e-05,
"loss": 1.4752,
"step": 21300
},
{
"epoch": 1.642181370663795,
"grad_norm": 1.5267348289489746,
"learning_rate": 3.069078342707988e-05,
"loss": 1.4198,
"step": 21350
},
{
"epoch": 1.6460272286747173,
"grad_norm": 1.2138367891311646,
"learning_rate": 3.0433439947060768e-05,
"loss": 1.4302,
"step": 21400
},
{
"epoch": 1.6498730866856395,
"grad_norm": 1.3399436473846436,
"learning_rate": 3.017609646704165e-05,
"loss": 1.5098,
"step": 21450
},
{
"epoch": 1.6537189446965619,
"grad_norm": 1.543906569480896,
"learning_rate": 2.991875298702253e-05,
"loss": 1.4577,
"step": 21500
},
{
"epoch": 1.6537189446965619,
"eval_loss": 1.475114345550537,
"eval_runtime": 18.0585,
"eval_samples_per_second": 55.376,
"eval_steps_per_second": 13.844,
"step": 21500
},
{
"epoch": 1.657564802707484,
"grad_norm": 1.2780442237854004,
"learning_rate": 2.9661409507003417e-05,
"loss": 1.5179,
"step": 21550
},
{
"epoch": 1.6614106607184063,
"grad_norm": 1.206725835800171,
"learning_rate": 2.94040660269843e-05,
"loss": 1.4438,
"step": 21600
},
{
"epoch": 1.6652565187293287,
"grad_norm": 2.1834638118743896,
"learning_rate": 2.914672254696518e-05,
"loss": 1.4783,
"step": 21650
},
{
"epoch": 1.6691023767402506,
"grad_norm": 1.5568137168884277,
"learning_rate": 2.8889379066946066e-05,
"loss": 1.38,
"step": 21700
},
{
"epoch": 1.672948234751173,
"grad_norm": 1.6938014030456543,
"learning_rate": 2.863203558692695e-05,
"loss": 1.3754,
"step": 21750
},
{
"epoch": 1.672948234751173,
"eval_loss": 1.466833472251892,
"eval_runtime": 18.1069,
"eval_samples_per_second": 55.228,
"eval_steps_per_second": 13.807,
"step": 21750
},
{
"epoch": 1.6767940927620952,
"grad_norm": 1.3192166090011597,
"learning_rate": 2.837469210690783e-05,
"loss": 1.4388,
"step": 21800
},
{
"epoch": 1.6806399507730174,
"grad_norm": 2.0135934352874756,
"learning_rate": 2.8117348626888716e-05,
"loss": 1.429,
"step": 21850
},
{
"epoch": 1.6844858087839398,
"grad_norm": 1.4457674026489258,
"learning_rate": 2.78600051468696e-05,
"loss": 1.5154,
"step": 21900
},
{
"epoch": 1.6883316667948618,
"grad_norm": 1.225411295890808,
"learning_rate": 2.760266166685048e-05,
"loss": 1.4658,
"step": 21950
},
{
"epoch": 1.6921775248057842,
"grad_norm": 1.8256678581237793,
"learning_rate": 2.7345318186831365e-05,
"loss": 1.5004,
"step": 22000
},
{
"epoch": 1.6921775248057842,
"eval_loss": 1.4664525985717773,
"eval_runtime": 18.0331,
"eval_samples_per_second": 55.454,
"eval_steps_per_second": 13.863,
"step": 22000
},
{
"epoch": 1.6960233828167064,
"grad_norm": 0.8262001276016235,
"learning_rate": 2.7087974706812248e-05,
"loss": 1.4304,
"step": 22050
},
{
"epoch": 1.6998692408276286,
"grad_norm": 1.6224443912506104,
"learning_rate": 2.6830631226793128e-05,
"loss": 1.4127,
"step": 22100
},
{
"epoch": 1.703715098838551,
"grad_norm": 1.3338160514831543,
"learning_rate": 2.6573287746774014e-05,
"loss": 1.4842,
"step": 22150
},
{
"epoch": 1.707560956849473,
"grad_norm": 1.940238356590271,
"learning_rate": 2.6315944266754897e-05,
"loss": 1.4279,
"step": 22200
},
{
"epoch": 1.7114068148603954,
"grad_norm": 2.091132164001465,
"learning_rate": 2.6058600786735777e-05,
"loss": 1.3779,
"step": 22250
},
{
"epoch": 1.7114068148603954,
"eval_loss": 1.457463264465332,
"eval_runtime": 18.1835,
"eval_samples_per_second": 54.995,
"eval_steps_per_second": 13.749,
"step": 22250
},
{
"epoch": 1.7152526728713176,
"grad_norm": 1.4367913007736206,
"learning_rate": 2.5801257306716663e-05,
"loss": 1.4821,
"step": 22300
},
{
"epoch": 1.7190985308822397,
"grad_norm": 1.9735435247421265,
"learning_rate": 2.5543913826697546e-05,
"loss": 1.3754,
"step": 22350
},
{
"epoch": 1.7229443888931621,
"grad_norm": 1.4968055486679077,
"learning_rate": 2.5286570346678426e-05,
"loss": 1.4045,
"step": 22400
},
{
"epoch": 1.7267902469040843,
"grad_norm": 1.0449949502944946,
"learning_rate": 2.5029226866659312e-05,
"loss": 1.4458,
"step": 22450
},
{
"epoch": 1.7306361049150065,
"grad_norm": 1.164890170097351,
"learning_rate": 2.4771883386640196e-05,
"loss": 1.4407,
"step": 22500
},
{
"epoch": 1.7306361049150065,
"eval_loss": 1.4607012271881104,
"eval_runtime": 18.2079,
"eval_samples_per_second": 54.921,
"eval_steps_per_second": 13.73,
"step": 22500
},
{
"epoch": 1.734481962925929,
"grad_norm": 0.9285104870796204,
"learning_rate": 2.4514539906621075e-05,
"loss": 1.4243,
"step": 22550
},
{
"epoch": 1.738327820936851,
"grad_norm": 1.2848355770111084,
"learning_rate": 2.4257196426601962e-05,
"loss": 1.4596,
"step": 22600
},
{
"epoch": 1.7421736789477733,
"grad_norm": 1.4614371061325073,
"learning_rate": 2.3999852946582845e-05,
"loss": 1.3918,
"step": 22650
},
{
"epoch": 1.7460195369586955,
"grad_norm": 0.9543781876564026,
"learning_rate": 2.3742509466563724e-05,
"loss": 1.4044,
"step": 22700
},
{
"epoch": 1.7498653949696177,
"grad_norm": 1.602250099182129,
"learning_rate": 2.348516598654461e-05,
"loss": 1.4607,
"step": 22750
},
{
"epoch": 1.7498653949696177,
"eval_loss": 1.4677520990371704,
"eval_runtime": 18.158,
"eval_samples_per_second": 55.072,
"eval_steps_per_second": 13.768,
"step": 22750
},
{
"epoch": 1.75371125298054,
"grad_norm": 1.1664291620254517,
"learning_rate": 2.3227822506525494e-05,
"loss": 1.5153,
"step": 22800
},
{
"epoch": 1.757557110991462,
"grad_norm": 1.472679853439331,
"learning_rate": 2.2970479026506374e-05,
"loss": 1.4774,
"step": 22850
},
{
"epoch": 1.7614029690023845,
"grad_norm": 1.7927029132843018,
"learning_rate": 2.271313554648726e-05,
"loss": 1.4551,
"step": 22900
},
{
"epoch": 1.7652488270133067,
"grad_norm": 2.9085824489593506,
"learning_rate": 2.2455792066468143e-05,
"loss": 1.4474,
"step": 22950
},
{
"epoch": 1.7690946850242288,
"grad_norm": 1.8322957754135132,
"learning_rate": 2.2198448586449026e-05,
"loss": 1.4642,
"step": 23000
},
{
"epoch": 1.7690946850242288,
"eval_loss": 1.4676103591918945,
"eval_runtime": 17.9158,
"eval_samples_per_second": 55.817,
"eval_steps_per_second": 13.954,
"step": 23000
},
{
"epoch": 1.7729405430351513,
"grad_norm": 0.7428656220436096,
"learning_rate": 2.194110510642991e-05,
"loss": 1.4475,
"step": 23050
},
{
"epoch": 1.7767864010460732,
"grad_norm": 1.4552706480026245,
"learning_rate": 2.1683761626410793e-05,
"loss": 1.517,
"step": 23100
},
{
"epoch": 1.7806322590569956,
"grad_norm": 1.1563323736190796,
"learning_rate": 2.1426418146391676e-05,
"loss": 1.4806,
"step": 23150
},
{
"epoch": 1.7844781170679178,
"grad_norm": 1.7244662046432495,
"learning_rate": 2.116907466637256e-05,
"loss": 1.4492,
"step": 23200
},
{
"epoch": 1.78832397507884,
"grad_norm": 1.642321228981018,
"learning_rate": 2.0911731186353442e-05,
"loss": 1.4196,
"step": 23250
},
{
"epoch": 1.78832397507884,
"eval_loss": 1.4725000858306885,
"eval_runtime": 18.1814,
"eval_samples_per_second": 55.001,
"eval_steps_per_second": 13.75,
"step": 23250
},
{
"epoch": 1.7921698330897624,
"grad_norm": 1.1381646394729614,
"learning_rate": 2.0654387706334325e-05,
"loss": 1.4653,
"step": 23300
},
{
"epoch": 1.7960156911006846,
"grad_norm": 1.2550010681152344,
"learning_rate": 2.0397044226315208e-05,
"loss": 1.4836,
"step": 23350
},
{
"epoch": 1.7998615491116068,
"grad_norm": 1.4335628747940063,
"learning_rate": 2.013970074629609e-05,
"loss": 1.4403,
"step": 23400
},
{
"epoch": 1.8037074071225292,
"grad_norm": 1.8901276588439941,
"learning_rate": 1.9882357266276974e-05,
"loss": 1.4562,
"step": 23450
},
{
"epoch": 1.8075532651334512,
"grad_norm": 1.2078189849853516,
"learning_rate": 1.9625013786257857e-05,
"loss": 1.4221,
"step": 23500
},
{
"epoch": 1.8075532651334512,
"eval_loss": 1.4660383462905884,
"eval_runtime": 18.0656,
"eval_samples_per_second": 55.354,
"eval_steps_per_second": 13.838,
"step": 23500
},
{
"epoch": 1.8113991231443736,
"grad_norm": 1.6915593147277832,
"learning_rate": 1.936767030623874e-05,
"loss": 1.4296,
"step": 23550
},
{
"epoch": 1.8152449811552958,
"grad_norm": 1.9247820377349854,
"learning_rate": 1.9110326826219623e-05,
"loss": 1.4513,
"step": 23600
},
{
"epoch": 1.819090839166218,
"grad_norm": 2.794621229171753,
"learning_rate": 1.8852983346200506e-05,
"loss": 1.4381,
"step": 23650
},
{
"epoch": 1.8229366971771404,
"grad_norm": 1.3829151391983032,
"learning_rate": 1.859563986618139e-05,
"loss": 1.4344,
"step": 23700
},
{
"epoch": 1.8267825551880623,
"grad_norm": 1.8067855834960938,
"learning_rate": 1.8338296386162273e-05,
"loss": 1.4337,
"step": 23750
},
{
"epoch": 1.8267825551880623,
"eval_loss": 1.4543312788009644,
"eval_runtime": 18.2116,
"eval_samples_per_second": 54.91,
"eval_steps_per_second": 13.727,
"step": 23750
},
{
"epoch": 1.8306284131989847,
"grad_norm": 1.829542875289917,
"learning_rate": 1.8080952906143156e-05,
"loss": 1.3986,
"step": 23800
},
{
"epoch": 1.834474271209907,
"grad_norm": 1.8767279386520386,
"learning_rate": 1.782360942612404e-05,
"loss": 1.4873,
"step": 23850
},
{
"epoch": 1.838320129220829,
"grad_norm": 0.9735344052314758,
"learning_rate": 1.7566265946104922e-05,
"loss": 1.4105,
"step": 23900
},
{
"epoch": 1.8421659872317515,
"grad_norm": 1.5424654483795166,
"learning_rate": 1.7308922466085805e-05,
"loss": 1.4357,
"step": 23950
},
{
"epoch": 1.8460118452426735,
"grad_norm": 0.9316624999046326,
"learning_rate": 1.7051578986066688e-05,
"loss": 1.4616,
"step": 24000
},
{
"epoch": 1.8460118452426735,
"eval_loss": 1.4611330032348633,
"eval_runtime": 18.0655,
"eval_samples_per_second": 55.354,
"eval_steps_per_second": 13.839,
"step": 24000
},
{
"epoch": 1.8498577032535959,
"grad_norm": 1.3933135271072388,
"learning_rate": 1.679423550604757e-05,
"loss": 1.45,
"step": 24050
},
{
"epoch": 1.853703561264518,
"grad_norm": 1.1157580614089966,
"learning_rate": 1.6536892026028454e-05,
"loss": 1.4916,
"step": 24100
},
{
"epoch": 1.8575494192754403,
"grad_norm": 1.7401970624923706,
"learning_rate": 1.6279548546009337e-05,
"loss": 1.4563,
"step": 24150
},
{
"epoch": 1.8613952772863627,
"grad_norm": 1.4699925184249878,
"learning_rate": 1.602220506599022e-05,
"loss": 1.4211,
"step": 24200
},
{
"epoch": 1.8652411352972849,
"grad_norm": 1.1760289669036865,
"learning_rate": 1.5764861585971103e-05,
"loss": 1.4212,
"step": 24250
},
{
"epoch": 1.8652411352972849,
"eval_loss": 1.460072636604309,
"eval_runtime": 17.8176,
"eval_samples_per_second": 56.124,
"eval_steps_per_second": 14.031,
"step": 24250
},
{
"epoch": 1.869086993308207,
"grad_norm": 1.8243287801742554,
"learning_rate": 1.5507518105951986e-05,
"loss": 1.4594,
"step": 24300
},
{
"epoch": 1.8729328513191295,
"grad_norm": 0.8821312785148621,
"learning_rate": 1.5250174625932868e-05,
"loss": 1.3837,
"step": 24350
},
{
"epoch": 1.8767787093300514,
"grad_norm": 1.673240065574646,
"learning_rate": 1.4992831145913753e-05,
"loss": 1.395,
"step": 24400
},
{
"epoch": 1.8806245673409738,
"grad_norm": 1.4853135347366333,
"learning_rate": 1.4735487665894636e-05,
"loss": 1.5031,
"step": 24450
},
{
"epoch": 1.884470425351896,
"grad_norm": 2.507054567337036,
"learning_rate": 1.4478144185875517e-05,
"loss": 1.3909,
"step": 24500
},
{
"epoch": 1.884470425351896,
"eval_loss": 1.4431298971176147,
"eval_runtime": 17.9815,
"eval_samples_per_second": 55.613,
"eval_steps_per_second": 13.903,
"step": 24500
},
{
"epoch": 1.8883162833628182,
"grad_norm": 1.8027464151382446,
"learning_rate": 1.4220800705856402e-05,
"loss": 1.4855,
"step": 24550
},
{
"epoch": 1.8921621413737406,
"grad_norm": 1.139756679534912,
"learning_rate": 1.3963457225837285e-05,
"loss": 1.3773,
"step": 24600
},
{
"epoch": 1.8960079993846626,
"grad_norm": 1.377536654472351,
"learning_rate": 1.3706113745818166e-05,
"loss": 1.4274,
"step": 24650
},
{
"epoch": 1.899853857395585,
"grad_norm": 1.2132219076156616,
"learning_rate": 1.3448770265799051e-05,
"loss": 1.3772,
"step": 24700
},
{
"epoch": 1.9036997154065072,
"grad_norm": 1.7106857299804688,
"learning_rate": 1.3191426785779932e-05,
"loss": 1.41,
"step": 24750
},
{
"epoch": 1.9036997154065072,
"eval_loss": 1.472328782081604,
"eval_runtime": 18.0789,
"eval_samples_per_second": 55.313,
"eval_steps_per_second": 13.828,
"step": 24750
},
{
"epoch": 1.9075455734174294,
"grad_norm": 0.9809736013412476,
"learning_rate": 1.2939230175361197e-05,
"loss": 1.4547,
"step": 24800
},
{
"epoch": 1.9113914314283518,
"grad_norm": 1.476722240447998,
"learning_rate": 1.2681886695342082e-05,
"loss": 1.4546,
"step": 24850
},
{
"epoch": 1.9152372894392737,
"grad_norm": 2.078511953353882,
"learning_rate": 1.2424543215322965e-05,
"loss": 1.4971,
"step": 24900
},
{
"epoch": 1.9190831474501961,
"grad_norm": 0.7233028411865234,
"learning_rate": 1.2167199735303847e-05,
"loss": 1.3622,
"step": 24950
},
{
"epoch": 1.9229290054611183,
"grad_norm": 1.3686310052871704,
"learning_rate": 1.1909856255284731e-05,
"loss": 1.5232,
"step": 25000
},
{
"epoch": 1.9229290054611183,
"eval_loss": 1.461082935333252,
"eval_runtime": 18.2695,
"eval_samples_per_second": 54.736,
"eval_steps_per_second": 13.684,
"step": 25000
},
{
"epoch": 1.9267748634720405,
"grad_norm": 1.1179672479629517,
"learning_rate": 1.1652512775265614e-05,
"loss": 1.5076,
"step": 25050
},
{
"epoch": 1.930620721482963,
"grad_norm": 0.9407248497009277,
"learning_rate": 1.1395169295246496e-05,
"loss": 1.468,
"step": 25100
},
{
"epoch": 1.9344665794938851,
"grad_norm": 1.498488426208496,
"learning_rate": 1.113782581522738e-05,
"loss": 1.4566,
"step": 25150
},
{
"epoch": 1.9383124375048073,
"grad_norm": 0.6983101963996887,
"learning_rate": 1.0880482335208264e-05,
"loss": 1.4621,
"step": 25200
},
{
"epoch": 1.9421582955157297,
"grad_norm": 1.954953908920288,
"learning_rate": 1.0623138855189145e-05,
"loss": 1.417,
"step": 25250
},
{
"epoch": 1.9421582955157297,
"eval_loss": 1.4591727256774902,
"eval_runtime": 18.0732,
"eval_samples_per_second": 55.331,
"eval_steps_per_second": 13.833,
"step": 25250
},
{
"epoch": 1.9460041535266517,
"grad_norm": 1.6467170715332031,
"learning_rate": 1.036579537517003e-05,
"loss": 1.4942,
"step": 25300
},
{
"epoch": 1.949850011537574,
"grad_norm": 1.4509849548339844,
"learning_rate": 1.0108451895150913e-05,
"loss": 1.4539,
"step": 25350
},
{
"epoch": 1.9536958695484963,
"grad_norm": 1.6131352186203003,
"learning_rate": 9.851108415131796e-06,
"loss": 1.3993,
"step": 25400
},
{
"epoch": 1.9575417275594185,
"grad_norm": 1.880043387413025,
"learning_rate": 9.593764935112679e-06,
"loss": 1.4449,
"step": 25450
},
{
"epoch": 1.9613875855703409,
"grad_norm": 1.3041406869888306,
"learning_rate": 9.336421455093562e-06,
"loss": 1.4918,
"step": 25500
},
{
"epoch": 1.9613875855703409,
"eval_loss": 1.4548134803771973,
"eval_runtime": 18.0544,
"eval_samples_per_second": 55.388,
"eval_steps_per_second": 13.847,
"step": 25500
},
{
"epoch": 1.9652334435812628,
"grad_norm": 1.8318700790405273,
"learning_rate": 9.079077975074445e-06,
"loss": 1.42,
"step": 25550
},
{
"epoch": 1.9690793015921852,
"grad_norm": 1.7966841459274292,
"learning_rate": 8.821734495055328e-06,
"loss": 1.3236,
"step": 25600
},
{
"epoch": 1.9729251596031074,
"grad_norm": 0.7579635977745056,
"learning_rate": 8.564391015036211e-06,
"loss": 1.3957,
"step": 25650
},
{
"epoch": 1.9767710176140296,
"grad_norm": 1.4515990018844604,
"learning_rate": 8.307047535017094e-06,
"loss": 1.3347,
"step": 25700
},
{
"epoch": 1.980616875624952,
"grad_norm": 1.5671380758285522,
"learning_rate": 8.049704054997977e-06,
"loss": 1.4624,
"step": 25750
},
{
"epoch": 1.980616875624952,
"eval_loss": 1.450337290763855,
"eval_runtime": 17.9548,
"eval_samples_per_second": 55.695,
"eval_steps_per_second": 13.924,
"step": 25750
},
{
"epoch": 1.984462733635874,
"grad_norm": 1.7020714282989502,
"learning_rate": 7.79236057497886e-06,
"loss": 1.3822,
"step": 25800
},
{
"epoch": 1.9883085916467964,
"grad_norm": 1.297658920288086,
"learning_rate": 7.535017094959743e-06,
"loss": 1.4008,
"step": 25850
},
{
"epoch": 1.9921544496577186,
"grad_norm": 1.8151623010635376,
"learning_rate": 7.277673614940627e-06,
"loss": 1.4408,
"step": 25900
},
{
"epoch": 1.9960003076686408,
"grad_norm": 0.8869682550430298,
"learning_rate": 7.02033013492151e-06,
"loss": 1.4767,
"step": 25950
},
{
"epoch": 1.9998461656795632,
"grad_norm": 1.898775339126587,
"learning_rate": 6.762986654902392e-06,
"loss": 1.5032,
"step": 26000
},
{
"epoch": 1.9998461656795632,
"eval_loss": 1.4542045593261719,
"eval_runtime": 18.0059,
"eval_samples_per_second": 55.537,
"eval_steps_per_second": 13.884,
"step": 26000
},
{
"epoch": 2.003692023690485,
"grad_norm": 1.7356750965118408,
"learning_rate": 6.505643174883276e-06,
"loss": 1.3839,
"step": 26050
},
{
"epoch": 2.0075378817014076,
"grad_norm": 2.3067352771759033,
"learning_rate": 6.248299694864159e-06,
"loss": 1.4348,
"step": 26100
},
{
"epoch": 2.01138373971233,
"grad_norm": 1.343248724937439,
"learning_rate": 5.990956214845041e-06,
"loss": 1.3703,
"step": 26150
},
{
"epoch": 2.015229597723252,
"grad_norm": 1.9424471855163574,
"learning_rate": 5.733612734825925e-06,
"loss": 1.4304,
"step": 26200
},
{
"epoch": 2.0190754557341744,
"grad_norm": 1.5383673906326294,
"learning_rate": 5.476269254806808e-06,
"loss": 1.4118,
"step": 26250
},
{
"epoch": 2.0190754557341744,
"eval_loss": 1.474881649017334,
"eval_runtime": 18.1751,
"eval_samples_per_second": 55.02,
"eval_steps_per_second": 13.755,
"step": 26250
},
{
"epoch": 2.0229213137450963,
"grad_norm": 1.803488850593567,
"learning_rate": 5.2189257747876905e-06,
"loss": 1.4537,
"step": 26300
},
{
"epoch": 2.0267671717560187,
"grad_norm": 1.8623336553573608,
"learning_rate": 4.961582294768574e-06,
"loss": 1.3659,
"step": 26350
},
{
"epoch": 2.030613029766941,
"grad_norm": 1.1901572942733765,
"learning_rate": 4.7042388147494575e-06,
"loss": 1.4175,
"step": 26400
},
{
"epoch": 2.034458887777863,
"grad_norm": 1.2967520952224731,
"learning_rate": 4.4468953347303406e-06,
"loss": 1.458,
"step": 26450
},
{
"epoch": 2.0383047457887855,
"grad_norm": 1.2987436056137085,
"learning_rate": 4.189551854711224e-06,
"loss": 1.3965,
"step": 26500
},
{
"epoch": 2.0383047457887855,
"eval_loss": 1.4528058767318726,
"eval_runtime": 18.2495,
"eval_samples_per_second": 54.796,
"eval_steps_per_second": 13.699,
"step": 26500
},
{
"epoch": 2.042150603799708,
"grad_norm": 1.0049172639846802,
"learning_rate": 3.932208374692107e-06,
"loss": 1.3012,
"step": 26550
},
{
"epoch": 2.04599646181063,
"grad_norm": 1.193533182144165,
"learning_rate": 3.6748648946729894e-06,
"loss": 1.4038,
"step": 26600
},
{
"epoch": 2.0498423198215523,
"grad_norm": 1.6459178924560547,
"learning_rate": 3.417521414653873e-06,
"loss": 1.4089,
"step": 26650
},
{
"epoch": 2.0536881778324743,
"grad_norm": 0.546062171459198,
"learning_rate": 3.160177934634756e-06,
"loss": 1.3675,
"step": 26700
},
{
"epoch": 2.0575340358433967,
"grad_norm": 1.7894645929336548,
"learning_rate": 2.9028344546156386e-06,
"loss": 1.4585,
"step": 26750
},
{
"epoch": 2.0575340358433967,
"eval_loss": 1.460014820098877,
"eval_runtime": 18.2356,
"eval_samples_per_second": 54.838,
"eval_steps_per_second": 13.709,
"step": 26750
},
{
"epoch": 2.061379893854319,
"grad_norm": 1.1368170976638794,
"learning_rate": 2.645490974596522e-06,
"loss": 1.4038,
"step": 26800
},
{
"epoch": 2.065225751865241,
"grad_norm": 1.698556900024414,
"learning_rate": 2.388147494577405e-06,
"loss": 1.4592,
"step": 26850
},
{
"epoch": 2.0690716098761635,
"grad_norm": 1.3114346265792847,
"learning_rate": 2.130804014558288e-06,
"loss": 1.4566,
"step": 26900
},
{
"epoch": 2.0729174678870854,
"grad_norm": 1.7974728345870972,
"learning_rate": 1.8734605345391713e-06,
"loss": 1.5074,
"step": 26950
},
{
"epoch": 2.076763325898008,
"grad_norm": 1.4648147821426392,
"learning_rate": 1.6161170545200544e-06,
"loss": 1.4478,
"step": 27000
},
{
"epoch": 2.076763325898008,
"eval_loss": 1.4667593240737915,
"eval_runtime": 18.1467,
"eval_samples_per_second": 55.107,
"eval_steps_per_second": 13.777,
"step": 27000
},
{
"epoch": 2.0806091839089302,
"grad_norm": 0.9924139380455017,
"learning_rate": 1.3587735745009373e-06,
"loss": 1.5088,
"step": 27050
},
{
"epoch": 2.084455041919852,
"grad_norm": 1.1177709102630615,
"learning_rate": 1.1014300944818204e-06,
"loss": 1.4285,
"step": 27100
},
{
"epoch": 2.0883008999307746,
"grad_norm": 1.7112759351730347,
"learning_rate": 8.440866144627034e-07,
"loss": 1.433,
"step": 27150
},
{
"epoch": 2.0921467579416966,
"grad_norm": 1.9338856935501099,
"learning_rate": 5.867431344435866e-07,
"loss": 1.4008,
"step": 27200
},
{
"epoch": 2.095992615952619,
"grad_norm": 3.0200393199920654,
"learning_rate": 3.2939965442446964e-07,
"loss": 1.4285,
"step": 27250
},
{
"epoch": 2.095992615952619,
"eval_loss": 1.4686814546585083,
"eval_runtime": 18.028,
"eval_samples_per_second": 55.469,
"eval_steps_per_second": 13.867,
"step": 27250
},
{
"epoch": 2.0998384739635414,
"grad_norm": 1.5137439966201782,
"learning_rate": 7.205617440535274e-08,
"loss": 1.4596,
"step": 27300
}
],
"logging_steps": 50,
"max_steps": 27301,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}