|
{ |
|
"best_global_step": 24500, |
|
"best_metric": 1.4431298971176147, |
|
"best_model_checkpoint": "./ar-diffusion-checkpoints/checkpoint-24500", |
|
"epoch": 2.09991539112376, |
|
"eval_steps": 250, |
|
"global_step": 27301, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003845858010922237, |
|
"grad_norm": 8.077690124511719, |
|
"learning_rate": 6.579999999999999e-05, |
|
"loss": 10.7559, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007691716021844474, |
|
"grad_norm": 7.270859241485596, |
|
"learning_rate": 0.00013299999999999998, |
|
"loss": 6.4993, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01153757403276671, |
|
"grad_norm": 6.350255012512207, |
|
"learning_rate": 0.00013976839086798278, |
|
"loss": 5.8214, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015383432043688947, |
|
"grad_norm": 5.809306621551514, |
|
"learning_rate": 0.00013951104738796366, |
|
"loss": 5.3663, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.019229290054611183, |
|
"grad_norm": 3.9576303958892822, |
|
"learning_rate": 0.00013925370390794456, |
|
"loss": 5.3697, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.019229290054611183, |
|
"eval_loss": 5.2464423179626465, |
|
"eval_runtime": 18.6939, |
|
"eval_samples_per_second": 53.493, |
|
"eval_steps_per_second": 13.373, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02307514806553342, |
|
"grad_norm": 5.081186771392822, |
|
"learning_rate": 0.00013899636042792544, |
|
"loss": 5.0419, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02692100607645566, |
|
"grad_norm": 5.957707405090332, |
|
"learning_rate": 0.0001387390169479063, |
|
"loss": 4.8305, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.030766864087377895, |
|
"grad_norm": 3.9519667625427246, |
|
"learning_rate": 0.0001384816734678872, |
|
"loss": 5.1118, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03461272209830013, |
|
"grad_norm": 2.498075485229492, |
|
"learning_rate": 0.00013822432998786808, |
|
"loss": 5.0262, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.038458580109222366, |
|
"grad_norm": 4.084473609924316, |
|
"learning_rate": 0.00013796698650784896, |
|
"loss": 5.0738, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.038458580109222366, |
|
"eval_loss": 4.9789862632751465, |
|
"eval_runtime": 18.8768, |
|
"eval_samples_per_second": 52.975, |
|
"eval_steps_per_second": 13.244, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0423044381201446, |
|
"grad_norm": 6.3689374923706055, |
|
"learning_rate": 0.00013771478989743022, |
|
"loss": 4.9228, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04615029613106684, |
|
"grad_norm": 3.9407873153686523, |
|
"learning_rate": 0.0001374574464174111, |
|
"loss": 4.976, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04999615414198908, |
|
"grad_norm": 4.298041343688965, |
|
"learning_rate": 0.00013720010293739198, |
|
"loss": 4.6802, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.05384201215291132, |
|
"grad_norm": 3.756016492843628, |
|
"learning_rate": 0.0001369427594573729, |
|
"loss": 4.7095, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.05768787016383355, |
|
"grad_norm": 4.344913959503174, |
|
"learning_rate": 0.00013668541597735377, |
|
"loss": 4.8664, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.05768787016383355, |
|
"eval_loss": 4.762838363647461, |
|
"eval_runtime": 18.772, |
|
"eval_samples_per_second": 53.271, |
|
"eval_steps_per_second": 13.318, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06153372817475579, |
|
"grad_norm": 4.1537275314331055, |
|
"learning_rate": 0.00013642807249733465, |
|
"loss": 4.9688, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.06537958618567802, |
|
"grad_norm": 4.85400915145874, |
|
"learning_rate": 0.00013617072901731553, |
|
"loss": 4.8658, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.06922544419660026, |
|
"grad_norm": 4.026614189147949, |
|
"learning_rate": 0.0001359133855372964, |
|
"loss": 4.893, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0730713022075225, |
|
"grad_norm": 3.84721040725708, |
|
"learning_rate": 0.0001356560420572773, |
|
"loss": 4.6926, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.07691716021844473, |
|
"grad_norm": 9.182045936584473, |
|
"learning_rate": 0.00013539869857725817, |
|
"loss": 4.881, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07691716021844473, |
|
"eval_loss": 4.709664344787598, |
|
"eval_runtime": 18.8053, |
|
"eval_samples_per_second": 53.177, |
|
"eval_steps_per_second": 13.294, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08076301822936698, |
|
"grad_norm": 5.442048072814941, |
|
"learning_rate": 0.00013514135509723907, |
|
"loss": 4.6134, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.0846088762402892, |
|
"grad_norm": 4.779583930969238, |
|
"learning_rate": 0.00013488401161721995, |
|
"loss": 4.7226, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.08845473425121145, |
|
"grad_norm": 3.221238851547241, |
|
"learning_rate": 0.0001346266681372008, |
|
"loss": 4.6837, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.09230059226213368, |
|
"grad_norm": 5.55983304977417, |
|
"learning_rate": 0.0001343693246571817, |
|
"loss": 4.672, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09614645027305592, |
|
"grad_norm": 6.964417934417725, |
|
"learning_rate": 0.0001341119811771626, |
|
"loss": 4.9043, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.09614645027305592, |
|
"eval_loss": 4.7052001953125, |
|
"eval_runtime": 18.9307, |
|
"eval_samples_per_second": 52.824, |
|
"eval_steps_per_second": 13.206, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.09999230828397816, |
|
"grad_norm": 7.476005554199219, |
|
"learning_rate": 0.00013385463769714347, |
|
"loss": 4.7776, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.10383816629490039, |
|
"grad_norm": 3.4916040897369385, |
|
"learning_rate": 0.00013359729421712435, |
|
"loss": 4.7738, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.10768402430582263, |
|
"grad_norm": 4.028671741485596, |
|
"learning_rate": 0.00013333995073710526, |
|
"loss": 4.6459, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.11152988231674486, |
|
"grad_norm": 4.597095489501953, |
|
"learning_rate": 0.0001330826072570861, |
|
"loss": 4.6778, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.1153757403276671, |
|
"grad_norm": 5.779391288757324, |
|
"learning_rate": 0.000132825263777067, |
|
"loss": 4.7938, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1153757403276671, |
|
"eval_loss": 4.696172714233398, |
|
"eval_runtime": 18.8705, |
|
"eval_samples_per_second": 52.993, |
|
"eval_steps_per_second": 13.248, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11922159833858934, |
|
"grad_norm": 3.801748752593994, |
|
"learning_rate": 0.0001325679202970479, |
|
"loss": 4.7912, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.12306745634951158, |
|
"grad_norm": 8.367344856262207, |
|
"learning_rate": 0.00013231057681702878, |
|
"loss": 4.7281, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.12691331436043382, |
|
"grad_norm": 4.299734592437744, |
|
"learning_rate": 0.00013205323333700966, |
|
"loss": 4.7263, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.13075917237135604, |
|
"grad_norm": 6.152933597564697, |
|
"learning_rate": 0.00013179588985699054, |
|
"loss": 4.8519, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.13460503038227828, |
|
"grad_norm": 4.300355434417725, |
|
"learning_rate": 0.00013153854637697142, |
|
"loss": 4.8359, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.13460503038227828, |
|
"eval_loss": 4.635708808898926, |
|
"eval_runtime": 18.5455, |
|
"eval_samples_per_second": 53.922, |
|
"eval_steps_per_second": 13.48, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.13845088839320052, |
|
"grad_norm": 2.1330080032348633, |
|
"learning_rate": 0.0001312812028969523, |
|
"loss": 4.807, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.14229674640412276, |
|
"grad_norm": 4.667717456817627, |
|
"learning_rate": 0.00013102385941693318, |
|
"loss": 4.6633, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.146142604415045, |
|
"grad_norm": 6.904145240783691, |
|
"learning_rate": 0.00013076651593691408, |
|
"loss": 4.7899, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.14998846242596722, |
|
"grad_norm": 2.930926561355591, |
|
"learning_rate": 0.00013050917245689496, |
|
"loss": 4.6692, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.15383432043688947, |
|
"grad_norm": 3.6246345043182373, |
|
"learning_rate": 0.00013025182897687584, |
|
"loss": 4.781, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.15383432043688947, |
|
"eval_loss": 4.620576858520508, |
|
"eval_runtime": 18.7692, |
|
"eval_samples_per_second": 53.279, |
|
"eval_steps_per_second": 13.32, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1576801784478117, |
|
"grad_norm": 3.5292210578918457, |
|
"learning_rate": 0.00012999448549685672, |
|
"loss": 4.7815, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.16152603645873395, |
|
"grad_norm": 4.665738105773926, |
|
"learning_rate": 0.0001297371420168376, |
|
"loss": 4.6789, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.16537189446965617, |
|
"grad_norm": 4.332949161529541, |
|
"learning_rate": 0.00012947979853681848, |
|
"loss": 4.5991, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.1692177524805784, |
|
"grad_norm": 3.8279120922088623, |
|
"learning_rate": 0.00012922245505679936, |
|
"loss": 4.5791, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.17306361049150065, |
|
"grad_norm": 1.9522042274475098, |
|
"learning_rate": 0.00012896511157678027, |
|
"loss": 4.5643, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.17306361049150065, |
|
"eval_loss": 4.609655857086182, |
|
"eval_runtime": 18.946, |
|
"eval_samples_per_second": 52.782, |
|
"eval_steps_per_second": 13.195, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1769094685024229, |
|
"grad_norm": 4.264033794403076, |
|
"learning_rate": 0.00012870776809676115, |
|
"loss": 4.6666, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.18075532651334514, |
|
"grad_norm": 4.572433948516846, |
|
"learning_rate": 0.000128450424616742, |
|
"loss": 4.6096, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.18460118452426735, |
|
"grad_norm": 3.8559391498565674, |
|
"learning_rate": 0.0001281930811367229, |
|
"loss": 4.6425, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1884470425351896, |
|
"grad_norm": 2.9414010047912598, |
|
"learning_rate": 0.0001279357376567038, |
|
"loss": 4.6336, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.19229290054611184, |
|
"grad_norm": 4.745160102844238, |
|
"learning_rate": 0.00012767839417668467, |
|
"loss": 4.6792, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.19229290054611184, |
|
"eval_loss": 4.558788776397705, |
|
"eval_runtime": 18.9882, |
|
"eval_samples_per_second": 52.664, |
|
"eval_steps_per_second": 13.166, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.19613875855703408, |
|
"grad_norm": 2.456908702850342, |
|
"learning_rate": 0.00012742105069666555, |
|
"loss": 4.3847, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.19998461656795632, |
|
"grad_norm": 5.154629707336426, |
|
"learning_rate": 0.00012716370721664645, |
|
"loss": 4.6019, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.20383047457887854, |
|
"grad_norm": 3.0423479080200195, |
|
"learning_rate": 0.0001269063637366273, |
|
"loss": 4.4796, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.20767633258980078, |
|
"grad_norm": 4.218437194824219, |
|
"learning_rate": 0.00012664902025660819, |
|
"loss": 4.5566, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.21152219060072303, |
|
"grad_norm": 5.20380163192749, |
|
"learning_rate": 0.0001263916767765891, |
|
"loss": 4.3311, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.21152219060072303, |
|
"eval_loss": 4.574987888336182, |
|
"eval_runtime": 18.8565, |
|
"eval_samples_per_second": 53.032, |
|
"eval_steps_per_second": 13.258, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.21536804861164527, |
|
"grad_norm": 4.369246482849121, |
|
"learning_rate": 0.00012613433329656997, |
|
"loss": 4.4131, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.21921390662256748, |
|
"grad_norm": 5.0442376136779785, |
|
"learning_rate": 0.00012587698981655085, |
|
"loss": 4.5027, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.22305976463348973, |
|
"grad_norm": 3.6387200355529785, |
|
"learning_rate": 0.00012561964633653173, |
|
"loss": 4.6659, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.22690562264441197, |
|
"grad_norm": 3.7960562705993652, |
|
"learning_rate": 0.0001253623028565126, |
|
"loss": 4.4826, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.2307514806553342, |
|
"grad_norm": 4.273965835571289, |
|
"learning_rate": 0.0001251049593764935, |
|
"loss": 4.5869, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2307514806553342, |
|
"eval_loss": 4.55267858505249, |
|
"eval_runtime": 18.9735, |
|
"eval_samples_per_second": 52.705, |
|
"eval_steps_per_second": 13.176, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.23459733866625646, |
|
"grad_norm": 4.74845027923584, |
|
"learning_rate": 0.00012484761589647437, |
|
"loss": 4.6248, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.23844319667717867, |
|
"grad_norm": 6.299524784088135, |
|
"learning_rate": 0.00012459027241645528, |
|
"loss": 4.5457, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2422890546881009, |
|
"grad_norm": 5.853606700897217, |
|
"learning_rate": 0.00012433292893643616, |
|
"loss": 4.5135, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.24613491269902316, |
|
"grad_norm": 3.1514365673065186, |
|
"learning_rate": 0.00012407558545641704, |
|
"loss": 4.672, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2499807707099454, |
|
"grad_norm": 8.455827713012695, |
|
"learning_rate": 0.00012381824197639792, |
|
"loss": 4.6545, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.2499807707099454, |
|
"eval_loss": 4.550297737121582, |
|
"eval_runtime": 18.9801, |
|
"eval_samples_per_second": 52.687, |
|
"eval_steps_per_second": 13.172, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.25382662872086764, |
|
"grad_norm": 2.8094310760498047, |
|
"learning_rate": 0.0001235608984963788, |
|
"loss": 4.5392, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2576724867317899, |
|
"grad_norm": 3.2565436363220215, |
|
"learning_rate": 0.00012330355501635968, |
|
"loss": 4.481, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.26151834474271207, |
|
"grad_norm": 3.5588488578796387, |
|
"learning_rate": 0.00012304621153634056, |
|
"loss": 4.5543, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2653642027536343, |
|
"grad_norm": 3.0696310997009277, |
|
"learning_rate": 0.00012278886805632146, |
|
"loss": 4.5858, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.26921006076455656, |
|
"grad_norm": 3.886117935180664, |
|
"learning_rate": 0.00012253152457630234, |
|
"loss": 4.4694, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.26921006076455656, |
|
"eval_loss": 4.488556861877441, |
|
"eval_runtime": 18.9212, |
|
"eval_samples_per_second": 52.851, |
|
"eval_steps_per_second": 13.213, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2730559187754788, |
|
"grad_norm": 3.794307231903076, |
|
"learning_rate": 0.00012227418109628322, |
|
"loss": 4.4994, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.27690177678640104, |
|
"grad_norm": 3.5770812034606934, |
|
"learning_rate": 0.00012201683761626409, |
|
"loss": 4.5888, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2807476347973233, |
|
"grad_norm": 4.770874500274658, |
|
"learning_rate": 0.00012175949413624498, |
|
"loss": 4.5644, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.28459349280824553, |
|
"grad_norm": 3.4447147846221924, |
|
"learning_rate": 0.00012150215065622586, |
|
"loss": 4.5301, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.2884393508191678, |
|
"grad_norm": 4.76978063583374, |
|
"learning_rate": 0.00012124480717620675, |
|
"loss": 4.5563, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.2884393508191678, |
|
"eval_loss": 4.53049898147583, |
|
"eval_runtime": 18.9074, |
|
"eval_samples_per_second": 52.889, |
|
"eval_steps_per_second": 13.222, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.29228520883009, |
|
"grad_norm": 5.7456512451171875, |
|
"learning_rate": 0.00012098746369618763, |
|
"loss": 4.5612, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.2961310668410122, |
|
"grad_norm": 5.577849864959717, |
|
"learning_rate": 0.00012073012021616851, |
|
"loss": 4.4629, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.29997692485193445, |
|
"grad_norm": 4.432284832000732, |
|
"learning_rate": 0.00012047277673614939, |
|
"loss": 4.6661, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3038227828628567, |
|
"grad_norm": 5.174475193023682, |
|
"learning_rate": 0.00012021543325613027, |
|
"loss": 4.4835, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.30766864087377893, |
|
"grad_norm": 3.5657413005828857, |
|
"learning_rate": 0.00011995808977611117, |
|
"loss": 4.4894, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.30766864087377893, |
|
"eval_loss": 4.5068535804748535, |
|
"eval_runtime": 18.7951, |
|
"eval_samples_per_second": 53.205, |
|
"eval_steps_per_second": 13.301, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3115144988847012, |
|
"grad_norm": 3.854024648666382, |
|
"learning_rate": 0.00011970074629609205, |
|
"loss": 4.4989, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.3153603568956234, |
|
"grad_norm": 4.0870490074157715, |
|
"learning_rate": 0.00011944340281607294, |
|
"loss": 4.3779, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.31920621490654566, |
|
"grad_norm": 4.4627251625061035, |
|
"learning_rate": 0.0001191860593360538, |
|
"loss": 4.5526, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3230520729174679, |
|
"grad_norm": 7.568991184234619, |
|
"learning_rate": 0.00011892871585603468, |
|
"loss": 4.6285, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.32689793092839015, |
|
"grad_norm": 4.214425086975098, |
|
"learning_rate": 0.00011867137237601558, |
|
"loss": 4.5328, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.32689793092839015, |
|
"eval_loss": 4.511099815368652, |
|
"eval_runtime": 18.7154, |
|
"eval_samples_per_second": 53.432, |
|
"eval_steps_per_second": 13.358, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.33074378893931233, |
|
"grad_norm": 2.3888497352600098, |
|
"learning_rate": 0.00011841402889599646, |
|
"loss": 4.5408, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.3345896469502346, |
|
"grad_norm": 3.128143548965454, |
|
"learning_rate": 0.00011815668541597735, |
|
"loss": 4.3879, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3384355049611568, |
|
"grad_norm": 4.353067874908447, |
|
"learning_rate": 0.00011789934193595823, |
|
"loss": 4.5091, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.34228136297207906, |
|
"grad_norm": 4.771759986877441, |
|
"learning_rate": 0.00011764199845593911, |
|
"loss": 4.407, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.3461272209830013, |
|
"grad_norm": 2.9524829387664795, |
|
"learning_rate": 0.00011738465497591999, |
|
"loss": 4.3798, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3461272209830013, |
|
"eval_loss": 4.479401588439941, |
|
"eval_runtime": 18.8172, |
|
"eval_samples_per_second": 53.143, |
|
"eval_steps_per_second": 13.286, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.34997307899392355, |
|
"grad_norm": 4.825377941131592, |
|
"learning_rate": 0.00011712731149590087, |
|
"loss": 4.5321, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.3538189370048458, |
|
"grad_norm": 3.5786240100860596, |
|
"learning_rate": 0.00011686996801588176, |
|
"loss": 4.5819, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.35766479501576803, |
|
"grad_norm": 4.445742130279541, |
|
"learning_rate": 0.00011661262453586264, |
|
"loss": 4.5954, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.3615106530266903, |
|
"grad_norm": 4.670301914215088, |
|
"learning_rate": 0.00011635528105584354, |
|
"loss": 4.3381, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.36535651103761246, |
|
"grad_norm": 3.0563037395477295, |
|
"learning_rate": 0.0001160979375758244, |
|
"loss": 4.4451, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.36535651103761246, |
|
"eval_loss": 4.503940582275391, |
|
"eval_runtime": 19.0274, |
|
"eval_samples_per_second": 52.556, |
|
"eval_steps_per_second": 13.139, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.3692023690485347, |
|
"grad_norm": 4.921920299530029, |
|
"learning_rate": 0.00011584059409580528, |
|
"loss": 4.5505, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.37304822705945695, |
|
"grad_norm": 4.440188407897949, |
|
"learning_rate": 0.00011558325061578617, |
|
"loss": 4.5339, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.3768940850703792, |
|
"grad_norm": 4.123379707336426, |
|
"learning_rate": 0.00011532590713576705, |
|
"loss": 4.5001, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.38073994308130144, |
|
"grad_norm": 3.6461265087127686, |
|
"learning_rate": 0.00011506856365574795, |
|
"loss": 4.4704, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.3845858010922237, |
|
"grad_norm": 4.586422443389893, |
|
"learning_rate": 0.00011481122017572883, |
|
"loss": 4.5607, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3845858010922237, |
|
"eval_loss": 4.414160251617432, |
|
"eval_runtime": 18.6554, |
|
"eval_samples_per_second": 53.604, |
|
"eval_steps_per_second": 13.401, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3884316591031459, |
|
"grad_norm": 2.658412456512451, |
|
"learning_rate": 0.00011455387669570971, |
|
"loss": 4.5453, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.39227751711406816, |
|
"grad_norm": 2.231886148452759, |
|
"learning_rate": 0.00011429653321569059, |
|
"loss": 4.5524, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.3961233751249904, |
|
"grad_norm": 4.202503204345703, |
|
"learning_rate": 0.00011403918973567147, |
|
"loss": 4.5274, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.39996923313591265, |
|
"grad_norm": 2.8525800704956055, |
|
"learning_rate": 0.00011378184625565236, |
|
"loss": 4.5095, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.40381509114683484, |
|
"grad_norm": 3.2517142295837402, |
|
"learning_rate": 0.00011352964964523362, |
|
"loss": 4.5043, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.40381509114683484, |
|
"eval_loss": 4.595612525939941, |
|
"eval_runtime": 18.9024, |
|
"eval_samples_per_second": 52.903, |
|
"eval_steps_per_second": 13.226, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.4076609491577571, |
|
"grad_norm": 5.091184616088867, |
|
"learning_rate": 0.00011327745303481488, |
|
"loss": 4.4768, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.4115068071686793, |
|
"grad_norm": 6.631587028503418, |
|
"learning_rate": 0.00011302010955479578, |
|
"loss": 4.5572, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.41535266517960157, |
|
"grad_norm": 3.529118299484253, |
|
"learning_rate": 0.00011276276607477666, |
|
"loss": 4.6685, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4191985231905238, |
|
"grad_norm": 3.1017537117004395, |
|
"learning_rate": 0.00011250542259475754, |
|
"loss": 4.4271, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.42304438120144605, |
|
"grad_norm": 3.930664300918579, |
|
"learning_rate": 0.00011224807911473842, |
|
"loss": 4.5501, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.42304438120144605, |
|
"eval_loss": 4.486245632171631, |
|
"eval_runtime": 18.9209, |
|
"eval_samples_per_second": 52.851, |
|
"eval_steps_per_second": 13.213, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4268902392123683, |
|
"grad_norm": 4.470078945159912, |
|
"learning_rate": 0.00011199073563471931, |
|
"loss": 4.543, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.43073609722329054, |
|
"grad_norm": 5.099395751953125, |
|
"learning_rate": 0.00011173339215470019, |
|
"loss": 4.4515, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.4345819552342128, |
|
"grad_norm": 3.210951805114746, |
|
"learning_rate": 0.00011147604867468107, |
|
"loss": 4.4605, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.43842781324513497, |
|
"grad_norm": 4.092874050140381, |
|
"learning_rate": 0.00011121870519466196, |
|
"loss": 4.6267, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.4422736712560572, |
|
"grad_norm": 2.756460666656494, |
|
"learning_rate": 0.00011096136171464283, |
|
"loss": 4.4338, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.4422736712560572, |
|
"eval_loss": 4.457804203033447, |
|
"eval_runtime": 18.7914, |
|
"eval_samples_per_second": 53.216, |
|
"eval_steps_per_second": 13.304, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.44611952926697945, |
|
"grad_norm": 5.140827178955078, |
|
"learning_rate": 0.00011070401823462372, |
|
"loss": 4.5102, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.4499653872779017, |
|
"grad_norm": 6.364997863769531, |
|
"learning_rate": 0.0001104466747546046, |
|
"loss": 4.5594, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.45381124528882394, |
|
"grad_norm": 5.3479695320129395, |
|
"learning_rate": 0.00011018933127458548, |
|
"loss": 4.4067, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.4576571032997462, |
|
"grad_norm": 3.728893518447876, |
|
"learning_rate": 0.00010993198779456637, |
|
"loss": 4.4689, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.4615029613106684, |
|
"grad_norm": 6.3881611824035645, |
|
"learning_rate": 0.00010967464431454724, |
|
"loss": 4.5641, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4615029613106684, |
|
"eval_loss": 4.457447052001953, |
|
"eval_runtime": 18.8382, |
|
"eval_samples_per_second": 53.084, |
|
"eval_steps_per_second": 13.271, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.46534881932159067, |
|
"grad_norm": 3.6767919063568115, |
|
"learning_rate": 0.00010941730083452813, |
|
"loss": 4.5798, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.4691946773325129, |
|
"grad_norm": 3.8597254753112793, |
|
"learning_rate": 0.00010915995735450901, |
|
"loss": 4.5867, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.4730405353434351, |
|
"grad_norm": 2.8041980266571045, |
|
"learning_rate": 0.0001089026138744899, |
|
"loss": 4.4825, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.47688639335435734, |
|
"grad_norm": 3.3872950077056885, |
|
"learning_rate": 0.00010864527039447078, |
|
"loss": 4.5624, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.4807322513652796, |
|
"grad_norm": 3.698118209838867, |
|
"learning_rate": 0.00010838792691445166, |
|
"loss": 4.4889, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.4807322513652796, |
|
"eval_loss": 4.451441287994385, |
|
"eval_runtime": 19.2349, |
|
"eval_samples_per_second": 51.989, |
|
"eval_steps_per_second": 12.997, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.4845781093762018, |
|
"grad_norm": 3.7140421867370605, |
|
"learning_rate": 0.00010813058343443254, |
|
"loss": 4.4654, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.48842396738712407, |
|
"grad_norm": 3.095348834991455, |
|
"learning_rate": 0.00010787323995441342, |
|
"loss": 4.4761, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.4922698253980463, |
|
"grad_norm": 3.289018392562866, |
|
"learning_rate": 0.00010761589647439432, |
|
"loss": 4.5459, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.49611568340896856, |
|
"grad_norm": 3.9891817569732666, |
|
"learning_rate": 0.0001073585529943752, |
|
"loss": 4.3685, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.4999615414198908, |
|
"grad_norm": 4.315449237823486, |
|
"learning_rate": 0.00010710120951435608, |
|
"loss": 4.4197, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.4999615414198908, |
|
"eval_loss": 4.4507598876953125, |
|
"eval_runtime": 18.8652, |
|
"eval_samples_per_second": 53.008, |
|
"eval_steps_per_second": 13.252, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.503807399430813, |
|
"grad_norm": 4.299264430999756, |
|
"learning_rate": 0.00010684386603433697, |
|
"loss": 4.6103, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.5076532574417353, |
|
"grad_norm": 4.186795234680176, |
|
"learning_rate": 0.00010659166942391823, |
|
"loss": 4.5303, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5114991154526575, |
|
"grad_norm": 2.925708293914795, |
|
"learning_rate": 0.00010633432594389911, |
|
"loss": 4.4265, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5153449734635798, |
|
"grad_norm": 6.368393421173096, |
|
"learning_rate": 0.00010607698246388, |
|
"loss": 4.3358, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.519190831474502, |
|
"grad_norm": 4.947482585906982, |
|
"learning_rate": 0.00010581963898386088, |
|
"loss": 4.5812, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.519190831474502, |
|
"eval_loss": 4.466405868530273, |
|
"eval_runtime": 18.8333, |
|
"eval_samples_per_second": 53.097, |
|
"eval_steps_per_second": 13.274, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5230366894854241, |
|
"grad_norm": 2.469914674758911, |
|
"learning_rate": 0.00010556229550384175, |
|
"loss": 4.3623, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.5268825474963464, |
|
"grad_norm": 5.027404308319092, |
|
"learning_rate": 0.00010530495202382264, |
|
"loss": 4.5466, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.5307284055072686, |
|
"grad_norm": 4.797220706939697, |
|
"learning_rate": 0.00010504760854380352, |
|
"loss": 4.4486, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5345742635181909, |
|
"grad_norm": 5.403319358825684, |
|
"learning_rate": 0.00010479026506378442, |
|
"loss": 4.4919, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.5384201215291131, |
|
"grad_norm": 4.601899147033691, |
|
"learning_rate": 0.0001045329215837653, |
|
"loss": 4.4703, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5384201215291131, |
|
"eval_loss": 4.411437034606934, |
|
"eval_runtime": 18.8691, |
|
"eval_samples_per_second": 52.997, |
|
"eval_steps_per_second": 13.249, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5422659795400354, |
|
"grad_norm": 5.943952560424805, |
|
"learning_rate": 0.00010427557810374618, |
|
"loss": 4.3737, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.5461118375509576, |
|
"grad_norm": 4.010414123535156, |
|
"learning_rate": 0.00010401823462372706, |
|
"loss": 4.5472, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5499576955618799, |
|
"grad_norm": 3.5218944549560547, |
|
"learning_rate": 0.00010376089114370794, |
|
"loss": 4.5854, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.5538035535728021, |
|
"grad_norm": 9.44631290435791, |
|
"learning_rate": 0.00010350354766368883, |
|
"loss": 4.3883, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.5576494115837243, |
|
"grad_norm": 4.5443434715271, |
|
"learning_rate": 0.00010324620418366971, |
|
"loss": 4.6685, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.5576494115837243, |
|
"eval_loss": 4.4039154052734375, |
|
"eval_runtime": 18.856, |
|
"eval_samples_per_second": 53.034, |
|
"eval_steps_per_second": 13.258, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.5614952695946466, |
|
"grad_norm": 3.646768569946289, |
|
"learning_rate": 0.0001029888607036506, |
|
"loss": 4.5259, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.5653411276055688, |
|
"grad_norm": 3.510744571685791, |
|
"learning_rate": 0.00010273151722363148, |
|
"loss": 4.4461, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.5691869856164911, |
|
"grad_norm": 3.874558687210083, |
|
"learning_rate": 0.00010247417374361235, |
|
"loss": 4.3743, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.5730328436274132, |
|
"grad_norm": 2.755722761154175, |
|
"learning_rate": 0.00010221683026359324, |
|
"loss": 4.4979, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.5768787016383355, |
|
"grad_norm": 3.5653252601623535, |
|
"learning_rate": 0.00010195948678357412, |
|
"loss": 4.5442, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5768787016383355, |
|
"eval_loss": 4.44308614730835, |
|
"eval_runtime": 18.8004, |
|
"eval_samples_per_second": 53.19, |
|
"eval_steps_per_second": 13.298, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5807245596492577, |
|
"grad_norm": 3.4961936473846436, |
|
"learning_rate": 0.00010170214330355501, |
|
"loss": 4.5194, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.58457041766018, |
|
"grad_norm": 2.529500961303711, |
|
"learning_rate": 0.00010144479982353589, |
|
"loss": 4.3337, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.5884162756711022, |
|
"grad_norm": 3.346160888671875, |
|
"learning_rate": 0.00010118745634351679, |
|
"loss": 4.5422, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.5922621336820244, |
|
"grad_norm": 3.8311049938201904, |
|
"learning_rate": 0.00010093011286349765, |
|
"loss": 4.4191, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.5961079916929467, |
|
"grad_norm": 4.324901580810547, |
|
"learning_rate": 0.00010067276938347853, |
|
"loss": 4.4613, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.5961079916929467, |
|
"eval_loss": 4.4118547439575195, |
|
"eval_runtime": 18.9517, |
|
"eval_samples_per_second": 52.766, |
|
"eval_steps_per_second": 13.191, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.5999538497038689, |
|
"grad_norm": 3.888192653656006, |
|
"learning_rate": 0.00010041542590345943, |
|
"loss": 4.5492, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6037997077147912, |
|
"grad_norm": 2.718320608139038, |
|
"learning_rate": 0.0001001580824234403, |
|
"loss": 4.5371, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.6076455657257134, |
|
"grad_norm": 3.5970869064331055, |
|
"learning_rate": 9.99007389434212e-05, |
|
"loss": 4.4835, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.6114914237366357, |
|
"grad_norm": 4.563399314880371, |
|
"learning_rate": 9.964339546340208e-05, |
|
"loss": 4.4494, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.6153372817475579, |
|
"grad_norm": 5.080177307128906, |
|
"learning_rate": 9.938605198338294e-05, |
|
"loss": 4.6072, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6153372817475579, |
|
"eval_loss": 4.428142547607422, |
|
"eval_runtime": 18.8815, |
|
"eval_samples_per_second": 52.962, |
|
"eval_steps_per_second": 13.241, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6191831397584802, |
|
"grad_norm": 4.333257675170898, |
|
"learning_rate": 9.912870850336384e-05, |
|
"loss": 4.3148, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.6230289977694023, |
|
"grad_norm": 5.497674465179443, |
|
"learning_rate": 9.887136502334472e-05, |
|
"loss": 4.5952, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6268748557803245, |
|
"grad_norm": 4.110482215881348, |
|
"learning_rate": 9.861402154332561e-05, |
|
"loss": 4.5036, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.6307207137912468, |
|
"grad_norm": 3.9359841346740723, |
|
"learning_rate": 9.835667806330649e-05, |
|
"loss": 4.409, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.634566571802169, |
|
"grad_norm": 4.095981597900391, |
|
"learning_rate": 9.809933458328738e-05, |
|
"loss": 4.3515, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.634566571802169, |
|
"eval_loss": 4.438499927520752, |
|
"eval_runtime": 18.9189, |
|
"eval_samples_per_second": 52.857, |
|
"eval_steps_per_second": 13.214, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.6384124298130913, |
|
"grad_norm": 4.357822895050049, |
|
"learning_rate": 9.784199110326825e-05, |
|
"loss": 4.3767, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6422582878240135, |
|
"grad_norm": 3.039700508117676, |
|
"learning_rate": 9.758979449284952e-05, |
|
"loss": 4.4542, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.6461041458349358, |
|
"grad_norm": 6.7661919593811035, |
|
"learning_rate": 9.73324510128304e-05, |
|
"loss": 4.4073, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.649950003845858, |
|
"grad_norm": 4.223692893981934, |
|
"learning_rate": 9.70751075328113e-05, |
|
"loss": 4.4904, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.6537958618567803, |
|
"grad_norm": 4.621217250823975, |
|
"learning_rate": 9.681776405279216e-05, |
|
"loss": 4.7717, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6537958618567803, |
|
"eval_loss": 4.392988204956055, |
|
"eval_runtime": 18.8399, |
|
"eval_samples_per_second": 53.079, |
|
"eval_steps_per_second": 13.27, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6576417198677025, |
|
"grad_norm": 2.6913883686065674, |
|
"learning_rate": 9.656042057277304e-05, |
|
"loss": 4.4409, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.6614875778786247, |
|
"grad_norm": 3.749894618988037, |
|
"learning_rate": 9.630307709275394e-05, |
|
"loss": 4.5101, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.665333435889547, |
|
"grad_norm": 4.93977165222168, |
|
"learning_rate": 9.604573361273482e-05, |
|
"loss": 4.4504, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.6691792939004692, |
|
"grad_norm": 4.311313152313232, |
|
"learning_rate": 9.578839013271571e-05, |
|
"loss": 4.4857, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.6730251519113915, |
|
"grad_norm": 3.646656036376953, |
|
"learning_rate": 9.553104665269659e-05, |
|
"loss": 4.387, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.6730251519113915, |
|
"eval_loss": 4.401506423950195, |
|
"eval_runtime": 18.7931, |
|
"eval_samples_per_second": 53.211, |
|
"eval_steps_per_second": 13.303, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.6768710099223136, |
|
"grad_norm": 4.352843284606934, |
|
"learning_rate": 9.527370317267746e-05, |
|
"loss": 4.5279, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.6807168679332359, |
|
"grad_norm": 3.890216827392578, |
|
"learning_rate": 9.501635969265835e-05, |
|
"loss": 4.4485, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.6845627259441581, |
|
"grad_norm": 3.4119713306427, |
|
"learning_rate": 9.475901621263923e-05, |
|
"loss": 4.4428, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.6884085839550804, |
|
"grad_norm": 7.813595294952393, |
|
"learning_rate": 9.450167273262012e-05, |
|
"loss": 4.3308, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.6922544419660026, |
|
"grad_norm": 3.079829692840576, |
|
"learning_rate": 9.4244329252601e-05, |
|
"loss": 4.368, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6922544419660026, |
|
"eval_loss": 4.393312931060791, |
|
"eval_runtime": 18.7727, |
|
"eval_samples_per_second": 53.269, |
|
"eval_steps_per_second": 13.317, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6961002999769248, |
|
"grad_norm": 9.26623821258545, |
|
"learning_rate": 9.39869857725819e-05, |
|
"loss": 4.3073, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.6999461579878471, |
|
"grad_norm": 3.5981953144073486, |
|
"learning_rate": 9.372964229256276e-05, |
|
"loss": 4.3923, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.7037920159987693, |
|
"grad_norm": 3.734813690185547, |
|
"learning_rate": 9.347229881254364e-05, |
|
"loss": 4.2449, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.7076378740096916, |
|
"grad_norm": 5.646871566772461, |
|
"learning_rate": 9.321495533252453e-05, |
|
"loss": 4.3953, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.7114837320206138, |
|
"grad_norm": 4.284733295440674, |
|
"learning_rate": 9.295761185250541e-05, |
|
"loss": 4.475, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.7114837320206138, |
|
"eval_loss": 4.348310470581055, |
|
"eval_runtime": 19.0285, |
|
"eval_samples_per_second": 52.553, |
|
"eval_steps_per_second": 13.138, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.7153295900315361, |
|
"grad_norm": 5.92791223526001, |
|
"learning_rate": 9.27002683724863e-05, |
|
"loss": 4.5493, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.7191754480424583, |
|
"grad_norm": 4.768808841705322, |
|
"learning_rate": 9.244292489246719e-05, |
|
"loss": 4.2508, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.7230213060533806, |
|
"grad_norm": 3.473097562789917, |
|
"learning_rate": 9.218558141244805e-05, |
|
"loss": 4.4534, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.7268671640643027, |
|
"grad_norm": 10.189091682434082, |
|
"learning_rate": 9.192823793242895e-05, |
|
"loss": 4.3883, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.7307130220752249, |
|
"grad_norm": 1.9577853679656982, |
|
"learning_rate": 9.167089445240982e-05, |
|
"loss": 4.3191, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7307130220752249, |
|
"eval_loss": 4.328299045562744, |
|
"eval_runtime": 18.8631, |
|
"eval_samples_per_second": 53.014, |
|
"eval_steps_per_second": 13.253, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7345588800861472, |
|
"grad_norm": 3.9685990810394287, |
|
"learning_rate": 9.141355097239072e-05, |
|
"loss": 4.325, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.7384047380970694, |
|
"grad_norm": 5.303285121917725, |
|
"learning_rate": 9.11562074923716e-05, |
|
"loss": 4.4277, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.7422505961079917, |
|
"grad_norm": 2.70599627494812, |
|
"learning_rate": 9.089886401235249e-05, |
|
"loss": 4.4329, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.7460964541189139, |
|
"grad_norm": 4.711449146270752, |
|
"learning_rate": 9.064152053233336e-05, |
|
"loss": 4.251, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.7499423121298362, |
|
"grad_norm": 3.0169851779937744, |
|
"learning_rate": 9.038417705231424e-05, |
|
"loss": 4.3483, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.7499423121298362, |
|
"eval_loss": 4.341108322143555, |
|
"eval_runtime": 18.9063, |
|
"eval_samples_per_second": 52.893, |
|
"eval_steps_per_second": 13.223, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.7537881701407584, |
|
"grad_norm": 3.375880002975464, |
|
"learning_rate": 9.012683357229513e-05, |
|
"loss": 4.313, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.7576340281516807, |
|
"grad_norm": 1.707850456237793, |
|
"learning_rate": 8.986949009227601e-05, |
|
"loss": 4.3062, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.7614798861626029, |
|
"grad_norm": 3.6718738079071045, |
|
"learning_rate": 8.96121466122569e-05, |
|
"loss": 4.4415, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.7653257441735252, |
|
"grad_norm": 3.5382699966430664, |
|
"learning_rate": 8.935480313223778e-05, |
|
"loss": 4.3754, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.7691716021844474, |
|
"grad_norm": 4.678229808807373, |
|
"learning_rate": 8.909745965221865e-05, |
|
"loss": 4.4404, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7691716021844474, |
|
"eval_loss": 4.3746819496154785, |
|
"eval_runtime": 18.7221, |
|
"eval_samples_per_second": 53.413, |
|
"eval_steps_per_second": 13.353, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7730174601953695, |
|
"grad_norm": 3.490699529647827, |
|
"learning_rate": 8.884011617219954e-05, |
|
"loss": 4.5294, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.7768633182062918, |
|
"grad_norm": 4.614148139953613, |
|
"learning_rate": 8.858277269218042e-05, |
|
"loss": 4.2371, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.780709176217214, |
|
"grad_norm": 5.6906962394714355, |
|
"learning_rate": 8.832542921216132e-05, |
|
"loss": 4.5472, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.7845550342281363, |
|
"grad_norm": 4.382456302642822, |
|
"learning_rate": 8.80680857321422e-05, |
|
"loss": 4.4282, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.7884008922390585, |
|
"grad_norm": 4.546772003173828, |
|
"learning_rate": 8.781074225212309e-05, |
|
"loss": 4.4004, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.7884008922390585, |
|
"eval_loss": 4.373971462249756, |
|
"eval_runtime": 18.9303, |
|
"eval_samples_per_second": 52.825, |
|
"eval_steps_per_second": 13.206, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.7922467502499808, |
|
"grad_norm": 3.784317970275879, |
|
"learning_rate": 8.755339877210395e-05, |
|
"loss": 4.4422, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.796092608260903, |
|
"grad_norm": 3.11979341506958, |
|
"learning_rate": 8.729605529208483e-05, |
|
"loss": 4.4909, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.7999384662718253, |
|
"grad_norm": 4.9711012840271, |
|
"learning_rate": 8.703871181206573e-05, |
|
"loss": 4.2955, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.8037843242827475, |
|
"grad_norm": 3.7663426399230957, |
|
"learning_rate": 8.678136833204661e-05, |
|
"loss": 4.5105, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.8076301822936697, |
|
"grad_norm": 4.679628372192383, |
|
"learning_rate": 8.65240248520275e-05, |
|
"loss": 4.5038, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8076301822936697, |
|
"eval_loss": 4.3565592765808105, |
|
"eval_runtime": 18.9119, |
|
"eval_samples_per_second": 52.877, |
|
"eval_steps_per_second": 13.219, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.811476040304592, |
|
"grad_norm": 4.561670303344727, |
|
"learning_rate": 8.626668137200838e-05, |
|
"loss": 4.6428, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.8153218983155142, |
|
"grad_norm": 3.155518054962158, |
|
"learning_rate": 8.600933789198925e-05, |
|
"loss": 4.4605, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.8191677563264365, |
|
"grad_norm": 4.021768093109131, |
|
"learning_rate": 8.575199441197014e-05, |
|
"loss": 4.2982, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.8230136143373586, |
|
"grad_norm": 4.348796844482422, |
|
"learning_rate": 8.549465093195102e-05, |
|
"loss": 4.649, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.8268594723482809, |
|
"grad_norm": 4.647562503814697, |
|
"learning_rate": 8.523730745193191e-05, |
|
"loss": 4.2873, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.8268594723482809, |
|
"eval_loss": 4.3483662605285645, |
|
"eval_runtime": 18.9227, |
|
"eval_samples_per_second": 52.847, |
|
"eval_steps_per_second": 13.212, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.8307053303592031, |
|
"grad_norm": 3.9260427951812744, |
|
"learning_rate": 8.497996397191279e-05, |
|
"loss": 4.3823, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.8345511883701254, |
|
"grad_norm": 3.7108564376831055, |
|
"learning_rate": 8.472262049189368e-05, |
|
"loss": 4.42, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.8383970463810476, |
|
"grad_norm": 4.9123663902282715, |
|
"learning_rate": 8.446527701187455e-05, |
|
"loss": 4.5828, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.8422429043919698, |
|
"grad_norm": 3.7289183139801025, |
|
"learning_rate": 8.420793353185543e-05, |
|
"loss": 4.3134, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.8460887624028921, |
|
"grad_norm": 4.0350542068481445, |
|
"learning_rate": 8.395059005183632e-05, |
|
"loss": 4.3768, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8460887624028921, |
|
"eval_loss": 4.307990074157715, |
|
"eval_runtime": 18.7713, |
|
"eval_samples_per_second": 53.273, |
|
"eval_steps_per_second": 13.318, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8499346204138143, |
|
"grad_norm": 5.336431503295898, |
|
"learning_rate": 8.36983934414176e-05, |
|
"loss": 4.3977, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.8537804784247366, |
|
"grad_norm": 4.175157070159912, |
|
"learning_rate": 8.344104996139847e-05, |
|
"loss": 4.4053, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.8576263364356588, |
|
"grad_norm": 4.384688377380371, |
|
"learning_rate": 8.318370648137934e-05, |
|
"loss": 4.26, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.8614721944465811, |
|
"grad_norm": 3.6022467613220215, |
|
"learning_rate": 8.292636300136024e-05, |
|
"loss": 4.2993, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.8653180524575033, |
|
"grad_norm": 4.252429485321045, |
|
"learning_rate": 8.266901952134112e-05, |
|
"loss": 4.299, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.8653180524575033, |
|
"eval_loss": 4.334308624267578, |
|
"eval_runtime": 18.9071, |
|
"eval_samples_per_second": 52.89, |
|
"eval_steps_per_second": 13.223, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.8691639104684256, |
|
"grad_norm": 3.4003775119781494, |
|
"learning_rate": 8.241167604132201e-05, |
|
"loss": 4.2806, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.8730097684793477, |
|
"grad_norm": 3.7436835765838623, |
|
"learning_rate": 8.215433256130289e-05, |
|
"loss": 4.2694, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.8768556264902699, |
|
"grad_norm": 2.8963701725006104, |
|
"learning_rate": 8.189698908128376e-05, |
|
"loss": 4.362, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.8807014845011922, |
|
"grad_norm": 3.3496339321136475, |
|
"learning_rate": 8.163964560126465e-05, |
|
"loss": 4.3698, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.8845473425121144, |
|
"grad_norm": 4.4007487297058105, |
|
"learning_rate": 8.138230212124553e-05, |
|
"loss": 4.2994, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8845473425121144, |
|
"eval_loss": 4.315768241882324, |
|
"eval_runtime": 18.8056, |
|
"eval_samples_per_second": 53.176, |
|
"eval_steps_per_second": 13.294, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8883932005230367, |
|
"grad_norm": 5.072123050689697, |
|
"learning_rate": 8.112495864122642e-05, |
|
"loss": 4.5564, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.8922390585339589, |
|
"grad_norm": 3.130788564682007, |
|
"learning_rate": 8.08676151612073e-05, |
|
"loss": 4.427, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.8960849165448812, |
|
"grad_norm": 2.615147352218628, |
|
"learning_rate": 8.06102716811882e-05, |
|
"loss": 4.3831, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.8999307745558034, |
|
"grad_norm": 8.039403915405273, |
|
"learning_rate": 8.035292820116906e-05, |
|
"loss": 4.3388, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.9037766325667257, |
|
"grad_norm": 2.6177854537963867, |
|
"learning_rate": 8.009558472114994e-05, |
|
"loss": 1.4931, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.9037766325667257, |
|
"eval_loss": 1.534182071685791, |
|
"eval_runtime": 18.0719, |
|
"eval_samples_per_second": 55.335, |
|
"eval_steps_per_second": 13.834, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.9076224905776479, |
|
"grad_norm": 1.4090014696121216, |
|
"learning_rate": 7.983824124113084e-05, |
|
"loss": 1.5524, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.9114683485885701, |
|
"grad_norm": 1.4773452281951904, |
|
"learning_rate": 7.958089776111171e-05, |
|
"loss": 1.4703, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.9153142065994924, |
|
"grad_norm": 1.7350648641586304, |
|
"learning_rate": 7.932355428109261e-05, |
|
"loss": 1.4752, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.9191600646104146, |
|
"grad_norm": 1.9704972505569458, |
|
"learning_rate": 7.906621080107349e-05, |
|
"loss": 1.5257, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.9230059226213368, |
|
"grad_norm": 1.6183151006698608, |
|
"learning_rate": 7.880886732105437e-05, |
|
"loss": 1.4704, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9230059226213368, |
|
"eval_loss": 1.5159597396850586, |
|
"eval_runtime": 17.891, |
|
"eval_samples_per_second": 55.894, |
|
"eval_steps_per_second": 13.974, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.926851780632259, |
|
"grad_norm": 1.736138939857483, |
|
"learning_rate": 7.855152384103525e-05, |
|
"loss": 1.5304, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.9306976386431813, |
|
"grad_norm": 1.807916283607483, |
|
"learning_rate": 7.829418036101613e-05, |
|
"loss": 1.4984, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.9345434966541035, |
|
"grad_norm": 1.1977109909057617, |
|
"learning_rate": 7.803683688099702e-05, |
|
"loss": 1.4307, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.9383893546650258, |
|
"grad_norm": 0.8386535048484802, |
|
"learning_rate": 7.77794934009779e-05, |
|
"loss": 1.444, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.942235212675948, |
|
"grad_norm": 1.395053744316101, |
|
"learning_rate": 7.752214992095878e-05, |
|
"loss": 1.4866, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.942235212675948, |
|
"eval_loss": 1.5108226537704468, |
|
"eval_runtime": 18.0888, |
|
"eval_samples_per_second": 55.283, |
|
"eval_steps_per_second": 13.821, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.9460810706868702, |
|
"grad_norm": 1.5271111726760864, |
|
"learning_rate": 7.726480644093966e-05, |
|
"loss": 1.4849, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.9499269286977925, |
|
"grad_norm": 3.0610506534576416, |
|
"learning_rate": 7.700746296092054e-05, |
|
"loss": 1.4613, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.9537727867087147, |
|
"grad_norm": 1.8968026638031006, |
|
"learning_rate": 7.675011948090143e-05, |
|
"loss": 1.591, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.957618644719637, |
|
"grad_norm": 1.748979926109314, |
|
"learning_rate": 7.649277600088231e-05, |
|
"loss": 1.4781, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.9614645027305592, |
|
"grad_norm": 1.6586661338806152, |
|
"learning_rate": 7.62354325208632e-05, |
|
"loss": 1.4668, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9614645027305592, |
|
"eval_loss": 1.5503162145614624, |
|
"eval_runtime": 17.9222, |
|
"eval_samples_per_second": 55.797, |
|
"eval_steps_per_second": 13.949, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9653103607414815, |
|
"grad_norm": 7.388810634613037, |
|
"learning_rate": 7.597808904084407e-05, |
|
"loss": 1.5683, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.9691562187524037, |
|
"grad_norm": 1.5548075437545776, |
|
"learning_rate": 7.572074556082496e-05, |
|
"loss": 1.4956, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.973002076763326, |
|
"grad_norm": 1.5935887098312378, |
|
"learning_rate": 7.546340208080584e-05, |
|
"loss": 1.5363, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.9768479347742481, |
|
"grad_norm": 1.985238790512085, |
|
"learning_rate": 7.520605860078672e-05, |
|
"loss": 1.5314, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.9806937927851703, |
|
"grad_norm": 1.5040565729141235, |
|
"learning_rate": 7.494871512076762e-05, |
|
"loss": 1.5108, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.9806937927851703, |
|
"eval_loss": 1.5085468292236328, |
|
"eval_runtime": 18.0531, |
|
"eval_samples_per_second": 55.392, |
|
"eval_steps_per_second": 13.848, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.9845396507960926, |
|
"grad_norm": 1.2956914901733398, |
|
"learning_rate": 7.46913716407485e-05, |
|
"loss": 1.4287, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.9883855088070148, |
|
"grad_norm": 1.1903409957885742, |
|
"learning_rate": 7.443402816072938e-05, |
|
"loss": 1.5583, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.9922313668179371, |
|
"grad_norm": 1.9069184064865112, |
|
"learning_rate": 7.417668468071026e-05, |
|
"loss": 1.5214, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.9960772248288593, |
|
"grad_norm": 1.7362926006317139, |
|
"learning_rate": 7.391934120069114e-05, |
|
"loss": 1.55, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.9999230828397816, |
|
"grad_norm": 1.2136348485946655, |
|
"learning_rate": 7.366199772067203e-05, |
|
"loss": 1.5035, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9999230828397816, |
|
"eval_loss": 1.5033278465270996, |
|
"eval_runtime": 18.189, |
|
"eval_samples_per_second": 54.978, |
|
"eval_steps_per_second": 13.745, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.0037689408507038, |
|
"grad_norm": 1.291033387184143, |
|
"learning_rate": 7.340465424065291e-05, |
|
"loss": 1.4455, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 1.007614798861626, |
|
"grad_norm": 1.247129201889038, |
|
"learning_rate": 7.31473107606338e-05, |
|
"loss": 1.4629, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.0114606568725482, |
|
"grad_norm": 1.2177772521972656, |
|
"learning_rate": 7.288996728061467e-05, |
|
"loss": 1.5715, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 1.0153065148834706, |
|
"grad_norm": 1.2471716403961182, |
|
"learning_rate": 7.263262380059556e-05, |
|
"loss": 1.4244, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.0191523728943928, |
|
"grad_norm": 0.8932450413703918, |
|
"learning_rate": 7.237528032057644e-05, |
|
"loss": 1.4278, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 1.0191523728943928, |
|
"eval_loss": 1.5201970338821411, |
|
"eval_runtime": 17.9356, |
|
"eval_samples_per_second": 55.755, |
|
"eval_steps_per_second": 13.939, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 1.022998230905315, |
|
"grad_norm": 1.9957834482192993, |
|
"learning_rate": 7.211793684055732e-05, |
|
"loss": 1.5017, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.0268440889162371, |
|
"grad_norm": 1.432619571685791, |
|
"learning_rate": 7.186059336053821e-05, |
|
"loss": 1.4271, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 1.0306899469271595, |
|
"grad_norm": 1.3298619985580444, |
|
"learning_rate": 7.16032498805191e-05, |
|
"loss": 1.5726, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.0345358049380817, |
|
"grad_norm": 10.102746963500977, |
|
"learning_rate": 7.134590640049997e-05, |
|
"loss": 1.3938, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 1.038381662949004, |
|
"grad_norm": 1.9288721084594727, |
|
"learning_rate": 7.108856292048085e-05, |
|
"loss": 1.4264, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.038381662949004, |
|
"eval_loss": 1.5168194770812988, |
|
"eval_runtime": 18.139, |
|
"eval_samples_per_second": 55.13, |
|
"eval_steps_per_second": 13.782, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.042227520959926, |
|
"grad_norm": 2.8053858280181885, |
|
"learning_rate": 7.083121944046175e-05, |
|
"loss": 1.5338, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 1.0460733789708483, |
|
"grad_norm": 1.2761131525039673, |
|
"learning_rate": 7.057387596044263e-05, |
|
"loss": 1.4137, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.0499192369817707, |
|
"grad_norm": 1.614910364151001, |
|
"learning_rate": 7.03165324804235e-05, |
|
"loss": 1.4634, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 1.0537650949926929, |
|
"grad_norm": 1.8560376167297363, |
|
"learning_rate": 7.00591890004044e-05, |
|
"loss": 1.5173, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.057610953003615, |
|
"grad_norm": 1.3471609354019165, |
|
"learning_rate": 6.980184552038528e-05, |
|
"loss": 1.4887, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 1.057610953003615, |
|
"eval_loss": 1.5006794929504395, |
|
"eval_runtime": 18.2151, |
|
"eval_samples_per_second": 54.9, |
|
"eval_steps_per_second": 13.725, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 1.0614568110145373, |
|
"grad_norm": 1.661996841430664, |
|
"learning_rate": 6.954450204036616e-05, |
|
"loss": 1.4428, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.0653026690254597, |
|
"grad_norm": 1.2982336282730103, |
|
"learning_rate": 6.928715856034704e-05, |
|
"loss": 1.4565, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 1.0691485270363819, |
|
"grad_norm": 0.9250918626785278, |
|
"learning_rate": 6.902981508032792e-05, |
|
"loss": 1.5353, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.072994385047304, |
|
"grad_norm": 1.8084945678710938, |
|
"learning_rate": 6.877247160030881e-05, |
|
"loss": 1.5047, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 1.0768402430582262, |
|
"grad_norm": 1.1049927473068237, |
|
"learning_rate": 6.851512812028969e-05, |
|
"loss": 1.5058, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.0768402430582262, |
|
"eval_loss": 1.5043680667877197, |
|
"eval_runtime": 18.1464, |
|
"eval_samples_per_second": 55.107, |
|
"eval_steps_per_second": 13.777, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.0806861010691486, |
|
"grad_norm": 1.7406409978866577, |
|
"learning_rate": 6.825778464027057e-05, |
|
"loss": 1.3945, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 1.0845319590800708, |
|
"grad_norm": 1.1657389402389526, |
|
"learning_rate": 6.800044116025146e-05, |
|
"loss": 1.4528, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.088377817090993, |
|
"grad_norm": 1.380635380744934, |
|
"learning_rate": 6.774309768023234e-05, |
|
"loss": 1.442, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 1.0922236751019152, |
|
"grad_norm": 1.7555848360061646, |
|
"learning_rate": 6.748575420021322e-05, |
|
"loss": 1.5061, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.0960695331128374, |
|
"grad_norm": 1.6465975046157837, |
|
"learning_rate": 6.72284107201941e-05, |
|
"loss": 1.5004, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 1.0960695331128374, |
|
"eval_loss": 1.5090863704681396, |
|
"eval_runtime": 18.0174, |
|
"eval_samples_per_second": 55.502, |
|
"eval_steps_per_second": 13.876, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 1.0999153911237598, |
|
"grad_norm": 2.0214383602142334, |
|
"learning_rate": 6.697106724017498e-05, |
|
"loss": 1.5436, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.103761249134682, |
|
"grad_norm": 1.399170160293579, |
|
"learning_rate": 6.671372376015588e-05, |
|
"loss": 1.5242, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 1.1076071071456042, |
|
"grad_norm": 2.1806626319885254, |
|
"learning_rate": 6.645638028013676e-05, |
|
"loss": 1.4609, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.1114529651565264, |
|
"grad_norm": 1.1671562194824219, |
|
"learning_rate": 6.619903680011763e-05, |
|
"loss": 1.3789, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 1.1152988231674485, |
|
"grad_norm": 1.0041520595550537, |
|
"learning_rate": 6.594169332009851e-05, |
|
"loss": 1.4909, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.1152988231674485, |
|
"eval_loss": 1.509366750717163, |
|
"eval_runtime": 18.0148, |
|
"eval_samples_per_second": 55.51, |
|
"eval_steps_per_second": 13.877, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.119144681178371, |
|
"grad_norm": 1.9716360569000244, |
|
"learning_rate": 6.568434984007941e-05, |
|
"loss": 1.5349, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 1.1229905391892931, |
|
"grad_norm": 0.710033655166626, |
|
"learning_rate": 6.542700636006029e-05, |
|
"loss": 1.4107, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.1268363972002153, |
|
"grad_norm": 1.4398375749588013, |
|
"learning_rate": 6.516966288004117e-05, |
|
"loss": 1.4185, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 1.1306822552111375, |
|
"grad_norm": 2.5566532611846924, |
|
"learning_rate": 6.491231940002206e-05, |
|
"loss": 1.5758, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.13452811322206, |
|
"grad_norm": 1.2500799894332886, |
|
"learning_rate": 6.465497592000294e-05, |
|
"loss": 1.4751, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 1.13452811322206, |
|
"eval_loss": 1.4990500211715698, |
|
"eval_runtime": 17.9979, |
|
"eval_samples_per_second": 55.562, |
|
"eval_steps_per_second": 13.891, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 1.1383739712329821, |
|
"grad_norm": 1.5937495231628418, |
|
"learning_rate": 6.439763243998382e-05, |
|
"loss": 1.5215, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.1422198292439043, |
|
"grad_norm": 1.362358570098877, |
|
"learning_rate": 6.41402889599647e-05, |
|
"loss": 1.5125, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 1.1460656872548265, |
|
"grad_norm": 2.1192502975463867, |
|
"learning_rate": 6.388294547994558e-05, |
|
"loss": 1.4485, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.149911545265749, |
|
"grad_norm": 1.4089174270629883, |
|
"learning_rate": 6.362560199992647e-05, |
|
"loss": 1.5331, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 1.153757403276671, |
|
"grad_norm": 1.3750373125076294, |
|
"learning_rate": 6.336825851990735e-05, |
|
"loss": 1.5177, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.153757403276671, |
|
"eval_loss": 1.5118192434310913, |
|
"eval_runtime": 17.9213, |
|
"eval_samples_per_second": 55.799, |
|
"eval_steps_per_second": 13.95, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.1576032612875933, |
|
"grad_norm": 1.5460007190704346, |
|
"learning_rate": 6.311091503988823e-05, |
|
"loss": 1.442, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 1.1614491192985155, |
|
"grad_norm": 1.001439094543457, |
|
"learning_rate": 6.285357155986911e-05, |
|
"loss": 1.5308, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.1652949773094377, |
|
"grad_norm": 0.8740602731704712, |
|
"learning_rate": 6.259622807985e-05, |
|
"loss": 1.455, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 1.16914083532036, |
|
"grad_norm": 2.034207820892334, |
|
"learning_rate": 6.233888459983088e-05, |
|
"loss": 1.5089, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.1729866933312822, |
|
"grad_norm": 1.8656599521636963, |
|
"learning_rate": 6.208154111981176e-05, |
|
"loss": 1.5368, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 1.1729866933312822, |
|
"eval_loss": 1.4986381530761719, |
|
"eval_runtime": 18.1736, |
|
"eval_samples_per_second": 55.025, |
|
"eval_steps_per_second": 13.756, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 1.1768325513422044, |
|
"grad_norm": 1.2697277069091797, |
|
"learning_rate": 6.182419763979266e-05, |
|
"loss": 1.4239, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.1806784093531266, |
|
"grad_norm": 1.1131771802902222, |
|
"learning_rate": 6.156685415977354e-05, |
|
"loss": 1.4309, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 1.1845242673640488, |
|
"grad_norm": 1.5322145223617554, |
|
"learning_rate": 6.130951067975442e-05, |
|
"loss": 1.4793, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.1883701253749712, |
|
"grad_norm": 1.1703407764434814, |
|
"learning_rate": 6.10521671997353e-05, |
|
"loss": 1.4761, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 1.1922159833858934, |
|
"grad_norm": 1.4056655168533325, |
|
"learning_rate": 6.079482371971618e-05, |
|
"loss": 1.5311, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.1922159833858934, |
|
"eval_loss": 1.4925825595855713, |
|
"eval_runtime": 18.2116, |
|
"eval_samples_per_second": 54.91, |
|
"eval_steps_per_second": 13.727, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.1960618413968156, |
|
"grad_norm": 2.7062911987304688, |
|
"learning_rate": 6.053748023969707e-05, |
|
"loss": 1.4145, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 1.1999076994077378, |
|
"grad_norm": 1.5163620710372925, |
|
"learning_rate": 6.028013675967794e-05, |
|
"loss": 1.4322, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.2037535574186602, |
|
"grad_norm": 1.342063546180725, |
|
"learning_rate": 6.002279327965883e-05, |
|
"loss": 1.4696, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 1.2075994154295824, |
|
"grad_norm": 1.8180099725723267, |
|
"learning_rate": 5.9765449799639715e-05, |
|
"loss": 1.4647, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.2114452734405046, |
|
"grad_norm": 1.951982855796814, |
|
"learning_rate": 5.9508106319620595e-05, |
|
"loss": 1.5141, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 1.2114452734405046, |
|
"eval_loss": 1.4893933534622192, |
|
"eval_runtime": 18.1951, |
|
"eval_samples_per_second": 54.96, |
|
"eval_steps_per_second": 13.74, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 1.2152911314514268, |
|
"grad_norm": 1.7536894083023071, |
|
"learning_rate": 5.925076283960148e-05, |
|
"loss": 1.514, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.2191369894623492, |
|
"grad_norm": 1.1857939958572388, |
|
"learning_rate": 5.899341935958237e-05, |
|
"loss": 1.4745, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 1.2229828474732714, |
|
"grad_norm": 1.2500842809677124, |
|
"learning_rate": 5.873607587956324e-05, |
|
"loss": 1.4325, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.2268287054841935, |
|
"grad_norm": 2.025336742401123, |
|
"learning_rate": 5.847873239954413e-05, |
|
"loss": 1.4913, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 1.2306745634951157, |
|
"grad_norm": 1.1440426111221313, |
|
"learning_rate": 5.8221388919525014e-05, |
|
"loss": 1.451, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.2306745634951157, |
|
"eval_loss": 1.492313265800476, |
|
"eval_runtime": 18.0024, |
|
"eval_samples_per_second": 55.548, |
|
"eval_steps_per_second": 13.887, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.234520421506038, |
|
"grad_norm": 1.1019631624221802, |
|
"learning_rate": 5.796404543950589e-05, |
|
"loss": 1.3918, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 1.2383662795169603, |
|
"grad_norm": 1.7206593751907349, |
|
"learning_rate": 5.770670195948678e-05, |
|
"loss": 1.4726, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.2422121375278825, |
|
"grad_norm": 1.9747880697250366, |
|
"learning_rate": 5.7449358479467666e-05, |
|
"loss": 1.4829, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 1.2460579955388047, |
|
"grad_norm": 1.605573058128357, |
|
"learning_rate": 5.719201499944854e-05, |
|
"loss": 1.4476, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.2499038535497269, |
|
"grad_norm": 1.180405616760254, |
|
"learning_rate": 5.6934671519429426e-05, |
|
"loss": 1.3904, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 1.2499038535497269, |
|
"eval_loss": 1.4850120544433594, |
|
"eval_runtime": 18.0422, |
|
"eval_samples_per_second": 55.426, |
|
"eval_steps_per_second": 13.856, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 1.253749711560649, |
|
"grad_norm": 1.9959101676940918, |
|
"learning_rate": 5.667732803941031e-05, |
|
"loss": 1.4512, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.2575955695715715, |
|
"grad_norm": 1.8853541612625122, |
|
"learning_rate": 5.641998455939119e-05, |
|
"loss": 1.458, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 1.2614414275824937, |
|
"grad_norm": 1.4618902206420898, |
|
"learning_rate": 5.616264107937208e-05, |
|
"loss": 1.4968, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.2652872855934159, |
|
"grad_norm": 1.4913650751113892, |
|
"learning_rate": 5.5905297599352965e-05, |
|
"loss": 1.3966, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 1.2691331436043383, |
|
"grad_norm": 1.3095403909683228, |
|
"learning_rate": 5.564795411933384e-05, |
|
"loss": 1.4484, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.2691331436043383, |
|
"eval_loss": 1.4897910356521606, |
|
"eval_runtime": 18.0248, |
|
"eval_samples_per_second": 55.479, |
|
"eval_steps_per_second": 13.87, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.2729790016152602, |
|
"grad_norm": 1.4080452919006348, |
|
"learning_rate": 5.5390610639314724e-05, |
|
"loss": 1.4667, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 1.2768248596261826, |
|
"grad_norm": 1.6634443998336792, |
|
"learning_rate": 5.513326715929561e-05, |
|
"loss": 1.4619, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.2806707176371048, |
|
"grad_norm": 2.0469400882720947, |
|
"learning_rate": 5.487592367927649e-05, |
|
"loss": 1.4105, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 1.284516575648027, |
|
"grad_norm": 1.5735753774642944, |
|
"learning_rate": 5.461858019925738e-05, |
|
"loss": 1.4002, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.2883624336589494, |
|
"grad_norm": 1.43183434009552, |
|
"learning_rate": 5.436123671923826e-05, |
|
"loss": 1.4586, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 1.2883624336589494, |
|
"eval_loss": 1.4708431959152222, |
|
"eval_runtime": 18.2152, |
|
"eval_samples_per_second": 54.899, |
|
"eval_steps_per_second": 13.725, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 1.2922082916698716, |
|
"grad_norm": 1.6342015266418457, |
|
"learning_rate": 5.4103893239219136e-05, |
|
"loss": 1.4113, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.2960541496807938, |
|
"grad_norm": 3.80155873298645, |
|
"learning_rate": 5.384654975920002e-05, |
|
"loss": 1.4793, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 1.299900007691716, |
|
"grad_norm": 1.4240097999572754, |
|
"learning_rate": 5.358920627918091e-05, |
|
"loss": 1.4072, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.3037458657026382, |
|
"grad_norm": 1.4548074007034302, |
|
"learning_rate": 5.333186279916179e-05, |
|
"loss": 1.4275, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 1.3075917237135606, |
|
"grad_norm": 1.7287901639938354, |
|
"learning_rate": 5.3074519319142675e-05, |
|
"loss": 1.4741, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.3075917237135606, |
|
"eval_loss": 1.4836150407791138, |
|
"eval_runtime": 18.0219, |
|
"eval_samples_per_second": 55.488, |
|
"eval_steps_per_second": 13.872, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.3114375817244828, |
|
"grad_norm": 1.732088327407837, |
|
"learning_rate": 5.281717583912356e-05, |
|
"loss": 1.5014, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 1.315283439735405, |
|
"grad_norm": 2.144697427749634, |
|
"learning_rate": 5.2559832359104435e-05, |
|
"loss": 1.4436, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.3191292977463271, |
|
"grad_norm": 1.649965763092041, |
|
"learning_rate": 5.230248887908532e-05, |
|
"loss": 1.4334, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 1.3229751557572493, |
|
"grad_norm": 0.8667518496513367, |
|
"learning_rate": 5.204514539906621e-05, |
|
"loss": 1.487, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.3268210137681717, |
|
"grad_norm": 1.4567649364471436, |
|
"learning_rate": 5.178780191904709e-05, |
|
"loss": 1.4714, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 1.3268210137681717, |
|
"eval_loss": 1.479749321937561, |
|
"eval_runtime": 17.9466, |
|
"eval_samples_per_second": 55.721, |
|
"eval_steps_per_second": 13.93, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 1.330666871779094, |
|
"grad_norm": 1.8523489236831665, |
|
"learning_rate": 5.1530458439027974e-05, |
|
"loss": 1.4718, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.3345127297900161, |
|
"grad_norm": 1.091204047203064, |
|
"learning_rate": 5.127311495900886e-05, |
|
"loss": 1.4012, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 1.3383585878009385, |
|
"grad_norm": 1.8271427154541016, |
|
"learning_rate": 5.101577147898973e-05, |
|
"loss": 1.4547, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.3422044458118605, |
|
"grad_norm": 1.8682465553283691, |
|
"learning_rate": 5.075842799897062e-05, |
|
"loss": 1.4373, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 1.346050303822783, |
|
"grad_norm": 2.1932857036590576, |
|
"learning_rate": 5.0501084518951506e-05, |
|
"loss": 1.4628, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.346050303822783, |
|
"eval_loss": 1.4871113300323486, |
|
"eval_runtime": 17.9165, |
|
"eval_samples_per_second": 55.814, |
|
"eval_steps_per_second": 13.954, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.349896161833705, |
|
"grad_norm": 1.6970813274383545, |
|
"learning_rate": 5.0243741038932386e-05, |
|
"loss": 1.4442, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 1.3537420198446273, |
|
"grad_norm": 1.0942292213439941, |
|
"learning_rate": 4.998639755891327e-05, |
|
"loss": 1.4769, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.3575878778555497, |
|
"grad_norm": 1.720035195350647, |
|
"learning_rate": 4.972905407889416e-05, |
|
"loss": 1.4519, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 1.3614337358664719, |
|
"grad_norm": 0.8887185454368591, |
|
"learning_rate": 4.947171059887503e-05, |
|
"loss": 1.4201, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.365279593877394, |
|
"grad_norm": 1.9557030200958252, |
|
"learning_rate": 4.921436711885592e-05, |
|
"loss": 1.4848, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 1.365279593877394, |
|
"eval_loss": 1.476893424987793, |
|
"eval_runtime": 17.9988, |
|
"eval_samples_per_second": 55.559, |
|
"eval_steps_per_second": 13.89, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 1.3691254518883162, |
|
"grad_norm": 1.471414566040039, |
|
"learning_rate": 4.8957023638836804e-05, |
|
"loss": 1.4541, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.3729713098992384, |
|
"grad_norm": 1.350690484046936, |
|
"learning_rate": 4.8699680158817684e-05, |
|
"loss": 1.3954, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 1.3768171679101608, |
|
"grad_norm": 0.7363431453704834, |
|
"learning_rate": 4.844233667879857e-05, |
|
"loss": 1.4919, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.380663025921083, |
|
"grad_norm": 1.8820909261703491, |
|
"learning_rate": 4.818499319877946e-05, |
|
"loss": 1.4177, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 1.3845088839320052, |
|
"grad_norm": 0.8440986275672913, |
|
"learning_rate": 4.792764971876033e-05, |
|
"loss": 1.3995, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.3845088839320052, |
|
"eval_loss": 1.4794726371765137, |
|
"eval_runtime": 17.9989, |
|
"eval_samples_per_second": 55.559, |
|
"eval_steps_per_second": 13.89, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.3883547419429274, |
|
"grad_norm": 1.6790105104446411, |
|
"learning_rate": 4.7670306238741216e-05, |
|
"loss": 1.4791, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 1.3922005999538496, |
|
"grad_norm": 1.1840436458587646, |
|
"learning_rate": 4.74129627587221e-05, |
|
"loss": 1.4021, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.396046457964772, |
|
"grad_norm": 1.7883968353271484, |
|
"learning_rate": 4.715561927870298e-05, |
|
"loss": 1.4637, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 1.3998923159756942, |
|
"grad_norm": 1.2177505493164062, |
|
"learning_rate": 4.689827579868387e-05, |
|
"loss": 1.5123, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.4037381739866164, |
|
"grad_norm": 1.439232349395752, |
|
"learning_rate": 4.6640932318664756e-05, |
|
"loss": 1.4579, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 1.4037381739866164, |
|
"eval_loss": 1.4953014850616455, |
|
"eval_runtime": 17.9127, |
|
"eval_samples_per_second": 55.826, |
|
"eval_steps_per_second": 13.957, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 1.4075840319975388, |
|
"grad_norm": 2.0796408653259277, |
|
"learning_rate": 4.638358883864563e-05, |
|
"loss": 1.4295, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.4114298900084608, |
|
"grad_norm": 1.3032926321029663, |
|
"learning_rate": 4.6126245358626515e-05, |
|
"loss": 1.4733, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 1.4152757480193832, |
|
"grad_norm": 0.9058660864830017, |
|
"learning_rate": 4.58689018786074e-05, |
|
"loss": 1.4446, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.4191216060303053, |
|
"grad_norm": 2.05460786819458, |
|
"learning_rate": 4.561155839858828e-05, |
|
"loss": 1.4133, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 1.4229674640412275, |
|
"grad_norm": 0.8309249877929688, |
|
"learning_rate": 4.535421491856917e-05, |
|
"loss": 1.456, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.4229674640412275, |
|
"eval_loss": 1.480312466621399, |
|
"eval_runtime": 18.2137, |
|
"eval_samples_per_second": 54.904, |
|
"eval_steps_per_second": 13.726, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.42681332205215, |
|
"grad_norm": 1.0496591329574585, |
|
"learning_rate": 4.5096871438550054e-05, |
|
"loss": 1.3723, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 1.4306591800630721, |
|
"grad_norm": 1.273758053779602, |
|
"learning_rate": 4.483952795853093e-05, |
|
"loss": 1.4747, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.4345050380739943, |
|
"grad_norm": 1.3594483137130737, |
|
"learning_rate": 4.458218447851181e-05, |
|
"loss": 1.564, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 1.4383508960849165, |
|
"grad_norm": 1.773634672164917, |
|
"learning_rate": 4.43248409984927e-05, |
|
"loss": 1.4344, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.4421967540958387, |
|
"grad_norm": 0.7939924001693726, |
|
"learning_rate": 4.406749751847358e-05, |
|
"loss": 1.3798, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 1.4421967540958387, |
|
"eval_loss": 1.4680087566375732, |
|
"eval_runtime": 18.0287, |
|
"eval_samples_per_second": 55.467, |
|
"eval_steps_per_second": 13.867, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 1.446042612106761, |
|
"grad_norm": 1.4785016775131226, |
|
"learning_rate": 4.3810154038454466e-05, |
|
"loss": 1.5316, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.4498884701176833, |
|
"grad_norm": 2.1929142475128174, |
|
"learning_rate": 4.355281055843535e-05, |
|
"loss": 1.4498, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 1.4537343281286055, |
|
"grad_norm": 1.816432237625122, |
|
"learning_rate": 4.3295467078416225e-05, |
|
"loss": 1.5089, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.4575801861395277, |
|
"grad_norm": 2.589778423309326, |
|
"learning_rate": 4.303812359839711e-05, |
|
"loss": 1.4011, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 1.4614260441504499, |
|
"grad_norm": 1.6828664541244507, |
|
"learning_rate": 4.2780780118378e-05, |
|
"loss": 1.3803, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.4614260441504499, |
|
"eval_loss": 1.4737956523895264, |
|
"eval_runtime": 17.9628, |
|
"eval_samples_per_second": 55.67, |
|
"eval_steps_per_second": 13.918, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.4652719021613723, |
|
"grad_norm": 1.3094508647918701, |
|
"learning_rate": 4.252343663835888e-05, |
|
"loss": 1.4726, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 1.4691177601722945, |
|
"grad_norm": 2.1354212760925293, |
|
"learning_rate": 4.2266093158339764e-05, |
|
"loss": 1.4343, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.4729636181832166, |
|
"grad_norm": 1.395593523979187, |
|
"learning_rate": 4.200874967832065e-05, |
|
"loss": 1.4834, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 1.476809476194139, |
|
"grad_norm": 0.8917800784111023, |
|
"learning_rate": 4.1751406198301524e-05, |
|
"loss": 1.4625, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.480655334205061, |
|
"grad_norm": 2.179772138595581, |
|
"learning_rate": 4.149406271828241e-05, |
|
"loss": 1.4832, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 1.480655334205061, |
|
"eval_loss": 1.480191946029663, |
|
"eval_runtime": 17.952, |
|
"eval_samples_per_second": 55.704, |
|
"eval_steps_per_second": 13.926, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 1.4845011922159834, |
|
"grad_norm": 1.3308861255645752, |
|
"learning_rate": 4.12367192382633e-05, |
|
"loss": 1.4555, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.4883470502269056, |
|
"grad_norm": 1.6867352724075317, |
|
"learning_rate": 4.0979375758244176e-05, |
|
"loss": 1.4116, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 1.4921929082378278, |
|
"grad_norm": 2.161247491836548, |
|
"learning_rate": 4.072203227822506e-05, |
|
"loss": 1.4262, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.4960387662487502, |
|
"grad_norm": 1.717690110206604, |
|
"learning_rate": 4.046468879820595e-05, |
|
"loss": 1.3896, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 1.4998846242596724, |
|
"grad_norm": 1.0118234157562256, |
|
"learning_rate": 4.020734531818682e-05, |
|
"loss": 1.4503, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.4998846242596724, |
|
"eval_loss": 1.478628396987915, |
|
"eval_runtime": 18.0209, |
|
"eval_samples_per_second": 55.491, |
|
"eval_steps_per_second": 13.873, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.5037304822705946, |
|
"grad_norm": 0.8779070377349854, |
|
"learning_rate": 3.995000183816771e-05, |
|
"loss": 1.3728, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 1.5075763402815168, |
|
"grad_norm": 1.6068123579025269, |
|
"learning_rate": 3.9692658358148595e-05, |
|
"loss": 1.5204, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.511422198292439, |
|
"grad_norm": 1.7712832689285278, |
|
"learning_rate": 3.9435314878129475e-05, |
|
"loss": 1.514, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 1.5152680563033614, |
|
"grad_norm": 1.2519572973251343, |
|
"learning_rate": 3.917797139811036e-05, |
|
"loss": 1.3953, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.5191139143142836, |
|
"grad_norm": 1.5644786357879639, |
|
"learning_rate": 3.892062791809125e-05, |
|
"loss": 1.4772, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 1.5191139143142836, |
|
"eval_loss": 1.4710900783538818, |
|
"eval_runtime": 18.4205, |
|
"eval_samples_per_second": 54.287, |
|
"eval_steps_per_second": 13.572, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 1.5229597723252057, |
|
"grad_norm": 1.6755670309066772, |
|
"learning_rate": 3.866328443807212e-05, |
|
"loss": 1.4148, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.5268056303361282, |
|
"grad_norm": 1.7168843746185303, |
|
"learning_rate": 3.840594095805301e-05, |
|
"loss": 1.4211, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 1.5306514883470501, |
|
"grad_norm": 1.5205817222595215, |
|
"learning_rate": 3.8148597478033894e-05, |
|
"loss": 1.4663, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.5344973463579725, |
|
"grad_norm": 1.608231544494629, |
|
"learning_rate": 3.789125399801477e-05, |
|
"loss": 1.3634, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 1.5383432043688947, |
|
"grad_norm": 1.5260729789733887, |
|
"learning_rate": 3.763391051799566e-05, |
|
"loss": 1.4114, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.5383432043688947, |
|
"eval_loss": 1.4733539819717407, |
|
"eval_runtime": 18.105, |
|
"eval_samples_per_second": 55.233, |
|
"eval_steps_per_second": 13.808, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.542189062379817, |
|
"grad_norm": 1.4523636102676392, |
|
"learning_rate": 3.7376567037976546e-05, |
|
"loss": 1.4538, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 1.5460349203907393, |
|
"grad_norm": 1.854066252708435, |
|
"learning_rate": 3.7119223557957426e-05, |
|
"loss": 1.4532, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.5498807784016613, |
|
"grad_norm": 1.8892920017242432, |
|
"learning_rate": 3.6861880077938306e-05, |
|
"loss": 1.4301, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 1.5537266364125837, |
|
"grad_norm": 1.2957504987716675, |
|
"learning_rate": 3.6609683467519574e-05, |
|
"loss": 1.4613, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.5575724944235059, |
|
"grad_norm": 1.9040348529815674, |
|
"learning_rate": 3.635233998750046e-05, |
|
"loss": 1.3847, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 1.5575724944235059, |
|
"eval_loss": 1.4672300815582275, |
|
"eval_runtime": 17.9888, |
|
"eval_samples_per_second": 55.59, |
|
"eval_steps_per_second": 13.898, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 1.561418352434428, |
|
"grad_norm": 1.4990596771240234, |
|
"learning_rate": 3.609499650748134e-05, |
|
"loss": 1.4243, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.5652642104453505, |
|
"grad_norm": 2.344515562057495, |
|
"learning_rate": 3.583765302746222e-05, |
|
"loss": 1.4971, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 1.5691100684562724, |
|
"grad_norm": 2.2836570739746094, |
|
"learning_rate": 3.5580309547443106e-05, |
|
"loss": 1.4641, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.5729559264671948, |
|
"grad_norm": 1.0165778398513794, |
|
"learning_rate": 3.5322966067423986e-05, |
|
"loss": 1.4268, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 1.576801784478117, |
|
"grad_norm": 0.5663600564002991, |
|
"learning_rate": 3.506562258740487e-05, |
|
"loss": 1.3487, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.576801784478117, |
|
"eval_loss": 1.4733059406280518, |
|
"eval_runtime": 18.0399, |
|
"eval_samples_per_second": 55.433, |
|
"eval_steps_per_second": 13.858, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.5806476424890392, |
|
"grad_norm": 1.36208176612854, |
|
"learning_rate": 3.480827910738575e-05, |
|
"loss": 1.3615, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 1.5844935004999616, |
|
"grad_norm": 1.6889315843582153, |
|
"learning_rate": 3.455093562736664e-05, |
|
"loss": 1.4174, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.5883393585108838, |
|
"grad_norm": 1.2735401391983032, |
|
"learning_rate": 3.429359214734752e-05, |
|
"loss": 1.4482, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 1.592185216521806, |
|
"grad_norm": 1.668188452720642, |
|
"learning_rate": 3.4036248667328405e-05, |
|
"loss": 1.4193, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.5960310745327284, |
|
"grad_norm": 1.8626503944396973, |
|
"learning_rate": 3.3778905187309284e-05, |
|
"loss": 1.4477, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 1.5960310745327284, |
|
"eval_loss": 1.4779850244522095, |
|
"eval_runtime": 18.0373, |
|
"eval_samples_per_second": 55.441, |
|
"eval_steps_per_second": 13.86, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 1.5998769325436504, |
|
"grad_norm": 1.2189550399780273, |
|
"learning_rate": 3.352156170729017e-05, |
|
"loss": 1.5325, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.6037227905545728, |
|
"grad_norm": 2.126854658126831, |
|
"learning_rate": 3.326421822727105e-05, |
|
"loss": 1.5096, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 1.607568648565495, |
|
"grad_norm": 1.7529182434082031, |
|
"learning_rate": 3.300687474725194e-05, |
|
"loss": 1.4629, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.6114145065764172, |
|
"grad_norm": 2.2533035278320312, |
|
"learning_rate": 3.2749531267232824e-05, |
|
"loss": 1.4266, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 1.6152603645873396, |
|
"grad_norm": 1.6632803678512573, |
|
"learning_rate": 3.24921877872137e-05, |
|
"loss": 1.5018, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.6152603645873396, |
|
"eval_loss": 1.467063307762146, |
|
"eval_runtime": 18.0767, |
|
"eval_samples_per_second": 55.32, |
|
"eval_steps_per_second": 13.83, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.6191062225982615, |
|
"grad_norm": 2.016814708709717, |
|
"learning_rate": 3.223484430719458e-05, |
|
"loss": 1.434, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 1.622952080609184, |
|
"grad_norm": 1.5766371488571167, |
|
"learning_rate": 3.197750082717547e-05, |
|
"loss": 1.4249, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.6267979386201061, |
|
"grad_norm": 2.3865230083465576, |
|
"learning_rate": 3.172015734715635e-05, |
|
"loss": 1.6, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 1.6306437966310283, |
|
"grad_norm": 1.193731427192688, |
|
"learning_rate": 3.1462813867137236e-05, |
|
"loss": 1.5674, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.6344896546419507, |
|
"grad_norm": 1.4854563474655151, |
|
"learning_rate": 3.120547038711812e-05, |
|
"loss": 1.4788, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 1.6344896546419507, |
|
"eval_loss": 1.4725981950759888, |
|
"eval_runtime": 18.2185, |
|
"eval_samples_per_second": 54.889, |
|
"eval_steps_per_second": 13.722, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 1.6383355126528727, |
|
"grad_norm": 1.3907707929611206, |
|
"learning_rate": 3.0948126907099e-05, |
|
"loss": 1.4752, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.642181370663795, |
|
"grad_norm": 1.5267348289489746, |
|
"learning_rate": 3.069078342707988e-05, |
|
"loss": 1.4198, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 1.6460272286747173, |
|
"grad_norm": 1.2138367891311646, |
|
"learning_rate": 3.0433439947060768e-05, |
|
"loss": 1.4302, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.6498730866856395, |
|
"grad_norm": 1.3399436473846436, |
|
"learning_rate": 3.017609646704165e-05, |
|
"loss": 1.5098, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 1.6537189446965619, |
|
"grad_norm": 1.543906569480896, |
|
"learning_rate": 2.991875298702253e-05, |
|
"loss": 1.4577, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.6537189446965619, |
|
"eval_loss": 1.475114345550537, |
|
"eval_runtime": 18.0585, |
|
"eval_samples_per_second": 55.376, |
|
"eval_steps_per_second": 13.844, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.657564802707484, |
|
"grad_norm": 1.2780442237854004, |
|
"learning_rate": 2.9661409507003417e-05, |
|
"loss": 1.5179, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 1.6614106607184063, |
|
"grad_norm": 1.206725835800171, |
|
"learning_rate": 2.94040660269843e-05, |
|
"loss": 1.4438, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.6652565187293287, |
|
"grad_norm": 2.1834638118743896, |
|
"learning_rate": 2.914672254696518e-05, |
|
"loss": 1.4783, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 1.6691023767402506, |
|
"grad_norm": 1.5568137168884277, |
|
"learning_rate": 2.8889379066946066e-05, |
|
"loss": 1.38, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.672948234751173, |
|
"grad_norm": 1.6938014030456543, |
|
"learning_rate": 2.863203558692695e-05, |
|
"loss": 1.3754, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 1.672948234751173, |
|
"eval_loss": 1.466833472251892, |
|
"eval_runtime": 18.1069, |
|
"eval_samples_per_second": 55.228, |
|
"eval_steps_per_second": 13.807, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 1.6767940927620952, |
|
"grad_norm": 1.3192166090011597, |
|
"learning_rate": 2.837469210690783e-05, |
|
"loss": 1.4388, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.6806399507730174, |
|
"grad_norm": 2.0135934352874756, |
|
"learning_rate": 2.8117348626888716e-05, |
|
"loss": 1.429, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 1.6844858087839398, |
|
"grad_norm": 1.4457674026489258, |
|
"learning_rate": 2.78600051468696e-05, |
|
"loss": 1.5154, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.6883316667948618, |
|
"grad_norm": 1.225411295890808, |
|
"learning_rate": 2.760266166685048e-05, |
|
"loss": 1.4658, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 1.6921775248057842, |
|
"grad_norm": 1.8256678581237793, |
|
"learning_rate": 2.7345318186831365e-05, |
|
"loss": 1.5004, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.6921775248057842, |
|
"eval_loss": 1.4664525985717773, |
|
"eval_runtime": 18.0331, |
|
"eval_samples_per_second": 55.454, |
|
"eval_steps_per_second": 13.863, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.6960233828167064, |
|
"grad_norm": 0.8262001276016235, |
|
"learning_rate": 2.7087974706812248e-05, |
|
"loss": 1.4304, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 1.6998692408276286, |
|
"grad_norm": 1.6224443912506104, |
|
"learning_rate": 2.6830631226793128e-05, |
|
"loss": 1.4127, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.703715098838551, |
|
"grad_norm": 1.3338160514831543, |
|
"learning_rate": 2.6573287746774014e-05, |
|
"loss": 1.4842, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 1.707560956849473, |
|
"grad_norm": 1.940238356590271, |
|
"learning_rate": 2.6315944266754897e-05, |
|
"loss": 1.4279, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.7114068148603954, |
|
"grad_norm": 2.091132164001465, |
|
"learning_rate": 2.6058600786735777e-05, |
|
"loss": 1.3779, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 1.7114068148603954, |
|
"eval_loss": 1.457463264465332, |
|
"eval_runtime": 18.1835, |
|
"eval_samples_per_second": 54.995, |
|
"eval_steps_per_second": 13.749, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 1.7152526728713176, |
|
"grad_norm": 1.4367913007736206, |
|
"learning_rate": 2.5801257306716663e-05, |
|
"loss": 1.4821, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.7190985308822397, |
|
"grad_norm": 1.9735435247421265, |
|
"learning_rate": 2.5543913826697546e-05, |
|
"loss": 1.3754, |
|
"step": 22350 |
|
}, |
|
{ |
|
"epoch": 1.7229443888931621, |
|
"grad_norm": 1.4968055486679077, |
|
"learning_rate": 2.5286570346678426e-05, |
|
"loss": 1.4045, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.7267902469040843, |
|
"grad_norm": 1.0449949502944946, |
|
"learning_rate": 2.5029226866659312e-05, |
|
"loss": 1.4458, |
|
"step": 22450 |
|
}, |
|
{ |
|
"epoch": 1.7306361049150065, |
|
"grad_norm": 1.164890170097351, |
|
"learning_rate": 2.4771883386640196e-05, |
|
"loss": 1.4407, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.7306361049150065, |
|
"eval_loss": 1.4607012271881104, |
|
"eval_runtime": 18.2079, |
|
"eval_samples_per_second": 54.921, |
|
"eval_steps_per_second": 13.73, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.734481962925929, |
|
"grad_norm": 0.9285104870796204, |
|
"learning_rate": 2.4514539906621075e-05, |
|
"loss": 1.4243, |
|
"step": 22550 |
|
}, |
|
{ |
|
"epoch": 1.738327820936851, |
|
"grad_norm": 1.2848355770111084, |
|
"learning_rate": 2.4257196426601962e-05, |
|
"loss": 1.4596, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.7421736789477733, |
|
"grad_norm": 1.4614371061325073, |
|
"learning_rate": 2.3999852946582845e-05, |
|
"loss": 1.3918, |
|
"step": 22650 |
|
}, |
|
{ |
|
"epoch": 1.7460195369586955, |
|
"grad_norm": 0.9543781876564026, |
|
"learning_rate": 2.3742509466563724e-05, |
|
"loss": 1.4044, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.7498653949696177, |
|
"grad_norm": 1.602250099182129, |
|
"learning_rate": 2.348516598654461e-05, |
|
"loss": 1.4607, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 1.7498653949696177, |
|
"eval_loss": 1.4677520990371704, |
|
"eval_runtime": 18.158, |
|
"eval_samples_per_second": 55.072, |
|
"eval_steps_per_second": 13.768, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 1.75371125298054, |
|
"grad_norm": 1.1664291620254517, |
|
"learning_rate": 2.3227822506525494e-05, |
|
"loss": 1.5153, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.757557110991462, |
|
"grad_norm": 1.472679853439331, |
|
"learning_rate": 2.2970479026506374e-05, |
|
"loss": 1.4774, |
|
"step": 22850 |
|
}, |
|
{ |
|
"epoch": 1.7614029690023845, |
|
"grad_norm": 1.7927029132843018, |
|
"learning_rate": 2.271313554648726e-05, |
|
"loss": 1.4551, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.7652488270133067, |
|
"grad_norm": 2.9085824489593506, |
|
"learning_rate": 2.2455792066468143e-05, |
|
"loss": 1.4474, |
|
"step": 22950 |
|
}, |
|
{ |
|
"epoch": 1.7690946850242288, |
|
"grad_norm": 1.8322957754135132, |
|
"learning_rate": 2.2198448586449026e-05, |
|
"loss": 1.4642, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.7690946850242288, |
|
"eval_loss": 1.4676103591918945, |
|
"eval_runtime": 17.9158, |
|
"eval_samples_per_second": 55.817, |
|
"eval_steps_per_second": 13.954, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.7729405430351513, |
|
"grad_norm": 0.7428656220436096, |
|
"learning_rate": 2.194110510642991e-05, |
|
"loss": 1.4475, |
|
"step": 23050 |
|
}, |
|
{ |
|
"epoch": 1.7767864010460732, |
|
"grad_norm": 1.4552706480026245, |
|
"learning_rate": 2.1683761626410793e-05, |
|
"loss": 1.517, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.7806322590569956, |
|
"grad_norm": 1.1563323736190796, |
|
"learning_rate": 2.1426418146391676e-05, |
|
"loss": 1.4806, |
|
"step": 23150 |
|
}, |
|
{ |
|
"epoch": 1.7844781170679178, |
|
"grad_norm": 1.7244662046432495, |
|
"learning_rate": 2.116907466637256e-05, |
|
"loss": 1.4492, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.78832397507884, |
|
"grad_norm": 1.642321228981018, |
|
"learning_rate": 2.0911731186353442e-05, |
|
"loss": 1.4196, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 1.78832397507884, |
|
"eval_loss": 1.4725000858306885, |
|
"eval_runtime": 18.1814, |
|
"eval_samples_per_second": 55.001, |
|
"eval_steps_per_second": 13.75, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 1.7921698330897624, |
|
"grad_norm": 1.1381646394729614, |
|
"learning_rate": 2.0654387706334325e-05, |
|
"loss": 1.4653, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.7960156911006846, |
|
"grad_norm": 1.2550010681152344, |
|
"learning_rate": 2.0397044226315208e-05, |
|
"loss": 1.4836, |
|
"step": 23350 |
|
}, |
|
{ |
|
"epoch": 1.7998615491116068, |
|
"grad_norm": 1.4335628747940063, |
|
"learning_rate": 2.013970074629609e-05, |
|
"loss": 1.4403, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.8037074071225292, |
|
"grad_norm": 1.8901276588439941, |
|
"learning_rate": 1.9882357266276974e-05, |
|
"loss": 1.4562, |
|
"step": 23450 |
|
}, |
|
{ |
|
"epoch": 1.8075532651334512, |
|
"grad_norm": 1.2078189849853516, |
|
"learning_rate": 1.9625013786257857e-05, |
|
"loss": 1.4221, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.8075532651334512, |
|
"eval_loss": 1.4660383462905884, |
|
"eval_runtime": 18.0656, |
|
"eval_samples_per_second": 55.354, |
|
"eval_steps_per_second": 13.838, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.8113991231443736, |
|
"grad_norm": 1.6915593147277832, |
|
"learning_rate": 1.936767030623874e-05, |
|
"loss": 1.4296, |
|
"step": 23550 |
|
}, |
|
{ |
|
"epoch": 1.8152449811552958, |
|
"grad_norm": 1.9247820377349854, |
|
"learning_rate": 1.9110326826219623e-05, |
|
"loss": 1.4513, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.819090839166218, |
|
"grad_norm": 2.794621229171753, |
|
"learning_rate": 1.8852983346200506e-05, |
|
"loss": 1.4381, |
|
"step": 23650 |
|
}, |
|
{ |
|
"epoch": 1.8229366971771404, |
|
"grad_norm": 1.3829151391983032, |
|
"learning_rate": 1.859563986618139e-05, |
|
"loss": 1.4344, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.8267825551880623, |
|
"grad_norm": 1.8067855834960938, |
|
"learning_rate": 1.8338296386162273e-05, |
|
"loss": 1.4337, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 1.8267825551880623, |
|
"eval_loss": 1.4543312788009644, |
|
"eval_runtime": 18.2116, |
|
"eval_samples_per_second": 54.91, |
|
"eval_steps_per_second": 13.727, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 1.8306284131989847, |
|
"grad_norm": 1.829542875289917, |
|
"learning_rate": 1.8080952906143156e-05, |
|
"loss": 1.3986, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.834474271209907, |
|
"grad_norm": 1.8767279386520386, |
|
"learning_rate": 1.782360942612404e-05, |
|
"loss": 1.4873, |
|
"step": 23850 |
|
}, |
|
{ |
|
"epoch": 1.838320129220829, |
|
"grad_norm": 0.9735344052314758, |
|
"learning_rate": 1.7566265946104922e-05, |
|
"loss": 1.4105, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.8421659872317515, |
|
"grad_norm": 1.5424654483795166, |
|
"learning_rate": 1.7308922466085805e-05, |
|
"loss": 1.4357, |
|
"step": 23950 |
|
}, |
|
{ |
|
"epoch": 1.8460118452426735, |
|
"grad_norm": 0.9316624999046326, |
|
"learning_rate": 1.7051578986066688e-05, |
|
"loss": 1.4616, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.8460118452426735, |
|
"eval_loss": 1.4611330032348633, |
|
"eval_runtime": 18.0655, |
|
"eval_samples_per_second": 55.354, |
|
"eval_steps_per_second": 13.839, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.8498577032535959, |
|
"grad_norm": 1.3933135271072388, |
|
"learning_rate": 1.679423550604757e-05, |
|
"loss": 1.45, |
|
"step": 24050 |
|
}, |
|
{ |
|
"epoch": 1.853703561264518, |
|
"grad_norm": 1.1157580614089966, |
|
"learning_rate": 1.6536892026028454e-05, |
|
"loss": 1.4916, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.8575494192754403, |
|
"grad_norm": 1.7401970624923706, |
|
"learning_rate": 1.6279548546009337e-05, |
|
"loss": 1.4563, |
|
"step": 24150 |
|
}, |
|
{ |
|
"epoch": 1.8613952772863627, |
|
"grad_norm": 1.4699925184249878, |
|
"learning_rate": 1.602220506599022e-05, |
|
"loss": 1.4211, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.8652411352972849, |
|
"grad_norm": 1.1760289669036865, |
|
"learning_rate": 1.5764861585971103e-05, |
|
"loss": 1.4212, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 1.8652411352972849, |
|
"eval_loss": 1.460072636604309, |
|
"eval_runtime": 17.8176, |
|
"eval_samples_per_second": 56.124, |
|
"eval_steps_per_second": 14.031, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 1.869086993308207, |
|
"grad_norm": 1.8243287801742554, |
|
"learning_rate": 1.5507518105951986e-05, |
|
"loss": 1.4594, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.8729328513191295, |
|
"grad_norm": 0.8821312785148621, |
|
"learning_rate": 1.5250174625932868e-05, |
|
"loss": 1.3837, |
|
"step": 24350 |
|
}, |
|
{ |
|
"epoch": 1.8767787093300514, |
|
"grad_norm": 1.673240065574646, |
|
"learning_rate": 1.4992831145913753e-05, |
|
"loss": 1.395, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.8806245673409738, |
|
"grad_norm": 1.4853135347366333, |
|
"learning_rate": 1.4735487665894636e-05, |
|
"loss": 1.5031, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 1.884470425351896, |
|
"grad_norm": 2.507054567337036, |
|
"learning_rate": 1.4478144185875517e-05, |
|
"loss": 1.3909, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.884470425351896, |
|
"eval_loss": 1.4431298971176147, |
|
"eval_runtime": 17.9815, |
|
"eval_samples_per_second": 55.613, |
|
"eval_steps_per_second": 13.903, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.8883162833628182, |
|
"grad_norm": 1.8027464151382446, |
|
"learning_rate": 1.4220800705856402e-05, |
|
"loss": 1.4855, |
|
"step": 24550 |
|
}, |
|
{ |
|
"epoch": 1.8921621413737406, |
|
"grad_norm": 1.139756679534912, |
|
"learning_rate": 1.3963457225837285e-05, |
|
"loss": 1.3773, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.8960079993846626, |
|
"grad_norm": 1.377536654472351, |
|
"learning_rate": 1.3706113745818166e-05, |
|
"loss": 1.4274, |
|
"step": 24650 |
|
}, |
|
{ |
|
"epoch": 1.899853857395585, |
|
"grad_norm": 1.2132219076156616, |
|
"learning_rate": 1.3448770265799051e-05, |
|
"loss": 1.3772, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.9036997154065072, |
|
"grad_norm": 1.7106857299804688, |
|
"learning_rate": 1.3191426785779932e-05, |
|
"loss": 1.41, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 1.9036997154065072, |
|
"eval_loss": 1.472328782081604, |
|
"eval_runtime": 18.0789, |
|
"eval_samples_per_second": 55.313, |
|
"eval_steps_per_second": 13.828, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 1.9075455734174294, |
|
"grad_norm": 0.9809736013412476, |
|
"learning_rate": 1.2939230175361197e-05, |
|
"loss": 1.4547, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.9113914314283518, |
|
"grad_norm": 1.476722240447998, |
|
"learning_rate": 1.2681886695342082e-05, |
|
"loss": 1.4546, |
|
"step": 24850 |
|
}, |
|
{ |
|
"epoch": 1.9152372894392737, |
|
"grad_norm": 2.078511953353882, |
|
"learning_rate": 1.2424543215322965e-05, |
|
"loss": 1.4971, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 1.9190831474501961, |
|
"grad_norm": 0.7233028411865234, |
|
"learning_rate": 1.2167199735303847e-05, |
|
"loss": 1.3622, |
|
"step": 24950 |
|
}, |
|
{ |
|
"epoch": 1.9229290054611183, |
|
"grad_norm": 1.3686310052871704, |
|
"learning_rate": 1.1909856255284731e-05, |
|
"loss": 1.5232, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.9229290054611183, |
|
"eval_loss": 1.461082935333252, |
|
"eval_runtime": 18.2695, |
|
"eval_samples_per_second": 54.736, |
|
"eval_steps_per_second": 13.684, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.9267748634720405, |
|
"grad_norm": 1.1179672479629517, |
|
"learning_rate": 1.1652512775265614e-05, |
|
"loss": 1.5076, |
|
"step": 25050 |
|
}, |
|
{ |
|
"epoch": 1.930620721482963, |
|
"grad_norm": 0.9407248497009277, |
|
"learning_rate": 1.1395169295246496e-05, |
|
"loss": 1.468, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 1.9344665794938851, |
|
"grad_norm": 1.498488426208496, |
|
"learning_rate": 1.113782581522738e-05, |
|
"loss": 1.4566, |
|
"step": 25150 |
|
}, |
|
{ |
|
"epoch": 1.9383124375048073, |
|
"grad_norm": 0.6983101963996887, |
|
"learning_rate": 1.0880482335208264e-05, |
|
"loss": 1.4621, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.9421582955157297, |
|
"grad_norm": 1.954953908920288, |
|
"learning_rate": 1.0623138855189145e-05, |
|
"loss": 1.417, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 1.9421582955157297, |
|
"eval_loss": 1.4591727256774902, |
|
"eval_runtime": 18.0732, |
|
"eval_samples_per_second": 55.331, |
|
"eval_steps_per_second": 13.833, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 1.9460041535266517, |
|
"grad_norm": 1.6467170715332031, |
|
"learning_rate": 1.036579537517003e-05, |
|
"loss": 1.4942, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 1.949850011537574, |
|
"grad_norm": 1.4509849548339844, |
|
"learning_rate": 1.0108451895150913e-05, |
|
"loss": 1.4539, |
|
"step": 25350 |
|
}, |
|
{ |
|
"epoch": 1.9536958695484963, |
|
"grad_norm": 1.6131352186203003, |
|
"learning_rate": 9.851108415131796e-06, |
|
"loss": 1.3993, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 1.9575417275594185, |
|
"grad_norm": 1.880043387413025, |
|
"learning_rate": 9.593764935112679e-06, |
|
"loss": 1.4449, |
|
"step": 25450 |
|
}, |
|
{ |
|
"epoch": 1.9613875855703409, |
|
"grad_norm": 1.3041406869888306, |
|
"learning_rate": 9.336421455093562e-06, |
|
"loss": 1.4918, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.9613875855703409, |
|
"eval_loss": 1.4548134803771973, |
|
"eval_runtime": 18.0544, |
|
"eval_samples_per_second": 55.388, |
|
"eval_steps_per_second": 13.847, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.9652334435812628, |
|
"grad_norm": 1.8318700790405273, |
|
"learning_rate": 9.079077975074445e-06, |
|
"loss": 1.42, |
|
"step": 25550 |
|
}, |
|
{ |
|
"epoch": 1.9690793015921852, |
|
"grad_norm": 1.7966841459274292, |
|
"learning_rate": 8.821734495055328e-06, |
|
"loss": 1.3236, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.9729251596031074, |
|
"grad_norm": 0.7579635977745056, |
|
"learning_rate": 8.564391015036211e-06, |
|
"loss": 1.3957, |
|
"step": 25650 |
|
}, |
|
{ |
|
"epoch": 1.9767710176140296, |
|
"grad_norm": 1.4515990018844604, |
|
"learning_rate": 8.307047535017094e-06, |
|
"loss": 1.3347, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 1.980616875624952, |
|
"grad_norm": 1.5671380758285522, |
|
"learning_rate": 8.049704054997977e-06, |
|
"loss": 1.4624, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 1.980616875624952, |
|
"eval_loss": 1.450337290763855, |
|
"eval_runtime": 17.9548, |
|
"eval_samples_per_second": 55.695, |
|
"eval_steps_per_second": 13.924, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 1.984462733635874, |
|
"grad_norm": 1.7020714282989502, |
|
"learning_rate": 7.79236057497886e-06, |
|
"loss": 1.3822, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 1.9883085916467964, |
|
"grad_norm": 1.297658920288086, |
|
"learning_rate": 7.535017094959743e-06, |
|
"loss": 1.4008, |
|
"step": 25850 |
|
}, |
|
{ |
|
"epoch": 1.9921544496577186, |
|
"grad_norm": 1.8151623010635376, |
|
"learning_rate": 7.277673614940627e-06, |
|
"loss": 1.4408, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 1.9960003076686408, |
|
"grad_norm": 0.8869682550430298, |
|
"learning_rate": 7.02033013492151e-06, |
|
"loss": 1.4767, |
|
"step": 25950 |
|
}, |
|
{ |
|
"epoch": 1.9998461656795632, |
|
"grad_norm": 1.898775339126587, |
|
"learning_rate": 6.762986654902392e-06, |
|
"loss": 1.5032, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.9998461656795632, |
|
"eval_loss": 1.4542045593261719, |
|
"eval_runtime": 18.0059, |
|
"eval_samples_per_second": 55.537, |
|
"eval_steps_per_second": 13.884, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.003692023690485, |
|
"grad_norm": 1.7356750965118408, |
|
"learning_rate": 6.505643174883276e-06, |
|
"loss": 1.3839, |
|
"step": 26050 |
|
}, |
|
{ |
|
"epoch": 2.0075378817014076, |
|
"grad_norm": 2.3067352771759033, |
|
"learning_rate": 6.248299694864159e-06, |
|
"loss": 1.4348, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 2.01138373971233, |
|
"grad_norm": 1.343248724937439, |
|
"learning_rate": 5.990956214845041e-06, |
|
"loss": 1.3703, |
|
"step": 26150 |
|
}, |
|
{ |
|
"epoch": 2.015229597723252, |
|
"grad_norm": 1.9424471855163574, |
|
"learning_rate": 5.733612734825925e-06, |
|
"loss": 1.4304, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 2.0190754557341744, |
|
"grad_norm": 1.5383673906326294, |
|
"learning_rate": 5.476269254806808e-06, |
|
"loss": 1.4118, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 2.0190754557341744, |
|
"eval_loss": 1.474881649017334, |
|
"eval_runtime": 18.1751, |
|
"eval_samples_per_second": 55.02, |
|
"eval_steps_per_second": 13.755, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 2.0229213137450963, |
|
"grad_norm": 1.803488850593567, |
|
"learning_rate": 5.2189257747876905e-06, |
|
"loss": 1.4537, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 2.0267671717560187, |
|
"grad_norm": 1.8623336553573608, |
|
"learning_rate": 4.961582294768574e-06, |
|
"loss": 1.3659, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 2.030613029766941, |
|
"grad_norm": 1.1901572942733765, |
|
"learning_rate": 4.7042388147494575e-06, |
|
"loss": 1.4175, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 2.034458887777863, |
|
"grad_norm": 1.2967520952224731, |
|
"learning_rate": 4.4468953347303406e-06, |
|
"loss": 1.458, |
|
"step": 26450 |
|
}, |
|
{ |
|
"epoch": 2.0383047457887855, |
|
"grad_norm": 1.2987436056137085, |
|
"learning_rate": 4.189551854711224e-06, |
|
"loss": 1.3965, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.0383047457887855, |
|
"eval_loss": 1.4528058767318726, |
|
"eval_runtime": 18.2495, |
|
"eval_samples_per_second": 54.796, |
|
"eval_steps_per_second": 13.699, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.042150603799708, |
|
"grad_norm": 1.0049172639846802, |
|
"learning_rate": 3.932208374692107e-06, |
|
"loss": 1.3012, |
|
"step": 26550 |
|
}, |
|
{ |
|
"epoch": 2.04599646181063, |
|
"grad_norm": 1.193533182144165, |
|
"learning_rate": 3.6748648946729894e-06, |
|
"loss": 1.4038, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 2.0498423198215523, |
|
"grad_norm": 1.6459178924560547, |
|
"learning_rate": 3.417521414653873e-06, |
|
"loss": 1.4089, |
|
"step": 26650 |
|
}, |
|
{ |
|
"epoch": 2.0536881778324743, |
|
"grad_norm": 0.546062171459198, |
|
"learning_rate": 3.160177934634756e-06, |
|
"loss": 1.3675, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 2.0575340358433967, |
|
"grad_norm": 1.7894645929336548, |
|
"learning_rate": 2.9028344546156386e-06, |
|
"loss": 1.4585, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 2.0575340358433967, |
|
"eval_loss": 1.460014820098877, |
|
"eval_runtime": 18.2356, |
|
"eval_samples_per_second": 54.838, |
|
"eval_steps_per_second": 13.709, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 2.061379893854319, |
|
"grad_norm": 1.1368170976638794, |
|
"learning_rate": 2.645490974596522e-06, |
|
"loss": 1.4038, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 2.065225751865241, |
|
"grad_norm": 1.698556900024414, |
|
"learning_rate": 2.388147494577405e-06, |
|
"loss": 1.4592, |
|
"step": 26850 |
|
}, |
|
{ |
|
"epoch": 2.0690716098761635, |
|
"grad_norm": 1.3114346265792847, |
|
"learning_rate": 2.130804014558288e-06, |
|
"loss": 1.4566, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 2.0729174678870854, |
|
"grad_norm": 1.7974728345870972, |
|
"learning_rate": 1.8734605345391713e-06, |
|
"loss": 1.5074, |
|
"step": 26950 |
|
}, |
|
{ |
|
"epoch": 2.076763325898008, |
|
"grad_norm": 1.4648147821426392, |
|
"learning_rate": 1.6161170545200544e-06, |
|
"loss": 1.4478, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.076763325898008, |
|
"eval_loss": 1.4667593240737915, |
|
"eval_runtime": 18.1467, |
|
"eval_samples_per_second": 55.107, |
|
"eval_steps_per_second": 13.777, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.0806091839089302, |
|
"grad_norm": 0.9924139380455017, |
|
"learning_rate": 1.3587735745009373e-06, |
|
"loss": 1.5088, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 2.084455041919852, |
|
"grad_norm": 1.1177709102630615, |
|
"learning_rate": 1.1014300944818204e-06, |
|
"loss": 1.4285, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 2.0883008999307746, |
|
"grad_norm": 1.7112759351730347, |
|
"learning_rate": 8.440866144627034e-07, |
|
"loss": 1.433, |
|
"step": 27150 |
|
}, |
|
{ |
|
"epoch": 2.0921467579416966, |
|
"grad_norm": 1.9338856935501099, |
|
"learning_rate": 5.867431344435866e-07, |
|
"loss": 1.4008, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 2.095992615952619, |
|
"grad_norm": 3.0200393199920654, |
|
"learning_rate": 3.2939965442446964e-07, |
|
"loss": 1.4285, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 2.095992615952619, |
|
"eval_loss": 1.4686814546585083, |
|
"eval_runtime": 18.028, |
|
"eval_samples_per_second": 55.469, |
|
"eval_steps_per_second": 13.867, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 2.0998384739635414, |
|
"grad_norm": 1.5137439966201782, |
|
"learning_rate": 7.205617440535274e-08, |
|
"loss": 1.4596, |
|
"step": 27300 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 27301, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |