{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 9064,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0022067747986317995,
      "grad_norm": 15360.0,
      "learning_rate": 1.982378854625551e-07,
      "loss": 112.3522,
      "mean_token_accuracy": 0.3915941849350929,
      "num_tokens": 1158893.0,
      "step": 10
    },
    {
      "epoch": 0.004413549597263599,
      "grad_norm": 15040.0,
      "learning_rate": 4.1850220264317185e-07,
      "loss": 111.0737,
      "mean_token_accuracy": 0.3893237330019474,
      "num_tokens": 2293560.0,
      "step": 20
    },
    {
      "epoch": 0.006620324395895399,
      "grad_norm": 13440.0,
      "learning_rate": 6.387665198237886e-07,
      "loss": 94.6936,
      "mean_token_accuracy": 0.4719069264829159,
      "num_tokens": 3455157.0,
      "step": 30
    },
    {
      "epoch": 0.008827099194527198,
      "grad_norm": 10304.0,
      "learning_rate": 8.590308370044054e-07,
      "loss": 72.0728,
      "mean_token_accuracy": 0.4683383949100971,
      "num_tokens": 4588729.0,
      "step": 40
    },
    {
      "epoch": 0.011033873993158997,
      "grad_norm": 6080.0,
      "learning_rate": 1.0792951541850223e-06,
      "loss": 45.5343,
      "mean_token_accuracy": 0.6681212864816188,
      "num_tokens": 5734400.0,
      "step": 50
    },
    {
      "epoch": 0.013240648791790799,
      "grad_norm": 1752.0,
      "learning_rate": 1.299559471365639e-06,
      "loss": 17.5839,
      "mean_token_accuracy": 0.832381109893322,
      "num_tokens": 6882569.0,
      "step": 60
    },
    {
      "epoch": 0.015447423590422598,
      "grad_norm": 1088.0,
      "learning_rate": 1.5198237885462555e-06,
      "loss": 10.6062,
      "mean_token_accuracy": 0.872735770046711,
      "num_tokens": 8034141.0,
      "step": 70
    },
    {
      "epoch": 0.017654198389054396,
      "grad_norm": 544.0,
      "learning_rate": 1.7400881057268722e-06,
      "loss": 7.2489,
      "mean_token_accuracy": 0.9335811570286751,
      "num_tokens": 9185000.0,
      "step": 80
    },
    {
      "epoch": 0.019860973187686197,
      "grad_norm": 286.0,
      "learning_rate": 1.960352422907489e-06,
      "loss": 4.8974,
      "mean_token_accuracy": 0.9433379590511322,
      "num_tokens": 10337565.0,
      "step": 90
    },
    {
      "epoch": 0.022067747986317995,
      "grad_norm": 239.0,
      "learning_rate": 2.180616740088106e-06,
      "loss": 3.6981,
      "mean_token_accuracy": 0.9486727714538574,
      "num_tokens": 11472738.0,
      "step": 100
    },
    {
      "epoch": 0.024274522784949796,
      "grad_norm": 332.0,
      "learning_rate": 2.400881057268723e-06,
      "loss": 3.2606,
      "mean_token_accuracy": 0.9512267947196961,
      "num_tokens": 12612721.0,
      "step": 110
    },
    {
      "epoch": 0.026481297583581597,
      "grad_norm": 128.0,
      "learning_rate": 2.6211453744493394e-06,
      "loss": 3.2884,
      "mean_token_accuracy": 0.9522141054272651,
      "num_tokens": 13775315.0,
      "step": 120
    },
    {
      "epoch": 0.028688072382213395,
      "grad_norm": 219.0,
      "learning_rate": 2.841409691629956e-06,
      "loss": 3.1377,
      "mean_token_accuracy": 0.9537438541650772,
      "num_tokens": 14919796.0,
      "step": 130
    },
    {
      "epoch": 0.030894847180845196,
      "grad_norm": 58.75,
      "learning_rate": 3.061674008810573e-06,
      "loss": 2.8069,
      "mean_token_accuracy": 0.955996623635292,
      "num_tokens": 16076676.0,
      "step": 140
    },
    {
      "epoch": 0.033101621979477,
      "grad_norm": 137.0,
      "learning_rate": 3.2819383259911898e-06,
      "loss": 2.9774,
      "mean_token_accuracy": 0.9532946646213531,
      "num_tokens": 17220322.0,
      "step": 150
    },
    {
      "epoch": 0.03530839677810879,
      "grad_norm": 96.0,
      "learning_rate": 3.5022026431718063e-06,
      "loss": 2.9419,
      "mean_token_accuracy": 0.9567780137062073,
      "num_tokens": 18363831.0,
      "step": 160
    },
    {
      "epoch": 0.03751517157674059,
      "grad_norm": 96.5,
      "learning_rate": 3.7224669603524232e-06,
      "loss": 2.9692,
      "mean_token_accuracy": 0.9560762420296669,
      "num_tokens": 19523552.0,
      "step": 170
    },
    {
      "epoch": 0.039721946375372394,
      "grad_norm": 107.0,
      "learning_rate": 3.94273127753304e-06,
      "loss": 2.9879,
      "mean_token_accuracy": 0.9554866880178452,
      "num_tokens": 20677532.0,
      "step": 180
    },
    {
      "epoch": 0.041928721174004195,
      "grad_norm": 110.0,
      "learning_rate": 4.162995594713657e-06,
      "loss": 2.7148,
      "mean_token_accuracy": 0.96178168207407,
      "num_tokens": 21821335.0,
      "step": 190
    },
    {
      "epoch": 0.04413549597263599,
      "grad_norm": 209.0,
      "learning_rate": 4.383259911894274e-06,
      "loss": 2.9714,
      "mean_token_accuracy": 0.954236464202404,
      "num_tokens": 22972210.0,
      "step": 200
    },
    {
      "epoch": 0.04634227077126779,
      "grad_norm": 88.5,
      "learning_rate": 4.60352422907489e-06,
      "loss": 3.0126,
      "mean_token_accuracy": 0.9582175269722939,
      "num_tokens": 24131759.0,
      "step": 210
    },
    {
      "epoch": 0.04854904556989959,
      "grad_norm": 55.5,
      "learning_rate": 4.823788546255507e-06,
      "loss": 2.9006,
      "mean_token_accuracy": 0.9566345065832138,
      "num_tokens": 25279653.0,
      "step": 220
    },
    {
      "epoch": 0.05075582036853139,
      "grad_norm": 138.0,
      "learning_rate": 5.044052863436124e-06,
      "loss": 2.7452,
      "mean_token_accuracy": 0.9619189321994781,
      "num_tokens": 26429259.0,
      "step": 230
    },
    {
      "epoch": 0.052962595167163194,
      "grad_norm": 161.0,
      "learning_rate": 5.2643171806167406e-06,
      "loss": 2.7416,
      "mean_token_accuracy": 0.9591059356927871,
      "num_tokens": 27579200.0,
      "step": 240
    },
    {
      "epoch": 0.05516936996579499,
      "grad_norm": 55.5,
      "learning_rate": 5.484581497797358e-06,
      "loss": 2.7083,
      "mean_token_accuracy": 0.958624179661274,
      "num_tokens": 28742498.0,
      "step": 250
    },
    {
      "epoch": 0.05737614476442679,
      "grad_norm": 68.5,
      "learning_rate": 5.704845814977974e-06,
      "loss": 2.848,
      "mean_token_accuracy": 0.9569541916251183,
      "num_tokens": 29895308.0,
      "step": 260
    },
    {
      "epoch": 0.05958291956305859,
      "grad_norm": 76.5,
      "learning_rate": 5.925110132158591e-06,
      "loss": 2.8535,
      "mean_token_accuracy": 0.9561435878276825,
      "num_tokens": 31051444.0,
      "step": 270
    },
    {
      "epoch": 0.06178969436169039,
      "grad_norm": 179.0,
      "learning_rate": 6.1453744493392075e-06,
      "loss": 2.7223,
      "mean_token_accuracy": 0.9576126232743263,
      "num_tokens": 32212821.0,
      "step": 280
    },
    {
      "epoch": 0.06399646916032219,
      "grad_norm": 109.5,
      "learning_rate": 6.365638766519824e-06,
      "loss": 2.7515,
      "mean_token_accuracy": 0.9572753980755806,
      "num_tokens": 33359370.0,
      "step": 290
    },
    {
      "epoch": 0.066203243958954,
      "grad_norm": 114.0,
      "learning_rate": 6.585903083700441e-06,
      "loss": 2.9286,
      "mean_token_accuracy": 0.956420823931694,
      "num_tokens": 34491274.0,
      "step": 300
    },
    {
      "epoch": 0.06841001875758579,
      "grad_norm": 97.0,
      "learning_rate": 6.806167400881057e-06,
      "loss": 2.6391,
      "mean_token_accuracy": 0.9597338289022446,
      "num_tokens": 35627797.0,
      "step": 310
    },
    {
      "epoch": 0.07061679355621758,
      "grad_norm": 145.0,
      "learning_rate": 7.026431718061674e-06,
      "loss": 2.81,
      "mean_token_accuracy": 0.95515988022089,
      "num_tokens": 36788468.0,
      "step": 320
    },
    {
      "epoch": 0.07282356835484939,
      "grad_norm": 57.75,
      "learning_rate": 7.246696035242291e-06,
      "loss": 2.8411,
      "mean_token_accuracy": 0.9558294728398323,
      "num_tokens": 37946453.0,
      "step": 330
    },
    {
      "epoch": 0.07503034315348119,
      "grad_norm": 122.0,
      "learning_rate": 7.466960352422908e-06,
      "loss": 2.7231,
      "mean_token_accuracy": 0.9582507729530334,
      "num_tokens": 39116128.0,
      "step": 340
    },
    {
      "epoch": 0.077237117952113,
      "grad_norm": 81.0,
      "learning_rate": 7.687224669603525e-06,
      "loss": 2.4829,
      "mean_token_accuracy": 0.9622300952672959,
      "num_tokens": 40266495.0,
      "step": 350
    },
    {
      "epoch": 0.07944389275074479,
      "grad_norm": 43.75,
      "learning_rate": 7.907488986784141e-06,
      "loss": 2.6574,
      "mean_token_accuracy": 0.9608349114656448,
      "num_tokens": 41431490.0,
      "step": 360
    },
    {
      "epoch": 0.08165066754937658,
      "grad_norm": 145.0,
      "learning_rate": 8.127753303964758e-06,
      "loss": 2.7356,
      "mean_token_accuracy": 0.9601844310760498,
      "num_tokens": 42582300.0,
      "step": 370
    },
    {
      "epoch": 0.08385744234800839,
      "grad_norm": 77.5,
      "learning_rate": 8.348017621145376e-06,
      "loss": 2.7453,
      "mean_token_accuracy": 0.9596911519765854,
      "num_tokens": 43720343.0,
      "step": 380
    },
    {
      "epoch": 0.08606421714664018,
      "grad_norm": 33.0,
      "learning_rate": 8.568281938325993e-06,
      "loss": 2.6165,
      "mean_token_accuracy": 0.9593339174985885,
      "num_tokens": 44876871.0,
      "step": 390
    },
    {
      "epoch": 0.08827099194527198,
      "grad_norm": 48.0,
      "learning_rate": 8.788546255506607e-06,
      "loss": 2.5848,
      "mean_token_accuracy": 0.9619518518447876,
      "num_tokens": 46016653.0,
      "step": 400
    },
    {
      "epoch": 0.09047776674390379,
      "grad_norm": 86.5,
      "learning_rate": 9.008810572687226e-06,
      "loss": 2.5459,
      "mean_token_accuracy": 0.9626090213656425,
      "num_tokens": 47178496.0,
      "step": 410
    },
    {
      "epoch": 0.09268454154253558,
      "grad_norm": 224.0,
      "learning_rate": 9.229074889867842e-06,
      "loss": 2.8052,
      "mean_token_accuracy": 0.9597306191921234,
      "num_tokens": 48336902.0,
      "step": 420
    },
    {
      "epoch": 0.09489131634116739,
      "grad_norm": 104.5,
      "learning_rate": 9.449339207048459e-06,
      "loss": 2.7697,
      "mean_token_accuracy": 0.9567511335015297,
      "num_tokens": 49487989.0,
      "step": 430
    },
    {
      "epoch": 0.09709809113979918,
      "grad_norm": 75.5,
      "learning_rate": 9.669603524229075e-06,
      "loss": 2.5994,
      "mean_token_accuracy": 0.9600183099508286,
      "num_tokens": 50623988.0,
      "step": 440
    },
    {
      "epoch": 0.09930486593843098,
      "grad_norm": 207.0,
      "learning_rate": 9.889867841409693e-06,
      "loss": 2.622,
      "mean_token_accuracy": 0.9624109387397766,
      "num_tokens": 51777451.0,
      "step": 450
    },
    {
      "epoch": 0.10151164073706279,
      "grad_norm": 52.75,
      "learning_rate": 9.99999167904182e-06,
      "loss": 2.7163,
      "mean_token_accuracy": 0.9591552257537842,
      "num_tokens": 52921713.0,
      "step": 460
    },
    {
      "epoch": 0.10371841553569458,
      "grad_norm": 149.0,
      "learning_rate": 9.999925111542544e-06,
      "loss": 2.5862,
      "mean_token_accuracy": 0.9607134088873863,
      "num_tokens": 54064629.0,
      "step": 470
    },
    {
      "epoch": 0.10592519033432639,
      "grad_norm": 48.0,
      "learning_rate": 9.999791977430238e-06,
      "loss": 2.4264,
      "mean_token_accuracy": 0.9649755835533143,
      "num_tokens": 55216744.0,
      "step": 480
    },
    {
      "epoch": 0.10813196513295818,
      "grad_norm": 163.0,
      "learning_rate": 9.999592278477389e-06,
      "loss": 2.4471,
      "mean_token_accuracy": 0.9621559053659439,
      "num_tokens": 56364553.0,
      "step": 490
    },
    {
      "epoch": 0.11033873993158998,
      "grad_norm": 36.0,
      "learning_rate": 9.999326017342688e-06,
      "loss": 2.5235,
      "mean_token_accuracy": 0.9647494912147522,
      "num_tokens": 57506700.0,
      "step": 500
    },
    {
      "epoch": 0.11254551473022179,
      "grad_norm": 78.0,
      "learning_rate": 9.998993197571014e-06,
      "loss": 2.6031,
      "mean_token_accuracy": 0.9625773161649704,
      "num_tokens": 58662842.0,
      "step": 510
    },
    {
      "epoch": 0.11475228952885358,
      "grad_norm": 139.0,
      "learning_rate": 9.99859382359337e-06,
      "loss": 2.5659,
      "mean_token_accuracy": 0.9636919751763344,
      "num_tokens": 59826463.0,
      "step": 520
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 60.5,
      "learning_rate": 9.998127900726825e-06,
      "loss": 2.5474,
      "mean_token_accuracy": 0.9611397817730903,
      "num_tokens": 60977307.0,
      "step": 530
    },
    {
      "epoch": 0.11916583912611718,
      "grad_norm": 51.75,
      "learning_rate": 9.997595435174461e-06,
      "loss": 2.4206,
      "mean_token_accuracy": 0.9649364203214645,
      "num_tokens": 62123313.0,
      "step": 540
    },
    {
      "epoch": 0.12137261392474898,
      "grad_norm": 110.5,
      "learning_rate": 9.996996434025264e-06,
      "loss": 2.3962,
      "mean_token_accuracy": 0.9631159469485283,
      "num_tokens": 63265019.0,
      "step": 550
    },
    {
      "epoch": 0.12357938872338078,
      "grad_norm": 38.75,
      "learning_rate": 9.99633090525405e-06,
      "loss": 2.3899,
      "mean_token_accuracy": 0.9632542356848717,
      "num_tokens": 64417905.0,
      "step": 560
    },
    {
      "epoch": 0.12578616352201258,
      "grad_norm": 165.0,
      "learning_rate": 9.995598857721354e-06,
      "loss": 2.4027,
      "mean_token_accuracy": 0.9661416321992874,
      "num_tokens": 65577278.0,
      "step": 570
    },
    {
      "epoch": 0.12799293832064437,
      "grad_norm": 161.0,
      "learning_rate": 9.994800301173303e-06,
      "loss": 2.4892,
      "mean_token_accuracy": 0.9631411463022232,
      "num_tokens": 66718763.0,
      "step": 580
    },
    {
      "epoch": 0.13019971311927617,
      "grad_norm": 243.0,
      "learning_rate": 9.9939352462415e-06,
      "loss": 2.5168,
      "mean_token_accuracy": 0.9630593791604042,
      "num_tokens": 67866232.0,
      "step": 590
    },
    {
      "epoch": 0.132406487917908,
      "grad_norm": 130.0,
      "learning_rate": 9.99300370444287e-06,
      "loss": 2.6167,
      "mean_token_accuracy": 0.9613253712654114,
      "num_tokens": 69026176.0,
      "step": 600
    },
    {
      "epoch": 0.13461326271653978,
      "grad_norm": 92.0,
      "learning_rate": 9.992005688179518e-06,
      "loss": 2.5877,
      "mean_token_accuracy": 0.9626603454351426,
      "num_tokens": 70179142.0,
      "step": 610
    },
    {
      "epoch": 0.13682003751517158,
      "grad_norm": 30.625,
      "learning_rate": 9.990941210738553e-06,
      "loss": 2.3826,
      "mean_token_accuracy": 0.964666198194027,
      "num_tokens": 71341360.0,
      "step": 620
    },
    {
      "epoch": 0.13902681231380337,
      "grad_norm": 43.75,
      "learning_rate": 9.989810286291923e-06,
      "loss": 2.3826,
      "mean_token_accuracy": 0.9661737963557243,
      "num_tokens": 72481378.0,
      "step": 630
    },
    {
      "epoch": 0.14123358711243517,
      "grad_norm": 182.0,
      "learning_rate": 9.988612929896211e-06,
      "loss": 2.5232,
      "mean_token_accuracy": 0.9642651349306106,
      "num_tokens": 73638168.0,
      "step": 640
    },
    {
      "epoch": 0.143440361911067,
      "grad_norm": 150.0,
      "learning_rate": 9.98734915749245e-06,
      "loss": 2.4876,
      "mean_token_accuracy": 0.9659786492586135,
      "num_tokens": 74770020.0,
      "step": 650
    },
    {
      "epoch": 0.14564713670969878,
      "grad_norm": 236.0,
      "learning_rate": 9.986018985905901e-06,
      "loss": 2.578,
      "mean_token_accuracy": 0.9620574504137039,
      "num_tokens": 75929468.0,
      "step": 660
    },
    {
      "epoch": 0.14785391150833058,
      "grad_norm": 252.0,
      "learning_rate": 9.984622432845835e-06,
      "loss": 2.3973,
      "mean_token_accuracy": 0.9658336862921715,
      "num_tokens": 77084264.0,
      "step": 670
    },
    {
      "epoch": 0.15006068630696237,
      "grad_norm": 123.5,
      "learning_rate": 9.983159516905287e-06,
      "loss": 2.4901,
      "mean_token_accuracy": 0.9643094524741173,
      "num_tokens": 78244631.0,
      "step": 680
    },
    {
      "epoch": 0.15226746110559417,
      "grad_norm": 57.75,
      "learning_rate": 9.981630257560825e-06,
      "loss": 2.4516,
      "mean_token_accuracy": 0.9648495331406594,
      "num_tokens": 79381562.0,
      "step": 690
    },
    {
      "epoch": 0.154474235904226,
      "grad_norm": 77.5,
      "learning_rate": 9.980034675172274e-06,
      "loss": 2.363,
      "mean_token_accuracy": 0.9666372835636139,
      "num_tokens": 80545754.0,
      "step": 700
    },
    {
      "epoch": 0.15668101070285778,
      "grad_norm": 32.75,
      "learning_rate": 9.978372790982457e-06,
      "loss": 2.3985,
      "mean_token_accuracy": 0.9662070587277413,
      "num_tokens": 81697063.0,
      "step": 710
    },
    {
      "epoch": 0.15888778550148958,
      "grad_norm": 30.75,
      "learning_rate": 9.976644627116906e-06,
      "loss": 2.3612,
      "mean_token_accuracy": 0.9662289813160896,
      "num_tokens": 82847344.0,
      "step": 720
    },
    {
      "epoch": 0.16109456030012137,
      "grad_norm": 67.5,
      "learning_rate": 9.97485020658357e-06,
      "loss": 2.4291,
      "mean_token_accuracy": 0.9649844944477082,
      "num_tokens": 83996879.0,
      "step": 730
    },
    {
      "epoch": 0.16330133509875316,
      "grad_norm": 38.25,
      "learning_rate": 9.972989553272501e-06,
      "loss": 2.4389,
      "mean_token_accuracy": 0.9694315686821937,
      "num_tokens": 85160887.0,
      "step": 740
    },
    {
      "epoch": 0.16550810989738496,
      "grad_norm": 31.625,
      "learning_rate": 9.971062691955553e-06,
      "loss": 2.5073,
      "mean_token_accuracy": 0.9623410046100617,
      "num_tokens": 86322015.0,
      "step": 750
    },
    {
      "epoch": 0.16771488469601678,
      "grad_norm": 74.0,
      "learning_rate": 9.969069648286034e-06,
      "loss": 2.2842,
      "mean_token_accuracy": 0.9688465252518654,
      "num_tokens": 87471908.0,
      "step": 760
    },
    {
      "epoch": 0.16992165949464857,
      "grad_norm": 72.0,
      "learning_rate": 9.967010448798376e-06,
      "loss": 2.4433,
      "mean_token_accuracy": 0.9661948889493942,
      "num_tokens": 88631163.0,
      "step": 770
    },
    {
      "epoch": 0.17212843429328037,
      "grad_norm": 158.0,
      "learning_rate": 9.964885120907777e-06,
      "loss": 2.3431,
      "mean_token_accuracy": 0.966378802061081,
      "num_tokens": 89795187.0,
      "step": 780
    },
    {
      "epoch": 0.17433520909191216,
      "grad_norm": 76.0,
      "learning_rate": 9.962693692909834e-06,
      "loss": 2.3471,
      "mean_token_accuracy": 0.9679164841771126,
      "num_tokens": 90935889.0,
      "step": 790
    },
    {
      "epoch": 0.17654198389054396,
      "grad_norm": 30.0,
      "learning_rate": 9.960436193980175e-06,
      "loss": 2.2831,
      "mean_token_accuracy": 0.9679559648036957,
      "num_tokens": 92081345.0,
      "step": 800
    },
    {
      "epoch": 0.17874875868917578,
      "grad_norm": 129.0,
      "learning_rate": 9.958112654174058e-06,
      "loss": 2.4766,
      "mean_token_accuracy": 0.969143983721733,
      "num_tokens": 93198801.0,
      "step": 810
    },
    {
      "epoch": 0.18095553348780757,
      "grad_norm": 49.25,
      "learning_rate": 9.955723104425986e-06,
      "loss": 2.3432,
      "mean_token_accuracy": 0.9679255992174148,
      "num_tokens": 94337864.0,
      "step": 820
    },
    {
      "epoch": 0.18316230828643937,
      "grad_norm": 24.375,
      "learning_rate": 9.953267576549279e-06,
      "loss": 2.3188,
      "mean_token_accuracy": 0.9674392059445381,
      "num_tokens": 95494807.0,
      "step": 830
    },
    {
      "epoch": 0.18536908308507116,
      "grad_norm": 116.5,
      "learning_rate": 9.950746103235663e-06,
      "loss": 2.1544,
      "mean_token_accuracy": 0.9704195275902748,
      "num_tokens": 96645723.0,
      "step": 840
    },
    {
      "epoch": 0.18757585788370296,
      "grad_norm": 27.25,
      "learning_rate": 9.948158718054828e-06,
      "loss": 2.1829,
      "mean_token_accuracy": 0.9701510265469551,
      "num_tokens": 97801303.0,
      "step": 850
    },
    {
      "epoch": 0.18978263268233478,
      "grad_norm": 40.0,
      "learning_rate": 9.945505455453983e-06,
      "loss": 2.3169,
      "mean_token_accuracy": 0.9691279828548431,
      "num_tokens": 98960359.0,
      "step": 860
    },
    {
      "epoch": 0.19198940748096657,
      "grad_norm": 59.5,
      "learning_rate": 9.942786350757398e-06,
      "loss": 2.3463,
      "mean_token_accuracy": 0.9667802095413208,
      "num_tokens": 100121451.0,
      "step": 870
    },
    {
      "epoch": 0.19419618227959837,
      "grad_norm": 171.0,
      "learning_rate": 9.940001440165934e-06,
      "loss": 2.3023,
      "mean_token_accuracy": 0.9676673591136933,
      "num_tokens": 101277861.0,
      "step": 880
    },
    {
      "epoch": 0.19640295707823016,
      "grad_norm": 119.5,
      "learning_rate": 9.93715076075656e-06,
      "loss": 2.2991,
      "mean_token_accuracy": 0.9698756992816925,
      "num_tokens": 102429499.0,
      "step": 890
    },
    {
      "epoch": 0.19860973187686196,
      "grad_norm": 21.0,
      "learning_rate": 9.934234350481856e-06,
      "loss": 2.384,
      "mean_token_accuracy": 0.9653509497642517,
      "num_tokens": 103582879.0,
      "step": 900
    },
    {
      "epoch": 0.20081650667549378,
      "grad_norm": 61.0,
      "learning_rate": 9.931252248169518e-06,
      "loss": 2.3101,
      "mean_token_accuracy": 0.9676768973469734,
      "num_tokens": 104727856.0,
      "step": 910
    },
    {
      "epoch": 0.20302328147412557,
      "grad_norm": 92.5,
      "learning_rate": 9.92820449352183e-06,
      "loss": 2.2126,
      "mean_token_accuracy": 0.9710656419396401,
      "num_tokens": 105867599.0,
      "step": 920
    },
    {
      "epoch": 0.20523005627275737,
      "grad_norm": 122.0,
      "learning_rate": 9.925091127115139e-06,
      "loss": 2.1657,
      "mean_token_accuracy": 0.9696655005216599,
      "num_tokens": 106989144.0,
      "step": 930
    },
    {
      "epoch": 0.20743683107138916,
      "grad_norm": 105.5,
      "learning_rate": 9.921912190399317e-06,
      "loss": 2.2949,
      "mean_token_accuracy": 0.9690271243453026,
      "num_tokens": 108139309.0,
      "step": 940
    },
    {
      "epoch": 0.20964360587002095,
      "grad_norm": 46.75,
      "learning_rate": 9.91866772569721e-06,
      "loss": 2.2424,
      "mean_token_accuracy": 0.9674097329378128,
      "num_tokens": 109299402.0,
      "step": 950
    },
    {
      "epoch": 0.21185038066865278,
      "grad_norm": 59.5,
      "learning_rate": 9.91535777620407e-06,
      "loss": 2.4691,
      "mean_token_accuracy": 0.9633127674460411,
      "num_tokens": 110444962.0,
      "step": 960
    },
    {
      "epoch": 0.21405715546728457,
      "grad_norm": 83.5,
      "learning_rate": 9.91198238598698e-06,
      "loss": 2.0154,
      "mean_token_accuracy": 0.9716773480176926,
      "num_tokens": 111604731.0,
      "step": 970
    },
    {
      "epoch": 0.21626393026591637,
      "grad_norm": 45.5,
      "learning_rate": 9.908541599984276e-06,
      "loss": 2.3079,
      "mean_token_accuracy": 0.9678294911980629,
      "num_tokens": 112761627.0,
      "step": 980
    },
    {
      "epoch": 0.21847070506454816,
      "grad_norm": 38.75,
      "learning_rate": 9.905035464004935e-06,
      "loss": 2.3254,
      "mean_token_accuracy": 0.9679719492793083,
      "num_tokens": 113906065.0,
      "step": 990
    },
    {
      "epoch": 0.22067747986317995,
      "grad_norm": 89.0,
      "learning_rate": 9.901464024727976e-06,
      "loss": 2.1956,
      "mean_token_accuracy": 0.9693391814827919,
      "num_tokens": 115056456.0,
      "step": 1000
    },
    {
      "epoch": 0.22288425466181178,
      "grad_norm": 23.375,
      "learning_rate": 9.897827329701834e-06,
      "loss": 2.2681,
      "mean_token_accuracy": 0.9669407427310943,
      "num_tokens": 116190018.0,
      "step": 1010
    },
    {
      "epoch": 0.22509102946044357,
      "grad_norm": 143.0,
      "learning_rate": 9.89412542734373e-06,
      "loss": 2.5653,
      "mean_token_accuracy": 0.9642222300171852,
      "num_tokens": 117323921.0,
      "step": 1020
    },
    {
      "epoch": 0.22729780425907536,
      "grad_norm": 96.0,
      "learning_rate": 9.890358366939021e-06,
      "loss": 2.4263,
      "mean_token_accuracy": 0.967412082850933,
      "num_tokens": 118476348.0,
      "step": 1030
    },
    {
      "epoch": 0.22950457905770716,
      "grad_norm": 123.0,
      "learning_rate": 9.88652619864055e-06,
      "loss": 2.2097,
      "mean_token_accuracy": 0.9696822956204414,
      "num_tokens": 119618276.0,
      "step": 1040
    },
    {
      "epoch": 0.23171135385633895,
      "grad_norm": 44.5,
      "learning_rate": 9.882628973467972e-06,
      "loss": 2.1779,
      "mean_token_accuracy": 0.9709473550319672,
      "num_tokens": 120776256.0,
      "step": 1050
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 153.0,
      "learning_rate": 9.878666743307083e-06,
      "loss": 2.1194,
      "mean_token_accuracy": 0.9711328238248825,
      "num_tokens": 121927985.0,
      "step": 1060
    },
    {
      "epoch": 0.23612490345360257,
      "grad_norm": 29.875,
      "learning_rate": 9.874639560909118e-06,
      "loss": 2.1436,
      "mean_token_accuracy": 0.9701176419854164,
      "num_tokens": 123080549.0,
      "step": 1070
    },
    {
      "epoch": 0.23833167825223436,
      "grad_norm": 45.5,
      "learning_rate": 9.870547479890062e-06,
      "loss": 2.3837,
      "mean_token_accuracy": 0.9671823754906654,
      "num_tokens": 124230629.0,
      "step": 1080
    },
    {
      "epoch": 0.24053845305086616,
      "grad_norm": 27.625,
      "learning_rate": 9.866390554729923e-06,
      "loss": 2.2526,
      "mean_token_accuracy": 0.9689402997493743,
      "num_tokens": 125390142.0,
      "step": 1090
    },
    {
      "epoch": 0.24274522784949795,
      "grad_norm": 75.5,
      "learning_rate": 9.862168840772018e-06,
      "loss": 2.2863,
      "mean_token_accuracy": 0.966990028321743,
      "num_tokens": 126545718.0,
      "step": 1100
    },
    {
      "epoch": 0.24495200264812975,
      "grad_norm": 89.0,
      "learning_rate": 9.857882394222225e-06,
      "loss": 2.2897,
      "mean_token_accuracy": 0.9691839888691902,
      "num_tokens": 127692122.0,
      "step": 1110
    },
    {
      "epoch": 0.24715877744676157,
      "grad_norm": 107.0,
      "learning_rate": 9.853531272148248e-06,
      "loss": 2.281,
      "mean_token_accuracy": 0.9689178034663201,
      "num_tokens": 128840384.0,
      "step": 1120
    },
    {
      "epoch": 0.24936555224539336,
      "grad_norm": 34.75,
      "learning_rate": 9.849115532478848e-06,
      "loss": 2.3402,
      "mean_token_accuracy": 0.9678497686982155,
      "num_tokens": 130004140.0,
      "step": 1130
    },
    {
      "epoch": 0.25157232704402516,
      "grad_norm": 157.0,
      "learning_rate": 9.844635234003067e-06,
      "loss": 2.2553,
      "mean_token_accuracy": 0.967849250137806,
      "num_tokens": 131152238.0,
      "step": 1140
    },
    {
      "epoch": 0.25377910184265695,
      "grad_norm": 99.0,
      "learning_rate": 9.840090436369458e-06,
      "loss": 2.197,
      "mean_token_accuracy": 0.9709160849452019,
      "num_tokens": 132305852.0,
      "step": 1150
    },
    {
      "epoch": 0.25598587664128875,
      "grad_norm": 47.75,
      "learning_rate": 9.83548120008529e-06,
      "loss": 2.1032,
      "mean_token_accuracy": 0.971674793958664,
      "num_tokens": 133459118.0,
      "step": 1160
    },
    {
      "epoch": 0.25819265143992054,
      "grad_norm": 76.5,
      "learning_rate": 9.830807586515726e-06,
      "loss": 2.2028,
      "mean_token_accuracy": 0.970433022081852,
      "num_tokens": 134619378.0,
      "step": 1170
    },
    {
      "epoch": 0.26039942623855233,
      "grad_norm": 68.0,
      "learning_rate": 9.826069657883027e-06,
      "loss": 2.0633,
      "mean_token_accuracy": 0.970002381503582,
      "num_tokens": 135763226.0,
      "step": 1180
    },
    {
      "epoch": 0.26260620103718413,
      "grad_norm": 71.0,
      "learning_rate": 9.821267477265705e-06,
      "loss": 2.2051,
      "mean_token_accuracy": 0.9684244245290756,
      "num_tokens": 136919931.0,
      "step": 1190
    },
    {
      "epoch": 0.264812975835816,
      "grad_norm": 119.5,
      "learning_rate": 9.816401108597704e-06,
      "loss": 2.365,
      "mean_token_accuracy": 0.9691165268421174,
      "num_tokens": 138081130.0,
      "step": 1200
    },
    {
      "epoch": 0.2670197506344478,
      "grad_norm": 72.0,
      "learning_rate": 9.811470616667525e-06,
      "loss": 2.3234,
      "mean_token_accuracy": 0.9671659022569656,
      "num_tokens": 139233437.0,
      "step": 1210
    },
    {
      "epoch": 0.26922652543307957,
      "grad_norm": 125.5,
      "learning_rate": 9.806476067117384e-06,
      "loss": 2.1961,
      "mean_token_accuracy": 0.9710487142205239,
      "num_tokens": 140393910.0,
      "step": 1220
    },
    {
      "epoch": 0.27143330023171136,
      "grad_norm": 38.75,
      "learning_rate": 9.801417526442326e-06,
      "loss": 2.1493,
      "mean_token_accuracy": 0.9713955089449883,
      "num_tokens": 141547585.0,
      "step": 1230
    },
    {
      "epoch": 0.27364007503034316,
      "grad_norm": 26.25,
      "learning_rate": 9.79629506198934e-06,
      "loss": 2.3303,
      "mean_token_accuracy": 0.9683890417218208,
      "num_tokens": 142702841.0,
      "step": 1240
    },
    {
      "epoch": 0.27584684982897495,
      "grad_norm": 258.0,
      "learning_rate": 9.791108741956476e-06,
      "loss": 2.3053,
      "mean_token_accuracy": 0.9674874410033226,
      "num_tokens": 143841896.0,
      "step": 1250
    },
    {
      "epoch": 0.27805362462760674,
      "grad_norm": 95.5,
      "learning_rate": 9.785858635391913e-06,
      "loss": 2.3401,
      "mean_token_accuracy": 0.9667026609182358,
      "num_tokens": 144997681.0,
      "step": 1260
    },
    {
      "epoch": 0.28026039942623854,
      "grad_norm": 84.0,
      "learning_rate": 9.780544812193065e-06,
      "loss": 2.1887,
      "mean_token_accuracy": 0.969435966014862,
      "num_tokens": 146170846.0,
      "step": 1270
    },
    {
      "epoch": 0.28246717422487033,
      "grad_norm": 38.0,
      "learning_rate": 9.77516734310563e-06,
      "loss": 2.2981,
      "mean_token_accuracy": 0.9670219540596008,
      "num_tokens": 147320400.0,
      "step": 1280
    },
    {
      "epoch": 0.2846739490235021,
      "grad_norm": 62.5,
      "learning_rate": 9.769726299722668e-06,
      "loss": 2.2532,
      "mean_token_accuracy": 0.9693857550621032,
      "num_tokens": 148501676.0,
      "step": 1290
    },
    {
      "epoch": 0.286880723822134,
      "grad_norm": 83.5,
      "learning_rate": 9.764221754483623e-06,
      "loss": 2.1214,
      "mean_token_accuracy": 0.9703208059072495,
      "num_tokens": 149671956.0,
      "step": 1300
    },
    {
      "epoch": 0.28908749862076577,
      "grad_norm": 118.5,
      "learning_rate": 9.758653780673381e-06,
      "loss": 2.1819,
      "mean_token_accuracy": 0.9711541831493378,
      "num_tokens": 150840868.0,
      "step": 1310
    },
    {
      "epoch": 0.29129427341939756,
      "grad_norm": 41.75,
      "learning_rate": 9.753022452421286e-06,
      "loss": 2.1127,
      "mean_token_accuracy": 0.9700117990374565,
      "num_tokens": 152005694.0,
      "step": 1320
    },
    {
      "epoch": 0.29350104821802936,
      "grad_norm": 72.5,
      "learning_rate": 9.747327844700147e-06,
      "loss": 2.114,
      "mean_token_accuracy": 0.97126604616642,
      "num_tokens": 153165015.0,
      "step": 1330
    },
    {
      "epoch": 0.29570782301666115,
      "grad_norm": 54.25,
      "learning_rate": 9.741570033325254e-06,
      "loss": 2.2238,
      "mean_token_accuracy": 0.9685475513339042,
      "num_tokens": 154325061.0,
      "step": 1340
    },
    {
      "epoch": 0.29791459781529295,
      "grad_norm": 69.0,
      "learning_rate": 9.73574909495335e-06,
      "loss": 2.1849,
      "mean_token_accuracy": 0.9697886392474174,
      "num_tokens": 155475633.0,
      "step": 1350
    },
    {
      "epoch": 0.30012137261392474,
      "grad_norm": 66.0,
      "learning_rate": 9.729865107081631e-06,
      "loss": 2.2535,
      "mean_token_accuracy": 0.9687330171465873,
      "num_tokens": 156642023.0,
      "step": 1360
    },
    {
      "epoch": 0.30232814741255654,
      "grad_norm": 32.25,
      "learning_rate": 9.723918148046696e-06,
      "loss": 2.1744,
      "mean_token_accuracy": 0.9708977922797203,
      "num_tokens": 157792568.0,
      "step": 1370
    },
    {
      "epoch": 0.30453492221118833,
      "grad_norm": 144.0,
      "learning_rate": 9.717908297023517e-06,
      "loss": 2.2753,
      "mean_token_accuracy": 0.9678496509790421,
      "num_tokens": 158954415.0,
      "step": 1380
    },
    {
      "epoch": 0.3067416970098201,
      "grad_norm": 29.25,
      "learning_rate": 9.711835634024378e-06,
      "loss": 2.0073,
      "mean_token_accuracy": 0.9731367215514183,
      "num_tokens": 160126079.0,
      "step": 1390
    },
    {
      "epoch": 0.308948471808452,
      "grad_norm": 29.75,
      "learning_rate": 9.705700239897809e-06,
      "loss": 2.111,
      "mean_token_accuracy": 0.9711406901478767,
      "num_tokens": 161277711.0,
      "step": 1400
    },
    {
      "epoch": 0.31115524660708377,
      "grad_norm": 95.5,
      "learning_rate": 9.699502196327515e-06,
      "loss": 2.2518,
      "mean_token_accuracy": 0.9678638756275177,
      "num_tokens": 162424273.0,
      "step": 1410
    },
    {
      "epoch": 0.31336202140571556,
      "grad_norm": 24.125,
      "learning_rate": 9.69324158583129e-06,
      "loss": 2.1652,
      "mean_token_accuracy": 0.9701473265886307,
      "num_tokens": 163563475.0,
      "step": 1420
    },
    {
      "epoch": 0.31556879620434736,
      "grad_norm": 29.5,
      "learning_rate": 9.686918491759904e-06,
      "loss": 2.2377,
      "mean_token_accuracy": 0.9685998216271401,
      "num_tokens": 164730359.0,
      "step": 1430
    },
    {
      "epoch": 0.31777557100297915,
      "grad_norm": 109.5,
      "learning_rate": 9.68053299829601e-06,
      "loss": 2.0887,
      "mean_token_accuracy": 0.9721654012799263,
      "num_tokens": 165904507.0,
      "step": 1440
    },
    {
      "epoch": 0.31998234580161095,
      "grad_norm": 216.0,
      "learning_rate": 9.67408519045302e-06,
      "loss": 2.1169,
      "mean_token_accuracy": 0.9720436498522759,
      "num_tokens": 167026147.0,
      "step": 1450
    },
    {
      "epoch": 0.32218912060024274,
      "grad_norm": 36.0,
      "learning_rate": 9.667575154073962e-06,
      "loss": 2.1948,
      "mean_token_accuracy": 0.9705771505832672,
      "num_tokens": 168181751.0,
      "step": 1460
    },
    {
      "epoch": 0.32439589539887453,
      "grad_norm": 47.0,
      "learning_rate": 9.66100297583035e-06,
      "loss": 2.0542,
      "mean_token_accuracy": 0.9731536105275154,
      "num_tokens": 169314982.0,
      "step": 1470
    },
    {
      "epoch": 0.32660267019750633,
      "grad_norm": 55.0,
      "learning_rate": 9.654368743221022e-06,
      "loss": 2.0857,
      "mean_token_accuracy": 0.9731913954019547,
      "num_tokens": 170457387.0,
      "step": 1480
    },
    {
      "epoch": 0.3288094449961381,
      "grad_norm": 157.0,
      "learning_rate": 9.647672544570981e-06,
      "loss": 2.1983,
      "mean_token_accuracy": 0.9701191037893295,
      "num_tokens": 171609482.0,
      "step": 1490
    },
    {
      "epoch": 0.3310162197947699,
      "grad_norm": 24.5,
      "learning_rate": 9.640914469030216e-06,
      "loss": 2.0796,
      "mean_token_accuracy": 0.9706763669848442,
      "num_tokens": 172763347.0,
      "step": 1500
    },
    {
      "epoch": 0.33322299459340177,
      "grad_norm": 39.75,
      "learning_rate": 9.634094606572515e-06,
      "loss": 2.1677,
      "mean_token_accuracy": 0.969539861381054,
      "num_tokens": 173906059.0,
      "step": 1510
    },
    {
      "epoch": 0.33542976939203356,
      "grad_norm": 46.75,
      "learning_rate": 9.627213047994265e-06,
      "loss": 2.1746,
      "mean_token_accuracy": 0.9711151927709579,
      "num_tokens": 175056528.0,
      "step": 1520
    },
    {
      "epoch": 0.33763654419066536,
      "grad_norm": 61.25,
      "learning_rate": 9.620269884913247e-06,
      "loss": 2.0543,
      "mean_token_accuracy": 0.9714053064584732,
      "num_tokens": 176216895.0,
      "step": 1530
    },
    {
      "epoch": 0.33984331898929715,
      "grad_norm": 66.5,
      "learning_rate": 9.613265209767417e-06,
      "loss": 2.1693,
      "mean_token_accuracy": 0.9703157678246498,
      "num_tokens": 177382380.0,
      "step": 1540
    },
    {
      "epoch": 0.34205009378792894,
      "grad_norm": 22.75,
      "learning_rate": 9.606199115813672e-06,
      "loss": 2.0737,
      "mean_token_accuracy": 0.9730365663766861,
      "num_tokens": 178541743.0,
      "step": 1550
    },
    {
      "epoch": 0.34425686858656074,
      "grad_norm": 64.5,
      "learning_rate": 9.599071697126608e-06,
      "loss": 2.0885,
      "mean_token_accuracy": 0.9699146062135696,
      "num_tokens": 179690359.0,
      "step": 1560
    },
    {
      "epoch": 0.34646364338519253,
      "grad_norm": 33.0,
      "learning_rate": 9.591883048597273e-06,
      "loss": 2.2101,
      "mean_token_accuracy": 0.9682660028338432,
      "num_tokens": 180857119.0,
      "step": 1570
    },
    {
      "epoch": 0.3486704181838243,
      "grad_norm": 34.0,
      "learning_rate": 9.584633265931894e-06,
      "loss": 2.1311,
      "mean_token_accuracy": 0.9704745352268219,
      "num_tokens": 181999028.0,
      "step": 1580
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 76.5,
      "learning_rate": 9.577322445650616e-06,
      "loss": 2.184,
      "mean_token_accuracy": 0.970292191207409,
      "num_tokens": 183161337.0,
      "step": 1590
    },
    {
      "epoch": 0.3530839677810879,
      "grad_norm": 108.5,
      "learning_rate": 9.569950685086202e-06,
      "loss": 2.1499,
      "mean_token_accuracy": 0.9709933206439019,
      "num_tokens": 184314452.0,
      "step": 1600
    },
    {
      "epoch": 0.35529074257971976,
      "grad_norm": 33.5,
      "learning_rate": 9.562518082382751e-06,
      "loss": 2.2254,
      "mean_token_accuracy": 0.9703863441944123,
      "num_tokens": 185464891.0,
      "step": 1610
    },
    {
      "epoch": 0.35749751737835156,
      "grad_norm": 63.75,
      "learning_rate": 9.555024736494382e-06,
      "loss": 2.067,
      "mean_token_accuracy": 0.9729075342416763,
      "num_tokens": 186629335.0,
      "step": 1620
    },
    {
      "epoch": 0.35970429217698335,
      "grad_norm": 25.375,
      "learning_rate": 9.54747074718392e-06,
      "loss": 2.3358,
      "mean_token_accuracy": 0.9676305696368217,
      "num_tokens": 187792990.0,
      "step": 1630
    },
    {
      "epoch": 0.36191106697561515,
      "grad_norm": 29.375,
      "learning_rate": 9.539856215021568e-06,
      "loss": 2.0665,
      "mean_token_accuracy": 0.971627376973629,
      "num_tokens": 188941540.0,
      "step": 1640
    },
    {
      "epoch": 0.36411784177424694,
      "grad_norm": 27.625,
      "learning_rate": 9.53218124138357e-06,
      "loss": 2.1855,
      "mean_token_accuracy": 0.9698077514767647,
      "num_tokens": 190109790.0,
      "step": 1650
    },
    {
      "epoch": 0.36632461657287874,
      "grad_norm": 59.75,
      "learning_rate": 9.524445928450851e-06,
      "loss": 2.102,
      "mean_token_accuracy": 0.970787413418293,
      "num_tokens": 191253589.0,
      "step": 1660
    },
    {
      "epoch": 0.36853139137151053,
      "grad_norm": 56.5,
      "learning_rate": 9.516650379207677e-06,
      "loss": 2.1411,
      "mean_token_accuracy": 0.9712065562605858,
      "num_tokens": 192386106.0,
      "step": 1670
    },
    {
      "epoch": 0.3707381661701423,
      "grad_norm": 61.25,
      "learning_rate": 9.508794697440257e-06,
      "loss": 2.2386,
      "mean_token_accuracy": 0.969098924100399,
      "num_tokens": 193527888.0,
      "step": 1680
    },
    {
      "epoch": 0.3729449409687741,
      "grad_norm": 37.0,
      "learning_rate": 9.50087898773539e-06,
      "loss": 2.0885,
      "mean_token_accuracy": 0.9735700085759162,
      "num_tokens": 194689653.0,
      "step": 1690
    },
    {
      "epoch": 0.3751517157674059,
      "grad_norm": 38.0,
      "learning_rate": 9.492903355479047e-06,
      "loss": 2.153,
      "mean_token_accuracy": 0.9712708279490471,
      "num_tokens": 195835780.0,
      "step": 1700
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 92.0,
      "learning_rate": 9.484867906854986e-06,
      "loss": 2.1319,
      "mean_token_accuracy": 0.9716506630182267,
      "num_tokens": 196990250.0,
      "step": 1710
    },
    {
      "epoch": 0.37956526536466956,
      "grad_norm": 27.375,
      "learning_rate": 9.476772748843327e-06,
      "loss": 2.0856,
      "mean_token_accuracy": 0.970129369199276,
      "num_tokens": 198128385.0,
      "step": 1720
    },
    {
      "epoch": 0.38177204016330135,
      "grad_norm": 20.125,
      "learning_rate": 9.468617989219136e-06,
      "loss": 2.0881,
      "mean_token_accuracy": 0.9723739951848984,
      "num_tokens": 199270632.0,
      "step": 1730
    },
    {
      "epoch": 0.38397881496193315,
      "grad_norm": 72.5,
      "learning_rate": 9.460403736550982e-06,
      "loss": 1.9717,
      "mean_token_accuracy": 0.9740812569856644,
      "num_tokens": 200408726.0,
      "step": 1740
    },
    {
      "epoch": 0.38618558976056494,
      "grad_norm": 87.5,
      "learning_rate": 9.452130100199504e-06,
      "loss": 2.0455,
      "mean_token_accuracy": 0.9730176210403443,
      "num_tokens": 201567549.0,
      "step": 1750
    },
    {
      "epoch": 0.38839236455919673,
      "grad_norm": 56.75,
      "learning_rate": 9.443797190315938e-06,
      "loss": 2.0779,
      "mean_token_accuracy": 0.972916804254055,
      "num_tokens": 202715410.0,
      "step": 1760
    },
    {
      "epoch": 0.39059913935782853,
      "grad_norm": 55.0,
      "learning_rate": 9.435405117840662e-06,
      "loss": 2.178,
      "mean_token_accuracy": 0.9705028817057609,
      "num_tokens": 203862993.0,
      "step": 1770
    },
    {
      "epoch": 0.3928059141564603,
      "grad_norm": 54.5,
      "learning_rate": 9.42695399450172e-06,
      "loss": 2.0909,
      "mean_token_accuracy": 0.9725421443581581,
      "num_tokens": 205027241.0,
      "step": 1780
    },
    {
      "epoch": 0.3950126889550921,
      "grad_norm": 81.0,
      "learning_rate": 9.418443932813328e-06,
      "loss": 2.0431,
      "mean_token_accuracy": 0.9726249724626541,
      "num_tokens": 206188556.0,
      "step": 1790
    },
    {
      "epoch": 0.3972194637537239,
      "grad_norm": 52.25,
      "learning_rate": 9.409875046074379e-06,
      "loss": 2.1315,
      "mean_token_accuracy": 0.9729362472891807,
      "num_tokens": 207335561.0,
      "step": 1800
    },
    {
      "epoch": 0.3994262385523557,
      "grad_norm": 27.0,
      "learning_rate": 9.401247448366937e-06,
      "loss": 2.1977,
      "mean_token_accuracy": 0.9716293200850487,
      "num_tokens": 208475730.0,
      "step": 1810
    },
    {
      "epoch": 0.40163301335098756,
      "grad_norm": 24.375,
      "learning_rate": 9.392561254554712e-06,
      "loss": 2.1676,
      "mean_token_accuracy": 0.9698420405387879,
      "num_tokens": 209615143.0,
      "step": 1820
    },
    {
      "epoch": 0.40383978814961935,
      "grad_norm": 24.75,
      "learning_rate": 9.383816580281539e-06,
      "loss": 2.1174,
      "mean_token_accuracy": 0.971437631547451,
      "num_tokens": 210770105.0,
      "step": 1830
    },
    {
      "epoch": 0.40604656294825114,
      "grad_norm": 27.25,
      "learning_rate": 9.375013541969828e-06,
      "loss": 2.1794,
      "mean_token_accuracy": 0.9704042539000511,
      "num_tokens": 211919518.0,
      "step": 1840
    },
    {
      "epoch": 0.40825333774688294,
      "grad_norm": 42.0,
      "learning_rate": 9.366152256819025e-06,
      "loss": 2.3106,
      "mean_token_accuracy": 0.9682258501648903,
      "num_tokens": 213075286.0,
      "step": 1850
    },
    {
      "epoch": 0.41046011254551473,
      "grad_norm": 20.125,
      "learning_rate": 9.357232842804045e-06,
      "loss": 2.1905,
      "mean_token_accuracy": 0.9694794774055481,
      "num_tokens": 214241852.0,
      "step": 1860
    },
    {
      "epoch": 0.4126668873441465,
      "grad_norm": 243.0,
      "learning_rate": 9.348255418673702e-06,
      "loss": 2.1436,
      "mean_token_accuracy": 0.9688614338636399,
      "num_tokens": 215391101.0,
      "step": 1870
    },
    {
      "epoch": 0.4148736621427783,
      "grad_norm": 124.0,
      "learning_rate": 9.339220103949132e-06,
      "loss": 2.2558,
      "mean_token_accuracy": 0.9694797903299331,
      "num_tokens": 216535129.0,
      "step": 1880
    },
    {
      "epoch": 0.4170804369414101,
      "grad_norm": 216.0,
      "learning_rate": 9.330127018922195e-06,
      "loss": 2.0114,
      "mean_token_accuracy": 0.9715709060430526,
      "num_tokens": 217681553.0,
      "step": 1890
    },
    {
      "epoch": 0.4192872117400419,
      "grad_norm": 25.625,
      "learning_rate": 9.320976284653877e-06,
      "loss": 2.0081,
      "mean_token_accuracy": 0.9716795921325684,
      "num_tokens": 218833071.0,
      "step": 1900
    },
    {
      "epoch": 0.4214939865386737,
      "grad_norm": 59.0,
      "learning_rate": 9.311768022972682e-06,
      "loss": 2.0676,
      "mean_token_accuracy": 0.9731668412685395,
      "num_tokens": 219974902.0,
      "step": 1910
    },
    {
      "epoch": 0.42370076133730555,
      "grad_norm": 60.25,
      "learning_rate": 9.302502356473006e-06,
      "loss": 2.3082,
      "mean_token_accuracy": 0.9672795906662941,
      "num_tokens": 221140517.0,
      "step": 1920
    },
    {
      "epoch": 0.42590753613593735,
      "grad_norm": 102.5,
      "learning_rate": 9.293179408513501e-06,
      "loss": 2.1827,
      "mean_token_accuracy": 0.9701474636793137,
      "num_tokens": 222292419.0,
      "step": 1930
    },
    {
      "epoch": 0.42811431093456914,
      "grad_norm": 38.25,
      "learning_rate": 9.283799303215442e-06,
      "loss": 2.0868,
      "mean_token_accuracy": 0.9712743669748306,
      "num_tokens": 223458807.0,
      "step": 1940
    },
    {
      "epoch": 0.43032108573320094,
      "grad_norm": 48.75,
      "learning_rate": 9.274362165461064e-06,
      "loss": 2.2142,
      "mean_token_accuracy": 0.9703401148319244,
      "num_tokens": 224604886.0,
      "step": 1950
    },
    {
      "epoch": 0.43252786053183273,
      "grad_norm": 83.0,
      "learning_rate": 9.264868120891913e-06,
      "loss": 2.27,
      "mean_token_accuracy": 0.9706149518489837,
      "num_tokens": 225746965.0,
      "step": 1960
    },
    {
      "epoch": 0.4347346353304645,
      "grad_norm": 31.875,
      "learning_rate": 9.255317295907158e-06,
      "loss": 2.0635,
      "mean_token_accuracy": 0.9707323223352432,
      "num_tokens": 226913200.0,
      "step": 1970
    },
    {
      "epoch": 0.4369414101290963,
      "grad_norm": 116.0,
      "learning_rate": 9.245709817661917e-06,
      "loss": 1.918,
      "mean_token_accuracy": 0.9732501715421676,
      "num_tokens": 228066878.0,
      "step": 1980
    },
    {
      "epoch": 0.4391481849277281,
      "grad_norm": 36.0,
      "learning_rate": 9.236045814065563e-06,
      "loss": 2.001,
      "mean_token_accuracy": 0.9713159516453743,
      "num_tokens": 229211377.0,
      "step": 1990
    },
    {
      "epoch": 0.4413549597263599,
      "grad_norm": 27.125,
      "learning_rate": 9.226325413780021e-06,
      "loss": 2.0794,
      "mean_token_accuracy": 0.9728094398975372,
      "num_tokens": 230347755.0,
      "step": 2000
    },
    {
      "epoch": 0.4435617345249917,
      "grad_norm": 176.0,
      "learning_rate": 9.216548746218056e-06,
      "loss": 2.0674,
      "mean_token_accuracy": 0.973133884370327,
      "num_tokens": 231508177.0,
      "step": 2010
    },
    {
      "epoch": 0.44576850932362355,
      "grad_norm": 61.75,
      "learning_rate": 9.206715941541547e-06,
      "loss": 2.101,
      "mean_token_accuracy": 0.9707593634724617,
      "num_tokens": 232654808.0,
      "step": 2020
    },
    {
      "epoch": 0.44797528412225535,
      "grad_norm": 22.5,
      "learning_rate": 9.196827130659752e-06,
      "loss": 2.1187,
      "mean_token_accuracy": 0.9709117472171783,
      "num_tokens": 233821713.0,
      "step": 2030
    },
    {
      "epoch": 0.45018205892088714,
      "grad_norm": 58.75,
      "learning_rate": 9.186882445227572e-06,
      "loss": 1.9673,
      "mean_token_accuracy": 0.9743405088782311,
      "num_tokens": 234969349.0,
      "step": 2040
    },
    {
      "epoch": 0.45238883371951893,
      "grad_norm": 22.125,
      "learning_rate": 9.1768820176438e-06,
      "loss": 2.1645,
      "mean_token_accuracy": 0.9699445232748986,
      "num_tokens": 236103583.0,
      "step": 2050
    },
    {
      "epoch": 0.45459560851815073,
      "grad_norm": 30.875,
      "learning_rate": 9.166825981049345e-06,
      "loss": 2.0988,
      "mean_token_accuracy": 0.9720931738615036,
      "num_tokens": 237255347.0,
      "step": 2060
    },
    {
      "epoch": 0.4568023833167825,
      "grad_norm": 30.75,
      "learning_rate": 9.156714469325474e-06,
      "loss": 2.1111,
      "mean_token_accuracy": 0.9703241810202599,
      "num_tokens": 238408728.0,
      "step": 2070
    },
    {
      "epoch": 0.4590091581154143,
      "grad_norm": 25.5,
      "learning_rate": 9.14654761709202e-06,
      "loss": 2.0255,
      "mean_token_accuracy": 0.9722090765833855,
      "num_tokens": 239565441.0,
      "step": 2080
    },
    {
      "epoch": 0.4612159329140461,
      "grad_norm": 40.5,
      "learning_rate": 9.136325559705593e-06,
      "loss": 1.8571,
      "mean_token_accuracy": 0.9744715243577957,
      "num_tokens": 240728868.0,
      "step": 2090
    },
    {
      "epoch": 0.4634227077126779,
      "grad_norm": 51.0,
      "learning_rate": 9.12604843325778e-06,
      "loss": 2.0201,
      "mean_token_accuracy": 0.972464868426323,
      "num_tokens": 241881571.0,
      "step": 2100
    },
    {
      "epoch": 0.4656294825113097,
      "grad_norm": 22.125,
      "learning_rate": 9.11571637457333e-06,
      "loss": 2.3128,
      "mean_token_accuracy": 0.9690719544887543,
      "num_tokens": 243031077.0,
      "step": 2110
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 71.0,
      "learning_rate": 9.105329521208334e-06,
      "loss": 2.1229,
      "mean_token_accuracy": 0.9686960980296135,
      "num_tokens": 244183526.0,
      "step": 2120
    },
    {
      "epoch": 0.47004303210857334,
      "grad_norm": 151.0,
      "learning_rate": 9.094888011448391e-06,
      "loss": 2.2859,
      "mean_token_accuracy": 0.9715722545981407,
      "num_tokens": 245350675.0,
      "step": 2130
    },
    {
      "epoch": 0.47224980690720514,
      "grad_norm": 119.0,
      "learning_rate": 9.084391984306775e-06,
      "loss": 2.185,
      "mean_token_accuracy": 0.9692530021071434,
      "num_tokens": 246493568.0,
      "step": 2140
    },
    {
      "epoch": 0.47445658170583693,
      "grad_norm": 36.0,
      "learning_rate": 9.073841579522571e-06,
      "loss": 2.1148,
      "mean_token_accuracy": 0.9710839301347732,
      "num_tokens": 247628717.0,
      "step": 2150
    },
    {
      "epoch": 0.4766633565044687,
      "grad_norm": 42.5,
      "learning_rate": 9.063236937558826e-06,
      "loss": 1.962,
      "mean_token_accuracy": 0.9727049171924591,
      "num_tokens": 248778776.0,
      "step": 2160
    },
    {
      "epoch": 0.4788701313031005,
      "grad_norm": 30.25,
      "learning_rate": 9.052578199600675e-06,
      "loss": 2.1658,
      "mean_token_accuracy": 0.9716869577765465,
      "num_tokens": 249941829.0,
      "step": 2170
    },
    {
      "epoch": 0.4810769061017323,
      "grad_norm": 76.0,
      "learning_rate": 9.041865507553458e-06,
      "loss": 2.118,
      "mean_token_accuracy": 0.9711204588413238,
      "num_tokens": 251093922.0,
      "step": 2180
    },
    {
      "epoch": 0.4832836809003641,
      "grad_norm": 20.125,
      "learning_rate": 9.031099004040841e-06,
      "loss": 1.9971,
      "mean_token_accuracy": 0.9736538395285607,
      "num_tokens": 252241596.0,
      "step": 2190
    },
    {
      "epoch": 0.4854904556989959,
      "grad_norm": 138.0,
      "learning_rate": 9.020278832402902e-06,
      "loss": 2.2066,
      "mean_token_accuracy": 0.970404052734375,
      "num_tokens": 253396378.0,
      "step": 2200
    },
    {
      "epoch": 0.4876972304976277,
      "grad_norm": 44.5,
      "learning_rate": 9.009405136694234e-06,
      "loss": 2.0519,
      "mean_token_accuracy": 0.9723546028137207,
      "num_tokens": 254545434.0,
      "step": 2210
    },
    {
      "epoch": 0.4899040052962595,
      "grad_norm": 30.375,
      "learning_rate": 8.998478061682025e-06,
      "loss": 2.0713,
      "mean_token_accuracy": 0.972881618142128,
      "num_tokens": 255698903.0,
      "step": 2220
    },
    {
      "epoch": 0.49211078009489134,
      "grad_norm": 73.5,
      "learning_rate": 8.987497752844132e-06,
      "loss": 2.0487,
      "mean_token_accuracy": 0.9707805126905441,
      "num_tokens": 256850100.0,
      "step": 2230
    },
    {
      "epoch": 0.49431755489352314,
      "grad_norm": 26.25,
      "learning_rate": 8.976464356367133e-06,
      "loss": 2.1556,
      "mean_token_accuracy": 0.9715987130999565,
      "num_tokens": 257993006.0,
      "step": 2240
    },
    {
      "epoch": 0.49652432969215493,
      "grad_norm": 31.875,
      "learning_rate": 8.965378019144397e-06,
      "loss": 2.1089,
      "mean_token_accuracy": 0.9720606461167336,
      "num_tokens": 259142750.0,
      "step": 2250
    },
    {
      "epoch": 0.4987311044907867,
      "grad_norm": 102.0,
      "learning_rate": 8.95423888877412e-06,
      "loss": 2.1274,
      "mean_token_accuracy": 0.9720024764537811,
      "num_tokens": 260305182.0,
      "step": 2260
    },
    {
      "epoch": 0.5009378792894185,
      "grad_norm": 89.0,
      "learning_rate": 8.943047113557358e-06,
      "loss": 2.1421,
      "mean_token_accuracy": 0.9720711082220077,
      "num_tokens": 261465998.0,
      "step": 2270
    },
    {
      "epoch": 0.5031446540880503,
      "grad_norm": 34.0,
      "learning_rate": 8.931802842496056e-06,
      "loss": 2.0891,
      "mean_token_accuracy": 0.9718857824802398,
      "num_tokens": 262623192.0,
      "step": 2280
    },
    {
      "epoch": 0.5053514288866822,
      "grad_norm": 116.5,
      "learning_rate": 8.920506225291067e-06,
      "loss": 2.0626,
      "mean_token_accuracy": 0.9728048339486122,
      "num_tokens": 263782530.0,
      "step": 2290
    },
    {
      "epoch": 0.5075582036853139,
      "grad_norm": 25.5,
      "learning_rate": 8.90915741234015e-06,
      "loss": 2.1828,
      "mean_token_accuracy": 0.9730463489890099,
      "num_tokens": 264917551.0,
      "step": 2300
    },
    {
      "epoch": 0.5097649784839458,
      "grad_norm": 21.625,
      "learning_rate": 8.897756554735976e-06,
      "loss": 2.1544,
      "mean_token_accuracy": 0.9686936601996422,
      "num_tokens": 266067631.0,
      "step": 2310
    },
    {
      "epoch": 0.5119717532825775,
      "grad_norm": 61.5,
      "learning_rate": 8.886303804264117e-06,
      "loss": 2.0282,
      "mean_token_accuracy": 0.9724765509366989,
      "num_tokens": 267224470.0,
      "step": 2320
    },
    {
      "epoch": 0.5141785280812093,
      "grad_norm": 99.5,
      "learning_rate": 8.874799313401014e-06,
      "loss": 1.9519,
      "mean_token_accuracy": 0.9725208267569542,
      "num_tokens": 268368173.0,
      "step": 2330
    },
    {
      "epoch": 0.5163853028798411,
      "grad_norm": 17.25,
      "learning_rate": 8.863243235311964e-06,
      "loss": 2.0613,
      "mean_token_accuracy": 0.9732014089822769,
      "num_tokens": 269521032.0,
      "step": 2340
    },
    {
      "epoch": 0.5185920776784729,
      "grad_norm": 44.25,
      "learning_rate": 8.851635723849062e-06,
      "loss": 2.0514,
      "mean_token_accuracy": 0.9732575222849846,
      "num_tokens": 270660849.0,
      "step": 2350
    },
    {
      "epoch": 0.5207988524771047,
      "grad_norm": 36.0,
      "learning_rate": 8.839976933549173e-06,
      "loss": 2.1547,
      "mean_token_accuracy": 0.9706027075648308,
      "num_tokens": 271814303.0,
      "step": 2360
    },
    {
      "epoch": 0.5230056272757365,
      "grad_norm": 90.5,
      "learning_rate": 8.828267019631852e-06,
      "loss": 2.0824,
      "mean_token_accuracy": 0.9714950412511826,
      "num_tokens": 272973336.0,
      "step": 2370
    },
    {
      "epoch": 0.5252124020743683,
      "grad_norm": 113.5,
      "learning_rate": 8.8165061379973e-06,
      "loss": 2.139,
      "mean_token_accuracy": 0.971430093050003,
      "num_tokens": 274111194.0,
      "step": 2380
    },
    {
      "epoch": 0.5274191768730001,
      "grad_norm": 31.0,
      "learning_rate": 8.804694445224274e-06,
      "loss": 2.0423,
      "mean_token_accuracy": 0.9729425087571144,
      "num_tokens": 275259048.0,
      "step": 2390
    },
    {
      "epoch": 0.529625951671632,
      "grad_norm": 88.0,
      "learning_rate": 8.792832098568002e-06,
      "loss": 2.1412,
      "mean_token_accuracy": 0.9716853976249695,
      "num_tokens": 276425321.0,
      "step": 2400
    },
    {
      "epoch": 0.5318327264702637,
      "grad_norm": 145.0,
      "learning_rate": 8.7809192559581e-06,
      "loss": 2.1382,
      "mean_token_accuracy": 0.9710038855671883,
      "num_tokens": 277589607.0,
      "step": 2410
    },
    {
      "epoch": 0.5340395012688955,
      "grad_norm": 65.0,
      "learning_rate": 8.76895607599646e-06,
      "loss": 1.9629,
      "mean_token_accuracy": 0.9735075205564498,
      "num_tokens": 278744612.0,
      "step": 2420
    },
    {
      "epoch": 0.5362462760675273,
      "grad_norm": 103.0,
      "learning_rate": 8.756942717955142e-06,
      "loss": 2.0242,
      "mean_token_accuracy": 0.9731355354189872,
      "num_tokens": 279891771.0,
      "step": 2430
    },
    {
      "epoch": 0.5384530508661591,
      "grad_norm": 22.875,
      "learning_rate": 8.744879341774251e-06,
      "loss": 2.1573,
      "mean_token_accuracy": 0.9701264292001724,
      "num_tokens": 281039349.0,
      "step": 2440
    },
    {
      "epoch": 0.5406598256647909,
      "grad_norm": 23.75,
| "learning_rate": 8.732766108059814e-06, | |
| "loss": 2.2048, | |
| "mean_token_accuracy": 0.9715155780315399, | |
| "num_tokens": 282192901.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.5428666004634227, | |
| "grad_norm": 38.75, | |
| "learning_rate": 8.720603178081632e-06, | |
| "loss": 2.0297, | |
| "mean_token_accuracy": 0.9723820731043815, | |
| "num_tokens": 283350829.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.5450733752620545, | |
| "grad_norm": 43.75, | |
| "learning_rate": 8.708390713771145e-06, | |
| "loss": 1.9863, | |
| "mean_token_accuracy": 0.9735831454396248, | |
| "num_tokens": 284496558.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.5472801500606863, | |
| "grad_norm": 32.5, | |
| "learning_rate": 8.696128877719258e-06, | |
| "loss": 2.0849, | |
| "mean_token_accuracy": 0.973354434967041, | |
| "num_tokens": 285649122.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.5494869248593182, | |
| "grad_norm": 37.0, | |
| "learning_rate": 8.683817833174204e-06, | |
| "loss": 1.9877, | |
| "mean_token_accuracy": 0.972400875389576, | |
| "num_tokens": 286805979.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.5516936996579499, | |
| "grad_norm": 81.0, | |
| "learning_rate": 8.67145774403934e-06, | |
| "loss": 2.2094, | |
| "mean_token_accuracy": 0.968398331105709, | |
| "num_tokens": 287967501.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.5539004744565817, | |
| "grad_norm": 60.75, | |
| "learning_rate": 8.659048774870986e-06, | |
| "loss": 2.0722, | |
| "mean_token_accuracy": 0.9725334390997886, | |
| "num_tokens": 289123305.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.5561072492552135, | |
| "grad_norm": 42.25, | |
| "learning_rate": 8.646591090876225e-06, | |
| "loss": 1.9228, | |
| "mean_token_accuracy": 0.9744925335049629, | |
| "num_tokens": 290255229.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.5583140240538453, | |
| "grad_norm": 99.0, | |
| "learning_rate": 8.634084857910709e-06, | |
| "loss": 2.2221, | |
| "mean_token_accuracy": 0.9712776750326156, | |
| "num_tokens": 291393248.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.5605207988524771, | |
| "grad_norm": 21.25, | |
| "learning_rate": 8.621530242476446e-06, | |
| "loss": 2.0688, | |
| "mean_token_accuracy": 0.9726179495453835, | |
| "num_tokens": 292536784.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.5627275736511089, | |
| "grad_norm": 78.0, | |
| "learning_rate": 8.608927411719585e-06, | |
| "loss": 2.1445, | |
| "mean_token_accuracy": 0.9696800112724304, | |
| "num_tokens": 293691169.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.5649343484497407, | |
| "grad_norm": 146.0, | |
| "learning_rate": 8.59627653342819e-06, | |
| "loss": 2.2383, | |
| "mean_token_accuracy": 0.97010178565979, | |
| "num_tokens": 294839102.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.5671411232483725, | |
| "grad_norm": 133.0, | |
| "learning_rate": 8.583577776030005e-06, | |
| "loss": 2.0977, | |
| "mean_token_accuracy": 0.9701031193137168, | |
| "num_tokens": 296022598.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.5693478980470043, | |
| "grad_norm": 86.0, | |
| "learning_rate": 8.570831308590219e-06, | |
| "loss": 2.0667, | |
| "mean_token_accuracy": 0.9718448206782341, | |
| "num_tokens": 297174948.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.5715546728456361, | |
| "grad_norm": 69.0, | |
| "learning_rate": 8.558037300809209e-06, | |
| "loss": 2.0147, | |
| "mean_token_accuracy": 0.9745479449629784, | |
| "num_tokens": 298320791.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.573761447644268, | |
| "grad_norm": 65.0, | |
| "learning_rate": 8.545195923020273e-06, | |
| "loss": 2.0344, | |
| "mean_token_accuracy": 0.9733903989195823, | |
| "num_tokens": 299463593.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.5759682224428997, | |
| "grad_norm": 58.5, | |
| "learning_rate": 8.532307346187384e-06, | |
| "loss": 1.9952, | |
| "mean_token_accuracy": 0.9741147130727768, | |
| "num_tokens": 300610885.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.5781749972415315, | |
| "grad_norm": 40.75, | |
| "learning_rate": 8.519371741902888e-06, | |
| "loss": 1.9168, | |
| "mean_token_accuracy": 0.9743043631315231, | |
| "num_tokens": 301767297.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.5803817720401633, | |
| "grad_norm": 49.25, | |
| "learning_rate": 8.506389282385242e-06, | |
| "loss": 1.9265, | |
| "mean_token_accuracy": 0.9748492762446404, | |
| "num_tokens": 302914250.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.5825885468387951, | |
| "grad_norm": 31.0, | |
| "learning_rate": 8.493360140476699e-06, | |
| "loss": 2.044, | |
| "mean_token_accuracy": 0.9722670823335647, | |
| "num_tokens": 304078421.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.5847953216374269, | |
| "grad_norm": 43.75, | |
| "learning_rate": 8.480284489641034e-06, | |
| "loss": 2.0599, | |
| "mean_token_accuracy": 0.9714385136961937, | |
| "num_tokens": 305246598.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.5870020964360587, | |
| "grad_norm": 51.5, | |
| "learning_rate": 8.467162503961209e-06, | |
| "loss": 2.2316, | |
| "mean_token_accuracy": 0.9701140210032463, | |
| "num_tokens": 306403518.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.5892088712346905, | |
| "grad_norm": 61.25, | |
| "learning_rate": 8.45399435813707e-06, | |
| "loss": 2.0623, | |
| "mean_token_accuracy": 0.9728678569197655, | |
| "num_tokens": 307551805.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.5914156460333223, | |
| "grad_norm": 23.5, | |
| "learning_rate": 8.440780227483016e-06, | |
| "loss": 2.1453, | |
| "mean_token_accuracy": 0.9702819734811783, | |
| "num_tokens": 308697429.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.593622420831954, | |
| "grad_norm": 124.0, | |
| "learning_rate": 8.427520287925669e-06, | |
| "loss": 1.9928, | |
| "mean_token_accuracy": 0.9724789828062057, | |
| "num_tokens": 309857041.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.5958291956305859, | |
| "grad_norm": 41.0, | |
| "learning_rate": 8.414214716001519e-06, | |
| "loss": 2.1511, | |
| "mean_token_accuracy": 0.9718689009547233, | |
| "num_tokens": 311006379.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.5980359704292177, | |
| "grad_norm": 21.5, | |
| "learning_rate": 8.400863688854598e-06, | |
| "loss": 1.9249, | |
| "mean_token_accuracy": 0.9752550989389419, | |
| "num_tokens": 312144080.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.6002427452278495, | |
| "grad_norm": 19.25, | |
| "learning_rate": 8.387467384234096e-06, | |
| "loss": 2.056, | |
| "mean_token_accuracy": 0.9734023571014404, | |
| "num_tokens": 313292785.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.6024495200264813, | |
| "grad_norm": 142.0, | |
| "learning_rate": 8.37402598049201e-06, | |
| "loss": 1.9148, | |
| "mean_token_accuracy": 0.9750416144728661, | |
| "num_tokens": 314424933.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.6046562948251131, | |
| "grad_norm": 50.75, | |
| "learning_rate": 8.360539656580768e-06, | |
| "loss": 2.0538, | |
| "mean_token_accuracy": 0.9707178220152854, | |
| "num_tokens": 315584036.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.6068630696237449, | |
| "grad_norm": 50.0, | |
| "learning_rate": 8.347008592050834e-06, | |
| "loss": 2.0466, | |
| "mean_token_accuracy": 0.9731885403394699, | |
| "num_tokens": 316722761.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.6090698444223767, | |
| "grad_norm": 56.25, | |
| "learning_rate": 8.333432967048339e-06, | |
| "loss": 2.1593, | |
| "mean_token_accuracy": 0.971230548620224, | |
| "num_tokens": 317876682.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.6112766192210085, | |
| "grad_norm": 38.75, | |
| "learning_rate": 8.319812962312662e-06, | |
| "loss": 2.0091, | |
| "mean_token_accuracy": 0.9733019053936005, | |
| "num_tokens": 319024910.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.6134833940196402, | |
| "grad_norm": 121.0, | |
| "learning_rate": 8.306148759174036e-06, | |
| "loss": 1.9893, | |
| "mean_token_accuracy": 0.9746699512004853, | |
| "num_tokens": 320158889.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.6156901688182721, | |
| "grad_norm": 123.0, | |
| "learning_rate": 8.292440539551132e-06, | |
| "loss": 2.1569, | |
| "mean_token_accuracy": 0.9709573060274124, | |
| "num_tokens": 321318555.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.617896943616904, | |
| "grad_norm": 50.75, | |
| "learning_rate": 8.278688485948634e-06, | |
| "loss": 2.0323, | |
| "mean_token_accuracy": 0.9733840838074684, | |
| "num_tokens": 322454391.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.6201037184155357, | |
| "grad_norm": 45.75, | |
| "learning_rate": 8.264892781454807e-06, | |
| "loss": 2.1117, | |
| "mean_token_accuracy": 0.9699335008859634, | |
| "num_tokens": 323606695.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.6223104932141675, | |
| "grad_norm": 95.5, | |
| "learning_rate": 8.25105360973907e-06, | |
| "loss": 2.0165, | |
| "mean_token_accuracy": 0.9741360947489739, | |
| "num_tokens": 324765910.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.6245172680127993, | |
| "grad_norm": 49.75, | |
| "learning_rate": 8.237171155049539e-06, | |
| "loss": 1.9423, | |
| "mean_token_accuracy": 0.9731440529227257, | |
| "num_tokens": 325902759.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.6267240428114311, | |
| "grad_norm": 27.0, | |
| "learning_rate": 8.22324560221058e-06, | |
| "loss": 2.0187, | |
| "mean_token_accuracy": 0.9744000375270844, | |
| "num_tokens": 327054288.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.6289308176100629, | |
| "grad_norm": 21.875, | |
| "learning_rate": 8.209277136620348e-06, | |
| "loss": 1.9792, | |
| "mean_token_accuracy": 0.972829869389534, | |
| "num_tokens": 328208344.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.6311375924086947, | |
| "grad_norm": 52.5, | |
| "learning_rate": 8.195265944248315e-06, | |
| "loss": 2.1293, | |
| "mean_token_accuracy": 0.9708218917250633, | |
| "num_tokens": 329356990.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.6333443672073265, | |
| "grad_norm": 116.0, | |
| "learning_rate": 8.1812122116328e-06, | |
| "loss": 2.2068, | |
| "mean_token_accuracy": 0.9700580164790154, | |
| "num_tokens": 330517106.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.6355511420059583, | |
| "grad_norm": 130.0, | |
| "learning_rate": 8.167116125878483e-06, | |
| "loss": 2.1431, | |
| "mean_token_accuracy": 0.9697193175554275, | |
| "num_tokens": 331665781.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.63775791680459, | |
| "grad_norm": 127.5, | |
| "learning_rate": 8.152977874653909e-06, | |
| "loss": 1.9529, | |
| "mean_token_accuracy": 0.9740060716867447, | |
| "num_tokens": 332818811.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.6399646916032219, | |
| "grad_norm": 96.5, | |
| "learning_rate": 8.138797646189e-06, | |
| "loss": 2.1792, | |
| "mean_token_accuracy": 0.9703377351164818, | |
| "num_tokens": 333972881.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.6421714664018537, | |
| "grad_norm": 42.5, | |
| "learning_rate": 8.12457562927254e-06, | |
| "loss": 1.9051, | |
| "mean_token_accuracy": 0.9740207836031913, | |
| "num_tokens": 335124741.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.6443782412004855, | |
| "grad_norm": 37.5, | |
| "learning_rate": 8.11031201324966e-06, | |
| "loss": 2.2503, | |
| "mean_token_accuracy": 0.9717813417315483, | |
| "num_tokens": 336268246.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.6465850159991173, | |
| "grad_norm": 32.75, | |
| "learning_rate": 8.096006988019331e-06, | |
| "loss": 2.0364, | |
| "mean_token_accuracy": 0.9727631896734238, | |
| "num_tokens": 337408491.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.6487917907977491, | |
| "grad_norm": 23.5, | |
| "learning_rate": 8.081660744031818e-06, | |
| "loss": 2.0711, | |
| "mean_token_accuracy": 0.9743640199303627, | |
| "num_tokens": 338547657.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.6509985655963809, | |
| "grad_norm": 27.75, | |
| "learning_rate": 8.067273472286158e-06, | |
| "loss": 1.9144, | |
| "mean_token_accuracy": 0.9742624044418335, | |
| "num_tokens": 339694948.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.6532053403950127, | |
| "grad_norm": 22.125, | |
| "learning_rate": 8.052845364327609e-06, | |
| "loss": 2.0644, | |
| "mean_token_accuracy": 0.9725037887692451, | |
| "num_tokens": 340831900.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.6554121151936445, | |
| "grad_norm": 80.0, | |
| "learning_rate": 8.038376612245104e-06, | |
| "loss": 1.9717, | |
| "mean_token_accuracy": 0.9739261493086815, | |
| "num_tokens": 341976387.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.6576188899922762, | |
| "grad_norm": 63.0, | |
| "learning_rate": 8.023867408668692e-06, | |
| "loss": 1.9895, | |
| "mean_token_accuracy": 0.9710915103554726, | |
| "num_tokens": 343121358.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.6598256647909081, | |
| "grad_norm": 44.0, | |
| "learning_rate": 8.009317946766975e-06, | |
| "loss": 2.2102, | |
| "mean_token_accuracy": 0.9707724586129188, | |
| "num_tokens": 344286503.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.6620324395895398, | |
| "grad_norm": 80.5, | |
| "learning_rate": 7.994728420244533e-06, | |
| "loss": 2.0231, | |
| "mean_token_accuracy": 0.973461589217186, | |
| "num_tokens": 345446119.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.6642392143881717, | |
| "grad_norm": 22.0, | |
| "learning_rate": 7.98009902333935e-06, | |
| "loss": 2.1338, | |
| "mean_token_accuracy": 0.9714898496866227, | |
| "num_tokens": 346611461.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.6664459891868035, | |
| "grad_norm": 59.0, | |
| "learning_rate": 7.965429950820222e-06, | |
| "loss": 1.9669, | |
| "mean_token_accuracy": 0.97147196829319, | |
| "num_tokens": 347762352.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.6686527639854353, | |
| "grad_norm": 27.625, | |
| "learning_rate": 7.95072139798417e-06, | |
| "loss": 1.8776, | |
| "mean_token_accuracy": 0.9752503156661987, | |
| "num_tokens": 348918299.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.6708595387840671, | |
| "grad_norm": 145.0, | |
| "learning_rate": 7.935973560653838e-06, | |
| "loss": 2.0565, | |
| "mean_token_accuracy": 0.972902238368988, | |
| "num_tokens": 350054629.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.6730663135826989, | |
| "grad_norm": 30.625, | |
| "learning_rate": 7.92118663517488e-06, | |
| "loss": 2.0735, | |
| "mean_token_accuracy": 0.9734035804867744, | |
| "num_tokens": 351206148.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.6752730883813307, | |
| "grad_norm": 113.0, | |
| "learning_rate": 7.906360818413354e-06, | |
| "loss": 2.1869, | |
| "mean_token_accuracy": 0.9721048012375831, | |
| "num_tokens": 352359524.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.6774798631799624, | |
| "grad_norm": 102.0, | |
| "learning_rate": 7.891496307753099e-06, | |
| "loss": 2.0249, | |
| "mean_token_accuracy": 0.9733120024204254, | |
| "num_tokens": 353503230.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.6796866379785943, | |
| "grad_norm": 23.625, | |
| "learning_rate": 7.876593301093104e-06, | |
| "loss": 1.9728, | |
| "mean_token_accuracy": 0.9718259304761887, | |
| "num_tokens": 354658570.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.681893412777226, | |
| "grad_norm": 153.0, | |
| "learning_rate": 7.861651996844877e-06, | |
| "loss": 2.001, | |
| "mean_token_accuracy": 0.9736142039299012, | |
| "num_tokens": 355812944.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.6841001875758579, | |
| "grad_norm": 93.0, | |
| "learning_rate": 7.8466725939298e-06, | |
| "loss": 1.9335, | |
| "mean_token_accuracy": 0.9742251902818679, | |
| "num_tokens": 356972747.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.6863069623744897, | |
| "grad_norm": 41.5, | |
| "learning_rate": 7.831655291776484e-06, | |
| "loss": 2.125, | |
| "mean_token_accuracy": 0.9710106894373893, | |
| "num_tokens": 358131882.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.6885137371731215, | |
| "grad_norm": 56.25, | |
| "learning_rate": 7.81660029031811e-06, | |
| "loss": 2.3097, | |
| "mean_token_accuracy": 0.9698337525129318, | |
| "num_tokens": 359299520.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.6907205119717533, | |
| "grad_norm": 136.0, | |
| "learning_rate": 7.801507789989775e-06, | |
| "loss": 2.1308, | |
| "mean_token_accuracy": 0.9719786927103996, | |
| "num_tokens": 360454015.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.6929272867703851, | |
| "grad_norm": 62.0, | |
| "learning_rate": 7.786377991725813e-06, | |
| "loss": 1.9726, | |
| "mean_token_accuracy": 0.9731140062212944, | |
| "num_tokens": 361600146.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.6951340615690169, | |
| "grad_norm": 22.625, | |
| "learning_rate": 7.771211096957125e-06, | |
| "loss": 2.1164, | |
| "mean_token_accuracy": 0.9709695622324943, | |
| "num_tokens": 362760084.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.6973408363676487, | |
| "grad_norm": 114.5, | |
| "learning_rate": 7.756007307608498e-06, | |
| "loss": 1.8929, | |
| "mean_token_accuracy": 0.9743599608540535, | |
| "num_tokens": 363913574.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.6995476111662805, | |
| "grad_norm": 36.75, | |
| "learning_rate": 7.740766826095918e-06, | |
| "loss": 2.1488, | |
| "mean_token_accuracy": 0.9718498140573502, | |
| "num_tokens": 365077588.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 45.5, | |
| "learning_rate": 7.725489855323869e-06, | |
| "loss": 1.9263, | |
| "mean_token_accuracy": 0.9727461755275726, | |
| "num_tokens": 366224173.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.7039611607635441, | |
| "grad_norm": 20.375, | |
| "learning_rate": 7.710176598682639e-06, | |
| "loss": 1.9469, | |
| "mean_token_accuracy": 0.9736356347799301, | |
| "num_tokens": 367384660.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.7061679355621758, | |
| "grad_norm": 65.5, | |
| "learning_rate": 7.694827260045608e-06, | |
| "loss": 1.9055, | |
| "mean_token_accuracy": 0.9741185575723648, | |
| "num_tokens": 368527453.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.7083747103608077, | |
| "grad_norm": 82.5, | |
| "learning_rate": 7.679442043766534e-06, | |
| "loss": 1.8263, | |
| "mean_token_accuracy": 0.9747278049588204, | |
| "num_tokens": 369692607.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.7105814851594395, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.664021154676828e-06, | |
| "loss": 2.1727, | |
| "mean_token_accuracy": 0.9689811453223228, | |
| "num_tokens": 370843483.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.7127882599580713, | |
| "grad_norm": 68.0, | |
| "learning_rate": 7.648564798082842e-06, | |
| "loss": 1.9461, | |
| "mean_token_accuracy": 0.9737423777580261, | |
| "num_tokens": 371999795.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.7149950347567031, | |
| "grad_norm": 113.0, | |
| "learning_rate": 7.63307317976312e-06, | |
| "loss": 1.8929, | |
| "mean_token_accuracy": 0.9735087290406227, | |
| "num_tokens": 373132504.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.7172018095553349, | |
| "grad_norm": 32.0, | |
| "learning_rate": 7.617546505965658e-06, | |
| "loss": 2.176, | |
| "mean_token_accuracy": 0.9702001288533211, | |
| "num_tokens": 374278633.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.7194085843539667, | |
| "grad_norm": 61.5, | |
| "learning_rate": 7.601984983405173e-06, | |
| "loss": 1.874, | |
| "mean_token_accuracy": 0.9737308770418167, | |
| "num_tokens": 375435610.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.7216153591525984, | |
| "grad_norm": 46.0, | |
| "learning_rate": 7.586388819260338e-06, | |
| "loss": 1.9906, | |
| "mean_token_accuracy": 0.9728427931666375, | |
| "num_tokens": 376585019.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.7238221339512303, | |
| "grad_norm": 66.0, | |
| "learning_rate": 7.5707582211710265e-06, | |
| "loss": 2.0389, | |
| "mean_token_accuracy": 0.9714148506522179, | |
| "num_tokens": 377729221.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.726028908749862, | |
| "grad_norm": 50.5, | |
| "learning_rate": 7.555093397235553e-06, | |
| "loss": 1.9117, | |
| "mean_token_accuracy": 0.972545000910759, | |
| "num_tokens": 378881369.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.7282356835484939, | |
| "grad_norm": 54.0, | |
| "learning_rate": 7.539394556007892e-06, | |
| "loss": 2.0735, | |
| "mean_token_accuracy": 0.9732621818780899, | |
| "num_tokens": 380031674.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.7304424583471256, | |
| "grad_norm": 102.0, | |
| "learning_rate": 7.523661906494913e-06, | |
| "loss": 2.0019, | |
| "mean_token_accuracy": 0.9710119545459748, | |
| "num_tokens": 381203145.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.7326492331457575, | |
| "grad_norm": 36.25, | |
| "learning_rate": 7.507895658153594e-06, | |
| "loss": 2.0323, | |
| "mean_token_accuracy": 0.9723460420966148, | |
| "num_tokens": 382372687.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.7348560079443893, | |
| "grad_norm": 30.5, | |
| "learning_rate": 7.492096020888227e-06, | |
| "loss": 2.1082, | |
| "mean_token_accuracy": 0.9706227511167527, | |
| "num_tokens": 383536359.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.7370627827430211, | |
| "grad_norm": 45.5, | |
| "learning_rate": 7.476263205047629e-06, | |
| "loss": 2.0984, | |
| "mean_token_accuracy": 0.9733631387352943, | |
| "num_tokens": 384692264.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.7392695575416529, | |
| "grad_norm": 48.25, | |
| "learning_rate": 7.460397421422346e-06, | |
| "loss": 1.9579, | |
| "mean_token_accuracy": 0.9755487963557243, | |
| "num_tokens": 385854489.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.7414763323402846, | |
| "grad_norm": 38.5, | |
| "learning_rate": 7.444498881241835e-06, | |
| "loss": 1.883, | |
| "mean_token_accuracy": 0.9751752257347107, | |
| "num_tokens": 387020276.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.7436831071389165, | |
| "grad_norm": 93.5, | |
| "learning_rate": 7.428567796171662e-06, | |
| "loss": 1.8614, | |
| "mean_token_accuracy": 0.9743314564228058, | |
| "num_tokens": 388165843.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.7458898819375482, | |
| "grad_norm": 86.0, | |
| "learning_rate": 7.412604378310677e-06, | |
| "loss": 2.0376, | |
| "mean_token_accuracy": 0.9736068055033684, | |
| "num_tokens": 389329983.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.7480966567361801, | |
| "grad_norm": 41.5, | |
| "learning_rate": 7.3966088401881975e-06, | |
| "loss": 2.0117, | |
| "mean_token_accuracy": 0.9720857679843903, | |
| "num_tokens": 390480961.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.7503034315348118, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.380581394761169e-06, | |
| "loss": 1.9471, | |
| "mean_token_accuracy": 0.9743036434054375, | |
| "num_tokens": 391648757.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.7525102063334437, | |
| "grad_norm": 42.0, | |
| "learning_rate": 7.364522255411342e-06, | |
| "loss": 1.9498, | |
| "mean_token_accuracy": 0.9737771943211555, | |
| "num_tokens": 392812461.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 32.5, | |
| "learning_rate": 7.348431635942421e-06, | |
| "loss": 1.9114, | |
| "mean_token_accuracy": 0.9731463566422462, | |
| "num_tokens": 393984685.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.7569237559307073, | |
| "grad_norm": 38.5, | |
| "learning_rate": 7.3323097505772225e-06, | |
| "loss": 2.1514, | |
| "mean_token_accuracy": 0.9719810456037521, | |
| "num_tokens": 395114801.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.7591305307293391, | |
| "grad_norm": 147.0, | |
| "learning_rate": 7.316156813954821e-06, | |
| "loss": 2.1332, | |
| "mean_token_accuracy": 0.9735054716467857, | |
| "num_tokens": 396258212.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.7613373055279709, | |
| "grad_norm": 34.25, | |
| "learning_rate": 7.299973041127695e-06, | |
| "loss": 2.2288, | |
| "mean_token_accuracy": 0.9703874930739402, | |
| "num_tokens": 397415678.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.7635440803266027, | |
| "grad_norm": 25.625, | |
| "learning_rate": 7.28375864755886e-06, | |
| "loss": 2.0425, | |
| "mean_token_accuracy": 0.9717745870351792, | |
| "num_tokens": 398569653.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.7657508551252344, | |
| "grad_norm": 36.25, | |
| "learning_rate": 7.267513849119001e-06, | |
| "loss": 2.1182, | |
| "mean_token_accuracy": 0.9726355031132699, | |
| "num_tokens": 399718564.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.7679576299238663, | |
| "grad_norm": 60.75, | |
| "learning_rate": 7.251238862083602e-06, | |
| "loss": 2.2483, | |
| "mean_token_accuracy": 0.9701059967279434, | |
| "num_tokens": 400873786.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.770164404722498, | |
| "grad_norm": 56.75, | |
| "learning_rate": 7.234933903130057e-06, | |
| "loss": 1.9868, | |
| "mean_token_accuracy": 0.9732056260108948, | |
| "num_tokens": 402027889.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.7723711795211299, | |
| "grad_norm": 118.0, | |
| "learning_rate": 7.218599189334799e-06, | |
| "loss": 2.1064, | |
| "mean_token_accuracy": 0.9707829400897026, | |
| "num_tokens": 403183692.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.7745779543197616, | |
| "grad_norm": 25.375, | |
| "learning_rate": 7.202234938170399e-06, | |
| "loss": 1.8403, | |
| "mean_token_accuracy": 0.9762239485979081, | |
| "num_tokens": 404332296.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.7767847291183935, | |
| "grad_norm": 72.0, | |
| "learning_rate": 7.185841367502675e-06, | |
| "loss": 2.0122, | |
| "mean_token_accuracy": 0.9730216577649117, | |
| "num_tokens": 405494063.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.7789915039170253, | |
| "grad_norm": 75.5, | |
| "learning_rate": 7.169418695587791e-06, | |
| "loss": 2.0618, | |
| "mean_token_accuracy": 0.9724257558584213, | |
| "num_tokens": 406624626.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.7811982787156571, | |
| "grad_norm": 67.0, | |
| "learning_rate": 7.152967141069351e-06, | |
| "loss": 2.1283, | |
| "mean_token_accuracy": 0.9700005114078522, | |
| "num_tokens": 407777186.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.7834050535142889, | |
| "grad_norm": 81.0, | |
| "learning_rate": 7.136486922975489e-06, | |
| "loss": 2.029, | |
| "mean_token_accuracy": 0.9731312170624733, | |
| "num_tokens": 408930294.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.7856118283129206, | |
| "grad_norm": 84.0, | |
| "learning_rate": 7.1199782607159494e-06, | |
| "loss": 1.9562, | |
| "mean_token_accuracy": 0.9732894912362099, | |
| "num_tokens": 410069873.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.7878186031115525, | |
| "grad_norm": 23.875, | |
| "learning_rate": 7.1034413740791705e-06, | |
| "loss": 2.0378, | |
| "mean_token_accuracy": 0.9730590000748635, | |
| "num_tokens": 411241330.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.7900253779101842, | |
| "grad_norm": 21.125, | |
| "learning_rate": 7.086876483229359e-06, | |
| "loss": 2.0968, | |
| "mean_token_accuracy": 0.9724582180380821, | |
| "num_tokens": 412379922.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.7922321527088161, | |
| "grad_norm": 179.0, | |
| "learning_rate": 7.070283808703553e-06, | |
| "loss": 1.8324, | |
| "mean_token_accuracy": 0.9765561237931252, | |
| "num_tokens": 413513375.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.7944389275074478, | |
| "grad_norm": 40.75, | |
| "learning_rate": 7.05366357140869e-06, | |
| "loss": 1.9408, | |
| "mean_token_accuracy": 0.9745825082063675, | |
| "num_tokens": 414647824.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.7966457023060797, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.0370159926186645e-06, | |
| "loss": 2.0276, | |
| "mean_token_accuracy": 0.9720786541700364, | |
| "num_tokens": 415793929.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.7988524771047114, | |
| "grad_norm": 87.5, | |
| "learning_rate": 7.020341293971383e-06, | |
| "loss": 2.1084, | |
| "mean_token_accuracy": 0.9713257819414138, | |
| "num_tokens": 416932978.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.8010592519033433, | |
| "grad_norm": 21.125, | |
| "learning_rate": 7.003639697465813e-06, | |
| "loss": 2.0516, | |
| "mean_token_accuracy": 0.9730439364910126, | |
| "num_tokens": 418079331.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.8032660267019751, | |
| "grad_norm": 87.5, | |
| "learning_rate": 6.986911425459028e-06, | |
| "loss": 1.9512, | |
| "mean_token_accuracy": 0.9737675413489342, | |
| "num_tokens": 419238043.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.8054728015006068, | |
| "grad_norm": 22.375, | |
| "learning_rate": 6.970156700663244e-06, | |
| "loss": 2.1664, | |
| "mean_token_accuracy": 0.9700018242001534, | |
| "num_tokens": 420388302.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.8076795762992387, | |
| "grad_norm": 25.875, | |
| "learning_rate": 6.953375746142861e-06, | |
| "loss": 2.0332, | |
| "mean_token_accuracy": 0.971338514983654, | |
| "num_tokens": 421540248.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.8098863510978704, | |
| "grad_norm": 36.75, | |
| "learning_rate": 6.936568785311484e-06, | |
| "loss": 2.2015, | |
| "mean_token_accuracy": 0.9694642707705498, | |
| "num_tokens": 422692464.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.8120931258965023, | |
| "grad_norm": 48.25, | |
| "learning_rate": 6.919736041928956e-06, | |
| "loss": 2.0133, | |
| "mean_token_accuracy": 0.9728905394673347, | |
| "num_tokens": 423836708.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.814299900695134, | |
| "grad_norm": 94.0, | |
| "learning_rate": 6.902877740098377e-06, | |
| "loss": 1.9576, | |
| "mean_token_accuracy": 0.9759998172521591, | |
| "num_tokens": 424974866.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.8165066754937659, | |
| "grad_norm": 125.5, | |
| "learning_rate": 6.885994104263122e-06, | |
| "loss": 2.0295, | |
| "mean_token_accuracy": 0.9710850700736046, | |
| "num_tokens": 426139483.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.8187134502923976, | |
| "grad_norm": 90.5, | |
| "learning_rate": 6.869085359203844e-06, | |
| "loss": 1.8736, | |
| "mean_token_accuracy": 0.9742442756891251, | |
| "num_tokens": 427285632.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.8209202250910295, | |
| "grad_norm": 22.125, | |
| "learning_rate": 6.852151730035497e-06, | |
| "loss": 1.9585, | |
| "mean_token_accuracy": 0.974020728468895, | |
| "num_tokens": 428443780.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.8231269998896613, | |
| "grad_norm": 108.0, | |
| "learning_rate": 6.835193442204322e-06, | |
| "loss": 1.8939, | |
| "mean_token_accuracy": 0.9728642031550407, | |
| "num_tokens": 429598368.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.825333774688293, | |
| "grad_norm": 91.0, | |
| "learning_rate": 6.818210721484859e-06, | |
| "loss": 2.0947, | |
| "mean_token_accuracy": 0.9714571252465248, | |
| "num_tokens": 430730120.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.8275405494869249, | |
| "grad_norm": 57.75, | |
| "learning_rate": 6.801203793976933e-06, | |
| "loss": 1.975, | |
| "mean_token_accuracy": 0.9747939929366112, | |
| "num_tokens": 431876764.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.8297473242855566, | |
| "grad_norm": 31.875, | |
| "learning_rate": 6.784172886102649e-06, | |
| "loss": 2.146, | |
| "mean_token_accuracy": 0.970741206407547, | |
| "num_tokens": 433028529.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.8319540990841885, | |
| "grad_norm": 45.75, | |
| "learning_rate": 6.767118224603374e-06, | |
| "loss": 2.1145, | |
| "mean_token_accuracy": 0.97153180539608, | |
| "num_tokens": 434182536.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.8341608738828202, | |
| "grad_norm": 44.75, | |
| "learning_rate": 6.750040036536718e-06, | |
| "loss": 2.0201, | |
| "mean_token_accuracy": 0.9715746879577637, | |
| "num_tokens": 435331431.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.8363676486814521, | |
| "grad_norm": 38.0, | |
| "learning_rate": 6.732938549273517e-06, | |
| "loss": 1.951, | |
| "mean_token_accuracy": 0.9750014156103134, | |
| "num_tokens": 436472717.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.8385744234800838, | |
| "grad_norm": 40.0, | |
| "learning_rate": 6.715813990494793e-06, | |
| "loss": 1.945, | |
| "mean_token_accuracy": 0.9741330206394195, | |
| "num_tokens": 437624930.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.8407811982787157, | |
| "grad_norm": 42.25, | |
| "learning_rate": 6.698666588188738e-06, | |
| "loss": 1.9308, | |
| "mean_token_accuracy": 0.9744031310081482, | |
| "num_tokens": 438753341.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.8429879730773474, | |
| "grad_norm": 40.75, | |
| "learning_rate": 6.681496570647672e-06, | |
| "loss": 1.9288, | |
| "mean_token_accuracy": 0.9738052412867546, | |
| "num_tokens": 439898241.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.8451947478759793, | |
| "grad_norm": 155.0, | |
| "learning_rate": 6.664304166465e-06, | |
| "loss": 2.071, | |
| "mean_token_accuracy": 0.9715085223317146, | |
| "num_tokens": 441063233.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.8474015226746111, | |
| "grad_norm": 116.0, | |
| "learning_rate": 6.647089604532174e-06, | |
| "loss": 1.9598, | |
| "mean_token_accuracy": 0.9733199805021286, | |
| "num_tokens": 442224063.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.8496082974732428, | |
| "grad_norm": 51.5, | |
| "learning_rate": 6.629853114035643e-06, | |
| "loss": 1.9578, | |
| "mean_token_accuracy": 0.9738478988409043, | |
| "num_tokens": 443373342.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.8518150722718747, | |
| "grad_norm": 71.0, | |
| "learning_rate": 6.612594924453801e-06, | |
| "loss": 1.9895, | |
| "mean_token_accuracy": 0.9734658822417259, | |
| "num_tokens": 444525326.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.8540218470705064, | |
| "grad_norm": 51.25, | |
| "learning_rate": 6.595315265553938e-06, | |
| "loss": 1.9769, | |
| "mean_token_accuracy": 0.9753797441720963, | |
| "num_tokens": 445680273.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.8562286218691383, | |
| "grad_norm": 93.5, | |
| "learning_rate": 6.578014367389173e-06, | |
| "loss": 1.926, | |
| "mean_token_accuracy": 0.9731380641460419, | |
| "num_tokens": 446824826.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.85843539666777, | |
| "grad_norm": 88.0, | |
| "learning_rate": 6.5606924602953925e-06, | |
| "loss": 1.9587, | |
| "mean_token_accuracy": 0.9737185269594193, | |
| "num_tokens": 447982855.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.8606421714664019, | |
| "grad_norm": 148.0, | |
| "learning_rate": 6.543349774888188e-06, | |
| "loss": 2.038, | |
| "mean_token_accuracy": 0.9721207752823829, | |
| "num_tokens": 449125897.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.8628489462650336, | |
| "grad_norm": 74.0, | |
| "learning_rate": 6.525986542059783e-06, | |
| "loss": 2.0435, | |
| "mean_token_accuracy": 0.971932566165924, | |
| "num_tokens": 450261988.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.8650557210636655, | |
| "grad_norm": 35.75, | |
| "learning_rate": 6.508602992975963e-06, | |
| "loss": 1.9373, | |
| "mean_token_accuracy": 0.9739039227366447, | |
| "num_tokens": 451410402.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.8672624958622972, | |
| "grad_norm": 24.375, | |
| "learning_rate": 6.4911993590729885e-06, | |
| "loss": 2.0521, | |
| "mean_token_accuracy": 0.9729105517268181, | |
| "num_tokens": 452555348.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.869469270660929, | |
| "grad_norm": 69.0, | |
| "learning_rate": 6.473775872054522e-06, | |
| "loss": 2.1666, | |
| "mean_token_accuracy": 0.970484359562397, | |
| "num_tokens": 453705084.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.8716760454595609, | |
| "grad_norm": 76.0, | |
| "learning_rate": 6.456332763888544e-06, | |
| "loss": 1.9717, | |
| "mean_token_accuracy": 0.9747552618384361, | |
| "num_tokens": 454860803.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.8738828202581926, | |
| "grad_norm": 18.125, | |
| "learning_rate": 6.438870266804258e-06, | |
| "loss": 1.8728, | |
| "mean_token_accuracy": 0.9757704868912697, | |
| "num_tokens": 456010882.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.8760895950568245, | |
| "grad_norm": 41.75, | |
| "learning_rate": 6.421388613289003e-06, | |
| "loss": 2.0784, | |
| "mean_token_accuracy": 0.9714920371770859, | |
| "num_tokens": 457172526.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.8782963698554562, | |
| "grad_norm": 56.75, | |
| "learning_rate": 6.403888036085155e-06, | |
| "loss": 1.9718, | |
| "mean_token_accuracy": 0.9731816455721856, | |
| "num_tokens": 458331301.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.8805031446540881, | |
| "grad_norm": 112.5, | |
| "learning_rate": 6.38636876818704e-06, | |
| "loss": 1.9707, | |
| "mean_token_accuracy": 0.9742238566279411, | |
| "num_tokens": 459478691.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.8827099194527198, | |
| "grad_norm": 65.5, | |
| "learning_rate": 6.368831042837813e-06, | |
| "loss": 1.7388, | |
| "mean_token_accuracy": 0.9770509406924248, | |
| "num_tokens": 460641537.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.8849166942513517, | |
| "grad_norm": 72.0, | |
| "learning_rate": 6.3512750935263664e-06, | |
| "loss": 2.2354, | |
| "mean_token_accuracy": 0.9715738728642463, | |
| "num_tokens": 461782096.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.8871234690499834, | |
| "grad_norm": 83.0, | |
| "learning_rate": 6.3337011539842195e-06, | |
| "loss": 2.0882, | |
| "mean_token_accuracy": 0.9732289671897888, | |
| "num_tokens": 462937379.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.8893302438486153, | |
| "grad_norm": 150.0, | |
| "learning_rate": 6.316109458182402e-06, | |
| "loss": 1.8487, | |
| "mean_token_accuracy": 0.9749891921877861, | |
| "num_tokens": 464087113.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.8915370186472471, | |
| "grad_norm": 111.0, | |
| "learning_rate": 6.298500240328342e-06, | |
| "loss": 2.1394, | |
| "mean_token_accuracy": 0.9707197308540344, | |
| "num_tokens": 465247534.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.8937437934458788, | |
| "grad_norm": 103.5, | |
| "learning_rate": 6.2808737348627514e-06, | |
| "loss": 1.9198, | |
| "mean_token_accuracy": 0.9757748633623123, | |
| "num_tokens": 466410461.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.8959505682445107, | |
| "grad_norm": 114.5, | |
| "learning_rate": 6.263230176456497e-06, | |
| "loss": 2.0151, | |
| "mean_token_accuracy": 0.971070083975792, | |
| "num_tokens": 467567854.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.8981573430431424, | |
| "grad_norm": 81.5, | |
| "learning_rate": 6.245569800007484e-06, | |
| "loss": 1.9747, | |
| "mean_token_accuracy": 0.9723834574222565, | |
| "num_tokens": 468711819.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.9003641178417743, | |
| "grad_norm": 62.75, | |
| "learning_rate": 6.227892840637521e-06, | |
| "loss": 1.9789, | |
| "mean_token_accuracy": 0.971935348212719, | |
| "num_tokens": 469870580.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.902570892640406, | |
| "grad_norm": 131.0, | |
| "learning_rate": 6.210199533689196e-06, | |
| "loss": 2.013, | |
| "mean_token_accuracy": 0.9719853803515435, | |
| "num_tokens": 471035721.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.9047776674390379, | |
| "grad_norm": 78.5, | |
| "learning_rate": 6.192490114722741e-06, | |
| "loss": 1.9299, | |
| "mean_token_accuracy": 0.9731788396835327, | |
| "num_tokens": 472203289.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.9069844422376696, | |
| "grad_norm": 25.0, | |
| "learning_rate": 6.174764819512895e-06, | |
| "loss": 1.9679, | |
| "mean_token_accuracy": 0.9731206983327866, | |
| "num_tokens": 473356133.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.9091912170363015, | |
| "grad_norm": 48.0, | |
| "learning_rate": 6.157023884045766e-06, | |
| "loss": 1.993, | |
| "mean_token_accuracy": 0.9731161788105964, | |
| "num_tokens": 474505797.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.9113979918349332, | |
| "grad_norm": 70.0, | |
| "learning_rate": 6.139267544515689e-06, | |
| "loss": 2.1029, | |
| "mean_token_accuracy": 0.9712511390447617, | |
| "num_tokens": 475655265.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.913604766633565, | |
| "grad_norm": 53.5, | |
| "learning_rate": 6.121496037322081e-06, | |
| "loss": 1.9634, | |
| "mean_token_accuracy": 0.9742270961403847, | |
| "num_tokens": 476812532.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.9158115414321969, | |
| "grad_norm": 88.5, | |
| "learning_rate": 6.103709599066293e-06, | |
| "loss": 2.106, | |
| "mean_token_accuracy": 0.97165088057518, | |
| "num_tokens": 477945245.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.9180183162308286, | |
| "grad_norm": 42.25, | |
| "learning_rate": 6.0859084665484645e-06, | |
| "loss": 1.9638, | |
| "mean_token_accuracy": 0.9749818623065949, | |
| "num_tokens": 479088823.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.9202250910294605, | |
| "grad_norm": 33.25, | |
| "learning_rate": 6.068092876764365e-06, | |
| "loss": 1.997, | |
| "mean_token_accuracy": 0.9727446466684342, | |
| "num_tokens": 480259334.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.9224318658280922, | |
| "grad_norm": 26.0, | |
| "learning_rate": 6.050263066902239e-06, | |
| "loss": 1.862, | |
| "mean_token_accuracy": 0.9741847306489945, | |
| "num_tokens": 481410657.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.9246386406267241, | |
| "grad_norm": 27.625, | |
| "learning_rate": 6.032419274339654e-06, | |
| "loss": 1.8865, | |
| "mean_token_accuracy": 0.9747850999236107, | |
| "num_tokens": 482565499.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.9268454154253558, | |
| "grad_norm": 24.25, | |
| "learning_rate": 6.014561736640334e-06, | |
| "loss": 2.0536, | |
| "mean_token_accuracy": 0.9746803641319275, | |
| "num_tokens": 483720896.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.9290521902239877, | |
| "grad_norm": 49.25, | |
| "learning_rate": 5.996690691551002e-06, | |
| "loss": 1.936, | |
| "mean_token_accuracy": 0.9734649583697319, | |
| "num_tokens": 484876264.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.9312589650226194, | |
| "grad_norm": 29.375, | |
| "learning_rate": 5.978806376998209e-06, | |
| "loss": 2.1987, | |
| "mean_token_accuracy": 0.9715402513742447, | |
| "num_tokens": 486043467.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.9334657398212513, | |
| "grad_norm": 33.75, | |
| "learning_rate": 5.960909031085173e-06, | |
| "loss": 1.9104, | |
| "mean_token_accuracy": 0.9750516951084137, | |
| "num_tokens": 487199636.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.935672514619883, | |
| "grad_norm": 34.75, | |
| "learning_rate": 5.942998892088598e-06, | |
| "loss": 1.9747, | |
| "mean_token_accuracy": 0.9729895129799843, | |
| "num_tokens": 488358973.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.9378792894185148, | |
| "grad_norm": 88.5, | |
| "learning_rate": 5.925076198455517e-06, | |
| "loss": 1.9454, | |
| "mean_token_accuracy": 0.975823138654232, | |
| "num_tokens": 489491808.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.9400860642171467, | |
| "grad_norm": 72.0, | |
| "learning_rate": 5.907141188800106e-06, | |
| "loss": 1.8519, | |
| "mean_token_accuracy": 0.9764188945293426, | |
| "num_tokens": 490644658.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.9422928390157784, | |
| "grad_norm": 83.0, | |
| "learning_rate": 5.8891941019005095e-06, | |
| "loss": 2.1013, | |
| "mean_token_accuracy": 0.9695283144712448, | |
| "num_tokens": 491805320.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.9444996138144103, | |
| "grad_norm": 29.0, | |
| "learning_rate": 5.871235176695664e-06, | |
| "loss": 2.0374, | |
| "mean_token_accuracy": 0.9735263183712959, | |
| "num_tokens": 492957724.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.946706388613042, | |
| "grad_norm": 22.75, | |
| "learning_rate": 5.853264652282118e-06, | |
| "loss": 2.0565, | |
| "mean_token_accuracy": 0.9717079788446427, | |
| "num_tokens": 494123799.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.9489131634116739, | |
| "grad_norm": 31.5, | |
| "learning_rate": 5.835282767910841e-06, | |
| "loss": 2.081, | |
| "mean_token_accuracy": 0.9734249457716941, | |
| "num_tokens": 495270311.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.9511199382103056, | |
| "grad_norm": 27.75, | |
| "learning_rate": 5.817289762984048e-06, | |
| "loss": 1.8168, | |
| "mean_token_accuracy": 0.9772966921329498, | |
| "num_tokens": 496407368.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.9533267130089375, | |
| "grad_norm": 116.5, | |
| "learning_rate": 5.799285877052007e-06, | |
| "loss": 1.8701, | |
| "mean_token_accuracy": 0.9749503463506699, | |
| "num_tokens": 497557772.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.9555334878075692, | |
| "grad_norm": 90.5, | |
| "learning_rate": 5.781271349809845e-06, | |
| "loss": 2.0159, | |
| "mean_token_accuracy": 0.9723986998200417, | |
| "num_tokens": 498693143.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.957740262606201, | |
| "grad_norm": 48.25, | |
| "learning_rate": 5.763246421094373e-06, | |
| "loss": 1.8929, | |
| "mean_token_accuracy": 0.9749439150094986, | |
| "num_tokens": 499840246.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.9599470374048328, | |
| "grad_norm": 22.375, | |
| "learning_rate": 5.745211330880872e-06, | |
| "loss": 2.116, | |
| "mean_token_accuracy": 0.9735443726181984, | |
| "num_tokens": 500979381.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.9621538122034646, | |
| "grad_norm": 112.5, | |
| "learning_rate": 5.727166319279915e-06, | |
| "loss": 1.9127, | |
| "mean_token_accuracy": 0.9724130749702453, | |
| "num_tokens": 502139137.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.9643605870020965, | |
| "grad_norm": 50.0, | |
| "learning_rate": 5.709111626534161e-06, | |
| "loss": 1.9868, | |
| "mean_token_accuracy": 0.9747680604457856, | |
| "num_tokens": 503279592.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.9665673618007282, | |
| "grad_norm": 30.0, | |
| "learning_rate": 5.691047493015157e-06, | |
| "loss": 1.8707, | |
| "mean_token_accuracy": 0.9753078848123551, | |
| "num_tokens": 504430413.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.9687741365993601, | |
| "grad_norm": 45.75, | |
| "learning_rate": 5.672974159220145e-06, | |
| "loss": 2.0623, | |
| "mean_token_accuracy": 0.9742168113589287, | |
| "num_tokens": 505570863.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.9709809113979918, | |
| "grad_norm": 41.25, | |
| "learning_rate": 5.65489186576885e-06, | |
| "loss": 1.9755, | |
| "mean_token_accuracy": 0.9720865845680237, | |
| "num_tokens": 506719123.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.9731876861966237, | |
| "grad_norm": 29.0, | |
| "learning_rate": 5.636800853400285e-06, | |
| "loss": 2.0437, | |
| "mean_token_accuracy": 0.975582891702652, | |
| "num_tokens": 507867011.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.9753944609952554, | |
| "grad_norm": 48.0, | |
| "learning_rate": 5.618701362969541e-06, | |
| "loss": 2.0357, | |
| "mean_token_accuracy": 0.972903598845005, | |
| "num_tokens": 509021659.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.9776012357938872, | |
| "grad_norm": 31.125, | |
| "learning_rate": 5.600593635444583e-06, | |
| "loss": 2.1222, | |
| "mean_token_accuracy": 0.9717710703611374, | |
| "num_tokens": 510182725.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.979808010592519, | |
| "grad_norm": 74.5, | |
| "learning_rate": 5.582477911903039e-06, | |
| "loss": 2.0526, | |
| "mean_token_accuracy": 0.9730535179376603, | |
| "num_tokens": 511344854.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.9820147853911508, | |
| "grad_norm": 144.0, | |
| "learning_rate": 5.564354433528993e-06, | |
| "loss": 1.9085, | |
| "mean_token_accuracy": 0.9728212565183639, | |
| "num_tokens": 512488195.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.9842215601897827, | |
| "grad_norm": 72.5, | |
| "learning_rate": 5.546223441609775e-06, | |
| "loss": 1.9854, | |
| "mean_token_accuracy": 0.9746579915285111, | |
| "num_tokens": 513631766.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.9864283349884144, | |
| "grad_norm": 149.0, | |
| "learning_rate": 5.5280851775327435e-06, | |
| "loss": 2.0096, | |
| "mean_token_accuracy": 0.9739761129021645, | |
| "num_tokens": 514787130.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.9886351097870463, | |
| "grad_norm": 38.5, | |
| "learning_rate": 5.509939882782077e-06, | |
| "loss": 2.0272, | |
| "mean_token_accuracy": 0.9739248499274253, | |
| "num_tokens": 515935657.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.990841884585678, | |
| "grad_norm": 25.625, | |
| "learning_rate": 5.491787798935557e-06, | |
| "loss": 2.0546, | |
| "mean_token_accuracy": 0.9719208747148513, | |
| "num_tokens": 517085947.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.9930486593843099, | |
| "grad_norm": 89.0, | |
| "learning_rate": 5.47362916766135e-06, | |
| "loss": 2.0255, | |
| "mean_token_accuracy": 0.9731628432869911, | |
| "num_tokens": 518245804.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.9952554341829416, | |
| "grad_norm": 69.5, | |
| "learning_rate": 5.455464230714794e-06, | |
| "loss": 1.9275, | |
| "mean_token_accuracy": 0.9760584264993668, | |
| "num_tokens": 519392437.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.9974622089815735, | |
| "grad_norm": 28.5, | |
| "learning_rate": 5.437293229935178e-06, | |
| "loss": 2.2264, | |
| "mean_token_accuracy": 0.9699186235666275, | |
| "num_tokens": 520544705.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.9996689837802052, | |
| "grad_norm": 27.25, | |
| "learning_rate": 5.4191164072425185e-06, | |
| "loss": 1.874, | |
| "mean_token_accuracy": 0.9743426635861396, | |
| "num_tokens": 521712409.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.0017654198389054, | |
| "grad_norm": 49.75, | |
| "learning_rate": 5.400934004634346e-06, | |
| "loss": 1.7911, | |
| "mean_token_accuracy": 0.9754668521253687, | |
| "num_tokens": 522797977.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.0039721946375373, | |
| "grad_norm": 57.5, | |
| "learning_rate": 5.38274626418248e-06, | |
| "loss": 1.9637, | |
| "mean_token_accuracy": 0.9749830231070519, | |
| "num_tokens": 523945363.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.0061789694361691, | |
| "grad_norm": 24.75, | |
| "learning_rate": 5.364553428029797e-06, | |
| "loss": 1.987, | |
| "mean_token_accuracy": 0.974488215148449, | |
| "num_tokens": 525107360.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.0083857442348008, | |
| "grad_norm": 43.5, | |
| "learning_rate": 5.346355738387028e-06, | |
| "loss": 1.8542, | |
| "mean_token_accuracy": 0.9762109011411667, | |
| "num_tokens": 526261535.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.0105925190334326, | |
| "grad_norm": 40.25, | |
| "learning_rate": 5.328153437529512e-06, | |
| "loss": 1.9898, | |
| "mean_token_accuracy": 0.9741319388151168, | |
| "num_tokens": 527418116.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.0127992938320645, | |
| "grad_norm": 19.125, | |
| "learning_rate": 5.309946767793982e-06, | |
| "loss": 1.8861, | |
| "mean_token_accuracy": 0.975490951538086, | |
| "num_tokens": 528576439.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.0150060686306963, | |
| "grad_norm": 36.0, | |
| "learning_rate": 5.291735971575336e-06, | |
| "loss": 1.8018, | |
| "mean_token_accuracy": 0.9778398305177689, | |
| "num_tokens": 529730593.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.017212843429328, | |
| "grad_norm": 34.5, | |
| "learning_rate": 5.273521291323411e-06, | |
| "loss": 1.9293, | |
| "mean_token_accuracy": 0.9727541863918304, | |
| "num_tokens": 530908237.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.0194196182279598, | |
| "grad_norm": 59.75, | |
| "learning_rate": 5.255302969539753e-06, | |
| "loss": 1.8204, | |
| "mean_token_accuracy": 0.9764805540442467, | |
| "num_tokens": 532068523.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.0216263930265916, | |
| "grad_norm": 42.0, | |
| "learning_rate": 5.237081248774391e-06, | |
| "loss": 1.9152, | |
| "mean_token_accuracy": 0.9731403976678848, | |
| "num_tokens": 533227407.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.0238331678252235, | |
| "grad_norm": 29.75, | |
| "learning_rate": 5.218856371622605e-06, | |
| "loss": 1.8332, | |
| "mean_token_accuracy": 0.9766795799136162, | |
| "num_tokens": 534363317.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.0260399426238553, | |
| "grad_norm": 39.5, | |
| "learning_rate": 5.200628580721698e-06, | |
| "loss": 1.9418, | |
| "mean_token_accuracy": 0.9743655249476433, | |
| "num_tokens": 535517138.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.028246717422487, | |
| "grad_norm": 113.0, | |
| "learning_rate": 5.182398118747766e-06, | |
| "loss": 2.0107, | |
| "mean_token_accuracy": 0.9719695821404457, | |
| "num_tokens": 536682266.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.0304534922211188, | |
| "grad_norm": 32.25, | |
| "learning_rate": 5.1641652284124645e-06, | |
| "loss": 1.7944, | |
| "mean_token_accuracy": 0.9765776932239533, | |
| "num_tokens": 537820737.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.0326602670197507, | |
| "grad_norm": 28.75, | |
| "learning_rate": 5.145930152459782e-06, | |
| "loss": 1.896, | |
| "mean_token_accuracy": 0.9745500922203064, | |
| "num_tokens": 538980001.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.0348670418183825, | |
| "grad_norm": 35.25, | |
| "learning_rate": 5.127693133662801e-06, | |
| "loss": 1.9241, | |
| "mean_token_accuracy": 0.9753615707159042, | |
| "num_tokens": 540130919.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.0370738166170141, | |
| "grad_norm": 31.0, | |
| "learning_rate": 5.109454414820475e-06, | |
| "loss": 1.9484, | |
| "mean_token_accuracy": 0.975345815718174, | |
| "num_tokens": 541284787.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.039280591415646, | |
| "grad_norm": 107.0, | |
| "learning_rate": 5.091214238754387e-06, | |
| "loss": 1.8047, | |
| "mean_token_accuracy": 0.9745490610599518, | |
| "num_tokens": 542430717.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.0414873662142778, | |
| "grad_norm": 27.375, | |
| "learning_rate": 5.072972848305525e-06, | |
| "loss": 1.7641, | |
| "mean_token_accuracy": 0.9768086537718773, | |
| "num_tokens": 543571400.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.0436941410129097, | |
| "grad_norm": 100.5, | |
| "learning_rate": 5.054730486331041e-06, | |
| "loss": 1.6919, | |
| "mean_token_accuracy": 0.9764266937971116, | |
| "num_tokens": 544715376.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.0459009158115413, | |
| "grad_norm": 37.75, | |
| "learning_rate": 5.036487395701021e-06, | |
| "loss": 1.5566, | |
| "mean_token_accuracy": 0.9794994667172432, | |
| "num_tokens": 545866473.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.0481076906101732, | |
| "grad_norm": 26.75, | |
| "learning_rate": 5.018243819295256e-06, | |
| "loss": 1.9682, | |
| "mean_token_accuracy": 0.9741592884063721, | |
| "num_tokens": 547031516.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.050314465408805, | |
| "grad_norm": 144.0, | |
| "learning_rate": 5e-06, | |
| "loss": 1.7894, | |
| "mean_token_accuracy": 0.9778555542230606, | |
| "num_tokens": 548191961.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.0525212402074369, | |
| "grad_norm": 164.0, | |
| "learning_rate": 4.981756180704746e-06, | |
| "loss": 1.7932, | |
| "mean_token_accuracy": 0.9751365810632706, | |
| "num_tokens": 549338774.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.0547280150060687, | |
| "grad_norm": 33.25, | |
| "learning_rate": 4.963512604298981e-06, | |
| "loss": 1.8521, | |
| "mean_token_accuracy": 0.9762362241744995, | |
| "num_tokens": 550495643.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.0569347898047003, | |
| "grad_norm": 120.0, | |
| "learning_rate": 4.945269513668962e-06, | |
| "loss": 1.803, | |
| "mean_token_accuracy": 0.9763126984238625, | |
| "num_tokens": 551650992.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.0591415646033322, | |
| "grad_norm": 30.875, | |
| "learning_rate": 4.927027151694478e-06, | |
| "loss": 1.8452, | |
| "mean_token_accuracy": 0.975006964802742, | |
| "num_tokens": 552799818.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.061348339401964, | |
| "grad_norm": 44.5, | |
| "learning_rate": 4.908785761245615e-06, | |
| "loss": 1.79, | |
| "mean_token_accuracy": 0.9755088686943054, | |
| "num_tokens": 553942218.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.063555114200596, | |
| "grad_norm": 30.0, | |
| "learning_rate": 4.890545585179527e-06, | |
| "loss": 1.7496, | |
| "mean_token_accuracy": 0.9778050869703293, | |
| "num_tokens": 555081142.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.0657618889992277, | |
| "grad_norm": 32.5, | |
| "learning_rate": 4.8723068663372005e-06, | |
| "loss": 1.9302, | |
| "mean_token_accuracy": 0.9765176177024841, | |
| "num_tokens": 556226987.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.0679686637978594, | |
| "grad_norm": 24.75, | |
| "learning_rate": 4.85406984754022e-06, | |
| "loss": 1.9513, | |
| "mean_token_accuracy": 0.9748401537537574, | |
| "num_tokens": 557363409.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.0701754385964912, | |
| "grad_norm": 31.0, | |
| "learning_rate": 4.835834771587537e-06, | |
| "loss": 2.0172, | |
| "mean_token_accuracy": 0.9743695884943009, | |
| "num_tokens": 558519321.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.072382213395123, | |
| "grad_norm": 25.5, | |
| "learning_rate": 4.817601881252236e-06, | |
| "loss": 1.9302, | |
| "mean_token_accuracy": 0.9744108885526657, | |
| "num_tokens": 559673648.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.074588988193755, | |
| "grad_norm": 28.25, | |
| "learning_rate": 4.799371419278303e-06, | |
| "loss": 1.7462, | |
| "mean_token_accuracy": 0.9762607038021087, | |
| "num_tokens": 560815025.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.0767957629923866, | |
| "grad_norm": 55.0, | |
| "learning_rate": 4.781143628377396e-06, | |
| "loss": 1.757, | |
| "mean_token_accuracy": 0.9752570196986199, | |
| "num_tokens": 561990920.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.0790025377910184, | |
| "grad_norm": 74.0, | |
| "learning_rate": 4.76291875122561e-06, | |
| "loss": 1.7756, | |
| "mean_token_accuracy": 0.9769013792276382, | |
| "num_tokens": 563152430.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.0812093125896503, | |
| "grad_norm": 59.5, | |
| "learning_rate": 4.744697030460248e-06, | |
| "loss": 1.9008, | |
| "mean_token_accuracy": 0.9748231634497643, | |
| "num_tokens": 564301290.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.083416087388282, | |
| "grad_norm": 32.25, | |
| "learning_rate": 4.72647870867659e-06, | |
| "loss": 1.7905, | |
| "mean_token_accuracy": 0.9767342403531074, | |
| "num_tokens": 565460149.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.0856228621869137, | |
| "grad_norm": 74.5, | |
| "learning_rate": 4.708264028424666e-06, | |
| "loss": 1.9031, | |
| "mean_token_accuracy": 0.9738263443112374, | |
| "num_tokens": 566606830.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.0878296369855456, | |
| "grad_norm": 32.75, | |
| "learning_rate": 4.69005323220602e-06, | |
| "loss": 1.7421, | |
| "mean_token_accuracy": 0.9773872897028923, | |
| "num_tokens": 567741343.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.0900364117841774, | |
| "grad_norm": 63.0, | |
| "learning_rate": 4.671846562470489e-06, | |
| "loss": 1.9476, | |
| "mean_token_accuracy": 0.9745059341192246, | |
| "num_tokens": 568892447.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.0922431865828093, | |
| "grad_norm": 45.0, | |
| "learning_rate": 4.653644261612972e-06, | |
| "loss": 1.7843, | |
| "mean_token_accuracy": 0.9787998363375664, | |
| "num_tokens": 570030671.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.0944499613814411, | |
| "grad_norm": 131.0, | |
| "learning_rate": 4.635446571970203e-06, | |
| "loss": 1.7284, | |
| "mean_token_accuracy": 0.9763140276074409, | |
| "num_tokens": 571182670.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.0966567361800728, | |
| "grad_norm": 33.5, | |
| "learning_rate": 4.617253735817522e-06, | |
| "loss": 1.8774, | |
| "mean_token_accuracy": 0.9738431140780449, | |
| "num_tokens": 572344441.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.0988635109787046, | |
| "grad_norm": 38.25, | |
| "learning_rate": 4.599065995365655e-06, | |
| "loss": 1.7964, | |
| "mean_token_accuracy": 0.9764374598860741, | |
| "num_tokens": 573492414.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.1010702857773365, | |
| "grad_norm": 26.25, | |
| "learning_rate": 4.580883592757482e-06, | |
| "loss": 1.8314, | |
| "mean_token_accuracy": 0.9758396491408348, | |
| "num_tokens": 574642118.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.1032770605759683, | |
| "grad_norm": 38.5, | |
| "learning_rate": 4.562706770064824e-06, | |
| "loss": 1.8463, | |
| "mean_token_accuracy": 0.9750605836510658, | |
| "num_tokens": 575797031.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.1054838353746, | |
| "grad_norm": 162.0, | |
| "learning_rate": 4.544535769285207e-06, | |
| "loss": 1.8476, | |
| "mean_token_accuracy": 0.9753901034593582, | |
| "num_tokens": 576946511.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.1076906101732318, | |
| "grad_norm": 26.25, | |
| "learning_rate": 4.526370832338652e-06, | |
| "loss": 1.6767, | |
| "mean_token_accuracy": 0.9771376997232437, | |
| "num_tokens": 578078352.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.1098973849718636, | |
| "grad_norm": 121.5, | |
| "learning_rate": 4.508212201064446e-06, | |
| "loss": 1.7671, | |
| "mean_token_accuracy": 0.9773701384663582, | |
| "num_tokens": 579230344.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.1121041597704955, | |
| "grad_norm": 35.75, | |
| "learning_rate": 4.490060117217925e-06, | |
| "loss": 1.9882, | |
| "mean_token_accuracy": 0.9741671547293663, | |
| "num_tokens": 580394388.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.114310934569127, | |
| "grad_norm": 50.25, | |
| "learning_rate": 4.471914822467259e-06, | |
| "loss": 1.8417, | |
| "mean_token_accuracy": 0.9756775140762329, | |
| "num_tokens": 581536226.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.116517709367759, | |
| "grad_norm": 111.0, | |
| "learning_rate": 4.453776558390225e-06, | |
| "loss": 1.7767, | |
| "mean_token_accuracy": 0.9756920024752617, | |
| "num_tokens": 582677619.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.1187244841663908, | |
| "grad_norm": 58.25, | |
| "learning_rate": 4.435645566471007e-06, | |
| "loss": 1.8175, | |
| "mean_token_accuracy": 0.9747438743710518, | |
| "num_tokens": 583830811.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.1209312589650227, | |
| "grad_norm": 49.5, | |
| "learning_rate": 4.4175220880969625e-06, | |
| "loss": 1.5481, | |
| "mean_token_accuracy": 0.979948528110981, | |
| "num_tokens": 584976018.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.1231380337636545, | |
| "grad_norm": 35.75, | |
| "learning_rate": 4.3994063645554185e-06, | |
| "loss": 1.8737, | |
| "mean_token_accuracy": 0.9762693852186203, | |
| "num_tokens": 586130461.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.1253448085622861, | |
| "grad_norm": 37.25, | |
| "learning_rate": 4.381298637030461e-06, | |
| "loss": 1.913, | |
| "mean_token_accuracy": 0.974817368388176, | |
| "num_tokens": 587277506.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.127551583360918, | |
| "grad_norm": 25.5, | |
| "learning_rate": 4.363199146599717e-06, | |
| "loss": 1.8553, | |
| "mean_token_accuracy": 0.9753854766488075, | |
| "num_tokens": 588429402.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.1297583581595498, | |
| "grad_norm": 131.0, | |
| "learning_rate": 4.345108134231152e-06, | |
| "loss": 1.8129, | |
| "mean_token_accuracy": 0.9739721834659576, | |
| "num_tokens": 589569193.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.1319651329581817, | |
| "grad_norm": 34.0, | |
| "learning_rate": 4.327025840779857e-06, | |
| "loss": 1.9727, | |
| "mean_token_accuracy": 0.9737168192863465, | |
| "num_tokens": 590726713.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.1341719077568135, | |
| "grad_norm": 52.0, | |
| "learning_rate": 4.308952506984844e-06, | |
| "loss": 1.725, | |
| "mean_token_accuracy": 0.9775279134511947, | |
| "num_tokens": 591872463.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.1363786825554452, | |
| "grad_norm": 21.625, | |
| "learning_rate": 4.290888373465841e-06, | |
| "loss": 1.8535, | |
| "mean_token_accuracy": 0.9759417116641999, | |
| "num_tokens": 593023782.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.138585457354077, | |
| "grad_norm": 45.0, | |
| "learning_rate": 4.272833680720086e-06, | |
| "loss": 1.8139, | |
| "mean_token_accuracy": 0.9759591981768608, | |
| "num_tokens": 594182249.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.1407922321527089, | |
| "grad_norm": 50.75, | |
| "learning_rate": 4.254788669119127e-06, | |
| "loss": 1.7701, | |
| "mean_token_accuracy": 0.9753040999174118, | |
| "num_tokens": 595332632.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.1429990069513407, | |
| "grad_norm": 35.0, | |
| "learning_rate": 4.236753578905627e-06, | |
| "loss": 1.7792, | |
| "mean_token_accuracy": 0.9774682566523551, | |
| "num_tokens": 596478168.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.1452057817499723, | |
| "grad_norm": 40.25, | |
| "learning_rate": 4.218728650190155e-06, | |
| "loss": 1.7347, | |
| "mean_token_accuracy": 0.976492403447628, | |
| "num_tokens": 597634860.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.1474125565486042, | |
| "grad_norm": 42.0, | |
| "learning_rate": 4.2007141229479955e-06, | |
| "loss": 1.8962, | |
| "mean_token_accuracy": 0.9743067398667336, | |
| "num_tokens": 598796015.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.149619331347236, | |
| "grad_norm": 41.0, | |
| "learning_rate": 4.1827102370159526e-06, | |
| "loss": 1.9991, | |
| "mean_token_accuracy": 0.9730441465973854, | |
| "num_tokens": 599944690.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.151826106145868, | |
| "grad_norm": 128.0, | |
| "learning_rate": 4.1647172320891595e-06, | |
| "loss": 2.0141, | |
| "mean_token_accuracy": 0.970869180560112, | |
| "num_tokens": 601095606.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.1540328809444995, | |
| "grad_norm": 151.0, | |
| "learning_rate": 4.146735347717883e-06, | |
| "loss": 1.8334, | |
| "mean_token_accuracy": 0.9745113655924798, | |
| "num_tokens": 602242421.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.1562396557431314, | |
| "grad_norm": 34.0, | |
| "learning_rate": 4.1287648233043366e-06, | |
| "loss": 1.939, | |
| "mean_token_accuracy": 0.9750206798315049, | |
| "num_tokens": 603393922.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.1584464305417632, | |
| "grad_norm": 112.5, | |
| "learning_rate": 4.110805898099492e-06, | |
| "loss": 1.7816, | |
| "mean_token_accuracy": 0.9762521848082543, | |
| "num_tokens": 604558266.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.160653205340395, | |
| "grad_norm": 94.0, | |
| "learning_rate": 4.092858811199896e-06, | |
| "loss": 1.8058, | |
| "mean_token_accuracy": 0.9763723388314247, | |
| "num_tokens": 605699821.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.162859980139027, | |
| "grad_norm": 25.0, | |
| "learning_rate": 4.074923801544485e-06, | |
| "loss": 1.7781, | |
| "mean_token_accuracy": 0.9768066555261612, | |
| "num_tokens": 606861912.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.1650667549376585, | |
| "grad_norm": 28.5, | |
| "learning_rate": 4.057001107911404e-06, | |
| "loss": 1.8822, | |
| "mean_token_accuracy": 0.973501381278038, | |
| "num_tokens": 608015619.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.1672735297362904, | |
| "grad_norm": 41.0, | |
| "learning_rate": 4.039090968914828e-06, | |
| "loss": 1.9512, | |
| "mean_token_accuracy": 0.9736144244670868, | |
| "num_tokens": 609178030.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.1694803045349222, | |
| "grad_norm": 78.0, | |
| "learning_rate": 4.0211936230017915e-06, | |
| "loss": 1.6921, | |
| "mean_token_accuracy": 0.9770965903997422, | |
| "num_tokens": 610335548.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.171687079333554, | |
| "grad_norm": 37.5, | |
| "learning_rate": 4.003309308448998e-06, | |
| "loss": 1.7889, | |
| "mean_token_accuracy": 0.9756392076611519, | |
| "num_tokens": 611464518.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.1738938541321857, | |
| "grad_norm": 45.75, | |
| "learning_rate": 3.985438263359667e-06, | |
| "loss": 1.8208, | |
| "mean_token_accuracy": 0.9757576256990432, | |
| "num_tokens": 612605369.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.1761006289308176, | |
| "grad_norm": 88.5, | |
| "learning_rate": 3.967580725660348e-06, | |
| "loss": 1.9713, | |
| "mean_token_accuracy": 0.9735803753137589, | |
| "num_tokens": 613769083.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.1783074037294494, | |
| "grad_norm": 79.5, | |
| "learning_rate": 3.949736933097763e-06, | |
| "loss": 2.001, | |
| "mean_token_accuracy": 0.9736290365457535, | |
| "num_tokens": 614910229.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.1805141785280813, | |
| "grad_norm": 59.25, | |
| "learning_rate": 3.931907123235638e-06, | |
| "loss": 1.7654, | |
| "mean_token_accuracy": 0.976622948050499, | |
| "num_tokens": 616068687.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.182720953326713, | |
| "grad_norm": 60.5, | |
| "learning_rate": 3.914091533451537e-06, | |
| "loss": 1.8328, | |
| "mean_token_accuracy": 0.9759069800376892, | |
| "num_tokens": 617211438.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.1849277281253447, | |
| "grad_norm": 30.25, | |
| "learning_rate": 3.896290400933709e-06, | |
| "loss": 1.8296, | |
| "mean_token_accuracy": 0.9748711869120598, | |
| "num_tokens": 618348189.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.1871345029239766, | |
| "grad_norm": 34.0, | |
| "learning_rate": 3.878503962677921e-06, | |
| "loss": 1.6976, | |
| "mean_token_accuracy": 0.9781778767704964, | |
| "num_tokens": 619513774.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.1893412777226084, | |
| "grad_norm": 28.625, | |
| "learning_rate": 3.860732455484314e-06, | |
| "loss": 1.7621, | |
| "mean_token_accuracy": 0.9769005045294762, | |
| "num_tokens": 620658294.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.1915480525212403, | |
| "grad_norm": 63.75, | |
| "learning_rate": 3.8429761159542345e-06, | |
| "loss": 1.7726, | |
| "mean_token_accuracy": 0.9763997599482537, | |
| "num_tokens": 621822786.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.193754827319872, | |
| "grad_norm": 76.5, | |
| "learning_rate": 3.825235180487105e-06, | |
| "loss": 1.7976, | |
| "mean_token_accuracy": 0.9776034832000733, | |
| "num_tokens": 622968221.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.1959616021185038, | |
| "grad_norm": 62.5, | |
| "learning_rate": 3.8075098852772607e-06, | |
| "loss": 1.7277, | |
| "mean_token_accuracy": 0.9763502150774002, | |
| "num_tokens": 624116917.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.1981683769171356, | |
| "grad_norm": 47.0, | |
| "learning_rate": 3.7898004663108055e-06, | |
| "loss": 1.8162, | |
| "mean_token_accuracy": 0.9771040230989456, | |
| "num_tokens": 625275805.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.2003751517157675, | |
| "grad_norm": 67.0, | |
| "learning_rate": 3.7721071593624806e-06, | |
| "loss": 1.8598, | |
| "mean_token_accuracy": 0.9766054973006248, | |
| "num_tokens": 626412494.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.2025819265143993, | |
| "grad_norm": 43.75, | |
| "learning_rate": 3.7544301999925176e-06, | |
| "loss": 1.7128, | |
| "mean_token_accuracy": 0.9759867653250694, | |
| "num_tokens": 627581796.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.204788701313031, | |
| "grad_norm": 22.875, | |
| "learning_rate": 3.7367698235435036e-06, | |
| "loss": 1.7985, | |
| "mean_token_accuracy": 0.9771082028746605, | |
| "num_tokens": 628749061.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.2069954761116628, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.7191262651372502e-06, | |
| "loss": 1.7364, | |
| "mean_token_accuracy": 0.9767947405576706, | |
| "num_tokens": 629902166.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.2092022509102947, | |
| "grad_norm": 49.25, | |
| "learning_rate": 3.7014997596716596e-06, | |
| "loss": 1.8096, | |
| "mean_token_accuracy": 0.9758952215313912, | |
| "num_tokens": 631045876.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.2114090257089263, | |
| "grad_norm": 35.5, | |
| "learning_rate": 3.6838905418176006e-06, | |
| "loss": 1.8957, | |
| "mean_token_accuracy": 0.9733474045991898, | |
| "num_tokens": 632198013.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.2136158005075581, | |
| "grad_norm": 92.5, | |
| "learning_rate": 3.666298846015783e-06, | |
| "loss": 1.663, | |
| "mean_token_accuracy": 0.9772789746522903, | |
| "num_tokens": 633348344.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.21582257530619, | |
| "grad_norm": 32.0, | |
| "learning_rate": 3.6487249064736352e-06, | |
| "loss": 1.8385, | |
| "mean_token_accuracy": 0.9739779710769654, | |
| "num_tokens": 634502768.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.2180293501048218, | |
| "grad_norm": 32.0, | |
| "learning_rate": 3.6311689571621873e-06, | |
| "loss": 1.8955, | |
| "mean_token_accuracy": 0.9765334606170655, | |
| "num_tokens": 635652274.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.2202361249034537, | |
| "grad_norm": 108.0, | |
| "learning_rate": 3.61363123181296e-06, | |
| "loss": 2.0033, | |
| "mean_token_accuracy": 0.9729743212461471, | |
| "num_tokens": 636803608.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.2224428997020853, | |
| "grad_norm": 28.625, | |
| "learning_rate": 3.5961119639148443e-06, | |
| "loss": 1.9413, | |
| "mean_token_accuracy": 0.9744836568832398, | |
| "num_tokens": 637958795.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.2246496745007172, | |
| "grad_norm": 56.0, | |
| "learning_rate": 3.5786113867109994e-06, | |
| "loss": 1.8501, | |
| "mean_token_accuracy": 0.9776576519012451, | |
| "num_tokens": 639105490.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.226856449299349, | |
| "grad_norm": 70.0, | |
| "learning_rate": 3.561129733195744e-06, | |
| "loss": 1.9056, | |
| "mean_token_accuracy": 0.9727635353803634, | |
| "num_tokens": 640260157.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.2290632240979809, | |
| "grad_norm": 33.0, | |
| "learning_rate": 3.543667236111458e-06, | |
| "loss": 1.8086, | |
| "mean_token_accuracy": 0.9766728147864342, | |
| "num_tokens": 641408567.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.2312699988966127, | |
| "grad_norm": 33.25, | |
| "learning_rate": 3.526224127945479e-06, | |
| "loss": 1.9042, | |
| "mean_token_accuracy": 0.973830196261406, | |
| "num_tokens": 642573013.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.2334767736952443, | |
| "grad_norm": 77.5, | |
| "learning_rate": 3.5088006409270136e-06, | |
| "loss": 1.9446, | |
| "mean_token_accuracy": 0.9753986790776252, | |
| "num_tokens": 643726591.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.2356835484938762, | |
| "grad_norm": 57.5, | |
| "learning_rate": 3.4913970070240388e-06, | |
| "loss": 1.7924, | |
| "mean_token_accuracy": 0.9772971168160438, | |
| "num_tokens": 644892695.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.237890323292508, | |
| "grad_norm": 75.5, | |
| "learning_rate": 3.474013457940218e-06, | |
| "loss": 1.812, | |
| "mean_token_accuracy": 0.9764907211065292, | |
| "num_tokens": 646053963.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.2400970980911399, | |
| "grad_norm": 29.0, | |
| "learning_rate": 3.456650225111815e-06, | |
| "loss": 1.9371, | |
| "mean_token_accuracy": 0.9732755541801452, | |
| "num_tokens": 647206734.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.2423038728897715, | |
| "grad_norm": 36.25, | |
| "learning_rate": 3.4393075397046105e-06, | |
| "loss": 1.935, | |
| "mean_token_accuracy": 0.9742804169654846, | |
| "num_tokens": 648354326.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.2445106476884034, | |
| "grad_norm": 41.5, | |
| "learning_rate": 3.4219856326108276e-06, | |
| "loss": 1.7302, | |
| "mean_token_accuracy": 0.9767812743782998, | |
| "num_tokens": 649495009.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.2467174224870352, | |
| "grad_norm": 78.5, | |
| "learning_rate": 3.4046847344460608e-06, | |
| "loss": 2.0114, | |
| "mean_token_accuracy": 0.9756919577717781, | |
| "num_tokens": 650641836.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.248924197285667, | |
| "grad_norm": 33.5, | |
| "learning_rate": 3.3874050755461984e-06, | |
| "loss": 1.905, | |
| "mean_token_accuracy": 0.9731512203812599, | |
| "num_tokens": 651790330.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.2511309720842987, | |
| "grad_norm": 26.5, | |
| "learning_rate": 3.3701468859643583e-06, | |
| "loss": 1.8816, | |
| "mean_token_accuracy": 0.976065294444561, | |
| "num_tokens": 652951960.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.2533377468829305, | |
| "grad_norm": 25.875, | |
| "learning_rate": 3.352910395467827e-06, | |
| "loss": 1.7958, | |
| "mean_token_accuracy": 0.9755621612071991, | |
| "num_tokens": 654101888.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.2555445216815624, | |
| "grad_norm": 63.5, | |
| "learning_rate": 3.335695833535001e-06, | |
| "loss": 1.7489, | |
| "mean_token_accuracy": 0.9761272236704827, | |
| "num_tokens": 655273030.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.2577512964801942, | |
| "grad_norm": 38.0, | |
| "learning_rate": 3.31850342935233e-06, | |
| "loss": 1.9327, | |
| "mean_token_accuracy": 0.97499048858881, | |
| "num_tokens": 656415802.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.259958071278826, | |
| "grad_norm": 48.5, | |
| "learning_rate": 3.301333411811264e-06, | |
| "loss": 1.8872, | |
| "mean_token_accuracy": 0.9762759670615196, | |
| "num_tokens": 657566900.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.2621648460774577, | |
| "grad_norm": 30.625, | |
| "learning_rate": 3.2841860095052096e-06, | |
| "loss": 1.7047, | |
| "mean_token_accuracy": 0.9778104826807976, | |
| "num_tokens": 658729945.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.2643716208760896, | |
| "grad_norm": 41.75, | |
| "learning_rate": 3.2670614507264863e-06, | |
| "loss": 1.8856, | |
| "mean_token_accuracy": 0.9746754586696624, | |
| "num_tokens": 659871577.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.2665783956747214, | |
| "grad_norm": 33.25, | |
| "learning_rate": 3.249959963463283e-06, | |
| "loss": 1.6874, | |
| "mean_token_accuracy": 0.9753165408968926, | |
| "num_tokens": 661036888.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.2687851704733533, | |
| "grad_norm": 42.75, | |
| "learning_rate": 3.232881775396626e-06, | |
| "loss": 1.7771, | |
| "mean_token_accuracy": 0.9771184712648392, | |
| "num_tokens": 662190333.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.2709919452719851, | |
| "grad_norm": 64.0, | |
| "learning_rate": 3.215827113897351e-06, | |
| "loss": 1.7967, | |
| "mean_token_accuracy": 0.9748601868748665, | |
| "num_tokens": 663339112.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.2731987200706167, | |
| "grad_norm": 44.5, | |
| "learning_rate": 3.1987962060230674e-06, | |
| "loss": 1.7581, | |
| "mean_token_accuracy": 0.9777951002120971, | |
| "num_tokens": 664486373.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.2754054948692486, | |
| "grad_norm": 82.5, | |
| "learning_rate": 3.1817892785151426e-06, | |
| "loss": 1.8409, | |
| "mean_token_accuracy": 0.9769984096288681, | |
| "num_tokens": 665618395.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.2776122696678804, | |
| "grad_norm": 31.25, | |
| "learning_rate": 3.164806557795679e-06, | |
| "loss": 1.8502, | |
| "mean_token_accuracy": 0.9746487557888031, | |
| "num_tokens": 666752016.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.279819044466512, | |
| "grad_norm": 26.125, | |
| "learning_rate": 3.1478482699645052e-06, | |
| "loss": 1.7379, | |
| "mean_token_accuracy": 0.9763323217630386, | |
| "num_tokens": 667906714.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.282025819265144, | |
| "grad_norm": 47.25, | |
| "learning_rate": 3.1309146407961565e-06, | |
| "loss": 1.8304, | |
| "mean_token_accuracy": 0.9753479897975922, | |
| "num_tokens": 669062941.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.2842325940637758, | |
| "grad_norm": 77.5, | |
| "learning_rate": 3.11400589573688e-06, | |
| "loss": 1.6762, | |
| "mean_token_accuracy": 0.9773499146103859, | |
| "num_tokens": 670205902.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.2864393688624076, | |
| "grad_norm": 32.25, | |
| "learning_rate": 3.0971222599016237e-06, | |
| "loss": 1.7482, | |
| "mean_token_accuracy": 0.9762661248445511, | |
| "num_tokens": 671363091.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.2886461436610395, | |
| "grad_norm": 67.0, | |
| "learning_rate": 3.0802639580710465e-06, | |
| "loss": 1.8429, | |
| "mean_token_accuracy": 0.9752171978354454, | |
| "num_tokens": 672499228.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.290852918459671, | |
| "grad_norm": 37.75, | |
| "learning_rate": 3.0634312146885193e-06, | |
| "loss": 2.0013, | |
| "mean_token_accuracy": 0.9739482790231705, | |
| "num_tokens": 673659411.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.293059693258303, | |
| "grad_norm": 54.5, | |
| "learning_rate": 3.0466242538571423e-06, | |
| "loss": 1.66, | |
| "mean_token_accuracy": 0.9776546716690063, | |
| "num_tokens": 674822650.0, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.2952664680569348, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.0298432993367577e-06, | |
| "loss": 1.6226, | |
| "mean_token_accuracy": 0.9785138204693794, | |
| "num_tokens": 675984971.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.2974732428555666, | |
| "grad_norm": 25.625, | |
| "learning_rate": 3.0130885745409744e-06, | |
| "loss": 1.8775, | |
| "mean_token_accuracy": 0.9746320694684982, | |
| "num_tokens": 677148669.0, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.2996800176541985, | |
| "grad_norm": 40.25, | |
| "learning_rate": 2.9963603025341894e-06, | |
| "loss": 1.85, | |
| "mean_token_accuracy": 0.9739915490150451, | |
| "num_tokens": 678304909.0, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.3018867924528301, | |
| "grad_norm": 22.625, | |
| "learning_rate": 2.979658706028619e-06, | |
| "loss": 1.782, | |
| "mean_token_accuracy": 0.9757357522845268, | |
| "num_tokens": 679456014.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.304093567251462, | |
| "grad_norm": 55.75, | |
| "learning_rate": 2.9629840073813376e-06, | |
| "loss": 1.8597, | |
| "mean_token_accuracy": 0.974382920563221, | |
| "num_tokens": 680593369.0, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.3063003420500938, | |
| "grad_norm": 34.5, | |
| "learning_rate": 2.9463364285913117e-06, | |
| "loss": 1.7998, | |
| "mean_token_accuracy": 0.9752948999404907, | |
| "num_tokens": 681741941.0, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.3085071168487254, | |
| "grad_norm": 43.75, | |
| "learning_rate": 2.9297161912964476e-06, | |
| "loss": 1.772, | |
| "mean_token_accuracy": 0.9756360232830048, | |
| "num_tokens": 682896982.0, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.3107138916473575, | |
| "grad_norm": 32.25, | |
| "learning_rate": 2.913123516770642e-06, | |
| "loss": 1.9013, | |
| "mean_token_accuracy": 0.9750306650996208, | |
| "num_tokens": 684035727.0, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.3129206664459891, | |
| "grad_norm": 54.0, | |
| "learning_rate": 2.8965586259208295e-06, | |
| "loss": 1.7491, | |
| "mean_token_accuracy": 0.9764790132641792, | |
| "num_tokens": 685182208.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.315127441244621, | |
| "grad_norm": 22.875, | |
| "learning_rate": 2.880021739284053e-06, | |
| "loss": 1.8907, | |
| "mean_token_accuracy": 0.9763995632529259, | |
| "num_tokens": 686343119.0, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.3173342160432528, | |
| "grad_norm": 65.5, | |
| "learning_rate": 2.8635130770245124e-06, | |
| "loss": 1.8058, | |
| "mean_token_accuracy": 0.9757361471652984, | |
| "num_tokens": 687496119.0, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.3195409908418845, | |
| "grad_norm": 54.25, | |
| "learning_rate": 2.8470328589306508e-06, | |
| "loss": 1.8771, | |
| "mean_token_accuracy": 0.975439578294754, | |
| "num_tokens": 688642777.0, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.3217477656405163, | |
| "grad_norm": 52.5, | |
| "learning_rate": 2.83058130441221e-06, | |
| "loss": 1.8662, | |
| "mean_token_accuracy": 0.9731543198227882, | |
| "num_tokens": 689793698.0, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.3239545404391482, | |
| "grad_norm": 63.25, | |
| "learning_rate": 2.8141586324973248e-06, | |
| "loss": 1.9043, | |
| "mean_token_accuracy": 0.9741871625185012, | |
| "num_tokens": 690949839.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.32616131523778, | |
| "grad_norm": 35.25, | |
| "learning_rate": 2.7977650618296026e-06, | |
| "loss": 1.7665, | |
| "mean_token_accuracy": 0.9764251247048378, | |
| "num_tokens": 692115106.0, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.3283680900364119, | |
| "grad_norm": 42.0, | |
| "learning_rate": 2.781400810665201e-06, | |
| "loss": 1.9609, | |
| "mean_token_accuracy": 0.9741915419697762, | |
| "num_tokens": 693262382.0, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.3305748648350435, | |
| "grad_norm": 42.0, | |
| "learning_rate": 2.765066096869945e-06, | |
| "loss": 1.7875, | |
| "mean_token_accuracy": 0.9765988096594811, | |
| "num_tokens": 694440462.0, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.3327816396336754, | |
| "grad_norm": 125.0, | |
| "learning_rate": 2.7487611379163997e-06, | |
| "loss": 1.913, | |
| "mean_token_accuracy": 0.974324369430542, | |
| "num_tokens": 695596599.0, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.3349884144323072, | |
| "grad_norm": 99.0, | |
| "learning_rate": 2.7324861508810007e-06, | |
| "loss": 1.8294, | |
| "mean_token_accuracy": 0.9766657844185829, | |
| "num_tokens": 696742749.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.337195189230939, | |
| "grad_norm": 29.0, | |
| "learning_rate": 2.716241352441141e-06, | |
| "loss": 1.8799, | |
| "mean_token_accuracy": 0.9746081337332726, | |
| "num_tokens": 697910420.0, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.339401964029571, | |
| "grad_norm": 37.5, | |
| "learning_rate": 2.7000269588723073e-06, | |
| "loss": 1.5976, | |
| "mean_token_accuracy": 0.9791813105344772, | |
| "num_tokens": 699056083.0, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.3416087388282025, | |
| "grad_norm": 69.0, | |
| "learning_rate": 2.6838431860451797e-06, | |
| "loss": 1.7528, | |
| "mean_token_accuracy": 0.9757783994078636, | |
| "num_tokens": 700215731.0, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.3438155136268344, | |
| "grad_norm": 41.5, | |
| "learning_rate": 2.6676902494227795e-06, | |
| "loss": 2.0134, | |
| "mean_token_accuracy": 0.9729572832584381, | |
| "num_tokens": 701356240.0, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.3460222884254662, | |
| "grad_norm": 31.5, | |
| "learning_rate": 2.65156836405758e-06, | |
| "loss": 1.7889, | |
| "mean_token_accuracy": 0.9774167835712433, | |
| "num_tokens": 702524355.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.3482290632240979, | |
| "grad_norm": 50.25, | |
| "learning_rate": 2.635477744588658e-06, | |
| "loss": 1.6896, | |
| "mean_token_accuracy": 0.9782000362873078, | |
| "num_tokens": 703660966.0, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.3504358380227297, | |
| "grad_norm": 24.625, | |
| "learning_rate": 2.6194186052388323e-06, | |
| "loss": 1.7588, | |
| "mean_token_accuracy": 0.9772956445813179, | |
| "num_tokens": 704814830.0, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.3526426128213616, | |
| "grad_norm": 57.25, | |
| "learning_rate": 2.6033911598118037e-06, | |
| "loss": 1.8643, | |
| "mean_token_accuracy": 0.9757556483149529, | |
| "num_tokens": 705962264.0, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.3548493876199934, | |
| "grad_norm": 67.5, | |
| "learning_rate": 2.587395621689325e-06, | |
| "loss": 1.7173, | |
| "mean_token_accuracy": 0.9784960508346557, | |
| "num_tokens": 707111319.0, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.3570561624186253, | |
| "grad_norm": 26.375, | |
| "learning_rate": 2.571432203828339e-06, | |
| "loss": 1.8329, | |
| "mean_token_accuracy": 0.9760042116045952, | |
| "num_tokens": 708259268.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.3592629372172569, | |
| "grad_norm": 112.0, | |
| "learning_rate": 2.555501118758167e-06, | |
| "loss": 2.0094, | |
| "mean_token_accuracy": 0.9748006328940392, | |
| "num_tokens": 709384260.0, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.3614697120158887, | |
| "grad_norm": 99.0, | |
| "learning_rate": 2.5396025785776545e-06, | |
| "loss": 1.7945, | |
| "mean_token_accuracy": 0.976505708694458, | |
| "num_tokens": 710543299.0, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.3636764868145206, | |
| "grad_norm": 55.25, | |
| "learning_rate": 2.523736794952373e-06, | |
| "loss": 2.0983, | |
| "mean_token_accuracy": 0.9727712392807006, | |
| "num_tokens": 711693117.0, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.3658832616131524, | |
| "grad_norm": 69.5, | |
| "learning_rate": 2.5079039791117748e-06, | |
| "loss": 1.7156, | |
| "mean_token_accuracy": 0.977227745950222, | |
| "num_tokens": 712829801.0, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.3680900364117843, | |
| "grad_norm": 45.75, | |
| "learning_rate": 2.4921043418464085e-06, | |
| "loss": 1.9493, | |
| "mean_token_accuracy": 0.9750061735510827, | |
| "num_tokens": 713974429.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.370296811210416, | |
| "grad_norm": 31.0, | |
| "learning_rate": 2.4763380935050878e-06, | |
| "loss": 1.9315, | |
| "mean_token_accuracy": 0.9749646574258805, | |
| "num_tokens": 715107968.0, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.3725035860090478, | |
| "grad_norm": 24.5, | |
| "learning_rate": 2.460605443992109e-06, | |
| "loss": 1.8737, | |
| "mean_token_accuracy": 0.9752072811126709, | |
| "num_tokens": 716262488.0, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.3747103608076796, | |
| "grad_norm": 103.5, | |
| "learning_rate": 2.4449066027644473e-06, | |
| "loss": 1.8298, | |
| "mean_token_accuracy": 0.9760120168328286, | |
| "num_tokens": 717399703.0, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.3769171356063112, | |
| "grad_norm": 71.0, | |
| "learning_rate": 2.4292417788289735e-06, | |
| "loss": 2.0194, | |
| "mean_token_accuracy": 0.9719763740897178, | |
| "num_tokens": 718558165.0, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.3791239104049433, | |
| "grad_norm": 72.0, | |
| "learning_rate": 2.4136111807396617e-06, | |
| "loss": 1.8567, | |
| "mean_token_accuracy": 0.97542824447155, | |
| "num_tokens": 719712112.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.381330685203575, | |
| "grad_norm": 101.5, | |
| "learning_rate": 2.398015016594828e-06, | |
| "loss": 1.759, | |
| "mean_token_accuracy": 0.9768665507435799, | |
| "num_tokens": 720860531.0, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.3835374600022068, | |
| "grad_norm": 84.0, | |
| "learning_rate": 2.382453494034344e-06, | |
| "loss": 1.6723, | |
| "mean_token_accuracy": 0.978014275431633, | |
| "num_tokens": 722012739.0, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.3857442348008386, | |
| "grad_norm": 32.5, | |
| "learning_rate": 2.366926820236882e-06, | |
| "loss": 1.9218, | |
| "mean_token_accuracy": 0.9760876625776291, | |
| "num_tokens": 723177458.0, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.3879510095994703, | |
| "grad_norm": 99.0, | |
| "learning_rate": 2.351435201917159e-06, | |
| "loss": 1.8077, | |
| "mean_token_accuracy": 0.9765541851520538, | |
| "num_tokens": 724326268.0, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.3901577843981021, | |
| "grad_norm": 75.5, | |
| "learning_rate": 2.3359788453231723e-06, | |
| "loss": 1.706, | |
| "mean_token_accuracy": 0.9775751575827598, | |
| "num_tokens": 725486233.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.392364559196734, | |
| "grad_norm": 66.0, | |
| "learning_rate": 2.3205579562334696e-06, | |
| "loss": 1.8631, | |
| "mean_token_accuracy": 0.9744323208928108, | |
| "num_tokens": 726653829.0, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.3945713339953658, | |
| "grad_norm": 32.5, | |
| "learning_rate": 2.3051727399543934e-06, | |
| "loss": 1.8198, | |
| "mean_token_accuracy": 0.9757519215345383, | |
| "num_tokens": 727794399.0, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.3967781087939977, | |
| "grad_norm": 84.5, | |
| "learning_rate": 2.289823401317363e-06, | |
| "loss": 1.9496, | |
| "mean_token_accuracy": 0.9731798902153969, | |
| "num_tokens": 728964825.0, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.3989848835926293, | |
| "grad_norm": 40.0, | |
| "learning_rate": 2.274510144676131e-06, | |
| "loss": 1.8433, | |
| "mean_token_accuracy": 0.976384311914444, | |
| "num_tokens": 730130565.0, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.4011916583912611, | |
| "grad_norm": 43.5, | |
| "learning_rate": 2.259233173904084e-06, | |
| "loss": 1.8212, | |
| "mean_token_accuracy": 0.974110347032547, | |
| "num_tokens": 731287831.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.403398433189893, | |
| "grad_norm": 37.75, | |
| "learning_rate": 2.2439926923915022e-06, | |
| "loss": 1.9299, | |
| "mean_token_accuracy": 0.9731192097067833, | |
| "num_tokens": 732434828.0, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.4056052079885248, | |
| "grad_norm": 92.5, | |
| "learning_rate": 2.228788903042877e-06, | |
| "loss": 1.805, | |
| "mean_token_accuracy": 0.9758853450417518, | |
| "num_tokens": 733607627.0, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.4078119827871567, | |
| "grad_norm": 87.0, | |
| "learning_rate": 2.2136220082741876e-06, | |
| "loss": 1.9836, | |
| "mean_token_accuracy": 0.971193365752697, | |
| "num_tokens": 734757141.0, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.4100187575857883, | |
| "grad_norm": 80.5, | |
| "learning_rate": 2.198492210010226e-06, | |
| "loss": 1.7572, | |
| "mean_token_accuracy": 0.9785696104168892, | |
| "num_tokens": 735904820.0, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.4122255323844202, | |
| "grad_norm": 39.5, | |
| "learning_rate": 2.1833997096818897e-06, | |
| "loss": 1.8246, | |
| "mean_token_accuracy": 0.9760110557079316, | |
| "num_tokens": 737044631.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.414432307183052, | |
| "grad_norm": 131.0, | |
| "learning_rate": 2.168344708223519e-06, | |
| "loss": 1.8687, | |
| "mean_token_accuracy": 0.9739629760384559, | |
| "num_tokens": 738208645.0, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.4166390819816836, | |
| "grad_norm": 104.0, | |
| "learning_rate": 2.1533274060702015e-06, | |
| "loss": 1.8425, | |
| "mean_token_accuracy": 0.977659723162651, | |
| "num_tokens": 739365910.0, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.4188458567803155, | |
| "grad_norm": 25.375, | |
| "learning_rate": 2.1383480031551257e-06, | |
| "loss": 1.6823, | |
| "mean_token_accuracy": 0.9790923193097114, | |
| "num_tokens": 740521431.0, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 36.25, | |
| "learning_rate": 2.1234066989068972e-06, | |
| "loss": 1.9287, | |
| "mean_token_accuracy": 0.9745568603277206, | |
| "num_tokens": 741682181.0, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.4232594063775792, | |
| "grad_norm": 50.5, | |
| "learning_rate": 2.1085036922469017e-06, | |
| "loss": 1.8484, | |
| "mean_token_accuracy": 0.9764415845274925, | |
| "num_tokens": 742833342.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.425466181176211, | |
| "grad_norm": 157.0, | |
| "learning_rate": 2.093639181586647e-06, | |
| "loss": 1.6148, | |
| "mean_token_accuracy": 0.9789648145437241, | |
| "num_tokens": 743980679.0, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.4276729559748427, | |
| "grad_norm": 69.0, | |
| "learning_rate": 2.0788133648251207e-06, | |
| "loss": 1.8245, | |
| "mean_token_accuracy": 0.97651526927948, | |
| "num_tokens": 745144275.0, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.4298797307734745, | |
| "grad_norm": 113.0, | |
| "learning_rate": 2.064026439346163e-06, | |
| "loss": 1.6406, | |
| "mean_token_accuracy": 0.9798701673746109, | |
| "num_tokens": 746284459.0, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.4320865055721064, | |
| "grad_norm": 53.25, | |
| "learning_rate": 2.049278602015829e-06, | |
| "loss": 1.7722, | |
| "mean_token_accuracy": 0.9773429319262504, | |
| "num_tokens": 747434490.0, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.4342932803707382, | |
| "grad_norm": 124.0, | |
| "learning_rate": 2.0345700491797786e-06, | |
| "loss": 1.7964, | |
| "mean_token_accuracy": 0.977318100631237, | |
| "num_tokens": 748576563.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.43650005516937, | |
| "grad_norm": 38.5, | |
| "learning_rate": 2.019900976660651e-06, | |
| "loss": 1.9118, | |
| "mean_token_accuracy": 0.9749090164899826, | |
| "num_tokens": 749715849.0, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.4387068299680017, | |
| "grad_norm": 43.25, | |
| "learning_rate": 2.005271579755469e-06, | |
| "loss": 2.0353, | |
| "mean_token_accuracy": 0.9740123763680458, | |
| "num_tokens": 750886184.0, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.4409136047666335, | |
| "grad_norm": 97.0, | |
| "learning_rate": 1.9906820532330262e-06, | |
| "loss": 2.0424, | |
| "mean_token_accuracy": 0.9722412198781967, | |
| "num_tokens": 752058746.0, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.4431203795652654, | |
| "grad_norm": 28.75, | |
| "learning_rate": 1.97613259133131e-06, | |
| "loss": 2.0293, | |
| "mean_token_accuracy": 0.9734477370977401, | |
| "num_tokens": 753212790.0, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.445327154363897, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.961623387754897e-06, | |
| "loss": 1.9057, | |
| "mean_token_accuracy": 0.9753732338547707, | |
| "num_tokens": 754359329.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.447533929162529, | |
| "grad_norm": 110.0, | |
| "learning_rate": 1.947154635672393e-06, | |
| "loss": 1.8642, | |
| "mean_token_accuracy": 0.9741095468401909, | |
| "num_tokens": 755523206.0, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.4497407039611607, | |
| "grad_norm": 53.25, | |
| "learning_rate": 1.932726527713843e-06, | |
| "loss": 1.7712, | |
| "mean_token_accuracy": 0.9768127173185348, | |
| "num_tokens": 756672031.0, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.4519474787597926, | |
| "grad_norm": 67.5, | |
| "learning_rate": 1.9183392559681812e-06, | |
| "loss": 1.9435, | |
| "mean_token_accuracy": 0.9738660231232643, | |
| "num_tokens": 757820723.0, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.4541542535584244, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.9039930119806698e-06, | |
| "loss": 1.8462, | |
| "mean_token_accuracy": 0.9734216213226319, | |
| "num_tokens": 758975440.0, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.456361028357056, | |
| "grad_norm": 26.75, | |
| "learning_rate": 1.8896879867503392e-06, | |
| "loss": 1.7306, | |
| "mean_token_accuracy": 0.9771073400974274, | |
| "num_tokens": 760146513.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.458567803155688, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.8754243707274617e-06, | |
| "loss": 1.949, | |
| "mean_token_accuracy": 0.9736502662301063, | |
| "num_tokens": 761304174.0, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.4607745779543198, | |
| "grad_norm": 115.5, | |
| "learning_rate": 1.8612023538109998e-06, | |
| "loss": 1.8572, | |
| "mean_token_accuracy": 0.9754300698637962, | |
| "num_tokens": 762439915.0, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.4629813527529516, | |
| "grad_norm": 50.5, | |
| "learning_rate": 1.847022125346092e-06, | |
| "loss": 1.731, | |
| "mean_token_accuracy": 0.9769893154501915, | |
| "num_tokens": 763597054.0, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.4651881275515835, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.8328838741215187e-06, | |
| "loss": 1.8415, | |
| "mean_token_accuracy": 0.9750018402934074, | |
| "num_tokens": 764750468.0, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.467394902350215, | |
| "grad_norm": 32.5, | |
| "learning_rate": 1.8187877883672024e-06, | |
| "loss": 1.7813, | |
| "mean_token_accuracy": 0.978123389184475, | |
| "num_tokens": 765904561.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.469601677148847, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.8047340557516867e-06, | |
| "loss": 1.8384, | |
| "mean_token_accuracy": 0.9757016867399215, | |
| "num_tokens": 767061972.0, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.4718084519474788, | |
| "grad_norm": 35.75, | |
| "learning_rate": 1.7907228633796553e-06, | |
| "loss": 1.9053, | |
| "mean_token_accuracy": 0.9745875716209411, | |
| "num_tokens": 768223438.0, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.4740152267461106, | |
| "grad_norm": 62.5, | |
| "learning_rate": 1.7767543977894198e-06, | |
| "loss": 1.6459, | |
| "mean_token_accuracy": 0.9794975697994233, | |
| "num_tokens": 769360598.0, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.4762220015447425, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.7628288449504615e-06, | |
| "loss": 1.7289, | |
| "mean_token_accuracy": 0.9766364261507988, | |
| "num_tokens": 770506647.0, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.478428776343374, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.7489463902609294e-06, | |
| "loss": 1.8172, | |
| "mean_token_accuracy": 0.9757404074072837, | |
| "num_tokens": 771643701.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.480635551142006, | |
| "grad_norm": 67.0, | |
| "learning_rate": 1.7351072185451934e-06, | |
| "loss": 1.8295, | |
| "mean_token_accuracy": 0.9746380716562271, | |
| "num_tokens": 772788494.0, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.4828423259406378, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.7213115140513687e-06, | |
| "loss": 1.7748, | |
| "mean_token_accuracy": 0.9769538477063179, | |
| "num_tokens": 773936585.0, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.4850491007392694, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.7075594604488689e-06, | |
| "loss": 1.8264, | |
| "mean_token_accuracy": 0.9769259586930275, | |
| "num_tokens": 775083782.0, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.4872558755379013, | |
| "grad_norm": 27.375, | |
| "learning_rate": 1.6938512408259655e-06, | |
| "loss": 1.8941, | |
| "mean_token_accuracy": 0.9738390997052193, | |
| "num_tokens": 776235006.0, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.4894626503365331, | |
| "grad_norm": 24.875, | |
| "learning_rate": 1.6801870376873402e-06, | |
| "loss": 2.1158, | |
| "mean_token_accuracy": 0.9730419605970383, | |
| "num_tokens": 777392437.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.491669425135165, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.6665670329516643e-06, | |
| "loss": 1.7184, | |
| "mean_token_accuracy": 0.9780646279454231, | |
| "num_tokens": 778544664.0, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.4938761999337968, | |
| "grad_norm": 63.25, | |
| "learning_rate": 1.652991407949167e-06, | |
| "loss": 1.9856, | |
| "mean_token_accuracy": 0.9738316342234612, | |
| "num_tokens": 779689432.0, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.4960829747324285, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.6394603434192351e-06, | |
| "loss": 1.7468, | |
| "mean_token_accuracy": 0.9761510893702507, | |
| "num_tokens": 780862995.0, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.4982897495310603, | |
| "grad_norm": 88.0, | |
| "learning_rate": 1.6259740195079903e-06, | |
| "loss": 2.0472, | |
| "mean_token_accuracy": 0.9731170833110809, | |
| "num_tokens": 782006485.0, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.5004965243296922, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.6125326157659048e-06, | |
| "loss": 1.6369, | |
| "mean_token_accuracy": 0.9792605251073837, | |
| "num_tokens": 783177825.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.5027032991283238, | |
| "grad_norm": 29.875, | |
| "learning_rate": 1.5991363111454023e-06, | |
| "loss": 1.9928, | |
| "mean_token_accuracy": 0.9747489631175995, | |
| "num_tokens": 784326073.0, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.5049100739269559, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.5857852839984816e-06, | |
| "loss": 1.888, | |
| "mean_token_accuracy": 0.9756739720702171, | |
| "num_tokens": 785480380.0, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.5071168487255875, | |
| "grad_norm": 33.0, | |
| "learning_rate": 1.572479712074333e-06, | |
| "loss": 1.6903, | |
| "mean_token_accuracy": 0.9776247635483741, | |
| "num_tokens": 786627917.0, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.5093236235242193, | |
| "grad_norm": 63.75, | |
| "learning_rate": 1.5592197725169844e-06, | |
| "loss": 1.8444, | |
| "mean_token_accuracy": 0.9752008482813835, | |
| "num_tokens": 787779534.0, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.5115303983228512, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.54600564186293e-06, | |
| "loss": 1.9264, | |
| "mean_token_accuracy": 0.9752957716584205, | |
| "num_tokens": 788929621.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.5137371731214828, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.532837496038792e-06, | |
| "loss": 1.8475, | |
| "mean_token_accuracy": 0.9743396550416946, | |
| "num_tokens": 790077509.0, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.5159439479201149, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.5197155103589666e-06, | |
| "loss": 1.8658, | |
| "mean_token_accuracy": 0.9746336534619331, | |
| "num_tokens": 791226601.0, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.5181507227187465, | |
| "grad_norm": 52.5, | |
| "learning_rate": 1.5066398595233022e-06, | |
| "loss": 2.0468, | |
| "mean_token_accuracy": 0.9720411181449891, | |
| "num_tokens": 792386119.0, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.5203574975173784, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.4936107176147606e-06, | |
| "loss": 1.7982, | |
| "mean_token_accuracy": 0.9759523168206214, | |
| "num_tokens": 793533664.0, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.5225642723160102, | |
| "grad_norm": 32.75, | |
| "learning_rate": 1.4806282580971133e-06, | |
| "loss": 1.794, | |
| "mean_token_accuracy": 0.9744495674967766, | |
| "num_tokens": 794673077.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.5247710471146418, | |
| "grad_norm": 83.5, | |
| "learning_rate": 1.4676926538126185e-06, | |
| "loss": 1.8926, | |
| "mean_token_accuracy": 0.9752981513738632, | |
| "num_tokens": 795837119.0, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.526977821913274, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.4548040769797255e-06, | |
| "loss": 1.8478, | |
| "mean_token_accuracy": 0.9751256018877029, | |
| "num_tokens": 796994585.0, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.5291845967119055, | |
| "grad_norm": 27.75, | |
| "learning_rate": 1.4419626991907925e-06, | |
| "loss": 1.961, | |
| "mean_token_accuracy": 0.9742276698350907, | |
| "num_tokens": 798129000.0, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.5313913715105374, | |
| "grad_norm": 33.75, | |
| "learning_rate": 1.4291686914097802e-06, | |
| "loss": 1.8453, | |
| "mean_token_accuracy": 0.9755908146500587, | |
| "num_tokens": 799286955.0, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.5335981463091692, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.416422223969996e-06, | |
| "loss": 1.8907, | |
| "mean_token_accuracy": 0.9766012325882911, | |
| "num_tokens": 800434799.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.5358049211078009, | |
| "grad_norm": 71.0, | |
| "learning_rate": 1.4037234665718118e-06, | |
| "loss": 1.8667, | |
| "mean_token_accuracy": 0.974927519261837, | |
| "num_tokens": 801591336.0, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.5380116959064327, | |
| "grad_norm": 35.5, | |
| "learning_rate": 1.3910725882804166e-06, | |
| "loss": 1.9311, | |
| "mean_token_accuracy": 0.9754667162895203, | |
| "num_tokens": 802753222.0, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.5402184707050646, | |
| "grad_norm": 63.5, | |
| "learning_rate": 1.378469757523554e-06, | |
| "loss": 1.8214, | |
| "mean_token_accuracy": 0.9754064783453942, | |
| "num_tokens": 803902417.0, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.5424252455036962, | |
| "grad_norm": 113.5, | |
| "learning_rate": 1.3659151420892912e-06, | |
| "loss": 1.9385, | |
| "mean_token_accuracy": 0.9726540327072144, | |
| "num_tokens": 805057419.0, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.5446320203023283, | |
| "grad_norm": 75.5, | |
| "learning_rate": 1.3534089091237757e-06, | |
| "loss": 1.8152, | |
| "mean_token_accuracy": 0.9758626908063889, | |
| "num_tokens": 806201965.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.54683879510096, | |
| "grad_norm": 88.0, | |
| "learning_rate": 1.3409512251290164e-06, | |
| "loss": 1.8389, | |
| "mean_token_accuracy": 0.9754257082939148, | |
| "num_tokens": 807367764.0, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 1.5490455698995917, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.3285422559606615e-06, | |
| "loss": 1.8379, | |
| "mean_token_accuracy": 0.9763792231678963, | |
| "num_tokens": 808520934.0, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.5512523446982236, | |
| "grad_norm": 29.375, | |
| "learning_rate": 1.3161821668257969e-06, | |
| "loss": 1.7917, | |
| "mean_token_accuracy": 0.9762871384620666, | |
| "num_tokens": 809676061.0, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 1.5534591194968552, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.303871122280742e-06, | |
| "loss": 1.7125, | |
| "mean_token_accuracy": 0.977432382106781, | |
| "num_tokens": 810819820.0, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.5556658942954873, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.2916092862288571e-06, | |
| "loss": 1.7739, | |
| "mean_token_accuracy": 0.9771251499652862, | |
| "num_tokens": 811971595.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.557872669094119, | |
| "grad_norm": 164.0, | |
| "learning_rate": 1.279396821918369e-06, | |
| "loss": 1.9075, | |
| "mean_token_accuracy": 0.9747963815927505, | |
| "num_tokens": 813126837.0, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.5600794438927508, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.2672338919401866e-06, | |
| "loss": 1.8117, | |
| "mean_token_accuracy": 0.9763046145439148, | |
| "num_tokens": 814260797.0, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 1.5622862186913826, | |
| "grad_norm": 29.625, | |
| "learning_rate": 1.2551206582257502e-06, | |
| "loss": 1.7402, | |
| "mean_token_accuracy": 0.9773025214672089, | |
| "num_tokens": 815410332.0, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.5644929934900142, | |
| "grad_norm": 55.5, | |
| "learning_rate": 1.2430572820448593e-06, | |
| "loss": 1.8894, | |
| "mean_token_accuracy": 0.9753976702690125, | |
| "num_tokens": 816572165.0, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 1.566699768288646, | |
| "grad_norm": 104.0, | |
| "learning_rate": 1.2310439240035415e-06, | |
| "loss": 1.9407, | |
| "mean_token_accuracy": 0.9739749819040299, | |
| "num_tokens": 817717275.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.568906543087278, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.2190807440419012e-06, | |
| "loss": 1.894, | |
| "mean_token_accuracy": 0.9746904343366622, | |
| "num_tokens": 818867937.0, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 1.5711133178859096, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.2071679014320003e-06, | |
| "loss": 1.964, | |
| "mean_token_accuracy": 0.9753421515226364, | |
| "num_tokens": 820019631.0, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.5733200926845416, | |
| "grad_norm": 57.0, | |
| "learning_rate": 1.195305554775728e-06, | |
| "loss": 1.8358, | |
| "mean_token_accuracy": 0.976360110938549, | |
| "num_tokens": 821170174.0, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 1.5755268674831733, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.183493862002702e-06, | |
| "loss": 1.8615, | |
| "mean_token_accuracy": 0.9740883216261864, | |
| "num_tokens": 822319040.0, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.5777336422818051, | |
| "grad_norm": 30.875, | |
| "learning_rate": 1.1717329803681492e-06, | |
| "loss": 1.777, | |
| "mean_token_accuracy": 0.9767364248633384, | |
| "num_tokens": 823479239.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.579940417080437, | |
| "grad_norm": 49.25, | |
| "learning_rate": 1.1600230664508288e-06, | |
| "loss": 1.7765, | |
| "mean_token_accuracy": 0.9766557142138481, | |
| "num_tokens": 824648476.0, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.5821471918790686, | |
| "grad_norm": 60.5, | |
| "learning_rate": 1.1483642761509388e-06, | |
| "loss": 1.7881, | |
| "mean_token_accuracy": 0.9781842797994613, | |
| "num_tokens": 825784841.0, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 1.5843539666777007, | |
| "grad_norm": 29.875, | |
| "learning_rate": 1.1367567646880374e-06, | |
| "loss": 1.7844, | |
| "mean_token_accuracy": 0.9783096998929978, | |
| "num_tokens": 826921913.0, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.5865607414763323, | |
| "grad_norm": 29.25, | |
| "learning_rate": 1.1252006865989868e-06, | |
| "loss": 1.7796, | |
| "mean_token_accuracy": 0.9756049573421478, | |
| "num_tokens": 828076982.0, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 1.5887675162749642, | |
| "grad_norm": 73.0, | |
| "learning_rate": 1.1136961957358843e-06, | |
| "loss": 1.7381, | |
| "mean_token_accuracy": 0.9762054830789566, | |
| "num_tokens": 829237186.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.590974291073596, | |
| "grad_norm": 71.5, | |
| "learning_rate": 1.1022434452640252e-06, | |
| "loss": 1.7678, | |
| "mean_token_accuracy": 0.9790143951773643, | |
| "num_tokens": 830386049.0, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 1.5931810658722276, | |
| "grad_norm": 72.0, | |
| "learning_rate": 1.0908425876598512e-06, | |
| "loss": 1.6733, | |
| "mean_token_accuracy": 0.9789923951029778, | |
| "num_tokens": 831529912.0, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.5953878406708597, | |
| "grad_norm": 30.125, | |
| "learning_rate": 1.079493774708935e-06, | |
| "loss": 1.7101, | |
| "mean_token_accuracy": 0.9775262206792832, | |
| "num_tokens": 832706067.0, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 1.5975946154694913, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.0681971575039445e-06, | |
| "loss": 1.9157, | |
| "mean_token_accuracy": 0.9738598734140396, | |
| "num_tokens": 833868526.0, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.5998013902681232, | |
| "grad_norm": 24.75, | |
| "learning_rate": 1.0569528864426444e-06, | |
| "loss": 1.8957, | |
| "mean_token_accuracy": 0.9749908342957496, | |
| "num_tokens": 835015773.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.602008165066755, | |
| "grad_norm": 57.0, | |
| "learning_rate": 1.0457611112258813e-06, | |
| "loss": 1.9036, | |
| "mean_token_accuracy": 0.9752957850694657, | |
| "num_tokens": 836158800.0, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.6042149398653867, | |
| "grad_norm": 53.5, | |
| "learning_rate": 1.034621980855603e-06, | |
| "loss": 1.7597, | |
| "mean_token_accuracy": 0.9762044712901116, | |
| "num_tokens": 837317686.0, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 1.6064217146640185, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.0235356436328675e-06, | |
| "loss": 1.7372, | |
| "mean_token_accuracy": 0.975409984588623, | |
| "num_tokens": 838484460.0, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.6086284894626504, | |
| "grad_norm": 66.5, | |
| "learning_rate": 1.0125022471558694e-06, | |
| "loss": 1.8632, | |
| "mean_token_accuracy": 0.9752734184265137, | |
| "num_tokens": 839629176.0, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 1.610835264261282, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.001521938317974e-06, | |
| "loss": 1.9934, | |
| "mean_token_accuracy": 0.9744266137480736, | |
| "num_tokens": 840767656.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.613042039059914, | |
| "grad_norm": 43.75, | |
| "learning_rate": 9.905948633057666e-07, | |
| "loss": 1.9607, | |
| "mean_token_accuracy": 0.9736693039536476, | |
| "num_tokens": 841914828.0, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 1.6152488138585457, | |
| "grad_norm": 33.0, | |
| "learning_rate": 9.79721167597099e-07, | |
| "loss": 2.0275, | |
| "mean_token_accuracy": 0.9721111491322517, | |
| "num_tokens": 843066268.0, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.6174555886571775, | |
| "grad_norm": 56.5, | |
| "learning_rate": 9.689009959591605e-07, | |
| "loss": 1.843, | |
| "mean_token_accuracy": 0.9759933516383171, | |
| "num_tokens": 844210515.0, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 1.6196623634558094, | |
| "grad_norm": 43.5, | |
| "learning_rate": 9.58134492446543e-07, | |
| "loss": 1.914, | |
| "mean_token_accuracy": 0.9748274773359299, | |
| "num_tokens": 845354376.0, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.621869138254441, | |
| "grad_norm": 70.5, | |
| "learning_rate": 9.474218003993275e-07, | |
| "loss": 1.7126, | |
| "mean_token_accuracy": 0.9779505968093872, | |
| "num_tokens": 846502889.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.624075913053073, | |
| "grad_norm": 54.5, | |
| "learning_rate": 9.367630624411766e-07, | |
| "loss": 1.7565, | |
| "mean_token_accuracy": 0.9778892025351524, | |
| "num_tokens": 847654739.0, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.6262826878517047, | |
| "grad_norm": 40.75, | |
| "learning_rate": 9.26158420477431e-07, | |
| "loss": 1.8627, | |
| "mean_token_accuracy": 0.9770261690020561, | |
| "num_tokens": 848798881.0, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 1.6284894626503366, | |
| "grad_norm": 36.0, | |
| "learning_rate": 9.156080156932262e-07, | |
| "loss": 1.9729, | |
| "mean_token_accuracy": 0.9729376956820488, | |
| "num_tokens": 849966563.0, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.6306962374489684, | |
| "grad_norm": 83.0, | |
| "learning_rate": 9.051119885516085e-07, | |
| "loss": 1.7497, | |
| "mean_token_accuracy": 0.9768907964229584, | |
| "num_tokens": 851113926.0, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 1.6329030122476, | |
| "grad_norm": 92.5, | |
| "learning_rate": 8.946704787916676e-07, | |
| "loss": 1.9054, | |
| "mean_token_accuracy": 0.9769985690712929, | |
| "num_tokens": 852270523.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.6351097870462319, | |
| "grad_norm": 31.625, | |
| "learning_rate": 8.842836254266707e-07, | |
| "loss": 1.864, | |
| "mean_token_accuracy": 0.9771796703338623, | |
| "num_tokens": 853435020.0, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 1.6373165618448637, | |
| "grad_norm": 29.875, | |
| "learning_rate": 8.739515667422211e-07, | |
| "loss": 1.8413, | |
| "mean_token_accuracy": 0.9753375723958015, | |
| "num_tokens": 854591521.0, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.6395233366434954, | |
| "grad_norm": 30.25, | |
| "learning_rate": 8.636744402944075e-07, | |
| "loss": 1.926, | |
| "mean_token_accuracy": 0.9743264764547348, | |
| "num_tokens": 855749875.0, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 1.6417301114421274, | |
| "grad_norm": 23.625, | |
| "learning_rate": 8.534523829079816e-07, | |
| "loss": 1.8891, | |
| "mean_token_accuracy": 0.9753839418292045, | |
| "num_tokens": 856890857.0, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.643936886240759, | |
| "grad_norm": 52.25, | |
| "learning_rate": 8.432855306745264e-07, | |
| "loss": 1.808, | |
| "mean_token_accuracy": 0.9763372823596, | |
| "num_tokens": 858044917.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.646143661039391, | |
| "grad_norm": 29.125, | |
| "learning_rate": 8.33174018950656e-07, | |
| "loss": 1.7905, | |
| "mean_token_accuracy": 0.9768345803022385, | |
| "num_tokens": 859209419.0, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.6483504358380228, | |
| "grad_norm": 61.5, | |
| "learning_rate": 8.231179823562008e-07, | |
| "loss": 1.894, | |
| "mean_token_accuracy": 0.9733088418841362, | |
| "num_tokens": 860342766.0, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 1.6505572106366544, | |
| "grad_norm": 28.375, | |
| "learning_rate": 8.131175547724291e-07, | |
| "loss": 1.7814, | |
| "mean_token_accuracy": 0.976162138581276, | |
| "num_tokens": 861491592.0, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.6527639854352865, | |
| "grad_norm": 64.0, | |
| "learning_rate": 8.031728693402502e-07, | |
| "loss": 1.7842, | |
| "mean_token_accuracy": 0.9767463624477386, | |
| "num_tokens": 862631995.0, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 1.654970760233918, | |
| "grad_norm": 69.5, | |
| "learning_rate": 7.932840584584544e-07, | |
| "loss": 1.8256, | |
| "mean_token_accuracy": 0.9765890404582024, | |
| "num_tokens": 863797160.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.65717753503255, | |
| "grad_norm": 74.5, | |
| "learning_rate": 7.834512537819444e-07, | |
| "loss": 1.8569, | |
| "mean_token_accuracy": 0.9760210931301116, | |
| "num_tokens": 864956568.0, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 1.6593843098311818, | |
| "grad_norm": 80.5, | |
| "learning_rate": 7.736745862199785e-07, | |
| "loss": 1.8123, | |
| "mean_token_accuracy": 0.976027375459671, | |
| "num_tokens": 866099607.0, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.6615910846298134, | |
| "grad_norm": 26.5, | |
| "learning_rate": 7.639541859344385e-07, | |
| "loss": 1.8199, | |
| "mean_token_accuracy": 0.9762808188796044, | |
| "num_tokens": 867263485.0, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 1.6637978594284455, | |
| "grad_norm": 84.5, | |
| "learning_rate": 7.542901823380844e-07, | |
| "loss": 1.895, | |
| "mean_token_accuracy": 0.9750649750232696, | |
| "num_tokens": 868406792.0, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 1.6660046342270771, | |
| "grad_norm": 56.75, | |
| "learning_rate": 7.446827040928439e-07, | |
| "loss": 1.7466, | |
| "mean_token_accuracy": 0.9761609375476837, | |
| "num_tokens": 869555653.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.668211409025709, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.351318791080881e-07, | |
| "loss": 1.7431, | |
| "mean_token_accuracy": 0.9776811107993126, | |
| "num_tokens": 870700028.0, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 1.6704181838243408, | |
| "grad_norm": 31.0, | |
| "learning_rate": 7.25637834538937e-07, | |
| "loss": 1.8826, | |
| "mean_token_accuracy": 0.9743488490581512, | |
| "num_tokens": 871838540.0, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 1.6726249586229724, | |
| "grad_norm": 31.375, | |
| "learning_rate": 7.162006967845602e-07, | |
| "loss": 1.7738, | |
| "mean_token_accuracy": 0.9750529482960701, | |
| "num_tokens": 872989045.0, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 1.6748317334216043, | |
| "grad_norm": 37.5, | |
| "learning_rate": 7.068205914865012e-07, | |
| "loss": 1.8154, | |
| "mean_token_accuracy": 0.9770220369100571, | |
| "num_tokens": 874118868.0, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 1.6770385082202361, | |
| "grad_norm": 63.25, | |
| "learning_rate": 6.974976435269953e-07, | |
| "loss": 1.8056, | |
| "mean_token_accuracy": 0.9748851433396339, | |
| "num_tokens": 875278822.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.6792452830188678, | |
| "grad_norm": 42.0, | |
| "learning_rate": 6.882319770273193e-07, | |
| "loss": 1.6955, | |
| "mean_token_accuracy": 0.9785465553402901, | |
| "num_tokens": 876415819.0, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 1.6814520578174998, | |
| "grad_norm": 24.0, | |
| "learning_rate": 6.790237153461244e-07, | |
| "loss": 1.8635, | |
| "mean_token_accuracy": 0.9743424549698829, | |
| "num_tokens": 877576379.0, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 1.6836588326161315, | |
| "grad_norm": 85.0, | |
| "learning_rate": 6.698729810778065e-07, | |
| "loss": 1.8845, | |
| "mean_token_accuracy": 0.9739576920866966, | |
| "num_tokens": 878728181.0, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 1.6858656074147633, | |
| "grad_norm": 51.0, | |
| "learning_rate": 6.607798960508693e-07, | |
| "loss": 1.7837, | |
| "mean_token_accuracy": 0.9768923789262771, | |
| "num_tokens": 879893736.0, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 1.6880723822133952, | |
| "grad_norm": 39.5, | |
| "learning_rate": 6.517445813262985e-07, | |
| "loss": 1.9735, | |
| "mean_token_accuracy": 0.9742175087332725, | |
| "num_tokens": 881043763.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.6902791570120268, | |
| "grad_norm": 42.5, | |
| "learning_rate": 6.42767157195957e-07, | |
| "loss": 1.7473, | |
| "mean_token_accuracy": 0.978165277838707, | |
| "num_tokens": 882201644.0, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 1.6924859318106589, | |
| "grad_norm": 23.25, | |
| "learning_rate": 6.338477431809764e-07, | |
| "loss": 1.8191, | |
| "mean_token_accuracy": 0.9768994152545929, | |
| "num_tokens": 883356680.0, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 1.6946927066092905, | |
| "grad_norm": 30.625, | |
| "learning_rate": 6.24986458030174e-07, | |
| "loss": 1.9383, | |
| "mean_token_accuracy": 0.9729271680116653, | |
| "num_tokens": 884515378.0, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.6968994814079223, | |
| "grad_norm": 46.0, | |
| "learning_rate": 6.16183419718463e-07, | |
| "loss": 1.8683, | |
| "mean_token_accuracy": 0.9758946537971497, | |
| "num_tokens": 885654893.0, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 1.6991062562065542, | |
| "grad_norm": 53.75, | |
| "learning_rate": 6.074387454452891e-07, | |
| "loss": 1.8376, | |
| "mean_token_accuracy": 0.9753603234887123, | |
| "num_tokens": 886799692.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.7013130310051858, | |
| "grad_norm": 73.5, | |
| "learning_rate": 5.987525516330639e-07, | |
| "loss": 1.7183, | |
| "mean_token_accuracy": 0.978036743402481, | |
| "num_tokens": 887943262.0, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 1.7035198058038177, | |
| "grad_norm": 55.25, | |
| "learning_rate": 5.901249539256215e-07, | |
| "loss": 1.7492, | |
| "mean_token_accuracy": 0.9782552897930146, | |
| "num_tokens": 889099090.0, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.7057265806024495, | |
| "grad_norm": 28.375, | |
| "learning_rate": 5.815560671866721e-07, | |
| "loss": 1.891, | |
| "mean_token_accuracy": 0.9745562136173248, | |
| "num_tokens": 890238575.0, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 1.7079333554010812, | |
| "grad_norm": 27.625, | |
| "learning_rate": 5.730460054982806e-07, | |
| "loss": 1.8931, | |
| "mean_token_accuracy": 0.9749508768320083, | |
| "num_tokens": 891401883.0, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.7101401301997132, | |
| "grad_norm": 34.5, | |
| "learning_rate": 5.645948821593384e-07, | |
| "loss": 2.0326, | |
| "mean_token_accuracy": 0.9729828789830208, | |
| "num_tokens": 892541055.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.7123469049983449, | |
| "grad_norm": 63.5, | |
| "learning_rate": 5.562028096840638e-07, | |
| "loss": 1.7416, | |
| "mean_token_accuracy": 0.9780916512012482, | |
| "num_tokens": 893701428.0, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.7145536797969767, | |
| "grad_norm": 68.0, | |
| "learning_rate": 5.478698998004967e-07, | |
| "loss": 1.7709, | |
| "mean_token_accuracy": 0.9766147628426551, | |
| "num_tokens": 894866648.0, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 1.7167604545956086, | |
| "grad_norm": 38.75, | |
| "learning_rate": 5.395962634490182e-07, | |
| "loss": 2.0936, | |
| "mean_token_accuracy": 0.9720629766583443, | |
| "num_tokens": 896019860.0, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.7189672293942402, | |
| "grad_norm": 56.5, | |
| "learning_rate": 5.313820107808665e-07, | |
| "loss": 1.9193, | |
| "mean_token_accuracy": 0.9750956431031227, | |
| "num_tokens": 897161732.0, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 1.7211740041928723, | |
| "grad_norm": 21.875, | |
| "learning_rate": 5.232272511566744e-07, | |
| "loss": 1.618, | |
| "mean_token_accuracy": 0.9795330792665482, | |
| "num_tokens": 898310931.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.7233807789915039, | |
| "grad_norm": 74.0, | |
| "learning_rate": 5.15132093145016e-07, | |
| "loss": 1.7352, | |
| "mean_token_accuracy": 0.9784943521022796, | |
| "num_tokens": 899470545.0, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 1.7255875537901357, | |
| "grad_norm": 27.375, | |
| "learning_rate": 5.07096644520954e-07, | |
| "loss": 1.7813, | |
| "mean_token_accuracy": 0.9749257057905197, | |
| "num_tokens": 900616929.0, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.7277943285887676, | |
| "grad_norm": 44.75, | |
| "learning_rate": 4.991210122646117e-07, | |
| "loss": 1.8376, | |
| "mean_token_accuracy": 0.9760281756520272, | |
| "num_tokens": 901753539.0, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 1.7300011033873992, | |
| "grad_norm": 43.5, | |
| "learning_rate": 4.91205302559743e-07, | |
| "loss": 1.858, | |
| "mean_token_accuracy": 0.9755366012454033, | |
| "num_tokens": 902884735.0, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.7322078781860313, | |
| "grad_norm": 27.875, | |
| "learning_rate": 4.83349620792325e-07, | |
| "loss": 1.7956, | |
| "mean_token_accuracy": 0.9768936336040497, | |
| "num_tokens": 904029168.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.734414652984663, | |
| "grad_norm": 34.5, | |
| "learning_rate": 4.7555407154914855e-07, | |
| "loss": 1.9324, | |
| "mean_token_accuracy": 0.9739907309412956, | |
| "num_tokens": 905172256.0, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.7366214277832948, | |
| "grad_norm": 36.0, | |
| "learning_rate": 4.678187586164318e-07, | |
| "loss": 1.8682, | |
| "mean_token_accuracy": 0.9749459028244019, | |
| "num_tokens": 906314973.0, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 1.7388282025819266, | |
| "grad_norm": 102.5, | |
| "learning_rate": 4.601437849784318e-07, | |
| "loss": 1.7498, | |
| "mean_token_accuracy": 0.9763739034533501, | |
| "num_tokens": 907464012.0, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.7410349773805582, | |
| "grad_norm": 52.5, | |
| "learning_rate": 4.525292528160813e-07, | |
| "loss": 1.6824, | |
| "mean_token_accuracy": 0.9772188663482666, | |
| "num_tokens": 908618543.0, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 1.74324175217919, | |
| "grad_norm": 65.5, | |
| "learning_rate": 4.449752635056187e-07, | |
| "loss": 1.8854, | |
| "mean_token_accuracy": 0.9752214908599853, | |
| "num_tokens": 909765551.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.745448526977822, | |
| "grad_norm": 39.25, | |
| "learning_rate": 4.374819176172501e-07, | |
| "loss": 1.9245, | |
| "mean_token_accuracy": 0.9745707079768181, | |
| "num_tokens": 910911710.0, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 1.7476553017764536, | |
| "grad_norm": 31.0, | |
| "learning_rate": 4.3004931491379906e-07, | |
| "loss": 1.9105, | |
| "mean_token_accuracy": 0.9756983280181885, | |
| "num_tokens": 912054146.0, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.7498620765750856, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.2267755434938605e-07, | |
| "loss": 1.8189, | |
| "mean_token_accuracy": 0.9770970344543457, | |
| "num_tokens": 913194503.0, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 1.7520688513737173, | |
| "grad_norm": 63.5, | |
| "learning_rate": 4.153667340681067e-07, | |
| "loss": 1.8481, | |
| "mean_token_accuracy": 0.9758784458041191, | |
| "num_tokens": 914350105.0, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.7542756261723491, | |
| "grad_norm": 69.0, | |
| "learning_rate": 4.081169514027289e-07, | |
| "loss": 1.7163, | |
| "mean_token_accuracy": 0.9770554274320602, | |
| "num_tokens": 915509236.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.756482400970981, | |
| "grad_norm": 36.0, | |
| "learning_rate": 4.009283028733929e-07, | |
| "loss": 1.902, | |
| "mean_token_accuracy": 0.9744615390896797, | |
| "num_tokens": 916658298.0, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.7586891757696126, | |
| "grad_norm": 34.5, | |
| "learning_rate": 3.938008841863289e-07, | |
| "loss": 1.775, | |
| "mean_token_accuracy": 0.9760864600539207, | |
| "num_tokens": 917797467.0, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 1.7608959505682447, | |
| "grad_norm": 26.25, | |
| "learning_rate": 3.8673479023258464e-07, | |
| "loss": 1.7052, | |
| "mean_token_accuracy": 0.9785747811198234, | |
| "num_tokens": 918929082.0, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.7631027253668763, | |
| "grad_norm": 79.0, | |
| "learning_rate": 3.797301150867544e-07, | |
| "loss": 1.8692, | |
| "mean_token_accuracy": 0.9752358302474022, | |
| "num_tokens": 920082190.0, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 1.7653095001655081, | |
| "grad_norm": 33.5, | |
| "learning_rate": 3.7278695200573754e-07, | |
| "loss": 1.8362, | |
| "mean_token_accuracy": 0.9766950190067292, | |
| "num_tokens": 921235813.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.76751627496414, | |
| "grad_norm": 108.0, | |
| "learning_rate": 3.6590539342748645e-07, | |
| "loss": 1.7578, | |
| "mean_token_accuracy": 0.977169218659401, | |
| "num_tokens": 922380449.0, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 1.7697230497627716, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.590855309697844e-07, | |
| "loss": 1.6379, | |
| "mean_token_accuracy": 0.9787977397441864, | |
| "num_tokens": 923532540.0, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.7719298245614035, | |
| "grad_norm": 29.625, | |
| "learning_rate": 3.52327455429019e-07, | |
| "loss": 1.7934, | |
| "mean_token_accuracy": 0.9757225632667541, | |
| "num_tokens": 924682038.0, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 1.7741365993600353, | |
| "grad_norm": 29.125, | |
| "learning_rate": 3.4563125677897936e-07, | |
| "loss": 1.6337, | |
| "mean_token_accuracy": 0.9814859986305237, | |
| "num_tokens": 925829422.0, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.776343374158667, | |
| "grad_norm": 53.0, | |
| "learning_rate": 3.3899702416965166e-07, | |
| "loss": 1.7375, | |
| "mean_token_accuracy": 0.975970908999443, | |
| "num_tokens": 927003779.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.778550148957299, | |
| "grad_norm": 83.5, | |
| "learning_rate": 3.324248459260393e-07, | |
| "loss": 1.7965, | |
| "mean_token_accuracy": 0.9762446150183678, | |
| "num_tokens": 928158326.0, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.7807569237559306, | |
| "grad_norm": 34.0, | |
| "learning_rate": 3.2591480954698107e-07, | |
| "loss": 1.8108, | |
| "mean_token_accuracy": 0.9750507935881615, | |
| "num_tokens": 929312957.0, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 1.7829636985545625, | |
| "grad_norm": 29.0, | |
| "learning_rate": 3.194670017039897e-07, | |
| "loss": 1.7902, | |
| "mean_token_accuracy": 0.9753267720341683, | |
| "num_tokens": 930473164.0, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.7851704733531943, | |
| "grad_norm": 26.5, | |
| "learning_rate": 3.1308150824009785e-07, | |
| "loss": 1.6403, | |
| "mean_token_accuracy": 0.9798958003520966, | |
| "num_tokens": 931618845.0, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 1.787377248151826, | |
| "grad_norm": 51.75, | |
| "learning_rate": 3.0675841416871186e-07, | |
| "loss": 1.7878, | |
| "mean_token_accuracy": 0.976507730782032, | |
| "num_tokens": 932784403.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.789584022950458, | |
| "grad_norm": 30.375, | |
| "learning_rate": 3.004978036724854e-07, | |
| "loss": 1.9971, | |
| "mean_token_accuracy": 0.972666472196579, | |
| "num_tokens": 933918479.0, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 1.7917907977490897, | |
| "grad_norm": 59.0, | |
| "learning_rate": 2.942997601021924e-07, | |
| "loss": 1.6915, | |
| "mean_token_accuracy": 0.9778900042176246, | |
| "num_tokens": 935062458.0, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.7939975725477215, | |
| "grad_norm": 129.0, | |
| "learning_rate": 2.8816436597562336e-07, | |
| "loss": 1.7587, | |
| "mean_token_accuracy": 0.9765965178608894, | |
| "num_tokens": 936208787.0, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 1.7962043473463534, | |
| "grad_norm": 34.5, | |
| "learning_rate": 2.82091702976483e-07, | |
| "loss": 1.7985, | |
| "mean_token_accuracy": 0.977076581120491, | |
| "num_tokens": 937362609.0, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.798411122144985, | |
| "grad_norm": 40.0, | |
| "learning_rate": 2.760818519533037e-07, | |
| "loss": 1.8654, | |
| "mean_token_accuracy": 0.9753698080778122, | |
| "num_tokens": 938516307.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.800617896943617, | |
| "grad_norm": 30.625, | |
| "learning_rate": 2.701348929183695e-07, | |
| "loss": 1.8389, | |
| "mean_token_accuracy": 0.975410495698452, | |
| "num_tokens": 939677209.0, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.8028246717422487, | |
| "grad_norm": 30.5, | |
| "learning_rate": 2.642509050466502e-07, | |
| "loss": 1.7164, | |
| "mean_token_accuracy": 0.978900808095932, | |
| "num_tokens": 940831595.0, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 1.8050314465408805, | |
| "grad_norm": 62.25, | |
| "learning_rate": 2.584299666747475e-07, | |
| "loss": 1.6449, | |
| "mean_token_accuracy": 0.9774047553539276, | |
| "num_tokens": 941977946.0, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.8072382213395124, | |
| "grad_norm": 53.25, | |
| "learning_rate": 2.5267215529985346e-07, | |
| "loss": 1.6709, | |
| "mean_token_accuracy": 0.9787240222096443, | |
| "num_tokens": 943123031.0, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 1.809444996138144, | |
| "grad_norm": 37.25, | |
| "learning_rate": 2.4697754757871504e-07, | |
| "loss": 1.8088, | |
| "mean_token_accuracy": 0.9764915466308594, | |
| "num_tokens": 944273941.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.8116517709367759, | |
| "grad_norm": 30.625, | |
| "learning_rate": 2.4134621932661916e-07, | |
| "loss": 1.9576, | |
| "mean_token_accuracy": 0.973970752954483, | |
| "num_tokens": 945412830.0, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 1.8138585457354077, | |
| "grad_norm": 27.0, | |
| "learning_rate": 2.3577824551637818e-07, | |
| "loss": 1.8773, | |
| "mean_token_accuracy": 0.9754694759845733, | |
| "num_tokens": 946560307.0, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.8160653205340394, | |
| "grad_norm": 68.5, | |
| "learning_rate": 2.3027370027733308e-07, | |
| "loss": 1.8405, | |
| "mean_token_accuracy": 0.9756019473075866, | |
| "num_tokens": 947702446.0, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 1.8182720953326714, | |
| "grad_norm": 161.0, | |
| "learning_rate": 2.2483265689436929e-07, | |
| "loss": 2.0342, | |
| "mean_token_accuracy": 0.9740627348423004, | |
| "num_tokens": 948842508.0, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.820478870131303, | |
| "grad_norm": 55.5, | |
| "learning_rate": 2.194551878069362e-07, | |
| "loss": 1.8138, | |
| "mean_token_accuracy": 0.9764554813504219, | |
| "num_tokens": 949996024.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.822685644929935, | |
| "grad_norm": 88.5, | |
| "learning_rate": 2.141413646080881e-07, | |
| "loss": 1.9058, | |
| "mean_token_accuracy": 0.974327839910984, | |
| "num_tokens": 951155186.0, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.8248924197285668, | |
| "grad_norm": 53.0, | |
| "learning_rate": 2.0889125804352595e-07, | |
| "loss": 1.7181, | |
| "mean_token_accuracy": 0.9761965289711952, | |
| "num_tokens": 952309745.0, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 1.8270991945271984, | |
| "grad_norm": 64.0, | |
| "learning_rate": 2.0370493801066038e-07, | |
| "loss": 1.9259, | |
| "mean_token_accuracy": 0.9744319587945938, | |
| "num_tokens": 953464997.0, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.8293059693258304, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.985824735576758e-07, | |
| "loss": 1.738, | |
| "mean_token_accuracy": 0.9764760240912438, | |
| "num_tokens": 954601830.0, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 1.831512744124462, | |
| "grad_norm": 37.75, | |
| "learning_rate": 1.9352393288261717e-07, | |
| "loss": 1.7565, | |
| "mean_token_accuracy": 0.9767120108008385, | |
| "num_tokens": 955764590.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.833719518923094, | |
| "grad_norm": 48.0, | |
| "learning_rate": 1.8852938333247527e-07, | |
| "loss": 1.8292, | |
| "mean_token_accuracy": 0.9757956236600875, | |
| "num_tokens": 956903842.0, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 1.8359262937217258, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.8359889140229738e-07, | |
| "loss": 2.0579, | |
| "mean_token_accuracy": 0.9723797485232353, | |
| "num_tokens": 958062923.0, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.8381330685203574, | |
| "grad_norm": 88.5, | |
| "learning_rate": 1.787325227342951e-07, | |
| "loss": 1.8168, | |
| "mean_token_accuracy": 0.9749464869499207, | |
| "num_tokens": 959220528.0, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 1.8403398433189893, | |
| "grad_norm": 34.25, | |
| "learning_rate": 1.7393034211697524e-07, | |
| "loss": 2.0267, | |
| "mean_token_accuracy": 0.9744209840893745, | |
| "num_tokens": 960361218.0, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.842546618117621, | |
| "grad_norm": 30.25, | |
| "learning_rate": 1.6919241348427485e-07, | |
| "loss": 1.9168, | |
| "mean_token_accuracy": 0.9756815880537033, | |
| "num_tokens": 961512553.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.8447533929162527, | |
| "grad_norm": 29.5, | |
| "learning_rate": 1.6451879991471186e-07, | |
| "loss": 1.6774, | |
| "mean_token_accuracy": 0.977296793460846, | |
| "num_tokens": 962651073.0, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.8469601677148848, | |
| "grad_norm": 110.0, | |
| "learning_rate": 1.59909563630542e-07, | |
| "loss": 1.8339, | |
| "mean_token_accuracy": 0.9754460528492928, | |
| "num_tokens": 963788631.0, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 1.8491669425135164, | |
| "grad_norm": 91.5, | |
| "learning_rate": 1.553647659969354e-07, | |
| "loss": 1.7973, | |
| "mean_token_accuracy": 0.9760838657617569, | |
| "num_tokens": 964941646.0, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.8513737173121483, | |
| "grad_norm": 27.25, | |
| "learning_rate": 1.5088446752115403e-07, | |
| "loss": 1.6203, | |
| "mean_token_accuracy": 0.9800925001502037, | |
| "num_tokens": 966071138.0, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 1.8535804921107801, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.4646872785175182e-07, | |
| "loss": 1.7809, | |
| "mean_token_accuracy": 0.9762905180454254, | |
| "num_tokens": 967222718.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.8557872669094118, | |
| "grad_norm": 119.5, | |
| "learning_rate": 1.4211760577777567e-07, | |
| "loss": 1.8268, | |
| "mean_token_accuracy": 0.9754861250519753, | |
| "num_tokens": 968388811.0, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 1.8579940417080438, | |
| "grad_norm": 94.5, | |
| "learning_rate": 1.378311592279835e-07, | |
| "loss": 1.927, | |
| "mean_token_accuracy": 0.9751164630055428, | |
| "num_tokens": 969545212.0, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.8602008165066755, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.3360944527007758e-07, | |
| "loss": 1.8739, | |
| "mean_token_accuracy": 0.9755971372127533, | |
| "num_tokens": 970707357.0, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 1.8624075913053073, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.2945252010993836e-07, | |
| "loss": 1.7196, | |
| "mean_token_accuracy": 0.9762973368167878, | |
| "num_tokens": 971855325.0, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.8646143661039392, | |
| "grad_norm": 29.375, | |
| "learning_rate": 1.253604390908819e-07, | |
| "loss": 1.9145, | |
| "mean_token_accuracy": 0.9760203078389168, | |
| "num_tokens": 973004582.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.8668211409025708, | |
| "grad_norm": 127.5, | |
| "learning_rate": 1.2133325669291818e-07, | |
| "loss": 1.7994, | |
| "mean_token_accuracy": 0.9747086107730866, | |
| "num_tokens": 974150966.0, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.8690279157012029, | |
| "grad_norm": 103.5, | |
| "learning_rate": 1.1737102653202825e-07, | |
| "loss": 1.9375, | |
| "mean_token_accuracy": 0.974118135869503, | |
| "num_tokens": 975305148.0, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 1.8712346904998345, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.1347380135945108e-07, | |
| "loss": 1.7386, | |
| "mean_token_accuracy": 0.9766201511025429, | |
| "num_tokens": 976473337.0, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.8734414652984663, | |
| "grad_norm": 101.5, | |
| "learning_rate": 1.0964163306098007e-07, | |
| "loss": 1.9524, | |
| "mean_token_accuracy": 0.9737635105848312, | |
| "num_tokens": 977639676.0, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 1.8756482400970982, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.0587457265627099e-07, | |
| "loss": 1.8065, | |
| "mean_token_accuracy": 0.9753396958112717, | |
| "num_tokens": 978791690.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.8778550148957298, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.0217267029816736e-07, | |
| "loss": 1.8344, | |
| "mean_token_accuracy": 0.9768350154161454, | |
| "num_tokens": 979948778.0, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 1.8800617896943617, | |
| "grad_norm": 83.5, | |
| "learning_rate": 9.853597527202608e-08, | |
| "loss": 1.7656, | |
| "mean_token_accuracy": 0.9761572375893592, | |
| "num_tokens": 981124882.0, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.8822685644929935, | |
| "grad_norm": 61.25, | |
| "learning_rate": 9.496453599506683e-08, | |
| "loss": 1.7886, | |
| "mean_token_accuracy": 0.9780130028724671, | |
| "num_tokens": 982276679.0, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 1.8844753392916251, | |
| "grad_norm": 70.5, | |
| "learning_rate": 9.145840001572537e-08, | |
| "loss": 1.9337, | |
| "mean_token_accuracy": 0.9746360927820206, | |
| "num_tokens": 983430849.0, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.8866821140902572, | |
| "grad_norm": 35.25, | |
| "learning_rate": 8.801761401302012e-08, | |
| "loss": 1.8609, | |
| "mean_token_accuracy": 0.9742721617221832, | |
| "num_tokens": 984588235.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 33.25, | |
| "learning_rate": 8.464222379593157e-08, | |
| "loss": 1.7678, | |
| "mean_token_accuracy": 0.9771417051553726, | |
| "num_tokens": 985724223.0, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.8910956636875207, | |
| "grad_norm": 25.5, | |
| "learning_rate": 8.133227430279055e-08, | |
| "loss": 1.8519, | |
| "mean_token_accuracy": 0.976787468791008, | |
| "num_tokens": 986878816.0, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 1.8933024384861525, | |
| "grad_norm": 78.0, | |
| "learning_rate": 7.808780960068374e-08, | |
| "loss": 1.8762, | |
| "mean_token_accuracy": 0.9760066717863083, | |
| "num_tokens": 988048180.0, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.8955092132847842, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.490887288486293e-08, | |
| "loss": 1.8086, | |
| "mean_token_accuracy": 0.9755912661552429, | |
| "num_tokens": 989193104.0, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 1.8977159880834162, | |
| "grad_norm": 44.25, | |
| "learning_rate": 7.179550647817224e-08, | |
| "loss": 1.9104, | |
| "mean_token_accuracy": 0.9744039729237557, | |
| "num_tokens": 990357136.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.8999227628820479, | |
| "grad_norm": 41.75, | |
| "learning_rate": 6.8747751830483e-08, | |
| "loss": 1.8999, | |
| "mean_token_accuracy": 0.9732261091470719, | |
| "num_tokens": 991522520.0, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 1.9021295376806797, | |
| "grad_norm": 31.625, | |
| "learning_rate": 6.576564951814468e-08, | |
| "loss": 1.8353, | |
| "mean_token_accuracy": 0.9761197596788407, | |
| "num_tokens": 992681250.0, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.9043363124793116, | |
| "grad_norm": 44.5, | |
| "learning_rate": 6.28492392434421e-08, | |
| "loss": 1.7129, | |
| "mean_token_accuracy": 0.9781472623348236, | |
| "num_tokens": 993832727.0, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 1.9065430872779432, | |
| "grad_norm": 106.5, | |
| "learning_rate": 5.999855983406688e-08, | |
| "loss": 1.9484, | |
| "mean_token_accuracy": 0.9728034630417823, | |
| "num_tokens": 994974350.0, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.908749862076575, | |
| "grad_norm": 47.75, | |
| "learning_rate": 5.7213649242602865e-08, | |
| "loss": 1.7992, | |
| "mean_token_accuracy": 0.9751810654997826, | |
| "num_tokens": 996123250.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.910956636875207, | |
| "grad_norm": 75.5, | |
| "learning_rate": 5.4494544546018216e-08, | |
| "loss": 1.8234, | |
| "mean_token_accuracy": 0.9758804485201835, | |
| "num_tokens": 997275499.0, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.9131634116738385, | |
| "grad_norm": 47.75, | |
| "learning_rate": 5.184128194517302e-08, | |
| "loss": 1.9264, | |
| "mean_token_accuracy": 0.9729637667536736, | |
| "num_tokens": 998427968.0, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 1.9153701864724706, | |
| "grad_norm": 26.5, | |
| "learning_rate": 4.925389676433745e-08, | |
| "loss": 1.7685, | |
| "mean_token_accuracy": 0.9756125986576081, | |
| "num_tokens": 999578497.0, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.9175769612711022, | |
| "grad_norm": 31.875, | |
| "learning_rate": 4.67324234507216e-08, | |
| "loss": 1.8395, | |
| "mean_token_accuracy": 0.974879115819931, | |
| "num_tokens": 1000720591.0, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 1.919783736069734, | |
| "grad_norm": 66.5, | |
| "learning_rate": 4.427689557401471e-08, | |
| "loss": 1.8698, | |
| "mean_token_accuracy": 0.9746023684740066, | |
| "num_tokens": 1001885635.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.921990510868366, | |
| "grad_norm": 25.375, | |
| "learning_rate": 4.188734582594223e-08, | |
| "loss": 1.7639, | |
| "mean_token_accuracy": 0.9765366345643998, | |
| "num_tokens": 1003035953.0, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 1.9241972856669975, | |
| "grad_norm": 42.75, | |
| "learning_rate": 3.956380601982668e-08, | |
| "loss": 1.8576, | |
| "mean_token_accuracy": 0.9746762126684189, | |
| "num_tokens": 1004186983.0, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.9264040604656296, | |
| "grad_norm": 35.75, | |
| "learning_rate": 3.730630709016747e-08, | |
| "loss": 1.8812, | |
| "mean_token_accuracy": 0.975875337421894, | |
| "num_tokens": 1005337640.0, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 1.9286108352642612, | |
| "grad_norm": 33.5, | |
| "learning_rate": 3.5114879092225104e-08, | |
| "loss": 1.8061, | |
| "mean_token_accuracy": 0.9754047855734825, | |
| "num_tokens": 1006512664.0, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.930817610062893, | |
| "grad_norm": 26.875, | |
| "learning_rate": 3.2989551201624836e-08, | |
| "loss": 1.867, | |
| "mean_token_accuracy": 0.9752555429935456, | |
| "num_tokens": 1007670382.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.933024384861525, | |
| "grad_norm": 34.75, | |
| "learning_rate": 3.093035171396641e-08, | |
| "loss": 1.8486, | |
| "mean_token_accuracy": 0.9765599980950356, | |
| "num_tokens": 1008816857.0, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.9352311596601566, | |
| "grad_norm": 31.875, | |
| "learning_rate": 2.893730804444772e-08, | |
| "loss": 1.8845, | |
| "mean_token_accuracy": 0.9751312792301178, | |
| "num_tokens": 1009986863.0, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 1.9374379344587886, | |
| "grad_norm": 40.25, | |
| "learning_rate": 2.7010446727498974e-08, | |
| "loss": 1.8545, | |
| "mean_token_accuracy": 0.9768265023827553, | |
| "num_tokens": 1011143146.0, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.9396447092574203, | |
| "grad_norm": 87.0, | |
| "learning_rate": 2.514979341643131e-08, | |
| "loss": 1.9943, | |
| "mean_token_accuracy": 0.9732449635863304, | |
| "num_tokens": 1012294422.0, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 1.9418514840560521, | |
| "grad_norm": 101.5, | |
| "learning_rate": 2.3355372883093197e-08, | |
| "loss": 1.7494, | |
| "mean_token_accuracy": 0.9770986959338188, | |
| "num_tokens": 1013435222.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.944058258854684, | |
| "grad_norm": 56.0, | |
| "learning_rate": 2.162720901754234e-08, | |
| "loss": 1.8597, | |
| "mean_token_accuracy": 0.9748499050736428, | |
| "num_tokens": 1014598338.0, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 1.9462650336533156, | |
| "grad_norm": 146.0, | |
| "learning_rate": 1.996532482772595e-08, | |
| "loss": 1.7836, | |
| "mean_token_accuracy": 0.9772195801138878, | |
| "num_tokens": 1015761488.0, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.9484718084519475, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.8369742439175997e-08, | |
| "loss": 1.8448, | |
| "mean_token_accuracy": 0.9751393094658851, | |
| "num_tokens": 1016880438.0, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 1.9506785832505793, | |
| "grad_norm": 73.5, | |
| "learning_rate": 1.6840483094713867e-08, | |
| "loss": 1.9247, | |
| "mean_token_accuracy": 0.9763313442468643, | |
| "num_tokens": 1018040918.0, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.952885358049211, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.5377567154167274e-08, | |
| "loss": 1.9046, | |
| "mean_token_accuracy": 0.9767668411135674, | |
| "num_tokens": 1019209080.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.955092132847843, | |
| "grad_norm": 88.5, | |
| "learning_rate": 1.3981014094099354e-08, | |
| "loss": 1.9519, | |
| "mean_token_accuracy": 0.9730608597397804, | |
| "num_tokens": 1020352084.0, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.9572989076464746, | |
| "grad_norm": 26.625, | |
| "learning_rate": 1.2650842507550554e-08, | |
| "loss": 1.7988, | |
| "mean_token_accuracy": 0.9755701437592507, | |
| "num_tokens": 1021497976.0, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 1.9595056824451065, | |
| "grad_norm": 71.5, | |
| "learning_rate": 1.138707010378992e-08, | |
| "loss": 1.8193, | |
| "mean_token_accuracy": 0.976331713795662, | |
| "num_tokens": 1022655116.0, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.9617124572437383, | |
| "grad_norm": 56.5, | |
| "learning_rate": 1.0189713708078086e-08, | |
| "loss": 1.8042, | |
| "mean_token_accuracy": 0.9762147620320321, | |
| "num_tokens": 1023814495.0, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 1.96391923204237, | |
| "grad_norm": 91.5, | |
| "learning_rate": 9.058789261446876e-09, | |
| "loss": 1.867, | |
| "mean_token_accuracy": 0.9728297904133797, | |
| "num_tokens": 1024958911.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.966126006841002, | |
| "grad_norm": 79.5, | |
| "learning_rate": 7.994311820482825e-09, | |
| "loss": 1.7625, | |
| "mean_token_accuracy": 0.977770508825779, | |
| "num_tokens": 1026100278.0, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 1.9683327816396337, | |
| "grad_norm": 34.5, | |
| "learning_rate": 6.996295557131216e-09, | |
| "loss": 1.7883, | |
| "mean_token_accuracy": 0.9773753598332405, | |
| "num_tokens": 1027251930.0, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.9705395564382655, | |
| "grad_norm": 23.375, | |
| "learning_rate": 6.0647537585017956e-09, | |
| "loss": 1.787, | |
| "mean_token_accuracy": 0.9774172812700271, | |
| "num_tokens": 1028405074.0, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 1.9727463312368974, | |
| "grad_norm": 107.5, | |
| "learning_rate": 5.199698826697796e-09, | |
| "loss": 2.004, | |
| "mean_token_accuracy": 0.9719442516565323, | |
| "num_tokens": 1029561237.0, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.974953106035529, | |
| "grad_norm": 45.0, | |
| "learning_rate": 4.401142278647186e-09, | |
| "loss": 1.7939, | |
| "mean_token_accuracy": 0.9772859245538712, | |
| "num_tokens": 1030694833.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.9771598808341608, | |
| "grad_norm": 49.0, | |
| "learning_rate": 3.669094745950008e-09, | |
| "loss": 1.8913, | |
| "mean_token_accuracy": 0.9748271107673645, | |
| "num_tokens": 1031855806.0, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.9793666556327927, | |
| "grad_norm": 31.875, | |
| "learning_rate": 3.003565974736833e-09, | |
| "loss": 1.8319, | |
| "mean_token_accuracy": 0.9758983448147773, | |
| "num_tokens": 1033016720.0, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 1.9815734304314243, | |
| "grad_norm": 47.5, | |
| "learning_rate": 2.404564825539968e-09, | |
| "loss": 1.9449, | |
| "mean_token_accuracy": 0.9749760687351227, | |
| "num_tokens": 1034169444.0, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.9837802052300564, | |
| "grad_norm": 69.0, | |
| "learning_rate": 1.8720992731741104e-09, | |
| "loss": 1.7037, | |
| "mean_token_accuracy": 0.9773508176207543, | |
| "num_tokens": 1035314307.0, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 1.985986980028688, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.406176406631432e-09, | |
| "loss": 1.9563, | |
| "mean_token_accuracy": 0.9728571131825448, | |
| "num_tokens": 1036476142.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.9881937548273199, | |
| "grad_norm": 61.0, | |
| "learning_rate": 1.0068024289860978e-09, | |
| "loss": 1.8863, | |
| "mean_token_accuracy": 0.9757997334003449, | |
| "num_tokens": 1037615528.0, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 1.9904005296259517, | |
| "grad_norm": 29.625, | |
| "learning_rate": 6.739826573121111e-10, | |
| "loss": 1.8484, | |
| "mean_token_accuracy": 0.9749989673495293, | |
| "num_tokens": 1038763476.0, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.9926073044245833, | |
| "grad_norm": 49.75, | |
| "learning_rate": 4.0772152261336906e-10, | |
| "loss": 1.8112, | |
| "mean_token_accuracy": 0.9777238726615906, | |
| "num_tokens": 1039917315.0, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 1.9948140792232154, | |
| "grad_norm": 44.0, | |
| "learning_rate": 2.080225697631555e-10, | |
| "loss": 2.0101, | |
| "mean_token_accuracy": 0.9723559439182281, | |
| "num_tokens": 1041047624.0, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.997020854021847, | |
| "grad_norm": 44.5, | |
| "learning_rate": 7.48884574575115e-11, | |
| "loss": 1.8884, | |
| "mean_token_accuracy": 0.9754657998681069, | |
| "num_tokens": 1042193833.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.9992276288204789, | |
| "grad_norm": 45.75, | |
| "learning_rate": 8.320958180818572e-12, | |
| "loss": 2.0289, | |
| "mean_token_accuracy": 0.9712605029344559, | |
| "num_tokens": 1043350004.0, | |
| "step": 9060 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 9064, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.381112047100494e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
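
The listing above is a complete Hugging Face Trainer `trainer_state.json`: two training epochs over 9,064 steps, logged every 10 steps, with the learning rate decayed to effectively zero (~8e-12) by step 9060 and the training loss settling around 1.7–2.0 at ~0.97 mean token accuracy. As a minimal sketch of how such a log can be consumed (assuming a local copy saved as `trainer_state.json` and the standard `log_history` layout shown above; the filename is illustrative), the following Python extracts the per-step loss and learning rate and plots them on a shared step axis:

```python
# Minimal sketch: parse a Hugging Face Trainer state file and plot the
# training loss alongside the learning-rate schedule. Field names follow
# the log_history entries shown above; the path is an assumption.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only logging events that record a training loss, so all lists
# below stay aligned on the same optimizer steps.
events = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in events]
losses = [e["loss"] for e in events]
lrs = [e["learning_rate"] for e in events]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")

# Second y-axis for the learning rate, which spans several orders of
# magnitude less than the loss.
ax_lr = ax_loss.twinx()
ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.tight_layout()
plt.show()
```

Filtering on the presence of `"loss"` keeps loss, learning rate, and step drawn from the same entries; in files that also contain evaluation events (keyed as `eval_*`), those entries are skipped automatically by the same filter.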