|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9994683678894205, |
|
"eval_steps": 500, |
|
"global_step": 940, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01063264221158958, |
|
"grad_norm": 0.9233230352401733, |
|
"learning_rate": 0.00019808510638297873, |
|
"loss": 1.7298, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02126528442317916, |
|
"grad_norm": 0.9088625311851501, |
|
"learning_rate": 0.00019595744680851065, |
|
"loss": 1.2536, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03189792663476874, |
|
"grad_norm": 0.9639036655426025, |
|
"learning_rate": 0.00019382978723404257, |
|
"loss": 1.1759, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04253056884635832, |
|
"grad_norm": 0.9426536560058594, |
|
"learning_rate": 0.00019170212765957448, |
|
"loss": 1.0703, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0531632110579479, |
|
"grad_norm": 0.9788757562637329, |
|
"learning_rate": 0.0001895744680851064, |
|
"loss": 1.0505, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06379585326953748, |
|
"grad_norm": 1.2088581323623657, |
|
"learning_rate": 0.00018744680851063832, |
|
"loss": 1.0358, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07442849548112707, |
|
"grad_norm": 0.9232538342475891, |
|
"learning_rate": 0.0001853191489361702, |
|
"loss": 0.9859, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08506113769271664, |
|
"grad_norm": 1.2805695533752441, |
|
"learning_rate": 0.00018319148936170215, |
|
"loss": 0.9763, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09569377990430622, |
|
"grad_norm": 1.067161202430725, |
|
"learning_rate": 0.00018106382978723404, |
|
"loss": 1.0738, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1063264221158958, |
|
"grad_norm": 1.2387498617172241, |
|
"learning_rate": 0.00017893617021276596, |
|
"loss": 0.9779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11695906432748537, |
|
"grad_norm": 1.0024847984313965, |
|
"learning_rate": 0.00017680851063829787, |
|
"loss": 0.9815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12759170653907495, |
|
"grad_norm": 1.9225773811340332, |
|
"learning_rate": 0.0001746808510638298, |
|
"loss": 0.9203, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13822434875066453, |
|
"grad_norm": 1.3451205492019653, |
|
"learning_rate": 0.0001725531914893617, |
|
"loss": 0.9678, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14885699096225413, |
|
"grad_norm": 1.4681438207626343, |
|
"learning_rate": 0.00017042553191489362, |
|
"loss": 0.813, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1594896331738437, |
|
"grad_norm": 1.244214415550232, |
|
"learning_rate": 0.00016829787234042554, |
|
"loss": 0.7149, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17012227538543329, |
|
"grad_norm": 1.4099949598312378, |
|
"learning_rate": 0.00016617021276595746, |
|
"loss": 0.8472, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.18075491759702286, |
|
"grad_norm": 1.4794244766235352, |
|
"learning_rate": 0.00016404255319148937, |
|
"loss": 0.6425, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19138755980861244, |
|
"grad_norm": 1.574625849723816, |
|
"learning_rate": 0.0001619148936170213, |
|
"loss": 0.7604, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20202020202020202, |
|
"grad_norm": 1.8564409017562866, |
|
"learning_rate": 0.0001597872340425532, |
|
"loss": 0.6777, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2126528442317916, |
|
"grad_norm": 1.1089837551116943, |
|
"learning_rate": 0.00015765957446808512, |
|
"loss": 0.6804, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22328548644338117, |
|
"grad_norm": 1.5858855247497559, |
|
"learning_rate": 0.00015553191489361701, |
|
"loss": 0.6962, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23391812865497075, |
|
"grad_norm": 1.0457383394241333, |
|
"learning_rate": 0.00015340425531914896, |
|
"loss": 0.6975, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24455077086656035, |
|
"grad_norm": 1.007315993309021, |
|
"learning_rate": 0.00015127659574468085, |
|
"loss": 0.6911, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2551834130781499, |
|
"grad_norm": 1.4641199111938477, |
|
"learning_rate": 0.00014914893617021276, |
|
"loss": 0.6968, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2658160552897395, |
|
"grad_norm": 1.127540111541748, |
|
"learning_rate": 0.00014702127659574468, |
|
"loss": 0.6226, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.27644869750132905, |
|
"grad_norm": 1.841412901878357, |
|
"learning_rate": 0.0001448936170212766, |
|
"loss": 0.5781, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.28708133971291866, |
|
"grad_norm": 1.6684510707855225, |
|
"learning_rate": 0.00014276595744680851, |
|
"loss": 0.6214, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29771398192450826, |
|
"grad_norm": 1.7632583379745483, |
|
"learning_rate": 0.00014063829787234043, |
|
"loss": 0.5107, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3083466241360978, |
|
"grad_norm": 1.3282111883163452, |
|
"learning_rate": 0.00013851063829787235, |
|
"loss": 0.6491, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3189792663476874, |
|
"grad_norm": 1.7586910724639893, |
|
"learning_rate": 0.00013638297872340427, |
|
"loss": 0.7237, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32961190855927697, |
|
"grad_norm": 1.5256597995758057, |
|
"learning_rate": 0.00013425531914893618, |
|
"loss": 0.6446, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.34024455077086657, |
|
"grad_norm": 1.4499211311340332, |
|
"learning_rate": 0.0001321276595744681, |
|
"loss": 0.557, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3508771929824561, |
|
"grad_norm": 1.7797976732254028, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 0.6162, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3615098351940457, |
|
"grad_norm": 1.2894353866577148, |
|
"learning_rate": 0.0001278723404255319, |
|
"loss": 0.4761, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3721424774056353, |
|
"grad_norm": 1.3315067291259766, |
|
"learning_rate": 0.00012574468085106382, |
|
"loss": 0.5865, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3827751196172249, |
|
"grad_norm": 1.4586937427520752, |
|
"learning_rate": 0.00012361702127659577, |
|
"loss": 0.5099, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3934077618288145, |
|
"grad_norm": 1.2912027835845947, |
|
"learning_rate": 0.00012148936170212766, |
|
"loss": 0.5801, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 1.2132781744003296, |
|
"learning_rate": 0.00011936170212765959, |
|
"loss": 0.3565, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.41467304625199364, |
|
"grad_norm": 1.2837001085281372, |
|
"learning_rate": 0.0001172340425531915, |
|
"loss": 0.3634, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4253056884635832, |
|
"grad_norm": 1.6399765014648438, |
|
"learning_rate": 0.0001151063829787234, |
|
"loss": 0.5858, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4359383306751728, |
|
"grad_norm": 1.2120444774627686, |
|
"learning_rate": 0.00011297872340425532, |
|
"loss": 0.5533, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.44657097288676234, |
|
"grad_norm": 2.2904655933380127, |
|
"learning_rate": 0.00011085106382978725, |
|
"loss": 0.5185, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.45720361509835195, |
|
"grad_norm": 0.9909681081771851, |
|
"learning_rate": 0.00010872340425531916, |
|
"loss": 0.4418, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4678362573099415, |
|
"grad_norm": 1.9283276796340942, |
|
"learning_rate": 0.00010659574468085107, |
|
"loss": 0.458, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4784688995215311, |
|
"grad_norm": 1.5563241243362427, |
|
"learning_rate": 0.00010446808510638298, |
|
"loss": 0.3739, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4891015417331207, |
|
"grad_norm": 0.8688263893127441, |
|
"learning_rate": 0.0001023404255319149, |
|
"loss": 0.3839, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.49973418394471025, |
|
"grad_norm": 1.012356162071228, |
|
"learning_rate": 0.00010021276595744682, |
|
"loss": 0.3637, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5103668261562998, |
|
"grad_norm": 1.2394040822982788, |
|
"learning_rate": 9.808510638297873e-05, |
|
"loss": 0.4456, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5209994683678895, |
|
"grad_norm": 2.0661351680755615, |
|
"learning_rate": 9.595744680851064e-05, |
|
"loss": 0.3375, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.531632110579479, |
|
"grad_norm": 0.8300966024398804, |
|
"learning_rate": 9.382978723404256e-05, |
|
"loss": 0.2703, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5422647527910686, |
|
"grad_norm": 2.6386091709136963, |
|
"learning_rate": 9.170212765957448e-05, |
|
"loss": 0.3582, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5528973950026581, |
|
"grad_norm": 1.5658433437347412, |
|
"learning_rate": 8.95744680851064e-05, |
|
"loss": 0.5284, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5635300372142478, |
|
"grad_norm": 1.4143650531768799, |
|
"learning_rate": 8.74468085106383e-05, |
|
"loss": 0.3619, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5741626794258373, |
|
"grad_norm": 1.0321277379989624, |
|
"learning_rate": 8.531914893617021e-05, |
|
"loss": 0.449, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5847953216374269, |
|
"grad_norm": 1.4047714471817017, |
|
"learning_rate": 8.319148936170213e-05, |
|
"loss": 0.3873, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5954279638490165, |
|
"grad_norm": 1.176665186882019, |
|
"learning_rate": 8.106382978723405e-05, |
|
"loss": 0.3208, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 1.349563479423523, |
|
"learning_rate": 7.893617021276596e-05, |
|
"loss": 0.3534, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6166932482721956, |
|
"grad_norm": 1.899173617362976, |
|
"learning_rate": 7.680851063829788e-05, |
|
"loss": 0.4149, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6273258904837852, |
|
"grad_norm": 1.041756272315979, |
|
"learning_rate": 7.46808510638298e-05, |
|
"loss": 0.2761, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6379585326953748, |
|
"grad_norm": 1.1541553735733032, |
|
"learning_rate": 7.25531914893617e-05, |
|
"loss": 0.3183, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6485911749069644, |
|
"grad_norm": 1.7732151746749878, |
|
"learning_rate": 7.042553191489362e-05, |
|
"loss": 0.4356, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6592238171185539, |
|
"grad_norm": 2.6027865409851074, |
|
"learning_rate": 6.829787234042554e-05, |
|
"loss": 0.5153, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6698564593301436, |
|
"grad_norm": 1.1163185834884644, |
|
"learning_rate": 6.617021276595745e-05, |
|
"loss": 0.3758, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6804891015417331, |
|
"grad_norm": 0.8950490355491638, |
|
"learning_rate": 6.404255319148937e-05, |
|
"loss": 0.3092, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6911217437533227, |
|
"grad_norm": 0.8353213667869568, |
|
"learning_rate": 6.191489361702127e-05, |
|
"loss": 0.2713, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7017543859649122, |
|
"grad_norm": 0.9185741543769836, |
|
"learning_rate": 5.9787234042553196e-05, |
|
"loss": 0.2692, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7123870281765019, |
|
"grad_norm": 1.5412646532058716, |
|
"learning_rate": 5.7659574468085106e-05, |
|
"loss": 0.252, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7230196703880915, |
|
"grad_norm": 1.210580825805664, |
|
"learning_rate": 5.553191489361702e-05, |
|
"loss": 0.2394, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.733652312599681, |
|
"grad_norm": 2.3778483867645264, |
|
"learning_rate": 5.3404255319148946e-05, |
|
"loss": 0.2672, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7442849548112705, |
|
"grad_norm": 2.204791784286499, |
|
"learning_rate": 5.1276595744680856e-05, |
|
"loss": 0.3295, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7549175970228602, |
|
"grad_norm": 1.610378623008728, |
|
"learning_rate": 4.9148936170212766e-05, |
|
"loss": 0.3868, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7655502392344498, |
|
"grad_norm": 1.7490154504776, |
|
"learning_rate": 4.702127659574468e-05, |
|
"loss": 0.302, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7761828814460393, |
|
"grad_norm": 1.022546410560608, |
|
"learning_rate": 4.489361702127659e-05, |
|
"loss": 0.2969, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.786815523657629, |
|
"grad_norm": 1.0458086729049683, |
|
"learning_rate": 4.276595744680851e-05, |
|
"loss": 0.3652, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7974481658692185, |
|
"grad_norm": 1.330607295036316, |
|
"learning_rate": 4.063829787234043e-05, |
|
"loss": 0.2184, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 1.8219746351242065, |
|
"learning_rate": 3.8510638297872344e-05, |
|
"loss": 0.2359, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8187134502923976, |
|
"grad_norm": 3.080618143081665, |
|
"learning_rate": 3.638297872340426e-05, |
|
"loss": 0.3008, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8293460925039873, |
|
"grad_norm": 2.212218999862671, |
|
"learning_rate": 3.425531914893617e-05, |
|
"loss": 0.2675, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8399787347155768, |
|
"grad_norm": 1.714879035949707, |
|
"learning_rate": 3.212765957446809e-05, |
|
"loss": 0.2914, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8506113769271664, |
|
"grad_norm": 2.811004161834717, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2235, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.861244019138756, |
|
"grad_norm": 2.3071866035461426, |
|
"learning_rate": 2.7872340425531918e-05, |
|
"loss": 0.2387, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8718766613503456, |
|
"grad_norm": 2.134385108947754, |
|
"learning_rate": 2.574468085106383e-05, |
|
"loss": 0.2365, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8825093035619351, |
|
"grad_norm": 1.6607768535614014, |
|
"learning_rate": 2.3617021276595748e-05, |
|
"loss": 0.3073, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8931419457735247, |
|
"grad_norm": 2.4962167739868164, |
|
"learning_rate": 2.148936170212766e-05, |
|
"loss": 0.2799, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9037745879851143, |
|
"grad_norm": 3.272426128387451, |
|
"learning_rate": 1.9361702127659575e-05, |
|
"loss": 0.2627, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9144072301967039, |
|
"grad_norm": 0.6173011064529419, |
|
"learning_rate": 1.723404255319149e-05, |
|
"loss": 0.2336, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9250398724082934, |
|
"grad_norm": 1.584494948387146, |
|
"learning_rate": 1.5106382978723405e-05, |
|
"loss": 0.2081, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.935672514619883, |
|
"grad_norm": 0.16361036896705627, |
|
"learning_rate": 1.2978723404255318e-05, |
|
"loss": 0.3102, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9463051568314726, |
|
"grad_norm": 1.1021312475204468, |
|
"learning_rate": 1.0851063829787235e-05, |
|
"loss": 0.2414, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9569377990430622, |
|
"grad_norm": 1.4342254400253296, |
|
"learning_rate": 8.72340425531915e-06, |
|
"loss": 0.2268, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9675704412546517, |
|
"grad_norm": 3.8059401512145996, |
|
"learning_rate": 6.595744680851064e-06, |
|
"loss": 0.192, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9782030834662414, |
|
"grad_norm": 3.46911883354187, |
|
"learning_rate": 4.468085106382979e-06, |
|
"loss": 0.2758, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.988835725677831, |
|
"grad_norm": 1.878474235534668, |
|
"learning_rate": 2.3404255319148935e-06, |
|
"loss": 0.2006, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9994683678894205, |
|
"grad_norm": 4.247520923614502, |
|
"learning_rate": 2.1276595744680852e-07, |
|
"loss": 0.2977, |
|
"step": 940 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 940, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.314124516155392e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|