|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 5030,
  "global_step": 150876,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019883878151594685,
      "grad_norm": 2.3147895336151123,
      "learning_rate": 0.0001994694115774394,
      "loss": 0.594,
      "step": 500
    },
    {
      "epoch": 0.03976775630318937,
      "grad_norm": 2.2017321586608887,
      "learning_rate": 0.0001988061760492386,
      "loss": 0.4251,
      "step": 1000
    },
    {
      "epoch": 0.05965163445478406,
      "grad_norm": 2.0374693870544434,
      "learning_rate": 0.00019814294052103784,
      "loss": 0.386,
      "step": 1500
    },
    {
      "epoch": 0.07953551260637874,
      "grad_norm": 1.3447909355163574,
      "learning_rate": 0.00019747970499283706,
      "loss": 0.3585,
      "step": 2000
    },
    {
      "epoch": 0.09941939075797343,
      "grad_norm": 1.2458261251449585,
      "learning_rate": 0.00019682044887780548,
      "loss": 0.3545,
      "step": 2500
    },
    {
      "epoch": 0.11930326890956812,
      "grad_norm": 1.844347357749939,
      "learning_rate": 0.00019615721334960472,
      "loss": 0.3329,
      "step": 3000
    },
    {
      "epoch": 0.1391871470611628,
      "grad_norm": 1.3815340995788574,
      "learning_rate": 0.00019549397782140396,
      "loss": 0.3182,
      "step": 3500
    },
    {
      "epoch": 0.15907102521275748,
      "grad_norm": 2.2595863342285156,
      "learning_rate": 0.00019483074229320317,
      "loss": 0.2985,
      "step": 4000
    },
    {
      "epoch": 0.17895490336435219,
      "grad_norm": 1.8043708801269531,
      "learning_rate": 0.00019416750676500238,
      "loss": 0.3068,
      "step": 4500
    },
    {
      "epoch": 0.19883878151594686,
      "grad_norm": 1.8568594455718994,
      "learning_rate": 0.00019350427123680162,
      "loss": 0.3044,
      "step": 5000
    },
    {
      "epoch": 0.20003181420504254,
      "eval_loss": 0.29235559701919556,
      "eval_runtime": 31.6112,
      "eval_samples_per_second": 15.817,
      "eval_steps_per_second": 7.909,
      "step": 5030
    },
    {
      "epoch": 0.21872265966754156,
      "grad_norm": 1.780515193939209,
      "learning_rate": 0.00019284103570860084,
      "loss": 0.2884,
      "step": 5500
    },
    {
      "epoch": 0.23860653781913624,
      "grad_norm": 1.3652188777923584,
      "learning_rate": 0.00019217780018040008,
      "loss": 0.2924,
      "step": 6000
    },
    {
      "epoch": 0.2584904159707309,
      "grad_norm": 1.5280920267105103,
      "learning_rate": 0.0001915158911232557,
      "loss": 0.2714,
      "step": 6500
    },
    {
      "epoch": 0.2783742941223256,
      "grad_norm": 1.8563861846923828,
      "learning_rate": 0.00019085398206611135,
      "loss": 0.2729,
      "step": 7000
    },
    {
      "epoch": 0.2982581722739203,
      "grad_norm": 1.0958434343338013,
      "learning_rate": 0.00019019074653791056,
      "loss": 0.2735,
      "step": 7500
    },
    {
      "epoch": 0.31814205042551497,
      "grad_norm": 1.145859956741333,
      "learning_rate": 0.00018952751100970977,
      "loss": 0.2737,
      "step": 8000
    },
    {
      "epoch": 0.33802592857710967,
      "grad_norm": 1.2784571647644043,
      "learning_rate": 0.000188864275481509,
      "loss": 0.2611,
      "step": 8500
    },
    {
      "epoch": 0.35790980672870437,
      "grad_norm": 1.3518733978271484,
      "learning_rate": 0.00018820103995330822,
      "loss": 0.2612,
      "step": 9000
    },
    {
      "epoch": 0.3777936848802991,
      "grad_norm": 1.4692599773406982,
      "learning_rate": 0.00018753780442510743,
      "loss": 0.2609,
      "step": 9500
    },
    {
      "epoch": 0.3976775630318937,
      "grad_norm": 2.682018518447876,
      "learning_rate": 0.00018687456889690667,
      "loss": 0.2643,
      "step": 10000
    },
    {
      "epoch": 0.4000636284100851,
      "eval_loss": 0.2585141062736511,
      "eval_runtime": 31.9094,
      "eval_samples_per_second": 15.669,
      "eval_steps_per_second": 7.835,
      "step": 10060
    },
    {
      "epoch": 0.4175614411834884,
      "grad_norm": 1.7332996129989624,
      "learning_rate": 0.00018621133336870591,
      "loss": 0.2523,
      "step": 10500
    },
    {
      "epoch": 0.4374453193350831,
      "grad_norm": 2.1415417194366455,
      "learning_rate": 0.00018554942431156152,
      "loss": 0.2585,
      "step": 11000
    },
    {
      "epoch": 0.4573291974866778,
      "grad_norm": 1.2841447591781616,
      "learning_rate": 0.00018488618878336073,
      "loss": 0.2557,
      "step": 11500
    },
    {
      "epoch": 0.4772130756382725,
      "grad_norm": 3.2847816944122314,
      "learning_rate": 0.00018422295325516,
      "loss": 0.2609,
      "step": 12000
    },
    {
      "epoch": 0.4970969537898672,
      "grad_norm": 0.7331926822662354,
      "learning_rate": 0.00018355971772695921,
      "loss": 0.2477,
      "step": 12500
    },
    {
      "epoch": 0.5169808319414618,
      "grad_norm": 1.4415650367736816,
      "learning_rate": 0.00018289648219875843,
      "loss": 0.2482,
      "step": 13000
    },
    {
      "epoch": 0.5368647100930566,
      "grad_norm": 1.7630778551101685,
      "learning_rate": 0.00018223457314161406,
      "loss": 0.2421,
      "step": 13500
    },
    {
      "epoch": 0.5567485882446512,
      "grad_norm": 2.0509490966796875,
      "learning_rate": 0.0001815713376134133,
      "loss": 0.2444,
      "step": 14000
    },
    {
      "epoch": 0.5766324663962459,
      "grad_norm": 1.959215760231018,
      "learning_rate": 0.0001809081020852125,
      "loss": 0.2398,
      "step": 14500
    },
    {
      "epoch": 0.5965163445478406,
      "grad_norm": 1.6416336297988892,
      "learning_rate": 0.00018024486655701173,
      "loss": 0.2396,
      "step": 15000
    },
    {
      "epoch": 0.6000954426151277,
      "eval_loss": 0.23102714121341705,
      "eval_runtime": 32.4432,
      "eval_samples_per_second": 15.412,
      "eval_steps_per_second": 7.706,
      "step": 15090
    },
    {
      "epoch": 0.6164002226994353,
      "grad_norm": 0.5961350798606873,
      "learning_rate": 0.00017958163102881097,
      "loss": 0.2291,
      "step": 15500
    },
    {
      "epoch": 0.6362841008510299,
      "grad_norm": 1.3288564682006836,
      "learning_rate": 0.00017891972197166657,
      "loss": 0.2368,
      "step": 16000
    },
    {
      "epoch": 0.6561679790026247,
      "grad_norm": 1.7699204683303833,
      "learning_rate": 0.0001782564864434658,
      "loss": 0.2397,
      "step": 16500
    },
    {
      "epoch": 0.6760518571542193,
      "grad_norm": 1.3349616527557373,
      "learning_rate": 0.00017759325091526505,
      "loss": 0.2353,
      "step": 17000
    },
    {
      "epoch": 0.6959357353058141,
      "grad_norm": 1.6228617429733276,
      "learning_rate": 0.00017693001538706426,
      "loss": 0.2314,
      "step": 17500
    },
    {
      "epoch": 0.7158196134574087,
      "grad_norm": 1.349108338356018,
      "learning_rate": 0.00017626677985886348,
      "loss": 0.2339,
      "step": 18000
    },
    {
      "epoch": 0.7357034916090034,
      "grad_norm": 2.2128469944000244,
      "learning_rate": 0.00017560354433066272,
      "loss": 0.2252,
      "step": 18500
    },
    {
      "epoch": 0.7555873697605981,
      "grad_norm": 2.038177013397217,
      "learning_rate": 0.00017494163527351835,
      "loss": 0.2242,
      "step": 19000
    },
    {
      "epoch": 0.7754712479121928,
      "grad_norm": 1.1377464532852173,
      "learning_rate": 0.00017427839974531756,
      "loss": 0.2197,
      "step": 19500
    },
    {
      "epoch": 0.7953551260637874,
      "grad_norm": 1.2686738967895508,
      "learning_rate": 0.00017361516421711678,
      "loss": 0.2277,
      "step": 20000
    },
    {
      "epoch": 0.8001272568201702,
      "eval_loss": 0.21075737476348877,
      "eval_runtime": 32.819,
      "eval_samples_per_second": 15.235,
      "eval_steps_per_second": 7.618,
      "step": 20120
    },
    {
      "epoch": 0.8152390042153822,
      "grad_norm": 2.1816446781158447,
      "learning_rate": 0.00017295192868891602,
      "loss": 0.2245,
      "step": 20500
    },
    {
      "epoch": 0.8351228823669768,
      "grad_norm": 2.224292755126953,
      "learning_rate": 0.00017229001963177165,
      "loss": 0.2176,
      "step": 21000
    },
    {
      "epoch": 0.8550067605185715,
      "grad_norm": 0.7720803618431091,
      "learning_rate": 0.00017162811057462726,
      "loss": 0.2097,
      "step": 21500
    },
    {
      "epoch": 0.8748906386701663,
      "grad_norm": 1.716422200202942,
      "learning_rate": 0.0001709648750464265,
      "loss": 0.2104,
      "step": 22000
    },
    {
      "epoch": 0.8947745168217609,
      "grad_norm": 1.5487083196640015,
      "learning_rate": 0.00017030163951822574,
      "loss": 0.2179,
      "step": 22500
    },
    {
      "epoch": 0.9146583949733555,
      "grad_norm": 1.0968928337097168,
      "learning_rate": 0.00016963840399002495,
      "loss": 0.2194,
      "step": 23000
    },
    {
      "epoch": 0.9345422731249503,
      "grad_norm": 0.36388519406318665,
      "learning_rate": 0.00016897516846182416,
      "loss": 0.2259,
      "step": 23500
    },
    {
      "epoch": 0.954426151276545,
      "grad_norm": 1.2857156991958618,
      "learning_rate": 0.0001683119329336234,
      "loss": 0.2195,
      "step": 24000
    },
    {
      "epoch": 0.9743100294281397,
      "grad_norm": 1.7136821746826172,
      "learning_rate": 0.00016764869740542261,
      "loss": 0.2036,
      "step": 24500
    },
    {
      "epoch": 0.9941939075797344,
      "grad_norm": 2.1068880558013916,
      "learning_rate": 0.00016698546187722183,
      "loss": 0.2103,
      "step": 25000
    },
    {
      "epoch": 1.0001590710252128,
      "eval_loss": 0.2087584286928177,
      "eval_runtime": 34.3797,
      "eval_samples_per_second": 14.543,
      "eval_steps_per_second": 7.272,
      "step": 25150
    },
    {
      "epoch": 1.014077785731329,
      "grad_norm": 0.9411464929580688,
      "learning_rate": 0.0001663222263490211,
      "loss": 0.1902,
      "step": 25500
    },
    {
      "epoch": 1.0339616638829237,
      "grad_norm": 1.8499776124954224,
      "learning_rate": 0.0001656589908208203,
      "loss": 0.1961,
      "step": 26000
    },
    {
      "epoch": 1.0538455420345185,
      "grad_norm": 2.804515838623047,
      "learning_rate": 0.00016499708176367591,
      "loss": 0.1975,
      "step": 26500
    },
    {
      "epoch": 1.0737294201861132,
      "grad_norm": 1.0124012231826782,
      "learning_rate": 0.00016433384623547515,
      "loss": 0.1965,
      "step": 27000
    },
    {
      "epoch": 1.0936132983377078,
      "grad_norm": 2.1755409240722656,
      "learning_rate": 0.0001636706107072744,
      "loss": 0.2032,
      "step": 27500
    },
    {
      "epoch": 1.1134971764893025,
      "grad_norm": 1.9173059463500977,
      "learning_rate": 0.0001630073751790736,
      "loss": 0.1998,
      "step": 28000
    },
    {
      "epoch": 1.1333810546408971,
      "grad_norm": 2.6213159561157227,
      "learning_rate": 0.0001623454661219292,
      "loss": 0.1944,
      "step": 28500
    },
    {
      "epoch": 1.1532649327924918,
      "grad_norm": 1.979064702987671,
      "learning_rate": 0.00016168223059372845,
      "loss": 0.1947,
      "step": 29000
    },
    {
      "epoch": 1.1731488109440866,
      "grad_norm": 3.864307403564453,
      "learning_rate": 0.00016101899506552767,
      "loss": 0.1948,
      "step": 29500
    },
    {
      "epoch": 1.1930326890956813,
      "grad_norm": 1.3037413358688354,
      "learning_rate": 0.0001603557595373269,
      "loss": 0.1942,
      "step": 30000
    },
    {
      "epoch": 1.2001908852302554,
      "eval_loss": 0.20310936868190765,
      "eval_runtime": 33.1759,
      "eval_samples_per_second": 15.071,
      "eval_steps_per_second": 7.536,
      "step": 30180
    },
    {
      "epoch": 1.212916567247276,
      "grad_norm": 1.4742060899734497,
      "learning_rate": 0.00015969385048018254,
      "loss": 0.1873,
      "step": 30500
    },
    {
      "epoch": 1.2328004453988706,
      "grad_norm": 0.8921416401863098,
      "learning_rate": 0.00015903061495198175,
      "loss": 0.1888,
      "step": 31000
    },
    {
      "epoch": 1.2526843235504652,
      "grad_norm": 2.814047336578369,
      "learning_rate": 0.00015836737942378096,
      "loss": 0.1933,
      "step": 31500
    },
    {
      "epoch": 1.27256820170206,
      "grad_norm": 1.6011271476745605,
      "learning_rate": 0.0001577041438955802,
      "loss": 0.1943,
      "step": 32000
    },
    {
      "epoch": 1.2924520798536547,
      "grad_norm": 2.1085023880004883,
      "learning_rate": 0.00015704090836737944,
      "loss": 0.1939,
      "step": 32500
    },
    {
      "epoch": 1.3123359580052494,
      "grad_norm": 1.4254250526428223,
      "learning_rate": 0.00015637899931023505,
      "loss": 0.1898,
      "step": 33000
    },
    {
      "epoch": 1.332219836156844,
      "grad_norm": 2.6085307598114014,
      "learning_rate": 0.00015571576378203426,
      "loss": 0.1869,
      "step": 33500
    },
    {
      "epoch": 1.3521037143084387,
      "grad_norm": 1.4234007596969604,
      "learning_rate": 0.00015505252825383353,
      "loss": 0.1924,
      "step": 34000
    },
    {
      "epoch": 1.3719875924600333,
      "grad_norm": 1.5566816329956055,
      "learning_rate": 0.00015439061919668914,
      "loss": 0.1932,
      "step": 34500
    },
    {
      "epoch": 1.391871470611628,
      "grad_norm": 8.811753273010254,
      "learning_rate": 0.00015372738366848835,
      "loss": 0.1846,
      "step": 35000
    },
    {
      "epoch": 1.400222699435298,
      "eval_loss": 0.19551397860050201,
      "eval_runtime": 35.3626,
      "eval_samples_per_second": 14.139,
      "eval_steps_per_second": 7.07,
      "step": 35210
    },
    {
      "epoch": 1.4117553487632228,
      "grad_norm": 1.463742971420288,
      "learning_rate": 0.0001530641481402876,
      "loss": 0.1924,
      "step": 35500
    },
    {
      "epoch": 1.4316392269148175,
      "grad_norm": 2.8610801696777344,
      "learning_rate": 0.00015240091261208683,
      "loss": 0.1938,
      "step": 36000
    },
    {
      "epoch": 1.4515231050664121,
      "grad_norm": 1.2369511127471924,
      "learning_rate": 0.00015173767708388604,
      "loss": 0.1817,
      "step": 36500
    },
    {
      "epoch": 1.4714069832180068,
      "grad_norm": 1.4233882427215576,
      "learning_rate": 0.00015107444155568526,
      "loss": 0.1817,
      "step": 37000
    },
    {
      "epoch": 1.4912908613696017,
      "grad_norm": 1.5385404825210571,
      "learning_rate": 0.0001504112060274845,
      "loss": 0.1849,
      "step": 37500
    },
    {
      "epoch": 1.5111747395211963,
      "grad_norm": 1.2022610902786255,
      "learning_rate": 0.0001497479704992837,
      "loss": 0.183,
      "step": 38000
    },
    {
      "epoch": 1.531058617672791,
      "grad_norm": 1.5988807678222656,
      "learning_rate": 0.00014908606144213934,
      "loss": 0.1821,
      "step": 38500
    },
    {
      "epoch": 1.5509424958243856,
      "grad_norm": 1.480351209640503,
      "learning_rate": 0.00014842282591393855,
      "loss": 0.1787,
      "step": 39000
    },
    {
      "epoch": 1.5708263739759802,
      "grad_norm": 1.9051077365875244,
      "learning_rate": 0.0001477595903857378,
      "loss": 0.1826,
      "step": 39500
    },
    {
      "epoch": 1.5907102521275749,
      "grad_norm": 2.0775561332702637,
      "learning_rate": 0.000147096354857537,
      "loss": 0.1814,
      "step": 40000
    },
    {
      "epoch": 1.6002545136403405,
      "eval_loss": 0.18265177309513092,
      "eval_runtime": 31.3228,
      "eval_samples_per_second": 15.963,
      "eval_steps_per_second": 7.981,
      "step": 40240
    },
    {
      "epoch": 1.6105941302791695,
      "grad_norm": 2.4270079135894775,
      "learning_rate": 0.00014643444580039264,
      "loss": 0.1728,
      "step": 40500
    },
    {
      "epoch": 1.6304780084307642,
      "grad_norm": 1.4693158864974976,
      "learning_rate": 0.00014577121027219188,
      "loss": 0.1835,
      "step": 41000
    },
    {
      "epoch": 1.650361886582359,
      "grad_norm": 2.014503240585327,
      "learning_rate": 0.0001451079747439911,
      "loss": 0.1825,
      "step": 41500
    },
    {
      "epoch": 1.6702457647339537,
      "grad_norm": 2.4731264114379883,
      "learning_rate": 0.0001444447392157903,
      "loss": 0.1803,
      "step": 42000
    },
    {
      "epoch": 1.6901296428855483,
      "grad_norm": 1.043967366218567,
      "learning_rate": 0.00014378150368758955,
      "loss": 0.1831,
      "step": 42500
    },
    {
      "epoch": 1.7100135210371432,
      "grad_norm": 2.0062990188598633,
      "learning_rate": 0.00014311826815938879,
      "loss": 0.1851,
      "step": 43000
    },
    {
      "epoch": 1.7298973991887379,
      "grad_norm": 2.958763837814331,
      "learning_rate": 0.000142455032631188,
      "loss": 0.1825,
      "step": 43500
    },
    {
      "epoch": 1.7497812773403325,
      "grad_norm": 1.6624678373336792,
      "learning_rate": 0.0001417917971029872,
      "loss": 0.1834,
      "step": 44000
    },
    {
      "epoch": 1.7696651554919272,
      "grad_norm": 1.1731349229812622,
      "learning_rate": 0.00014112856157478645,
      "loss": 0.1775,
      "step": 44500
    },
    {
      "epoch": 1.7895490336435218,
      "grad_norm": 1.2256383895874023,
      "learning_rate": 0.00014046665251764206,
      "loss": 0.1688,
      "step": 45000
    },
    {
      "epoch": 1.8002863278453831,
      "eval_loss": 0.1788606494665146,
      "eval_runtime": 31.0353,
      "eval_samples_per_second": 16.111,
      "eval_steps_per_second": 8.055,
      "step": 45270
    },
    {
      "epoch": 1.8094329117951164,
      "grad_norm": 5.187135219573975,
      "learning_rate": 0.0001398034169894413,
      "loss": 0.1757,
      "step": 45500
    },
    {
      "epoch": 1.829316789946711,
      "grad_norm": 3.0221340656280518,
      "learning_rate": 0.00013914018146124054,
      "loss": 0.169,
      "step": 46000
    },
    {
      "epoch": 1.8492006680983057,
      "grad_norm": 1.780038833618164,
      "learning_rate": 0.00013847694593303975,
      "loss": 0.1815,
      "step": 46500
    },
    {
      "epoch": 1.8690845462499006,
      "grad_norm": 1.5590816736221313,
      "learning_rate": 0.00013781503687589536,
      "loss": 0.1754,
      "step": 47000
    },
    {
      "epoch": 1.8889684244014953,
      "grad_norm": 2.1302263736724854,
      "learning_rate": 0.0001371518013476946,
      "loss": 0.1737,
      "step": 47500
    },
    {
      "epoch": 1.90885230255309,
      "grad_norm": 2.173957109451294,
      "learning_rate": 0.00013648856581949384,
      "loss": 0.1696,
      "step": 48000
    },
    {
      "epoch": 1.9287361807046848,
      "grad_norm": 1.0864589214324951,
      "learning_rate": 0.00013582533029129305,
      "loss": 0.1832,
      "step": 48500
    },
    {
      "epoch": 1.9486200588562794,
      "grad_norm": 1.9979732036590576,
      "learning_rate": 0.00013516209476309226,
      "loss": 0.1758,
      "step": 49000
    },
    {
      "epoch": 1.968503937007874,
      "grad_norm": 2.2656006813049316,
      "learning_rate": 0.00013450018570594792,
      "loss": 0.1728,
      "step": 49500
    },
    {
      "epoch": 1.9883878151594687,
      "grad_norm": 2.077143669128418,
      "learning_rate": 0.00013383695017774714,
      "loss": 0.1772,
      "step": 50000
    },
    {
      "epoch": 2.0003181420504257,
      "eval_loss": 0.1767009198665619,
      "eval_runtime": 32.0718,
      "eval_samples_per_second": 15.59,
      "eval_steps_per_second": 7.795,
      "step": 50300
    },
    {
      "epoch": 2.0082716933110634,
      "grad_norm": 1.5150219202041626,
      "learning_rate": 0.00013317371464954635,
      "loss": 0.164,
      "step": 50500
    },
    {
      "epoch": 2.028155571462658,
      "grad_norm": 1.2225052118301392,
      "learning_rate": 0.0001325104791213456,
      "loss": 0.1534,
      "step": 51000
    },
    {
      "epoch": 2.0480394496142527,
      "grad_norm": 1.9311732053756714,
      "learning_rate": 0.0001318472435931448,
      "loss": 0.1584,
      "step": 51500
    },
    {
      "epoch": 2.0679233277658473,
      "grad_norm": 2.399226188659668,
      "learning_rate": 0.000131184008064944,
      "loss": 0.153,
      "step": 52000
    },
    {
      "epoch": 2.087807205917442,
      "grad_norm": 2.9771511554718018,
      "learning_rate": 0.00013052077253674325,
      "loss": 0.1528,
      "step": 52500
    },
    {
      "epoch": 2.107691084069037,
      "grad_norm": 1.5730229616165161,
      "learning_rate": 0.0001298588634795989,
      "loss": 0.154,
      "step": 53000
    },
    {
      "epoch": 2.1275749622206317,
      "grad_norm": 1.2603771686553955,
      "learning_rate": 0.0001291956279513981,
      "loss": 0.1575,
      "step": 53500
    },
    {
      "epoch": 2.1474588403722263,
      "grad_norm": 2.315129280090332,
      "learning_rate": 0.00012853371889425373,
      "loss": 0.1603,
      "step": 54000
    },
    {
      "epoch": 2.167342718523821,
      "grad_norm": 1.5500929355621338,
      "learning_rate": 0.00012787048336605297,
      "loss": 0.1544,
      "step": 54500
    },
    {
      "epoch": 2.1872265966754156,
      "grad_norm": 1.2831624746322632,
      "learning_rate": 0.0001272072478378522,
      "loss": 0.143,
      "step": 55000
    },
    {
      "epoch": 2.2003499562554683,
      "eval_loss": 0.1747935712337494,
      "eval_runtime": 31.3745,
      "eval_samples_per_second": 15.936,
      "eval_steps_per_second": 7.968,
      "step": 55330
    },
    {
      "epoch": 2.2071104748270103,
      "grad_norm": 1.4339005947113037,
      "learning_rate": 0.0001265440123096514,
      "loss": 0.1587,
      "step": 55500
    },
    {
      "epoch": 2.226994352978605,
      "grad_norm": 2.5719878673553467,
      "learning_rate": 0.00012588077678145064,
      "loss": 0.1527,
      "step": 56000
    },
    {
      "epoch": 2.2468782311301996,
      "grad_norm": 2.1503918170928955,
      "learning_rate": 0.00012521754125324988,
      "loss": 0.1585,
      "step": 56500
    },
    {
      "epoch": 2.2667621092817942,
      "grad_norm": 3.443103551864624,
      "learning_rate": 0.0001245543057250491,
      "loss": 0.1593,
      "step": 57000
    },
    {
      "epoch": 2.286645987433389,
      "grad_norm": 1.2123481035232544,
      "learning_rate": 0.0001238910701968483,
      "loss": 0.1624,
      "step": 57500
    },
    {
      "epoch": 2.3065298655849835,
      "grad_norm": 1.7718604803085327,
      "learning_rate": 0.00012322783466864754,
      "loss": 0.1569,
      "step": 58000
    },
    {
      "epoch": 2.3264137437365786,
      "grad_norm": 4.025502681732178,
      "learning_rate": 0.00012256592561150315,
      "loss": 0.156,
      "step": 58500
    },
    {
      "epoch": 2.3462976218881733,
      "grad_norm": 2.876533269882202,
      "learning_rate": 0.00012190401655435879,
      "loss": 0.1616,
      "step": 59000
    },
    {
      "epoch": 2.366181500039768,
      "grad_norm": 1.680629849433899,
      "learning_rate": 0.00012124078102615801,
      "loss": 0.1543,
      "step": 59500
    },
    {
      "epoch": 2.3860653781913626,
      "grad_norm": 1.0101946592330933,
      "learning_rate": 0.00012057754549795724,
      "loss": 0.1465,
      "step": 60000
    },
    {
      "epoch": 2.400381770460511,
      "eval_loss": 0.16907282173633575,
      "eval_runtime": 31.0907,
      "eval_samples_per_second": 16.082,
      "eval_steps_per_second": 8.041,
      "step": 60360
    },
    {
      "epoch": 2.405949256342957,
      "grad_norm": 1.9501550197601318,
      "learning_rate": 0.00011991430996975645,
      "loss": 0.1535,
      "step": 60500
    },
    {
      "epoch": 2.425833134494552,
      "grad_norm": 3.678605556488037,
      "learning_rate": 0.0001192510744415557,
      "loss": 0.1453,
      "step": 61000
    },
    {
      "epoch": 2.4457170126461465,
      "grad_norm": 4.075408458709717,
      "learning_rate": 0.00011858783891335492,
      "loss": 0.1494,
      "step": 61500
    },
    {
      "epoch": 2.465600890797741,
      "grad_norm": 1.3122295141220093,
      "learning_rate": 0.00011792460338515414,
      "loss": 0.1593,
      "step": 62000
    },
    {
      "epoch": 2.485484768949336,
      "grad_norm": 2.0857038497924805,
      "learning_rate": 0.00011726136785695337,
      "loss": 0.1514,
      "step": 62500
    },
    {
      "epoch": 2.5053686471009304,
      "grad_norm": 1.6657792329788208,
      "learning_rate": 0.000116599458799809,
      "loss": 0.1501,
      "step": 63000
    },
    {
      "epoch": 2.525252525252525,
      "grad_norm": 1.8100333213806152,
      "learning_rate": 0.00011593622327160823,
      "loss": 0.1519,
      "step": 63500
    },
    {
      "epoch": 2.54513640340412,
      "grad_norm": 2.504148006439209,
      "learning_rate": 0.00011527298774340744,
      "loss": 0.1496,
      "step": 64000
    },
    {
      "epoch": 2.5650202815557144,
      "grad_norm": 1.1805058717727661,
      "learning_rate": 0.00011461107868626306,
      "loss": 0.144,
      "step": 64500
    },
    {
      "epoch": 2.5849041597073095,
      "grad_norm": 3.109494686126709,
      "learning_rate": 0.0001139478431580623,
      "loss": 0.1517,
      "step": 65000
    },
    {
      "epoch": 2.600413584665553,
      "eval_loss": 0.17017363011837006,
      "eval_runtime": 31.1173,
      "eval_samples_per_second": 16.068,
      "eval_steps_per_second": 8.034,
      "step": 65390
    },
    {
      "epoch": 2.604788037858904,
      "grad_norm": 2.1431307792663574,
      "learning_rate": 0.00011328460762986153,
      "loss": 0.1561,
      "step": 65500
    },
    {
      "epoch": 2.6246719160104988,
      "grad_norm": 1.5804929733276367,
      "learning_rate": 0.00011262137210166075,
      "loss": 0.1536,
      "step": 66000
    },
    {
      "epoch": 2.6445557941620934,
      "grad_norm": 3.0483312606811523,
      "learning_rate": 0.00011195813657345997,
      "loss": 0.1497,
      "step": 66500
    },
    {
      "epoch": 2.664439672313688,
      "grad_norm": 2.7714364528656006,
      "learning_rate": 0.00011129490104525919,
      "loss": 0.1493,
      "step": 67000
    },
    {
      "epoch": 2.6843235504652827,
      "grad_norm": 1.1538747549057007,
      "learning_rate": 0.00011063299198811483,
      "loss": 0.1586,
      "step": 67500
    },
    {
      "epoch": 2.7042074286168774,
      "grad_norm": 1.946218729019165,
      "learning_rate": 0.00010996975645991405,
      "loss": 0.1484,
      "step": 68000
    },
    {
      "epoch": 2.724091306768472,
      "grad_norm": 1.453487753868103,
      "learning_rate": 0.00010930652093171328,
      "loss": 0.1545,
      "step": 68500
    },
    {
      "epoch": 2.7439751849200666,
      "grad_norm": 1.9838638305664062,
      "learning_rate": 0.00010864328540351249,
      "loss": 0.1443,
      "step": 69000
    },
    {
      "epoch": 2.7638590630716617,
      "grad_norm": 0.83545982837677,
      "learning_rate": 0.00010798004987531172,
      "loss": 0.148,
      "step": 69500
    },
    {
      "epoch": 2.783742941223256,
      "grad_norm": 2.1687493324279785,
      "learning_rate": 0.00010731681434711096,
      "loss": 0.1533,
      "step": 70000
    },
    {
      "epoch": 2.800445398870596,
      "eval_loss": 0.16508887708187103,
      "eval_runtime": 31.1958,
      "eval_samples_per_second": 16.028,
      "eval_steps_per_second": 8.014,
      "step": 70420
    },
    {
      "epoch": 2.803626819374851,
      "grad_norm": 1.3648791313171387,
      "learning_rate": 0.00010665357881891018,
      "loss": 0.1419,
      "step": 70500
    },
    {
      "epoch": 2.8235106975264457,
      "grad_norm": 1.7863802909851074,
      "learning_rate": 0.00010599034329070941,
      "loss": 0.1482,
      "step": 71000
    },
    {
      "epoch": 2.8433945756780403,
      "grad_norm": 3.2346866130828857,
      "learning_rate": 0.00010532710776250862,
      "loss": 0.1425,
      "step": 71500
    },
    {
      "epoch": 2.863278453829635,
      "grad_norm": 0.9211582541465759,
      "learning_rate": 0.00010466519870536427,
      "loss": 0.1501,
      "step": 72000
    },
    {
      "epoch": 2.8831623319812296,
      "grad_norm": 0.1923113614320755,
      "learning_rate": 0.00010400196317716348,
      "loss": 0.1469,
      "step": 72500
    },
    {
      "epoch": 2.9030462101328243,
      "grad_norm": 4.506950855255127,
      "learning_rate": 0.00010333872764896271,
      "loss": 0.1465,
      "step": 73000
    },
    {
      "epoch": 2.922930088284419,
      "grad_norm": 3.244807004928589,
      "learning_rate": 0.00010267549212076194,
      "loss": 0.1425,
      "step": 73500
    },
    {
      "epoch": 2.9428139664360136,
      "grad_norm": 1.8501616716384888,
      "learning_rate": 0.00010201225659256115,
      "loss": 0.1446,
      "step": 74000
    },
    {
      "epoch": 2.962697844587608,
      "grad_norm": 3.9013025760650635,
      "learning_rate": 0.0001013503475354168,
      "loss": 0.1508,
      "step": 74500
    },
    {
      "epoch": 2.9825817227392033,
      "grad_norm": 3.1799283027648926,
      "learning_rate": 0.00010068711200721601,
      "loss": 0.1412,
      "step": 75000
    },
    {
      "epoch": 3.000477213075638,
      "eval_loss": 0.1569422334432602,
      "eval_runtime": 33.8027,
      "eval_samples_per_second": 14.792,
      "eval_steps_per_second": 7.396,
      "step": 75450
    },
    {
      "epoch": 3.002465600890798,
      "grad_norm": 1.1885572671890259,
      "learning_rate": 0.00010002520295007163,
      "loss": 0.1334,
      "step": 75500
    },
    {
      "epoch": 3.0223494790423926,
      "grad_norm": 0.8424841165542603,
      "learning_rate": 9.936196742187086e-05,
      "loss": 0.1261,
      "step": 76000
    },
    {
      "epoch": 3.0422333571939872,
      "grad_norm": 1.2642099857330322,
      "learning_rate": 9.869873189367008e-05,
      "loss": 0.1233,
      "step": 76500
    },
    {
      "epoch": 3.062117235345582,
      "grad_norm": 3.7216506004333496,
      "learning_rate": 9.803549636546931e-05,
      "loss": 0.1257,
      "step": 77000
    },
    {
      "epoch": 3.0820011134971765,
      "grad_norm": 1.576157808303833,
      "learning_rate": 9.737358730832493e-05,
      "loss": 0.1257,
      "step": 77500
    },
    {
      "epoch": 3.101884991648771,
      "grad_norm": 3.2462401390075684,
      "learning_rate": 9.671035178012417e-05,
      "loss": 0.1387,
      "step": 78000
    },
    {
      "epoch": 3.121768869800366,
      "grad_norm": 0.38061007857322693,
      "learning_rate": 9.604711625192338e-05,
      "loss": 0.1288,
      "step": 78500
    },
    {
      "epoch": 3.1416527479519605,
      "grad_norm": 1.4168367385864258,
      "learning_rate": 9.538388072372262e-05,
      "loss": 0.125,
      "step": 79000
    },
    {
      "epoch": 3.161536626103555,
      "grad_norm": 1.8440510034561157,
      "learning_rate": 9.472064519552183e-05,
      "loss": 0.1203,
      "step": 79500
    },
    {
      "epoch": 3.1814205042551498,
      "grad_norm": 0.012771493755280972,
      "learning_rate": 9.405873613837747e-05,
      "loss": 0.1237,
      "step": 80000
    },
    {
      "epoch": 3.2005090272806807,
      "eval_loss": 0.15806268155574799,
      "eval_runtime": 31.0713,
      "eval_samples_per_second": 16.092,
      "eval_steps_per_second": 8.046,
      "step": 80480
    },
    {
      "epoch": 3.2013043824067444,
      "grad_norm": 2.36724853515625,
      "learning_rate": 9.33955006101767e-05,
      "loss": 0.1253,
      "step": 80500
    },
    {
      "epoch": 3.221188260558339,
      "grad_norm": 1.2032071352005005,
      "learning_rate": 9.273226508197591e-05,
      "loss": 0.1321,
      "step": 81000
    },
    {
      "epoch": 3.241072138709934,
      "grad_norm": 1.6413310766220093,
      "learning_rate": 9.206902955377515e-05,
      "loss": 0.1256,
      "step": 81500
    },
    {
      "epoch": 3.260956016861529,
      "grad_norm": 1.263128638267517,
      "learning_rate": 9.140579402557436e-05,
      "loss": 0.1281,
      "step": 82000
    },
    {
      "epoch": 3.2808398950131235,
      "grad_norm": 2.12233304977417,
      "learning_rate": 9.074255849737359e-05,
      "loss": 0.1299,
      "step": 82500
    },
    {
      "epoch": 3.300723773164718,
      "grad_norm": 1.265188217163086,
      "learning_rate": 9.007932296917282e-05,
      "loss": 0.1226,
      "step": 83000
    },
    {
      "epoch": 3.3206076513163127,
      "grad_norm": 1.391507625579834,
      "learning_rate": 8.941608744097204e-05,
      "loss": 0.1269,
      "step": 83500
    },
    {
      "epoch": 3.3404915294679074,
      "grad_norm": 1.941706657409668,
      "learning_rate": 8.875285191277126e-05,
      "loss": 0.1306,
      "step": 84000
    },
    {
      "epoch": 3.360375407619502,
      "grad_norm": 1.1882116794586182,
      "learning_rate": 8.809094285562688e-05,
      "loss": 0.1287,
      "step": 84500
    },
    {
      "epoch": 3.3802592857710967,
      "grad_norm": 2.2689340114593506,
      "learning_rate": 8.742903379848252e-05,
      "loss": 0.1281,
      "step": 85000
    },
    {
      "epoch": 3.4001431639226913,
      "grad_norm": 2.9482505321502686,
      "learning_rate": 8.676579827028175e-05,
      "loss": 0.131,
      "step": 85500
    },
    {
      "epoch": 3.400540841485723,
      "eval_loss": 0.15611666440963745,
      "eval_runtime": 31.2043,
      "eval_samples_per_second": 16.023,
      "eval_steps_per_second": 8.012,
      "step": 85510
    },
    {
      "epoch": 3.420027042074286,
      "grad_norm": 1.8892121315002441,
      "learning_rate": 8.610256274208097e-05,
      "loss": 0.1261,
      "step": 86000
    },
    {
      "epoch": 3.4399109202258806,
      "grad_norm": 2.923980236053467,
      "learning_rate": 8.54393272138802e-05,
      "loss": 0.1304,
      "step": 86500
    },
    {
      "epoch": 3.4597947983774757,
      "grad_norm": 0.46090424060821533,
      "learning_rate": 8.477609168567942e-05,
      "loss": 0.1263,
      "step": 87000
    },
    {
      "epoch": 3.4796786765290704,
      "grad_norm": 1.307573676109314,
      "learning_rate": 8.411285615747865e-05,
      "loss": 0.1217,
      "step": 87500
    },
    {
      "epoch": 3.499562554680665,
      "grad_norm": 1.9948647022247314,
      "learning_rate": 8.345094710033427e-05,
      "loss": 0.1298,
      "step": 88000
    },
    {
      "epoch": 3.5194464328322597,
      "grad_norm": 1.2264705896377563,
      "learning_rate": 8.27877115721335e-05,
      "loss": 0.1254,
      "step": 88500
    },
    {
      "epoch": 3.5393303109838543,
      "grad_norm": 1.249614953994751,
      "learning_rate": 8.212447604393274e-05,
      "loss": 0.1217,
      "step": 89000
    },
    {
      "epoch": 3.559214189135449,
      "grad_norm": 1.5197207927703857,
      "learning_rate": 8.146124051573195e-05,
      "loss": 0.1276,
      "step": 89500
    },
    {
      "epoch": 3.5790980672870436,
      "grad_norm": 0.8200188875198364,
      "learning_rate": 8.079800498753118e-05,
      "loss": 0.1283,
      "step": 90000
    },
    {
      "epoch": 3.5989819454386383,
      "grad_norm": 1.3526813983917236,
      "learning_rate": 8.01347694593304e-05,
      "loss": 0.1221,
      "step": 90500
    },
    {
      "epoch": 3.600572655690766,
      "eval_loss": 0.14859890937805176,
      "eval_runtime": 32.6446,
      "eval_samples_per_second": 15.316,
      "eval_steps_per_second": 7.658,
      "step": 90540
    },
    {
      "epoch": 3.618865823590233,
      "grad_norm": 1.9000824689865112,
      "learning_rate": 7.947153393112963e-05,
      "loss": 0.1215,
      "step": 91000
    },
    {
      "epoch": 3.638749701741828,
      "grad_norm": 1.2409000396728516,
      "learning_rate": 7.880829840292885e-05,
      "loss": 0.1259,
      "step": 91500
    },
    {
      "epoch": 3.658633579893422,
      "grad_norm": 1.7296925783157349,
      "learning_rate": 7.814506287472808e-05,
      "loss": 0.1245,
      "step": 92000
    },
    {
      "epoch": 3.6785174580450173,
      "grad_norm": 1.3685667514801025,
      "learning_rate": 7.748315381758371e-05,
      "loss": 0.127,
      "step": 92500
    },
    {
      "epoch": 3.698401336196612,
      "grad_norm": 2.338923692703247,
      "learning_rate": 7.681991828938293e-05,
      "loss": 0.119,
      "step": 93000
    },
    {
      "epoch": 3.7182852143482066,
      "grad_norm": 2.3641035556793213,
      "learning_rate": 7.615668276118215e-05,
      "loss": 0.1256,
      "step": 93500
    },
    {
      "epoch": 3.7381690924998012,
      "grad_norm": 1.444541335105896,
      "learning_rate": 7.549477370403777e-05,
      "loss": 0.1218,
      "step": 94000
    },
    {
      "epoch": 3.758052970651396,
      "grad_norm": 2.2175920009613037,
      "learning_rate": 7.4831538175837e-05,
      "loss": 0.1181,
      "step": 94500
    },
    {
      "epoch": 3.7779368488029905,
      "grad_norm": 2.533275604248047,
      "learning_rate": 7.416830264763624e-05,
      "loss": 0.1203,
      "step": 95000
    },
    {
      "epoch": 3.797820726954585,
      "grad_norm": 2.5117480754852295,
      "learning_rate": 7.350506711943545e-05,
      "loss": 0.1161,
      "step": 95500
    },
    {
      "epoch": 3.8006044698958084,
      "eval_loss": 0.14638979732990265,
      "eval_runtime": 33.193,
      "eval_samples_per_second": 15.063,
      "eval_steps_per_second": 7.532,
      "step": 95570
    },
    {
      "epoch": 3.81770460510618,
      "grad_norm": 0.9872569441795349,
      "learning_rate": 7.284183159123469e-05,
      "loss": 0.1257,
      "step": 96000
    },
    {
      "epoch": 3.8375884832577745,
      "grad_norm": 2.7399699687957764,
      "learning_rate": 7.21785960630339e-05,
      "loss": 0.1309,
      "step": 96500
    },
    {
      "epoch": 3.8574723614093696,
      "grad_norm": 1.9438740015029907,
      "learning_rate": 7.151536053483313e-05,
      "loss": 0.1196,
      "step": 97000
    },
    {
      "epoch": 3.8773562395609638,
      "grad_norm": 3.683006763458252,
      "learning_rate": 7.085212500663237e-05,
      "loss": 0.1207,
      "step": 97500
    },
    {
      "epoch": 3.897240117712559,
      "grad_norm": 2.3406124114990234,
      "learning_rate": 7.019154242054439e-05,
      "loss": 0.1202,
      "step": 98000
    },
    {
      "epoch": 3.9171239958641535,
      "grad_norm": 1.3052023649215698,
      "learning_rate": 6.952830689234361e-05,
      "loss": 0.1233,
      "step": 98500
    },
    {
      "epoch": 3.937007874015748,
      "grad_norm": 1.8666225671768188,
      "learning_rate": 6.886507136414284e-05,
      "loss": 0.1217,
      "step": 99000
    },
    {
      "epoch": 3.956891752167343,
      "grad_norm": 1.7979810237884521,
      "learning_rate": 6.820183583594206e-05,
      "loss": 0.1194,
      "step": 99500
    },
    {
      "epoch": 3.9767756303189374,
      "grad_norm": 0.9724803566932678,
      "learning_rate": 6.753860030774129e-05,
      "loss": 0.1276,
      "step": 100000
    },
    {
      "epoch": 3.996659508470532,
      "grad_norm": 3.251120090484619,
      "learning_rate": 6.687536477954052e-05,
      "loss": 0.1165,
      "step": 100500
    },
    {
      "epoch": 4.000636284100851,
      "eval_loss": 0.1412682831287384,
      "eval_runtime": 31.0761,
      "eval_samples_per_second": 16.09,
      "eval_steps_per_second": 8.045,
      "step": 100600
    },
    {
      "epoch": 4.016543386622127,
      "grad_norm": 3.6826343536376953,
      "learning_rate": 6.621212925133974e-05,
      "loss": 0.1046,
      "step": 101000
    },
    {
      "epoch": 4.036427264773722,
      "grad_norm": 5.360718250274658,
      "learning_rate": 6.554889372313896e-05,
      "loss": 0.1032,
      "step": 101500
    },
    {
      "epoch": 4.056311142925316,
      "grad_norm": 1.780490756034851,
      "learning_rate": 6.488698466599459e-05,
      "loss": 0.1014,
      "step": 102000
    },
    {
      "epoch": 4.076195021076911,
      "grad_norm": 2.984694480895996,
      "learning_rate": 6.422374913779382e-05,
      "loss": 0.1044,
      "step": 102500
    },
    {
      "epoch": 4.096078899228505,
      "grad_norm": 0.6237806677818298,
      "learning_rate": 6.356184008064945e-05,
      "loss": 0.1009,
      "step": 103000
    },
    {
      "epoch": 4.1159627773801,
      "grad_norm": 3.3006651401519775,
      "learning_rate": 6.289860455244868e-05,
      "loss": 0.1017,
      "step": 103500
    },
    {
      "epoch": 4.135846655531695,
      "grad_norm": 1.9449480772018433,
      "learning_rate": 6.223536902424789e-05,
      "loss": 0.1043,
      "step": 104000
    },
    {
      "epoch": 4.15573053368329,
      "grad_norm": 2.495291233062744,
      "learning_rate": 6.157213349604713e-05,
      "loss": 0.1024,
      "step": 104500
    },
    {
      "epoch": 4.175614411834884,
      "grad_norm": 0.23489686846733093,
      "learning_rate": 6.090889796784635e-05,
      "loss": 0.1031,
      "step": 105000
    },
    {
      "epoch": 4.195498289986479,
      "grad_norm": 3.0352611541748047,
      "learning_rate": 6.024566243964557e-05,
      "loss": 0.0994,
      "step": 105500
    },
    {
      "epoch": 4.2006680983058935,
      "eval_loss": 0.14581650495529175,
      "eval_runtime": 35.7264,
      "eval_samples_per_second": 13.995,
      "eval_steps_per_second": 6.998,
      "step": 105630
    },
    {
      "epoch": 4.215382168138074,
      "grad_norm": 1.1670618057250977,
      "learning_rate": 5.95824269114448e-05,
      "loss": 0.1045,
      "step": 106000
    },
    {
      "epoch": 4.235266046289668,
      "grad_norm": 2.0033414363861084,
      "learning_rate": 5.891919138324402e-05,
      "loss": 0.1016,
      "step": 106500
    },
    {
      "epoch": 4.255149924441263,
      "grad_norm": 1.4979524612426758,
      "learning_rate": 5.825728232609965e-05,
      "loss": 0.1079,
      "step": 107000
    },
    {
      "epoch": 4.275033802592858,
      "grad_norm": 1.7079240083694458,
      "learning_rate": 5.7594046797898874e-05,
      "loss": 0.1016,
      "step": 107500
    },
    {
      "epoch": 4.294917680744453,
      "grad_norm": 1.1605221033096313,
      "learning_rate": 5.6930811269698107e-05,
      "loss": 0.1008,
      "step": 108000
    },
    {
      "epoch": 4.314801558896047,
      "grad_norm": 1.4162037372589111,
      "learning_rate": 5.6267575741497326e-05,
      "loss": 0.1083,
      "step": 108500
    },
    {
      "epoch": 4.334685437047642,
      "grad_norm": 2.373020887374878,
      "learning_rate": 5.5604340213296545e-05,
      "loss": 0.1073,
      "step": 109000
    },
    {
      "epoch": 4.354569315199236,
      "grad_norm": 2.836911916732788,
      "learning_rate": 5.494375762720858e-05,
      "loss": 0.1019,
      "step": 109500
    },
    {
      "epoch": 4.374453193350831,
      "grad_norm": 3.7747974395751953,
      "learning_rate": 5.42805220990078e-05,
      "loss": 0.1038,
      "step": 110000
    },
    {
      "epoch": 4.3943370715024255,
      "grad_norm": 1.4646281003952026,
      "learning_rate": 5.361728657080703e-05,
      "loss": 0.1113,
      "step": 110500
    },
    {
      "epoch": 4.4006999125109365,
      "eval_loss": 0.14834672212600708,
      "eval_runtime": 37.228,
      "eval_samples_per_second": 13.431,
      "eval_steps_per_second": 6.715,
      "step": 110660
    },
    {
      "epoch": 4.414220949654021,
      "grad_norm": 2.5483474731445312,
      "learning_rate": 5.295405104260626e-05,
      "loss": 0.1006,
      "step": 111000
    },
    {
      "epoch": 4.434104827805616,
      "grad_norm": 2.5747101306915283,
      "learning_rate": 5.229081551440548e-05,
      "loss": 0.1082,
      "step": 111500
    },
    {
      "epoch": 4.45398870595721,
      "grad_norm": 0.23354358971118927,
      "learning_rate": 5.16275799862047e-05,
      "loss": 0.1032,
      "step": 112000
    },
    {
      "epoch": 4.473872584108805,
      "grad_norm": 2.4420604705810547,
      "learning_rate": 5.096434445800393e-05,
      "loss": 0.106,
      "step": 112500
    },
    {
      "epoch": 4.493756462260399,
      "grad_norm": 1.2000492811203003,
      "learning_rate": 5.030110892980315e-05,
      "loss": 0.0961,
      "step": 113000
    },
    {
      "epoch": 4.513640340411994,
      "grad_norm": 1.8864790201187134,
      "learning_rate": 4.963919987265878e-05,
      "loss": 0.0978,
      "step": 113500
    },
    {
      "epoch": 4.5335242185635884,
      "grad_norm": 0.45711418986320496,
      "learning_rate": 4.897596434445801e-05,
      "loss": 0.094,
      "step": 114000
    },
    {
      "epoch": 4.5534080967151835,
      "grad_norm": 7.053537845611572,
      "learning_rate": 4.831272881625723e-05,
      "loss": 0.1,
      "step": 114500
    },
    {
      "epoch": 4.573291974866778,
      "grad_norm": 3.276334047317505,
      "learning_rate": 4.765081975911286e-05,
      "loss": 0.0996,
      "step": 115000
    },
    {
      "epoch": 4.593175853018373,
      "grad_norm": 1.4245741367340088,
      "learning_rate": 4.6987584230912085e-05,
      "loss": 0.0996,
      "step": 115500
    },
    {
      "epoch": 4.600731726715979,
      "eval_loss": 0.14262481033802032,
      "eval_runtime": 32.3953,
      "eval_samples_per_second": 15.434,
      "eval_steps_per_second": 7.717,
      "step": 115690
    },
    {
      "epoch": 4.613059731169967,
      "grad_norm": 2.3254101276397705,
      "learning_rate": 4.6324348702711304e-05,
      "loss": 0.1017,
      "step": 116000
    },
    {
      "epoch": 4.632943609321562,
      "grad_norm": 1.167358636856079,
      "learning_rate": 4.566111317451054e-05,
      "loss": 0.0981,
      "step": 116500
    },
    {
      "epoch": 4.652827487473157,
      "grad_norm": 4.5853376388549805,
      "learning_rate": 4.499787764630976e-05,
      "loss": 0.1016,
      "step": 117000
    },
    {
      "epoch": 4.672711365624751,
      "grad_norm": 1.4917422533035278,
      "learning_rate": 4.433464211810899e-05,
      "loss": 0.1018,
      "step": 117500
    },
    {
      "epoch": 4.6925952437763465,
      "grad_norm": 0.012467560358345509,
      "learning_rate": 4.367273306096461e-05,
      "loss": 0.0975,
      "step": 118000
    },
    {
      "epoch": 4.712479121927941,
      "grad_norm": 0.8470689058303833,
      "learning_rate": 4.3009497532763836e-05,
      "loss": 0.0965,
      "step": 118500
    },
    {
      "epoch": 4.732363000079536,
      "grad_norm": 0.43946540355682373,
      "learning_rate": 4.234626200456306e-05,
      "loss": 0.1014,
      "step": 119000
    },
    {
      "epoch": 4.75224687823113,
      "grad_norm": 2.230210304260254,
      "learning_rate": 4.168302647636229e-05,
      "loss": 0.0996,
      "step": 119500
    },
    {
      "epoch": 4.772130756382725,
      "grad_norm": 3.4537596702575684,
      "learning_rate": 4.1019790948161514e-05,
      "loss": 0.0966,
      "step": 120000
    },
    {
      "epoch": 4.792014634534319,
      "grad_norm": 1.7490534782409668,
      "learning_rate": 4.035655541996074e-05,
      "loss": 0.1116,
      "step": 120500
    },
    {
      "epoch": 4.800763540921022,
      "eval_loss": 0.1392485797405243,
      "eval_runtime": 34.3723,
      "eval_samples_per_second": 14.547,
      "eval_steps_per_second": 7.273,
      "step": 120720
    },
    {
      "epoch": 4.811898512685914,
      "grad_norm": 0.9216163158416748,
      "learning_rate": 3.969331989175996e-05,
      "loss": 0.0965,
      "step": 121000
    },
    {
      "epoch": 4.831782390837509,
      "grad_norm": 4.56651496887207,
      "learning_rate": 3.9030084363559186e-05,
      "loss": 0.0965,
      "step": 121500
    },
    {
      "epoch": 4.851666268989104,
      "grad_norm": 1.2061914205551147,
      "learning_rate": 3.8368175306414814e-05,
      "loss": 0.1059,
      "step": 122000
    },
    {
      "epoch": 4.871550147140699,
      "grad_norm": 1.8895795345306396,
      "learning_rate": 3.770493977821405e-05,
      "loss": 0.0988,
      "step": 122500
    },
    {
      "epoch": 4.891434025292293,
      "grad_norm": 2.730050563812256,
      "learning_rate": 3.7041704250013266e-05,
      "loss": 0.1022,
      "step": 123000
    },
    {
      "epoch": 4.911317903443888,
      "grad_norm": 2.6674962043762207,
      "learning_rate": 3.637846872181249e-05,
      "loss": 0.0966,
      "step": 123500
    },
    {
      "epoch": 4.931201781595482,
      "grad_norm": 5.602296352386475,
      "learning_rate": 3.571655966466812e-05,
      "loss": 0.0944,
      "step": 124000
    },
    {
      "epoch": 4.951085659747077,
      "grad_norm": 2.389179229736328,
      "learning_rate": 3.505332413646734e-05,
      "loss": 0.0975,
      "step": 124500
    },
    {
      "epoch": 4.970969537898672,
      "grad_norm": 3.1410694122314453,
      "learning_rate": 3.4390088608266565e-05,
      "loss": 0.0923,
      "step": 125000
    },
    {
      "epoch": 4.990853416050267,
      "grad_norm": 0.3184365928173065,
      "learning_rate": 3.37268530800658e-05,
      "loss": 0.1004,
      "step": 125500
    },
    {
      "epoch": 5.000795355126064,
      "eval_loss": 0.13804183900356293,
      "eval_runtime": 34.1527,
      "eval_samples_per_second": 14.64,
      "eval_steps_per_second": 7.32,
      "step": 125750
    },
    {
      "epoch": 5.010737294201861,
      "grad_norm": 1.9156638383865356,
      "learning_rate": 3.3063617551865024e-05,
      "loss": 0.0899,
      "step": 126000
    },
    {
      "epoch": 5.030621172353456,
      "grad_norm": 1.1794829368591309,
      "learning_rate": 3.2401708494720645e-05,
      "loss": 0.0762,
      "step": 126500
    },
    {
      "epoch": 5.05050505050505,
      "grad_norm": 2.2873692512512207,
      "learning_rate": 3.173847296651987e-05,
      "loss": 0.0823,
      "step": 127000
    },
    {
      "epoch": 5.070388928656645,
      "grad_norm": 2.4998435974121094,
      "learning_rate": 3.10752374383191e-05,
      "loss": 0.0747,
      "step": 127500
    },
    {
      "epoch": 5.0902728068082395,
      "grad_norm": 3.9110782146453857,
      "learning_rate": 3.041200191011832e-05,
      "loss": 0.0853,
      "step": 128000
    },
    {
      "epoch": 5.1101566849598345,
      "grad_norm": 3.3292062282562256,
      "learning_rate": 2.9748766381917546e-05,
      "loss": 0.0839,
      "step": 128500
    },
    {
      "epoch": 5.13004056311143,
      "grad_norm": 4.433184623718262,
      "learning_rate": 2.9085530853716776e-05,
      "loss": 0.0802,
      "step": 129000
    },
    {
      "epoch": 5.149924441263024,
      "grad_norm": 3.4676051139831543,
      "learning_rate": 2.8422295325515995e-05,
      "loss": 0.0838,
      "step": 129500
    },
    {
      "epoch": 5.169808319414619,
      "grad_norm": 1.5169734954833984,
      "learning_rate": 2.7759059797315225e-05,
      "loss": 0.0783,
      "step": 130000
    },
    {
      "epoch": 5.189692197566213,
      "grad_norm": 1.637904405593872,
      "learning_rate": 2.709582426911445e-05,
      "loss": 0.0814,
      "step": 130500
    },
    {
      "epoch": 5.200827169331106,
      "eval_loss": 0.141804039478302,
      "eval_runtime": 32.6973,
      "eval_samples_per_second": 15.292,
      "eval_steps_per_second": 7.646,
      "step": 130780
    },
    {
      "epoch": 5.209576075717808,
      "grad_norm": 2.086423873901367,
      "learning_rate": 2.643391521197008e-05,
      "loss": 0.0831,
      "step": 131000
    },
    {
      "epoch": 5.229459953869402,
      "grad_norm": 0.6672555804252625,
      "learning_rate": 2.57720061548257e-05,
      "loss": 0.0862,
      "step": 131500
    },
    {
      "epoch": 5.2493438320209975,
      "grad_norm": 0.04646410793066025,
      "learning_rate": 2.510877062662493e-05,
      "loss": 0.0792,
      "step": 132000
    },
    {
      "epoch": 5.269227710172592,
      "grad_norm": 4.182920932769775,
      "learning_rate": 2.4445535098424155e-05,
      "loss": 0.0802,
      "step": 132500
    },
    {
      "epoch": 5.289111588324187,
      "grad_norm": 3.0408833026885986,
      "learning_rate": 2.3782299570223378e-05,
      "loss": 0.0836,
      "step": 133000
    },
    {
      "epoch": 5.308995466475781,
      "grad_norm": 1.9850901365280151,
      "learning_rate": 2.3120390513079006e-05,
      "loss": 0.0779,
      "step": 133500
    },
    {
      "epoch": 5.328879344627376,
      "grad_norm": 2.1532881259918213,
      "learning_rate": 2.2457154984878232e-05,
      "loss": 0.0813,
      "step": 134000
    },
    {
      "epoch": 5.348763222778971,
      "grad_norm": 2.2527785301208496,
      "learning_rate": 2.1793919456677455e-05,
      "loss": 0.0761,
      "step": 134500
    },
    {
      "epoch": 5.368647100930565,
      "grad_norm": 1.829942226409912,
      "learning_rate": 2.1130683928476684e-05,
      "loss": 0.0813,
      "step": 135000
    },
    {
      "epoch": 5.3885309790821605,
      "grad_norm": 2.5190794467926025,
      "learning_rate": 2.046877487133231e-05,
      "loss": 0.086,
      "step": 135500
    },
    {
      "epoch": 5.400858983536149,
      "eval_loss": 0.1405518800020218,
      "eval_runtime": 32.4801,
      "eval_samples_per_second": 15.394,
      "eval_steps_per_second": 7.697,
      "step": 135810
    },
    {
      "epoch": 5.408414857233755,
      "grad_norm": 0.9381058812141418,
      "learning_rate": 1.9805539343131535e-05,
      "loss": 0.0769,
      "step": 136000
    },
    {
      "epoch": 5.42829873538535,
      "grad_norm": 3.986593246459961,
      "learning_rate": 1.914230381493076e-05,
      "loss": 0.08,
      "step": 136500
    },
    {
      "epoch": 5.448182613536944,
      "grad_norm": 1.4495145082473755,
      "learning_rate": 1.848039475778639e-05,
      "loss": 0.0787,
      "step": 137000
    },
    {
      "epoch": 5.468066491688539,
      "grad_norm": 2.496209144592285,
      "learning_rate": 1.781715922958561e-05,
      "loss": 0.0808,
      "step": 137500
    },
    {
      "epoch": 5.487950369840133,
      "grad_norm": 1.4230667352676392,
      "learning_rate": 1.7153923701384837e-05,
      "loss": 0.0818,
      "step": 138000
    },
    {
      "epoch": 5.507834247991728,
      "grad_norm": 0.07668805867433548,
      "learning_rate": 1.6490688173184063e-05,
      "loss": 0.0786,
      "step": 138500
    },
    {
      "epoch": 5.5277181261433235,
      "grad_norm": 2.7259907722473145,
      "learning_rate": 1.5827452644983286e-05,
      "loss": 0.0839,
      "step": 139000
    },
    {
      "epoch": 5.547602004294918,
      "grad_norm": 1.5834600925445557,
      "learning_rate": 1.5164217116782512e-05,
      "loss": 0.0807,
      "step": 139500
    },
    {
      "epoch": 5.567485882446512,
      "grad_norm": 1.825166940689087,
      "learning_rate": 1.4500981588581738e-05,
      "loss": 0.0786,
      "step": 140000
    },
    {
      "epoch": 5.587369760598107,
      "grad_norm": 0.33820098638534546,
      "learning_rate": 1.3837746060380963e-05,
      "loss": 0.0731,
      "step": 140500
    },
    {
      "epoch": 5.600890797741192,
      "eval_loss": 0.1415776163339615,
      "eval_runtime": 32.1699,
      "eval_samples_per_second": 15.542,
      "eval_steps_per_second": 7.771,
      "step": 140840
    },
    {
      "epoch": 5.607253638749702,
      "grad_norm": 0.2709919214248657,
      "learning_rate": 1.317583700323659e-05,
      "loss": 0.0755,
      "step": 141000
    },
    {
      "epoch": 5.627137516901296,
      "grad_norm": 2.084345817565918,
      "learning_rate": 1.2512601475035815e-05,
      "loss": 0.0784,
      "step": 141500
    },
    {
      "epoch": 5.647021395052891,
      "grad_norm": 1.5787373781204224,
      "learning_rate": 1.1849365946835041e-05,
      "loss": 0.0724,
      "step": 142000
    },
    {
      "epoch": 5.666905273204486,
      "grad_norm": 2.0165510177612305,
      "learning_rate": 1.1186130418634267e-05,
      "loss": 0.0814,
      "step": 142500
    },
    {
      "epoch": 5.686789151356081,
      "grad_norm": 1.0443742275238037,
      "learning_rate": 1.052289489043349e-05,
      "loss": 0.0749,
      "step": 143000
    },
    {
      "epoch": 5.706673029507675,
      "grad_norm": 0.3988170921802521,
      "learning_rate": 9.859659362232716e-06,
      "loss": 0.0805,
      "step": 143500
    },
    {
      "epoch": 5.72655690765927,
      "grad_norm": 1.6842561960220337,
      "learning_rate": 9.196423834031942e-06,
      "loss": 0.0859,
      "step": 144000
    },
    {
      "epoch": 5.746440785810865,
      "grad_norm": 3.893829584121704,
      "learning_rate": 8.533188305831168e-06,
      "loss": 0.0834,
      "step": 144500
    },
    {
      "epoch": 5.766324663962459,
      "grad_norm": 3.017742156982422,
      "learning_rate": 7.871279248686794e-06,
      "loss": 0.0767,
      "step": 145000
    },
    {
      "epoch": 5.786208542114053,
      "grad_norm": 1.0755605697631836,
      "learning_rate": 7.2080437204860195e-06,
      "loss": 0.0807,
      "step": 145500
    },
    {
      "epoch": 5.800922611946234,
      "eval_loss": 0.14066436886787415,
      "eval_runtime": 37.1151,
      "eval_samples_per_second": 13.472,
      "eval_steps_per_second": 6.736,
      "step": 145870
    },
    {
      "epoch": 5.8060924202656485,
      "grad_norm": 2.315115451812744,
      "learning_rate": 6.544808192285244e-06,
      "loss": 0.0737,
      "step": 146000
    },
    {
      "epoch": 5.825976298417244,
      "grad_norm": 1.7753525972366333,
      "learning_rate": 5.88157266408447e-06,
      "loss": 0.0781,
      "step": 146500
    },
    {
      "epoch": 5.845860176568838,
      "grad_norm": 0.942420244216919,
      "learning_rate": 5.218337135883695e-06,
      "loss": 0.0785,
      "step": 147000
    },
    {
      "epoch": 5.865744054720433,
      "grad_norm": 1.6908499002456665,
      "learning_rate": 4.5551016076829205e-06,
      "loss": 0.0773,
      "step": 147500
    },
    {
      "epoch": 5.885627932872027,
      "grad_norm": 3.541743755340576,
      "learning_rate": 3.891866079482146e-06,
      "loss": 0.082,
      "step": 148000
    },
    {
      "epoch": 5.905511811023622,
      "grad_norm": 1.41182541847229,
      "learning_rate": 3.228630551281371e-06,
      "loss": 0.0755,
      "step": 148500
    },
    {
      "epoch": 5.925395689175216,
      "grad_norm": 1.457543134689331,
      "learning_rate": 2.5653950230805963e-06,
      "loss": 0.0721,
      "step": 149000
    },
    {
      "epoch": 5.9452795673268115,
      "grad_norm": 1.3256011009216309,
      "learning_rate": 1.9034859659362233e-06,
      "loss": 0.0756,
      "step": 149500
    },
    {
      "epoch": 5.965163445478407,
      "grad_norm": 4.112858295440674,
      "learning_rate": 1.2402504377354487e-06,
      "loss": 0.0792,
      "step": 150000
    },
    {
      "epoch": 5.985047323630001,
      "grad_norm": 9.184978485107422,
      "learning_rate": 5.77014909534674e-07,
      "loss": 0.0763,
      "step": 150500
    },
    {
      "epoch": 6.0,
      "step": 150876,
      "total_flos": 9.584515856152166e+17,
      "train_loss": 0.1514410440879876,
      "train_runtime": 79770.8244,
      "train_samples_per_second": 3.783,
      "train_steps_per_second": 1.891
    }
  ],
  "logging_steps": 500,
  "max_steps": 150876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.584515856152166e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}