{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994683678894205, "eval_steps": 500, "global_step": 940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01063264221158958, "grad_norm": 5.609270095825195, "learning_rate": 0.00019808510638297873, "loss": 2.4434, "step": 10 }, { "epoch": 0.02126528442317916, "grad_norm": 4.589075088500977, "learning_rate": 0.00019595744680851065, "loss": 1.6902, "step": 20 }, { "epoch": 0.03189792663476874, "grad_norm": 3.7465291023254395, "learning_rate": 0.00019382978723404257, "loss": 1.3148, "step": 30 }, { "epoch": 0.04253056884635832, "grad_norm": 3.543064594268799, "learning_rate": 0.00019170212765957448, "loss": 1.4302, "step": 40 }, { "epoch": 0.0531632110579479, "grad_norm": 2.68544340133667, "learning_rate": 0.0001895744680851064, "loss": 1.3222, "step": 50 }, { "epoch": 0.06379585326953748, "grad_norm": 2.752901792526245, "learning_rate": 0.00018744680851063832, "loss": 1.2792, "step": 60 }, { "epoch": 0.07442849548112707, "grad_norm": 2.7944841384887695, "learning_rate": 0.0001853191489361702, "loss": 1.3764, "step": 70 }, { "epoch": 0.08506113769271664, "grad_norm": 3.0340654850006104, "learning_rate": 0.00018319148936170215, "loss": 1.2255, "step": 80 }, { "epoch": 0.09569377990430622, "grad_norm": 2.5017054080963135, "learning_rate": 0.00018106382978723404, "loss": 1.1689, "step": 90 }, { "epoch": 0.1063264221158958, "grad_norm": 4.572251319885254, "learning_rate": 0.00017893617021276596, "loss": 1.1418, "step": 100 }, { "epoch": 0.11695906432748537, "grad_norm": 3.354853630065918, "learning_rate": 0.00017680851063829787, "loss": 1.3103, "step": 110 }, { "epoch": 0.12759170653907495, "grad_norm": 2.387272834777832, "learning_rate": 0.0001746808510638298, "loss": 1.2848, "step": 120 }, { "epoch": 0.13822434875066453, "grad_norm": 2.579465627670288, "learning_rate": 0.0001725531914893617, "loss": 1.2395, "step": 130 }, { "epoch": 0.14885699096225413, "grad_norm": 2.9512267112731934, "learning_rate": 0.00017042553191489362, "loss": 1.3716, "step": 140 }, { "epoch": 0.1594896331738437, "grad_norm": 2.6200809478759766, "learning_rate": 0.00016829787234042554, "loss": 1.2647, "step": 150 }, { "epoch": 0.17012227538543329, "grad_norm": 2.7764666080474854, "learning_rate": 0.00016617021276595746, "loss": 1.0862, "step": 160 }, { "epoch": 0.18075491759702286, "grad_norm": 2.454061269760132, "learning_rate": 0.00016404255319148937, "loss": 1.2213, "step": 170 }, { "epoch": 0.19138755980861244, "grad_norm": 2.483651876449585, "learning_rate": 0.0001619148936170213, "loss": 1.1895, "step": 180 }, { "epoch": 0.20202020202020202, "grad_norm": 3.5856575965881348, "learning_rate": 0.0001597872340425532, "loss": 1.2322, "step": 190 }, { "epoch": 0.2126528442317916, "grad_norm": 2.1436448097229004, "learning_rate": 0.00015765957446808512, "loss": 1.2782, "step": 200 }, { "epoch": 0.22328548644338117, "grad_norm": 2.569831609725952, "learning_rate": 0.00015553191489361701, "loss": 1.171, "step": 210 }, { "epoch": 0.23391812865497075, "grad_norm": 2.5455546379089355, "learning_rate": 0.00015340425531914896, "loss": 1.3055, "step": 220 }, { "epoch": 0.24455077086656035, "grad_norm": 1.7153586149215698, "learning_rate": 0.00015127659574468085, "loss": 1.0822, "step": 230 }, { "epoch": 0.2551834130781499, "grad_norm": 2.549631357192993, "learning_rate": 0.00014914893617021276, "loss": 1.3206, "step": 240 }, { "epoch": 0.2658160552897395, 
"grad_norm": 2.53717041015625, "learning_rate": 0.00014702127659574468, "loss": 1.1995, "step": 250 }, { "epoch": 0.27644869750132905, "grad_norm": 2.331685781478882, "learning_rate": 0.0001448936170212766, "loss": 1.2426, "step": 260 }, { "epoch": 0.28708133971291866, "grad_norm": 2.6866092681884766, "learning_rate": 0.00014276595744680851, "loss": 1.1314, "step": 270 }, { "epoch": 0.29771398192450826, "grad_norm": 2.107909679412842, "learning_rate": 0.00014063829787234043, "loss": 1.1542, "step": 280 }, { "epoch": 0.3083466241360978, "grad_norm": 1.8758138418197632, "learning_rate": 0.00013851063829787235, "loss": 1.1377, "step": 290 }, { "epoch": 0.3189792663476874, "grad_norm": 1.647929072380066, "learning_rate": 0.00013638297872340427, "loss": 1.0096, "step": 300 }, { "epoch": 0.32961190855927697, "grad_norm": 2.186124563217163, "learning_rate": 0.00013425531914893618, "loss": 1.1746, "step": 310 }, { "epoch": 0.34024455077086657, "grad_norm": 2.4536380767822266, "learning_rate": 0.0001321276595744681, "loss": 1.1833, "step": 320 }, { "epoch": 0.3508771929824561, "grad_norm": 1.8024215698242188, "learning_rate": 0.00013000000000000002, "loss": 0.9309, "step": 330 }, { "epoch": 0.3615098351940457, "grad_norm": 3.0355000495910645, "learning_rate": 0.0001278723404255319, "loss": 1.0863, "step": 340 }, { "epoch": 0.3721424774056353, "grad_norm": 1.9415550231933594, "learning_rate": 0.00012574468085106382, "loss": 1.0507, "step": 350 }, { "epoch": 0.3827751196172249, "grad_norm": 2.327995538711548, "learning_rate": 0.00012361702127659577, "loss": 1.2524, "step": 360 }, { "epoch": 0.3934077618288145, "grad_norm": 2.001037120819092, "learning_rate": 0.00012148936170212766, "loss": 1.0437, "step": 370 }, { "epoch": 0.40404040404040403, "grad_norm": 2.1419551372528076, "learning_rate": 0.00011936170212765959, "loss": 1.0968, "step": 380 }, { "epoch": 0.41467304625199364, "grad_norm": 2.3085482120513916, "learning_rate": 0.0001172340425531915, "loss": 1.1706, "step": 390 }, { "epoch": 0.4253056884635832, "grad_norm": 4.618401050567627, "learning_rate": 0.0001151063829787234, "loss": 1.0685, "step": 400 }, { "epoch": 0.4359383306751728, "grad_norm": 2.421363115310669, "learning_rate": 0.00011297872340425532, "loss": 1.0373, "step": 410 }, { "epoch": 0.44657097288676234, "grad_norm": 1.6373859643936157, "learning_rate": 0.00011085106382978725, "loss": 1.0669, "step": 420 }, { "epoch": 0.45720361509835195, "grad_norm": 2.3031554222106934, "learning_rate": 0.00010872340425531916, "loss": 1.1394, "step": 430 }, { "epoch": 0.4678362573099415, "grad_norm": 1.9488067626953125, "learning_rate": 0.00010659574468085107, "loss": 1.0347, "step": 440 }, { "epoch": 0.4784688995215311, "grad_norm": 1.8650946617126465, "learning_rate": 0.00010446808510638298, "loss": 1.1159, "step": 450 }, { "epoch": 0.4891015417331207, "grad_norm": 1.8462837934494019, "learning_rate": 0.0001023404255319149, "loss": 1.0389, "step": 460 }, { "epoch": 0.49973418394471025, "grad_norm": 2.5941386222839355, "learning_rate": 0.00010021276595744682, "loss": 1.1587, "step": 470 }, { "epoch": 0.5103668261562998, "grad_norm": 1.34873366355896, "learning_rate": 9.808510638297873e-05, "loss": 1.1095, "step": 480 }, { "epoch": 0.5209994683678895, "grad_norm": 2.2580478191375732, "learning_rate": 9.595744680851064e-05, "loss": 1.1268, "step": 490 }, { "epoch": 0.531632110579479, "grad_norm": 2.389127731323242, "learning_rate": 9.382978723404256e-05, "loss": 1.2718, "step": 500 }, { "epoch": 0.5422647527910686, "grad_norm": 
2.1379384994506836, "learning_rate": 9.170212765957448e-05, "loss": 1.0394, "step": 510 }, { "epoch": 0.5528973950026581, "grad_norm": 2.5996925830841064, "learning_rate": 8.95744680851064e-05, "loss": 1.0508, "step": 520 }, { "epoch": 0.5635300372142478, "grad_norm": 2.143913984298706, "learning_rate": 8.74468085106383e-05, "loss": 1.0038, "step": 530 }, { "epoch": 0.5741626794258373, "grad_norm": 2.285888910293579, "learning_rate": 8.531914893617021e-05, "loss": 1.2064, "step": 540 }, { "epoch": 0.5847953216374269, "grad_norm": 2.3260293006896973, "learning_rate": 8.319148936170213e-05, "loss": 1.0499, "step": 550 }, { "epoch": 0.5954279638490165, "grad_norm": 2.3417248725891113, "learning_rate": 8.106382978723405e-05, "loss": 1.1371, "step": 560 }, { "epoch": 0.6060606060606061, "grad_norm": 2.194345474243164, "learning_rate": 7.893617021276596e-05, "loss": 1.0571, "step": 570 }, { "epoch": 0.6166932482721956, "grad_norm": 2.3759639263153076, "learning_rate": 7.680851063829788e-05, "loss": 0.9709, "step": 580 }, { "epoch": 0.6273258904837852, "grad_norm": 1.7851307392120361, "learning_rate": 7.46808510638298e-05, "loss": 1.0751, "step": 590 }, { "epoch": 0.6379585326953748, "grad_norm": 2.1073718070983887, "learning_rate": 7.25531914893617e-05, "loss": 1.0453, "step": 600 }, { "epoch": 0.6485911749069644, "grad_norm": 3.0715222358703613, "learning_rate": 7.042553191489362e-05, "loss": 1.019, "step": 610 }, { "epoch": 0.6592238171185539, "grad_norm": 2.7208268642425537, "learning_rate": 6.829787234042554e-05, "loss": 0.919, "step": 620 }, { "epoch": 0.6698564593301436, "grad_norm": 1.7897045612335205, "learning_rate": 6.617021276595745e-05, "loss": 0.9964, "step": 630 }, { "epoch": 0.6804891015417331, "grad_norm": 2.317929744720459, "learning_rate": 6.404255319148937e-05, "loss": 1.1598, "step": 640 }, { "epoch": 0.6911217437533227, "grad_norm": 1.826894760131836, "learning_rate": 6.191489361702127e-05, "loss": 1.117, "step": 650 }, { "epoch": 0.7017543859649122, "grad_norm": 2.0165112018585205, "learning_rate": 5.9787234042553196e-05, "loss": 1.0511, "step": 660 }, { "epoch": 0.7123870281765019, "grad_norm": 1.6636179685592651, "learning_rate": 5.7659574468085106e-05, "loss": 1.0488, "step": 670 }, { "epoch": 0.7230196703880915, "grad_norm": 2.3491950035095215, "learning_rate": 5.553191489361702e-05, "loss": 1.2297, "step": 680 }, { "epoch": 0.733652312599681, "grad_norm": 2.28796124458313, "learning_rate": 5.3404255319148946e-05, "loss": 1.1457, "step": 690 }, { "epoch": 0.7442849548112705, "grad_norm": 2.550320863723755, "learning_rate": 5.1276595744680856e-05, "loss": 1.069, "step": 700 }, { "epoch": 0.7549175970228602, "grad_norm": 1.5172102451324463, "learning_rate": 4.9148936170212766e-05, "loss": 0.849, "step": 710 }, { "epoch": 0.7655502392344498, "grad_norm": 1.7714675664901733, "learning_rate": 4.702127659574468e-05, "loss": 1.1457, "step": 720 }, { "epoch": 0.7761828814460393, "grad_norm": 1.587169885635376, "learning_rate": 4.489361702127659e-05, "loss": 0.9438, "step": 730 }, { "epoch": 0.786815523657629, "grad_norm": 2.464047908782959, "learning_rate": 4.276595744680851e-05, "loss": 1.0606, "step": 740 }, { "epoch": 0.7974481658692185, "grad_norm": 1.6491392850875854, "learning_rate": 4.063829787234043e-05, "loss": 1.0333, "step": 750 }, { "epoch": 0.8080808080808081, "grad_norm": 2.159282684326172, "learning_rate": 3.8510638297872344e-05, "loss": 0.9192, "step": 760 }, { "epoch": 0.8187134502923976, "grad_norm": 1.6473966836929321, "learning_rate": 3.638297872340426e-05, 
"loss": 1.0218, "step": 770 }, { "epoch": 0.8293460925039873, "grad_norm": 2.5140249729156494, "learning_rate": 3.425531914893617e-05, "loss": 1.1425, "step": 780 }, { "epoch": 0.8399787347155768, "grad_norm": 1.8191956281661987, "learning_rate": 3.212765957446809e-05, "loss": 1.0179, "step": 790 }, { "epoch": 0.8506113769271664, "grad_norm": 1.570918083190918, "learning_rate": 3e-05, "loss": 1.0624, "step": 800 }, { "epoch": 0.861244019138756, "grad_norm": 2.4648308753967285, "learning_rate": 2.7872340425531918e-05, "loss": 1.0768, "step": 810 }, { "epoch": 0.8718766613503456, "grad_norm": 2.4284791946411133, "learning_rate": 2.574468085106383e-05, "loss": 1.0748, "step": 820 }, { "epoch": 0.8825093035619351, "grad_norm": 2.543541193008423, "learning_rate": 2.3617021276595748e-05, "loss": 1.051, "step": 830 }, { "epoch": 0.8931419457735247, "grad_norm": 2.0287232398986816, "learning_rate": 2.148936170212766e-05, "loss": 0.9987, "step": 840 }, { "epoch": 0.9037745879851143, "grad_norm": 2.2504048347473145, "learning_rate": 1.9361702127659575e-05, "loss": 0.9468, "step": 850 }, { "epoch": 0.9144072301967039, "grad_norm": 1.889223337173462, "learning_rate": 1.723404255319149e-05, "loss": 1.1471, "step": 860 }, { "epoch": 0.9250398724082934, "grad_norm": 2.414099931716919, "learning_rate": 1.5106382978723405e-05, "loss": 1.0941, "step": 870 }, { "epoch": 0.935672514619883, "grad_norm": 1.7655644416809082, "learning_rate": 1.2978723404255318e-05, "loss": 1.0504, "step": 880 }, { "epoch": 0.9463051568314726, "grad_norm": 1.6641113758087158, "learning_rate": 1.0851063829787235e-05, "loss": 1.1144, "step": 890 }, { "epoch": 0.9569377990430622, "grad_norm": 2.2806735038757324, "learning_rate": 8.72340425531915e-06, "loss": 1.0884, "step": 900 }, { "epoch": 0.9675704412546517, "grad_norm": 2.1201162338256836, "learning_rate": 6.595744680851064e-06, "loss": 1.0405, "step": 910 }, { "epoch": 0.9782030834662414, "grad_norm": 1.651154637336731, "learning_rate": 4.468085106382979e-06, "loss": 0.9533, "step": 920 }, { "epoch": 0.988835725677831, "grad_norm": 2.6276893615722656, "learning_rate": 2.3404255319148935e-06, "loss": 1.0675, "step": 930 }, { "epoch": 0.9994683678894205, "grad_norm": 1.7685601711273193, "learning_rate": 2.1276595744680852e-07, "loss": 0.8962, "step": 940 } ], "logging_steps": 10, "max_steps": 940, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2872735048949760.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }