{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6722689075630253, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01680672268907563, "grad_norm": 0.9195247888565063, "learning_rate": 0.0002, "loss": 2.2044, "mean_token_accuracy": 0.5819655358791351, "num_tokens": 280.0, "step": 1 }, { "epoch": 0.03361344537815126, "grad_norm": 0.7940512895584106, "learning_rate": 0.00019800000000000002, "loss": 2.3089, "mean_token_accuracy": 0.5964397341012955, "num_tokens": 551.0, "step": 2 }, { "epoch": 0.05042016806722689, "grad_norm": 0.7546899914741516, "learning_rate": 0.000196, "loss": 1.8231, "mean_token_accuracy": 0.6352723836898804, "num_tokens": 862.0, "step": 3 }, { "epoch": 0.06722689075630252, "grad_norm": 1.136237621307373, "learning_rate": 0.000194, "loss": 2.1194, "mean_token_accuracy": 0.5928251147270203, "num_tokens": 1092.0, "step": 4 }, { "epoch": 0.08403361344537816, "grad_norm": 1.2270488739013672, "learning_rate": 0.000192, "loss": 1.8441, "mean_token_accuracy": 0.6453376561403275, "num_tokens": 1334.0, "step": 5 }, { "epoch": 0.10084033613445378, "grad_norm": 1.1615289449691772, "learning_rate": 0.00019, "loss": 1.8065, "mean_token_accuracy": 0.6384429484605789, "num_tokens": 1572.0, "step": 6 }, { "epoch": 0.11764705882352941, "grad_norm": 1.291213870048523, "learning_rate": 0.000188, "loss": 1.433, "mean_token_accuracy": 0.6760559976100922, "num_tokens": 1834.0, "step": 7 }, { "epoch": 0.13445378151260504, "grad_norm": 1.267329216003418, "learning_rate": 0.00018600000000000002, "loss": 1.2454, "mean_token_accuracy": 0.7190766334533691, "num_tokens": 2102.0, "step": 8 }, { "epoch": 0.15126050420168066, "grad_norm": 1.332841396331787, "learning_rate": 0.00018400000000000003, "loss": 1.3004, "mean_token_accuracy": 0.7266262024641037, "num_tokens": 2372.0, "step": 9 }, { "epoch": 0.16806722689075632, "grad_norm": 1.5956121683120728, "learning_rate": 0.000182, "loss": 1.1887, "mean_token_accuracy": 0.7487049698829651, "num_tokens": 2635.0, "step": 10 }, { "epoch": 0.18487394957983194, "grad_norm": 1.5509591102600098, "learning_rate": 0.00018, "loss": 1.0526, "mean_token_accuracy": 0.7587652802467346, "num_tokens": 2876.0, "step": 11 }, { "epoch": 0.20168067226890757, "grad_norm": 1.5047630071640015, "learning_rate": 0.00017800000000000002, "loss": 0.9594, "mean_token_accuracy": 0.7663996070623398, "num_tokens": 3092.0, "step": 12 }, { "epoch": 0.2184873949579832, "grad_norm": 1.444846510887146, "learning_rate": 0.00017600000000000002, "loss": 0.6974, "mean_token_accuracy": 0.8532201200723648, "num_tokens": 3363.0, "step": 13 }, { "epoch": 0.23529411764705882, "grad_norm": 1.386878252029419, "learning_rate": 0.000174, "loss": 0.7017, "mean_token_accuracy": 0.8347512483596802, "num_tokens": 3649.0, "step": 14 }, { "epoch": 0.25210084033613445, "grad_norm": 1.596093773841858, "learning_rate": 0.000172, "loss": 0.6279, "mean_token_accuracy": 0.8388981074094772, "num_tokens": 3870.0, "step": 15 }, { "epoch": 0.2689075630252101, "grad_norm": 1.8912463188171387, "learning_rate": 0.00017, "loss": 0.533, "mean_token_accuracy": 0.8526211529970169, "num_tokens": 4139.0, "step": 16 }, { "epoch": 0.2857142857142857, "grad_norm": 1.3809293508529663, "learning_rate": 0.000168, "loss": 0.5192, "mean_token_accuracy": 0.8702940046787262, "num_tokens": 4395.0, "step": 17 }, { "epoch": 0.3025210084033613, "grad_norm": 1.4815438985824585, "learning_rate": 0.000166, "loss": 0.6678, "mean_token_accuracy": 0.8443302363157272, "num_tokens": 4613.0, "step": 18 }, { "epoch": 0.31932773109243695, "grad_norm": 1.1631455421447754, "learning_rate": 0.000164, "loss": 0.3321, "mean_token_accuracy": 0.8968759775161743, "num_tokens": 4877.0, "step": 19 }, { "epoch": 0.33613445378151263, "grad_norm": 1.2620583772659302, "learning_rate": 0.000162, "loss": 0.4485, "mean_token_accuracy": 0.8692560493946075, "num_tokens": 5133.0, "step": 20 }, { "epoch": 0.35294117647058826, "grad_norm": 0.9936636686325073, "learning_rate": 0.00016, "loss": 0.4403, "mean_token_accuracy": 0.8931679427623749, "num_tokens": 5412.0, "step": 21 }, { "epoch": 0.3697478991596639, "grad_norm": 0.9228686690330505, "learning_rate": 0.00015800000000000002, "loss": 0.3091, "mean_token_accuracy": 0.9256660640239716, "num_tokens": 5681.0, "step": 22 }, { "epoch": 0.3865546218487395, "grad_norm": 1.0632286071777344, "learning_rate": 0.00015600000000000002, "loss": 0.4862, "mean_token_accuracy": 0.8614651262760162, "num_tokens": 5949.0, "step": 23 }, { "epoch": 0.40336134453781514, "grad_norm": 1.7566121816635132, "learning_rate": 0.000154, "loss": 0.4674, "mean_token_accuracy": 0.8802213817834854, "num_tokens": 6198.0, "step": 24 }, { "epoch": 0.42016806722689076, "grad_norm": 1.3451807498931885, "learning_rate": 0.000152, "loss": 0.345, "mean_token_accuracy": 0.9215966314077377, "num_tokens": 6418.0, "step": 25 }, { "epoch": 0.4369747899159664, "grad_norm": 1.0630156993865967, "learning_rate": 0.00015000000000000001, "loss": 0.379, "mean_token_accuracy": 0.8995761275291443, "num_tokens": 6646.0, "step": 26 }, { "epoch": 0.453781512605042, "grad_norm": 1.5985835790634155, "learning_rate": 0.000148, "loss": 0.3516, "mean_token_accuracy": 0.9066900312900543, "num_tokens": 6891.0, "step": 27 }, { "epoch": 0.47058823529411764, "grad_norm": 1.1677768230438232, "learning_rate": 0.000146, "loss": 0.2507, "mean_token_accuracy": 0.9439381659030914, "num_tokens": 7115.0, "step": 28 }, { "epoch": 0.48739495798319327, "grad_norm": 1.8340446949005127, "learning_rate": 0.000144, "loss": 0.2958, "mean_token_accuracy": 0.9207049608230591, "num_tokens": 7360.0, "step": 29 }, { "epoch": 0.5042016806722689, "grad_norm": 1.5426018238067627, "learning_rate": 0.000142, "loss": 0.275, "mean_token_accuracy": 0.9157240688800812, "num_tokens": 7639.0, "step": 30 }, { "epoch": 0.5210084033613446, "grad_norm": 1.3587743043899536, "learning_rate": 0.00014, "loss": 0.1968, "mean_token_accuracy": 0.9501292258501053, "num_tokens": 7869.0, "step": 31 }, { "epoch": 0.5378151260504201, "grad_norm": 1.1160968542099, "learning_rate": 0.000138, "loss": 0.1432, "mean_token_accuracy": 0.9485518783330917, "num_tokens": 8094.0, "step": 32 }, { "epoch": 0.5546218487394958, "grad_norm": 1.00589919090271, "learning_rate": 0.00013600000000000003, "loss": 0.1469, "mean_token_accuracy": 0.9555195868015289, "num_tokens": 8330.0, "step": 33 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9749813675880432, "learning_rate": 0.000134, "loss": 0.1758, "mean_token_accuracy": 0.9601360410451889, "num_tokens": 8628.0, "step": 34 }, { "epoch": 0.5882352941176471, "grad_norm": 1.0763829946517944, "learning_rate": 0.000132, "loss": 0.1643, "mean_token_accuracy": 0.9445353597402573, "num_tokens": 8865.0, "step": 35 }, { "epoch": 0.6050420168067226, "grad_norm": 0.833663821220398, "learning_rate": 0.00013000000000000002, "loss": 0.121, "mean_token_accuracy": 0.9618351757526398, "num_tokens": 9105.0, "step": 36 }, { "epoch": 0.6218487394957983, "grad_norm": 1.3269267082214355, "learning_rate": 0.00012800000000000002, "loss": 0.1915, "mean_token_accuracy": 0.9508444666862488, "num_tokens": 9396.0, "step": 37 }, { "epoch": 0.6386554621848739, "grad_norm": 1.1704431772232056, "learning_rate": 0.000126, "loss": 0.1726, "mean_token_accuracy": 0.9565037339925766, "num_tokens": 9676.0, "step": 38 }, { "epoch": 0.6554621848739496, "grad_norm": 0.7786519527435303, "learning_rate": 0.000124, "loss": 0.1421, "mean_token_accuracy": 0.9649268388748169, "num_tokens": 9930.0, "step": 39 }, { "epoch": 0.6722689075630253, "grad_norm": 0.9049130082130432, "learning_rate": 0.000122, "loss": 0.1154, "mean_token_accuracy": 0.9734069108963013, "num_tokens": 10235.0, "step": 40 }, { "epoch": 0.6890756302521008, "grad_norm": 1.623077630996704, "learning_rate": 0.00012, "loss": 0.1528, "mean_token_accuracy": 0.9538236260414124, "num_tokens": 10449.0, "step": 41 }, { "epoch": 0.7058823529411765, "grad_norm": 1.512434720993042, "learning_rate": 0.000118, "loss": 0.1793, "mean_token_accuracy": 0.9374579340219498, "num_tokens": 10695.0, "step": 42 }, { "epoch": 0.7226890756302521, "grad_norm": 0.941612958908081, "learning_rate": 0.000116, "loss": 0.1458, "mean_token_accuracy": 0.9549220055341721, "num_tokens": 10981.0, "step": 43 }, { "epoch": 0.7394957983193278, "grad_norm": 0.725567102432251, "learning_rate": 0.00011399999999999999, "loss": 0.113, "mean_token_accuracy": 0.9642024636268616, "num_tokens": 11249.0, "step": 44 }, { "epoch": 0.7563025210084033, "grad_norm": 0.9533170461654663, "learning_rate": 0.00011200000000000001, "loss": 0.1289, "mean_token_accuracy": 0.9561101645231247, "num_tokens": 11518.0, "step": 45 }, { "epoch": 0.773109243697479, "grad_norm": 0.7225016951560974, "learning_rate": 0.00011000000000000002, "loss": 0.1127, "mean_token_accuracy": 0.9768504500389099, "num_tokens": 11786.0, "step": 46 }, { "epoch": 0.7899159663865546, "grad_norm": 0.7140802145004272, "learning_rate": 0.00010800000000000001, "loss": 0.0905, "mean_token_accuracy": 0.9777714610099792, "num_tokens": 12022.0, "step": 47 }, { "epoch": 0.8067226890756303, "grad_norm": 0.9338831901550293, "learning_rate": 0.00010600000000000002, "loss": 0.1079, "mean_token_accuracy": 0.9652335196733475, "num_tokens": 12240.0, "step": 48 }, { "epoch": 0.8235294117647058, "grad_norm": 0.6939389109611511, "learning_rate": 0.00010400000000000001, "loss": 0.0942, "mean_token_accuracy": 0.9659361839294434, "num_tokens": 12491.0, "step": 49 }, { "epoch": 0.8403361344537815, "grad_norm": 0.6977850794792175, "learning_rate": 0.00010200000000000001, "loss": 0.079, "mean_token_accuracy": 0.9743468016386032, "num_tokens": 12742.0, "step": 50 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6693803071975708, "learning_rate": 0.0001, "loss": 0.0686, "mean_token_accuracy": 0.9718740880489349, "num_tokens": 12986.0, "step": 51 }, { "epoch": 0.8739495798319328, "grad_norm": 1.267232894897461, "learning_rate": 9.8e-05, "loss": 0.1208, "mean_token_accuracy": 0.9666058868169785, "num_tokens": 13207.0, "step": 52 }, { "epoch": 0.8907563025210085, "grad_norm": 0.686630368232727, "learning_rate": 9.6e-05, "loss": 0.0829, "mean_token_accuracy": 0.9754088222980499, "num_tokens": 13461.0, "step": 53 }, { "epoch": 0.907563025210084, "grad_norm": 0.6465067267417908, "learning_rate": 9.4e-05, "loss": 0.063, "mean_token_accuracy": 0.9753759354352951, "num_tokens": 13751.0, "step": 54 }, { "epoch": 0.9243697478991597, "grad_norm": 0.5915058255195618, "learning_rate": 9.200000000000001e-05, "loss": 0.0652, "mean_token_accuracy": 0.9792723804712296, "num_tokens": 14013.0, "step": 55 }, { "epoch": 0.9411764705882353, "grad_norm": 0.4265214502811432, "learning_rate": 9e-05, "loss": 0.0552, "mean_token_accuracy": 0.9794800281524658, "num_tokens": 14315.0, "step": 56 }, { "epoch": 0.957983193277311, "grad_norm": 0.553076446056366, "learning_rate": 8.800000000000001e-05, "loss": 0.0726, "mean_token_accuracy": 0.981107696890831, "num_tokens": 14583.0, "step": 57 }, { "epoch": 0.9747899159663865, "grad_norm": 0.7135282158851624, "learning_rate": 8.6e-05, "loss": 0.066, "mean_token_accuracy": 0.9685419946908951, "num_tokens": 14828.0, "step": 58 }, { "epoch": 0.9915966386554622, "grad_norm": 0.5108709931373596, "learning_rate": 8.4e-05, "loss": 0.0579, "mean_token_accuracy": 0.9833555668592453, "num_tokens": 15131.0, "step": 59 }, { "epoch": 1.0, "grad_norm": 0.6764453649520874, "learning_rate": 8.2e-05, "loss": 0.069, "mean_token_accuracy": 0.9762941002845764, "num_tokens": 15266.0, "step": 60 }, { "epoch": 1.0, "eval_loss": 0.06955836713314056, "eval_mean_token_accuracy": 0.9762418886025747, "eval_num_tokens": 15266.0, "eval_runtime": 8.8223, "eval_samples_per_second": 26.977, "eval_steps_per_second": 3.4, "step": 60 }, { "epoch": 1.0168067226890756, "grad_norm": 0.7502458691596985, "learning_rate": 8e-05, "loss": 0.0765, "mean_token_accuracy": 0.9749942719936371, "num_tokens": 15517.0, "step": 61 }, { "epoch": 1.0336134453781514, "grad_norm": 0.8168013691902161, "learning_rate": 7.800000000000001e-05, "loss": 0.0859, "mean_token_accuracy": 0.9644966721534729, "num_tokens": 15759.0, "step": 62 }, { "epoch": 1.050420168067227, "grad_norm": 0.5339249968528748, "learning_rate": 7.6e-05, "loss": 0.0602, "mean_token_accuracy": 0.9755090177059174, "num_tokens": 15978.0, "step": 63 }, { "epoch": 1.0672268907563025, "grad_norm": 0.42795777320861816, "learning_rate": 7.4e-05, "loss": 0.0471, "mean_token_accuracy": 0.9771932363510132, "num_tokens": 16222.0, "step": 64 }, { "epoch": 1.084033613445378, "grad_norm": 0.5189265608787537, "learning_rate": 7.2e-05, "loss": 0.0625, "mean_token_accuracy": 0.9816806763410568, "num_tokens": 16497.0, "step": 65 }, { "epoch": 1.1008403361344539, "grad_norm": 0.5358922481536865, "learning_rate": 7e-05, "loss": 0.055, "mean_token_accuracy": 0.9817706048488617, "num_tokens": 16730.0, "step": 66 }, { "epoch": 1.1176470588235294, "grad_norm": 0.4254271984100342, "learning_rate": 6.800000000000001e-05, "loss": 0.0551, "mean_token_accuracy": 0.9754405617713928, "num_tokens": 16989.0, "step": 67 }, { "epoch": 1.134453781512605, "grad_norm": 0.6607869863510132, "learning_rate": 6.6e-05, "loss": 0.0649, "mean_token_accuracy": 0.969991609454155, "num_tokens": 17238.0, "step": 68 }, { "epoch": 1.1512605042016806, "grad_norm": 0.5505990982055664, "learning_rate": 6.400000000000001e-05, "loss": 0.0555, "mean_token_accuracy": 0.9807892739772797, "num_tokens": 17506.0, "step": 69 }, { "epoch": 1.1680672268907564, "grad_norm": 0.5438902974128723, "learning_rate": 6.2e-05, "loss": 0.0523, "mean_token_accuracy": 0.9766516983509064, "num_tokens": 17757.0, "step": 70 }, { "epoch": 1.184873949579832, "grad_norm": 0.6576088666915894, "learning_rate": 6e-05, "loss": 0.0556, "mean_token_accuracy": 0.9764496088027954, "num_tokens": 18045.0, "step": 71 }, { "epoch": 1.2016806722689075, "grad_norm": 0.4228130877017975, "learning_rate": 5.8e-05, "loss": 0.0482, "mean_token_accuracy": 0.9782925397157669, "num_tokens": 18328.0, "step": 72 }, { "epoch": 1.2184873949579833, "grad_norm": 0.6111922860145569, "learning_rate": 5.6000000000000006e-05, "loss": 0.0565, "mean_token_accuracy": 0.9718534648418427, "num_tokens": 18572.0, "step": 73 }, { "epoch": 1.2352941176470589, "grad_norm": 0.558793306350708, "learning_rate": 5.4000000000000005e-05, "loss": 0.0649, "mean_token_accuracy": 0.9830508828163147, "num_tokens": 18814.0, "step": 74 }, { "epoch": 1.2521008403361344, "grad_norm": 0.5880012512207031, "learning_rate": 5.2000000000000004e-05, "loss": 0.0527, "mean_token_accuracy": 0.981307253241539, "num_tokens": 19053.0, "step": 75 }, { "epoch": 1.26890756302521, "grad_norm": 0.32453206181526184, "learning_rate": 5e-05, "loss": 0.0387, "mean_token_accuracy": 0.9886987954378128, "num_tokens": 19322.0, "step": 76 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5277529358863831, "learning_rate": 4.8e-05, "loss": 0.065, "mean_token_accuracy": 0.9776898175477982, "num_tokens": 19557.0, "step": 77 }, { "epoch": 1.3025210084033614, "grad_norm": 0.40146011114120483, "learning_rate": 4.600000000000001e-05, "loss": 0.0485, "mean_token_accuracy": 0.9843288958072662, "num_tokens": 19819.0, "step": 78 }, { "epoch": 1.319327731092437, "grad_norm": 0.6262933015823364, "learning_rate": 4.4000000000000006e-05, "loss": 0.0614, "mean_token_accuracy": 0.9736219197511673, "num_tokens": 20062.0, "step": 79 }, { "epoch": 1.3361344537815127, "grad_norm": 1.2388803958892822, "learning_rate": 4.2e-05, "loss": 0.0746, "mean_token_accuracy": 0.9680851101875305, "num_tokens": 20254.0, "step": 80 }, { "epoch": 1.3529411764705883, "grad_norm": 0.4032573103904724, "learning_rate": 4e-05, "loss": 0.0469, "mean_token_accuracy": 0.9774428457021713, "num_tokens": 20536.0, "step": 81 }, { "epoch": 1.3697478991596639, "grad_norm": 0.4665139317512512, "learning_rate": 3.8e-05, "loss": 0.0508, "mean_token_accuracy": 0.9771178215742111, "num_tokens": 20761.0, "step": 82 }, { "epoch": 1.3865546218487395, "grad_norm": 0.39903876185417175, "learning_rate": 3.6e-05, "loss": 0.0534, "mean_token_accuracy": 0.9797466993331909, "num_tokens": 21026.0, "step": 83 }, { "epoch": 1.403361344537815, "grad_norm": 0.5796261429786682, "learning_rate": 3.4000000000000007e-05, "loss": 0.0521, "mean_token_accuracy": 0.9702111333608627, "num_tokens": 21280.0, "step": 84 }, { "epoch": 1.4201680672268908, "grad_norm": 0.4139143228530884, "learning_rate": 3.2000000000000005e-05, "loss": 0.0442, "mean_token_accuracy": 0.9848089665174484, "num_tokens": 21529.0, "step": 85 }, { "epoch": 1.4369747899159664, "grad_norm": 0.5100081562995911, "learning_rate": 3e-05, "loss": 0.0601, "mean_token_accuracy": 0.9778546392917633, "num_tokens": 21764.0, "step": 86 }, { "epoch": 1.453781512605042, "grad_norm": 0.6040212512016296, "learning_rate": 2.8000000000000003e-05, "loss": 0.0593, "mean_token_accuracy": 0.974713608622551, "num_tokens": 22024.0, "step": 87 }, { "epoch": 1.4705882352941178, "grad_norm": 0.2890975773334503, "learning_rate": 2.6000000000000002e-05, "loss": 0.043, "mean_token_accuracy": 0.9827319085597992, "num_tokens": 22273.0, "step": 88 }, { "epoch": 1.4873949579831933, "grad_norm": 0.6577038168907166, "learning_rate": 2.4e-05, "loss": 0.061, "mean_token_accuracy": 0.9718292504549026, "num_tokens": 22493.0, "step": 89 }, { "epoch": 1.504201680672269, "grad_norm": 0.40916889905929565, "learning_rate": 2.2000000000000003e-05, "loss": 0.0493, "mean_token_accuracy": 0.9844753742218018, "num_tokens": 22759.0, "step": 90 }, { "epoch": 1.5210084033613445, "grad_norm": 0.3740443289279938, "learning_rate": 2e-05, "loss": 0.0513, "mean_token_accuracy": 0.9788970202207565, "num_tokens": 22999.0, "step": 91 }, { "epoch": 1.53781512605042, "grad_norm": 0.3873707056045532, "learning_rate": 1.8e-05, "loss": 0.0501, "mean_token_accuracy": 0.9810363203287125, "num_tokens": 23270.0, "step": 92 }, { "epoch": 1.5546218487394958, "grad_norm": 0.41598230600357056, "learning_rate": 1.6000000000000003e-05, "loss": 0.044, "mean_token_accuracy": 0.9829495698213577, "num_tokens": 23570.0, "step": 93 }, { "epoch": 1.5714285714285714, "grad_norm": 0.6361399292945862, "learning_rate": 1.4000000000000001e-05, "loss": 0.0514, "mean_token_accuracy": 0.9911330789327621, "num_tokens": 23803.0, "step": 94 }, { "epoch": 1.5882352941176472, "grad_norm": 0.36418417096138, "learning_rate": 1.2e-05, "loss": 0.0456, "mean_token_accuracy": 0.9847660809755325, "num_tokens": 24075.0, "step": 95 }, { "epoch": 1.6050420168067228, "grad_norm": 0.44817042350769043, "learning_rate": 1e-05, "loss": 0.0485, "mean_token_accuracy": 0.9853666424751282, "num_tokens": 24343.0, "step": 96 }, { "epoch": 1.6218487394957983, "grad_norm": 0.4462110698223114, "learning_rate": 8.000000000000001e-06, "loss": 0.0522, "mean_token_accuracy": 0.977595642209053, "num_tokens": 24612.0, "step": 97 }, { "epoch": 1.638655462184874, "grad_norm": 0.35791558027267456, "learning_rate": 6e-06, "loss": 0.0474, "mean_token_accuracy": 0.9792077094316483, "num_tokens": 24855.0, "step": 98 }, { "epoch": 1.6554621848739495, "grad_norm": 0.38176652789115906, "learning_rate": 4.000000000000001e-06, "loss": 0.0441, "mean_token_accuracy": 0.9870962202548981, "num_tokens": 25150.0, "step": 99 }, { "epoch": 1.6722689075630253, "grad_norm": 0.3823869526386261, "learning_rate": 2.0000000000000003e-06, "loss": 0.0476, "mean_token_accuracy": 0.9815702140331268, "num_tokens": 25425.0, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 196286558515200.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }