{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 13.3125, "learning_rate": 2.0000000000000003e-06, "loss": 1.5082, "step": 1000 }, { "epoch": 0.02, "grad_norm": 11.75, "learning_rate": 4.000000000000001e-06, "loss": 1.2924, "step": 2000 }, { "epoch": 0.02, "eval_loss": 1.2315222024917603, "eval_runtime": 78.9092, "eval_samples_per_second": 49.563, "eval_steps_per_second": 12.394, "step": 2000 }, { "epoch": 0.03, "grad_norm": 11.0625, "learning_rate": 6e-06, "loss": 1.18, "step": 3000 }, { "epoch": 0.04, "grad_norm": 11.4375, "learning_rate": 8.000000000000001e-06, "loss": 1.101, "step": 4000 }, { "epoch": 0.04, "eval_loss": 1.073156714439392, "eval_runtime": 79.3697, "eval_samples_per_second": 49.276, "eval_steps_per_second": 12.322, "step": 4000 }, { "epoch": 0.05, "grad_norm": 11.125, "learning_rate": 1e-05, "loss": 1.0474, "step": 5000 }, { "epoch": 0.06, "grad_norm": 10.125, "learning_rate": 9.99726628670463e-06, "loss": 0.9949, "step": 6000 }, { "epoch": 0.06, "eval_loss": 0.978751003742218, "eval_runtime": 79.5844, "eval_samples_per_second": 49.143, "eval_steps_per_second": 12.289, "step": 6000 }, { "epoch": 0.07, "grad_norm": 8.8125, "learning_rate": 9.989068136093873e-06, "loss": 0.9562, "step": 7000 }, { "epoch": 0.08, "grad_norm": 8.375, "learning_rate": 9.975414512725058e-06, "loss": 0.9365, "step": 8000 }, { "epoch": 0.08, "eval_loss": 0.9295967221260071, "eval_runtime": 78.9753, "eval_samples_per_second": 49.522, "eval_steps_per_second": 12.384, "step": 8000 }, { "epoch": 0.09, "grad_norm": 9.8125, "learning_rate": 9.956320346634877e-06, "loss": 0.9169, "step": 9000 }, { "epoch": 0.1, "grad_norm": 8.9375, "learning_rate": 9.931806517013612e-06, "loss": 0.9036, "step": 10000 }, { "epoch": 0.1, "eval_loss": 0.9004048705101013, "eval_runtime": 79.5265, "eval_samples_per_second": 49.179, "eval_steps_per_second": 12.298, "step": 10000 }, { "epoch": 0.11, "grad_norm": 9.0625, "learning_rate": 9.901899829374048e-06, "loss": 0.89, "step": 11000 }, { "epoch": 0.12, "grad_norm": 9.125, "learning_rate": 9.86663298624003e-06, "loss": 0.8811, "step": 12000 }, { "epoch": 0.12, "eval_loss": 0.8809531331062317, "eval_runtime": 79.2169, "eval_samples_per_second": 49.371, "eval_steps_per_second": 12.346, "step": 12000 }, { "epoch": 0.13, "grad_norm": 8.8125, "learning_rate": 9.826044551386743e-06, "loss": 0.8724, "step": 13000 }, { "epoch": 0.14, "grad_norm": 8.6875, "learning_rate": 9.780178907671788e-06, "loss": 0.8656, "step": 14000 }, { "epoch": 0.14, "eval_loss": 0.8673146963119507, "eval_runtime": 78.9692, "eval_samples_per_second": 49.526, "eval_steps_per_second": 12.385, "step": 14000 }, { "epoch": 0.15, "grad_norm": 9.25, "learning_rate": 9.729086208503174e-06, "loss": 0.8562, "step": 15000 }, { "epoch": 0.16, "grad_norm": 8.8125, "learning_rate": 9.672822322997305e-06, "loss": 0.8493, "step": 16000 }, { "epoch": 0.16, "eval_loss": 0.8561869859695435, "eval_runtime": 79.0551, "eval_samples_per_second": 49.472, "eval_steps_per_second": 12.371, "step": 16000 }, { "epoch": 0.17, "grad_norm": 9.375, "learning_rate": 9.611448774886925e-06, "loss": 0.8459, "step": 17000 }, { "epoch": 0.18, "grad_norm": 9.1875, "learning_rate": 9.545032675245814e-06, "loss": 0.8433, "step": 18000 }, { "epoch": 0.18, "eval_loss": 0.8474856615066528, "eval_runtime": 79.8666, "eval_samples_per_second": 48.969, "eval_steps_per_second": 12.245, "step": 18000 }, { "epoch": 0.19, "grad_norm": 9.0, "learning_rate": 9.473646649103819e-06, "loss": 0.8409, "step": 19000 }, { "epoch": 0.2, "grad_norm": 8.6875, "learning_rate": 9.397368756032445e-06, "loss": 0.8352, "step": 20000 }, { "epoch": 0.2, "eval_loss": 0.8410937786102295, "eval_runtime": 79.0729, "eval_samples_per_second": 49.461, "eval_steps_per_second": 12.368, "step": 20000 }, { "epoch": 0.21, "grad_norm": 10.0, "learning_rate": 9.31628240478787e-06, "loss": 0.8347, "step": 21000 }, { "epoch": 0.22, "grad_norm": 9.3125, "learning_rate": 9.230476262104678e-06, "loss": 0.8295, "step": 22000 }, { "epoch": 0.22, "eval_loss": 0.8347375392913818, "eval_runtime": 79.1246, "eval_samples_per_second": 49.428, "eval_steps_per_second": 12.36, "step": 22000 }, { "epoch": 0.23, "grad_norm": 9.875, "learning_rate": 9.140044155740102e-06, "loss": 0.8275, "step": 23000 }, { "epoch": 0.24, "grad_norm": 10.1875, "learning_rate": 9.045084971874738e-06, "loss": 0.822, "step": 24000 }, { "epoch": 0.24, "eval_loss": 0.8303326368331909, "eval_runtime": 78.9475, "eval_samples_per_second": 49.539, "eval_steps_per_second": 12.388, "step": 24000 }, { "epoch": 0.25, "grad_norm": 8.875, "learning_rate": 8.94570254698197e-06, "loss": 0.8245, "step": 25000 }, { "epoch": 0.26, "grad_norm": 9.0625, "learning_rate": 8.842005554284296e-06, "loss": 0.8216, "step": 26000 }, { "epoch": 0.26, "eval_loss": 0.8262131810188293, "eval_runtime": 78.9434, "eval_samples_per_second": 49.542, "eval_steps_per_second": 12.389, "step": 26000 }, { "epoch": 0.27, "grad_norm": 8.875, "learning_rate": 8.734107384920771e-06, "loss": 0.8174, "step": 27000 }, { "epoch": 0.28, "grad_norm": 10.125, "learning_rate": 8.622126023955446e-06, "loss": 0.8116, "step": 28000 }, { "epoch": 0.28, "eval_loss": 0.8211948275566101, "eval_runtime": 78.9986, "eval_samples_per_second": 49.507, "eval_steps_per_second": 12.38, "step": 28000 }, { "epoch": 0.29, "grad_norm": 8.375, "learning_rate": 8.506183921362443e-06, "loss": 0.8073, "step": 29000 }, { "epoch": 0.3, "grad_norm": 8.3125, "learning_rate": 8.386407858128707e-06, "loss": 0.7966, "step": 30000 }, { "epoch": 0.3, "eval_loss": 0.8195165991783142, "eval_runtime": 78.951, "eval_samples_per_second": 49.537, "eval_steps_per_second": 12.387, "step": 30000 }, { "epoch": 0.31, "grad_norm": 9.25, "learning_rate": 8.262928807620843e-06, "loss": 0.7853, "step": 31000 }, { "epoch": 0.32, "grad_norm": 8.8125, "learning_rate": 8.135881792367686e-06, "loss": 0.7807, "step": 32000 }, { "epoch": 0.32, "eval_loss": 0.8171260952949524, "eval_runtime": 79.0444, "eval_samples_per_second": 49.479, "eval_steps_per_second": 12.373, "step": 32000 }, { "epoch": 0.33, "grad_norm": 8.6875, "learning_rate": 8.005405736415127e-06, "loss": 0.7828, "step": 33000 }, { "epoch": 0.34, "grad_norm": 9.375, "learning_rate": 7.871643313414718e-06, "loss": 0.7794, "step": 34000 }, { "epoch": 0.34, "eval_loss": 0.8152210712432861, "eval_runtime": 78.8407, "eval_samples_per_second": 49.606, "eval_steps_per_second": 12.405, "step": 34000 }, { "epoch": 0.35, "grad_norm": 9.375, "learning_rate": 7.734740790612137e-06, "loss": 0.7846, "step": 35000 }, { "epoch": 0.36, "grad_norm": 9.1875, "learning_rate": 7.594847868906076e-06, "loss": 0.7812, "step": 36000 }, { "epoch": 0.36, "eval_loss": 0.8131356239318848, "eval_runtime": 79.0096, "eval_samples_per_second": 49.5, "eval_steps_per_second": 12.378, "step": 36000 }, { "epoch": 0.37, "grad_norm": 9.75, "learning_rate": 7.452117519152542e-06, "loss": 0.7798, "step": 37000 }, { "epoch": 0.38, "grad_norm": 9.4375, "learning_rate": 7.30670581489344e-06, "loss": 0.7819, "step": 38000 }, { "epoch": 0.38, "eval_loss": 0.8113033175468445, "eval_runtime": 79.0043, "eval_samples_per_second": 49.504, "eval_steps_per_second": 12.379, "step": 38000 }, { "epoch": 0.39, "grad_norm": 9.375, "learning_rate": 7.158771761692464e-06, "loss": 0.7825, "step": 39000 }, { "epoch": 0.4, "grad_norm": 10.0625, "learning_rate": 7.008477123264849e-06, "loss": 0.7798, "step": 40000 }, { "epoch": 0.4, "eval_loss": 0.8099530935287476, "eval_runtime": 79.3148, "eval_samples_per_second": 49.31, "eval_steps_per_second": 12.331, "step": 40000 }, { "epoch": 0.41, "grad_norm": 9.0, "learning_rate": 6.855986244591104e-06, "loss": 0.7785, "step": 41000 }, { "epoch": 0.42, "grad_norm": 10.625, "learning_rate": 6.701465872208216e-06, "loss": 0.7777, "step": 42000 }, { "epoch": 0.42, "eval_loss": 0.8080056309700012, "eval_runtime": 79.2187, "eval_samples_per_second": 49.37, "eval_steps_per_second": 12.346, "step": 42000 }, { "epoch": 0.43, "grad_norm": 9.75, "learning_rate": 6.545084971874738e-06, "loss": 0.7799, "step": 43000 }, { "epoch": 0.44, "grad_norm": 9.1875, "learning_rate": 6.387014543809224e-06, "loss": 0.7775, "step": 44000 }, { "epoch": 0.44, "eval_loss": 0.8067404627799988, "eval_runtime": 79.9447, "eval_samples_per_second": 48.921, "eval_steps_per_second": 12.233, "step": 44000 }, { "epoch": 0.45, "grad_norm": 9.3125, "learning_rate": 6.227427435703997e-06, "loss": 0.7792, "step": 45000 }, { "epoch": 0.46, "grad_norm": 9.0, "learning_rate": 6.066498153718735e-06, "loss": 0.7789, "step": 46000 }, { "epoch": 0.46, "eval_loss": 0.8063809871673584, "eval_runtime": 79.5507, "eval_samples_per_second": 49.164, "eval_steps_per_second": 12.294, "step": 46000 }, { "epoch": 0.47, "grad_norm": 9.625, "learning_rate": 5.904402671660551e-06, "loss": 0.7813, "step": 47000 }, { "epoch": 0.48, "grad_norm": 10.6875, "learning_rate": 5.74131823855921e-06, "loss": 0.7795, "step": 48000 }, { "epoch": 0.48, "eval_loss": 0.8048525452613831, "eval_runtime": 79.0053, "eval_samples_per_second": 49.503, "eval_steps_per_second": 12.379, "step": 48000 }, { "epoch": 0.49, "grad_norm": 9.875, "learning_rate": 5.577423184847932e-06, "loss": 0.7787, "step": 49000 }, { "epoch": 0.5, "grad_norm": 10.0625, "learning_rate": 5.412896727361663e-06, "loss": 0.781, "step": 50000 }, { "epoch": 0.5, "eval_loss": 0.8037334084510803, "eval_runtime": 80.9609, "eval_samples_per_second": 48.307, "eval_steps_per_second": 12.08, "step": 50000 }, { "epoch": 0.51, "grad_norm": 8.875, "learning_rate": 5.247918773366112e-06, "loss": 0.7814, "step": 51000 }, { "epoch": 0.52, "grad_norm": 9.6875, "learning_rate": 5.082669723831793e-06, "loss": 0.7782, "step": 52000 }, { "epoch": 0.52, "eval_loss": 0.8030668497085571, "eval_runtime": 79.8706, "eval_samples_per_second": 48.967, "eval_steps_per_second": 12.245, "step": 52000 }, { "epoch": 0.53, "grad_norm": 8.6875, "learning_rate": 4.917330276168208e-06, "loss": 0.7771, "step": 53000 }, { "epoch": 0.54, "grad_norm": 9.0625, "learning_rate": 4.752081226633888e-06, "loss": 0.773, "step": 54000 }, { "epoch": 0.54, "eval_loss": 0.8017913103103638, "eval_runtime": 80.4358, "eval_samples_per_second": 48.623, "eval_steps_per_second": 12.159, "step": 54000 }, { "epoch": 0.55, "grad_norm": 8.25, "learning_rate": 4.587103272638339e-06, "loss": 0.7708, "step": 55000 }, { "epoch": 0.56, "grad_norm": 9.625, "learning_rate": 4.42257681515207e-06, "loss": 0.7645, "step": 56000 }, { "epoch": 0.56, "eval_loss": 0.8031152486801147, "eval_runtime": 80.4118, "eval_samples_per_second": 48.637, "eval_steps_per_second": 12.162, "step": 56000 }, { "epoch": 0.57, "grad_norm": 9.1875, "learning_rate": 4.25868176144079e-06, "loss": 0.7504, "step": 57000 }, { "epoch": 0.58, "grad_norm": 10.0, "learning_rate": 4.0955973283394525e-06, "loss": 0.7534, "step": 58000 }, { "epoch": 0.58, "eval_loss": 0.8028683662414551, "eval_runtime": 79.2432, "eval_samples_per_second": 49.354, "eval_steps_per_second": 12.342, "step": 58000 }, { "epoch": 0.59, "grad_norm": 9.4375, "learning_rate": 3.9335018462812664e-06, "loss": 0.7563, "step": 59000 }, { "epoch": 0.6, "grad_norm": 9.625, "learning_rate": 3.7725725642960047e-06, "loss": 0.7572, "step": 60000 }, { "epoch": 0.6, "eval_loss": 0.8028994202613831, "eval_runtime": 80.8868, "eval_samples_per_second": 48.352, "eval_steps_per_second": 12.091, "step": 60000 }, { "epoch": 0.61, "grad_norm": 10.0, "learning_rate": 3.6129854561907786e-06, "loss": 0.7569, "step": 61000 }, { "epoch": 0.62, "grad_norm": 9.125, "learning_rate": 3.4549150281252635e-06, "loss": 0.7549, "step": 62000 }, { "epoch": 0.62, "eval_loss": 0.8032099604606628, "eval_runtime": 80.2623, "eval_samples_per_second": 48.728, "eval_steps_per_second": 12.185, "step": 62000 }, { "epoch": 0.63, "grad_norm": 9.375, "learning_rate": 3.298534127791785e-06, "loss": 0.7605, "step": 63000 }, { "epoch": 0.64, "grad_norm": 9.0625, "learning_rate": 3.1440137554088957e-06, "loss": 0.759, "step": 64000 }, { "epoch": 0.64, "eval_loss": 0.8016828894615173, "eval_runtime": 79.8549, "eval_samples_per_second": 48.976, "eval_steps_per_second": 12.247, "step": 64000 }, { "epoch": 0.65, "grad_norm": 13.1875, "learning_rate": 2.991522876735154e-06, "loss": 0.7626, "step": 65000 }, { "epoch": 0.66, "grad_norm": 9.1875, "learning_rate": 2.8412282383075362e-06, "loss": 0.7597, "step": 66000 }, { "epoch": 0.66, "eval_loss": 0.8016167879104614, "eval_runtime": 80.2412, "eval_samples_per_second": 48.741, "eval_steps_per_second": 12.188, "step": 66000 }, { "epoch": 0.67, "grad_norm": 9.375, "learning_rate": 2.693294185106562e-06, "loss": 0.7626, "step": 67000 }, { "epoch": 0.68, "grad_norm": 9.75, "learning_rate": 2.5478824808474613e-06, "loss": 0.7612, "step": 68000 }, { "epoch": 0.68, "eval_loss": 0.8017856478691101, "eval_runtime": 80.53, "eval_samples_per_second": 48.566, "eval_steps_per_second": 12.145, "step": 68000 }, { "epoch": 0.69, "grad_norm": 9.0, "learning_rate": 2.4051521310939258e-06, "loss": 0.7659, "step": 69000 }, { "epoch": 0.7, "grad_norm": 8.9375, "learning_rate": 2.265259209387867e-06, "loss": 0.7643, "step": 70000 }, { "epoch": 0.7, "eval_loss": 0.8011899590492249, "eval_runtime": 80.4265, "eval_samples_per_second": 48.628, "eval_steps_per_second": 12.16, "step": 70000 }, { "epoch": 0.71, "grad_norm": 9.5625, "learning_rate": 2.1283566865852824e-06, "loss": 0.7658, "step": 71000 }, { "epoch": 0.72, "grad_norm": 9.5625, "learning_rate": 1.9945942635848745e-06, "loss": 0.768, "step": 72000 }, { "epoch": 0.72, "eval_loss": 0.8010539412498474, "eval_runtime": 79.2118, "eval_samples_per_second": 49.374, "eval_steps_per_second": 12.347, "step": 72000 }, { "epoch": 0.73, "grad_norm": 10.3125, "learning_rate": 1.864118207632315e-06, "loss": 0.7683, "step": 73000 }, { "epoch": 0.74, "grad_norm": 9.375, "learning_rate": 1.7370711923791567e-06, "loss": 0.7669, "step": 74000 }, { "epoch": 0.74, "eval_loss": 0.8013535737991333, "eval_runtime": 79.7314, "eval_samples_per_second": 49.052, "eval_steps_per_second": 12.266, "step": 74000 }, { "epoch": 0.75, "grad_norm": 8.6875, "learning_rate": 1.6135921418712959e-06, "loss": 0.7683, "step": 75000 }, { "epoch": 0.76, "grad_norm": 9.3125, "learning_rate": 1.4938160786375571e-06, "loss": 0.7728, "step": 76000 }, { "epoch": 0.76, "eval_loss": 0.8007571697235107, "eval_runtime": 79.1576, "eval_samples_per_second": 49.408, "eval_steps_per_second": 12.355, "step": 76000 }, { "epoch": 0.77, "grad_norm": 9.375, "learning_rate": 1.3778739760445552e-06, "loss": 0.7729, "step": 77000 }, { "epoch": 0.78, "grad_norm": 10.0625, "learning_rate": 1.2658926150792321e-06, "loss": 0.7728, "step": 78000 }, { "epoch": 0.78, "eval_loss": 0.8001759052276611, "eval_runtime": 79.1337, "eval_samples_per_second": 49.423, "eval_steps_per_second": 12.359, "step": 78000 }, { "epoch": 0.79, "grad_norm": 10.6875, "learning_rate": 1.157994445715706e-06, "loss": 0.771, "step": 79000 }, { "epoch": 0.8, "grad_norm": 9.4375, "learning_rate": 1.0542974530180327e-06, "loss": 0.7661, "step": 80000 }, { "epoch": 0.8, "eval_loss": 0.8007293343544006, "eval_runtime": 79.1754, "eval_samples_per_second": 49.397, "eval_steps_per_second": 12.352, "step": 80000 }, { "epoch": 0.81, "grad_norm": 9.375, "learning_rate": 9.549150281252633e-07, "loss": 0.7593, "step": 81000 }, { "epoch": 0.82, "grad_norm": 9.4375, "learning_rate": 8.599558442598998e-07, "loss": 0.7505, "step": 82000 }, { "epoch": 0.82, "eval_loss": 0.8015545606613159, "eval_runtime": 79.021, "eval_samples_per_second": 49.493, "eval_steps_per_second": 12.376, "step": 82000 }, { "epoch": 0.83, "grad_norm": 9.6875, "learning_rate": 7.695237378953224e-07, "loss": 0.7452, "step": 83000 }, { "epoch": 0.84, "grad_norm": 9.125, "learning_rate": 6.837175952121305e-07, "loss": 0.7529, "step": 84000 }, { "epoch": 0.84, "eval_loss": 0.8017655611038208, "eval_runtime": 79.164, "eval_samples_per_second": 49.404, "eval_steps_per_second": 12.354, "step": 84000 }, { "epoch": 0.85, "grad_norm": 8.9375, "learning_rate": 6.026312439675553e-07, "loss": 0.7481, "step": 85000 }, { "epoch": 0.86, "grad_norm": 9.625, "learning_rate": 5.263533508961827e-07, "loss": 0.7542, "step": 86000 }, { "epoch": 0.86, "eval_loss": 0.8010063767433167, "eval_runtime": 79.0232, "eval_samples_per_second": 49.492, "eval_steps_per_second": 12.376, "step": 86000 }, { "epoch": 0.87, "grad_norm": 10.25, "learning_rate": 4.549673247541875e-07, "loss": 0.7531, "step": 87000 }, { "epoch": 0.88, "grad_norm": 10.5, "learning_rate": 3.885512251130763e-07, "loss": 0.7531, "step": 88000 }, { "epoch": 0.88, "eval_loss": 0.8015376329421997, "eval_runtime": 79.1757, "eval_samples_per_second": 49.396, "eval_steps_per_second": 12.352, "step": 88000 }, { "epoch": 0.89, "grad_norm": 10.0625, "learning_rate": 3.271776770026963e-07, "loss": 0.7566, "step": 89000 }, { "epoch": 0.9, "grad_norm": 9.5625, "learning_rate": 2.7091379149682683e-07, "loss": 0.7601, "step": 90000 }, { "epoch": 0.9, "eval_loss": 0.8013355135917664, "eval_runtime": 79.4846, "eval_samples_per_second": 49.205, "eval_steps_per_second": 12.304, "step": 90000 }, { "epoch": 0.91, "grad_norm": 9.5, "learning_rate": 2.198210923282118e-07, "loss": 0.7581, "step": 91000 }, { "epoch": 0.92, "grad_norm": 9.125, "learning_rate": 1.7395544861325718e-07, "loss": 0.7569, "step": 92000 }, { "epoch": 0.92, "eval_loss": 0.8014345765113831, "eval_runtime": 79.6557, "eval_samples_per_second": 49.099, "eval_steps_per_second": 12.278, "step": 92000 }, { "epoch": 0.93, "grad_norm": 9.8125, "learning_rate": 1.333670137599713e-07, "loss": 0.7599, "step": 93000 }, { "epoch": 0.94, "grad_norm": 9.125, "learning_rate": 9.810017062595322e-08, "loss": 0.7617, "step": 94000 }, { "epoch": 0.94, "eval_loss": 0.8014338612556458, "eval_runtime": 79.0466, "eval_samples_per_second": 49.477, "eval_steps_per_second": 12.372, "step": 94000 }, { "epoch": 0.95, "grad_norm": 10.1875, "learning_rate": 6.819348298638839e-08, "loss": 0.7641, "step": 95000 }, { "epoch": 0.96, "grad_norm": 9.8125, "learning_rate": 4.367965336512403e-08, "loss": 0.766, "step": 96000 }, { "epoch": 0.96, "eval_loss": 0.8010128140449524, "eval_runtime": 79.5861, "eval_samples_per_second": 49.142, "eval_steps_per_second": 12.289, "step": 96000 }, { "epoch": 0.97, "grad_norm": 9.75, "learning_rate": 2.4585487274942922e-08, "loss": 0.7646, "step": 97000 }, { "epoch": 0.98, "grad_norm": 10.0625, "learning_rate": 1.0931863906127327e-08, "loss": 0.7673, "step": 98000 }, { "epoch": 0.98, "eval_loss": 0.8010921478271484, "eval_runtime": 79.6671, "eval_samples_per_second": 49.092, "eval_steps_per_second": 12.276, "step": 98000 }, { "epoch": 0.99, "grad_norm": 10.125, "learning_rate": 2.7337132953697555e-09, "loss": 0.7674, "step": 99000 }, { "epoch": 1.0, "grad_norm": 9.5625, "learning_rate": 0.0, "loss": 0.7658, "step": 100000 }, { "epoch": 1.0, "eval_loss": 0.8011825680732727, "eval_runtime": 79.4501, "eval_samples_per_second": 49.226, "eval_steps_per_second": 12.31, "step": 100000 } ], "logging_steps": 1000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.68882057478144e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }