{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.923076923076923, "eval_steps": 500, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3076923076923077, "grad_norm": 429.5760192871094, "learning_rate": 5e-06, "loss": 1.9642, "memory/device_mem_reserved(gib)": 51.52, "memory/max_mem_active(gib)": 46.79, "memory/max_mem_allocated(gib)": 45.85, "step": 1 }, { "epoch": 0.6153846153846154, "grad_norm": 88.243408203125, "learning_rate": 1e-05, "loss": 1.43, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 2 }, { "epoch": 0.9230769230769231, "grad_norm": 188.16270446777344, "learning_rate": 1.5000000000000002e-05, "loss": 1.1989, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 3 }, { "epoch": 1.0, "grad_norm": 238.33164978027344, "learning_rate": 2e-05, "loss": 1.4779, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 4 }, { "epoch": 1.3076923076923077, "grad_norm": 60.53327178955078, "learning_rate": 1.9953596287703015e-05, "loss": 1.4517, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 5 }, { "epoch": 1.6153846153846154, "grad_norm": 89.0710678100586, "learning_rate": 1.9905213270142184e-05, "loss": 1.1788, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 6 }, { "epoch": 1.9230769230769231, "grad_norm": 74.55708312988281, "learning_rate": 1.9854721549636805e-05, "loss": 1.3326, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 7 }, { "epoch": 2.0, "grad_norm": 46.6354866027832, "learning_rate": 1.9801980198019806e-05, "loss": 1.1162, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 8 }, { "epoch": 2.3076923076923075, "grad_norm": 11.567599296569824, "learning_rate": 1.974683544303798e-05, "loss": 1.0117, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 9 }, { "epoch": 2.6153846153846154, "grad_norm": 15.884381294250488, "learning_rate": 1.9689119170984456e-05, "loss": 0.9432, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 10 }, { "epoch": 2.9230769230769234, "grad_norm": 8.479954719543457, "learning_rate": 1.9628647214854114e-05, "loss": 0.9247, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 11 }, { "epoch": 3.0, "grad_norm": 17.43779754638672, "learning_rate": 1.956521739130435e-05, "loss": 1.0189, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 12 }, { "epoch": 3.3076923076923075, "grad_norm": 8.543362617492676, "learning_rate": 1.9498607242339832e-05, "loss": 0.9282, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 13 }, { "epoch": 3.6153846153846154, "grad_norm": 5.215980052947998, "learning_rate": 1.942857142857143e-05, "loss": 0.8694, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 14 }, { "epoch": 3.9230769230769234, "grad_norm": 4.073164463043213, "learning_rate": 1.935483870967742e-05, "loss": 0.8029, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 15 }, { "epoch": 4.0, "grad_norm": 5.382778167724609, "learning_rate": 1.9277108433734944e-05, "loss": 0.7638, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 16 }, { "epoch": 4.3076923076923075, "grad_norm": 4.377191066741943, "learning_rate": 1.9195046439628485e-05, "loss": 0.7751, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 17 }, { "epoch": 4.615384615384615, "grad_norm": 3.5090882778167725, "learning_rate": 1.9108280254777068e-05, "loss": 0.731, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 18 }, { "epoch": 4.923076923076923, "grad_norm": 4.811877727508545, "learning_rate": 1.9016393442622952e-05, "loss": 0.7118, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 19 }, { "epoch": 5.0, "grad_norm": 4.822802543640137, "learning_rate": 1.891891891891892e-05, "loss": 0.6149, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 20 }, { "epoch": 5.3076923076923075, "grad_norm": 4.164008140563965, "learning_rate": 1.8815331010452963e-05, "loss": 0.6936, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 21 }, { "epoch": 5.615384615384615, "grad_norm": 3.9381167888641357, "learning_rate": 1.8705035971223024e-05, "loss": 0.6502, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 22 }, { "epoch": 5.923076923076923, "grad_norm": 3.9260995388031006, "learning_rate": 1.858736059479554e-05, "loss": 0.633, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 23 }, { "epoch": 6.0, "grad_norm": 5.373617649078369, "learning_rate": 1.846153846153846e-05, "loss": 0.4986, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 24 }, { "epoch": 6.3076923076923075, "grad_norm": 4.031383514404297, "learning_rate": 1.8326693227091633e-05, "loss": 0.6118, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 25 }, { "epoch": 6.615384615384615, "grad_norm": 4.3576436042785645, "learning_rate": 1.8181818181818182e-05, "loss": 0.5786, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 26 }, { "epoch": 6.923076923076923, "grad_norm": 3.389698028564453, "learning_rate": 1.8025751072961374e-05, "loss": 0.569, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 27 }, { "epoch": 7.0, "grad_norm": 3.289379596710205, "learning_rate": 1.785714285714286e-05, "loss": 0.3683, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 28 }, { "epoch": 7.3076923076923075, "grad_norm": 3.358076333999634, "learning_rate": 1.7674418604651163e-05, "loss": 0.5345, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 29 }, { "epoch": 7.615384615384615, "grad_norm": 3.419854164123535, "learning_rate": 1.7475728155339808e-05, "loss": 0.5051, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 30 }, { "epoch": 7.923076923076923, "grad_norm": 3.624562978744507, "learning_rate": 1.7258883248730966e-05, "loss": 0.4952, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 31 }, { "epoch": 8.0, "grad_norm": 3.478980541229248, "learning_rate": 1.7021276595744686e-05, "loss": 0.2747, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 32 }, { "epoch": 8.307692307692308, "grad_norm": 3.6138954162597656, "learning_rate": 1.675977653631285e-05, "loss": 0.4662, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 33 }, { "epoch": 8.615384615384615, "grad_norm": 3.260951042175293, "learning_rate": 1.647058823529412e-05, "loss": 0.4351, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 34 }, { "epoch": 8.923076923076923, "grad_norm": 4.216782093048096, "learning_rate": 1.6149068322981367e-05, "loss": 0.4366, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 35 }, { "epoch": 9.0, "grad_norm": 3.045259475708008, "learning_rate": 1.5789473684210526e-05, "loss": 0.2067, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 36 }, { "epoch": 9.307692307692308, "grad_norm": 3.721900463104248, "learning_rate": 1.5384615384615384e-05, "loss": 0.3979, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 37 }, { "epoch": 9.615384615384615, "grad_norm": 4.223248481750488, "learning_rate": 1.4925373134328359e-05, "loss": 0.3831, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 38 }, { "epoch": 9.923076923076923, "grad_norm": 4.570195198059082, "learning_rate": 1.4400000000000001e-05, "loss": 0.3919, "memory/device_mem_reserved(gib)": 51.53, "memory/max_mem_active(gib)": 46.82, "memory/max_mem_allocated(gib)": 45.88, "step": 39 } ], "logging_steps": 1, "max_steps": 48, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 3, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.482969594438615e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }