sravanthib's picture
Training completed
07d95cb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 0,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 6.734264373779297,
"learning_rate": 8.576691395183485e-05,
"loss": 3.5009,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 0.29098451137542725,
"learning_rate": 9.653382790366966e-05,
"loss": 0.1027,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 0.09549916535615921,
"learning_rate": 0.0001,
"loss": 0.0191,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 0.3484920263290405,
"learning_rate": 0.0001,
"loss": 0.0222,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 0.33001908659935,
"learning_rate": 0.0001,
"loss": 0.021,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 0.057511646300554276,
"learning_rate": 0.0001,
"loss": 0.0135,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 0.05701196566224098,
"learning_rate": 0.0001,
"loss": 0.0117,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 0.043988876044750214,
"learning_rate": 0.0001,
"loss": 0.0107,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 0.03720390796661377,
"learning_rate": 0.0001,
"loss": 0.0098,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 0.0470854677259922,
"learning_rate": 0.0001,
"loss": 0.0091,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 0.035510435700416565,
"learning_rate": 0.0001,
"loss": 0.0075,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 0.0346401073038578,
"learning_rate": 0.0001,
"loss": 0.0069,
"step": 120
},
{
"epoch": 0.13,
"grad_norm": 0.0329650416970253,
"learning_rate": 0.0001,
"loss": 0.0064,
"step": 130
},
{
"epoch": 0.14,
"grad_norm": 0.056529607623815536,
"learning_rate": 0.0001,
"loss": 0.0061,
"step": 140
},
{
"epoch": 0.15,
"grad_norm": 0.049417588859796524,
"learning_rate": 0.0001,
"loss": 0.0058,
"step": 150
},
{
"epoch": 0.16,
"grad_norm": 0.031275127083063126,
"learning_rate": 0.0001,
"loss": 0.0046,
"step": 160
},
{
"epoch": 0.17,
"grad_norm": 0.026077693328261375,
"learning_rate": 0.0001,
"loss": 0.0043,
"step": 170
},
{
"epoch": 0.18,
"grad_norm": 0.03110571764409542,
"learning_rate": 0.0001,
"loss": 0.0035,
"step": 180
},
{
"epoch": 0.19,
"grad_norm": 0.0256363395601511,
"learning_rate": 0.0001,
"loss": 0.0039,
"step": 190
},
{
"epoch": 0.2,
"grad_norm": 0.13061155378818512,
"learning_rate": 0.0001,
"loss": 0.0042,
"step": 200
},
{
"epoch": 0.21,
"grad_norm": 0.022342098876833916,
"learning_rate": 0.0001,
"loss": 0.0029,
"step": 210
},
{
"epoch": 0.22,
"grad_norm": 0.06658010929822922,
"learning_rate": 0.0001,
"loss": 0.0028,
"step": 220
},
{
"epoch": 0.23,
"grad_norm": 0.02203432098031044,
"learning_rate": 0.0001,
"loss": 0.0028,
"step": 230
},
{
"epoch": 0.24,
"grad_norm": 0.04879545792937279,
"learning_rate": 0.0001,
"loss": 0.0022,
"step": 240
},
{
"epoch": 0.25,
"grad_norm": 0.044768281280994415,
"learning_rate": 0.0001,
"loss": 0.0026,
"step": 250
},
{
"epoch": 0.26,
"grad_norm": 0.030401039868593216,
"learning_rate": 0.0001,
"loss": 0.0018,
"step": 260
},
{
"epoch": 0.27,
"grad_norm": 0.10380243510007858,
"learning_rate": 0.0001,
"loss": 0.0015,
"step": 270
},
{
"epoch": 0.28,
"grad_norm": 0.019732531160116196,
"learning_rate": 0.0001,
"loss": 0.0017,
"step": 280
},
{
"epoch": 0.29,
"grad_norm": 0.015292245894670486,
"learning_rate": 0.0001,
"loss": 0.0019,
"step": 290
},
{
"epoch": 0.3,
"grad_norm": 0.030675368383526802,
"learning_rate": 0.0001,
"loss": 0.0022,
"step": 300
},
{
"epoch": 0.31,
"grad_norm": 0.029702844098210335,
"learning_rate": 0.0001,
"loss": 0.0014,
"step": 310
},
{
"epoch": 0.32,
"grad_norm": 0.016342662274837494,
"learning_rate": 0.0001,
"loss": 0.0014,
"step": 320
},
{
"epoch": 0.33,
"grad_norm": 0.013499235734343529,
"learning_rate": 0.0001,
"loss": 0.0013,
"step": 330
},
{
"epoch": 0.34,
"grad_norm": 0.011413372121751308,
"learning_rate": 0.0001,
"loss": 0.0012,
"step": 340
},
{
"epoch": 0.35,
"grad_norm": 0.09215894341468811,
"learning_rate": 0.0001,
"loss": 0.0073,
"step": 350
},
{
"epoch": 0.36,
"grad_norm": 0.06609797477722168,
"learning_rate": 0.0001,
"loss": 0.0084,
"step": 360
},
{
"epoch": 0.37,
"grad_norm": 0.03970978036522865,
"learning_rate": 0.0001,
"loss": 0.0075,
"step": 370
},
{
"epoch": 0.38,
"grad_norm": 0.029625259339809418,
"learning_rate": 0.0001,
"loss": 0.0059,
"step": 380
},
{
"epoch": 0.39,
"grad_norm": 0.02456456422805786,
"learning_rate": 0.0001,
"loss": 0.005,
"step": 390
},
{
"epoch": 0.4,
"grad_norm": 0.03191933035850525,
"learning_rate": 0.0001,
"loss": 0.0045,
"step": 400
},
{
"epoch": 0.41,
"grad_norm": 0.01918269693851471,
"learning_rate": 0.0001,
"loss": 0.0037,
"step": 410
},
{
"epoch": 0.42,
"grad_norm": 0.018161766231060028,
"learning_rate": 0.0001,
"loss": 0.0031,
"step": 420
},
{
"epoch": 0.43,
"grad_norm": 0.019575210288167,
"learning_rate": 0.0001,
"loss": 0.0026,
"step": 430
},
{
"epoch": 0.44,
"grad_norm": 0.026317287236452103,
"learning_rate": 0.0001,
"loss": 0.0023,
"step": 440
},
{
"epoch": 0.45,
"grad_norm": 0.040029872208833694,
"learning_rate": 0.0001,
"loss": 0.0026,
"step": 450
},
{
"epoch": 0.46,
"grad_norm": 0.013975433073937893,
"learning_rate": 0.0001,
"loss": 0.0022,
"step": 460
},
{
"epoch": 0.47,
"grad_norm": 0.03210354968905449,
"learning_rate": 0.0001,
"loss": 0.0017,
"step": 470
},
{
"epoch": 0.48,
"grad_norm": 0.01889188587665558,
"learning_rate": 0.0001,
"loss": 0.0019,
"step": 480
},
{
"epoch": 0.49,
"grad_norm": 0.013832672499120235,
"learning_rate": 0.0001,
"loss": 0.0016,
"step": 490
},
{
"epoch": 0.5,
"grad_norm": 0.057756196707487106,
"learning_rate": 0.0001,
"loss": 0.0018,
"step": 500
},
{
"epoch": 0.5,
"step": 500,
"total_flos": 6.856066495152128e+17,
"train_loss": 0.07727908698283135,
"train_runtime": 9003.3191,
"train_samples_per_second": 1.111,
"train_steps_per_second": 0.056
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.856066495152128e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}