CodCodingCode's picture
Upload folder using huggingface_hub
765af1a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0051031256644696,
"eval_steps": 500,
"global_step": 6500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015464017165059054,
"grad_norm": 4352.0,
"learning_rate": 1.9972307692307693e-05,
"loss": 10.9174,
"step": 10
},
{
"epoch": 0.0030928034330118107,
"grad_norm": 71168.0,
"learning_rate": 1.9941538461538464e-05,
"loss": 11.9649,
"step": 20
},
{
"epoch": 0.004639205149517716,
"grad_norm": 190.0,
"learning_rate": 1.9910769230769232e-05,
"loss": 5.27,
"step": 30
},
{
"epoch": 0.0061856068660236215,
"grad_norm": 16.125,
"learning_rate": 1.9880000000000003e-05,
"loss": 0.3647,
"step": 40
},
{
"epoch": 0.007732008582529527,
"grad_norm": 3.890625,
"learning_rate": 1.984923076923077e-05,
"loss": 0.3099,
"step": 50
},
{
"epoch": 0.009278410299035433,
"grad_norm": 1.8828125,
"learning_rate": 1.9818461538461538e-05,
"loss": 0.2842,
"step": 60
},
{
"epoch": 0.010824812015541337,
"grad_norm": 1.6953125,
"learning_rate": 1.978769230769231e-05,
"loss": 0.2943,
"step": 70
},
{
"epoch": 0.012371213732047243,
"grad_norm": 1.1640625,
"learning_rate": 1.9756923076923077e-05,
"loss": 0.3539,
"step": 80
},
{
"epoch": 0.013917615448553147,
"grad_norm": 1.046875,
"learning_rate": 1.9726153846153848e-05,
"loss": 0.259,
"step": 90
},
{
"epoch": 0.015464017165059053,
"grad_norm": 1.203125,
"learning_rate": 1.9695384615384616e-05,
"loss": 0.2741,
"step": 100
},
{
"epoch": 0.01701041888156496,
"grad_norm": 1.0390625,
"learning_rate": 1.9664615384615387e-05,
"loss": 0.281,
"step": 110
},
{
"epoch": 0.018556820598070865,
"grad_norm": 0.921875,
"learning_rate": 1.9633846153846155e-05,
"loss": 0.2586,
"step": 120
},
{
"epoch": 0.020103222314576768,
"grad_norm": 0.921875,
"learning_rate": 1.9603076923076926e-05,
"loss": 0.2776,
"step": 130
},
{
"epoch": 0.021649624031082674,
"grad_norm": 1.140625,
"learning_rate": 1.9572307692307693e-05,
"loss": 0.3186,
"step": 140
},
{
"epoch": 0.02319602574758858,
"grad_norm": 0.85546875,
"learning_rate": 1.9541538461538464e-05,
"loss": 0.3315,
"step": 150
},
{
"epoch": 0.024742427464094486,
"grad_norm": 1.1171875,
"learning_rate": 1.9510769230769232e-05,
"loss": 0.257,
"step": 160
},
{
"epoch": 0.026288829180600392,
"grad_norm": 1.1640625,
"learning_rate": 1.948e-05,
"loss": 0.2592,
"step": 170
},
{
"epoch": 0.027835230897106295,
"grad_norm": 0.91015625,
"learning_rate": 1.944923076923077e-05,
"loss": 0.2703,
"step": 180
},
{
"epoch": 0.0293816326136122,
"grad_norm": 0.94140625,
"learning_rate": 1.941846153846154e-05,
"loss": 0.2547,
"step": 190
},
{
"epoch": 0.030928034330118107,
"grad_norm": 1.078125,
"learning_rate": 1.938769230769231e-05,
"loss": 0.3182,
"step": 200
},
{
"epoch": 0.03247443604662401,
"grad_norm": 0.94140625,
"learning_rate": 1.9356923076923077e-05,
"loss": 0.3005,
"step": 210
},
{
"epoch": 0.03402083776312992,
"grad_norm": 1.09375,
"learning_rate": 1.932615384615385e-05,
"loss": 0.2693,
"step": 220
},
{
"epoch": 0.035567239479635825,
"grad_norm": 0.95703125,
"learning_rate": 1.929538461538462e-05,
"loss": 0.2925,
"step": 230
},
{
"epoch": 0.03711364119614173,
"grad_norm": 1.078125,
"learning_rate": 1.9264615384615387e-05,
"loss": 0.3165,
"step": 240
},
{
"epoch": 0.03866004291264763,
"grad_norm": 0.82421875,
"learning_rate": 1.9233846153846155e-05,
"loss": 0.2606,
"step": 250
},
{
"epoch": 0.040206444629153536,
"grad_norm": 1.109375,
"learning_rate": 1.9203076923076923e-05,
"loss": 0.324,
"step": 260
},
{
"epoch": 0.04175284634565944,
"grad_norm": 1.0390625,
"learning_rate": 1.9172307692307694e-05,
"loss": 0.2787,
"step": 270
},
{
"epoch": 0.04329924806216535,
"grad_norm": 1.046875,
"learning_rate": 1.914153846153846e-05,
"loss": 0.3092,
"step": 280
},
{
"epoch": 0.044845649778671254,
"grad_norm": 0.875,
"learning_rate": 1.9110769230769233e-05,
"loss": 0.2831,
"step": 290
},
{
"epoch": 0.04639205149517716,
"grad_norm": 1.0390625,
"learning_rate": 1.908e-05,
"loss": 0.282,
"step": 300
},
{
"epoch": 0.047938453211683066,
"grad_norm": 1.8125,
"learning_rate": 1.904923076923077e-05,
"loss": 0.3863,
"step": 310
},
{
"epoch": 0.04948485492818897,
"grad_norm": 0.85546875,
"learning_rate": 1.901846153846154e-05,
"loss": 0.246,
"step": 320
},
{
"epoch": 0.05103125664469488,
"grad_norm": 1.3984375,
"learning_rate": 1.898769230769231e-05,
"loss": 0.3483,
"step": 330
},
{
"epoch": 0.052577658361200784,
"grad_norm": 1.0546875,
"learning_rate": 1.8956923076923078e-05,
"loss": 0.4107,
"step": 340
},
{
"epoch": 0.05412406007770668,
"grad_norm": 1.0,
"learning_rate": 1.892615384615385e-05,
"loss": 0.2813,
"step": 350
},
{
"epoch": 0.05567046179421259,
"grad_norm": 0.828125,
"learning_rate": 1.8895384615384617e-05,
"loss": 0.283,
"step": 360
},
{
"epoch": 0.057216863510718495,
"grad_norm": 0.9765625,
"learning_rate": 1.8864615384615384e-05,
"loss": 0.268,
"step": 370
},
{
"epoch": 0.0587632652272244,
"grad_norm": 1.0859375,
"learning_rate": 1.8833846153846155e-05,
"loss": 0.2852,
"step": 380
},
{
"epoch": 0.06030966694373031,
"grad_norm": 0.921875,
"learning_rate": 1.8803076923076923e-05,
"loss": 0.2477,
"step": 390
},
{
"epoch": 0.06185606866023621,
"grad_norm": 1.0,
"learning_rate": 1.8772307692307694e-05,
"loss": 0.2418,
"step": 400
},
{
"epoch": 0.06340247037674211,
"grad_norm": 0.85546875,
"learning_rate": 1.8741538461538462e-05,
"loss": 0.2218,
"step": 410
},
{
"epoch": 0.06494887209324803,
"grad_norm": 1.1484375,
"learning_rate": 1.8710769230769233e-05,
"loss": 0.2616,
"step": 420
},
{
"epoch": 0.06649527380975392,
"grad_norm": 1.1171875,
"learning_rate": 1.8680000000000004e-05,
"loss": 0.3475,
"step": 430
},
{
"epoch": 0.06804167552625984,
"grad_norm": 1.125,
"learning_rate": 1.8649230769230772e-05,
"loss": 0.3025,
"step": 440
},
{
"epoch": 0.06958807724276574,
"grad_norm": 0.93359375,
"learning_rate": 1.861846153846154e-05,
"loss": 0.3119,
"step": 450
},
{
"epoch": 0.07113447895927165,
"grad_norm": 0.90625,
"learning_rate": 1.8587692307692307e-05,
"loss": 0.3004,
"step": 460
},
{
"epoch": 0.07268088067577755,
"grad_norm": 0.97265625,
"learning_rate": 1.8556923076923078e-05,
"loss": 0.2957,
"step": 470
},
{
"epoch": 0.07422728239228346,
"grad_norm": 1.1328125,
"learning_rate": 1.8526153846153846e-05,
"loss": 0.3162,
"step": 480
},
{
"epoch": 0.07577368410878936,
"grad_norm": 1.75,
"learning_rate": 1.8495384615384617e-05,
"loss": 0.3637,
"step": 490
},
{
"epoch": 0.07732008582529526,
"grad_norm": 1.0390625,
"learning_rate": 1.8464615384615385e-05,
"loss": 0.2379,
"step": 500
},
{
"epoch": 0.07886648754180117,
"grad_norm": 0.984375,
"learning_rate": 1.8433846153846156e-05,
"loss": 0.3098,
"step": 510
},
{
"epoch": 0.08041288925830707,
"grad_norm": 0.99609375,
"learning_rate": 1.8403076923076924e-05,
"loss": 0.3977,
"step": 520
},
{
"epoch": 0.08195929097481298,
"grad_norm": 0.9609375,
"learning_rate": 1.8372307692307695e-05,
"loss": 0.3034,
"step": 530
},
{
"epoch": 0.08350569269131888,
"grad_norm": 0.7421875,
"learning_rate": 1.8341538461538462e-05,
"loss": 0.2327,
"step": 540
},
{
"epoch": 0.0850520944078248,
"grad_norm": 1.0625,
"learning_rate": 1.8310769230769233e-05,
"loss": 0.2561,
"step": 550
},
{
"epoch": 0.0865984961243307,
"grad_norm": 0.8515625,
"learning_rate": 1.828e-05,
"loss": 0.3739,
"step": 560
},
{
"epoch": 0.08814489784083661,
"grad_norm": 0.87890625,
"learning_rate": 1.824923076923077e-05,
"loss": 0.3605,
"step": 570
},
{
"epoch": 0.08969129955734251,
"grad_norm": 1.0,
"learning_rate": 1.821846153846154e-05,
"loss": 0.2557,
"step": 580
},
{
"epoch": 0.09123770127384842,
"grad_norm": 1.03125,
"learning_rate": 1.8187692307692308e-05,
"loss": 0.2806,
"step": 590
},
{
"epoch": 0.09278410299035432,
"grad_norm": 0.875,
"learning_rate": 1.815692307692308e-05,
"loss": 0.2977,
"step": 600
},
{
"epoch": 0.09433050470686022,
"grad_norm": 0.7890625,
"learning_rate": 1.8126153846153846e-05,
"loss": 0.2845,
"step": 610
},
{
"epoch": 0.09587690642336613,
"grad_norm": 0.859375,
"learning_rate": 1.8095384615384618e-05,
"loss": 0.3309,
"step": 620
},
{
"epoch": 0.09742330813987203,
"grad_norm": 0.95703125,
"learning_rate": 1.806461538461539e-05,
"loss": 0.3197,
"step": 630
},
{
"epoch": 0.09896970985637794,
"grad_norm": 0.83203125,
"learning_rate": 1.8033846153846156e-05,
"loss": 0.2654,
"step": 640
},
{
"epoch": 0.10051611157288384,
"grad_norm": 1.09375,
"learning_rate": 1.8003076923076924e-05,
"loss": 0.2954,
"step": 650
},
{
"epoch": 0.10206251328938976,
"grad_norm": 1.0390625,
"learning_rate": 1.7972307692307692e-05,
"loss": 0.3237,
"step": 660
},
{
"epoch": 0.10360891500589565,
"grad_norm": 0.82421875,
"learning_rate": 1.7941538461538463e-05,
"loss": 0.2887,
"step": 670
},
{
"epoch": 0.10515531672240157,
"grad_norm": 1.0546875,
"learning_rate": 1.791076923076923e-05,
"loss": 0.3018,
"step": 680
},
{
"epoch": 0.10670171843890747,
"grad_norm": 0.890625,
"learning_rate": 1.788e-05,
"loss": 0.261,
"step": 690
},
{
"epoch": 0.10824812015541337,
"grad_norm": 0.84375,
"learning_rate": 1.784923076923077e-05,
"loss": 0.254,
"step": 700
},
{
"epoch": 0.10979452187191928,
"grad_norm": 0.96875,
"learning_rate": 1.781846153846154e-05,
"loss": 0.2944,
"step": 710
},
{
"epoch": 0.11134092358842518,
"grad_norm": 0.9375,
"learning_rate": 1.778769230769231e-05,
"loss": 0.3163,
"step": 720
},
{
"epoch": 0.11288732530493109,
"grad_norm": 0.640625,
"learning_rate": 1.775692307692308e-05,
"loss": 0.2838,
"step": 730
},
{
"epoch": 0.11443372702143699,
"grad_norm": 0.98828125,
"learning_rate": 1.7726153846153847e-05,
"loss": 0.236,
"step": 740
},
{
"epoch": 0.1159801287379429,
"grad_norm": 0.6796875,
"learning_rate": 1.7695384615384618e-05,
"loss": 0.2164,
"step": 750
},
{
"epoch": 0.1175265304544488,
"grad_norm": 0.7109375,
"learning_rate": 1.7664615384615386e-05,
"loss": 0.3331,
"step": 760
},
{
"epoch": 0.11907293217095472,
"grad_norm": 0.859375,
"learning_rate": 1.7633846153846153e-05,
"loss": 0.303,
"step": 770
},
{
"epoch": 0.12061933388746061,
"grad_norm": 0.80078125,
"learning_rate": 1.7603076923076924e-05,
"loss": 0.3264,
"step": 780
},
{
"epoch": 0.12216573560396653,
"grad_norm": 0.85546875,
"learning_rate": 1.7572307692307692e-05,
"loss": 0.2097,
"step": 790
},
{
"epoch": 0.12371213732047243,
"grad_norm": 0.80859375,
"learning_rate": 1.7541538461538463e-05,
"loss": 0.2456,
"step": 800
},
{
"epoch": 0.12525853903697834,
"grad_norm": 0.875,
"learning_rate": 1.751076923076923e-05,
"loss": 0.2877,
"step": 810
},
{
"epoch": 0.12680494075348422,
"grad_norm": 0.93359375,
"learning_rate": 1.7480000000000002e-05,
"loss": 0.2902,
"step": 820
},
{
"epoch": 0.12835134246999014,
"grad_norm": 0.875,
"learning_rate": 1.7449230769230773e-05,
"loss": 0.2357,
"step": 830
},
{
"epoch": 0.12989774418649605,
"grad_norm": 0.79296875,
"learning_rate": 1.741846153846154e-05,
"loss": 0.2926,
"step": 840
},
{
"epoch": 0.13144414590300196,
"grad_norm": 0.85546875,
"learning_rate": 1.738769230769231e-05,
"loss": 0.2301,
"step": 850
},
{
"epoch": 0.13299054761950785,
"grad_norm": 1.1015625,
"learning_rate": 1.7356923076923076e-05,
"loss": 0.2501,
"step": 860
},
{
"epoch": 0.13453694933601376,
"grad_norm": 1.0234375,
"learning_rate": 1.7326153846153847e-05,
"loss": 0.2393,
"step": 870
},
{
"epoch": 0.13608335105251967,
"grad_norm": 1.0625,
"learning_rate": 1.7295384615384615e-05,
"loss": 0.2337,
"step": 880
},
{
"epoch": 0.1376297527690256,
"grad_norm": 1.1171875,
"learning_rate": 1.7264615384615386e-05,
"loss": 0.3147,
"step": 890
},
{
"epoch": 0.13917615448553147,
"grad_norm": 0.828125,
"learning_rate": 1.7233846153846154e-05,
"loss": 0.2949,
"step": 900
},
{
"epoch": 0.14072255620203739,
"grad_norm": 1.1484375,
"learning_rate": 1.7203076923076925e-05,
"loss": 0.3394,
"step": 910
},
{
"epoch": 0.1422689579185433,
"grad_norm": 0.72265625,
"learning_rate": 1.7172307692307696e-05,
"loss": 0.3119,
"step": 920
},
{
"epoch": 0.14381535963504918,
"grad_norm": 0.8828125,
"learning_rate": 1.7141538461538464e-05,
"loss": 0.2959,
"step": 930
},
{
"epoch": 0.1453617613515551,
"grad_norm": 0.86328125,
"learning_rate": 1.711076923076923e-05,
"loss": 0.2677,
"step": 940
},
{
"epoch": 0.146908163068061,
"grad_norm": 0.99609375,
"learning_rate": 1.7080000000000002e-05,
"loss": 0.2575,
"step": 950
},
{
"epoch": 0.14845456478456692,
"grad_norm": 1.1640625,
"learning_rate": 1.704923076923077e-05,
"loss": 0.2419,
"step": 960
},
{
"epoch": 0.1500009665010728,
"grad_norm": 0.86328125,
"learning_rate": 1.7018461538461538e-05,
"loss": 0.2631,
"step": 970
},
{
"epoch": 0.15154736821757872,
"grad_norm": 0.8125,
"learning_rate": 1.698769230769231e-05,
"loss": 0.2415,
"step": 980
},
{
"epoch": 0.15309376993408463,
"grad_norm": 0.96484375,
"learning_rate": 1.6956923076923077e-05,
"loss": 0.2498,
"step": 990
},
{
"epoch": 0.15464017165059052,
"grad_norm": 0.93359375,
"learning_rate": 1.6926153846153848e-05,
"loss": 0.2845,
"step": 1000
},
{
"epoch": 0.15618657336709643,
"grad_norm": 1.0,
"learning_rate": 1.6895384615384615e-05,
"loss": 0.3159,
"step": 1010
},
{
"epoch": 0.15773297508360234,
"grad_norm": 1.15625,
"learning_rate": 1.6864615384615387e-05,
"loss": 0.2969,
"step": 1020
},
{
"epoch": 0.15927937680010826,
"grad_norm": 0.9765625,
"learning_rate": 1.6833846153846158e-05,
"loss": 0.3195,
"step": 1030
},
{
"epoch": 0.16082577851661414,
"grad_norm": 0.96484375,
"learning_rate": 1.6803076923076925e-05,
"loss": 0.3086,
"step": 1040
},
{
"epoch": 0.16237218023312006,
"grad_norm": 0.87109375,
"learning_rate": 1.6772307692307693e-05,
"loss": 0.297,
"step": 1050
},
{
"epoch": 0.16391858194962597,
"grad_norm": 1.015625,
"learning_rate": 1.674153846153846e-05,
"loss": 0.2677,
"step": 1060
},
{
"epoch": 0.16546498366613188,
"grad_norm": 1.015625,
"learning_rate": 1.6710769230769232e-05,
"loss": 0.294,
"step": 1070
},
{
"epoch": 0.16701138538263777,
"grad_norm": 0.953125,
"learning_rate": 1.668e-05,
"loss": 0.2483,
"step": 1080
},
{
"epoch": 0.16855778709914368,
"grad_norm": 0.9375,
"learning_rate": 1.664923076923077e-05,
"loss": 0.2564,
"step": 1090
},
{
"epoch": 0.1701041888156496,
"grad_norm": 0.96484375,
"learning_rate": 1.661846153846154e-05,
"loss": 0.2363,
"step": 1100
},
{
"epoch": 0.17165059053215548,
"grad_norm": 1.015625,
"learning_rate": 1.658769230769231e-05,
"loss": 0.2486,
"step": 1110
},
{
"epoch": 0.1731969922486614,
"grad_norm": 1.046875,
"learning_rate": 1.655692307692308e-05,
"loss": 0.3142,
"step": 1120
},
{
"epoch": 0.1747433939651673,
"grad_norm": 1.609375,
"learning_rate": 1.6526153846153848e-05,
"loss": 0.4319,
"step": 1130
},
{
"epoch": 0.17628979568167322,
"grad_norm": 0.9609375,
"learning_rate": 1.6495384615384616e-05,
"loss": 0.2727,
"step": 1140
},
{
"epoch": 0.1778361973981791,
"grad_norm": 0.92578125,
"learning_rate": 1.6464615384615387e-05,
"loss": 0.2472,
"step": 1150
},
{
"epoch": 0.17938259911468502,
"grad_norm": 0.9609375,
"learning_rate": 1.6433846153846155e-05,
"loss": 0.3036,
"step": 1160
},
{
"epoch": 0.18092900083119093,
"grad_norm": 0.99609375,
"learning_rate": 1.6403076923076922e-05,
"loss": 0.2199,
"step": 1170
},
{
"epoch": 0.18247540254769684,
"grad_norm": 1.1484375,
"learning_rate": 1.6372307692307693e-05,
"loss": 0.2474,
"step": 1180
},
{
"epoch": 0.18402180426420273,
"grad_norm": 0.9609375,
"learning_rate": 1.634153846153846e-05,
"loss": 0.2892,
"step": 1190
},
{
"epoch": 0.18556820598070864,
"grad_norm": 0.94140625,
"learning_rate": 1.6310769230769232e-05,
"loss": 0.3317,
"step": 1200
},
{
"epoch": 0.18711460769721455,
"grad_norm": 0.85546875,
"learning_rate": 1.628e-05,
"loss": 0.3066,
"step": 1210
},
{
"epoch": 0.18866100941372044,
"grad_norm": 0.88671875,
"learning_rate": 1.624923076923077e-05,
"loss": 0.2811,
"step": 1220
},
{
"epoch": 0.19020741113022635,
"grad_norm": 0.87890625,
"learning_rate": 1.6218461538461542e-05,
"loss": 0.2503,
"step": 1230
},
{
"epoch": 0.19175381284673226,
"grad_norm": 0.9140625,
"learning_rate": 1.618769230769231e-05,
"loss": 0.3128,
"step": 1240
},
{
"epoch": 0.19330021456323818,
"grad_norm": 0.953125,
"learning_rate": 1.6156923076923078e-05,
"loss": 0.3067,
"step": 1250
},
{
"epoch": 0.19484661627974406,
"grad_norm": 1.203125,
"learning_rate": 1.6126153846153845e-05,
"loss": 0.2975,
"step": 1260
},
{
"epoch": 0.19639301799624997,
"grad_norm": 1.140625,
"learning_rate": 1.6095384615384616e-05,
"loss": 0.3083,
"step": 1270
},
{
"epoch": 0.1979394197127559,
"grad_norm": 0.66796875,
"learning_rate": 1.6064615384615384e-05,
"loss": 0.2786,
"step": 1280
},
{
"epoch": 0.19948582142926177,
"grad_norm": 0.890625,
"learning_rate": 1.6033846153846155e-05,
"loss": 0.404,
"step": 1290
},
{
"epoch": 0.20103222314576769,
"grad_norm": 0.8984375,
"learning_rate": 1.6003076923076923e-05,
"loss": 0.3213,
"step": 1300
},
{
"epoch": 0.2025786248622736,
"grad_norm": 0.91015625,
"learning_rate": 1.5972307692307694e-05,
"loss": 0.24,
"step": 1310
},
{
"epoch": 0.2041250265787795,
"grad_norm": 1.2890625,
"learning_rate": 1.5941538461538465e-05,
"loss": 0.2711,
"step": 1320
},
{
"epoch": 0.2056714282952854,
"grad_norm": 0.83203125,
"learning_rate": 1.5910769230769233e-05,
"loss": 0.2493,
"step": 1330
},
{
"epoch": 0.2072178300117913,
"grad_norm": 1.4609375,
"learning_rate": 1.588e-05,
"loss": 0.353,
"step": 1340
},
{
"epoch": 0.20876423172829722,
"grad_norm": 0.9375,
"learning_rate": 1.584923076923077e-05,
"loss": 0.2534,
"step": 1350
},
{
"epoch": 0.21031063344480314,
"grad_norm": 0.64453125,
"learning_rate": 1.581846153846154e-05,
"loss": 0.2088,
"step": 1360
},
{
"epoch": 0.21185703516130902,
"grad_norm": 0.94921875,
"learning_rate": 1.5787692307692307e-05,
"loss": 0.3146,
"step": 1370
},
{
"epoch": 0.21340343687781493,
"grad_norm": 1.125,
"learning_rate": 1.5756923076923078e-05,
"loss": 0.2947,
"step": 1380
},
{
"epoch": 0.21494983859432085,
"grad_norm": 0.6796875,
"learning_rate": 1.5726153846153846e-05,
"loss": 0.2039,
"step": 1390
},
{
"epoch": 0.21649624031082673,
"grad_norm": 0.828125,
"learning_rate": 1.5695384615384617e-05,
"loss": 0.252,
"step": 1400
},
{
"epoch": 0.21804264202733264,
"grad_norm": 0.875,
"learning_rate": 1.5664615384615388e-05,
"loss": 0.2689,
"step": 1410
},
{
"epoch": 0.21958904374383856,
"grad_norm": 1.0390625,
"learning_rate": 1.5633846153846156e-05,
"loss": 0.3239,
"step": 1420
},
{
"epoch": 0.22113544546034447,
"grad_norm": 1.0859375,
"learning_rate": 1.5603076923076927e-05,
"loss": 0.2891,
"step": 1430
},
{
"epoch": 0.22268184717685036,
"grad_norm": 0.75390625,
"learning_rate": 1.5572307692307694e-05,
"loss": 0.3306,
"step": 1440
},
{
"epoch": 0.22422824889335627,
"grad_norm": 1.0859375,
"learning_rate": 1.5541538461538462e-05,
"loss": 0.2971,
"step": 1450
},
{
"epoch": 0.22577465060986218,
"grad_norm": 0.953125,
"learning_rate": 1.551076923076923e-05,
"loss": 0.2892,
"step": 1460
},
{
"epoch": 0.2273210523263681,
"grad_norm": 0.75390625,
"learning_rate": 1.548e-05,
"loss": 0.2773,
"step": 1470
},
{
"epoch": 0.22886745404287398,
"grad_norm": 0.9453125,
"learning_rate": 1.544923076923077e-05,
"loss": 0.2767,
"step": 1480
},
{
"epoch": 0.2304138557593799,
"grad_norm": 1.046875,
"learning_rate": 1.541846153846154e-05,
"loss": 0.2899,
"step": 1490
},
{
"epoch": 0.2319602574758858,
"grad_norm": 0.890625,
"learning_rate": 1.5387692307692307e-05,
"loss": 0.2521,
"step": 1500
},
{
"epoch": 0.2335066591923917,
"grad_norm": 1.03125,
"learning_rate": 1.535692307692308e-05,
"loss": 0.2479,
"step": 1510
},
{
"epoch": 0.2350530609088976,
"grad_norm": 1.0,
"learning_rate": 1.532615384615385e-05,
"loss": 0.3154,
"step": 1520
},
{
"epoch": 0.23659946262540352,
"grad_norm": 0.83984375,
"learning_rate": 1.5295384615384617e-05,
"loss": 0.3391,
"step": 1530
},
{
"epoch": 0.23814586434190943,
"grad_norm": 1.2265625,
"learning_rate": 1.5264615384615385e-05,
"loss": 0.265,
"step": 1540
},
{
"epoch": 0.23969226605841532,
"grad_norm": 0.99609375,
"learning_rate": 1.5233846153846154e-05,
"loss": 0.2949,
"step": 1550
},
{
"epoch": 0.24123866777492123,
"grad_norm": 1.265625,
"learning_rate": 1.5203076923076925e-05,
"loss": 0.3136,
"step": 1560
},
{
"epoch": 0.24278506949142714,
"grad_norm": 1.4609375,
"learning_rate": 1.5172307692307693e-05,
"loss": 0.3073,
"step": 1570
},
{
"epoch": 0.24433147120793305,
"grad_norm": 1.140625,
"learning_rate": 1.5141538461538463e-05,
"loss": 0.3271,
"step": 1580
},
{
"epoch": 0.24587787292443894,
"grad_norm": 1.03125,
"learning_rate": 1.5110769230769232e-05,
"loss": 0.2722,
"step": 1590
},
{
"epoch": 0.24742427464094485,
"grad_norm": 0.87109375,
"learning_rate": 1.5080000000000001e-05,
"loss": 0.3513,
"step": 1600
},
{
"epoch": 0.24897067635745077,
"grad_norm": 1.1796875,
"learning_rate": 1.504923076923077e-05,
"loss": 0.2212,
"step": 1610
},
{
"epoch": 0.2505170780739567,
"grad_norm": 0.890625,
"learning_rate": 1.501846153846154e-05,
"loss": 0.2914,
"step": 1620
},
{
"epoch": 0.25206347979046256,
"grad_norm": 0.83984375,
"learning_rate": 1.498769230769231e-05,
"loss": 0.281,
"step": 1630
},
{
"epoch": 0.25360988150696845,
"grad_norm": 0.86328125,
"learning_rate": 1.4956923076923077e-05,
"loss": 0.2509,
"step": 1640
},
{
"epoch": 0.2551562832234744,
"grad_norm": 1.2578125,
"learning_rate": 1.4926153846153848e-05,
"loss": 0.2994,
"step": 1650
},
{
"epoch": 0.2567026849399803,
"grad_norm": 0.90625,
"learning_rate": 1.4895384615384616e-05,
"loss": 0.2839,
"step": 1660
},
{
"epoch": 0.2582490866564862,
"grad_norm": 0.81640625,
"learning_rate": 1.4864615384615385e-05,
"loss": 0.229,
"step": 1670
},
{
"epoch": 0.2597954883729921,
"grad_norm": 0.9453125,
"learning_rate": 1.4833846153846155e-05,
"loss": 0.2381,
"step": 1680
},
{
"epoch": 0.261341890089498,
"grad_norm": 0.9140625,
"learning_rate": 1.4803076923076924e-05,
"loss": 0.3495,
"step": 1690
},
{
"epoch": 0.2628882918060039,
"grad_norm": 0.7421875,
"learning_rate": 1.4772307692307692e-05,
"loss": 0.2756,
"step": 1700
},
{
"epoch": 0.2644346935225098,
"grad_norm": 0.9375,
"learning_rate": 1.4741538461538463e-05,
"loss": 0.3189,
"step": 1710
},
{
"epoch": 0.2659810952390157,
"grad_norm": 0.765625,
"learning_rate": 1.4710769230769232e-05,
"loss": 0.289,
"step": 1720
},
{
"epoch": 0.26752749695552164,
"grad_norm": 1.0703125,
"learning_rate": 1.4680000000000002e-05,
"loss": 0.2848,
"step": 1730
},
{
"epoch": 0.2690738986720275,
"grad_norm": 2.96875,
"learning_rate": 1.4649230769230771e-05,
"loss": 0.3115,
"step": 1740
},
{
"epoch": 0.2706203003885334,
"grad_norm": 0.79296875,
"learning_rate": 1.4618461538461539e-05,
"loss": 0.3075,
"step": 1750
},
{
"epoch": 0.27216670210503935,
"grad_norm": 1.0625,
"learning_rate": 1.458769230769231e-05,
"loss": 0.2705,
"step": 1760
},
{
"epoch": 0.27371310382154523,
"grad_norm": 0.98046875,
"learning_rate": 1.4556923076923078e-05,
"loss": 0.266,
"step": 1770
},
{
"epoch": 0.2752595055380512,
"grad_norm": 0.8203125,
"learning_rate": 1.4526153846153847e-05,
"loss": 0.3179,
"step": 1780
},
{
"epoch": 0.27680590725455706,
"grad_norm": 0.953125,
"learning_rate": 1.4495384615384616e-05,
"loss": 0.2775,
"step": 1790
},
{
"epoch": 0.27835230897106295,
"grad_norm": 0.87890625,
"learning_rate": 1.4464615384615386e-05,
"loss": 0.3279,
"step": 1800
},
{
"epoch": 0.2798987106875689,
"grad_norm": 0.8515625,
"learning_rate": 1.4433846153846155e-05,
"loss": 0.2373,
"step": 1810
},
{
"epoch": 0.28144511240407477,
"grad_norm": 0.80859375,
"learning_rate": 1.4403076923076925e-05,
"loss": 0.2216,
"step": 1820
},
{
"epoch": 0.28299151412058066,
"grad_norm": 0.93359375,
"learning_rate": 1.4372307692307694e-05,
"loss": 0.3206,
"step": 1830
},
{
"epoch": 0.2845379158370866,
"grad_norm": 1.109375,
"learning_rate": 1.4341538461538462e-05,
"loss": 0.2467,
"step": 1840
},
{
"epoch": 0.2860843175535925,
"grad_norm": 0.9765625,
"learning_rate": 1.4310769230769233e-05,
"loss": 0.2818,
"step": 1850
},
{
"epoch": 0.28763071927009837,
"grad_norm": 0.8984375,
"learning_rate": 1.428e-05,
"loss": 0.2477,
"step": 1860
},
{
"epoch": 0.2891771209866043,
"grad_norm": 0.734375,
"learning_rate": 1.4249230769230772e-05,
"loss": 0.2576,
"step": 1870
},
{
"epoch": 0.2907235227031102,
"grad_norm": 0.91015625,
"learning_rate": 1.421846153846154e-05,
"loss": 0.2841,
"step": 1880
},
{
"epoch": 0.2922699244196161,
"grad_norm": 0.98828125,
"learning_rate": 1.4187692307692309e-05,
"loss": 0.3371,
"step": 1890
},
{
"epoch": 0.293816326136122,
"grad_norm": 1.0,
"learning_rate": 1.4156923076923076e-05,
"loss": 0.3037,
"step": 1900
},
{
"epoch": 0.2953627278526279,
"grad_norm": 1.015625,
"learning_rate": 1.4126153846153847e-05,
"loss": 0.2526,
"step": 1910
},
{
"epoch": 0.29690912956913385,
"grad_norm": 0.98828125,
"learning_rate": 1.4095384615384617e-05,
"loss": 0.2125,
"step": 1920
},
{
"epoch": 0.29845553128563973,
"grad_norm": 0.8125,
"learning_rate": 1.4064615384615386e-05,
"loss": 0.2783,
"step": 1930
},
{
"epoch": 0.3000019330021456,
"grad_norm": 0.90625,
"learning_rate": 1.4033846153846156e-05,
"loss": 0.3131,
"step": 1940
},
{
"epoch": 0.30154833471865156,
"grad_norm": 0.78125,
"learning_rate": 1.4003076923076923e-05,
"loss": 0.3226,
"step": 1950
},
{
"epoch": 0.30309473643515744,
"grad_norm": 0.83203125,
"learning_rate": 1.3972307692307694e-05,
"loss": 0.2819,
"step": 1960
},
{
"epoch": 0.3046411381516633,
"grad_norm": 0.78515625,
"learning_rate": 1.3941538461538462e-05,
"loss": 0.2868,
"step": 1970
},
{
"epoch": 0.30618753986816927,
"grad_norm": 0.92578125,
"learning_rate": 1.3910769230769232e-05,
"loss": 0.2615,
"step": 1980
},
{
"epoch": 0.30773394158467515,
"grad_norm": 0.87109375,
"learning_rate": 1.3880000000000001e-05,
"loss": 0.255,
"step": 1990
},
{
"epoch": 0.30928034330118104,
"grad_norm": 0.82421875,
"learning_rate": 1.384923076923077e-05,
"loss": 0.251,
"step": 2000
},
{
"epoch": 0.310826745017687,
"grad_norm": 0.9609375,
"learning_rate": 1.3818461538461541e-05,
"loss": 0.2983,
"step": 2010
},
{
"epoch": 0.31237314673419286,
"grad_norm": 1.0859375,
"learning_rate": 1.3787692307692309e-05,
"loss": 0.2705,
"step": 2020
},
{
"epoch": 0.3139195484506988,
"grad_norm": 0.890625,
"learning_rate": 1.3756923076923079e-05,
"loss": 0.2937,
"step": 2030
},
{
"epoch": 0.3154659501672047,
"grad_norm": 1.1953125,
"learning_rate": 1.3726153846153846e-05,
"loss": 0.3296,
"step": 2040
},
{
"epoch": 0.3170123518837106,
"grad_norm": 1.140625,
"learning_rate": 1.3695384615384617e-05,
"loss": 0.2666,
"step": 2050
},
{
"epoch": 0.3185587536002165,
"grad_norm": 1.0390625,
"learning_rate": 1.3664615384615385e-05,
"loss": 0.3124,
"step": 2060
},
{
"epoch": 0.3201051553167224,
"grad_norm": 1.015625,
"learning_rate": 1.3633846153846156e-05,
"loss": 0.3752,
"step": 2070
},
{
"epoch": 0.3216515570332283,
"grad_norm": 0.9296875,
"learning_rate": 1.3603076923076924e-05,
"loss": 0.2622,
"step": 2080
},
{
"epoch": 0.3231979587497342,
"grad_norm": 0.734375,
"learning_rate": 1.3572307692307693e-05,
"loss": 0.2526,
"step": 2090
},
{
"epoch": 0.3247443604662401,
"grad_norm": 0.80078125,
"learning_rate": 1.3541538461538464e-05,
"loss": 0.2775,
"step": 2100
},
{
"epoch": 0.326290762182746,
"grad_norm": 1.0546875,
"learning_rate": 1.3510769230769232e-05,
"loss": 0.3322,
"step": 2110
},
{
"epoch": 0.32783716389925194,
"grad_norm": 1.0,
"learning_rate": 1.3480000000000001e-05,
"loss": 0.2897,
"step": 2120
},
{
"epoch": 0.3293835656157578,
"grad_norm": 1.0234375,
"learning_rate": 1.344923076923077e-05,
"loss": 0.2815,
"step": 2130
},
{
"epoch": 0.33092996733226376,
"grad_norm": 0.7109375,
"learning_rate": 1.341846153846154e-05,
"loss": 0.2959,
"step": 2140
},
{
"epoch": 0.33247636904876965,
"grad_norm": 0.91015625,
"learning_rate": 1.3387692307692308e-05,
"loss": 0.2571,
"step": 2150
},
{
"epoch": 0.33402277076527553,
"grad_norm": 0.91015625,
"learning_rate": 1.3356923076923079e-05,
"loss": 0.247,
"step": 2160
},
{
"epoch": 0.3355691724817815,
"grad_norm": 0.828125,
"learning_rate": 1.3326153846153847e-05,
"loss": 0.248,
"step": 2170
},
{
"epoch": 0.33711557419828736,
"grad_norm": 0.94140625,
"learning_rate": 1.3295384615384616e-05,
"loss": 0.2438,
"step": 2180
},
{
"epoch": 0.33866197591479325,
"grad_norm": 1.2734375,
"learning_rate": 1.3264615384615385e-05,
"loss": 0.3612,
"step": 2190
},
{
"epoch": 0.3402083776312992,
"grad_norm": 0.9609375,
"learning_rate": 1.3233846153846155e-05,
"loss": 0.3287,
"step": 2200
},
{
"epoch": 0.34175477934780507,
"grad_norm": 0.9609375,
"learning_rate": 1.3203076923076926e-05,
"loss": 0.2756,
"step": 2210
},
{
"epoch": 0.34330118106431096,
"grad_norm": 0.91796875,
"learning_rate": 1.3172307692307694e-05,
"loss": 0.2886,
"step": 2220
},
{
"epoch": 0.3448475827808169,
"grad_norm": 0.87109375,
"learning_rate": 1.3141538461538463e-05,
"loss": 0.2446,
"step": 2230
},
{
"epoch": 0.3463939844973228,
"grad_norm": 0.94140625,
"learning_rate": 1.311076923076923e-05,
"loss": 0.3064,
"step": 2240
},
{
"epoch": 0.3479403862138287,
"grad_norm": 0.8203125,
"learning_rate": 1.3080000000000002e-05,
"loss": 0.2376,
"step": 2250
},
{
"epoch": 0.3494867879303346,
"grad_norm": 0.8359375,
"learning_rate": 1.304923076923077e-05,
"loss": 0.2932,
"step": 2260
},
{
"epoch": 0.3510331896468405,
"grad_norm": 1.4296875,
"learning_rate": 1.301846153846154e-05,
"loss": 0.2979,
"step": 2270
},
{
"epoch": 0.35257959136334643,
"grad_norm": 0.84375,
"learning_rate": 1.2987692307692308e-05,
"loss": 0.2897,
"step": 2280
},
{
"epoch": 0.3541259930798523,
"grad_norm": 0.99609375,
"learning_rate": 1.2956923076923078e-05,
"loss": 0.2744,
"step": 2290
},
{
"epoch": 0.3556723947963582,
"grad_norm": 0.96484375,
"learning_rate": 1.2926153846153849e-05,
"loss": 0.2708,
"step": 2300
},
{
"epoch": 0.35721879651286415,
"grad_norm": 0.81640625,
"learning_rate": 1.2895384615384616e-05,
"loss": 0.2224,
"step": 2310
},
{
"epoch": 0.35876519822937003,
"grad_norm": 1.1015625,
"learning_rate": 1.2864615384615386e-05,
"loss": 0.2728,
"step": 2320
},
{
"epoch": 0.3603115999458759,
"grad_norm": 1.0859375,
"learning_rate": 1.2833846153846155e-05,
"loss": 0.2661,
"step": 2330
},
{
"epoch": 0.36185800166238186,
"grad_norm": 1.046875,
"learning_rate": 1.2803076923076925e-05,
"loss": 0.2892,
"step": 2340
},
{
"epoch": 0.36340440337888774,
"grad_norm": 0.6640625,
"learning_rate": 1.2772307692307692e-05,
"loss": 0.3092,
"step": 2350
},
{
"epoch": 0.3649508050953937,
"grad_norm": 0.84765625,
"learning_rate": 1.2741538461538463e-05,
"loss": 0.2542,
"step": 2360
},
{
"epoch": 0.36649720681189957,
"grad_norm": 0.91796875,
"learning_rate": 1.2710769230769231e-05,
"loss": 0.3589,
"step": 2370
},
{
"epoch": 0.36804360852840545,
"grad_norm": 0.71484375,
"learning_rate": 1.268e-05,
"loss": 0.2237,
"step": 2380
},
{
"epoch": 0.3695900102449114,
"grad_norm": 1.0703125,
"learning_rate": 1.264923076923077e-05,
"loss": 0.3413,
"step": 2390
},
{
"epoch": 0.3711364119614173,
"grad_norm": 1.0234375,
"learning_rate": 1.261846153846154e-05,
"loss": 0.2556,
"step": 2400
},
{
"epoch": 0.37268281367792316,
"grad_norm": 0.9296875,
"learning_rate": 1.258769230769231e-05,
"loss": 0.3087,
"step": 2410
},
{
"epoch": 0.3742292153944291,
"grad_norm": 0.95703125,
"learning_rate": 1.2556923076923078e-05,
"loss": 0.2609,
"step": 2420
},
{
"epoch": 0.375775617110935,
"grad_norm": 1.1328125,
"learning_rate": 1.2526153846153848e-05,
"loss": 0.2572,
"step": 2430
},
{
"epoch": 0.3773220188274409,
"grad_norm": 1.1484375,
"learning_rate": 1.2495384615384615e-05,
"loss": 0.3003,
"step": 2440
},
{
"epoch": 0.3788684205439468,
"grad_norm": 1.1484375,
"learning_rate": 1.2464615384615386e-05,
"loss": 0.259,
"step": 2450
},
{
"epoch": 0.3804148222604527,
"grad_norm": 1.203125,
"learning_rate": 1.2433846153846154e-05,
"loss": 0.2606,
"step": 2460
},
{
"epoch": 0.38196122397695864,
"grad_norm": 0.85546875,
"learning_rate": 1.2403076923076925e-05,
"loss": 0.2351,
"step": 2470
},
{
"epoch": 0.3835076256934645,
"grad_norm": 0.90625,
"learning_rate": 1.2372307692307693e-05,
"loss": 0.2664,
"step": 2480
},
{
"epoch": 0.3850540274099704,
"grad_norm": 0.91796875,
"learning_rate": 1.2341538461538462e-05,
"loss": 0.245,
"step": 2490
},
{
"epoch": 0.38660042912647635,
"grad_norm": 1.109375,
"learning_rate": 1.2310769230769233e-05,
"loss": 0.2781,
"step": 2500
},
{
"epoch": 0.38814683084298224,
"grad_norm": 1.046875,
"learning_rate": 1.2280000000000001e-05,
"loss": 0.2847,
"step": 2510
},
{
"epoch": 0.3896932325594881,
"grad_norm": 0.8203125,
"learning_rate": 1.224923076923077e-05,
"loss": 0.3223,
"step": 2520
},
{
"epoch": 0.39123963427599406,
"grad_norm": 0.9765625,
"learning_rate": 1.221846153846154e-05,
"loss": 0.3068,
"step": 2530
},
{
"epoch": 0.39278603599249995,
"grad_norm": 0.9609375,
"learning_rate": 1.218769230769231e-05,
"loss": 0.2393,
"step": 2540
},
{
"epoch": 0.39433243770900583,
"grad_norm": 1.09375,
"learning_rate": 1.2156923076923077e-05,
"loss": 0.2918,
"step": 2550
},
{
"epoch": 0.3958788394255118,
"grad_norm": 0.84375,
"learning_rate": 1.2126153846153848e-05,
"loss": 0.2146,
"step": 2560
},
{
"epoch": 0.39742524114201766,
"grad_norm": 0.875,
"learning_rate": 1.2095384615384616e-05,
"loss": 0.3178,
"step": 2570
},
{
"epoch": 0.39897164285852355,
"grad_norm": 0.78515625,
"learning_rate": 1.2064615384615385e-05,
"loss": 0.2247,
"step": 2580
},
{
"epoch": 0.4005180445750295,
"grad_norm": 0.93359375,
"learning_rate": 1.2033846153846154e-05,
"loss": 0.2684,
"step": 2590
},
{
"epoch": 0.40206444629153537,
"grad_norm": 0.99609375,
"learning_rate": 1.2003076923076924e-05,
"loss": 0.2332,
"step": 2600
},
{
"epoch": 0.4036108480080413,
"grad_norm": 0.86328125,
"learning_rate": 1.1972307692307695e-05,
"loss": 0.4153,
"step": 2610
},
{
"epoch": 0.4051572497245472,
"grad_norm": 0.96484375,
"learning_rate": 1.1941538461538463e-05,
"loss": 0.2559,
"step": 2620
},
{
"epoch": 0.4067036514410531,
"grad_norm": 1.53125,
"learning_rate": 1.1910769230769232e-05,
"loss": 0.2974,
"step": 2630
},
{
"epoch": 0.408250053157559,
"grad_norm": 1.0078125,
"learning_rate": 1.188e-05,
"loss": 0.3523,
"step": 2640
},
{
"epoch": 0.4097964548740649,
"grad_norm": 1.1171875,
"learning_rate": 1.1849230769230771e-05,
"loss": 0.2619,
"step": 2650
},
{
"epoch": 0.4113428565905708,
"grad_norm": 1.109375,
"learning_rate": 1.1818461538461539e-05,
"loss": 0.2833,
"step": 2660
},
{
"epoch": 0.41288925830707673,
"grad_norm": 0.96875,
"learning_rate": 1.178769230769231e-05,
"loss": 0.244,
"step": 2670
},
{
"epoch": 0.4144356600235826,
"grad_norm": 0.8046875,
"learning_rate": 1.1756923076923077e-05,
"loss": 0.2238,
"step": 2680
},
{
"epoch": 0.4159820617400885,
"grad_norm": 0.87890625,
"learning_rate": 1.1726153846153847e-05,
"loss": 0.2839,
"step": 2690
},
{
"epoch": 0.41752846345659445,
"grad_norm": 1.46875,
"learning_rate": 1.1695384615384618e-05,
"loss": 0.3264,
"step": 2700
},
{
"epoch": 0.41907486517310033,
"grad_norm": 0.953125,
"learning_rate": 1.1664615384615386e-05,
"loss": 0.2501,
"step": 2710
},
{
"epoch": 0.42062126688960627,
"grad_norm": 0.94140625,
"learning_rate": 1.1633846153846155e-05,
"loss": 0.3618,
"step": 2720
},
{
"epoch": 0.42216766860611216,
"grad_norm": 1.015625,
"learning_rate": 1.1603076923076924e-05,
"loss": 0.2353,
"step": 2730
},
{
"epoch": 0.42371407032261804,
"grad_norm": 0.79296875,
"learning_rate": 1.1572307692307694e-05,
"loss": 0.2745,
"step": 2740
},
{
"epoch": 0.425260472039124,
"grad_norm": 1.1171875,
"learning_rate": 1.1541538461538461e-05,
"loss": 0.2673,
"step": 2750
},
{
"epoch": 0.42680687375562987,
"grad_norm": 1.1796875,
"learning_rate": 1.1510769230769232e-05,
"loss": 0.2448,
"step": 2760
},
{
"epoch": 0.42835327547213575,
"grad_norm": 0.82421875,
"learning_rate": 1.148e-05,
"loss": 0.2428,
"step": 2770
},
{
"epoch": 0.4298996771886417,
"grad_norm": 0.97265625,
"learning_rate": 1.144923076923077e-05,
"loss": 0.2259,
"step": 2780
},
{
"epoch": 0.4314460789051476,
"grad_norm": 1.0625,
"learning_rate": 1.141846153846154e-05,
"loss": 0.3214,
"step": 2790
},
{
"epoch": 0.43299248062165346,
"grad_norm": 0.90625,
"learning_rate": 1.1387692307692308e-05,
"loss": 0.272,
"step": 2800
},
{
"epoch": 0.4345388823381594,
"grad_norm": 0.7890625,
"learning_rate": 1.135692307692308e-05,
"loss": 0.3055,
"step": 2810
},
{
"epoch": 0.4360852840546653,
"grad_norm": 1.3359375,
"learning_rate": 1.1326153846153847e-05,
"loss": 0.3783,
"step": 2820
},
{
"epoch": 0.43763168577117123,
"grad_norm": 0.83203125,
"learning_rate": 1.1295384615384617e-05,
"loss": 0.2318,
"step": 2830
},
{
"epoch": 0.4391780874876771,
"grad_norm": 1.0,
"learning_rate": 1.1264615384615384e-05,
"loss": 0.3072,
"step": 2840
},
{
"epoch": 0.440724489204183,
"grad_norm": 1.453125,
"learning_rate": 1.1233846153846155e-05,
"loss": 0.3272,
"step": 2850
},
{
"epoch": 0.44227089092068894,
"grad_norm": 1.015625,
"learning_rate": 1.1203076923076923e-05,
"loss": 0.2422,
"step": 2860
},
{
"epoch": 0.4438172926371948,
"grad_norm": 0.83984375,
"learning_rate": 1.1172307692307694e-05,
"loss": 0.2424,
"step": 2870
},
{
"epoch": 0.4453636943537007,
"grad_norm": 1.0703125,
"learning_rate": 1.1141538461538462e-05,
"loss": 0.288,
"step": 2880
},
{
"epoch": 0.44691009607020665,
"grad_norm": 0.90234375,
"learning_rate": 1.1110769230769231e-05,
"loss": 0.2376,
"step": 2890
},
{
"epoch": 0.44845649778671254,
"grad_norm": 0.80859375,
"learning_rate": 1.1080000000000002e-05,
"loss": 0.2541,
"step": 2900
},
{
"epoch": 0.4500028995032184,
"grad_norm": 0.80078125,
"learning_rate": 1.104923076923077e-05,
"loss": 0.2928,
"step": 2910
},
{
"epoch": 0.45154930121972436,
"grad_norm": 0.83984375,
"learning_rate": 1.101846153846154e-05,
"loss": 0.2582,
"step": 2920
},
{
"epoch": 0.45309570293623025,
"grad_norm": 0.9140625,
"learning_rate": 1.0987692307692309e-05,
"loss": 0.2548,
"step": 2930
},
{
"epoch": 0.4546421046527362,
"grad_norm": 1.421875,
"learning_rate": 1.0956923076923078e-05,
"loss": 0.3462,
"step": 2940
},
{
"epoch": 0.4561885063692421,
"grad_norm": 1.0,
"learning_rate": 1.0926153846153846e-05,
"loss": 0.3076,
"step": 2950
},
{
"epoch": 0.45773490808574796,
"grad_norm": 1.0546875,
"learning_rate": 1.0895384615384617e-05,
"loss": 0.2761,
"step": 2960
},
{
"epoch": 0.4592813098022539,
"grad_norm": 0.859375,
"learning_rate": 1.0864615384615385e-05,
"loss": 0.3359,
"step": 2970
},
{
"epoch": 0.4608277115187598,
"grad_norm": 0.86328125,
"learning_rate": 1.0833846153846154e-05,
"loss": 0.3213,
"step": 2980
},
{
"epoch": 0.46237411323526567,
"grad_norm": 0.85546875,
"learning_rate": 1.0803076923076925e-05,
"loss": 0.2917,
"step": 2990
},
{
"epoch": 0.4639205149517716,
"grad_norm": 0.953125,
"learning_rate": 1.0772307692307693e-05,
"loss": 0.2774,
"step": 3000
},
{
"epoch": 0.4654669166682775,
"grad_norm": 0.7734375,
"learning_rate": 1.0741538461538464e-05,
"loss": 0.3373,
"step": 3010
},
{
"epoch": 0.4670133183847834,
"grad_norm": 0.57421875,
"learning_rate": 1.0710769230769232e-05,
"loss": 0.248,
"step": 3020
},
{
"epoch": 0.4685597201012893,
"grad_norm": 1.0078125,
"learning_rate": 1.0680000000000001e-05,
"loss": 0.2782,
"step": 3030
},
{
"epoch": 0.4701061218177952,
"grad_norm": 0.80859375,
"learning_rate": 1.0649230769230769e-05,
"loss": 0.3041,
"step": 3040
},
{
"epoch": 0.47165252353430115,
"grad_norm": 0.8359375,
"learning_rate": 1.061846153846154e-05,
"loss": 0.2109,
"step": 3050
},
{
"epoch": 0.47319892525080703,
"grad_norm": 0.99609375,
"learning_rate": 1.0587692307692308e-05,
"loss": 0.2815,
"step": 3060
},
{
"epoch": 0.4747453269673129,
"grad_norm": 1.1875,
"learning_rate": 1.0556923076923079e-05,
"loss": 0.2775,
"step": 3070
},
{
"epoch": 0.47629172868381886,
"grad_norm": 1.15625,
"learning_rate": 1.0526153846153846e-05,
"loss": 0.2645,
"step": 3080
},
{
"epoch": 0.47783813040032475,
"grad_norm": 0.91796875,
"learning_rate": 1.0495384615384616e-05,
"loss": 0.2738,
"step": 3090
},
{
"epoch": 0.47938453211683063,
"grad_norm": 0.73828125,
"learning_rate": 1.0464615384615387e-05,
"loss": 0.2912,
"step": 3100
},
{
"epoch": 0.48093093383333657,
"grad_norm": 1.046875,
"learning_rate": 1.0433846153846155e-05,
"loss": 0.217,
"step": 3110
},
{
"epoch": 0.48247733554984246,
"grad_norm": 0.96875,
"learning_rate": 1.0403076923076924e-05,
"loss": 0.3397,
"step": 3120
},
{
"epoch": 0.48402373726634834,
"grad_norm": 0.80859375,
"learning_rate": 1.0372307692307693e-05,
"loss": 0.215,
"step": 3130
},
{
"epoch": 0.4855701389828543,
"grad_norm": 1.125,
"learning_rate": 1.0341538461538463e-05,
"loss": 0.2587,
"step": 3140
},
{
"epoch": 0.48711654069936017,
"grad_norm": 0.9765625,
"learning_rate": 1.031076923076923e-05,
"loss": 0.2183,
"step": 3150
},
{
"epoch": 0.4886629424158661,
"grad_norm": 1.140625,
"learning_rate": 1.0280000000000002e-05,
"loss": 0.2551,
"step": 3160
},
{
"epoch": 0.490209344132372,
"grad_norm": 0.859375,
"learning_rate": 1.024923076923077e-05,
"loss": 0.2389,
"step": 3170
},
{
"epoch": 0.4917557458488779,
"grad_norm": 0.859375,
"learning_rate": 1.0218461538461539e-05,
"loss": 0.2774,
"step": 3180
},
{
"epoch": 0.4933021475653838,
"grad_norm": 0.8828125,
"learning_rate": 1.018769230769231e-05,
"loss": 0.2608,
"step": 3190
},
{
"epoch": 0.4948485492818897,
"grad_norm": 1.0078125,
"learning_rate": 1.0156923076923077e-05,
"loss": 0.3287,
"step": 3200
},
{
"epoch": 0.4963949509983956,
"grad_norm": 0.80859375,
"learning_rate": 1.0126153846153849e-05,
"loss": 0.236,
"step": 3210
},
{
"epoch": 0.49794135271490153,
"grad_norm": 1.0,
"learning_rate": 1.0095384615384616e-05,
"loss": 0.259,
"step": 3220
},
{
"epoch": 0.4994877544314074,
"grad_norm": 0.6796875,
"learning_rate": 1.0064615384615386e-05,
"loss": 0.2668,
"step": 3230
},
{
"epoch": 0.5010341561479134,
"grad_norm": 0.86328125,
"learning_rate": 1.0033846153846153e-05,
"loss": 0.3078,
"step": 3240
},
{
"epoch": 0.5025805578644192,
"grad_norm": 1.1328125,
"learning_rate": 1.0003076923076924e-05,
"loss": 0.2674,
"step": 3250
},
{
"epoch": 0.5041269595809251,
"grad_norm": 1.03125,
"learning_rate": 9.972307692307694e-06,
"loss": 0.276,
"step": 3260
},
{
"epoch": 0.505673361297431,
"grad_norm": 0.58203125,
"learning_rate": 9.941538461538463e-06,
"loss": 0.2331,
"step": 3270
},
{
"epoch": 0.5072197630139369,
"grad_norm": 0.7734375,
"learning_rate": 9.910769230769231e-06,
"loss": 0.2518,
"step": 3280
},
{
"epoch": 0.5087661647304429,
"grad_norm": 0.79296875,
"learning_rate": 9.88e-06,
"loss": 0.3217,
"step": 3290
},
{
"epoch": 0.5103125664469488,
"grad_norm": 0.78125,
"learning_rate": 9.84923076923077e-06,
"loss": 0.2582,
"step": 3300
},
{
"epoch": 0.5118589681634547,
"grad_norm": 0.921875,
"learning_rate": 9.818461538461539e-06,
"loss": 0.2967,
"step": 3310
},
{
"epoch": 0.5134053698799605,
"grad_norm": 0.921875,
"learning_rate": 9.787692307692308e-06,
"loss": 0.2508,
"step": 3320
},
{
"epoch": 0.5149517715964664,
"grad_norm": 1.3203125,
"learning_rate": 9.756923076923078e-06,
"loss": 0.247,
"step": 3330
},
{
"epoch": 0.5164981733129724,
"grad_norm": 0.94140625,
"learning_rate": 9.726153846153847e-06,
"loss": 0.2664,
"step": 3340
},
{
"epoch": 0.5180445750294783,
"grad_norm": 0.96875,
"learning_rate": 9.695384615384617e-06,
"loss": 0.2963,
"step": 3350
},
{
"epoch": 0.5195909767459842,
"grad_norm": 1.2109375,
"learning_rate": 9.664615384615386e-06,
"loss": 0.2869,
"step": 3360
},
{
"epoch": 0.5211373784624901,
"grad_norm": 0.796875,
"learning_rate": 9.633846153846155e-06,
"loss": 0.2125,
"step": 3370
},
{
"epoch": 0.522683780178996,
"grad_norm": 0.83203125,
"learning_rate": 9.603076923076923e-06,
"loss": 0.2379,
"step": 3380
},
{
"epoch": 0.5242301818955019,
"grad_norm": 1.2578125,
"learning_rate": 9.572307692307693e-06,
"loss": 0.2897,
"step": 3390
},
{
"epoch": 0.5257765836120079,
"grad_norm": 0.94140625,
"learning_rate": 9.541538461538462e-06,
"loss": 0.2954,
"step": 3400
},
{
"epoch": 0.5273229853285137,
"grad_norm": 0.75390625,
"learning_rate": 9.510769230769231e-06,
"loss": 0.257,
"step": 3410
},
{
"epoch": 0.5288693870450196,
"grad_norm": 0.77734375,
"learning_rate": 9.48e-06,
"loss": 0.3107,
"step": 3420
},
{
"epoch": 0.5304157887615255,
"grad_norm": 0.7890625,
"learning_rate": 9.44923076923077e-06,
"loss": 0.2178,
"step": 3430
},
{
"epoch": 0.5319621904780314,
"grad_norm": 1.0625,
"learning_rate": 9.41846153846154e-06,
"loss": 0.2407,
"step": 3440
},
{
"epoch": 0.5335085921945374,
"grad_norm": 1.2421875,
"learning_rate": 9.387692307692309e-06,
"loss": 0.284,
"step": 3450
},
{
"epoch": 0.5350549939110433,
"grad_norm": 0.8984375,
"learning_rate": 9.356923076923078e-06,
"loss": 0.2358,
"step": 3460
},
{
"epoch": 0.5366013956275492,
"grad_norm": 1.1328125,
"learning_rate": 9.326153846153848e-06,
"loss": 0.2455,
"step": 3470
},
{
"epoch": 0.538147797344055,
"grad_norm": 1.171875,
"learning_rate": 9.295384615384615e-06,
"loss": 0.3416,
"step": 3480
},
{
"epoch": 0.5396941990605609,
"grad_norm": 1.171875,
"learning_rate": 9.264615384615385e-06,
"loss": 0.2908,
"step": 3490
},
{
"epoch": 0.5412406007770668,
"grad_norm": 0.83984375,
"learning_rate": 9.233846153846154e-06,
"loss": 0.2648,
"step": 3500
},
{
"epoch": 0.5427870024935728,
"grad_norm": 0.984375,
"learning_rate": 9.203076923076924e-06,
"loss": 0.2159,
"step": 3510
},
{
"epoch": 0.5443334042100787,
"grad_norm": 1.078125,
"learning_rate": 9.172307692307693e-06,
"loss": 0.3019,
"step": 3520
},
{
"epoch": 0.5458798059265846,
"grad_norm": 1.4375,
"learning_rate": 9.141538461538462e-06,
"loss": 0.2886,
"step": 3530
},
{
"epoch": 0.5474262076430905,
"grad_norm": 1.1484375,
"learning_rate": 9.110769230769232e-06,
"loss": 0.2674,
"step": 3540
},
{
"epoch": 0.5489726093595964,
"grad_norm": 1.015625,
"learning_rate": 9.080000000000001e-06,
"loss": 0.2809,
"step": 3550
},
{
"epoch": 0.5505190110761023,
"grad_norm": 0.94140625,
"learning_rate": 9.04923076923077e-06,
"loss": 0.3105,
"step": 3560
},
{
"epoch": 0.5520654127926082,
"grad_norm": 0.859375,
"learning_rate": 9.01846153846154e-06,
"loss": 0.3115,
"step": 3570
},
{
"epoch": 0.5536118145091141,
"grad_norm": 0.90234375,
"learning_rate": 8.987692307692308e-06,
"loss": 0.2605,
"step": 3580
},
{
"epoch": 0.55515821622562,
"grad_norm": 0.703125,
"learning_rate": 8.956923076923077e-06,
"loss": 0.2281,
"step": 3590
},
{
"epoch": 0.5567046179421259,
"grad_norm": 0.953125,
"learning_rate": 8.926153846153846e-06,
"loss": 0.2732,
"step": 3600
},
{
"epoch": 0.5582510196586318,
"grad_norm": 1.0078125,
"learning_rate": 8.895384615384616e-06,
"loss": 0.2134,
"step": 3610
},
{
"epoch": 0.5597974213751378,
"grad_norm": 1.140625,
"learning_rate": 8.864615384615385e-06,
"loss": 0.2788,
"step": 3620
},
{
"epoch": 0.5613438230916437,
"grad_norm": 0.8984375,
"learning_rate": 8.833846153846155e-06,
"loss": 0.2558,
"step": 3630
},
{
"epoch": 0.5628902248081495,
"grad_norm": 0.7265625,
"learning_rate": 8.803076923076924e-06,
"loss": 0.2719,
"step": 3640
},
{
"epoch": 0.5644366265246554,
"grad_norm": 1.0703125,
"learning_rate": 8.772307692307693e-06,
"loss": 0.2596,
"step": 3650
},
{
"epoch": 0.5659830282411613,
"grad_norm": 0.8828125,
"learning_rate": 8.741538461538463e-06,
"loss": 0.2484,
"step": 3660
},
{
"epoch": 0.5675294299576673,
"grad_norm": 0.734375,
"learning_rate": 8.710769230769232e-06,
"loss": 0.2734,
"step": 3670
},
{
"epoch": 0.5690758316741732,
"grad_norm": 0.91796875,
"learning_rate": 8.68e-06,
"loss": 0.2728,
"step": 3680
},
{
"epoch": 0.5706222333906791,
"grad_norm": 0.9765625,
"learning_rate": 8.64923076923077e-06,
"loss": 0.2746,
"step": 3690
},
{
"epoch": 0.572168635107185,
"grad_norm": 0.94140625,
"learning_rate": 8.618461538461539e-06,
"loss": 0.2767,
"step": 3700
},
{
"epoch": 0.5737150368236908,
"grad_norm": 0.80859375,
"learning_rate": 8.587692307692308e-06,
"loss": 0.2798,
"step": 3710
},
{
"epoch": 0.5752614385401967,
"grad_norm": 1.0703125,
"learning_rate": 8.556923076923077e-06,
"loss": 0.2573,
"step": 3720
},
{
"epoch": 0.5768078402567027,
"grad_norm": 1.1015625,
"learning_rate": 8.526153846153847e-06,
"loss": 0.2756,
"step": 3730
},
{
"epoch": 0.5783542419732086,
"grad_norm": 1.03125,
"learning_rate": 8.495384615384616e-06,
"loss": 0.2819,
"step": 3740
},
{
"epoch": 0.5799006436897145,
"grad_norm": 0.8046875,
"learning_rate": 8.464615384615386e-06,
"loss": 0.22,
"step": 3750
},
{
"epoch": 0.5814470454062204,
"grad_norm": 1.0078125,
"learning_rate": 8.433846153846155e-06,
"loss": 0.2857,
"step": 3760
},
{
"epoch": 0.5829934471227263,
"grad_norm": 1.0703125,
"learning_rate": 8.403076923076924e-06,
"loss": 0.2803,
"step": 3770
},
{
"epoch": 0.5845398488392322,
"grad_norm": 1.0625,
"learning_rate": 8.372307692307692e-06,
"loss": 0.2207,
"step": 3780
},
{
"epoch": 0.5860862505557382,
"grad_norm": 1.375,
"learning_rate": 8.341538461538462e-06,
"loss": 0.2684,
"step": 3790
},
{
"epoch": 0.587632652272244,
"grad_norm": 0.7578125,
"learning_rate": 8.310769230769231e-06,
"loss": 0.2353,
"step": 3800
},
{
"epoch": 0.5891790539887499,
"grad_norm": 0.6484375,
"learning_rate": 8.28e-06,
"loss": 0.3244,
"step": 3810
},
{
"epoch": 0.5907254557052558,
"grad_norm": 1.15625,
"learning_rate": 8.24923076923077e-06,
"loss": 0.288,
"step": 3820
},
{
"epoch": 0.5922718574217617,
"grad_norm": 0.91015625,
"learning_rate": 8.218461538461539e-06,
"loss": 0.261,
"step": 3830
},
{
"epoch": 0.5938182591382677,
"grad_norm": 0.78125,
"learning_rate": 8.187692307692309e-06,
"loss": 0.3277,
"step": 3840
},
{
"epoch": 0.5953646608547736,
"grad_norm": 0.87890625,
"learning_rate": 8.156923076923078e-06,
"loss": 0.2727,
"step": 3850
},
{
"epoch": 0.5969110625712795,
"grad_norm": 0.80078125,
"learning_rate": 8.126153846153847e-06,
"loss": 0.2319,
"step": 3860
},
{
"epoch": 0.5984574642877853,
"grad_norm": 0.97265625,
"learning_rate": 8.095384615384617e-06,
"loss": 0.252,
"step": 3870
},
{
"epoch": 0.6000038660042912,
"grad_norm": 0.96875,
"learning_rate": 8.064615384615384e-06,
"loss": 0.2683,
"step": 3880
},
{
"epoch": 0.6015502677207971,
"grad_norm": 0.89453125,
"learning_rate": 8.033846153846154e-06,
"loss": 0.3251,
"step": 3890
},
{
"epoch": 0.6030966694373031,
"grad_norm": 1.015625,
"learning_rate": 8.003076923076923e-06,
"loss": 0.3153,
"step": 3900
},
{
"epoch": 0.604643071153809,
"grad_norm": 0.9296875,
"learning_rate": 7.972307692307693e-06,
"loss": 0.3234,
"step": 3910
},
{
"epoch": 0.6061894728703149,
"grad_norm": 0.88671875,
"learning_rate": 7.941538461538462e-06,
"loss": 0.2812,
"step": 3920
},
{
"epoch": 0.6077358745868208,
"grad_norm": 0.890625,
"learning_rate": 7.910769230769231e-06,
"loss": 0.2959,
"step": 3930
},
{
"epoch": 0.6092822763033267,
"grad_norm": 0.98046875,
"learning_rate": 7.88e-06,
"loss": 0.2818,
"step": 3940
},
{
"epoch": 0.6108286780198326,
"grad_norm": 1.15625,
"learning_rate": 7.84923076923077e-06,
"loss": 0.3152,
"step": 3950
},
{
"epoch": 0.6123750797363385,
"grad_norm": 0.9375,
"learning_rate": 7.81846153846154e-06,
"loss": 0.2988,
"step": 3960
},
{
"epoch": 0.6139214814528444,
"grad_norm": 0.984375,
"learning_rate": 7.787692307692309e-06,
"loss": 0.2835,
"step": 3970
},
{
"epoch": 0.6154678831693503,
"grad_norm": 1.359375,
"learning_rate": 7.756923076923077e-06,
"loss": 0.3486,
"step": 3980
},
{
"epoch": 0.6170142848858562,
"grad_norm": 1.3828125,
"learning_rate": 7.726153846153846e-06,
"loss": 0.2934,
"step": 3990
},
{
"epoch": 0.6185606866023621,
"grad_norm": 0.92578125,
"learning_rate": 7.695384615384615e-06,
"loss": 0.2678,
"step": 4000
},
{
"epoch": 0.6201070883188681,
"grad_norm": 0.90625,
"learning_rate": 7.664615384615385e-06,
"loss": 0.2608,
"step": 4010
},
{
"epoch": 0.621653490035374,
"grad_norm": 1.0703125,
"learning_rate": 7.633846153846154e-06,
"loss": 0.289,
"step": 4020
},
{
"epoch": 0.6231998917518798,
"grad_norm": 1.421875,
"learning_rate": 7.6030769230769245e-06,
"loss": 0.2856,
"step": 4030
},
{
"epoch": 0.6247462934683857,
"grad_norm": 0.67578125,
"learning_rate": 7.572307692307693e-06,
"loss": 0.2569,
"step": 4040
},
{
"epoch": 0.6262926951848916,
"grad_norm": 0.96484375,
"learning_rate": 7.5415384615384624e-06,
"loss": 0.2727,
"step": 4050
},
{
"epoch": 0.6278390969013976,
"grad_norm": 0.75390625,
"learning_rate": 7.510769230769232e-06,
"loss": 0.279,
"step": 4060
},
{
"epoch": 0.6293854986179035,
"grad_norm": 0.9453125,
"learning_rate": 7.48e-06,
"loss": 0.3606,
"step": 4070
},
{
"epoch": 0.6309319003344094,
"grad_norm": 0.7109375,
"learning_rate": 7.44923076923077e-06,
"loss": 0.2959,
"step": 4080
},
{
"epoch": 0.6324783020509153,
"grad_norm": 1.03125,
"learning_rate": 7.418461538461539e-06,
"loss": 0.2622,
"step": 4090
},
{
"epoch": 0.6340247037674211,
"grad_norm": 0.8046875,
"learning_rate": 7.387692307692308e-06,
"loss": 0.2207,
"step": 4100
},
{
"epoch": 0.635571105483927,
"grad_norm": 1.0234375,
"learning_rate": 7.356923076923077e-06,
"loss": 0.3007,
"step": 4110
},
{
"epoch": 0.637117507200433,
"grad_norm": 1.09375,
"learning_rate": 7.326153846153847e-06,
"loss": 0.2815,
"step": 4120
},
{
"epoch": 0.6386639089169389,
"grad_norm": 0.9453125,
"learning_rate": 7.295384615384617e-06,
"loss": 0.2587,
"step": 4130
},
{
"epoch": 0.6402103106334448,
"grad_norm": 0.9453125,
"learning_rate": 7.264615384615385e-06,
"loss": 0.2999,
"step": 4140
},
{
"epoch": 0.6417567123499507,
"grad_norm": 0.90234375,
"learning_rate": 7.233846153846155e-06,
"loss": 0.2398,
"step": 4150
},
{
"epoch": 0.6433031140664566,
"grad_norm": 0.98046875,
"learning_rate": 7.203076923076924e-06,
"loss": 0.2716,
"step": 4160
},
{
"epoch": 0.6448495157829626,
"grad_norm": 1.0234375,
"learning_rate": 7.172307692307693e-06,
"loss": 0.2222,
"step": 4170
},
{
"epoch": 0.6463959174994685,
"grad_norm": 0.73828125,
"learning_rate": 7.141538461538462e-06,
"loss": 0.285,
"step": 4180
},
{
"epoch": 0.6479423192159743,
"grad_norm": 0.74609375,
"learning_rate": 7.1107692307692314e-06,
"loss": 0.3562,
"step": 4190
},
{
"epoch": 0.6494887209324802,
"grad_norm": 0.9921875,
"learning_rate": 7.08e-06,
"loss": 0.283,
"step": 4200
},
{
"epoch": 0.6510351226489861,
"grad_norm": 0.85546875,
"learning_rate": 7.049230769230769e-06,
"loss": 0.2915,
"step": 4210
},
{
"epoch": 0.652581524365492,
"grad_norm": 0.765625,
"learning_rate": 7.01846153846154e-06,
"loss": 0.2424,
"step": 4220
},
{
"epoch": 0.654127926081998,
"grad_norm": 0.81640625,
"learning_rate": 6.987692307692309e-06,
"loss": 0.2456,
"step": 4230
},
{
"epoch": 0.6556743277985039,
"grad_norm": 0.97265625,
"learning_rate": 6.9569230769230776e-06,
"loss": 0.2946,
"step": 4240
},
{
"epoch": 0.6572207295150098,
"grad_norm": 0.77734375,
"learning_rate": 6.926153846153847e-06,
"loss": 0.3338,
"step": 4250
},
{
"epoch": 0.6587671312315156,
"grad_norm": 1.2265625,
"learning_rate": 6.895384615384616e-06,
"loss": 0.2645,
"step": 4260
},
{
"epoch": 0.6603135329480215,
"grad_norm": 0.96484375,
"learning_rate": 6.864615384615385e-06,
"loss": 0.2671,
"step": 4270
},
{
"epoch": 0.6618599346645275,
"grad_norm": 1.09375,
"learning_rate": 6.833846153846154e-06,
"loss": 0.2627,
"step": 4280
},
{
"epoch": 0.6634063363810334,
"grad_norm": 1.2890625,
"learning_rate": 6.803076923076924e-06,
"loss": 0.2972,
"step": 4290
},
{
"epoch": 0.6649527380975393,
"grad_norm": 0.8359375,
"learning_rate": 6.772307692307692e-06,
"loss": 0.2637,
"step": 4300
},
{
"epoch": 0.6664991398140452,
"grad_norm": 0.87109375,
"learning_rate": 6.741538461538462e-06,
"loss": 0.2459,
"step": 4310
},
{
"epoch": 0.6680455415305511,
"grad_norm": 0.93359375,
"learning_rate": 6.710769230769232e-06,
"loss": 0.3008,
"step": 4320
},
{
"epoch": 0.669591943247057,
"grad_norm": 1.109375,
"learning_rate": 6.680000000000001e-06,
"loss": 0.2881,
"step": 4330
},
{
"epoch": 0.671138344963563,
"grad_norm": 0.67578125,
"learning_rate": 6.64923076923077e-06,
"loss": 0.2887,
"step": 4340
},
{
"epoch": 0.6726847466800688,
"grad_norm": 0.875,
"learning_rate": 6.618461538461539e-06,
"loss": 0.3097,
"step": 4350
},
{
"epoch": 0.6742311483965747,
"grad_norm": 0.8671875,
"learning_rate": 6.587692307692309e-06,
"loss": 0.2623,
"step": 4360
},
{
"epoch": 0.6757775501130806,
"grad_norm": 0.8984375,
"learning_rate": 6.556923076923077e-06,
"loss": 0.2589,
"step": 4370
},
{
"epoch": 0.6773239518295865,
"grad_norm": 0.77734375,
"learning_rate": 6.5261538461538465e-06,
"loss": 0.2149,
"step": 4380
},
{
"epoch": 0.6788703535460925,
"grad_norm": 0.9453125,
"learning_rate": 6.495384615384616e-06,
"loss": 0.22,
"step": 4390
},
{
"epoch": 0.6804167552625984,
"grad_norm": 0.98828125,
"learning_rate": 6.4646153846153845e-06,
"loss": 0.2636,
"step": 4400
},
{
"epoch": 0.6819631569791043,
"grad_norm": 1.0234375,
"learning_rate": 6.433846153846154e-06,
"loss": 0.3088,
"step": 4410
},
{
"epoch": 0.6835095586956101,
"grad_norm": 0.92578125,
"learning_rate": 6.403076923076924e-06,
"loss": 0.2378,
"step": 4420
},
{
"epoch": 0.685055960412116,
"grad_norm": 0.91796875,
"learning_rate": 6.3723076923076935e-06,
"loss": 0.2405,
"step": 4430
},
{
"epoch": 0.6866023621286219,
"grad_norm": 0.83203125,
"learning_rate": 6.341538461538462e-06,
"loss": 0.2825,
"step": 4440
},
{
"epoch": 0.6881487638451279,
"grad_norm": 0.9296875,
"learning_rate": 6.3107692307692315e-06,
"loss": 0.3513,
"step": 4450
},
{
"epoch": 0.6896951655616338,
"grad_norm": 1.078125,
"learning_rate": 6.280000000000001e-06,
"loss": 0.3271,
"step": 4460
},
{
"epoch": 0.6912415672781397,
"grad_norm": 0.82421875,
"learning_rate": 6.249230769230769e-06,
"loss": 0.2742,
"step": 4470
},
{
"epoch": 0.6927879689946456,
"grad_norm": 0.9921875,
"learning_rate": 6.218461538461539e-06,
"loss": 0.2642,
"step": 4480
},
{
"epoch": 0.6943343707111514,
"grad_norm": 0.96875,
"learning_rate": 6.187692307692308e-06,
"loss": 0.2364,
"step": 4490
},
{
"epoch": 0.6958807724276574,
"grad_norm": 1.1875,
"learning_rate": 6.156923076923077e-06,
"loss": 0.2883,
"step": 4500
},
{
"epoch": 0.6974271741441633,
"grad_norm": 1.03125,
"learning_rate": 6.126153846153846e-06,
"loss": 0.2219,
"step": 4510
},
{
"epoch": 0.6989735758606692,
"grad_norm": 0.9453125,
"learning_rate": 6.095384615384616e-06,
"loss": 0.2639,
"step": 4520
},
{
"epoch": 0.7005199775771751,
"grad_norm": 0.80859375,
"learning_rate": 6.064615384615386e-06,
"loss": 0.2764,
"step": 4530
},
{
"epoch": 0.702066379293681,
"grad_norm": 1.0390625,
"learning_rate": 6.033846153846154e-06,
"loss": 0.3097,
"step": 4540
},
{
"epoch": 0.7036127810101869,
"grad_norm": 0.890625,
"learning_rate": 6.003076923076924e-06,
"loss": 0.2262,
"step": 4550
},
{
"epoch": 0.7051591827266929,
"grad_norm": 0.82421875,
"learning_rate": 5.972307692307693e-06,
"loss": 0.2537,
"step": 4560
},
{
"epoch": 0.7067055844431988,
"grad_norm": 0.7734375,
"learning_rate": 5.941538461538462e-06,
"loss": 0.347,
"step": 4570
},
{
"epoch": 0.7082519861597046,
"grad_norm": 1.4140625,
"learning_rate": 5.910769230769231e-06,
"loss": 0.3193,
"step": 4580
},
{
"epoch": 0.7097983878762105,
"grad_norm": 0.75,
"learning_rate": 5.8800000000000005e-06,
"loss": 0.273,
"step": 4590
},
{
"epoch": 0.7113447895927164,
"grad_norm": 0.921875,
"learning_rate": 5.849230769230769e-06,
"loss": 0.2902,
"step": 4600
},
{
"epoch": 0.7128911913092224,
"grad_norm": 0.8046875,
"learning_rate": 5.818461538461538e-06,
"loss": 0.3653,
"step": 4610
},
{
"epoch": 0.7144375930257283,
"grad_norm": 0.76171875,
"learning_rate": 5.787692307692309e-06,
"loss": 0.3106,
"step": 4620
},
{
"epoch": 0.7159839947422342,
"grad_norm": 0.953125,
"learning_rate": 5.756923076923078e-06,
"loss": 0.2368,
"step": 4630
},
{
"epoch": 0.7175303964587401,
"grad_norm": 0.8203125,
"learning_rate": 5.726153846153847e-06,
"loss": 0.249,
"step": 4640
},
{
"epoch": 0.719076798175246,
"grad_norm": 1.1328125,
"learning_rate": 5.695384615384616e-06,
"loss": 0.3709,
"step": 4650
},
{
"epoch": 0.7206231998917518,
"grad_norm": 0.88671875,
"learning_rate": 5.664615384615385e-06,
"loss": 0.2921,
"step": 4660
},
{
"epoch": 0.7221696016082578,
"grad_norm": 1.015625,
"learning_rate": 5.633846153846154e-06,
"loss": 0.3115,
"step": 4670
},
{
"epoch": 0.7237160033247637,
"grad_norm": 0.87890625,
"learning_rate": 5.603076923076923e-06,
"loss": 0.2479,
"step": 4680
},
{
"epoch": 0.7252624050412696,
"grad_norm": 1.046875,
"learning_rate": 5.572307692307693e-06,
"loss": 0.2297,
"step": 4690
},
{
"epoch": 0.7268088067577755,
"grad_norm": 0.89453125,
"learning_rate": 5.541538461538461e-06,
"loss": 0.2454,
"step": 4700
},
{
"epoch": 0.7283552084742814,
"grad_norm": 0.79296875,
"learning_rate": 5.5107692307692315e-06,
"loss": 0.2849,
"step": 4710
},
{
"epoch": 0.7299016101907874,
"grad_norm": 0.74609375,
"learning_rate": 5.480000000000001e-06,
"loss": 0.2797,
"step": 4720
},
{
"epoch": 0.7314480119072932,
"grad_norm": 1.0234375,
"learning_rate": 5.44923076923077e-06,
"loss": 0.3882,
"step": 4730
},
{
"epoch": 0.7329944136237991,
"grad_norm": 0.90234375,
"learning_rate": 5.418461538461539e-06,
"loss": 0.2509,
"step": 4740
},
{
"epoch": 0.734540815340305,
"grad_norm": 0.81640625,
"learning_rate": 5.387692307692308e-06,
"loss": 0.2408,
"step": 4750
},
{
"epoch": 0.7360872170568109,
"grad_norm": 0.8828125,
"learning_rate": 5.356923076923078e-06,
"loss": 0.2413,
"step": 4760
},
{
"epoch": 0.7376336187733168,
"grad_norm": 0.7890625,
"learning_rate": 5.326153846153846e-06,
"loss": 0.2432,
"step": 4770
},
{
"epoch": 0.7391800204898228,
"grad_norm": 0.9921875,
"learning_rate": 5.2953846153846156e-06,
"loss": 0.277,
"step": 4780
},
{
"epoch": 0.7407264222063287,
"grad_norm": 1.15625,
"learning_rate": 5.264615384615385e-06,
"loss": 0.2486,
"step": 4790
},
{
"epoch": 0.7422728239228346,
"grad_norm": 0.8203125,
"learning_rate": 5.2338461538461535e-06,
"loss": 0.3,
"step": 4800
},
{
"epoch": 0.7438192256393404,
"grad_norm": 0.87109375,
"learning_rate": 5.203076923076924e-06,
"loss": 0.26,
"step": 4810
},
{
"epoch": 0.7453656273558463,
"grad_norm": 1.328125,
"learning_rate": 5.172307692307693e-06,
"loss": 0.2937,
"step": 4820
},
{
"epoch": 0.7469120290723523,
"grad_norm": 0.9453125,
"learning_rate": 5.1415384615384625e-06,
"loss": 0.3057,
"step": 4830
},
{
"epoch": 0.7484584307888582,
"grad_norm": 0.8828125,
"learning_rate": 5.110769230769231e-06,
"loss": 0.3284,
"step": 4840
},
{
"epoch": 0.7500048325053641,
"grad_norm": 0.98046875,
"learning_rate": 5.0800000000000005e-06,
"loss": 0.2434,
"step": 4850
},
{
"epoch": 0.75155123422187,
"grad_norm": 0.97265625,
"learning_rate": 5.04923076923077e-06,
"loss": 0.257,
"step": 4860
},
{
"epoch": 0.7530976359383759,
"grad_norm": 0.91015625,
"learning_rate": 5.0184615384615384e-06,
"loss": 0.2714,
"step": 4870
},
{
"epoch": 0.7546440376548817,
"grad_norm": 0.609375,
"learning_rate": 4.987692307692308e-06,
"loss": 0.2182,
"step": 4880
},
{
"epoch": 0.7561904393713877,
"grad_norm": 0.85546875,
"learning_rate": 4.956923076923077e-06,
"loss": 0.2855,
"step": 4890
},
{
"epoch": 0.7577368410878936,
"grad_norm": 1.0703125,
"learning_rate": 4.926153846153847e-06,
"loss": 0.2774,
"step": 4900
},
{
"epoch": 0.7592832428043995,
"grad_norm": 0.86328125,
"learning_rate": 4.895384615384616e-06,
"loss": 0.2489,
"step": 4910
},
{
"epoch": 0.7608296445209054,
"grad_norm": 1.1328125,
"learning_rate": 4.8646153846153846e-06,
"loss": 0.3157,
"step": 4920
},
{
"epoch": 0.7623760462374113,
"grad_norm": 0.9609375,
"learning_rate": 4.833846153846154e-06,
"loss": 0.2704,
"step": 4930
},
{
"epoch": 0.7639224479539173,
"grad_norm": 0.82421875,
"learning_rate": 4.803076923076923e-06,
"loss": 0.2995,
"step": 4940
},
{
"epoch": 0.7654688496704232,
"grad_norm": 0.98046875,
"learning_rate": 4.772307692307693e-06,
"loss": 0.2422,
"step": 4950
},
{
"epoch": 0.767015251386929,
"grad_norm": 1.296875,
"learning_rate": 4.741538461538462e-06,
"loss": 0.2692,
"step": 4960
},
{
"epoch": 0.7685616531034349,
"grad_norm": 1.015625,
"learning_rate": 4.710769230769231e-06,
"loss": 0.2704,
"step": 4970
},
{
"epoch": 0.7701080548199408,
"grad_norm": 0.85546875,
"learning_rate": 4.680000000000001e-06,
"loss": 0.3147,
"step": 4980
},
{
"epoch": 0.7716544565364467,
"grad_norm": 0.9609375,
"learning_rate": 4.6492307692307695e-06,
"loss": 0.2867,
"step": 4990
},
{
"epoch": 0.7732008582529527,
"grad_norm": 1.09375,
"learning_rate": 4.618461538461539e-06,
"loss": 0.2896,
"step": 5000
},
{
"epoch": 0.7747472599694586,
"grad_norm": 1.0546875,
"learning_rate": 4.587692307692308e-06,
"loss": 0.2335,
"step": 5010
},
{
"epoch": 0.7762936616859645,
"grad_norm": 1.0390625,
"learning_rate": 4.556923076923077e-06,
"loss": 0.2441,
"step": 5020
},
{
"epoch": 0.7778400634024704,
"grad_norm": 1.4453125,
"learning_rate": 4.526153846153847e-06,
"loss": 0.3049,
"step": 5030
},
{
"epoch": 0.7793864651189762,
"grad_norm": 1.09375,
"learning_rate": 4.495384615384616e-06,
"loss": 0.2605,
"step": 5040
},
{
"epoch": 0.7809328668354822,
"grad_norm": 1.2265625,
"learning_rate": 4.464615384615385e-06,
"loss": 0.2876,
"step": 5050
},
{
"epoch": 0.7824792685519881,
"grad_norm": 1.09375,
"learning_rate": 4.433846153846154e-06,
"loss": 0.3434,
"step": 5060
},
{
"epoch": 0.784025670268494,
"grad_norm": 1.046875,
"learning_rate": 4.403076923076923e-06,
"loss": 0.2956,
"step": 5070
},
{
"epoch": 0.7855720719849999,
"grad_norm": 1.0078125,
"learning_rate": 4.372307692307693e-06,
"loss": 0.3175,
"step": 5080
},
{
"epoch": 0.7871184737015058,
"grad_norm": 1.1015625,
"learning_rate": 4.341538461538462e-06,
"loss": 0.2914,
"step": 5090
},
{
"epoch": 0.7886648754180117,
"grad_norm": 1.125,
"learning_rate": 4.310769230769231e-06,
"loss": 0.2657,
"step": 5100
},
{
"epoch": 0.7902112771345177,
"grad_norm": 0.9453125,
"learning_rate": 4.2800000000000005e-06,
"loss": 0.3168,
"step": 5110
},
{
"epoch": 0.7917576788510235,
"grad_norm": 1.0390625,
"learning_rate": 4.249230769230769e-06,
"loss": 0.2422,
"step": 5120
},
{
"epoch": 0.7933040805675294,
"grad_norm": 0.8125,
"learning_rate": 4.218461538461539e-06,
"loss": 0.2651,
"step": 5130
},
{
"epoch": 0.7948504822840353,
"grad_norm": 0.984375,
"learning_rate": 4.187692307692308e-06,
"loss": 0.245,
"step": 5140
},
{
"epoch": 0.7963968840005412,
"grad_norm": 0.82421875,
"learning_rate": 4.156923076923077e-06,
"loss": 0.3055,
"step": 5150
},
{
"epoch": 0.7979432857170471,
"grad_norm": 1.015625,
"learning_rate": 4.126153846153847e-06,
"loss": 0.2992,
"step": 5160
},
{
"epoch": 0.7994896874335531,
"grad_norm": 0.796875,
"learning_rate": 4.095384615384615e-06,
"loss": 0.3123,
"step": 5170
},
{
"epoch": 0.801036089150059,
"grad_norm": 1.1796875,
"learning_rate": 4.0646153846153854e-06,
"loss": 0.2849,
"step": 5180
},
{
"epoch": 0.8025824908665649,
"grad_norm": 0.84765625,
"learning_rate": 4.033846153846154e-06,
"loss": 0.317,
"step": 5190
},
{
"epoch": 0.8041288925830707,
"grad_norm": 0.88671875,
"learning_rate": 4.003076923076923e-06,
"loss": 0.2567,
"step": 5200
},
{
"epoch": 0.8056752942995766,
"grad_norm": 1.109375,
"learning_rate": 3.972307692307693e-06,
"loss": 0.2918,
"step": 5210
},
{
"epoch": 0.8072216960160826,
"grad_norm": 0.9765625,
"learning_rate": 3.941538461538461e-06,
"loss": 0.3973,
"step": 5220
},
{
"epoch": 0.8087680977325885,
"grad_norm": 1.1015625,
"learning_rate": 3.9107692307692316e-06,
"loss": 0.3034,
"step": 5230
},
{
"epoch": 0.8103144994490944,
"grad_norm": 1.0546875,
"learning_rate": 3.88e-06,
"loss": 0.2369,
"step": 5240
},
{
"epoch": 0.8118609011656003,
"grad_norm": 0.8125,
"learning_rate": 3.8492307692307695e-06,
"loss": 0.261,
"step": 5250
},
{
"epoch": 0.8134073028821062,
"grad_norm": 1.015625,
"learning_rate": 3.818461538461539e-06,
"loss": 0.2657,
"step": 5260
},
{
"epoch": 0.814953704598612,
"grad_norm": 0.91796875,
"learning_rate": 3.787692307692308e-06,
"loss": 0.2336,
"step": 5270
},
{
"epoch": 0.816500106315118,
"grad_norm": 0.83984375,
"learning_rate": 3.7569230769230773e-06,
"loss": 0.2683,
"step": 5280
},
{
"epoch": 0.8180465080316239,
"grad_norm": 1.09375,
"learning_rate": 3.7261538461538467e-06,
"loss": 0.2703,
"step": 5290
},
{
"epoch": 0.8195929097481298,
"grad_norm": 1.28125,
"learning_rate": 3.6953846153846156e-06,
"loss": 0.2907,
"step": 5300
},
{
"epoch": 0.8211393114646357,
"grad_norm": 0.95703125,
"learning_rate": 3.6646153846153846e-06,
"loss": 0.3177,
"step": 5310
},
{
"epoch": 0.8226857131811416,
"grad_norm": 0.796875,
"learning_rate": 3.633846153846154e-06,
"loss": 0.3023,
"step": 5320
},
{
"epoch": 0.8242321148976476,
"grad_norm": 0.8359375,
"learning_rate": 3.6030769230769234e-06,
"loss": 0.2169,
"step": 5330
},
{
"epoch": 0.8257785166141535,
"grad_norm": 0.7890625,
"learning_rate": 3.572307692307693e-06,
"loss": 0.24,
"step": 5340
},
{
"epoch": 0.8273249183306594,
"grad_norm": 1.0546875,
"learning_rate": 3.5415384615384618e-06,
"loss": 0.3394,
"step": 5350
},
{
"epoch": 0.8288713200471652,
"grad_norm": 1.1875,
"learning_rate": 3.5107692307692307e-06,
"loss": 0.2527,
"step": 5360
},
{
"epoch": 0.8304177217636711,
"grad_norm": 0.69921875,
"learning_rate": 3.48e-06,
"loss": 0.2447,
"step": 5370
},
{
"epoch": 0.831964123480177,
"grad_norm": 1.0703125,
"learning_rate": 3.4492307692307695e-06,
"loss": 0.2509,
"step": 5380
},
{
"epoch": 0.833510525196683,
"grad_norm": 0.80859375,
"learning_rate": 3.418461538461539e-06,
"loss": 0.3633,
"step": 5390
},
{
"epoch": 0.8350569269131889,
"grad_norm": 0.69921875,
"learning_rate": 3.387692307692308e-06,
"loss": 0.3206,
"step": 5400
},
{
"epoch": 0.8366033286296948,
"grad_norm": 1.5078125,
"learning_rate": 3.356923076923077e-06,
"loss": 0.3542,
"step": 5410
},
{
"epoch": 0.8381497303462007,
"grad_norm": 1.0234375,
"learning_rate": 3.3261538461538463e-06,
"loss": 0.2731,
"step": 5420
},
{
"epoch": 0.8396961320627065,
"grad_norm": 0.92578125,
"learning_rate": 3.2953846153846157e-06,
"loss": 0.3256,
"step": 5430
},
{
"epoch": 0.8412425337792125,
"grad_norm": 0.8359375,
"learning_rate": 3.264615384615385e-06,
"loss": 0.2471,
"step": 5440
},
{
"epoch": 0.8427889354957184,
"grad_norm": 0.9453125,
"learning_rate": 3.233846153846154e-06,
"loss": 0.2755,
"step": 5450
},
{
"epoch": 0.8443353372122243,
"grad_norm": 1.046875,
"learning_rate": 3.203076923076923e-06,
"loss": 0.3139,
"step": 5460
},
{
"epoch": 0.8458817389287302,
"grad_norm": 0.98046875,
"learning_rate": 3.1723076923076924e-06,
"loss": 0.2722,
"step": 5470
},
{
"epoch": 0.8474281406452361,
"grad_norm": 0.828125,
"learning_rate": 3.141538461538462e-06,
"loss": 0.3058,
"step": 5480
},
{
"epoch": 0.848974542361742,
"grad_norm": 1.03125,
"learning_rate": 3.110769230769231e-06,
"loss": 0.2424,
"step": 5490
},
{
"epoch": 0.850520944078248,
"grad_norm": 0.98828125,
"learning_rate": 3.08e-06,
"loss": 0.2752,
"step": 5500
},
{
"epoch": 0.8520673457947538,
"grad_norm": 0.6640625,
"learning_rate": 3.049230769230769e-06,
"loss": 0.2309,
"step": 5510
},
{
"epoch": 0.8536137475112597,
"grad_norm": 0.85546875,
"learning_rate": 3.0184615384615385e-06,
"loss": 0.33,
"step": 5520
},
{
"epoch": 0.8551601492277656,
"grad_norm": 0.765625,
"learning_rate": 2.987692307692308e-06,
"loss": 0.2942,
"step": 5530
},
{
"epoch": 0.8567065509442715,
"grad_norm": 0.76171875,
"learning_rate": 2.9569230769230773e-06,
"loss": 0.3103,
"step": 5540
},
{
"epoch": 0.8582529526607775,
"grad_norm": 0.91796875,
"learning_rate": 2.9261538461538463e-06,
"loss": 0.2775,
"step": 5550
},
{
"epoch": 0.8597993543772834,
"grad_norm": 1.0234375,
"learning_rate": 2.8953846153846153e-06,
"loss": 0.2941,
"step": 5560
},
{
"epoch": 0.8613457560937893,
"grad_norm": 0.72265625,
"learning_rate": 2.8646153846153847e-06,
"loss": 0.2591,
"step": 5570
},
{
"epoch": 0.8628921578102952,
"grad_norm": 1.0390625,
"learning_rate": 2.833846153846154e-06,
"loss": 0.2801,
"step": 5580
},
{
"epoch": 0.864438559526801,
"grad_norm": 0.859375,
"learning_rate": 2.8030769230769234e-06,
"loss": 0.3041,
"step": 5590
},
{
"epoch": 0.8659849612433069,
"grad_norm": 1.2265625,
"learning_rate": 2.7723076923076924e-06,
"loss": 0.2866,
"step": 5600
},
{
"epoch": 0.8675313629598129,
"grad_norm": 0.99609375,
"learning_rate": 2.7415384615384614e-06,
"loss": 0.3128,
"step": 5610
},
{
"epoch": 0.8690777646763188,
"grad_norm": 1.0390625,
"learning_rate": 2.710769230769231e-06,
"loss": 0.3121,
"step": 5620
},
{
"epoch": 0.8706241663928247,
"grad_norm": 0.7578125,
"learning_rate": 2.68e-06,
"loss": 0.2264,
"step": 5630
},
{
"epoch": 0.8721705681093306,
"grad_norm": 1.0390625,
"learning_rate": 2.6492307692307696e-06,
"loss": 0.2619,
"step": 5640
},
{
"epoch": 0.8737169698258365,
"grad_norm": 0.70703125,
"learning_rate": 2.6184615384615385e-06,
"loss": 0.2631,
"step": 5650
},
{
"epoch": 0.8752633715423425,
"grad_norm": 0.9765625,
"learning_rate": 2.587692307692308e-06,
"loss": 0.2636,
"step": 5660
},
{
"epoch": 0.8768097732588483,
"grad_norm": 1.03125,
"learning_rate": 2.5569230769230773e-06,
"loss": 0.3569,
"step": 5670
},
{
"epoch": 0.8783561749753542,
"grad_norm": 0.76953125,
"learning_rate": 2.5261538461538463e-06,
"loss": 0.2297,
"step": 5680
},
{
"epoch": 0.8799025766918601,
"grad_norm": 0.89453125,
"learning_rate": 2.4953846153846157e-06,
"loss": 0.2181,
"step": 5690
},
{
"epoch": 0.881448978408366,
"grad_norm": 1.3359375,
"learning_rate": 2.4646153846153847e-06,
"loss": 0.3117,
"step": 5700
},
{
"epoch": 0.8829953801248719,
"grad_norm": 0.9296875,
"learning_rate": 2.433846153846154e-06,
"loss": 0.3071,
"step": 5710
},
{
"epoch": 0.8845417818413779,
"grad_norm": 0.828125,
"learning_rate": 2.4030769230769235e-06,
"loss": 0.2599,
"step": 5720
},
{
"epoch": 0.8860881835578838,
"grad_norm": 1.0234375,
"learning_rate": 2.3723076923076924e-06,
"loss": 0.265,
"step": 5730
},
{
"epoch": 0.8876345852743897,
"grad_norm": 1.0078125,
"learning_rate": 2.341538461538462e-06,
"loss": 0.2922,
"step": 5740
},
{
"epoch": 0.8891809869908955,
"grad_norm": 0.984375,
"learning_rate": 2.310769230769231e-06,
"loss": 0.3616,
"step": 5750
},
{
"epoch": 0.8907273887074014,
"grad_norm": 1.2578125,
"learning_rate": 2.28e-06,
"loss": 0.2587,
"step": 5760
},
{
"epoch": 0.8922737904239074,
"grad_norm": 0.7890625,
"learning_rate": 2.2492307692307696e-06,
"loss": 0.335,
"step": 5770
},
{
"epoch": 0.8938201921404133,
"grad_norm": 0.9375,
"learning_rate": 2.218461538461539e-06,
"loss": 0.288,
"step": 5780
},
{
"epoch": 0.8953665938569192,
"grad_norm": 0.921875,
"learning_rate": 2.187692307692308e-06,
"loss": 0.2932,
"step": 5790
},
{
"epoch": 0.8969129955734251,
"grad_norm": 1.109375,
"learning_rate": 2.156923076923077e-06,
"loss": 0.282,
"step": 5800
},
{
"epoch": 0.898459397289931,
"grad_norm": 0.8671875,
"learning_rate": 2.1261538461538463e-06,
"loss": 0.2073,
"step": 5810
},
{
"epoch": 0.9000057990064368,
"grad_norm": 0.87890625,
"learning_rate": 2.0953846153846157e-06,
"loss": 0.2583,
"step": 5820
},
{
"epoch": 0.9015522007229428,
"grad_norm": 1.0390625,
"learning_rate": 2.064615384615385e-06,
"loss": 0.2805,
"step": 5830
},
{
"epoch": 0.9030986024394487,
"grad_norm": 0.828125,
"learning_rate": 2.033846153846154e-06,
"loss": 0.2416,
"step": 5840
},
{
"epoch": 0.9046450041559546,
"grad_norm": 0.7890625,
"learning_rate": 2.003076923076923e-06,
"loss": 0.2826,
"step": 5850
},
{
"epoch": 0.9061914058724605,
"grad_norm": 0.9375,
"learning_rate": 1.9723076923076924e-06,
"loss": 0.3072,
"step": 5860
},
{
"epoch": 0.9077378075889664,
"grad_norm": 1.171875,
"learning_rate": 1.941538461538462e-06,
"loss": 0.3551,
"step": 5870
},
{
"epoch": 0.9092842093054724,
"grad_norm": 0.84765625,
"learning_rate": 1.9107692307692312e-06,
"loss": 0.3224,
"step": 5880
},
{
"epoch": 0.9108306110219783,
"grad_norm": 0.890625,
"learning_rate": 1.8800000000000002e-06,
"loss": 0.2501,
"step": 5890
},
{
"epoch": 0.9123770127384841,
"grad_norm": 0.671875,
"learning_rate": 1.8492307692307692e-06,
"loss": 0.2555,
"step": 5900
},
{
"epoch": 0.91392341445499,
"grad_norm": 0.8671875,
"learning_rate": 1.8184615384615386e-06,
"loss": 0.329,
"step": 5910
},
{
"epoch": 0.9154698161714959,
"grad_norm": 0.953125,
"learning_rate": 1.7876923076923078e-06,
"loss": 0.3193,
"step": 5920
},
{
"epoch": 0.9170162178880018,
"grad_norm": 1.2265625,
"learning_rate": 1.7569230769230772e-06,
"loss": 0.3162,
"step": 5930
},
{
"epoch": 0.9185626196045078,
"grad_norm": 0.87109375,
"learning_rate": 1.7261538461538463e-06,
"loss": 0.29,
"step": 5940
},
{
"epoch": 0.9201090213210137,
"grad_norm": 0.859375,
"learning_rate": 1.6953846153846153e-06,
"loss": 0.3122,
"step": 5950
},
{
"epoch": 0.9216554230375196,
"grad_norm": 0.75,
"learning_rate": 1.6646153846153847e-06,
"loss": 0.2374,
"step": 5960
},
{
"epoch": 0.9232018247540255,
"grad_norm": 0.80859375,
"learning_rate": 1.6338461538461539e-06,
"loss": 0.2562,
"step": 5970
},
{
"epoch": 0.9247482264705313,
"grad_norm": 1.0859375,
"learning_rate": 1.6030769230769233e-06,
"loss": 0.2854,
"step": 5980
},
{
"epoch": 0.9262946281870373,
"grad_norm": 1.1015625,
"learning_rate": 1.5723076923076925e-06,
"loss": 0.3549,
"step": 5990
},
{
"epoch": 0.9278410299035432,
"grad_norm": 1.1953125,
"learning_rate": 1.5415384615384614e-06,
"loss": 0.4152,
"step": 6000
},
{
"epoch": 0.9293874316200491,
"grad_norm": 1.0546875,
"learning_rate": 1.5107692307692308e-06,
"loss": 0.2626,
"step": 6010
},
{
"epoch": 0.930933833336555,
"grad_norm": 0.9921875,
"learning_rate": 1.48e-06,
"loss": 0.3302,
"step": 6020
},
{
"epoch": 0.9324802350530609,
"grad_norm": 1.0390625,
"learning_rate": 1.4492307692307694e-06,
"loss": 0.3063,
"step": 6030
},
{
"epoch": 0.9340266367695668,
"grad_norm": 1.2265625,
"learning_rate": 1.4184615384615386e-06,
"loss": 0.2758,
"step": 6040
},
{
"epoch": 0.9355730384860728,
"grad_norm": 1.015625,
"learning_rate": 1.3876923076923076e-06,
"loss": 0.2999,
"step": 6050
},
{
"epoch": 0.9371194402025786,
"grad_norm": 1.0234375,
"learning_rate": 1.356923076923077e-06,
"loss": 0.3438,
"step": 6060
},
{
"epoch": 0.9386658419190845,
"grad_norm": 0.70703125,
"learning_rate": 1.3261538461538461e-06,
"loss": 0.2368,
"step": 6070
},
{
"epoch": 0.9402122436355904,
"grad_norm": 0.80859375,
"learning_rate": 1.2953846153846155e-06,
"loss": 0.2524,
"step": 6080
},
{
"epoch": 0.9417586453520963,
"grad_norm": 0.74609375,
"learning_rate": 1.2646153846153847e-06,
"loss": 0.2264,
"step": 6090
},
{
"epoch": 0.9433050470686023,
"grad_norm": 0.75390625,
"learning_rate": 1.233846153846154e-06,
"loss": 0.2067,
"step": 6100
},
{
"epoch": 0.9448514487851082,
"grad_norm": 0.81640625,
"learning_rate": 1.2030769230769233e-06,
"loss": 0.231,
"step": 6110
},
{
"epoch": 0.9463978505016141,
"grad_norm": 0.9765625,
"learning_rate": 1.1723076923076925e-06,
"loss": 0.2731,
"step": 6120
},
{
"epoch": 0.94794425221812,
"grad_norm": 0.9140625,
"learning_rate": 1.1415384615384617e-06,
"loss": 0.2837,
"step": 6130
},
{
"epoch": 0.9494906539346258,
"grad_norm": 1.0078125,
"learning_rate": 1.1107692307692309e-06,
"loss": 0.2272,
"step": 6140
},
{
"epoch": 0.9510370556511317,
"grad_norm": 0.9921875,
"learning_rate": 1.08e-06,
"loss": 0.3152,
"step": 6150
},
{
"epoch": 0.9525834573676377,
"grad_norm": 0.86328125,
"learning_rate": 1.0492307692307694e-06,
"loss": 0.3017,
"step": 6160
},
{
"epoch": 0.9541298590841436,
"grad_norm": 0.8828125,
"learning_rate": 1.0184615384615386e-06,
"loss": 0.3378,
"step": 6170
},
{
"epoch": 0.9556762608006495,
"grad_norm": 0.92578125,
"learning_rate": 9.876923076923078e-07,
"loss": 0.2503,
"step": 6180
},
{
"epoch": 0.9572226625171554,
"grad_norm": 0.87109375,
"learning_rate": 9.56923076923077e-07,
"loss": 0.3438,
"step": 6190
},
{
"epoch": 0.9587690642336613,
"grad_norm": 0.875,
"learning_rate": 9.261538461538462e-07,
"loss": 0.2667,
"step": 6200
},
{
"epoch": 0.9603154659501673,
"grad_norm": 0.64453125,
"learning_rate": 8.953846153846155e-07,
"loss": 0.2745,
"step": 6210
},
{
"epoch": 0.9618618676666731,
"grad_norm": 0.84765625,
"learning_rate": 8.646153846153847e-07,
"loss": 0.2683,
"step": 6220
},
{
"epoch": 0.963408269383179,
"grad_norm": 0.91015625,
"learning_rate": 8.338461538461539e-07,
"loss": 0.2422,
"step": 6230
},
{
"epoch": 0.9649546710996849,
"grad_norm": 0.89453125,
"learning_rate": 8.030769230769231e-07,
"loss": 0.2588,
"step": 6240
},
{
"epoch": 0.9665010728161908,
"grad_norm": 0.87890625,
"learning_rate": 7.723076923076923e-07,
"loss": 0.2812,
"step": 6250
},
{
"epoch": 0.9680474745326967,
"grad_norm": 1.09375,
"learning_rate": 7.415384615384616e-07,
"loss": 0.3232,
"step": 6260
},
{
"epoch": 0.9695938762492027,
"grad_norm": 0.92578125,
"learning_rate": 7.107692307692309e-07,
"loss": 0.2422,
"step": 6270
},
{
"epoch": 0.9711402779657086,
"grad_norm": 0.9765625,
"learning_rate": 6.800000000000001e-07,
"loss": 0.2833,
"step": 6280
},
{
"epoch": 0.9726866796822145,
"grad_norm": 0.890625,
"learning_rate": 6.492307692307692e-07,
"loss": 0.2517,
"step": 6290
},
{
"epoch": 0.9742330813987203,
"grad_norm": 0.99609375,
"learning_rate": 6.184615384615385e-07,
"loss": 0.2534,
"step": 6300
},
{
"epoch": 0.9757794831152262,
"grad_norm": 0.8125,
"learning_rate": 5.876923076923077e-07,
"loss": 0.2911,
"step": 6310
},
{
"epoch": 0.9773258848317322,
"grad_norm": 1.1015625,
"learning_rate": 5.56923076923077e-07,
"loss": 0.2628,
"step": 6320
},
{
"epoch": 0.9788722865482381,
"grad_norm": 1.1015625,
"learning_rate": 5.261538461538462e-07,
"loss": 0.3191,
"step": 6330
},
{
"epoch": 0.980418688264744,
"grad_norm": 0.9765625,
"learning_rate": 4.953846153846155e-07,
"loss": 0.3225,
"step": 6340
},
{
"epoch": 0.9819650899812499,
"grad_norm": 0.92578125,
"learning_rate": 4.6461538461538465e-07,
"loss": 0.2836,
"step": 6350
},
{
"epoch": 0.9835114916977558,
"grad_norm": 1.0625,
"learning_rate": 4.3384615384615384e-07,
"loss": 0.303,
"step": 6360
},
{
"epoch": 0.9850578934142616,
"grad_norm": 1.171875,
"learning_rate": 4.0307692307692313e-07,
"loss": 0.2381,
"step": 6370
},
{
"epoch": 0.9866042951307676,
"grad_norm": 0.77734375,
"learning_rate": 3.7230769230769236e-07,
"loss": 0.2821,
"step": 6380
},
{
"epoch": 0.9881506968472735,
"grad_norm": 0.796875,
"learning_rate": 3.4153846153846155e-07,
"loss": 0.3187,
"step": 6390
},
{
"epoch": 0.9896970985637794,
"grad_norm": 0.80859375,
"learning_rate": 3.107692307692308e-07,
"loss": 0.2993,
"step": 6400
},
{
"epoch": 0.9912435002802853,
"grad_norm": 0.7421875,
"learning_rate": 2.8e-07,
"loss": 0.2923,
"step": 6410
},
{
"epoch": 0.9927899019967912,
"grad_norm": 0.953125,
"learning_rate": 2.4923076923076926e-07,
"loss": 0.2407,
"step": 6420
},
{
"epoch": 0.9943363037132972,
"grad_norm": 1.0078125,
"learning_rate": 2.1846153846153847e-07,
"loss": 0.257,
"step": 6430
},
{
"epoch": 0.9958827054298031,
"grad_norm": 1.171875,
"learning_rate": 1.8769230769230773e-07,
"loss": 0.2531,
"step": 6440
},
{
"epoch": 0.997429107146309,
"grad_norm": 0.69921875,
"learning_rate": 1.5692307692307694e-07,
"loss": 0.273,
"step": 6450
},
{
"epoch": 0.9989755088628148,
"grad_norm": 0.99609375,
"learning_rate": 1.2615384615384617e-07,
"loss": 0.313,
"step": 6460
},
{
"epoch": 1.0004639205149517,
"grad_norm": 0.75,
"learning_rate": 9.53846153846154e-08,
"loss": 0.2218,
"step": 6470
},
{
"epoch": 1.0020103222314576,
"grad_norm": 1.0625,
"learning_rate": 6.461538461538462e-08,
"loss": 0.3402,
"step": 6480
},
{
"epoch": 1.0035567239479637,
"grad_norm": 0.80078125,
"learning_rate": 3.384615384615385e-08,
"loss": 0.2327,
"step": 6490
},
{
"epoch": 1.0051031256644696,
"grad_norm": 0.98046875,
"learning_rate": 3.0769230769230774e-09,
"loss": 0.2185,
"step": 6500
}
],
"logging_steps": 10,
"max_steps": 6500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4935783864782275e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}