patient-v1.3 / checkpoint-10000 / trainer_state.json
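This appears to be the standard `trainer_state.json` written by the Hugging Face Transformers `Trainer`: `log_history` is a list of per-logging-step records (here one every 10 optimizer steps), each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. A minimal sketch for inspecting the recorded loss curve, assuming the file has been downloaded locally (the local filename below is illustrative, not part of the upload):

    # Summarize the loss curve recorded in log_history.
    # Assumes the checkpoint file was saved locally as "trainer_state.json".
    import json

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Each training entry holds epoch, step, loss, grad_norm, and learning_rate.
    for entry in state["log_history"]:
        if "loss" in entry:  # skip eval-only entries, if any are present
            print(f"step {entry['step']:>6}  epoch {entry['epoch']:.2f}  "
                  f"loss {entry['loss']:.4f}  lr {entry['learning_rate']:.2e}")

The file contents follow.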
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.61333046193604,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008614191881124151,
"grad_norm": 3.921875,
"learning_rate": 1.9982000000000003e-05,
"loss": 1.3429,
"step": 10
},
{
"epoch": 0.017228383762248303,
"grad_norm": 3.15625,
"learning_rate": 1.9962000000000003e-05,
"loss": 0.7212,
"step": 20
},
{
"epoch": 0.025842575643372456,
"grad_norm": 3.015625,
"learning_rate": 1.9942e-05,
"loss": 0.6892,
"step": 30
},
{
"epoch": 0.034456767524496605,
"grad_norm": 2.890625,
"learning_rate": 1.9922e-05,
"loss": 0.6611,
"step": 40
},
{
"epoch": 0.04307095940562076,
"grad_norm": 2.875,
"learning_rate": 1.9902e-05,
"loss": 0.6514,
"step": 50
},
{
"epoch": 0.05168515128674491,
"grad_norm": 3.0,
"learning_rate": 1.9882e-05,
"loss": 0.6437,
"step": 60
},
{
"epoch": 0.060299343167869064,
"grad_norm": 2.96875,
"learning_rate": 1.9862e-05,
"loss": 0.6161,
"step": 70
},
{
"epoch": 0.06891353504899321,
"grad_norm": 3.109375,
"learning_rate": 1.9842e-05,
"loss": 0.6083,
"step": 80
},
{
"epoch": 0.07752772693011736,
"grad_norm": 3.296875,
"learning_rate": 1.9822e-05,
"loss": 0.5967,
"step": 90
},
{
"epoch": 0.08614191881124152,
"grad_norm": 2.953125,
"learning_rate": 1.9802e-05,
"loss": 0.5773,
"step": 100
},
{
"epoch": 0.09475611069236567,
"grad_norm": 3.21875,
"learning_rate": 1.9782e-05,
"loss": 0.5722,
"step": 110
},
{
"epoch": 0.10337030257348982,
"grad_norm": 2.796875,
"learning_rate": 1.9762e-05,
"loss": 0.5528,
"step": 120
},
{
"epoch": 0.11198449445461398,
"grad_norm": 2.953125,
"learning_rate": 1.9742000000000002e-05,
"loss": 0.5952,
"step": 130
},
{
"epoch": 0.12059868633573813,
"grad_norm": 2.6875,
"learning_rate": 1.9722000000000002e-05,
"loss": 0.5309,
"step": 140
},
{
"epoch": 0.12921287821686228,
"grad_norm": 2.609375,
"learning_rate": 1.9702000000000002e-05,
"loss": 0.5353,
"step": 150
},
{
"epoch": 0.13782707009798642,
"grad_norm": 2.71875,
"learning_rate": 1.9682000000000002e-05,
"loss": 0.5447,
"step": 160
},
{
"epoch": 0.1464412619791106,
"grad_norm": 2.859375,
"learning_rate": 1.9662000000000003e-05,
"loss": 0.4998,
"step": 170
},
{
"epoch": 0.15505545386023473,
"grad_norm": 3.09375,
"learning_rate": 1.9642000000000003e-05,
"loss": 0.5191,
"step": 180
},
{
"epoch": 0.1636696457413589,
"grad_norm": 3.140625,
"learning_rate": 1.9622e-05,
"loss": 0.5358,
"step": 190
},
{
"epoch": 0.17228383762248303,
"grad_norm": 2.515625,
"learning_rate": 1.9602e-05,
"loss": 0.4914,
"step": 200
},
{
"epoch": 0.1808980295036072,
"grad_norm": 2.6875,
"learning_rate": 1.9582e-05,
"loss": 0.4943,
"step": 210
},
{
"epoch": 0.18951222138473134,
"grad_norm": 2.53125,
"learning_rate": 1.9562e-05,
"loss": 0.4731,
"step": 220
},
{
"epoch": 0.1981264132658555,
"grad_norm": 2.625,
"learning_rate": 1.9542e-05,
"loss": 0.497,
"step": 230
},
{
"epoch": 0.20674060514697964,
"grad_norm": 2.46875,
"learning_rate": 1.9522e-05,
"loss": 0.4746,
"step": 240
},
{
"epoch": 0.2153547970281038,
"grad_norm": 2.515625,
"learning_rate": 1.9502e-05,
"loss": 0.4763,
"step": 250
},
{
"epoch": 0.22396898890922795,
"grad_norm": 2.546875,
"learning_rate": 1.9482e-05,
"loss": 0.4759,
"step": 260
},
{
"epoch": 0.23258318079035212,
"grad_norm": 2.71875,
"learning_rate": 1.9462e-05,
"loss": 0.5039,
"step": 270
},
{
"epoch": 0.24119737267147626,
"grad_norm": 2.546875,
"learning_rate": 1.9442e-05,
"loss": 0.4799,
"step": 280
},
{
"epoch": 0.2498115645526004,
"grad_norm": 2.84375,
"learning_rate": 1.9422e-05,
"loss": 0.4446,
"step": 290
},
{
"epoch": 0.25842575643372456,
"grad_norm": 2.671875,
"learning_rate": 1.9402e-05,
"loss": 0.4727,
"step": 300
},
{
"epoch": 0.26703994831484873,
"grad_norm": 2.890625,
"learning_rate": 1.9382000000000002e-05,
"loss": 0.4189,
"step": 310
},
{
"epoch": 0.27565414019597284,
"grad_norm": 2.265625,
"learning_rate": 1.9362000000000002e-05,
"loss": 0.4409,
"step": 320
},
{
"epoch": 0.284268332077097,
"grad_norm": 3.25,
"learning_rate": 1.9342000000000002e-05,
"loss": 0.4656,
"step": 330
},
{
"epoch": 0.2928825239582212,
"grad_norm": 2.578125,
"learning_rate": 1.9322000000000002e-05,
"loss": 0.4713,
"step": 340
},
{
"epoch": 0.30149671583934534,
"grad_norm": 2.0625,
"learning_rate": 1.9302e-05,
"loss": 0.4282,
"step": 350
},
{
"epoch": 0.31011090772046945,
"grad_norm": 2.953125,
"learning_rate": 1.9282e-05,
"loss": 0.4565,
"step": 360
},
{
"epoch": 0.3187250996015936,
"grad_norm": 2.296875,
"learning_rate": 1.9262e-05,
"loss": 0.4346,
"step": 370
},
{
"epoch": 0.3273392914827178,
"grad_norm": 2.828125,
"learning_rate": 1.9242e-05,
"loss": 0.426,
"step": 380
},
{
"epoch": 0.33595348336384195,
"grad_norm": 2.3125,
"learning_rate": 1.9222e-05,
"loss": 0.42,
"step": 390
},
{
"epoch": 0.34456767524496607,
"grad_norm": 2.171875,
"learning_rate": 1.9202e-05,
"loss": 0.4317,
"step": 400
},
{
"epoch": 0.35318186712609023,
"grad_norm": 2.59375,
"learning_rate": 1.9182e-05,
"loss": 0.4311,
"step": 410
},
{
"epoch": 0.3617960590072144,
"grad_norm": 2.609375,
"learning_rate": 1.9162e-05,
"loss": 0.4056,
"step": 420
},
{
"epoch": 0.3704102508883385,
"grad_norm": 2.1875,
"learning_rate": 1.9142e-05,
"loss": 0.4029,
"step": 430
},
{
"epoch": 0.3790244427694627,
"grad_norm": 3.125,
"learning_rate": 1.9122e-05,
"loss": 0.4337,
"step": 440
},
{
"epoch": 0.38763863465058684,
"grad_norm": 2.390625,
"learning_rate": 1.9102e-05,
"loss": 0.4381,
"step": 450
},
{
"epoch": 0.396252826531711,
"grad_norm": 2.796875,
"learning_rate": 1.9082e-05,
"loss": 0.4174,
"step": 460
},
{
"epoch": 0.4048670184128351,
"grad_norm": 2.421875,
"learning_rate": 1.9062e-05,
"loss": 0.3928,
"step": 470
},
{
"epoch": 0.4134812102939593,
"grad_norm": 2.515625,
"learning_rate": 1.9042e-05,
"loss": 0.4051,
"step": 480
},
{
"epoch": 0.42209540217508346,
"grad_norm": 2.40625,
"learning_rate": 1.9022000000000002e-05,
"loss": 0.3992,
"step": 490
},
{
"epoch": 0.4307095940562076,
"grad_norm": 2.21875,
"learning_rate": 1.9002000000000002e-05,
"loss": 0.4194,
"step": 500
},
{
"epoch": 0.43932378593733173,
"grad_norm": 3.328125,
"learning_rate": 1.8982000000000002e-05,
"loss": 0.3951,
"step": 510
},
{
"epoch": 0.4479379778184559,
"grad_norm": 2.234375,
"learning_rate": 1.8962000000000002e-05,
"loss": 0.3918,
"step": 520
},
{
"epoch": 0.45655216969958007,
"grad_norm": 2.65625,
"learning_rate": 1.8942000000000003e-05,
"loss": 0.3854,
"step": 530
},
{
"epoch": 0.46516636158070424,
"grad_norm": 2.0625,
"learning_rate": 1.8922000000000003e-05,
"loss": 0.3836,
"step": 540
},
{
"epoch": 0.47378055346182835,
"grad_norm": 2.28125,
"learning_rate": 1.8902000000000003e-05,
"loss": 0.3824,
"step": 550
},
{
"epoch": 0.4823947453429525,
"grad_norm": 2.625,
"learning_rate": 1.8882000000000003e-05,
"loss": 0.3913,
"step": 560
},
{
"epoch": 0.4910089372240767,
"grad_norm": 2.265625,
"learning_rate": 1.8862000000000003e-05,
"loss": 0.3834,
"step": 570
},
{
"epoch": 0.4996231291052008,
"grad_norm": 2.359375,
"learning_rate": 1.8842000000000004e-05,
"loss": 0.3848,
"step": 580
},
{
"epoch": 0.508237320986325,
"grad_norm": 2.34375,
"learning_rate": 1.8822000000000004e-05,
"loss": 0.3845,
"step": 590
},
{
"epoch": 0.5168515128674491,
"grad_norm": 2.453125,
"learning_rate": 1.8802000000000004e-05,
"loss": 0.3836,
"step": 600
},
{
"epoch": 0.5254657047485732,
"grad_norm": 2.0,
"learning_rate": 1.8782e-05,
"loss": 0.3799,
"step": 610
},
{
"epoch": 0.5340798966296975,
"grad_norm": 2.296875,
"learning_rate": 1.8762e-05,
"loss": 0.3715,
"step": 620
},
{
"epoch": 0.5426940885108216,
"grad_norm": 2.0625,
"learning_rate": 1.8742e-05,
"loss": 0.3825,
"step": 630
},
{
"epoch": 0.5513082803919457,
"grad_norm": 2.359375,
"learning_rate": 1.8722e-05,
"loss": 0.364,
"step": 640
},
{
"epoch": 0.5599224722730699,
"grad_norm": 2.1875,
"learning_rate": 1.8702e-05,
"loss": 0.3765,
"step": 650
},
{
"epoch": 0.568536664154194,
"grad_norm": 1.96875,
"learning_rate": 1.8682000000000002e-05,
"loss": 0.3748,
"step": 660
},
{
"epoch": 0.5771508560353182,
"grad_norm": 2.390625,
"learning_rate": 1.8662000000000002e-05,
"loss": 0.3751,
"step": 670
},
{
"epoch": 0.5857650479164423,
"grad_norm": 2.203125,
"learning_rate": 1.8642000000000002e-05,
"loss": 0.3778,
"step": 680
},
{
"epoch": 0.5943792397975665,
"grad_norm": 1.796875,
"learning_rate": 1.8622000000000002e-05,
"loss": 0.3798,
"step": 690
},
{
"epoch": 0.6029934316786907,
"grad_norm": 2.46875,
"learning_rate": 1.8602000000000002e-05,
"loss": 0.3682,
"step": 700
},
{
"epoch": 0.6116076235598148,
"grad_norm": 1.9765625,
"learning_rate": 1.8582000000000003e-05,
"loss": 0.3652,
"step": 710
},
{
"epoch": 0.6202218154409389,
"grad_norm": 2.25,
"learning_rate": 1.8562000000000003e-05,
"loss": 0.3658,
"step": 720
},
{
"epoch": 0.6288360073220631,
"grad_norm": 1.9140625,
"learning_rate": 1.8542000000000003e-05,
"loss": 0.389,
"step": 730
},
{
"epoch": 0.6374501992031872,
"grad_norm": 1.9375,
"learning_rate": 1.8522000000000003e-05,
"loss": 0.375,
"step": 740
},
{
"epoch": 0.6460643910843114,
"grad_norm": 2.140625,
"learning_rate": 1.8502000000000003e-05,
"loss": 0.3617,
"step": 750
},
{
"epoch": 0.6546785829654356,
"grad_norm": 1.9140625,
"learning_rate": 1.8482000000000004e-05,
"loss": 0.3777,
"step": 760
},
{
"epoch": 0.6632927748465597,
"grad_norm": 1.9453125,
"learning_rate": 1.8462000000000004e-05,
"loss": 0.3599,
"step": 770
},
{
"epoch": 0.6719069667276839,
"grad_norm": 1.75,
"learning_rate": 1.8442e-05,
"loss": 0.3495,
"step": 780
},
{
"epoch": 0.680521158608808,
"grad_norm": 2.109375,
"learning_rate": 1.8422e-05,
"loss": 0.3461,
"step": 790
},
{
"epoch": 0.6891353504899321,
"grad_norm": 2.078125,
"learning_rate": 1.8402e-05,
"loss": 0.3661,
"step": 800
},
{
"epoch": 0.6977495423710564,
"grad_norm": 2.03125,
"learning_rate": 1.8382e-05,
"loss": 0.3594,
"step": 810
},
{
"epoch": 0.7063637342521805,
"grad_norm": 1.984375,
"learning_rate": 1.8362e-05,
"loss": 0.3512,
"step": 820
},
{
"epoch": 0.7149779261333046,
"grad_norm": 2.0625,
"learning_rate": 1.8342e-05,
"loss": 0.3616,
"step": 830
},
{
"epoch": 0.7235921180144288,
"grad_norm": 2.484375,
"learning_rate": 1.8322000000000002e-05,
"loss": 0.3575,
"step": 840
},
{
"epoch": 0.7322063098955529,
"grad_norm": 2.1875,
"learning_rate": 1.8302000000000002e-05,
"loss": 0.3712,
"step": 850
},
{
"epoch": 0.740820501776677,
"grad_norm": 2.046875,
"learning_rate": 1.8282000000000002e-05,
"loss": 0.3724,
"step": 860
},
{
"epoch": 0.7494346936578012,
"grad_norm": 1.953125,
"learning_rate": 1.8262000000000002e-05,
"loss": 0.3524,
"step": 870
},
{
"epoch": 0.7580488855389254,
"grad_norm": 2.203125,
"learning_rate": 1.8242000000000003e-05,
"loss": 0.3543,
"step": 880
},
{
"epoch": 0.7666630774200496,
"grad_norm": 2.015625,
"learning_rate": 1.8222000000000003e-05,
"loss": 0.3697,
"step": 890
},
{
"epoch": 0.7752772693011737,
"grad_norm": 2.140625,
"learning_rate": 1.8202000000000003e-05,
"loss": 0.3583,
"step": 900
},
{
"epoch": 0.7838914611822978,
"grad_norm": 1.9921875,
"learning_rate": 1.8182000000000003e-05,
"loss": 0.3753,
"step": 910
},
{
"epoch": 0.792505653063422,
"grad_norm": 1.9375,
"learning_rate": 1.8162000000000003e-05,
"loss": 0.3581,
"step": 920
},
{
"epoch": 0.8011198449445461,
"grad_norm": 2.171875,
"learning_rate": 1.8142000000000004e-05,
"loss": 0.3534,
"step": 930
},
{
"epoch": 0.8097340368256702,
"grad_norm": 2.328125,
"learning_rate": 1.8122e-05,
"loss": 0.3654,
"step": 940
},
{
"epoch": 0.8183482287067945,
"grad_norm": 1.875,
"learning_rate": 1.8102e-05,
"loss": 0.3663,
"step": 950
},
{
"epoch": 0.8269624205879186,
"grad_norm": 1.921875,
"learning_rate": 1.8082e-05,
"loss": 0.3599,
"step": 960
},
{
"epoch": 0.8355766124690428,
"grad_norm": 2.296875,
"learning_rate": 1.8062e-05,
"loss": 0.3511,
"step": 970
},
{
"epoch": 0.8441908043501669,
"grad_norm": 2.0625,
"learning_rate": 1.8042e-05,
"loss": 0.3615,
"step": 980
},
{
"epoch": 0.852804996231291,
"grad_norm": 1.9140625,
"learning_rate": 1.8022e-05,
"loss": 0.3523,
"step": 990
},
{
"epoch": 0.8614191881124152,
"grad_norm": 2.0,
"learning_rate": 1.8002e-05,
"loss": 0.3591,
"step": 1000
},
{
"epoch": 0.8700333799935394,
"grad_norm": 2.078125,
"learning_rate": 1.7982e-05,
"loss": 0.3567,
"step": 1010
},
{
"epoch": 0.8786475718746635,
"grad_norm": 1.984375,
"learning_rate": 1.7962000000000002e-05,
"loss": 0.3568,
"step": 1020
},
{
"epoch": 0.8872617637557877,
"grad_norm": 1.765625,
"learning_rate": 1.7942000000000002e-05,
"loss": 0.3492,
"step": 1030
},
{
"epoch": 0.8958759556369118,
"grad_norm": 1.671875,
"learning_rate": 1.7922000000000002e-05,
"loss": 0.3386,
"step": 1040
},
{
"epoch": 0.9044901475180359,
"grad_norm": 2.09375,
"learning_rate": 1.7902000000000002e-05,
"loss": 0.3496,
"step": 1050
},
{
"epoch": 0.9131043393991601,
"grad_norm": 1.9296875,
"learning_rate": 1.7882000000000003e-05,
"loss": 0.3278,
"step": 1060
},
{
"epoch": 0.9217185312802842,
"grad_norm": 1.8359375,
"learning_rate": 1.7862000000000003e-05,
"loss": 0.3343,
"step": 1070
},
{
"epoch": 0.9303327231614085,
"grad_norm": 1.6171875,
"learning_rate": 1.7842000000000003e-05,
"loss": 0.3389,
"step": 1080
},
{
"epoch": 0.9389469150425326,
"grad_norm": 1.96875,
"learning_rate": 1.7822000000000003e-05,
"loss": 0.351,
"step": 1090
},
{
"epoch": 0.9475611069236567,
"grad_norm": 1.8203125,
"learning_rate": 1.7802e-05,
"loss": 0.3625,
"step": 1100
},
{
"epoch": 0.9561752988047809,
"grad_norm": 2.03125,
"learning_rate": 1.7782e-05,
"loss": 0.3597,
"step": 1110
},
{
"epoch": 0.964789490685905,
"grad_norm": 2.015625,
"learning_rate": 1.7762e-05,
"loss": 0.3631,
"step": 1120
},
{
"epoch": 0.9734036825670291,
"grad_norm": 1.859375,
"learning_rate": 1.7742e-05,
"loss": 0.3378,
"step": 1130
},
{
"epoch": 0.9820178744481534,
"grad_norm": 1.71875,
"learning_rate": 1.7722e-05,
"loss": 0.3461,
"step": 1140
},
{
"epoch": 0.9906320663292775,
"grad_norm": 1.9609375,
"learning_rate": 1.7702e-05,
"loss": 0.3691,
"step": 1150
},
{
"epoch": 0.9992462582104016,
"grad_norm": 1.6328125,
"learning_rate": 1.7682e-05,
"loss": 0.3332,
"step": 1160
},
{
"epoch": 1.0077527726930118,
"grad_norm": 1.7890625,
"learning_rate": 1.7662e-05,
"loss": 0.2853,
"step": 1170
},
{
"epoch": 1.0163669645741358,
"grad_norm": 1.8125,
"learning_rate": 1.7642e-05,
"loss": 0.284,
"step": 1180
},
{
"epoch": 1.02498115645526,
"grad_norm": 1.78125,
"learning_rate": 1.7622000000000002e-05,
"loss": 0.2768,
"step": 1190
},
{
"epoch": 1.0335953483363842,
"grad_norm": 1.8515625,
"learning_rate": 1.7602000000000002e-05,
"loss": 0.2734,
"step": 1200
},
{
"epoch": 1.0422095402175084,
"grad_norm": 1.7734375,
"learning_rate": 1.7582000000000002e-05,
"loss": 0.2697,
"step": 1210
},
{
"epoch": 1.0508237320986324,
"grad_norm": 1.9765625,
"learning_rate": 1.7562000000000002e-05,
"loss": 0.2799,
"step": 1220
},
{
"epoch": 1.0594379239797567,
"grad_norm": 1.875,
"learning_rate": 1.7542000000000002e-05,
"loss": 0.2766,
"step": 1230
},
{
"epoch": 1.068052115860881,
"grad_norm": 1.96875,
"learning_rate": 1.7522000000000003e-05,
"loss": 0.2742,
"step": 1240
},
{
"epoch": 1.076666307742005,
"grad_norm": 1.8671875,
"learning_rate": 1.7502000000000003e-05,
"loss": 0.2882,
"step": 1250
},
{
"epoch": 1.0852804996231291,
"grad_norm": 1.8046875,
"learning_rate": 1.7482e-05,
"loss": 0.2882,
"step": 1260
},
{
"epoch": 1.0938946915042533,
"grad_norm": 2.140625,
"learning_rate": 1.7462e-05,
"loss": 0.2828,
"step": 1270
},
{
"epoch": 1.1025088833853773,
"grad_norm": 1.859375,
"learning_rate": 1.7442e-05,
"loss": 0.2732,
"step": 1280
},
{
"epoch": 1.1111230752665016,
"grad_norm": 1.65625,
"learning_rate": 1.7422e-05,
"loss": 0.2798,
"step": 1290
},
{
"epoch": 1.1197372671476258,
"grad_norm": 1.8671875,
"learning_rate": 1.7402e-05,
"loss": 0.293,
"step": 1300
},
{
"epoch": 1.1283514590287498,
"grad_norm": 1.9609375,
"learning_rate": 1.7382e-05,
"loss": 0.2971,
"step": 1310
},
{
"epoch": 1.136965650909874,
"grad_norm": 1.953125,
"learning_rate": 1.7362e-05,
"loss": 0.2673,
"step": 1320
},
{
"epoch": 1.1455798427909982,
"grad_norm": 1.75,
"learning_rate": 1.7342e-05,
"loss": 0.2875,
"step": 1330
},
{
"epoch": 1.1541940346721222,
"grad_norm": 1.6640625,
"learning_rate": 1.7322e-05,
"loss": 0.2723,
"step": 1340
},
{
"epoch": 1.1628082265532464,
"grad_norm": 1.9296875,
"learning_rate": 1.7302e-05,
"loss": 0.3001,
"step": 1350
},
{
"epoch": 1.1714224184343707,
"grad_norm": 1.6875,
"learning_rate": 1.7282e-05,
"loss": 0.2878,
"step": 1360
},
{
"epoch": 1.180036610315495,
"grad_norm": 1.5546875,
"learning_rate": 1.7262000000000002e-05,
"loss": 0.2663,
"step": 1370
},
{
"epoch": 1.188650802196619,
"grad_norm": 1.90625,
"learning_rate": 1.7242000000000002e-05,
"loss": 0.3061,
"step": 1380
},
{
"epoch": 1.1972649940777431,
"grad_norm": 1.84375,
"learning_rate": 1.7222000000000002e-05,
"loss": 0.3015,
"step": 1390
},
{
"epoch": 1.2058791859588673,
"grad_norm": 1.8125,
"learning_rate": 1.7202000000000002e-05,
"loss": 0.2825,
"step": 1400
},
{
"epoch": 1.2144933778399913,
"grad_norm": 1.859375,
"learning_rate": 1.7182000000000003e-05,
"loss": 0.2818,
"step": 1410
},
{
"epoch": 1.2231075697211156,
"grad_norm": 1.96875,
"learning_rate": 1.7162e-05,
"loss": 0.2755,
"step": 1420
},
{
"epoch": 1.2317217616022398,
"grad_norm": 1.7421875,
"learning_rate": 1.7142e-05,
"loss": 0.2897,
"step": 1430
},
{
"epoch": 1.2403359534833638,
"grad_norm": 1.8515625,
"learning_rate": 1.7122e-05,
"loss": 0.2816,
"step": 1440
},
{
"epoch": 1.248950145364488,
"grad_norm": 2.03125,
"learning_rate": 1.7102e-05,
"loss": 0.2798,
"step": 1450
},
{
"epoch": 1.2575643372456122,
"grad_norm": 2.0,
"learning_rate": 1.7082e-05,
"loss": 0.2799,
"step": 1460
},
{
"epoch": 1.2661785291267362,
"grad_norm": 1.6953125,
"learning_rate": 1.7062e-05,
"loss": 0.2843,
"step": 1470
},
{
"epoch": 1.2747927210078605,
"grad_norm": 1.953125,
"learning_rate": 1.7042e-05,
"loss": 0.2876,
"step": 1480
},
{
"epoch": 1.2834069128889847,
"grad_norm": 1.8203125,
"learning_rate": 1.7022e-05,
"loss": 0.2912,
"step": 1490
},
{
"epoch": 1.2920211047701087,
"grad_norm": 2.0,
"learning_rate": 1.7002e-05,
"loss": 0.2811,
"step": 1500
},
{
"epoch": 1.300635296651233,
"grad_norm": 1.875,
"learning_rate": 1.6982e-05,
"loss": 0.2944,
"step": 1510
},
{
"epoch": 1.3092494885323571,
"grad_norm": 1.828125,
"learning_rate": 1.6962e-05,
"loss": 0.2796,
"step": 1520
},
{
"epoch": 1.3178636804134811,
"grad_norm": 1.78125,
"learning_rate": 1.6942e-05,
"loss": 0.2857,
"step": 1530
},
{
"epoch": 1.3264778722946053,
"grad_norm": 1.8359375,
"learning_rate": 1.6922e-05,
"loss": 0.278,
"step": 1540
},
{
"epoch": 1.3350920641757296,
"grad_norm": 2.125,
"learning_rate": 1.6902000000000002e-05,
"loss": 0.2711,
"step": 1550
},
{
"epoch": 1.3437062560568536,
"grad_norm": 1.859375,
"learning_rate": 1.6882000000000002e-05,
"loss": 0.2894,
"step": 1560
},
{
"epoch": 1.3523204479379778,
"grad_norm": 1.71875,
"learning_rate": 1.6862000000000002e-05,
"loss": 0.2834,
"step": 1570
},
{
"epoch": 1.360934639819102,
"grad_norm": 1.90625,
"learning_rate": 1.6842e-05,
"loss": 0.2748,
"step": 1580
},
{
"epoch": 1.369548831700226,
"grad_norm": 1.734375,
"learning_rate": 1.6822e-05,
"loss": 0.2904,
"step": 1590
},
{
"epoch": 1.3781630235813502,
"grad_norm": 1.65625,
"learning_rate": 1.6802e-05,
"loss": 0.2816,
"step": 1600
},
{
"epoch": 1.3867772154624745,
"grad_norm": 1.90625,
"learning_rate": 1.6782e-05,
"loss": 0.2993,
"step": 1610
},
{
"epoch": 1.3953914073435985,
"grad_norm": 1.96875,
"learning_rate": 1.6762e-05,
"loss": 0.2747,
"step": 1620
},
{
"epoch": 1.4040055992247227,
"grad_norm": 1.90625,
"learning_rate": 1.6742e-05,
"loss": 0.2814,
"step": 1630
},
{
"epoch": 1.412619791105847,
"grad_norm": 1.90625,
"learning_rate": 1.6722e-05,
"loss": 0.2777,
"step": 1640
},
{
"epoch": 1.421233982986971,
"grad_norm": 2.125,
"learning_rate": 1.6702e-05,
"loss": 0.2789,
"step": 1650
},
{
"epoch": 1.4298481748680951,
"grad_norm": 1.703125,
"learning_rate": 1.6682e-05,
"loss": 0.2876,
"step": 1660
},
{
"epoch": 1.4384623667492193,
"grad_norm": 1.828125,
"learning_rate": 1.6662e-05,
"loss": 0.2855,
"step": 1670
},
{
"epoch": 1.4470765586303436,
"grad_norm": 1.6953125,
"learning_rate": 1.6642e-05,
"loss": 0.2823,
"step": 1680
},
{
"epoch": 1.4556907505114676,
"grad_norm": 1.6875,
"learning_rate": 1.6622e-05,
"loss": 0.2759,
"step": 1690
},
{
"epoch": 1.4643049423925918,
"grad_norm": 1.734375,
"learning_rate": 1.6602e-05,
"loss": 0.2804,
"step": 1700
},
{
"epoch": 1.472919134273716,
"grad_norm": 1.71875,
"learning_rate": 1.6582e-05,
"loss": 0.2772,
"step": 1710
},
{
"epoch": 1.4815333261548402,
"grad_norm": 2.078125,
"learning_rate": 1.6562e-05,
"loss": 0.2834,
"step": 1720
},
{
"epoch": 1.4901475180359642,
"grad_norm": 1.765625,
"learning_rate": 1.6542000000000002e-05,
"loss": 0.2788,
"step": 1730
},
{
"epoch": 1.4987617099170885,
"grad_norm": 1.8359375,
"learning_rate": 1.6522e-05,
"loss": 0.2796,
"step": 1740
},
{
"epoch": 1.5073759017982127,
"grad_norm": 1.7578125,
"learning_rate": 1.6502e-05,
"loss": 0.2764,
"step": 1750
},
{
"epoch": 1.5159900936793367,
"grad_norm": 1.859375,
"learning_rate": 1.6482000000000002e-05,
"loss": 0.2893,
"step": 1760
},
{
"epoch": 1.524604285560461,
"grad_norm": 1.9140625,
"learning_rate": 1.6462000000000003e-05,
"loss": 0.2868,
"step": 1770
},
{
"epoch": 1.5332184774415851,
"grad_norm": 2.046875,
"learning_rate": 1.6442000000000003e-05,
"loss": 0.2801,
"step": 1780
},
{
"epoch": 1.5418326693227091,
"grad_norm": 1.78125,
"learning_rate": 1.6422000000000003e-05,
"loss": 0.2845,
"step": 1790
},
{
"epoch": 1.5504468612038333,
"grad_norm": 1.703125,
"learning_rate": 1.6402000000000003e-05,
"loss": 0.2814,
"step": 1800
},
{
"epoch": 1.5590610530849576,
"grad_norm": 1.765625,
"learning_rate": 1.6382000000000003e-05,
"loss": 0.2707,
"step": 1810
},
{
"epoch": 1.5676752449660816,
"grad_norm": 1.8046875,
"learning_rate": 1.6362000000000004e-05,
"loss": 0.2809,
"step": 1820
},
{
"epoch": 1.5762894368472058,
"grad_norm": 1.7578125,
"learning_rate": 1.6342000000000004e-05,
"loss": 0.2875,
"step": 1830
},
{
"epoch": 1.58490362872833,
"grad_norm": 1.734375,
"learning_rate": 1.6322e-05,
"loss": 0.292,
"step": 1840
},
{
"epoch": 1.593517820609454,
"grad_norm": 1.71875,
"learning_rate": 1.6302e-05,
"loss": 0.2954,
"step": 1850
},
{
"epoch": 1.6021320124905782,
"grad_norm": 1.796875,
"learning_rate": 1.6282e-05,
"loss": 0.2823,
"step": 1860
},
{
"epoch": 1.6107462043717025,
"grad_norm": 1.6796875,
"learning_rate": 1.6262e-05,
"loss": 0.2822,
"step": 1870
},
{
"epoch": 1.6193603962528265,
"grad_norm": 1.8515625,
"learning_rate": 1.6242e-05,
"loss": 0.2776,
"step": 1880
},
{
"epoch": 1.6279745881339507,
"grad_norm": 1.7890625,
"learning_rate": 1.6222e-05,
"loss": 0.2798,
"step": 1890
},
{
"epoch": 1.636588780015075,
"grad_norm": 2.0625,
"learning_rate": 1.6202000000000002e-05,
"loss": 0.2873,
"step": 1900
},
{
"epoch": 1.645202971896199,
"grad_norm": 1.8203125,
"learning_rate": 1.6182000000000002e-05,
"loss": 0.2783,
"step": 1910
},
{
"epoch": 1.6538171637773231,
"grad_norm": 1.796875,
"learning_rate": 1.6162000000000002e-05,
"loss": 0.2847,
"step": 1920
},
{
"epoch": 1.6624313556584474,
"grad_norm": 2.046875,
"learning_rate": 1.6142000000000002e-05,
"loss": 0.2917,
"step": 1930
},
{
"epoch": 1.6710455475395714,
"grad_norm": 1.796875,
"learning_rate": 1.6122000000000003e-05,
"loss": 0.2682,
"step": 1940
},
{
"epoch": 1.6796597394206956,
"grad_norm": 2.109375,
"learning_rate": 1.6102000000000003e-05,
"loss": 0.2837,
"step": 1950
},
{
"epoch": 1.6882739313018198,
"grad_norm": 1.78125,
"learning_rate": 1.6082000000000003e-05,
"loss": 0.2852,
"step": 1960
},
{
"epoch": 1.6968881231829438,
"grad_norm": 1.8515625,
"learning_rate": 1.6062000000000003e-05,
"loss": 0.2896,
"step": 1970
},
{
"epoch": 1.705502315064068,
"grad_norm": 1.9765625,
"learning_rate": 1.6042000000000003e-05,
"loss": 0.2827,
"step": 1980
},
{
"epoch": 1.7141165069451922,
"grad_norm": 1.75,
"learning_rate": 1.6022000000000003e-05,
"loss": 0.2725,
"step": 1990
},
{
"epoch": 1.7227306988263162,
"grad_norm": 1.8828125,
"learning_rate": 1.6002000000000004e-05,
"loss": 0.2835,
"step": 2000
},
{
"epoch": 1.7313448907074405,
"grad_norm": 1.859375,
"learning_rate": 1.5982e-05,
"loss": 0.2779,
"step": 2010
},
{
"epoch": 1.7399590825885647,
"grad_norm": 1.8046875,
"learning_rate": 1.5962e-05,
"loss": 0.2856,
"step": 2020
},
{
"epoch": 1.7485732744696887,
"grad_norm": 1.9453125,
"learning_rate": 1.5942e-05,
"loss": 0.2835,
"step": 2030
},
{
"epoch": 1.757187466350813,
"grad_norm": 1.7734375,
"learning_rate": 1.5922e-05,
"loss": 0.2777,
"step": 2040
},
{
"epoch": 1.7658016582319371,
"grad_norm": 1.8671875,
"learning_rate": 1.5902e-05,
"loss": 0.2787,
"step": 2050
},
{
"epoch": 1.7744158501130611,
"grad_norm": 1.7890625,
"learning_rate": 1.5882e-05,
"loss": 0.2842,
"step": 2060
},
{
"epoch": 1.7830300419941856,
"grad_norm": 1.75,
"learning_rate": 1.5862e-05,
"loss": 0.2654,
"step": 2070
},
{
"epoch": 1.7916442338753096,
"grad_norm": 1.875,
"learning_rate": 1.5842000000000002e-05,
"loss": 0.2701,
"step": 2080
},
{
"epoch": 1.8002584257564336,
"grad_norm": 1.8203125,
"learning_rate": 1.5822000000000002e-05,
"loss": 0.2709,
"step": 2090
},
{
"epoch": 1.808872617637558,
"grad_norm": 1.8046875,
"learning_rate": 1.5802000000000002e-05,
"loss": 0.2781,
"step": 2100
},
{
"epoch": 1.817486809518682,
"grad_norm": 1.875,
"learning_rate": 1.5782000000000002e-05,
"loss": 0.2823,
"step": 2110
},
{
"epoch": 1.826101001399806,
"grad_norm": 1.875,
"learning_rate": 1.5762000000000003e-05,
"loss": 0.2747,
"step": 2120
},
{
"epoch": 1.8347151932809305,
"grad_norm": 1.8828125,
"learning_rate": 1.5742000000000003e-05,
"loss": 0.2731,
"step": 2130
},
{
"epoch": 1.8433293851620545,
"grad_norm": 1.71875,
"learning_rate": 1.5722000000000003e-05,
"loss": 0.2738,
"step": 2140
},
{
"epoch": 1.8519435770431787,
"grad_norm": 1.8671875,
"learning_rate": 1.5702000000000003e-05,
"loss": 0.2887,
"step": 2150
},
{
"epoch": 1.860557768924303,
"grad_norm": 2.046875,
"learning_rate": 1.5682000000000003e-05,
"loss": 0.2841,
"step": 2160
},
{
"epoch": 1.869171960805427,
"grad_norm": 2.015625,
"learning_rate": 1.5662e-05,
"loss": 0.2916,
"step": 2170
},
{
"epoch": 1.8777861526865511,
"grad_norm": 1.7734375,
"learning_rate": 1.5642e-05,
"loss": 0.2747,
"step": 2180
},
{
"epoch": 1.8864003445676754,
"grad_norm": 1.7109375,
"learning_rate": 1.5622e-05,
"loss": 0.2769,
"step": 2190
},
{
"epoch": 1.8950145364487994,
"grad_norm": 1.7421875,
"learning_rate": 1.5602e-05,
"loss": 0.2798,
"step": 2200
},
{
"epoch": 1.9036287283299236,
"grad_norm": 1.8125,
"learning_rate": 1.5582e-05,
"loss": 0.2791,
"step": 2210
},
{
"epoch": 1.9122429202110478,
"grad_norm": 1.9296875,
"learning_rate": 1.5562e-05,
"loss": 0.2721,
"step": 2220
},
{
"epoch": 1.9208571120921718,
"grad_norm": 2.015625,
"learning_rate": 1.5542e-05,
"loss": 0.2778,
"step": 2230
},
{
"epoch": 1.929471303973296,
"grad_norm": 1.7265625,
"learning_rate": 1.5522e-05,
"loss": 0.2792,
"step": 2240
},
{
"epoch": 1.9380854958544202,
"grad_norm": 1.671875,
"learning_rate": 1.5502e-05,
"loss": 0.2819,
"step": 2250
},
{
"epoch": 1.9466996877355442,
"grad_norm": 1.8515625,
"learning_rate": 1.5482000000000002e-05,
"loss": 0.2691,
"step": 2260
},
{
"epoch": 1.9553138796166685,
"grad_norm": 1.6015625,
"learning_rate": 1.5462000000000002e-05,
"loss": 0.2731,
"step": 2270
},
{
"epoch": 1.9639280714977927,
"grad_norm": 1.734375,
"learning_rate": 1.5442000000000002e-05,
"loss": 0.2709,
"step": 2280
},
{
"epoch": 1.9725422633789167,
"grad_norm": 2.0625,
"learning_rate": 1.5422000000000002e-05,
"loss": 0.288,
"step": 2290
},
{
"epoch": 1.981156455260041,
"grad_norm": 1.8046875,
"learning_rate": 1.5402000000000003e-05,
"loss": 0.2807,
"step": 2300
},
{
"epoch": 1.9897706471411651,
"grad_norm": 1.8671875,
"learning_rate": 1.5382000000000003e-05,
"loss": 0.2769,
"step": 2310
},
{
"epoch": 1.9983848390222891,
"grad_norm": 1.90625,
"learning_rate": 1.5362000000000003e-05,
"loss": 0.2855,
"step": 2320
},
{
"epoch": 2.006891353504899,
"grad_norm": 1.875,
"learning_rate": 1.5342e-05,
"loss": 0.2429,
"step": 2330
},
{
"epoch": 2.0155055453860236,
"grad_norm": 2.0,
"learning_rate": 1.5322e-05,
"loss": 0.2166,
"step": 2340
},
{
"epoch": 2.0241197372671476,
"grad_norm": 1.8125,
"learning_rate": 1.5302e-05,
"loss": 0.2022,
"step": 2350
},
{
"epoch": 2.0327339291482716,
"grad_norm": 1.8984375,
"learning_rate": 1.5282e-05,
"loss": 0.1995,
"step": 2360
},
{
"epoch": 2.041348121029396,
"grad_norm": 1.8828125,
"learning_rate": 1.5262e-05,
"loss": 0.1997,
"step": 2370
},
{
"epoch": 2.04996231291052,
"grad_norm": 2.109375,
"learning_rate": 1.5242e-05,
"loss": 0.2031,
"step": 2380
},
{
"epoch": 2.0585765047916444,
"grad_norm": 1.9609375,
"learning_rate": 1.5222000000000001e-05,
"loss": 0.21,
"step": 2390
},
{
"epoch": 2.0671906966727684,
"grad_norm": 2.15625,
"learning_rate": 1.5202000000000001e-05,
"loss": 0.2159,
"step": 2400
},
{
"epoch": 2.0758048885538924,
"grad_norm": 2.03125,
"learning_rate": 1.5182000000000001e-05,
"loss": 0.1967,
"step": 2410
},
{
"epoch": 2.084419080435017,
"grad_norm": 2.140625,
"learning_rate": 1.5162000000000002e-05,
"loss": 0.2168,
"step": 2420
},
{
"epoch": 2.093033272316141,
"grad_norm": 1.921875,
"learning_rate": 1.5142000000000002e-05,
"loss": 0.2028,
"step": 2430
},
{
"epoch": 2.101647464197265,
"grad_norm": 2.015625,
"learning_rate": 1.5122000000000002e-05,
"loss": 0.1958,
"step": 2440
},
{
"epoch": 2.1102616560783893,
"grad_norm": 2.046875,
"learning_rate": 1.5102e-05,
"loss": 0.2105,
"step": 2450
},
{
"epoch": 2.1188758479595133,
"grad_norm": 1.953125,
"learning_rate": 1.5082e-05,
"loss": 0.2027,
"step": 2460
},
{
"epoch": 2.1274900398406373,
"grad_norm": 2.09375,
"learning_rate": 1.5062e-05,
"loss": 0.207,
"step": 2470
},
{
"epoch": 2.136104231721762,
"grad_norm": 2.0,
"learning_rate": 1.5042000000000001e-05,
"loss": 0.2125,
"step": 2480
},
{
"epoch": 2.144718423602886,
"grad_norm": 2.078125,
"learning_rate": 1.5022000000000001e-05,
"loss": 0.2052,
"step": 2490
},
{
"epoch": 2.15333261548401,
"grad_norm": 2.15625,
"learning_rate": 1.5002000000000001e-05,
"loss": 0.2057,
"step": 2500
},
{
"epoch": 2.1619468073651342,
"grad_norm": 2.109375,
"learning_rate": 1.4982000000000002e-05,
"loss": 0.2159,
"step": 2510
},
{
"epoch": 2.1705609992462582,
"grad_norm": 1.9453125,
"learning_rate": 1.4962000000000002e-05,
"loss": 0.2088,
"step": 2520
},
{
"epoch": 2.1791751911273822,
"grad_norm": 2.125,
"learning_rate": 1.4942e-05,
"loss": 0.2128,
"step": 2530
},
{
"epoch": 2.1877893830085067,
"grad_norm": 1.984375,
"learning_rate": 1.4922e-05,
"loss": 0.2143,
"step": 2540
},
{
"epoch": 2.1964035748896307,
"grad_norm": 2.109375,
"learning_rate": 1.4902e-05,
"loss": 0.2098,
"step": 2550
},
{
"epoch": 2.2050177667707547,
"grad_norm": 2.1875,
"learning_rate": 1.4882e-05,
"loss": 0.2027,
"step": 2560
},
{
"epoch": 2.213631958651879,
"grad_norm": 1.9765625,
"learning_rate": 1.4862000000000001e-05,
"loss": 0.2082,
"step": 2570
},
{
"epoch": 2.222246150533003,
"grad_norm": 2.03125,
"learning_rate": 1.4842000000000001e-05,
"loss": 0.2132,
"step": 2580
},
{
"epoch": 2.230860342414127,
"grad_norm": 2.078125,
"learning_rate": 1.4822000000000001e-05,
"loss": 0.2008,
"step": 2590
},
{
"epoch": 2.2394745342952516,
"grad_norm": 2.1875,
"learning_rate": 1.4802000000000002e-05,
"loss": 0.207,
"step": 2600
},
{
"epoch": 2.2480887261763756,
"grad_norm": 1.9765625,
"learning_rate": 1.4782e-05,
"loss": 0.2129,
"step": 2610
},
{
"epoch": 2.2567029180574996,
"grad_norm": 2.375,
"learning_rate": 1.4762e-05,
"loss": 0.2085,
"step": 2620
},
{
"epoch": 2.265317109938624,
"grad_norm": 2.140625,
"learning_rate": 1.4742e-05,
"loss": 0.216,
"step": 2630
},
{
"epoch": 2.273931301819748,
"grad_norm": 1.984375,
"learning_rate": 1.4722e-05,
"loss": 0.2133,
"step": 2640
},
{
"epoch": 2.282545493700872,
"grad_norm": 2.34375,
"learning_rate": 1.4702000000000001e-05,
"loss": 0.2032,
"step": 2650
},
{
"epoch": 2.2911596855819965,
"grad_norm": 2.203125,
"learning_rate": 1.4682000000000001e-05,
"loss": 0.2248,
"step": 2660
},
{
"epoch": 2.2997738774631205,
"grad_norm": 2.25,
"learning_rate": 1.4662000000000001e-05,
"loss": 0.2056,
"step": 2670
},
{
"epoch": 2.3083880693442445,
"grad_norm": 2.046875,
"learning_rate": 1.4642000000000001e-05,
"loss": 0.2115,
"step": 2680
},
{
"epoch": 2.317002261225369,
"grad_norm": 2.296875,
"learning_rate": 1.4622e-05,
"loss": 0.1984,
"step": 2690
},
{
"epoch": 2.325616453106493,
"grad_norm": 2.140625,
"learning_rate": 1.4602e-05,
"loss": 0.2122,
"step": 2700
},
{
"epoch": 2.334230644987617,
"grad_norm": 2.09375,
"learning_rate": 1.4582e-05,
"loss": 0.2094,
"step": 2710
},
{
"epoch": 2.3428448368687413,
"grad_norm": 2.109375,
"learning_rate": 1.4562e-05,
"loss": 0.2225,
"step": 2720
},
{
"epoch": 2.3514590287498653,
"grad_norm": 2.421875,
"learning_rate": 1.4542e-05,
"loss": 0.2131,
"step": 2730
},
{
"epoch": 2.36007322063099,
"grad_norm": 2.40625,
"learning_rate": 1.4522000000000001e-05,
"loss": 0.2118,
"step": 2740
},
{
"epoch": 2.368687412512114,
"grad_norm": 2.078125,
"learning_rate": 1.4502000000000001e-05,
"loss": 0.2148,
"step": 2750
},
{
"epoch": 2.377301604393238,
"grad_norm": 2.375,
"learning_rate": 1.4482000000000001e-05,
"loss": 0.214,
"step": 2760
},
{
"epoch": 2.385915796274362,
"grad_norm": 2.078125,
"learning_rate": 1.4462e-05,
"loss": 0.2028,
"step": 2770
},
{
"epoch": 2.3945299881554862,
"grad_norm": 2.25,
"learning_rate": 1.4442e-05,
"loss": 0.2073,
"step": 2780
},
{
"epoch": 2.4031441800366102,
"grad_norm": 1.9609375,
"learning_rate": 1.4422e-05,
"loss": 0.2145,
"step": 2790
},
{
"epoch": 2.4117583719177347,
"grad_norm": 2.90625,
"learning_rate": 1.4402e-05,
"loss": 0.2069,
"step": 2800
},
{
"epoch": 2.4203725637988587,
"grad_norm": 1.8984375,
"learning_rate": 1.4382e-05,
"loss": 0.2216,
"step": 2810
},
{
"epoch": 2.4289867556799827,
"grad_norm": 2.03125,
"learning_rate": 1.4362e-05,
"loss": 0.2117,
"step": 2820
},
{
"epoch": 2.4376009475611067,
"grad_norm": 1.9609375,
"learning_rate": 1.4342000000000001e-05,
"loss": 0.219,
"step": 2830
},
{
"epoch": 2.446215139442231,
"grad_norm": 2.28125,
"learning_rate": 1.4322000000000001e-05,
"loss": 0.2061,
"step": 2840
},
{
"epoch": 2.454829331323355,
"grad_norm": 2.03125,
"learning_rate": 1.4302e-05,
"loss": 0.2182,
"step": 2850
},
{
"epoch": 2.4634435232044796,
"grad_norm": 2.21875,
"learning_rate": 1.4282e-05,
"loss": 0.2135,
"step": 2860
},
{
"epoch": 2.4720577150856036,
"grad_norm": 2.125,
"learning_rate": 1.4262e-05,
"loss": 0.215,
"step": 2870
},
{
"epoch": 2.4806719069667276,
"grad_norm": 2.015625,
"learning_rate": 1.4242e-05,
"loss": 0.2105,
"step": 2880
},
{
"epoch": 2.489286098847852,
"grad_norm": 2.078125,
"learning_rate": 1.4222e-05,
"loss": 0.2067,
"step": 2890
},
{
"epoch": 2.497900290728976,
"grad_norm": 2.359375,
"learning_rate": 1.4202e-05,
"loss": 0.2145,
"step": 2900
},
{
"epoch": 2.5065144826101,
"grad_norm": 2.328125,
"learning_rate": 1.4182e-05,
"loss": 0.2191,
"step": 2910
},
{
"epoch": 2.5151286744912245,
"grad_norm": 2.375,
"learning_rate": 1.4162000000000001e-05,
"loss": 0.2068,
"step": 2920
},
{
"epoch": 2.5237428663723485,
"grad_norm": 2.015625,
"learning_rate": 1.4142e-05,
"loss": 0.2025,
"step": 2930
},
{
"epoch": 2.5323570582534725,
"grad_norm": 2.140625,
"learning_rate": 1.4122e-05,
"loss": 0.2089,
"step": 2940
},
{
"epoch": 2.540971250134597,
"grad_norm": 2.296875,
"learning_rate": 1.4102e-05,
"loss": 0.2167,
"step": 2950
},
{
"epoch": 2.549585442015721,
"grad_norm": 2.140625,
"learning_rate": 1.4082e-05,
"loss": 0.2049,
"step": 2960
},
{
"epoch": 2.558199633896845,
"grad_norm": 1.984375,
"learning_rate": 1.4062e-05,
"loss": 0.2034,
"step": 2970
},
{
"epoch": 2.5668138257779693,
"grad_norm": 1.9296875,
"learning_rate": 1.4042e-05,
"loss": 0.2095,
"step": 2980
},
{
"epoch": 2.5754280176590933,
"grad_norm": 2.34375,
"learning_rate": 1.4022e-05,
"loss": 0.2047,
"step": 2990
},
{
"epoch": 2.5840422095402173,
"grad_norm": 2.234375,
"learning_rate": 1.4002e-05,
"loss": 0.2055,
"step": 3000
},
{
"epoch": 2.592656401421342,
"grad_norm": 2.203125,
"learning_rate": 1.3982000000000003e-05,
"loss": 0.211,
"step": 3010
},
{
"epoch": 2.601270593302466,
"grad_norm": 1.90625,
"learning_rate": 1.3962000000000003e-05,
"loss": 0.2009,
"step": 3020
},
{
"epoch": 2.6098847851835902,
"grad_norm": 2.28125,
"learning_rate": 1.3942000000000001e-05,
"loss": 0.2173,
"step": 3030
},
{
"epoch": 2.6184989770647142,
"grad_norm": 2.25,
"learning_rate": 1.3922000000000002e-05,
"loss": 0.2102,
"step": 3040
},
{
"epoch": 2.6271131689458382,
"grad_norm": 1.9140625,
"learning_rate": 1.3902000000000002e-05,
"loss": 0.2102,
"step": 3050
},
{
"epoch": 2.6357273608269622,
"grad_norm": 2.34375,
"learning_rate": 1.3882000000000002e-05,
"loss": 0.213,
"step": 3060
},
{
"epoch": 2.6443415527080867,
"grad_norm": 2.0,
"learning_rate": 1.3862000000000002e-05,
"loss": 0.2117,
"step": 3070
},
{
"epoch": 2.6529557445892107,
"grad_norm": 2.15625,
"learning_rate": 1.3842000000000002e-05,
"loss": 0.2142,
"step": 3080
},
{
"epoch": 2.661569936470335,
"grad_norm": 2.203125,
"learning_rate": 1.3822000000000003e-05,
"loss": 0.202,
"step": 3090
},
{
"epoch": 2.670184128351459,
"grad_norm": 2.671875,
"learning_rate": 1.3802000000000003e-05,
"loss": 0.2133,
"step": 3100
},
{
"epoch": 2.678798320232583,
"grad_norm": 2.25,
"learning_rate": 1.3782000000000001e-05,
"loss": 0.2086,
"step": 3110
},
{
"epoch": 2.687412512113707,
"grad_norm": 1.984375,
"learning_rate": 1.3762000000000001e-05,
"loss": 0.2032,
"step": 3120
},
{
"epoch": 2.6960267039948316,
"grad_norm": 2.40625,
"learning_rate": 1.3742000000000002e-05,
"loss": 0.2157,
"step": 3130
},
{
"epoch": 2.7046408958759556,
"grad_norm": 1.90625,
"learning_rate": 1.3722000000000002e-05,
"loss": 0.2139,
"step": 3140
},
{
"epoch": 2.71325508775708,
"grad_norm": 2.125,
"learning_rate": 1.3702000000000002e-05,
"loss": 0.2177,
"step": 3150
},
{
"epoch": 2.721869279638204,
"grad_norm": 2.1875,
"learning_rate": 1.3682000000000002e-05,
"loss": 0.2082,
"step": 3160
},
{
"epoch": 2.730483471519328,
"grad_norm": 2.109375,
"learning_rate": 1.3662000000000002e-05,
"loss": 0.2093,
"step": 3170
},
{
"epoch": 2.739097663400452,
"grad_norm": 1.984375,
"learning_rate": 1.3642000000000003e-05,
"loss": 0.2054,
"step": 3180
},
{
"epoch": 2.7477118552815765,
"grad_norm": 2.34375,
"learning_rate": 1.3622000000000003e-05,
"loss": 0.2007,
"step": 3190
},
{
"epoch": 2.7563260471627005,
"grad_norm": 2.34375,
"learning_rate": 1.3602000000000001e-05,
"loss": 0.2109,
"step": 3200
},
{
"epoch": 2.764940239043825,
"grad_norm": 2.203125,
"learning_rate": 1.3582000000000001e-05,
"loss": 0.2106,
"step": 3210
},
{
"epoch": 2.773554430924949,
"grad_norm": 2.515625,
"learning_rate": 1.3562000000000002e-05,
"loss": 0.2101,
"step": 3220
},
{
"epoch": 2.782168622806073,
"grad_norm": 2.109375,
"learning_rate": 1.3542000000000002e-05,
"loss": 0.216,
"step": 3230
},
{
"epoch": 2.790782814687197,
"grad_norm": 2.109375,
"learning_rate": 1.3522000000000002e-05,
"loss": 0.2103,
"step": 3240
},
{
"epoch": 2.7993970065683214,
"grad_norm": 2.015625,
"learning_rate": 1.3502000000000002e-05,
"loss": 0.2083,
"step": 3250
},
{
"epoch": 2.8080111984494454,
"grad_norm": 2.203125,
"learning_rate": 1.3482000000000002e-05,
"loss": 0.2124,
"step": 3260
},
{
"epoch": 2.81662539033057,
"grad_norm": 2.546875,
"learning_rate": 1.3462000000000003e-05,
"loss": 0.2114,
"step": 3270
},
{
"epoch": 2.825239582211694,
"grad_norm": 2.0,
"learning_rate": 1.3442000000000001e-05,
"loss": 0.2122,
"step": 3280
},
{
"epoch": 2.833853774092818,
"grad_norm": 2.203125,
"learning_rate": 1.3422000000000001e-05,
"loss": 0.217,
"step": 3290
},
{
"epoch": 2.842467965973942,
"grad_norm": 2.34375,
"learning_rate": 1.3402000000000001e-05,
"loss": 0.2137,
"step": 3300
},
{
"epoch": 2.8510821578550662,
"grad_norm": 2.0,
"learning_rate": 1.3382000000000002e-05,
"loss": 0.2171,
"step": 3310
},
{
"epoch": 2.8596963497361902,
"grad_norm": 2.09375,
"learning_rate": 1.3362000000000002e-05,
"loss": 0.21,
"step": 3320
},
{
"epoch": 2.8683105416173147,
"grad_norm": 2.203125,
"learning_rate": 1.3342000000000002e-05,
"loss": 0.2083,
"step": 3330
},
{
"epoch": 2.8769247334984387,
"grad_norm": 2.171875,
"learning_rate": 1.3322000000000002e-05,
"loss": 0.2178,
"step": 3340
},
{
"epoch": 2.8855389253795627,
"grad_norm": 2.015625,
"learning_rate": 1.3302000000000002e-05,
"loss": 0.2086,
"step": 3350
},
{
"epoch": 2.894153117260687,
"grad_norm": 1.953125,
"learning_rate": 1.3282000000000001e-05,
"loss": 0.2144,
"step": 3360
},
{
"epoch": 2.902767309141811,
"grad_norm": 2.09375,
"learning_rate": 1.3262000000000001e-05,
"loss": 0.2067,
"step": 3370
},
{
"epoch": 2.911381501022935,
"grad_norm": 2.125,
"learning_rate": 1.3242000000000001e-05,
"loss": 0.2104,
"step": 3380
},
{
"epoch": 2.9199956929040596,
"grad_norm": 2.25,
"learning_rate": 1.3222000000000001e-05,
"loss": 0.2135,
"step": 3390
},
{
"epoch": 2.9286098847851836,
"grad_norm": 2.328125,
"learning_rate": 1.3202000000000002e-05,
"loss": 0.2095,
"step": 3400
},
{
"epoch": 2.9372240766663076,
"grad_norm": 2.234375,
"learning_rate": 1.3182000000000002e-05,
"loss": 0.2147,
"step": 3410
},
{
"epoch": 2.945838268547432,
"grad_norm": 2.484375,
"learning_rate": 1.3162000000000002e-05,
"loss": 0.2146,
"step": 3420
},
{
"epoch": 2.954452460428556,
"grad_norm": 2.28125,
"learning_rate": 1.3142000000000002e-05,
"loss": 0.2137,
"step": 3430
},
{
"epoch": 2.9630666523096805,
"grad_norm": 2.09375,
"learning_rate": 1.3122e-05,
"loss": 0.2111,
"step": 3440
},
{
"epoch": 2.9716808441908045,
"grad_norm": 2.15625,
"learning_rate": 1.3102000000000001e-05,
"loss": 0.2194,
"step": 3450
},
{
"epoch": 2.9802950360719285,
"grad_norm": 2.265625,
"learning_rate": 1.3082000000000001e-05,
"loss": 0.21,
"step": 3460
},
{
"epoch": 2.9889092279530525,
"grad_norm": 2.109375,
"learning_rate": 1.3062000000000001e-05,
"loss": 0.2004,
"step": 3470
},
{
"epoch": 2.997523419834177,
"grad_norm": 2.15625,
"learning_rate": 1.3042000000000002e-05,
"loss": 0.212,
"step": 3480
},
{
"epoch": 3.006029934316787,
"grad_norm": 2.328125,
"learning_rate": 1.3022000000000002e-05,
"loss": 0.1743,
"step": 3490
},
{
"epoch": 3.014644126197911,
"grad_norm": 2.28125,
"learning_rate": 1.3002000000000002e-05,
"loss": 0.1524,
"step": 3500
},
{
"epoch": 3.0232583180790353,
"grad_norm": 3.359375,
"learning_rate": 1.2982000000000002e-05,
"loss": 0.1476,
"step": 3510
},
{
"epoch": 3.0318725099601593,
"grad_norm": 2.359375,
"learning_rate": 1.2962e-05,
"loss": 0.1408,
"step": 3520
},
{
"epoch": 3.0404867018412833,
"grad_norm": 3.03125,
"learning_rate": 1.2942e-05,
"loss": 0.1495,
"step": 3530
},
{
"epoch": 3.049100893722408,
"grad_norm": 2.296875,
"learning_rate": 1.2922000000000001e-05,
"loss": 0.1457,
"step": 3540
},
{
"epoch": 3.057715085603532,
"grad_norm": 2.71875,
"learning_rate": 1.2902000000000001e-05,
"loss": 0.1483,
"step": 3550
},
{
"epoch": 3.066329277484656,
"grad_norm": 2.234375,
"learning_rate": 1.2882000000000001e-05,
"loss": 0.1451,
"step": 3560
},
{
"epoch": 3.0749434693657802,
"grad_norm": 2.5,
"learning_rate": 1.2862000000000002e-05,
"loss": 0.1378,
"step": 3570
},
{
"epoch": 3.0835576612469042,
"grad_norm": 2.53125,
"learning_rate": 1.2842000000000002e-05,
"loss": 0.1539,
"step": 3580
},
{
"epoch": 3.0921718531280282,
"grad_norm": 2.46875,
"learning_rate": 1.2822000000000002e-05,
"loss": 0.1503,
"step": 3590
},
{
"epoch": 3.1007860450091527,
"grad_norm": 2.15625,
"learning_rate": 1.2802e-05,
"loss": 0.147,
"step": 3600
},
{
"epoch": 3.1094002368902767,
"grad_norm": 2.53125,
"learning_rate": 1.2782e-05,
"loss": 0.1447,
"step": 3610
},
{
"epoch": 3.1180144287714007,
"grad_norm": 2.5625,
"learning_rate": 1.2762e-05,
"loss": 0.1568,
"step": 3620
},
{
"epoch": 3.126628620652525,
"grad_norm": 2.640625,
"learning_rate": 1.2742000000000001e-05,
"loss": 0.1397,
"step": 3630
},
{
"epoch": 3.135242812533649,
"grad_norm": 2.546875,
"learning_rate": 1.2722000000000001e-05,
"loss": 0.1347,
"step": 3640
},
{
"epoch": 3.143857004414773,
"grad_norm": 2.578125,
"learning_rate": 1.2702000000000001e-05,
"loss": 0.1487,
"step": 3650
},
{
"epoch": 3.1524711962958976,
"grad_norm": 2.296875,
"learning_rate": 1.2682000000000002e-05,
"loss": 0.1458,
"step": 3660
},
{
"epoch": 3.1610853881770216,
"grad_norm": 2.359375,
"learning_rate": 1.2662000000000002e-05,
"loss": 0.139,
"step": 3670
},
{
"epoch": 3.169699580058146,
"grad_norm": 2.46875,
"learning_rate": 1.2642e-05,
"loss": 0.141,
"step": 3680
},
{
"epoch": 3.17831377193927,
"grad_norm": 2.640625,
"learning_rate": 1.2622e-05,
"loss": 0.1355,
"step": 3690
},
{
"epoch": 3.186927963820394,
"grad_norm": 2.703125,
"learning_rate": 1.2602e-05,
"loss": 0.149,
"step": 3700
},
{
"epoch": 3.1955421557015184,
"grad_norm": 2.546875,
"learning_rate": 1.2582e-05,
"loss": 0.1485,
"step": 3710
},
{
"epoch": 3.2041563475826425,
"grad_norm": 2.4375,
"learning_rate": 1.2562000000000001e-05,
"loss": 0.1475,
"step": 3720
},
{
"epoch": 3.2127705394637665,
"grad_norm": 2.671875,
"learning_rate": 1.2542000000000001e-05,
"loss": 0.1469,
"step": 3730
},
{
"epoch": 3.221384731344891,
"grad_norm": 2.625,
"learning_rate": 1.2522000000000001e-05,
"loss": 0.1448,
"step": 3740
},
{
"epoch": 3.229998923226015,
"grad_norm": 2.1875,
"learning_rate": 1.2502000000000002e-05,
"loss": 0.1454,
"step": 3750
},
{
"epoch": 3.238613115107139,
"grad_norm": 2.234375,
"learning_rate": 1.2482e-05,
"loss": 0.1437,
"step": 3760
},
{
"epoch": 3.2472273069882633,
"grad_norm": 2.515625,
"learning_rate": 1.2462e-05,
"loss": 0.1521,
"step": 3770
},
{
"epoch": 3.2558414988693873,
"grad_norm": 2.40625,
"learning_rate": 1.2442e-05,
"loss": 0.1462,
"step": 3780
},
{
"epoch": 3.2644556907505113,
"grad_norm": 2.84375,
"learning_rate": 1.2422e-05,
"loss": 0.1528,
"step": 3790
},
{
"epoch": 3.273069882631636,
"grad_norm": 2.5,
"learning_rate": 1.2402000000000001e-05,
"loss": 0.1438,
"step": 3800
},
{
"epoch": 3.28168407451276,
"grad_norm": 2.71875,
"learning_rate": 1.2382000000000001e-05,
"loss": 0.1481,
"step": 3810
},
{
"epoch": 3.290298266393884,
"grad_norm": 2.59375,
"learning_rate": 1.2362000000000001e-05,
"loss": 0.1521,
"step": 3820
},
{
"epoch": 3.2989124582750082,
"grad_norm": 2.375,
"learning_rate": 1.2342000000000001e-05,
"loss": 0.1453,
"step": 3830
},
{
"epoch": 3.3075266501561322,
"grad_norm": 2.75,
"learning_rate": 1.2322e-05,
"loss": 0.1453,
"step": 3840
},
{
"epoch": 3.3161408420372562,
"grad_norm": 2.328125,
"learning_rate": 1.2302e-05,
"loss": 0.1512,
"step": 3850
},
{
"epoch": 3.3247550339183807,
"grad_norm": 2.484375,
"learning_rate": 1.2282e-05,
"loss": 0.1484,
"step": 3860
},
{
"epoch": 3.3333692257995047,
"grad_norm": 2.6875,
"learning_rate": 1.2262e-05,
"loss": 0.1374,
"step": 3870
},
{
"epoch": 3.3419834176806287,
"grad_norm": 2.515625,
"learning_rate": 1.2242e-05,
"loss": 0.149,
"step": 3880
},
{
"epoch": 3.350597609561753,
"grad_norm": 2.75,
"learning_rate": 1.2222000000000001e-05,
"loss": 0.151,
"step": 3890
},
{
"epoch": 3.359211801442877,
"grad_norm": 2.328125,
"learning_rate": 1.2202000000000001e-05,
"loss": 0.143,
"step": 3900
},
{
"epoch": 3.367825993324001,
"grad_norm": 2.6875,
"learning_rate": 1.2182000000000001e-05,
"loss": 0.1484,
"step": 3910
},
{
"epoch": 3.3764401852051256,
"grad_norm": 2.234375,
"learning_rate": 1.2162e-05,
"loss": 0.1532,
"step": 3920
},
{
"epoch": 3.3850543770862496,
"grad_norm": 2.640625,
"learning_rate": 1.2142e-05,
"loss": 0.1472,
"step": 3930
},
{
"epoch": 3.3936685689673736,
"grad_norm": 2.5,
"learning_rate": 1.2122e-05,
"loss": 0.1448,
"step": 3940
},
{
"epoch": 3.402282760848498,
"grad_norm": 2.421875,
"learning_rate": 1.2102e-05,
"loss": 0.1548,
"step": 3950
},
{
"epoch": 3.410896952729622,
"grad_norm": 2.734375,
"learning_rate": 1.2082e-05,
"loss": 0.1372,
"step": 3960
},
{
"epoch": 3.419511144610746,
"grad_norm": 2.921875,
"learning_rate": 1.2062e-05,
"loss": 0.1466,
"step": 3970
},
{
"epoch": 3.4281253364918705,
"grad_norm": 2.421875,
"learning_rate": 1.2042000000000001e-05,
"loss": 0.1406,
"step": 3980
},
{
"epoch": 3.4367395283729945,
"grad_norm": 2.765625,
"learning_rate": 1.2022000000000001e-05,
"loss": 0.1516,
"step": 3990
},
{
"epoch": 3.4453537202541185,
"grad_norm": 2.59375,
"learning_rate": 1.2002e-05,
"loss": 0.1491,
"step": 4000
},
{
"epoch": 3.453967912135243,
"grad_norm": 2.6875,
"learning_rate": 1.1982e-05,
"loss": 0.146,
"step": 4010
},
{
"epoch": 3.462582104016367,
"grad_norm": 2.53125,
"learning_rate": 1.1962e-05,
"loss": 0.1438,
"step": 4020
},
{
"epoch": 3.4711962958974913,
"grad_norm": 2.96875,
"learning_rate": 1.1942e-05,
"loss": 0.1372,
"step": 4030
},
{
"epoch": 3.4798104877786153,
"grad_norm": 2.359375,
"learning_rate": 1.1922e-05,
"loss": 0.1501,
"step": 4040
},
{
"epoch": 3.4884246796597393,
"grad_norm": 2.71875,
"learning_rate": 1.1902e-05,
"loss": 0.1515,
"step": 4050
},
{
"epoch": 3.4970388715408633,
"grad_norm": 2.796875,
"learning_rate": 1.1882e-05,
"loss": 0.151,
"step": 4060
},
{
"epoch": 3.505653063421988,
"grad_norm": 2.53125,
"learning_rate": 1.1862000000000001e-05,
"loss": 0.1468,
"step": 4070
},
{
"epoch": 3.514267255303112,
"grad_norm": 2.6875,
"learning_rate": 1.1842e-05,
"loss": 0.1477,
"step": 4080
},
{
"epoch": 3.5228814471842362,
"grad_norm": 2.375,
"learning_rate": 1.1822e-05,
"loss": 0.1516,
"step": 4090
},
{
"epoch": 3.5314956390653602,
"grad_norm": 2.5625,
"learning_rate": 1.1802e-05,
"loss": 0.145,
"step": 4100
},
{
"epoch": 3.5401098309464842,
"grad_norm": 2.671875,
"learning_rate": 1.1782e-05,
"loss": 0.1434,
"step": 4110
},
{
"epoch": 3.5487240228276082,
"grad_norm": 2.515625,
"learning_rate": 1.1762e-05,
"loss": 0.1424,
"step": 4120
},
{
"epoch": 3.5573382147087327,
"grad_norm": 2.140625,
"learning_rate": 1.1742e-05,
"loss": 0.1433,
"step": 4130
},
{
"epoch": 3.5659524065898567,
"grad_norm": 2.578125,
"learning_rate": 1.1722e-05,
"loss": 0.1511,
"step": 4140
},
{
"epoch": 3.574566598470981,
"grad_norm": 2.6875,
"learning_rate": 1.1702e-05,
"loss": 0.149,
"step": 4150
},
{
"epoch": 3.583180790352105,
"grad_norm": 2.6875,
"learning_rate": 1.1682e-05,
"loss": 0.151,
"step": 4160
},
{
"epoch": 3.591794982233229,
"grad_norm": 2.796875,
"learning_rate": 1.1662e-05,
"loss": 0.1412,
"step": 4170
},
{
"epoch": 3.600409174114353,
"grad_norm": 2.359375,
"learning_rate": 1.1642e-05,
"loss": 0.1461,
"step": 4180
},
{
"epoch": 3.6090233659954776,
"grad_norm": 2.546875,
"learning_rate": 1.1622e-05,
"loss": 0.1481,
"step": 4190
},
{
"epoch": 3.6176375578766016,
"grad_norm": 2.484375,
"learning_rate": 1.1602e-05,
"loss": 0.1535,
"step": 4200
},
{
"epoch": 3.626251749757726,
"grad_norm": 2.5,
"learning_rate": 1.1582e-05,
"loss": 0.1579,
"step": 4210
},
{
"epoch": 3.63486594163885,
"grad_norm": 3.078125,
"learning_rate": 1.1562e-05,
"loss": 0.1528,
"step": 4220
},
{
"epoch": 3.643480133519974,
"grad_norm": 2.625,
"learning_rate": 1.1542e-05,
"loss": 0.1436,
"step": 4230
},
{
"epoch": 3.6520943254010985,
"grad_norm": 2.78125,
"learning_rate": 1.1521999999999999e-05,
"loss": 0.1488,
"step": 4240
},
{
"epoch": 3.6607085172822225,
"grad_norm": 2.53125,
"learning_rate": 1.1502e-05,
"loss": 0.1404,
"step": 4250
},
{
"epoch": 3.6693227091633465,
"grad_norm": 2.5625,
"learning_rate": 1.1482000000000001e-05,
"loss": 0.1584,
"step": 4260
},
{
"epoch": 3.677936901044471,
"grad_norm": 2.65625,
"learning_rate": 1.1462000000000001e-05,
"loss": 0.1429,
"step": 4270
},
{
"epoch": 3.686551092925595,
"grad_norm": 2.9375,
"learning_rate": 1.1442000000000002e-05,
"loss": 0.1478,
"step": 4280
},
{
"epoch": 3.695165284806719,
"grad_norm": 2.28125,
"learning_rate": 1.1422000000000002e-05,
"loss": 0.1495,
"step": 4290
},
{
"epoch": 3.7037794766878434,
"grad_norm": 2.65625,
"learning_rate": 1.1402000000000002e-05,
"loss": 0.1487,
"step": 4300
},
{
"epoch": 3.7123936685689674,
"grad_norm": 2.75,
"learning_rate": 1.1382000000000002e-05,
"loss": 0.1515,
"step": 4310
},
{
"epoch": 3.721007860450092,
"grad_norm": 2.4375,
"learning_rate": 1.1362000000000002e-05,
"loss": 0.1524,
"step": 4320
},
{
"epoch": 3.729622052331216,
"grad_norm": 2.65625,
"learning_rate": 1.1342000000000003e-05,
"loss": 0.1506,
"step": 4330
},
{
"epoch": 3.73823624421234,
"grad_norm": 2.609375,
"learning_rate": 1.1322000000000001e-05,
"loss": 0.1448,
"step": 4340
},
{
"epoch": 3.746850436093464,
"grad_norm": 2.390625,
"learning_rate": 1.1302000000000001e-05,
"loss": 0.1444,
"step": 4350
},
{
"epoch": 3.7554646279745882,
"grad_norm": 2.203125,
"learning_rate": 1.1282000000000001e-05,
"loss": 0.1483,
"step": 4360
},
{
"epoch": 3.7640788198557122,
"grad_norm": 2.953125,
"learning_rate": 1.1262000000000002e-05,
"loss": 0.1524,
"step": 4370
},
{
"epoch": 3.7726930117368367,
"grad_norm": 2.640625,
"learning_rate": 1.1242000000000002e-05,
"loss": 0.1553,
"step": 4380
},
{
"epoch": 3.7813072036179607,
"grad_norm": 2.484375,
"learning_rate": 1.1222000000000002e-05,
"loss": 0.156,
"step": 4390
},
{
"epoch": 3.7899213954990847,
"grad_norm": 2.59375,
"learning_rate": 1.1202000000000002e-05,
"loss": 0.1543,
"step": 4400
},
{
"epoch": 3.7985355873802087,
"grad_norm": 2.515625,
"learning_rate": 1.1182000000000002e-05,
"loss": 0.1521,
"step": 4410
},
{
"epoch": 3.807149779261333,
"grad_norm": 3.046875,
"learning_rate": 1.1162000000000003e-05,
"loss": 0.1515,
"step": 4420
},
{
"epoch": 3.815763971142457,
"grad_norm": 2.703125,
"learning_rate": 1.1142000000000001e-05,
"loss": 0.1527,
"step": 4430
},
{
"epoch": 3.8243781630235816,
"grad_norm": 3.015625,
"learning_rate": 1.1122000000000001e-05,
"loss": 0.1543,
"step": 4440
},
{
"epoch": 3.8329923549047056,
"grad_norm": 2.546875,
"learning_rate": 1.1102000000000001e-05,
"loss": 0.1461,
"step": 4450
},
{
"epoch": 3.8416065467858296,
"grad_norm": 3.03125,
"learning_rate": 1.1082000000000002e-05,
"loss": 0.1448,
"step": 4460
},
{
"epoch": 3.8502207386669536,
"grad_norm": 2.453125,
"learning_rate": 1.1062000000000002e-05,
"loss": 0.1462,
"step": 4470
},
{
"epoch": 3.858834930548078,
"grad_norm": 2.59375,
"learning_rate": 1.1042000000000002e-05,
"loss": 0.146,
"step": 4480
},
{
"epoch": 3.867449122429202,
"grad_norm": 2.84375,
"learning_rate": 1.1022000000000002e-05,
"loss": 0.1416,
"step": 4490
},
{
"epoch": 3.8760633143103265,
"grad_norm": 2.359375,
"learning_rate": 1.1002000000000002e-05,
"loss": 0.156,
"step": 4500
},
{
"epoch": 3.8846775061914505,
"grad_norm": 3.140625,
"learning_rate": 1.0982000000000001e-05,
"loss": 0.1462,
"step": 4510
},
{
"epoch": 3.8932916980725745,
"grad_norm": 2.578125,
"learning_rate": 1.0962000000000001e-05,
"loss": 0.1447,
"step": 4520
},
{
"epoch": 3.9019058899536985,
"grad_norm": 2.78125,
"learning_rate": 1.0942000000000001e-05,
"loss": 0.1467,
"step": 4530
},
{
"epoch": 3.910520081834823,
"grad_norm": 2.53125,
"learning_rate": 1.0922000000000001e-05,
"loss": 0.1452,
"step": 4540
},
{
"epoch": 3.919134273715947,
"grad_norm": 2.609375,
"learning_rate": 1.0902000000000002e-05,
"loss": 0.1519,
"step": 4550
},
{
"epoch": 3.9277484655970714,
"grad_norm": 2.296875,
"learning_rate": 1.0882000000000002e-05,
"loss": 0.1485,
"step": 4560
},
{
"epoch": 3.9363626574781954,
"grad_norm": 2.921875,
"learning_rate": 1.0862000000000002e-05,
"loss": 0.1473,
"step": 4570
},
{
"epoch": 3.9449768493593194,
"grad_norm": 2.78125,
"learning_rate": 1.0842000000000002e-05,
"loss": 0.1462,
"step": 4580
},
{
"epoch": 3.9535910412404434,
"grad_norm": 2.5625,
"learning_rate": 1.0822e-05,
"loss": 0.1472,
"step": 4590
},
{
"epoch": 3.962205233121568,
"grad_norm": 2.484375,
"learning_rate": 1.0802000000000001e-05,
"loss": 0.1497,
"step": 4600
},
{
"epoch": 3.970819425002692,
"grad_norm": 2.421875,
"learning_rate": 1.0782000000000001e-05,
"loss": 0.1474,
"step": 4610
},
{
"epoch": 3.9794336168838162,
"grad_norm": 2.46875,
"learning_rate": 1.0762000000000001e-05,
"loss": 0.1477,
"step": 4620
},
{
"epoch": 3.9880478087649402,
"grad_norm": 2.734375,
"learning_rate": 1.0742000000000002e-05,
"loss": 0.1508,
"step": 4630
},
{
"epoch": 3.9966620006460643,
"grad_norm": 2.515625,
"learning_rate": 1.0722000000000002e-05,
"loss": 0.1449,
"step": 4640
},
{
"epoch": 4.005168515128674,
"grad_norm": 2.328125,
"learning_rate": 1.0702000000000002e-05,
"loss": 0.1182,
"step": 4650
},
{
"epoch": 4.013782707009798,
"grad_norm": 2.75,
"learning_rate": 1.0682000000000002e-05,
"loss": 0.1047,
"step": 4660
},
{
"epoch": 4.022396898890923,
"grad_norm": 3.0625,
"learning_rate": 1.0662e-05,
"loss": 0.0972,
"step": 4670
},
{
"epoch": 4.031011090772047,
"grad_norm": 3.109375,
"learning_rate": 1.0642e-05,
"loss": 0.099,
"step": 4680
},
{
"epoch": 4.039625282653171,
"grad_norm": 2.609375,
"learning_rate": 1.0622000000000001e-05,
"loss": 0.0977,
"step": 4690
},
{
"epoch": 4.048239474534295,
"grad_norm": 2.515625,
"learning_rate": 1.0602000000000001e-05,
"loss": 0.1003,
"step": 4700
},
{
"epoch": 4.056853666415419,
"grad_norm": 2.625,
"learning_rate": 1.0582000000000001e-05,
"loss": 0.0964,
"step": 4710
},
{
"epoch": 4.065467858296543,
"grad_norm": 2.625,
"learning_rate": 1.0562000000000002e-05,
"loss": 0.0969,
"step": 4720
},
{
"epoch": 4.074082050177668,
"grad_norm": 3.234375,
"learning_rate": 1.0542000000000002e-05,
"loss": 0.1009,
"step": 4730
},
{
"epoch": 4.082696242058792,
"grad_norm": 2.625,
"learning_rate": 1.0522000000000002e-05,
"loss": 0.1039,
"step": 4740
},
{
"epoch": 4.091310433939916,
"grad_norm": 2.65625,
"learning_rate": 1.0502e-05,
"loss": 0.102,
"step": 4750
},
{
"epoch": 4.09992462582104,
"grad_norm": 2.71875,
"learning_rate": 1.0482e-05,
"loss": 0.0998,
"step": 4760
},
{
"epoch": 4.108538817702164,
"grad_norm": 2.46875,
"learning_rate": 1.0462e-05,
"loss": 0.1001,
"step": 4770
},
{
"epoch": 4.117153009583289,
"grad_norm": 2.390625,
"learning_rate": 1.0442000000000001e-05,
"loss": 0.0929,
"step": 4780
},
{
"epoch": 4.125767201464413,
"grad_norm": 2.78125,
"learning_rate": 1.0422000000000001e-05,
"loss": 0.0971,
"step": 4790
},
{
"epoch": 4.134381393345537,
"grad_norm": 2.65625,
"learning_rate": 1.0402000000000001e-05,
"loss": 0.0997,
"step": 4800
},
{
"epoch": 4.142995585226661,
"grad_norm": 2.515625,
"learning_rate": 1.0382000000000002e-05,
"loss": 0.1048,
"step": 4810
},
{
"epoch": 4.151609777107785,
"grad_norm": 2.84375,
"learning_rate": 1.0362000000000002e-05,
"loss": 0.1024,
"step": 4820
},
{
"epoch": 4.160223968988909,
"grad_norm": 2.828125,
"learning_rate": 1.0342e-05,
"loss": 0.1015,
"step": 4830
},
{
"epoch": 4.168838160870034,
"grad_norm": 2.875,
"learning_rate": 1.0322e-05,
"loss": 0.1015,
"step": 4840
},
{
"epoch": 4.177452352751158,
"grad_norm": 3.15625,
"learning_rate": 1.0302e-05,
"loss": 0.0962,
"step": 4850
},
{
"epoch": 4.186066544632282,
"grad_norm": 2.84375,
"learning_rate": 1.0282e-05,
"loss": 0.1004,
"step": 4860
},
{
"epoch": 4.194680736513406,
"grad_norm": 2.8125,
"learning_rate": 1.0262000000000001e-05,
"loss": 0.0913,
"step": 4870
},
{
"epoch": 4.20329492839453,
"grad_norm": 4.0,
"learning_rate": 1.0242000000000001e-05,
"loss": 0.0974,
"step": 4880
},
{
"epoch": 4.211909120275654,
"grad_norm": 2.875,
"learning_rate": 1.0222000000000001e-05,
"loss": 0.1002,
"step": 4890
},
{
"epoch": 4.220523312156779,
"grad_norm": 2.546875,
"learning_rate": 1.0202000000000002e-05,
"loss": 0.1032,
"step": 4900
},
{
"epoch": 4.229137504037903,
"grad_norm": 3.0,
"learning_rate": 1.0182e-05,
"loss": 0.0937,
"step": 4910
},
{
"epoch": 4.237751695919027,
"grad_norm": 2.921875,
"learning_rate": 1.0162e-05,
"loss": 0.0974,
"step": 4920
},
{
"epoch": 4.246365887800151,
"grad_norm": 2.703125,
"learning_rate": 1.0142e-05,
"loss": 0.1001,
"step": 4930
},
{
"epoch": 4.254980079681275,
"grad_norm": 3.578125,
"learning_rate": 1.0122e-05,
"loss": 0.1008,
"step": 4940
},
{
"epoch": 4.263594271562399,
"grad_norm": 2.578125,
"learning_rate": 1.0102000000000001e-05,
"loss": 0.0994,
"step": 4950
},
{
"epoch": 4.272208463443524,
"grad_norm": 2.359375,
"learning_rate": 1.0082000000000001e-05,
"loss": 0.1013,
"step": 4960
},
{
"epoch": 4.280822655324648,
"grad_norm": 3.4375,
"learning_rate": 1.0062000000000001e-05,
"loss": 0.0959,
"step": 4970
},
{
"epoch": 4.289436847205772,
"grad_norm": 3.59375,
"learning_rate": 1.0042000000000001e-05,
"loss": 0.0992,
"step": 4980
},
{
"epoch": 4.298051039086896,
"grad_norm": 2.734375,
"learning_rate": 1.0022e-05,
"loss": 0.0962,
"step": 4990
},
{
"epoch": 4.30666523096802,
"grad_norm": 2.890625,
"learning_rate": 1.0002e-05,
"loss": 0.1062,
"step": 5000
},
{
"epoch": 4.315279422849144,
"grad_norm": 2.71875,
"learning_rate": 9.982e-06,
"loss": 0.0995,
"step": 5010
},
{
"epoch": 4.3238936147302685,
"grad_norm": 2.828125,
"learning_rate": 9.962e-06,
"loss": 0.1022,
"step": 5020
},
{
"epoch": 4.3325078066113925,
"grad_norm": 2.6875,
"learning_rate": 9.942e-06,
"loss": 0.0996,
"step": 5030
},
{
"epoch": 4.3411219984925165,
"grad_norm": 3.09375,
"learning_rate": 9.922000000000001e-06,
"loss": 0.0993,
"step": 5040
},
{
"epoch": 4.3497361903736405,
"grad_norm": 2.84375,
"learning_rate": 9.902000000000001e-06,
"loss": 0.1029,
"step": 5050
},
{
"epoch": 4.3583503822547645,
"grad_norm": 2.90625,
"learning_rate": 9.882000000000001e-06,
"loss": 0.1003,
"step": 5060
},
{
"epoch": 4.3669645741358885,
"grad_norm": 2.984375,
"learning_rate": 9.862e-06,
"loss": 0.0979,
"step": 5070
},
{
"epoch": 4.375578766017013,
"grad_norm": 2.734375,
"learning_rate": 9.842e-06,
"loss": 0.1003,
"step": 5080
},
{
"epoch": 4.384192957898137,
"grad_norm": 2.34375,
"learning_rate": 9.822e-06,
"loss": 0.0981,
"step": 5090
},
{
"epoch": 4.392807149779261,
"grad_norm": 2.90625,
"learning_rate": 9.802e-06,
"loss": 0.098,
"step": 5100
},
{
"epoch": 4.401421341660385,
"grad_norm": 3.265625,
"learning_rate": 9.782e-06,
"loss": 0.1007,
"step": 5110
},
{
"epoch": 4.410035533541509,
"grad_norm": 2.75,
"learning_rate": 9.762e-06,
"loss": 0.0968,
"step": 5120
},
{
"epoch": 4.418649725422634,
"grad_norm": 2.75,
"learning_rate": 9.742000000000001e-06,
"loss": 0.1031,
"step": 5130
},
{
"epoch": 4.427263917303758,
"grad_norm": 3.078125,
"learning_rate": 9.722000000000001e-06,
"loss": 0.0996,
"step": 5140
},
{
"epoch": 4.435878109184882,
"grad_norm": 2.828125,
"learning_rate": 9.702e-06,
"loss": 0.0982,
"step": 5150
},
{
"epoch": 4.444492301066006,
"grad_norm": 2.609375,
"learning_rate": 9.682e-06,
"loss": 0.0991,
"step": 5160
},
{
"epoch": 4.45310649294713,
"grad_norm": 3.296875,
"learning_rate": 9.662e-06,
"loss": 0.0999,
"step": 5170
},
{
"epoch": 4.461720684828254,
"grad_norm": 3.109375,
"learning_rate": 9.642e-06,
"loss": 0.1042,
"step": 5180
},
{
"epoch": 4.470334876709378,
"grad_norm": 2.765625,
"learning_rate": 9.622000000000002e-06,
"loss": 0.1016,
"step": 5190
},
{
"epoch": 4.478949068590503,
"grad_norm": 2.5,
"learning_rate": 9.602e-06,
"loss": 0.0992,
"step": 5200
},
{
"epoch": 4.487563260471627,
"grad_norm": 3.140625,
"learning_rate": 9.582e-06,
"loss": 0.0986,
"step": 5210
},
{
"epoch": 4.496177452352751,
"grad_norm": 2.53125,
"learning_rate": 9.562000000000001e-06,
"loss": 0.0995,
"step": 5220
},
{
"epoch": 4.504791644233875,
"grad_norm": 3.25,
"learning_rate": 9.542000000000001e-06,
"loss": 0.1039,
"step": 5230
},
{
"epoch": 4.513405836114999,
"grad_norm": 2.890625,
"learning_rate": 9.522000000000001e-06,
"loss": 0.1017,
"step": 5240
},
{
"epoch": 4.522020027996124,
"grad_norm": 2.671875,
"learning_rate": 9.502000000000002e-06,
"loss": 0.1018,
"step": 5250
},
{
"epoch": 4.530634219877248,
"grad_norm": 3.203125,
"learning_rate": 9.482000000000002e-06,
"loss": 0.1021,
"step": 5260
},
{
"epoch": 4.539248411758372,
"grad_norm": 2.5625,
"learning_rate": 9.462000000000002e-06,
"loss": 0.1007,
"step": 5270
},
{
"epoch": 4.547862603639496,
"grad_norm": 2.359375,
"learning_rate": 9.442e-06,
"loss": 0.0974,
"step": 5280
},
{
"epoch": 4.55647679552062,
"grad_norm": 2.5625,
"learning_rate": 9.422e-06,
"loss": 0.0994,
"step": 5290
},
{
"epoch": 4.565090987401744,
"grad_norm": 2.890625,
"learning_rate": 9.402e-06,
"loss": 0.1027,
"step": 5300
},
{
"epoch": 4.573705179282869,
"grad_norm": 3.015625,
"learning_rate": 9.382000000000001e-06,
"loss": 0.1081,
"step": 5310
},
{
"epoch": 4.582319371163993,
"grad_norm": 3.015625,
"learning_rate": 9.362000000000001e-06,
"loss": 0.0977,
"step": 5320
},
{
"epoch": 4.590933563045117,
"grad_norm": 2.75,
"learning_rate": 9.342000000000001e-06,
"loss": 0.1057,
"step": 5330
},
{
"epoch": 4.599547754926241,
"grad_norm": 3.0,
"learning_rate": 9.322000000000002e-06,
"loss": 0.1008,
"step": 5340
},
{
"epoch": 4.608161946807365,
"grad_norm": 2.9375,
"learning_rate": 9.302000000000002e-06,
"loss": 0.102,
"step": 5350
},
{
"epoch": 4.616776138688489,
"grad_norm": 3.015625,
"learning_rate": 9.282e-06,
"loss": 0.106,
"step": 5360
},
{
"epoch": 4.625390330569614,
"grad_norm": 3.015625,
"learning_rate": 9.262e-06,
"loss": 0.1025,
"step": 5370
},
{
"epoch": 4.634004522450738,
"grad_norm": 2.59375,
"learning_rate": 9.242e-06,
"loss": 0.0968,
"step": 5380
},
{
"epoch": 4.642618714331862,
"grad_norm": 2.6875,
"learning_rate": 9.222e-06,
"loss": 0.0951,
"step": 5390
},
{
"epoch": 4.651232906212986,
"grad_norm": 2.71875,
"learning_rate": 9.202000000000001e-06,
"loss": 0.1027,
"step": 5400
},
{
"epoch": 4.65984709809411,
"grad_norm": 2.515625,
"learning_rate": 9.182000000000001e-06,
"loss": 0.1014,
"step": 5410
},
{
"epoch": 4.668461289975234,
"grad_norm": 2.9375,
"learning_rate": 9.162000000000001e-06,
"loss": 0.1017,
"step": 5420
},
{
"epoch": 4.677075481856359,
"grad_norm": 3.046875,
"learning_rate": 9.142000000000002e-06,
"loss": 0.1019,
"step": 5430
},
{
"epoch": 4.685689673737483,
"grad_norm": 2.828125,
"learning_rate": 9.122e-06,
"loss": 0.1034,
"step": 5440
},
{
"epoch": 4.694303865618607,
"grad_norm": 2.671875,
"learning_rate": 9.102e-06,
"loss": 0.098,
"step": 5450
},
{
"epoch": 4.702918057499731,
"grad_norm": 3.203125,
"learning_rate": 9.082e-06,
"loss": 0.0971,
"step": 5460
},
{
"epoch": 4.711532249380855,
"grad_norm": 2.78125,
"learning_rate": 9.062e-06,
"loss": 0.0989,
"step": 5470
},
{
"epoch": 4.72014644126198,
"grad_norm": 2.78125,
"learning_rate": 9.042e-06,
"loss": 0.0987,
"step": 5480
},
{
"epoch": 4.728760633143104,
"grad_norm": 3.046875,
"learning_rate": 9.022000000000001e-06,
"loss": 0.0983,
"step": 5490
},
{
"epoch": 4.737374825024228,
"grad_norm": 2.890625,
"learning_rate": 9.002000000000001e-06,
"loss": 0.102,
"step": 5500
},
{
"epoch": 4.745989016905352,
"grad_norm": 2.71875,
"learning_rate": 8.982000000000001e-06,
"loss": 0.1022,
"step": 5510
},
{
"epoch": 4.754603208786476,
"grad_norm": 2.8125,
"learning_rate": 8.962e-06,
"loss": 0.0997,
"step": 5520
},
{
"epoch": 4.7632174006676,
"grad_norm": 2.625,
"learning_rate": 8.942e-06,
"loss": 0.1032,
"step": 5530
},
{
"epoch": 4.771831592548724,
"grad_norm": 2.71875,
"learning_rate": 8.922e-06,
"loss": 0.0995,
"step": 5540
},
{
"epoch": 4.7804457844298485,
"grad_norm": 2.90625,
"learning_rate": 8.902e-06,
"loss": 0.1053,
"step": 5550
},
{
"epoch": 4.7890599763109725,
"grad_norm": 2.59375,
"learning_rate": 8.882e-06,
"loss": 0.0929,
"step": 5560
},
{
"epoch": 4.7976741681920965,
"grad_norm": 2.734375,
"learning_rate": 8.862000000000001e-06,
"loss": 0.1015,
"step": 5570
},
{
"epoch": 4.8062883600732205,
"grad_norm": 2.90625,
"learning_rate": 8.842000000000001e-06,
"loss": 0.0987,
"step": 5580
},
{
"epoch": 4.8149025519543445,
"grad_norm": 2.859375,
"learning_rate": 8.822000000000001e-06,
"loss": 0.1039,
"step": 5590
},
{
"epoch": 4.823516743835469,
"grad_norm": 3.34375,
"learning_rate": 8.802e-06,
"loss": 0.102,
"step": 5600
},
{
"epoch": 4.832130935716593,
"grad_norm": 2.9375,
"learning_rate": 8.782e-06,
"loss": 0.1002,
"step": 5610
},
{
"epoch": 4.840745127597717,
"grad_norm": 2.828125,
"learning_rate": 8.762e-06,
"loss": 0.1023,
"step": 5620
},
{
"epoch": 4.849359319478841,
"grad_norm": 2.46875,
"learning_rate": 8.742e-06,
"loss": 0.1007,
"step": 5630
},
{
"epoch": 4.857973511359965,
"grad_norm": 2.890625,
"learning_rate": 8.722e-06,
"loss": 0.1036,
"step": 5640
},
{
"epoch": 4.866587703241089,
"grad_norm": 2.765625,
"learning_rate": 8.702e-06,
"loss": 0.1044,
"step": 5650
},
{
"epoch": 4.875201895122213,
"grad_norm": 2.765625,
"learning_rate": 8.682000000000001e-06,
"loss": 0.0943,
"step": 5660
},
{
"epoch": 4.883816087003338,
"grad_norm": 2.609375,
"learning_rate": 8.662000000000001e-06,
"loss": 0.0982,
"step": 5670
},
{
"epoch": 4.892430278884462,
"grad_norm": 3.125,
"learning_rate": 8.642e-06,
"loss": 0.1033,
"step": 5680
},
{
"epoch": 4.901044470765586,
"grad_norm": 2.3125,
"learning_rate": 8.622e-06,
"loss": 0.1034,
"step": 5690
},
{
"epoch": 4.90965866264671,
"grad_norm": 2.9375,
"learning_rate": 8.602e-06,
"loss": 0.0999,
"step": 5700
},
{
"epoch": 4.918272854527834,
"grad_norm": 3.03125,
"learning_rate": 8.582e-06,
"loss": 0.1048,
"step": 5710
},
{
"epoch": 4.926887046408959,
"grad_norm": 3.75,
"learning_rate": 8.562e-06,
"loss": 0.0956,
"step": 5720
},
{
"epoch": 4.935501238290083,
"grad_norm": 2.609375,
"learning_rate": 8.542e-06,
"loss": 0.1041,
"step": 5730
},
{
"epoch": 4.944115430171207,
"grad_norm": 2.6875,
"learning_rate": 8.522e-06,
"loss": 0.1029,
"step": 5740
},
{
"epoch": 4.952729622052331,
"grad_norm": 2.609375,
"learning_rate": 8.502000000000001e-06,
"loss": 0.1009,
"step": 5750
},
{
"epoch": 4.961343813933455,
"grad_norm": 2.515625,
"learning_rate": 8.482e-06,
"loss": 0.1065,
"step": 5760
},
{
"epoch": 4.969958005814579,
"grad_norm": 2.78125,
"learning_rate": 8.462e-06,
"loss": 0.0968,
"step": 5770
},
{
"epoch": 4.978572197695704,
"grad_norm": 2.96875,
"learning_rate": 8.442e-06,
"loss": 0.1061,
"step": 5780
},
{
"epoch": 4.987186389576828,
"grad_norm": 2.65625,
"learning_rate": 8.422e-06,
"loss": 0.0998,
"step": 5790
},
{
"epoch": 4.995800581457952,
"grad_norm": 2.8125,
"learning_rate": 8.402e-06,
"loss": 0.1004,
"step": 5800
},
{
"epoch": 5.004307095940562,
"grad_norm": 2.09375,
"learning_rate": 8.382e-06,
"loss": 0.0826,
"step": 5810
},
{
"epoch": 5.012921287821686,
"grad_norm": 2.9375,
"learning_rate": 8.362e-06,
"loss": 0.0723,
"step": 5820
},
{
"epoch": 5.02153547970281,
"grad_norm": 2.4375,
"learning_rate": 8.342e-06,
"loss": 0.0668,
"step": 5830
},
{
"epoch": 5.030149671583935,
"grad_norm": 2.8125,
"learning_rate": 8.322000000000001e-06,
"loss": 0.072,
"step": 5840
},
{
"epoch": 5.038763863465059,
"grad_norm": 2.828125,
"learning_rate": 8.302000000000001e-06,
"loss": 0.0673,
"step": 5850
},
{
"epoch": 5.047378055346183,
"grad_norm": 3.03125,
"learning_rate": 8.282000000000001e-06,
"loss": 0.0656,
"step": 5860
},
{
"epoch": 5.055992247227307,
"grad_norm": 3.21875,
"learning_rate": 8.262000000000002e-06,
"loss": 0.0721,
"step": 5870
},
{
"epoch": 5.064606439108431,
"grad_norm": 2.953125,
"learning_rate": 8.242000000000002e-06,
"loss": 0.0737,
"step": 5880
},
{
"epoch": 5.073220630989555,
"grad_norm": 2.796875,
"learning_rate": 8.222000000000002e-06,
"loss": 0.068,
"step": 5890
},
{
"epoch": 5.08183482287068,
"grad_norm": 2.921875,
"learning_rate": 8.202e-06,
"loss": 0.0682,
"step": 5900
},
{
"epoch": 5.090449014751804,
"grad_norm": 3.09375,
"learning_rate": 8.182e-06,
"loss": 0.0677,
"step": 5910
},
{
"epoch": 5.099063206632928,
"grad_norm": 2.71875,
"learning_rate": 8.162e-06,
"loss": 0.0686,
"step": 5920
},
{
"epoch": 5.107677398514052,
"grad_norm": 2.5625,
"learning_rate": 8.142000000000001e-06,
"loss": 0.0688,
"step": 5930
},
{
"epoch": 5.116291590395176,
"grad_norm": 2.4375,
"learning_rate": 8.122000000000001e-06,
"loss": 0.072,
"step": 5940
},
{
"epoch": 5.1249057822763,
"grad_norm": 2.703125,
"learning_rate": 8.102000000000001e-06,
"loss": 0.0707,
"step": 5950
},
{
"epoch": 5.133519974157425,
"grad_norm": 2.734375,
"learning_rate": 8.082000000000002e-06,
"loss": 0.0733,
"step": 5960
},
{
"epoch": 5.142134166038549,
"grad_norm": 2.515625,
"learning_rate": 8.062000000000002e-06,
"loss": 0.0726,
"step": 5970
},
{
"epoch": 5.150748357919673,
"grad_norm": 3.109375,
"learning_rate": 8.042e-06,
"loss": 0.069,
"step": 5980
},
{
"epoch": 5.159362549800797,
"grad_norm": 2.640625,
"learning_rate": 8.022e-06,
"loss": 0.0699,
"step": 5990
},
{
"epoch": 5.167976741681921,
"grad_norm": 3.046875,
"learning_rate": 8.002e-06,
"loss": 0.0704,
"step": 6000
},
{
"epoch": 5.176590933563045,
"grad_norm": 2.515625,
"learning_rate": 7.982e-06,
"loss": 0.0676,
"step": 6010
},
{
"epoch": 5.18520512544417,
"grad_norm": 2.859375,
"learning_rate": 7.962000000000001e-06,
"loss": 0.0674,
"step": 6020
},
{
"epoch": 5.193819317325294,
"grad_norm": 2.265625,
"learning_rate": 7.942000000000001e-06,
"loss": 0.0721,
"step": 6030
},
{
"epoch": 5.202433509206418,
"grad_norm": 3.140625,
"learning_rate": 7.922000000000001e-06,
"loss": 0.0727,
"step": 6040
},
{
"epoch": 5.211047701087542,
"grad_norm": 2.703125,
"learning_rate": 7.902000000000002e-06,
"loss": 0.071,
"step": 6050
},
{
"epoch": 5.219661892968666,
"grad_norm": 3.5,
"learning_rate": 7.882e-06,
"loss": 0.0682,
"step": 6060
},
{
"epoch": 5.2282760848497905,
"grad_norm": 2.703125,
"learning_rate": 7.862e-06,
"loss": 0.0708,
"step": 6070
},
{
"epoch": 5.2368902767309145,
"grad_norm": 3.015625,
"learning_rate": 7.842e-06,
"loss": 0.0695,
"step": 6080
},
{
"epoch": 5.2455044686120385,
"grad_norm": 2.421875,
"learning_rate": 7.822e-06,
"loss": 0.0731,
"step": 6090
},
{
"epoch": 5.2541186604931625,
"grad_norm": 3.140625,
"learning_rate": 7.802000000000001e-06,
"loss": 0.0695,
"step": 6100
},
{
"epoch": 5.2627328523742865,
"grad_norm": 3.046875,
"learning_rate": 7.782000000000001e-06,
"loss": 0.0765,
"step": 6110
},
{
"epoch": 5.2713470442554105,
"grad_norm": 2.984375,
"learning_rate": 7.762000000000001e-06,
"loss": 0.0702,
"step": 6120
},
{
"epoch": 5.279961236136535,
"grad_norm": 3.015625,
"learning_rate": 7.742000000000001e-06,
"loss": 0.0698,
"step": 6130
},
{
"epoch": 5.288575428017659,
"grad_norm": 2.75,
"learning_rate": 7.722e-06,
"loss": 0.0724,
"step": 6140
},
{
"epoch": 5.297189619898783,
"grad_norm": 2.46875,
"learning_rate": 7.702e-06,
"loss": 0.0708,
"step": 6150
},
{
"epoch": 5.305803811779907,
"grad_norm": 2.78125,
"learning_rate": 7.682e-06,
"loss": 0.0703,
"step": 6160
},
{
"epoch": 5.314418003661031,
"grad_norm": 2.53125,
"learning_rate": 7.662e-06,
"loss": 0.0704,
"step": 6170
},
{
"epoch": 5.323032195542155,
"grad_norm": 2.5625,
"learning_rate": 7.642e-06,
"loss": 0.0674,
"step": 6180
},
{
"epoch": 5.33164638742328,
"grad_norm": 4.5,
"learning_rate": 7.622000000000001e-06,
"loss": 0.077,
"step": 6190
},
{
"epoch": 5.340260579304404,
"grad_norm": 2.921875,
"learning_rate": 7.602e-06,
"loss": 0.0722,
"step": 6200
},
{
"epoch": 5.348874771185528,
"grad_norm": 2.5625,
"learning_rate": 7.582e-06,
"loss": 0.0728,
"step": 6210
},
{
"epoch": 5.357488963066652,
"grad_norm": 2.609375,
"learning_rate": 7.562000000000001e-06,
"loss": 0.0709,
"step": 6220
},
{
"epoch": 5.366103154947776,
"grad_norm": 3.4375,
"learning_rate": 7.542000000000001e-06,
"loss": 0.069,
"step": 6230
},
{
"epoch": 5.3747173468289,
"grad_norm": 3.109375,
"learning_rate": 7.522e-06,
"loss": 0.0717,
"step": 6240
},
{
"epoch": 5.383331538710025,
"grad_norm": 2.53125,
"learning_rate": 7.502e-06,
"loss": 0.0677,
"step": 6250
},
{
"epoch": 5.391945730591149,
"grad_norm": 2.96875,
"learning_rate": 7.4820000000000005e-06,
"loss": 0.0706,
"step": 6260
},
{
"epoch": 5.400559922472273,
"grad_norm": 3.015625,
"learning_rate": 7.462000000000001e-06,
"loss": 0.0753,
"step": 6270
},
{
"epoch": 5.409174114353397,
"grad_norm": 3.3125,
"learning_rate": 7.442e-06,
"loss": 0.0698,
"step": 6280
},
{
"epoch": 5.417788306234521,
"grad_norm": 2.953125,
"learning_rate": 7.422e-06,
"loss": 0.0733,
"step": 6290
},
{
"epoch": 5.426402498115645,
"grad_norm": 3.171875,
"learning_rate": 7.4020000000000005e-06,
"loss": 0.0672,
"step": 6300
},
{
"epoch": 5.43501668999677,
"grad_norm": 2.6875,
"learning_rate": 7.382000000000001e-06,
"loss": 0.0701,
"step": 6310
},
{
"epoch": 5.443630881877894,
"grad_norm": 2.765625,
"learning_rate": 7.362e-06,
"loss": 0.0677,
"step": 6320
},
{
"epoch": 5.452245073759018,
"grad_norm": 2.953125,
"learning_rate": 7.342e-06,
"loss": 0.0715,
"step": 6330
},
{
"epoch": 5.460859265640142,
"grad_norm": 3.09375,
"learning_rate": 7.322e-06,
"loss": 0.0713,
"step": 6340
},
{
"epoch": 5.469473457521266,
"grad_norm": 2.59375,
"learning_rate": 7.3020000000000006e-06,
"loss": 0.0697,
"step": 6350
},
{
"epoch": 5.47808764940239,
"grad_norm": 2.734375,
"learning_rate": 7.282e-06,
"loss": 0.0686,
"step": 6360
},
{
"epoch": 5.486701841283515,
"grad_norm": 2.78125,
"learning_rate": 7.262e-06,
"loss": 0.0685,
"step": 6370
},
{
"epoch": 5.495316033164639,
"grad_norm": 2.625,
"learning_rate": 7.242e-06,
"loss": 0.0697,
"step": 6380
},
{
"epoch": 5.503930225045763,
"grad_norm": 2.8125,
"learning_rate": 7.2220000000000005e-06,
"loss": 0.0684,
"step": 6390
},
{
"epoch": 5.512544416926887,
"grad_norm": 2.296875,
"learning_rate": 7.202e-06,
"loss": 0.0674,
"step": 6400
},
{
"epoch": 5.521158608808011,
"grad_norm": 3.21875,
"learning_rate": 7.182e-06,
"loss": 0.0728,
"step": 6410
},
{
"epoch": 5.529772800689136,
"grad_norm": 2.953125,
"learning_rate": 7.162e-06,
"loss": 0.0706,
"step": 6420
},
{
"epoch": 5.53838699257026,
"grad_norm": 2.6875,
"learning_rate": 7.142e-06,
"loss": 0.0725,
"step": 6430
},
{
"epoch": 5.547001184451384,
"grad_norm": 2.375,
"learning_rate": 7.1220000000000014e-06,
"loss": 0.07,
"step": 6440
},
{
"epoch": 5.555615376332508,
"grad_norm": 2.671875,
"learning_rate": 7.102000000000001e-06,
"loss": 0.0682,
"step": 6450
},
{
"epoch": 5.564229568213632,
"grad_norm": 3.046875,
"learning_rate": 7.082000000000001e-06,
"loss": 0.0715,
"step": 6460
},
{
"epoch": 5.572843760094756,
"grad_norm": 2.515625,
"learning_rate": 7.062000000000001e-06,
"loss": 0.0715,
"step": 6470
},
{
"epoch": 5.58145795197588,
"grad_norm": 2.859375,
"learning_rate": 7.042000000000001e-06,
"loss": 0.0702,
"step": 6480
},
{
"epoch": 5.590072143857005,
"grad_norm": 2.765625,
"learning_rate": 7.022000000000001e-06,
"loss": 0.0729,
"step": 6490
},
{
"epoch": 5.598686335738129,
"grad_norm": 2.984375,
"learning_rate": 7.002000000000001e-06,
"loss": 0.0734,
"step": 6500
},
{
"epoch": 5.607300527619253,
"grad_norm": 3.15625,
"learning_rate": 6.982000000000001e-06,
"loss": 0.0785,
"step": 6510
},
{
"epoch": 5.615914719500377,
"grad_norm": 2.875,
"learning_rate": 6.962000000000001e-06,
"loss": 0.0727,
"step": 6520
},
{
"epoch": 5.624528911381501,
"grad_norm": 2.890625,
"learning_rate": 6.942000000000001e-06,
"loss": 0.0703,
"step": 6530
},
{
"epoch": 5.633143103262626,
"grad_norm": 3.453125,
"learning_rate": 6.922000000000001e-06,
"loss": 0.0698,
"step": 6540
},
{
"epoch": 5.64175729514375,
"grad_norm": 2.953125,
"learning_rate": 6.902000000000001e-06,
"loss": 0.0667,
"step": 6550
},
{
"epoch": 5.650371487024874,
"grad_norm": 3.5625,
"learning_rate": 6.882000000000001e-06,
"loss": 0.0707,
"step": 6560
},
{
"epoch": 5.658985678905998,
"grad_norm": 2.84375,
"learning_rate": 6.8620000000000005e-06,
"loss": 0.0696,
"step": 6570
},
{
"epoch": 5.667599870787122,
"grad_norm": 2.671875,
"learning_rate": 6.842000000000001e-06,
"loss": 0.0726,
"step": 6580
},
{
"epoch": 5.676214062668246,
"grad_norm": 2.609375,
"learning_rate": 6.822000000000001e-06,
"loss": 0.0737,
"step": 6590
},
{
"epoch": 5.6848282545493705,
"grad_norm": 2.875,
"learning_rate": 6.802000000000001e-06,
"loss": 0.0752,
"step": 6600
},
{
"epoch": 5.6934424464304945,
"grad_norm": 2.34375,
"learning_rate": 6.7820000000000005e-06,
"loss": 0.0711,
"step": 6610
},
{
"epoch": 5.7020566383116185,
"grad_norm": 3.0625,
"learning_rate": 6.762000000000001e-06,
"loss": 0.0715,
"step": 6620
},
{
"epoch": 5.7106708301927425,
"grad_norm": 3.140625,
"learning_rate": 6.742000000000001e-06,
"loss": 0.0715,
"step": 6630
},
{
"epoch": 5.7192850220738665,
"grad_norm": 2.96875,
"learning_rate": 6.722000000000001e-06,
"loss": 0.0706,
"step": 6640
},
{
"epoch": 5.7278992139549905,
"grad_norm": 2.875,
"learning_rate": 6.702e-06,
"loss": 0.0671,
"step": 6650
},
{
"epoch": 5.736513405836115,
"grad_norm": 2.984375,
"learning_rate": 6.6820000000000006e-06,
"loss": 0.0754,
"step": 6660
},
{
"epoch": 5.745127597717239,
"grad_norm": 2.734375,
"learning_rate": 6.662000000000001e-06,
"loss": 0.0702,
"step": 6670
},
{
"epoch": 5.753741789598363,
"grad_norm": 2.84375,
"learning_rate": 6.642000000000001e-06,
"loss": 0.071,
"step": 6680
},
{
"epoch": 5.762355981479487,
"grad_norm": 2.84375,
"learning_rate": 6.622e-06,
"loss": 0.0724,
"step": 6690
},
{
"epoch": 5.770970173360611,
"grad_norm": 3.21875,
"learning_rate": 6.6020000000000005e-06,
"loss": 0.0739,
"step": 6700
},
{
"epoch": 5.779584365241735,
"grad_norm": 3.203125,
"learning_rate": 6.582000000000001e-06,
"loss": 0.0713,
"step": 6710
},
{
"epoch": 5.78819855712286,
"grad_norm": 2.6875,
"learning_rate": 6.562000000000001e-06,
"loss": 0.0691,
"step": 6720
},
{
"epoch": 5.796812749003984,
"grad_norm": 2.75,
"learning_rate": 6.542e-06,
"loss": 0.0713,
"step": 6730
},
{
"epoch": 5.805426940885108,
"grad_norm": 2.6875,
"learning_rate": 6.522e-06,
"loss": 0.0685,
"step": 6740
},
{
"epoch": 5.814041132766232,
"grad_norm": 3.46875,
"learning_rate": 6.502000000000001e-06,
"loss": 0.0728,
"step": 6750
},
{
"epoch": 5.822655324647356,
"grad_norm": 3.03125,
"learning_rate": 6.482000000000001e-06,
"loss": 0.0722,
"step": 6760
},
{
"epoch": 5.831269516528481,
"grad_norm": 2.59375,
"learning_rate": 6.462e-06,
"loss": 0.0687,
"step": 6770
},
{
"epoch": 5.839883708409605,
"grad_norm": 3.0,
"learning_rate": 6.442e-06,
"loss": 0.0682,
"step": 6780
},
{
"epoch": 5.848497900290729,
"grad_norm": 2.40625,
"learning_rate": 6.4220000000000005e-06,
"loss": 0.0674,
"step": 6790
},
{
"epoch": 5.857112092171853,
"grad_norm": 2.703125,
"learning_rate": 6.402000000000001e-06,
"loss": 0.0743,
"step": 6800
},
{
"epoch": 5.865726284052977,
"grad_norm": 2.734375,
"learning_rate": 6.382e-06,
"loss": 0.071,
"step": 6810
},
{
"epoch": 5.874340475934101,
"grad_norm": 3.0625,
"learning_rate": 6.362e-06,
"loss": 0.0688,
"step": 6820
},
{
"epoch": 5.882954667815225,
"grad_norm": 2.734375,
"learning_rate": 6.3420000000000004e-06,
"loss": 0.0696,
"step": 6830
},
{
"epoch": 5.89156885969635,
"grad_norm": 2.875,
"learning_rate": 6.322000000000001e-06,
"loss": 0.0698,
"step": 6840
},
{
"epoch": 5.900183051577474,
"grad_norm": 2.53125,
"learning_rate": 6.302e-06,
"loss": 0.0688,
"step": 6850
},
{
"epoch": 5.908797243458598,
"grad_norm": 3.0,
"learning_rate": 6.282e-06,
"loss": 0.0704,
"step": 6860
},
{
"epoch": 5.917411435339722,
"grad_norm": 3.0,
"learning_rate": 6.262e-06,
"loss": 0.0742,
"step": 6870
},
{
"epoch": 5.926025627220846,
"grad_norm": 2.46875,
"learning_rate": 6.2420000000000005e-06,
"loss": 0.0656,
"step": 6880
},
{
"epoch": 5.934639819101971,
"grad_norm": 2.8125,
"learning_rate": 6.222e-06,
"loss": 0.072,
"step": 6890
},
{
"epoch": 5.943254010983095,
"grad_norm": 2.921875,
"learning_rate": 6.202e-06,
"loss": 0.0668,
"step": 6900
},
{
"epoch": 5.951868202864219,
"grad_norm": 2.765625,
"learning_rate": 6.182e-06,
"loss": 0.0706,
"step": 6910
},
{
"epoch": 5.960482394745343,
"grad_norm": 2.921875,
"learning_rate": 6.1620000000000005e-06,
"loss": 0.0685,
"step": 6920
},
{
"epoch": 5.969096586626467,
"grad_norm": 2.703125,
"learning_rate": 6.142e-06,
"loss": 0.074,
"step": 6930
},
{
"epoch": 5.977710778507591,
"grad_norm": 3.40625,
"learning_rate": 6.122e-06,
"loss": 0.0713,
"step": 6940
},
{
"epoch": 5.986324970388715,
"grad_norm": 2.390625,
"learning_rate": 6.102e-06,
"loss": 0.0719,
"step": 6950
},
{
"epoch": 5.99493916226984,
"grad_norm": 3.0,
"learning_rate": 6.082e-06,
"loss": 0.0704,
"step": 6960
},
{
"epoch": 6.00344567675245,
"grad_norm": 2.0,
"learning_rate": 6.062e-06,
"loss": 0.0591,
"step": 6970
},
{
"epoch": 6.012059868633574,
"grad_norm": 2.421875,
"learning_rate": 6.042e-06,
"loss": 0.0506,
"step": 6980
},
{
"epoch": 6.020674060514698,
"grad_norm": 2.859375,
"learning_rate": 6.022e-06,
"loss": 0.055,
"step": 6990
},
{
"epoch": 6.029288252395822,
"grad_norm": 2.28125,
"learning_rate": 6.002e-06,
"loss": 0.0564,
"step": 7000
},
{
"epoch": 6.037902444276947,
"grad_norm": 2.546875,
"learning_rate": 5.982e-06,
"loss": 0.0567,
"step": 7010
},
{
"epoch": 6.046516636158071,
"grad_norm": 2.5,
"learning_rate": 5.962e-06,
"loss": 0.056,
"step": 7020
},
{
"epoch": 6.055130828039195,
"grad_norm": 2.640625,
"learning_rate": 5.942e-06,
"loss": 0.0564,
"step": 7030
},
{
"epoch": 6.063745019920319,
"grad_norm": 2.171875,
"learning_rate": 5.922e-06,
"loss": 0.0511,
"step": 7040
},
{
"epoch": 6.072359211801443,
"grad_norm": 2.5625,
"learning_rate": 5.9019999999999996e-06,
"loss": 0.0535,
"step": 7050
},
{
"epoch": 6.080973403682567,
"grad_norm": 2.546875,
"learning_rate": 5.882e-06,
"loss": 0.0558,
"step": 7060
},
{
"epoch": 6.089587595563692,
"grad_norm": 2.609375,
"learning_rate": 5.862000000000001e-06,
"loss": 0.0526,
"step": 7070
},
{
"epoch": 6.098201787444816,
"grad_norm": 2.25,
"learning_rate": 5.842000000000001e-06,
"loss": 0.0541,
"step": 7080
},
{
"epoch": 6.10681597932594,
"grad_norm": 3.328125,
"learning_rate": 5.822000000000001e-06,
"loss": 0.0552,
"step": 7090
},
{
"epoch": 6.115430171207064,
"grad_norm": 2.453125,
"learning_rate": 5.802000000000001e-06,
"loss": 0.0551,
"step": 7100
},
{
"epoch": 6.124044363088188,
"grad_norm": 2.171875,
"learning_rate": 5.782000000000001e-06,
"loss": 0.0528,
"step": 7110
},
{
"epoch": 6.132658554969312,
"grad_norm": 2.453125,
"learning_rate": 5.762000000000001e-06,
"loss": 0.0528,
"step": 7120
},
{
"epoch": 6.1412727468504364,
"grad_norm": 2.125,
"learning_rate": 5.742000000000001e-06,
"loss": 0.0538,
"step": 7130
},
{
"epoch": 6.1498869387315604,
"grad_norm": 2.171875,
"learning_rate": 5.722000000000001e-06,
"loss": 0.0521,
"step": 7140
},
{
"epoch": 6.1585011306126844,
"grad_norm": 2.78125,
"learning_rate": 5.702000000000001e-06,
"loss": 0.0557,
"step": 7150
},
{
"epoch": 6.1671153224938084,
"grad_norm": 2.6875,
"learning_rate": 5.682000000000001e-06,
"loss": 0.0536,
"step": 7160
},
{
"epoch": 6.1757295143749324,
"grad_norm": 2.234375,
"learning_rate": 5.662000000000001e-06,
"loss": 0.0526,
"step": 7170
},
{
"epoch": 6.1843437062560564,
"grad_norm": 2.953125,
"learning_rate": 5.642000000000001e-06,
"loss": 0.0547,
"step": 7180
},
{
"epoch": 6.192957898137181,
"grad_norm": 2.40625,
"learning_rate": 5.6220000000000006e-06,
"loss": 0.0508,
"step": 7190
},
{
"epoch": 6.201572090018305,
"grad_norm": 2.453125,
"learning_rate": 5.602000000000001e-06,
"loss": 0.0541,
"step": 7200
},
{
"epoch": 6.210186281899429,
"grad_norm": 2.953125,
"learning_rate": 5.582000000000001e-06,
"loss": 0.0547,
"step": 7210
},
{
"epoch": 6.218800473780553,
"grad_norm": 2.703125,
"learning_rate": 5.562000000000001e-06,
"loss": 0.056,
"step": 7220
},
{
"epoch": 6.227414665661677,
"grad_norm": 2.734375,
"learning_rate": 5.5420000000000005e-06,
"loss": 0.0538,
"step": 7230
},
{
"epoch": 6.236028857542801,
"grad_norm": 2.1875,
"learning_rate": 5.522000000000001e-06,
"loss": 0.0491,
"step": 7240
},
{
"epoch": 6.244643049423926,
"grad_norm": 2.390625,
"learning_rate": 5.502000000000001e-06,
"loss": 0.0532,
"step": 7250
},
{
"epoch": 6.25325724130505,
"grad_norm": 2.265625,
"learning_rate": 5.482000000000001e-06,
"loss": 0.0496,
"step": 7260
},
{
"epoch": 6.261871433186174,
"grad_norm": 2.734375,
"learning_rate": 5.462e-06,
"loss": 0.0587,
"step": 7270
},
{
"epoch": 6.270485625067298,
"grad_norm": 2.546875,
"learning_rate": 5.442000000000001e-06,
"loss": 0.055,
"step": 7280
},
{
"epoch": 6.279099816948422,
"grad_norm": 2.4375,
"learning_rate": 5.422000000000001e-06,
"loss": 0.0559,
"step": 7290
},
{
"epoch": 6.287714008829546,
"grad_norm": 2.59375,
"learning_rate": 5.402000000000001e-06,
"loss": 0.0572,
"step": 7300
},
{
"epoch": 6.296328200710671,
"grad_norm": 2.421875,
"learning_rate": 5.382e-06,
"loss": 0.0516,
"step": 7310
},
{
"epoch": 6.304942392591795,
"grad_norm": 2.21875,
"learning_rate": 5.3620000000000005e-06,
"loss": 0.05,
"step": 7320
},
{
"epoch": 6.313556584472919,
"grad_norm": 2.734375,
"learning_rate": 5.342000000000001e-06,
"loss": 0.0534,
"step": 7330
},
{
"epoch": 6.322170776354043,
"grad_norm": 2.21875,
"learning_rate": 5.322000000000001e-06,
"loss": 0.056,
"step": 7340
},
{
"epoch": 6.330784968235167,
"grad_norm": 2.6875,
"learning_rate": 5.302e-06,
"loss": 0.0542,
"step": 7350
},
{
"epoch": 6.339399160116292,
"grad_norm": 2.703125,
"learning_rate": 5.282e-06,
"loss": 0.0573,
"step": 7360
},
{
"epoch": 6.348013351997416,
"grad_norm": 2.921875,
"learning_rate": 5.262000000000001e-06,
"loss": 0.0518,
"step": 7370
},
{
"epoch": 6.35662754387854,
"grad_norm": 2.5,
"learning_rate": 5.242000000000001e-06,
"loss": 0.0567,
"step": 7380
},
{
"epoch": 6.365241735759664,
"grad_norm": 2.859375,
"learning_rate": 5.222e-06,
"loss": 0.0563,
"step": 7390
},
{
"epoch": 6.373855927640788,
"grad_norm": 2.40625,
"learning_rate": 5.202e-06,
"loss": 0.0534,
"step": 7400
},
{
"epoch": 6.382470119521912,
"grad_norm": 2.734375,
"learning_rate": 5.1820000000000005e-06,
"loss": 0.0515,
"step": 7410
},
{
"epoch": 6.391084311403037,
"grad_norm": 2.71875,
"learning_rate": 5.162000000000001e-06,
"loss": 0.0568,
"step": 7420
},
{
"epoch": 6.399698503284161,
"grad_norm": 3.140625,
"learning_rate": 5.142e-06,
"loss": 0.0527,
"step": 7430
},
{
"epoch": 6.408312695165285,
"grad_norm": 2.671875,
"learning_rate": 5.122e-06,
"loss": 0.0528,
"step": 7440
},
{
"epoch": 6.416926887046409,
"grad_norm": 3.515625,
"learning_rate": 5.1020000000000004e-06,
"loss": 0.0553,
"step": 7450
},
{
"epoch": 6.425541078927533,
"grad_norm": 2.6875,
"learning_rate": 5.082000000000001e-06,
"loss": 0.0531,
"step": 7460
},
{
"epoch": 6.434155270808657,
"grad_norm": 2.046875,
"learning_rate": 5.062e-06,
"loss": 0.05,
"step": 7470
},
{
"epoch": 6.442769462689782,
"grad_norm": 3.25,
"learning_rate": 5.042e-06,
"loss": 0.0562,
"step": 7480
},
{
"epoch": 6.451383654570906,
"grad_norm": 2.828125,
"learning_rate": 5.022e-06,
"loss": 0.0531,
"step": 7490
},
{
"epoch": 6.45999784645203,
"grad_norm": 2.625,
"learning_rate": 5.0020000000000006e-06,
"loss": 0.0545,
"step": 7500
},
{
"epoch": 6.468612038333154,
"grad_norm": 2.4375,
"learning_rate": 4.982e-06,
"loss": 0.0528,
"step": 7510
},
{
"epoch": 6.477226230214278,
"grad_norm": 2.296875,
"learning_rate": 4.962e-06,
"loss": 0.0519,
"step": 7520
},
{
"epoch": 6.485840422095402,
"grad_norm": 2.40625,
"learning_rate": 4.942e-06,
"loss": 0.0548,
"step": 7530
},
{
"epoch": 6.494454613976527,
"grad_norm": 3.390625,
"learning_rate": 4.9220000000000005e-06,
"loss": 0.0534,
"step": 7540
},
{
"epoch": 6.503068805857651,
"grad_norm": 2.625,
"learning_rate": 4.902000000000001e-06,
"loss": 0.0516,
"step": 7550
},
{
"epoch": 6.511682997738775,
"grad_norm": 2.609375,
"learning_rate": 4.882000000000001e-06,
"loss": 0.055,
"step": 7560
},
{
"epoch": 6.520297189619899,
"grad_norm": 2.265625,
"learning_rate": 4.862e-06,
"loss": 0.0508,
"step": 7570
},
{
"epoch": 6.528911381501023,
"grad_norm": 2.59375,
"learning_rate": 4.842e-06,
"loss": 0.0557,
"step": 7580
},
{
"epoch": 6.537525573382148,
"grad_norm": 2.765625,
"learning_rate": 4.822000000000001e-06,
"loss": 0.0564,
"step": 7590
},
{
"epoch": 6.546139765263272,
"grad_norm": 2.140625,
"learning_rate": 4.802000000000001e-06,
"loss": 0.0519,
"step": 7600
},
{
"epoch": 6.554753957144396,
"grad_norm": 2.53125,
"learning_rate": 4.782e-06,
"loss": 0.056,
"step": 7610
},
{
"epoch": 6.56336814902552,
"grad_norm": 2.546875,
"learning_rate": 4.762e-06,
"loss": 0.0554,
"step": 7620
},
{
"epoch": 6.571982340906644,
"grad_norm": 2.703125,
"learning_rate": 4.7420000000000005e-06,
"loss": 0.0538,
"step": 7630
},
{
"epoch": 6.580596532787768,
"grad_norm": 2.59375,
"learning_rate": 4.722000000000001e-06,
"loss": 0.0527,
"step": 7640
},
{
"epoch": 6.589210724668892,
"grad_norm": 2.765625,
"learning_rate": 4.702e-06,
"loss": 0.0558,
"step": 7650
},
{
"epoch": 6.5978249165500165,
"grad_norm": 2.4375,
"learning_rate": 4.682e-06,
"loss": 0.0506,
"step": 7660
},
{
"epoch": 6.6064391084311405,
"grad_norm": 3.4375,
"learning_rate": 4.6620000000000004e-06,
"loss": 0.055,
"step": 7670
},
{
"epoch": 6.6150533003122645,
"grad_norm": 2.484375,
"learning_rate": 4.642000000000001e-06,
"loss": 0.0535,
"step": 7680
},
{
"epoch": 6.6236674921933885,
"grad_norm": 2.65625,
"learning_rate": 4.622e-06,
"loss": 0.0564,
"step": 7690
},
{
"epoch": 6.6322816840745125,
"grad_norm": 2.6875,
"learning_rate": 4.602e-06,
"loss": 0.0567,
"step": 7700
},
{
"epoch": 6.640895875955637,
"grad_norm": 2.84375,
"learning_rate": 4.582e-06,
"loss": 0.0557,
"step": 7710
},
{
"epoch": 6.649510067836761,
"grad_norm": 2.625,
"learning_rate": 4.5620000000000005e-06,
"loss": 0.0533,
"step": 7720
},
{
"epoch": 6.658124259717885,
"grad_norm": 2.1875,
"learning_rate": 4.542e-06,
"loss": 0.0548,
"step": 7730
},
{
"epoch": 6.666738451599009,
"grad_norm": 2.734375,
"learning_rate": 4.522e-06,
"loss": 0.0543,
"step": 7740
},
{
"epoch": 6.675352643480133,
"grad_norm": 2.484375,
"learning_rate": 4.502e-06,
"loss": 0.0498,
"step": 7750
},
{
"epoch": 6.683966835361257,
"grad_norm": 2.640625,
"learning_rate": 4.4820000000000005e-06,
"loss": 0.0559,
"step": 7760
},
{
"epoch": 6.692581027242381,
"grad_norm": 2.796875,
"learning_rate": 4.462e-06,
"loss": 0.0556,
"step": 7770
},
{
"epoch": 6.701195219123506,
"grad_norm": 2.3125,
"learning_rate": 4.442e-06,
"loss": 0.051,
"step": 7780
},
{
"epoch": 6.70980941100463,
"grad_norm": 2.65625,
"learning_rate": 4.422e-06,
"loss": 0.0555,
"step": 7790
},
{
"epoch": 6.718423602885754,
"grad_norm": 2.5625,
"learning_rate": 4.402e-06,
"loss": 0.0522,
"step": 7800
},
{
"epoch": 6.727037794766878,
"grad_norm": 2.265625,
"learning_rate": 4.382e-06,
"loss": 0.0555,
"step": 7810
},
{
"epoch": 6.735651986648002,
"grad_norm": 4.125,
"learning_rate": 4.362e-06,
"loss": 0.0519,
"step": 7820
},
{
"epoch": 6.744266178529127,
"grad_norm": 3.25,
"learning_rate": 4.342e-06,
"loss": 0.0538,
"step": 7830
},
{
"epoch": 6.752880370410251,
"grad_norm": 3.328125,
"learning_rate": 4.322e-06,
"loss": 0.056,
"step": 7840
},
{
"epoch": 6.761494562291375,
"grad_norm": 2.734375,
"learning_rate": 4.3020000000000005e-06,
"loss": 0.0538,
"step": 7850
},
{
"epoch": 6.770108754172499,
"grad_norm": 2.171875,
"learning_rate": 4.282000000000001e-06,
"loss": 0.0513,
"step": 7860
},
{
"epoch": 6.778722946053623,
"grad_norm": 2.921875,
"learning_rate": 4.262000000000001e-06,
"loss": 0.0548,
"step": 7870
},
{
"epoch": 6.787337137934747,
"grad_norm": 3.0625,
"learning_rate": 4.242e-06,
"loss": 0.0538,
"step": 7880
},
{
"epoch": 6.795951329815872,
"grad_norm": 2.75,
"learning_rate": 4.222e-06,
"loss": 0.0537,
"step": 7890
},
{
"epoch": 6.804565521696996,
"grad_norm": 2.859375,
"learning_rate": 4.202000000000001e-06,
"loss": 0.0582,
"step": 7900
},
{
"epoch": 6.81317971357812,
"grad_norm": 2.421875,
"learning_rate": 4.182000000000001e-06,
"loss": 0.0577,
"step": 7910
},
{
"epoch": 6.821793905459244,
"grad_norm": 2.75,
"learning_rate": 4.162e-06,
"loss": 0.0564,
"step": 7920
},
{
"epoch": 6.830408097340368,
"grad_norm": 2.9375,
"learning_rate": 4.142e-06,
"loss": 0.0551,
"step": 7930
},
{
"epoch": 6.839022289221492,
"grad_norm": 2.734375,
"learning_rate": 4.1220000000000005e-06,
"loss": 0.0501,
"step": 7940
},
{
"epoch": 6.847636481102617,
"grad_norm": 2.515625,
"learning_rate": 4.102000000000001e-06,
"loss": 0.0498,
"step": 7950
},
{
"epoch": 6.856250672983741,
"grad_norm": 3.125,
"learning_rate": 4.082e-06,
"loss": 0.0512,
"step": 7960
},
{
"epoch": 6.864864864864865,
"grad_norm": 2.5,
"learning_rate": 4.062e-06,
"loss": 0.052,
"step": 7970
},
{
"epoch": 6.873479056745989,
"grad_norm": 2.84375,
"learning_rate": 4.0420000000000004e-06,
"loss": 0.0563,
"step": 7980
},
{
"epoch": 6.882093248627113,
"grad_norm": 2.671875,
"learning_rate": 4.022000000000001e-06,
"loss": 0.0526,
"step": 7990
},
{
"epoch": 6.890707440508237,
"grad_norm": 2.640625,
"learning_rate": 4.002e-06,
"loss": 0.0556,
"step": 8000
},
{
"epoch": 6.899321632389362,
"grad_norm": 2.8125,
"learning_rate": 3.982e-06,
"loss": 0.0553,
"step": 8010
},
{
"epoch": 6.907935824270486,
"grad_norm": 2.828125,
"learning_rate": 3.962e-06,
"loss": 0.0538,
"step": 8020
},
{
"epoch": 6.91655001615161,
"grad_norm": 3.046875,
"learning_rate": 3.9420000000000005e-06,
"loss": 0.056,
"step": 8030
},
{
"epoch": 6.925164208032734,
"grad_norm": 3.203125,
"learning_rate": 3.922e-06,
"loss": 0.0553,
"step": 8040
},
{
"epoch": 6.933778399913858,
"grad_norm": 2.796875,
"learning_rate": 3.902e-06,
"loss": 0.054,
"step": 8050
},
{
"epoch": 6.942392591794983,
"grad_norm": 2.390625,
"learning_rate": 3.882e-06,
"loss": 0.056,
"step": 8060
},
{
"epoch": 6.951006783676107,
"grad_norm": 2.296875,
"learning_rate": 3.8620000000000005e-06,
"loss": 0.055,
"step": 8070
},
{
"epoch": 6.959620975557231,
"grad_norm": 2.6875,
"learning_rate": 3.842e-06,
"loss": 0.0542,
"step": 8080
},
{
"epoch": 6.968235167438355,
"grad_norm": 2.5,
"learning_rate": 3.822e-06,
"loss": 0.052,
"step": 8090
},
{
"epoch": 6.976849359319479,
"grad_norm": 2.34375,
"learning_rate": 3.802e-06,
"loss": 0.0528,
"step": 8100
},
{
"epoch": 6.985463551200603,
"grad_norm": 2.875,
"learning_rate": 3.782e-06,
"loss": 0.0551,
"step": 8110
},
{
"epoch": 6.994077743081727,
"grad_norm": 2.46875,
"learning_rate": 3.762e-06,
"loss": 0.0545,
"step": 8120
},
{
"epoch": 7.0025842575643376,
"grad_norm": 2.46875,
"learning_rate": 3.742e-06,
"loss": 0.0536,
"step": 8130
},
{
"epoch": 7.0111984494454616,
"grad_norm": 1.9296875,
"learning_rate": 3.722e-06,
"loss": 0.0441,
"step": 8140
},
{
"epoch": 7.019812641326586,
"grad_norm": 2.234375,
"learning_rate": 3.702e-06,
"loss": 0.0494,
"step": 8150
},
{
"epoch": 7.02842683320771,
"grad_norm": 2.546875,
"learning_rate": 3.6820000000000005e-06,
"loss": 0.0473,
"step": 8160
},
{
"epoch": 7.037041025088834,
"grad_norm": 2.546875,
"learning_rate": 3.6620000000000007e-06,
"loss": 0.0443,
"step": 8170
},
{
"epoch": 7.0456552169699584,
"grad_norm": 2.25,
"learning_rate": 3.6420000000000005e-06,
"loss": 0.0454,
"step": 8180
},
{
"epoch": 7.0542694088510824,
"grad_norm": 2.484375,
"learning_rate": 3.6220000000000006e-06,
"loss": 0.0465,
"step": 8190
},
{
"epoch": 7.0628836007322064,
"grad_norm": 2.5625,
"learning_rate": 3.6020000000000004e-06,
"loss": 0.0463,
"step": 8200
},
{
"epoch": 7.0714977926133304,
"grad_norm": 2.375,
"learning_rate": 3.5820000000000006e-06,
"loss": 0.0454,
"step": 8210
},
{
"epoch": 7.0801119844944544,
"grad_norm": 2.421875,
"learning_rate": 3.5620000000000004e-06,
"loss": 0.0453,
"step": 8220
},
{
"epoch": 7.0887261763755784,
"grad_norm": 1.984375,
"learning_rate": 3.5420000000000006e-06,
"loss": 0.0459,
"step": 8230
},
{
"epoch": 7.097340368256703,
"grad_norm": 2.53125,
"learning_rate": 3.5220000000000003e-06,
"loss": 0.0462,
"step": 8240
},
{
"epoch": 7.105954560137827,
"grad_norm": 2.921875,
"learning_rate": 3.5020000000000005e-06,
"loss": 0.0486,
"step": 8250
},
{
"epoch": 7.114568752018951,
"grad_norm": 2.390625,
"learning_rate": 3.4820000000000003e-06,
"loss": 0.0436,
"step": 8260
},
{
"epoch": 7.123182943900075,
"grad_norm": 2.375,
"learning_rate": 3.4620000000000005e-06,
"loss": 0.0462,
"step": 8270
},
{
"epoch": 7.131797135781199,
"grad_norm": 2.21875,
"learning_rate": 3.4420000000000002e-06,
"loss": 0.045,
"step": 8280
},
{
"epoch": 7.140411327662323,
"grad_norm": 2.65625,
"learning_rate": 3.4220000000000004e-06,
"loss": 0.0461,
"step": 8290
},
{
"epoch": 7.149025519543448,
"grad_norm": 2.09375,
"learning_rate": 3.402e-06,
"loss": 0.0453,
"step": 8300
},
{
"epoch": 7.157639711424572,
"grad_norm": 2.375,
"learning_rate": 3.3820000000000004e-06,
"loss": 0.045,
"step": 8310
},
{
"epoch": 7.166253903305696,
"grad_norm": 2.46875,
"learning_rate": 3.362e-06,
"loss": 0.0483,
"step": 8320
},
{
"epoch": 7.17486809518682,
"grad_norm": 2.25,
"learning_rate": 3.3420000000000004e-06,
"loss": 0.0452,
"step": 8330
},
{
"epoch": 7.183482287067944,
"grad_norm": 2.40625,
"learning_rate": 3.322e-06,
"loss": 0.0485,
"step": 8340
},
{
"epoch": 7.192096478949068,
"grad_norm": 2.28125,
"learning_rate": 3.3020000000000003e-06,
"loss": 0.0485,
"step": 8350
},
{
"epoch": 7.200710670830193,
"grad_norm": 2.125,
"learning_rate": 3.282e-06,
"loss": 0.0461,
"step": 8360
},
{
"epoch": 7.209324862711317,
"grad_norm": 2.5625,
"learning_rate": 3.2620000000000003e-06,
"loss": 0.0498,
"step": 8370
},
{
"epoch": 7.217939054592441,
"grad_norm": 2.296875,
"learning_rate": 3.242e-06,
"loss": 0.0469,
"step": 8380
},
{
"epoch": 7.226553246473565,
"grad_norm": 1.9609375,
"learning_rate": 3.2220000000000002e-06,
"loss": 0.0428,
"step": 8390
},
{
"epoch": 7.235167438354689,
"grad_norm": 2.328125,
"learning_rate": 3.202e-06,
"loss": 0.0449,
"step": 8400
},
{
"epoch": 7.243781630235813,
"grad_norm": 2.09375,
"learning_rate": 3.182e-06,
"loss": 0.0441,
"step": 8410
},
{
"epoch": 7.252395822116938,
"grad_norm": 2.4375,
"learning_rate": 3.162e-06,
"loss": 0.0452,
"step": 8420
},
{
"epoch": 7.261010013998062,
"grad_norm": 2.578125,
"learning_rate": 3.142e-06,
"loss": 0.0458,
"step": 8430
},
{
"epoch": 7.269624205879186,
"grad_norm": 2.390625,
"learning_rate": 3.122e-06,
"loss": 0.0421,
"step": 8440
},
{
"epoch": 7.27823839776031,
"grad_norm": 2.65625,
"learning_rate": 3.102e-06,
"loss": 0.0443,
"step": 8450
},
{
"epoch": 7.286852589641434,
"grad_norm": 2.734375,
"learning_rate": 3.082e-06,
"loss": 0.0446,
"step": 8460
},
{
"epoch": 7.295466781522558,
"grad_norm": 2.515625,
"learning_rate": 3.0620000000000005e-06,
"loss": 0.0449,
"step": 8470
},
{
"epoch": 7.304080973403683,
"grad_norm": 2.15625,
"learning_rate": 3.0420000000000007e-06,
"loss": 0.0413,
"step": 8480
},
{
"epoch": 7.312695165284807,
"grad_norm": 2.46875,
"learning_rate": 3.0220000000000005e-06,
"loss": 0.0456,
"step": 8490
},
{
"epoch": 7.321309357165931,
"grad_norm": 2.5,
"learning_rate": 3.0020000000000006e-06,
"loss": 0.0459,
"step": 8500
},
{
"epoch": 7.329923549047055,
"grad_norm": 2.140625,
"learning_rate": 2.9820000000000004e-06,
"loss": 0.0477,
"step": 8510
},
{
"epoch": 7.338537740928179,
"grad_norm": 2.375,
"learning_rate": 2.9620000000000006e-06,
"loss": 0.0462,
"step": 8520
},
{
"epoch": 7.347151932809304,
"grad_norm": 2.3125,
"learning_rate": 2.9420000000000004e-06,
"loss": 0.0428,
"step": 8530
},
{
"epoch": 7.355766124690428,
"grad_norm": 2.359375,
"learning_rate": 2.9220000000000006e-06,
"loss": 0.0438,
"step": 8540
},
{
"epoch": 7.364380316571552,
"grad_norm": 2.65625,
"learning_rate": 2.9020000000000003e-06,
"loss": 0.0451,
"step": 8550
},
{
"epoch": 7.372994508452676,
"grad_norm": 2.4375,
"learning_rate": 2.8820000000000005e-06,
"loss": 0.0468,
"step": 8560
},
{
"epoch": 7.3816087003338,
"grad_norm": 2.171875,
"learning_rate": 2.8620000000000003e-06,
"loss": 0.0462,
"step": 8570
},
{
"epoch": 7.390222892214924,
"grad_norm": 2.546875,
"learning_rate": 2.8420000000000005e-06,
"loss": 0.0465,
"step": 8580
},
{
"epoch": 7.398837084096048,
"grad_norm": 2.375,
"learning_rate": 2.8220000000000003e-06,
"loss": 0.0471,
"step": 8590
},
{
"epoch": 7.407451275977173,
"grad_norm": 2.296875,
"learning_rate": 2.8020000000000004e-06,
"loss": 0.0473,
"step": 8600
},
{
"epoch": 7.416065467858297,
"grad_norm": 2.625,
"learning_rate": 2.7820000000000002e-06,
"loss": 0.0486,
"step": 8610
},
{
"epoch": 7.424679659739421,
"grad_norm": 2.40625,
"learning_rate": 2.7620000000000004e-06,
"loss": 0.0457,
"step": 8620
},
{
"epoch": 7.433293851620545,
"grad_norm": 2.0625,
"learning_rate": 2.742e-06,
"loss": 0.0481,
"step": 8630
},
{
"epoch": 7.441908043501669,
"grad_norm": 2.6875,
"learning_rate": 2.7220000000000004e-06,
"loss": 0.0452,
"step": 8640
},
{
"epoch": 7.450522235382794,
"grad_norm": 2.265625,
"learning_rate": 2.702e-06,
"loss": 0.0428,
"step": 8650
},
{
"epoch": 7.459136427263918,
"grad_norm": 2.40625,
"learning_rate": 2.6820000000000003e-06,
"loss": 0.0508,
"step": 8660
},
{
"epoch": 7.467750619145042,
"grad_norm": 2.28125,
"learning_rate": 2.662e-06,
"loss": 0.0443,
"step": 8670
},
{
"epoch": 7.476364811026166,
"grad_norm": 2.5,
"learning_rate": 2.6420000000000003e-06,
"loss": 0.047,
"step": 8680
},
{
"epoch": 7.48497900290729,
"grad_norm": 2.609375,
"learning_rate": 2.622e-06,
"loss": 0.0457,
"step": 8690
},
{
"epoch": 7.493593194788414,
"grad_norm": 2.5625,
"learning_rate": 2.6020000000000002e-06,
"loss": 0.0467,
"step": 8700
},
{
"epoch": 7.502207386669538,
"grad_norm": 2.59375,
"learning_rate": 2.582e-06,
"loss": 0.0468,
"step": 8710
},
{
"epoch": 7.5108215785506625,
"grad_norm": 2.078125,
"learning_rate": 2.562e-06,
"loss": 0.0447,
"step": 8720
},
{
"epoch": 7.5194357704317865,
"grad_norm": 2.0625,
"learning_rate": 2.542e-06,
"loss": 0.0449,
"step": 8730
},
{
"epoch": 7.5280499623129105,
"grad_norm": 2.46875,
"learning_rate": 2.522e-06,
"loss": 0.0466,
"step": 8740
},
{
"epoch": 7.5366641541940345,
"grad_norm": 2.3125,
"learning_rate": 2.502e-06,
"loss": 0.0429,
"step": 8750
},
{
"epoch": 7.5452783460751585,
"grad_norm": 2.859375,
"learning_rate": 2.482e-06,
"loss": 0.0474,
"step": 8760
},
{
"epoch": 7.553892537956283,
"grad_norm": 2.5625,
"learning_rate": 2.4620000000000003e-06,
"loss": 0.0469,
"step": 8770
},
{
"epoch": 7.562506729837407,
"grad_norm": 2.421875,
"learning_rate": 2.442e-06,
"loss": 0.0467,
"step": 8780
},
{
"epoch": 7.571120921718531,
"grad_norm": 2.8125,
"learning_rate": 2.4220000000000003e-06,
"loss": 0.0497,
"step": 8790
},
{
"epoch": 7.579735113599655,
"grad_norm": 2.28125,
"learning_rate": 2.402e-06,
"loss": 0.045,
"step": 8800
},
{
"epoch": 7.588349305480779,
"grad_norm": 2.953125,
"learning_rate": 2.3820000000000002e-06,
"loss": 0.0472,
"step": 8810
},
{
"epoch": 7.596963497361903,
"grad_norm": 2.859375,
"learning_rate": 2.362e-06,
"loss": 0.0495,
"step": 8820
},
{
"epoch": 7.605577689243028,
"grad_norm": 2.40625,
"learning_rate": 2.342e-06,
"loss": 0.0441,
"step": 8830
},
{
"epoch": 7.614191881124152,
"grad_norm": 2.078125,
"learning_rate": 2.322e-06,
"loss": 0.0466,
"step": 8840
},
{
"epoch": 7.622806073005276,
"grad_norm": 2.671875,
"learning_rate": 2.302e-06,
"loss": 0.0447,
"step": 8850
},
{
"epoch": 7.6314202648864,
"grad_norm": 2.3125,
"learning_rate": 2.282e-06,
"loss": 0.0469,
"step": 8860
},
{
"epoch": 7.640034456767524,
"grad_norm": 3.34375,
"learning_rate": 2.262e-06,
"loss": 0.0475,
"step": 8870
},
{
"epoch": 7.648648648648649,
"grad_norm": 2.515625,
"learning_rate": 2.2420000000000003e-06,
"loss": 0.0457,
"step": 8880
},
{
"epoch": 7.657262840529773,
"grad_norm": 2.4375,
"learning_rate": 2.222e-06,
"loss": 0.0437,
"step": 8890
},
{
"epoch": 7.665877032410897,
"grad_norm": 2.25,
"learning_rate": 2.2020000000000003e-06,
"loss": 0.0467,
"step": 8900
},
{
"epoch": 7.674491224292021,
"grad_norm": 2.75,
"learning_rate": 2.182e-06,
"loss": 0.0472,
"step": 8910
},
{
"epoch": 7.683105416173145,
"grad_norm": 2.84375,
"learning_rate": 2.1620000000000002e-06,
"loss": 0.0477,
"step": 8920
},
{
"epoch": 7.691719608054269,
"grad_norm": 2.875,
"learning_rate": 2.142e-06,
"loss": 0.0469,
"step": 8930
},
{
"epoch": 7.700333799935393,
"grad_norm": 2.609375,
"learning_rate": 2.122e-06,
"loss": 0.0481,
"step": 8940
},
{
"epoch": 7.708947991816518,
"grad_norm": 2.984375,
"learning_rate": 2.102e-06,
"loss": 0.0464,
"step": 8950
},
{
"epoch": 7.717562183697642,
"grad_norm": 2.578125,
"learning_rate": 2.082e-06,
"loss": 0.0469,
"step": 8960
},
{
"epoch": 7.726176375578766,
"grad_norm": 2.515625,
"learning_rate": 2.062e-06,
"loss": 0.045,
"step": 8970
},
{
"epoch": 7.73479056745989,
"grad_norm": 2.4375,
"learning_rate": 2.042e-06,
"loss": 0.0434,
"step": 8980
},
{
"epoch": 7.743404759341014,
"grad_norm": 2.78125,
"learning_rate": 2.022e-06,
"loss": 0.0437,
"step": 8990
},
{
"epoch": 7.752018951222139,
"grad_norm": 2.59375,
"learning_rate": 2.002e-06,
"loss": 0.0469,
"step": 9000
},
{
"epoch": 7.760633143103263,
"grad_norm": 2.453125,
"learning_rate": 1.982e-06,
"loss": 0.0458,
"step": 9010
},
{
"epoch": 7.769247334984387,
"grad_norm": 2.609375,
"learning_rate": 1.9620000000000004e-06,
"loss": 0.049,
"step": 9020
},
{
"epoch": 7.777861526865511,
"grad_norm": 2.390625,
"learning_rate": 1.942e-06,
"loss": 0.0472,
"step": 9030
},
{
"epoch": 7.786475718746635,
"grad_norm": 2.1875,
"learning_rate": 1.9220000000000004e-06,
"loss": 0.0474,
"step": 9040
},
{
"epoch": 7.795089910627759,
"grad_norm": 2.609375,
"learning_rate": 1.9020000000000002e-06,
"loss": 0.0456,
"step": 9050
},
{
"epoch": 7.803704102508883,
"grad_norm": 2.296875,
"learning_rate": 1.8820000000000001e-06,
"loss": 0.0427,
"step": 9060
},
{
"epoch": 7.812318294390008,
"grad_norm": 2.71875,
"learning_rate": 1.8620000000000001e-06,
"loss": 0.0441,
"step": 9070
},
{
"epoch": 7.820932486271132,
"grad_norm": 2.296875,
"learning_rate": 1.8420000000000001e-06,
"loss": 0.0435,
"step": 9080
},
{
"epoch": 7.829546678152256,
"grad_norm": 2.4375,
"learning_rate": 1.822e-06,
"loss": 0.0489,
"step": 9090
},
{
"epoch": 7.83816087003338,
"grad_norm": 2.453125,
"learning_rate": 1.802e-06,
"loss": 0.0443,
"step": 9100
},
{
"epoch": 7.846775061914504,
"grad_norm": 2.578125,
"learning_rate": 1.782e-06,
"loss": 0.045,
"step": 9110
},
{
"epoch": 7.855389253795629,
"grad_norm": 2.765625,
"learning_rate": 1.762e-06,
"loss": 0.0422,
"step": 9120
},
{
"epoch": 7.864003445676753,
"grad_norm": 2.546875,
"learning_rate": 1.742e-06,
"loss": 0.0454,
"step": 9130
},
{
"epoch": 7.872617637557877,
"grad_norm": 2.46875,
"learning_rate": 1.722e-06,
"loss": 0.0441,
"step": 9140
},
{
"epoch": 7.881231829439001,
"grad_norm": 2.4375,
"learning_rate": 1.702e-06,
"loss": 0.046,
"step": 9150
},
{
"epoch": 7.889846021320125,
"grad_norm": 2.71875,
"learning_rate": 1.682e-06,
"loss": 0.0467,
"step": 9160
},
{
"epoch": 7.898460213201249,
"grad_norm": 2.75,
"learning_rate": 1.662e-06,
"loss": 0.0486,
"step": 9170
},
{
"epoch": 7.907074405082374,
"grad_norm": 2.640625,
"learning_rate": 1.6420000000000003e-06,
"loss": 0.0475,
"step": 9180
},
{
"epoch": 7.915688596963498,
"grad_norm": 2.40625,
"learning_rate": 1.6220000000000003e-06,
"loss": 0.0476,
"step": 9190
},
{
"epoch": 7.924302788844622,
"grad_norm": 2.234375,
"learning_rate": 1.6020000000000003e-06,
"loss": 0.0425,
"step": 9200
},
{
"epoch": 7.932916980725746,
"grad_norm": 2.28125,
"learning_rate": 1.5820000000000003e-06,
"loss": 0.0447,
"step": 9210
},
{
"epoch": 7.94153117260687,
"grad_norm": 2.109375,
"learning_rate": 1.5620000000000002e-06,
"loss": 0.0484,
"step": 9220
},
{
"epoch": 7.950145364487994,
"grad_norm": 2.46875,
"learning_rate": 1.5420000000000002e-06,
"loss": 0.0455,
"step": 9230
},
{
"epoch": 7.9587595563691185,
"grad_norm": 2.703125,
"learning_rate": 1.5220000000000002e-06,
"loss": 0.0462,
"step": 9240
},
{
"epoch": 7.9673737482502425,
"grad_norm": 2.328125,
"learning_rate": 1.5020000000000002e-06,
"loss": 0.045,
"step": 9250
},
{
"epoch": 7.9759879401313665,
"grad_norm": 2.34375,
"learning_rate": 1.4820000000000002e-06,
"loss": 0.0447,
"step": 9260
},
{
"epoch": 7.9846021320124905,
"grad_norm": 3.34375,
"learning_rate": 1.4620000000000001e-06,
"loss": 0.0472,
"step": 9270
},
{
"epoch": 7.9932163238936145,
"grad_norm": 2.578125,
"learning_rate": 1.4420000000000001e-06,
"loss": 0.047,
"step": 9280
},
{
"epoch": 8.001722838376224,
"grad_norm": 2.03125,
"learning_rate": 1.4220000000000001e-06,
"loss": 0.0444,
"step": 9290
},
{
"epoch": 8.010337030257348,
"grad_norm": 2.59375,
"learning_rate": 1.402e-06,
"loss": 0.0444,
"step": 9300
},
{
"epoch": 8.018951222138472,
"grad_norm": 1.953125,
"learning_rate": 1.382e-06,
"loss": 0.0397,
"step": 9310
},
{
"epoch": 8.027565414019596,
"grad_norm": 2.40625,
"learning_rate": 1.362e-06,
"loss": 0.0425,
"step": 9320
},
{
"epoch": 8.036179605900722,
"grad_norm": 2.1875,
"learning_rate": 1.3420000000000002e-06,
"loss": 0.043,
"step": 9330
},
{
"epoch": 8.044793797781846,
"grad_norm": 2.390625,
"learning_rate": 1.3220000000000002e-06,
"loss": 0.0451,
"step": 9340
},
{
"epoch": 8.05340798966297,
"grad_norm": 2.25,
"learning_rate": 1.3020000000000002e-06,
"loss": 0.0442,
"step": 9350
},
{
"epoch": 8.062022181544094,
"grad_norm": 1.96875,
"learning_rate": 1.2820000000000002e-06,
"loss": 0.0428,
"step": 9360
},
{
"epoch": 8.070636373425218,
"grad_norm": 2.171875,
"learning_rate": 1.2620000000000002e-06,
"loss": 0.0441,
"step": 9370
},
{
"epoch": 8.079250565306342,
"grad_norm": 2.390625,
"learning_rate": 1.2420000000000001e-06,
"loss": 0.045,
"step": 9380
},
{
"epoch": 8.087864757187466,
"grad_norm": 2.421875,
"learning_rate": 1.2220000000000001e-06,
"loss": 0.0443,
"step": 9390
},
{
"epoch": 8.09647894906859,
"grad_norm": 2.515625,
"learning_rate": 1.202e-06,
"loss": 0.0422,
"step": 9400
},
{
"epoch": 8.105093140949714,
"grad_norm": 2.28125,
"learning_rate": 1.182e-06,
"loss": 0.0417,
"step": 9410
},
{
"epoch": 8.113707332830838,
"grad_norm": 2.28125,
"learning_rate": 1.162e-06,
"loss": 0.0432,
"step": 9420
},
{
"epoch": 8.122321524711962,
"grad_norm": 2.453125,
"learning_rate": 1.142e-06,
"loss": 0.0419,
"step": 9430
},
{
"epoch": 8.130935716593086,
"grad_norm": 2.109375,
"learning_rate": 1.122e-06,
"loss": 0.0416,
"step": 9440
},
{
"epoch": 8.139549908474212,
"grad_norm": 2.359375,
"learning_rate": 1.1020000000000002e-06,
"loss": 0.0422,
"step": 9450
},
{
"epoch": 8.148164100355336,
"grad_norm": 2.09375,
"learning_rate": 1.0820000000000002e-06,
"loss": 0.0455,
"step": 9460
},
{
"epoch": 8.15677829223646,
"grad_norm": 2.609375,
"learning_rate": 1.0620000000000002e-06,
"loss": 0.0455,
"step": 9470
},
{
"epoch": 8.165392484117584,
"grad_norm": 2.296875,
"learning_rate": 1.0420000000000001e-06,
"loss": 0.0441,
"step": 9480
},
{
"epoch": 8.174006675998708,
"grad_norm": 2.484375,
"learning_rate": 1.0220000000000001e-06,
"loss": 0.0433,
"step": 9490
},
{
"epoch": 8.182620867879832,
"grad_norm": 2.453125,
"learning_rate": 1.002e-06,
"loss": 0.045,
"step": 9500
},
{
"epoch": 8.191235059760956,
"grad_norm": 2.15625,
"learning_rate": 9.82e-07,
"loss": 0.0443,
"step": 9510
},
{
"epoch": 8.19984925164208,
"grad_norm": 2.203125,
"learning_rate": 9.62e-07,
"loss": 0.0427,
"step": 9520
},
{
"epoch": 8.208463443523204,
"grad_norm": 2.0625,
"learning_rate": 9.420000000000002e-07,
"loss": 0.0412,
"step": 9530
},
{
"epoch": 8.217077635404328,
"grad_norm": 2.234375,
"learning_rate": 9.220000000000001e-07,
"loss": 0.0431,
"step": 9540
},
{
"epoch": 8.225691827285452,
"grad_norm": 2.28125,
"learning_rate": 9.020000000000001e-07,
"loss": 0.0414,
"step": 9550
},
{
"epoch": 8.234306019166578,
"grad_norm": 2.796875,
"learning_rate": 8.820000000000001e-07,
"loss": 0.0454,
"step": 9560
},
{
"epoch": 8.242920211047702,
"grad_norm": 2.40625,
"learning_rate": 8.620000000000001e-07,
"loss": 0.0417,
"step": 9570
},
{
"epoch": 8.251534402928826,
"grad_norm": 2.25,
"learning_rate": 8.42e-07,
"loss": 0.0446,
"step": 9580
},
{
"epoch": 8.26014859480995,
"grad_norm": 2.828125,
"learning_rate": 8.22e-07,
"loss": 0.0434,
"step": 9590
},
{
"epoch": 8.268762786691074,
"grad_norm": 2.140625,
"learning_rate": 8.02e-07,
"loss": 0.0397,
"step": 9600
},
{
"epoch": 8.277376978572198,
"grad_norm": 2.0625,
"learning_rate": 7.820000000000001e-07,
"loss": 0.0429,
"step": 9610
},
{
"epoch": 8.285991170453322,
"grad_norm": 1.921875,
"learning_rate": 7.620000000000001e-07,
"loss": 0.0421,
"step": 9620
},
{
"epoch": 8.294605362334446,
"grad_norm": 2.65625,
"learning_rate": 7.420000000000001e-07,
"loss": 0.0433,
"step": 9630
},
{
"epoch": 8.30321955421557,
"grad_norm": 2.203125,
"learning_rate": 7.22e-07,
"loss": 0.0449,
"step": 9640
},
{
"epoch": 8.311833746096694,
"grad_norm": 2.203125,
"learning_rate": 7.02e-07,
"loss": 0.043,
"step": 9650
},
{
"epoch": 8.320447937977818,
"grad_norm": 2.25,
"learning_rate": 6.82e-07,
"loss": 0.0442,
"step": 9660
},
{
"epoch": 8.329062129858942,
"grad_norm": 2.171875,
"learning_rate": 6.62e-07,
"loss": 0.045,
"step": 9670
},
{
"epoch": 8.337676321740068,
"grad_norm": 2.359375,
"learning_rate": 6.42e-07,
"loss": 0.0457,
"step": 9680
},
{
"epoch": 8.346290513621192,
"grad_norm": 2.265625,
"learning_rate": 6.22e-07,
"loss": 0.0431,
"step": 9690
},
{
"epoch": 8.354904705502316,
"grad_norm": 2.640625,
"learning_rate": 6.02e-07,
"loss": 0.0465,
"step": 9700
},
{
"epoch": 8.36351889738344,
"grad_norm": 2.34375,
"learning_rate": 5.820000000000001e-07,
"loss": 0.0423,
"step": 9710
},
{
"epoch": 8.372133089264564,
"grad_norm": 2.296875,
"learning_rate": 5.620000000000001e-07,
"loss": 0.0428,
"step": 9720
},
{
"epoch": 8.380747281145688,
"grad_norm": 2.21875,
"learning_rate": 5.420000000000001e-07,
"loss": 0.041,
"step": 9730
},
{
"epoch": 8.389361473026812,
"grad_norm": 2.734375,
"learning_rate": 5.22e-07,
"loss": 0.0434,
"step": 9740
},
{
"epoch": 8.397975664907936,
"grad_norm": 2.1875,
"learning_rate": 5.02e-07,
"loss": 0.0451,
"step": 9750
},
{
"epoch": 8.40658985678906,
"grad_norm": 2.34375,
"learning_rate": 4.82e-07,
"loss": 0.0422,
"step": 9760
},
{
"epoch": 8.415204048670184,
"grad_norm": 2.140625,
"learning_rate": 4.6200000000000003e-07,
"loss": 0.0443,
"step": 9770
},
{
"epoch": 8.423818240551308,
"grad_norm": 2.15625,
"learning_rate": 4.4200000000000007e-07,
"loss": 0.043,
"step": 9780
},
{
"epoch": 8.432432432432432,
"grad_norm": 2.46875,
"learning_rate": 4.2200000000000005e-07,
"loss": 0.043,
"step": 9790
},
{
"epoch": 8.441046624313557,
"grad_norm": 2.03125,
"learning_rate": 4.02e-07,
"loss": 0.0437,
"step": 9800
},
{
"epoch": 8.449660816194681,
"grad_norm": 2.4375,
"learning_rate": 3.82e-07,
"loss": 0.0428,
"step": 9810
},
{
"epoch": 8.458275008075805,
"grad_norm": 2.421875,
"learning_rate": 3.6200000000000004e-07,
"loss": 0.0453,
"step": 9820
},
{
"epoch": 8.46688919995693,
"grad_norm": 2.453125,
"learning_rate": 3.42e-07,
"loss": 0.0444,
"step": 9830
},
{
"epoch": 8.475503391838053,
"grad_norm": 2.890625,
"learning_rate": 3.22e-07,
"loss": 0.0433,
"step": 9840
},
{
"epoch": 8.484117583719177,
"grad_norm": 2.078125,
"learning_rate": 3.0200000000000003e-07,
"loss": 0.0442,
"step": 9850
},
{
"epoch": 8.492731775600301,
"grad_norm": 2.234375,
"learning_rate": 2.82e-07,
"loss": 0.0434,
"step": 9860
},
{
"epoch": 8.501345967481425,
"grad_norm": 2.40625,
"learning_rate": 2.6200000000000004e-07,
"loss": 0.0454,
"step": 9870
},
{
"epoch": 8.50996015936255,
"grad_norm": 2.0625,
"learning_rate": 2.42e-07,
"loss": 0.0412,
"step": 9880
},
{
"epoch": 8.518574351243673,
"grad_norm": 2.265625,
"learning_rate": 2.2200000000000003e-07,
"loss": 0.043,
"step": 9890
},
{
"epoch": 8.527188543124797,
"grad_norm": 2.703125,
"learning_rate": 2.02e-07,
"loss": 0.0412,
"step": 9900
},
{
"epoch": 8.535802735005923,
"grad_norm": 2.4375,
"learning_rate": 1.8200000000000002e-07,
"loss": 0.0431,
"step": 9910
},
{
"epoch": 8.544416926887047,
"grad_norm": 2.734375,
"learning_rate": 1.62e-07,
"loss": 0.0413,
"step": 9920
},
{
"epoch": 8.553031118768171,
"grad_norm": 2.296875,
"learning_rate": 1.4200000000000003e-07,
"loss": 0.0458,
"step": 9930
},
{
"epoch": 8.561645310649295,
"grad_norm": 2.359375,
"learning_rate": 1.22e-07,
"loss": 0.0445,
"step": 9940
},
{
"epoch": 8.57025950253042,
"grad_norm": 2.46875,
"learning_rate": 1.0200000000000001e-07,
"loss": 0.0454,
"step": 9950
},
{
"epoch": 8.578873694411543,
"grad_norm": 2.546875,
"learning_rate": 8.200000000000002e-08,
"loss": 0.0428,
"step": 9960
},
{
"epoch": 8.587487886292667,
"grad_norm": 2.09375,
"learning_rate": 6.2e-08,
"loss": 0.0419,
"step": 9970
},
{
"epoch": 8.596102078173791,
"grad_norm": 1.8984375,
"learning_rate": 4.2e-08,
"loss": 0.044,
"step": 9980
},
{
"epoch": 8.604716270054915,
"grad_norm": 2.078125,
"learning_rate": 2.2000000000000002e-08,
"loss": 0.0411,
"step": 9990
},
{
"epoch": 8.61333046193604,
"grad_norm": 2.5625,
"learning_rate": 2e-09,
"loss": 0.0433,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.840766623509979e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}