{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.182620867879832, "eval_steps": 500, "global_step": 9500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008614191881124151, "grad_norm": 3.921875, "learning_rate": 1.9982000000000003e-05, "loss": 1.3429, "step": 10 }, { "epoch": 0.017228383762248303, "grad_norm": 3.15625, "learning_rate": 1.9962000000000003e-05, "loss": 0.7212, "step": 20 }, { "epoch": 0.025842575643372456, "grad_norm": 3.015625, "learning_rate": 1.9942e-05, "loss": 0.6892, "step": 30 }, { "epoch": 0.034456767524496605, "grad_norm": 2.890625, "learning_rate": 1.9922e-05, "loss": 0.6611, "step": 40 }, { "epoch": 0.04307095940562076, "grad_norm": 2.875, "learning_rate": 1.9902e-05, "loss": 0.6514, "step": 50 }, { "epoch": 0.05168515128674491, "grad_norm": 3.0, "learning_rate": 1.9882e-05, "loss": 0.6437, "step": 60 }, { "epoch": 0.060299343167869064, "grad_norm": 2.96875, "learning_rate": 1.9862e-05, "loss": 0.6161, "step": 70 }, { "epoch": 0.06891353504899321, "grad_norm": 3.109375, "learning_rate": 1.9842e-05, "loss": 0.6083, "step": 80 }, { "epoch": 0.07752772693011736, "grad_norm": 3.296875, "learning_rate": 1.9822e-05, "loss": 0.5967, "step": 90 }, { "epoch": 0.08614191881124152, "grad_norm": 2.953125, "learning_rate": 1.9802e-05, "loss": 0.5773, "step": 100 }, { "epoch": 0.09475611069236567, "grad_norm": 3.21875, "learning_rate": 1.9782e-05, "loss": 0.5722, "step": 110 }, { "epoch": 0.10337030257348982, "grad_norm": 2.796875, "learning_rate": 1.9762e-05, "loss": 0.5528, "step": 120 }, { "epoch": 0.11198449445461398, "grad_norm": 2.953125, "learning_rate": 1.9742000000000002e-05, "loss": 0.5952, "step": 130 }, { "epoch": 0.12059868633573813, "grad_norm": 2.6875, "learning_rate": 1.9722000000000002e-05, "loss": 0.5309, "step": 140 }, { "epoch": 0.12921287821686228, "grad_norm": 2.609375, "learning_rate": 1.9702000000000002e-05, "loss": 0.5353, "step": 150 }, { "epoch": 0.13782707009798642, "grad_norm": 2.71875, "learning_rate": 1.9682000000000002e-05, "loss": 0.5447, "step": 160 }, { "epoch": 0.1464412619791106, "grad_norm": 2.859375, "learning_rate": 1.9662000000000003e-05, "loss": 0.4998, "step": 170 }, { "epoch": 0.15505545386023473, "grad_norm": 3.09375, "learning_rate": 1.9642000000000003e-05, "loss": 0.5191, "step": 180 }, { "epoch": 0.1636696457413589, "grad_norm": 3.140625, "learning_rate": 1.9622e-05, "loss": 0.5358, "step": 190 }, { "epoch": 0.17228383762248303, "grad_norm": 2.515625, "learning_rate": 1.9602e-05, "loss": 0.4914, "step": 200 }, { "epoch": 0.1808980295036072, "grad_norm": 2.6875, "learning_rate": 1.9582e-05, "loss": 0.4943, "step": 210 }, { "epoch": 0.18951222138473134, "grad_norm": 2.53125, "learning_rate": 1.9562e-05, "loss": 0.4731, "step": 220 }, { "epoch": 0.1981264132658555, "grad_norm": 2.625, "learning_rate": 1.9542e-05, "loss": 0.497, "step": 230 }, { "epoch": 0.20674060514697964, "grad_norm": 2.46875, "learning_rate": 1.9522e-05, "loss": 0.4746, "step": 240 }, { "epoch": 0.2153547970281038, "grad_norm": 2.515625, "learning_rate": 1.9502e-05, "loss": 0.4763, "step": 250 }, { "epoch": 0.22396898890922795, "grad_norm": 2.546875, "learning_rate": 1.9482e-05, "loss": 0.4759, "step": 260 }, { "epoch": 0.23258318079035212, "grad_norm": 2.71875, "learning_rate": 1.9462e-05, "loss": 0.5039, "step": 270 }, { "epoch": 0.24119737267147626, "grad_norm": 2.546875, "learning_rate": 1.9442e-05, "loss": 0.4799, "step": 280 }, { "epoch": 0.2498115645526004, "grad_norm": 2.84375, "learning_rate": 1.9422e-05, "loss": 0.4446, "step": 290 }, { "epoch": 0.25842575643372456, "grad_norm": 2.671875, "learning_rate": 1.9402e-05, "loss": 0.4727, "step": 300 }, { "epoch": 0.26703994831484873, "grad_norm": 2.890625, "learning_rate": 1.9382000000000002e-05, "loss": 0.4189, "step": 310 }, { "epoch": 0.27565414019597284, "grad_norm": 2.265625, "learning_rate": 1.9362000000000002e-05, "loss": 0.4409, "step": 320 }, { "epoch": 0.284268332077097, "grad_norm": 3.25, "learning_rate": 1.9342000000000002e-05, "loss": 0.4656, "step": 330 }, { "epoch": 0.2928825239582212, "grad_norm": 2.578125, "learning_rate": 1.9322000000000002e-05, "loss": 0.4713, "step": 340 }, { "epoch": 0.30149671583934534, "grad_norm": 2.0625, "learning_rate": 1.9302e-05, "loss": 0.4282, "step": 350 }, { "epoch": 0.31011090772046945, "grad_norm": 2.953125, "learning_rate": 1.9282e-05, "loss": 0.4565, "step": 360 }, { "epoch": 0.3187250996015936, "grad_norm": 2.296875, "learning_rate": 1.9262e-05, "loss": 0.4346, "step": 370 }, { "epoch": 0.3273392914827178, "grad_norm": 2.828125, "learning_rate": 1.9242e-05, "loss": 0.426, "step": 380 }, { "epoch": 0.33595348336384195, "grad_norm": 2.3125, "learning_rate": 1.9222e-05, "loss": 0.42, "step": 390 }, { "epoch": 0.34456767524496607, "grad_norm": 2.171875, "learning_rate": 1.9202e-05, "loss": 0.4317, "step": 400 }, { "epoch": 0.35318186712609023, "grad_norm": 2.59375, "learning_rate": 1.9182e-05, "loss": 0.4311, "step": 410 }, { "epoch": 0.3617960590072144, "grad_norm": 2.609375, "learning_rate": 1.9162e-05, "loss": 0.4056, "step": 420 }, { "epoch": 0.3704102508883385, "grad_norm": 2.1875, "learning_rate": 1.9142e-05, "loss": 0.4029, "step": 430 }, { "epoch": 0.3790244427694627, "grad_norm": 3.125, "learning_rate": 1.9122e-05, "loss": 0.4337, "step": 440 }, { "epoch": 0.38763863465058684, "grad_norm": 2.390625, "learning_rate": 1.9102e-05, "loss": 0.4381, "step": 450 }, { "epoch": 0.396252826531711, "grad_norm": 2.796875, "learning_rate": 1.9082e-05, "loss": 0.4174, "step": 460 }, { "epoch": 0.4048670184128351, "grad_norm": 2.421875, "learning_rate": 1.9062e-05, "loss": 0.3928, "step": 470 }, { "epoch": 0.4134812102939593, "grad_norm": 2.515625, "learning_rate": 1.9042e-05, "loss": 0.4051, "step": 480 }, { "epoch": 0.42209540217508346, "grad_norm": 2.40625, "learning_rate": 1.9022000000000002e-05, "loss": 0.3992, "step": 490 }, { "epoch": 0.4307095940562076, "grad_norm": 2.21875, "learning_rate": 1.9002000000000002e-05, "loss": 0.4194, "step": 500 }, { "epoch": 0.43932378593733173, "grad_norm": 3.328125, "learning_rate": 1.8982000000000002e-05, "loss": 0.3951, "step": 510 }, { "epoch": 0.4479379778184559, "grad_norm": 2.234375, "learning_rate": 1.8962000000000002e-05, "loss": 0.3918, "step": 520 }, { "epoch": 0.45655216969958007, "grad_norm": 2.65625, "learning_rate": 1.8942000000000003e-05, "loss": 0.3854, "step": 530 }, { "epoch": 0.46516636158070424, "grad_norm": 2.0625, "learning_rate": 1.8922000000000003e-05, "loss": 0.3836, "step": 540 }, { "epoch": 0.47378055346182835, "grad_norm": 2.28125, "learning_rate": 1.8902000000000003e-05, "loss": 0.3824, "step": 550 }, { "epoch": 0.4823947453429525, "grad_norm": 2.625, "learning_rate": 1.8882000000000003e-05, "loss": 0.3913, "step": 560 }, { "epoch": 0.4910089372240767, "grad_norm": 2.265625, "learning_rate": 1.8862000000000003e-05, "loss": 0.3834, "step": 570 }, { "epoch": 0.4996231291052008, "grad_norm": 2.359375, "learning_rate": 1.8842000000000004e-05, "loss": 0.3848, "step": 580 }, { "epoch": 0.508237320986325, "grad_norm": 2.34375, "learning_rate": 1.8822000000000004e-05, "loss": 0.3845, "step": 590 }, { "epoch": 0.5168515128674491, "grad_norm": 2.453125, "learning_rate": 1.8802000000000004e-05, "loss": 0.3836, "step": 600 }, { "epoch": 0.5254657047485732, "grad_norm": 2.0, "learning_rate": 1.8782e-05, "loss": 0.3799, "step": 610 }, { "epoch": 0.5340798966296975, "grad_norm": 2.296875, "learning_rate": 1.8762e-05, "loss": 0.3715, "step": 620 }, { "epoch": 0.5426940885108216, "grad_norm": 2.0625, "learning_rate": 1.8742e-05, "loss": 0.3825, "step": 630 }, { "epoch": 0.5513082803919457, "grad_norm": 2.359375, "learning_rate": 1.8722e-05, "loss": 0.364, "step": 640 }, { "epoch": 0.5599224722730699, "grad_norm": 2.1875, "learning_rate": 1.8702e-05, "loss": 0.3765, "step": 650 }, { "epoch": 0.568536664154194, "grad_norm": 1.96875, "learning_rate": 1.8682000000000002e-05, "loss": 0.3748, "step": 660 }, { "epoch": 0.5771508560353182, "grad_norm": 2.390625, "learning_rate": 1.8662000000000002e-05, "loss": 0.3751, "step": 670 }, { "epoch": 0.5857650479164423, "grad_norm": 2.203125, "learning_rate": 1.8642000000000002e-05, "loss": 0.3778, "step": 680 }, { "epoch": 0.5943792397975665, "grad_norm": 1.796875, "learning_rate": 1.8622000000000002e-05, "loss": 0.3798, "step": 690 }, { "epoch": 0.6029934316786907, "grad_norm": 2.46875, "learning_rate": 1.8602000000000002e-05, "loss": 0.3682, "step": 700 }, { "epoch": 0.6116076235598148, "grad_norm": 1.9765625, "learning_rate": 1.8582000000000003e-05, "loss": 0.3652, "step": 710 }, { "epoch": 0.6202218154409389, "grad_norm": 2.25, "learning_rate": 1.8562000000000003e-05, "loss": 0.3658, "step": 720 }, { "epoch": 0.6288360073220631, "grad_norm": 1.9140625, "learning_rate": 1.8542000000000003e-05, "loss": 0.389, "step": 730 }, { "epoch": 0.6374501992031872, "grad_norm": 1.9375, "learning_rate": 1.8522000000000003e-05, "loss": 0.375, "step": 740 }, { "epoch": 0.6460643910843114, "grad_norm": 2.140625, "learning_rate": 1.8502000000000003e-05, "loss": 0.3617, "step": 750 }, { "epoch": 0.6546785829654356, "grad_norm": 1.9140625, "learning_rate": 1.8482000000000004e-05, "loss": 0.3777, "step": 760 }, { "epoch": 0.6632927748465597, "grad_norm": 1.9453125, "learning_rate": 1.8462000000000004e-05, "loss": 0.3599, "step": 770 }, { "epoch": 0.6719069667276839, "grad_norm": 1.75, "learning_rate": 1.8442e-05, "loss": 0.3495, "step": 780 }, { "epoch": 0.680521158608808, "grad_norm": 2.109375, "learning_rate": 1.8422e-05, "loss": 0.3461, "step": 790 }, { "epoch": 0.6891353504899321, "grad_norm": 2.078125, "learning_rate": 1.8402e-05, "loss": 0.3661, "step": 800 }, { "epoch": 0.6977495423710564, "grad_norm": 2.03125, "learning_rate": 1.8382e-05, "loss": 0.3594, "step": 810 }, { "epoch": 0.7063637342521805, "grad_norm": 1.984375, "learning_rate": 1.8362e-05, "loss": 0.3512, "step": 820 }, { "epoch": 0.7149779261333046, "grad_norm": 2.0625, "learning_rate": 1.8342e-05, "loss": 0.3616, "step": 830 }, { "epoch": 0.7235921180144288, "grad_norm": 2.484375, "learning_rate": 1.8322000000000002e-05, "loss": 0.3575, "step": 840 }, { "epoch": 0.7322063098955529, "grad_norm": 2.1875, "learning_rate": 1.8302000000000002e-05, "loss": 0.3712, "step": 850 }, { "epoch": 0.740820501776677, "grad_norm": 2.046875, "learning_rate": 1.8282000000000002e-05, "loss": 0.3724, "step": 860 }, { "epoch": 0.7494346936578012, "grad_norm": 1.953125, "learning_rate": 1.8262000000000002e-05, "loss": 0.3524, "step": 870 }, { "epoch": 0.7580488855389254, "grad_norm": 2.203125, "learning_rate": 1.8242000000000003e-05, "loss": 0.3543, "step": 880 }, { "epoch": 0.7666630774200496, "grad_norm": 2.015625, "learning_rate": 1.8222000000000003e-05, "loss": 0.3697, "step": 890 }, { "epoch": 0.7752772693011737, "grad_norm": 2.140625, "learning_rate": 1.8202000000000003e-05, "loss": 0.3583, "step": 900 }, { "epoch": 0.7838914611822978, "grad_norm": 1.9921875, "learning_rate": 1.8182000000000003e-05, "loss": 0.3753, "step": 910 }, { "epoch": 0.792505653063422, "grad_norm": 1.9375, "learning_rate": 1.8162000000000003e-05, "loss": 0.3581, "step": 920 }, { "epoch": 0.8011198449445461, "grad_norm": 2.171875, "learning_rate": 1.8142000000000004e-05, "loss": 0.3534, "step": 930 }, { "epoch": 0.8097340368256702, "grad_norm": 2.328125, "learning_rate": 1.8122e-05, "loss": 0.3654, "step": 940 }, { "epoch": 0.8183482287067945, "grad_norm": 1.875, "learning_rate": 1.8102e-05, "loss": 0.3663, "step": 950 }, { "epoch": 0.8269624205879186, "grad_norm": 1.921875, "learning_rate": 1.8082e-05, "loss": 0.3599, "step": 960 }, { "epoch": 0.8355766124690428, "grad_norm": 2.296875, "learning_rate": 1.8062e-05, "loss": 0.3511, "step": 970 }, { "epoch": 0.8441908043501669, "grad_norm": 2.0625, "learning_rate": 1.8042e-05, "loss": 0.3615, "step": 980 }, { "epoch": 0.852804996231291, "grad_norm": 1.9140625, "learning_rate": 1.8022e-05, "loss": 0.3523, "step": 990 }, { "epoch": 0.8614191881124152, "grad_norm": 2.0, "learning_rate": 1.8002e-05, "loss": 0.3591, "step": 1000 }, { "epoch": 0.8700333799935394, "grad_norm": 2.078125, "learning_rate": 1.7982e-05, "loss": 0.3567, "step": 1010 }, { "epoch": 0.8786475718746635, "grad_norm": 1.984375, "learning_rate": 1.7962000000000002e-05, "loss": 0.3568, "step": 1020 }, { "epoch": 0.8872617637557877, "grad_norm": 1.765625, "learning_rate": 1.7942000000000002e-05, "loss": 0.3492, "step": 1030 }, { "epoch": 0.8958759556369118, "grad_norm": 1.671875, "learning_rate": 1.7922000000000002e-05, "loss": 0.3386, "step": 1040 }, { "epoch": 0.9044901475180359, "grad_norm": 2.09375, "learning_rate": 1.7902000000000002e-05, "loss": 0.3496, "step": 1050 }, { "epoch": 0.9131043393991601, "grad_norm": 1.9296875, "learning_rate": 1.7882000000000003e-05, "loss": 0.3278, "step": 1060 }, { "epoch": 0.9217185312802842, "grad_norm": 1.8359375, "learning_rate": 1.7862000000000003e-05, "loss": 0.3343, "step": 1070 }, { "epoch": 0.9303327231614085, "grad_norm": 1.6171875, "learning_rate": 1.7842000000000003e-05, "loss": 0.3389, "step": 1080 }, { "epoch": 0.9389469150425326, "grad_norm": 1.96875, "learning_rate": 1.7822000000000003e-05, "loss": 0.351, "step": 1090 }, { "epoch": 0.9475611069236567, "grad_norm": 1.8203125, "learning_rate": 1.7802e-05, "loss": 0.3625, "step": 1100 }, { "epoch": 0.9561752988047809, "grad_norm": 2.03125, "learning_rate": 1.7782e-05, "loss": 0.3597, "step": 1110 }, { "epoch": 0.964789490685905, "grad_norm": 2.015625, "learning_rate": 1.7762e-05, "loss": 0.3631, "step": 1120 }, { "epoch": 0.9734036825670291, "grad_norm": 1.859375, "learning_rate": 1.7742e-05, "loss": 0.3378, "step": 1130 }, { "epoch": 0.9820178744481534, "grad_norm": 1.71875, "learning_rate": 1.7722e-05, "loss": 0.3461, "step": 1140 }, { "epoch": 0.9906320663292775, "grad_norm": 1.9609375, "learning_rate": 1.7702e-05, "loss": 0.3691, "step": 1150 }, { "epoch": 0.9992462582104016, "grad_norm": 1.6328125, "learning_rate": 1.7682e-05, "loss": 0.3332, "step": 1160 }, { "epoch": 1.0077527726930118, "grad_norm": 1.7890625, "learning_rate": 1.7662e-05, "loss": 0.2853, "step": 1170 }, { "epoch": 1.0163669645741358, "grad_norm": 1.8125, "learning_rate": 1.7642e-05, "loss": 0.284, "step": 1180 }, { "epoch": 1.02498115645526, "grad_norm": 1.78125, "learning_rate": 1.7622000000000002e-05, "loss": 0.2768, "step": 1190 }, { "epoch": 1.0335953483363842, "grad_norm": 1.8515625, "learning_rate": 1.7602000000000002e-05, "loss": 0.2734, "step": 1200 }, { "epoch": 1.0422095402175084, "grad_norm": 1.7734375, "learning_rate": 1.7582000000000002e-05, "loss": 0.2697, "step": 1210 }, { "epoch": 1.0508237320986324, "grad_norm": 1.9765625, "learning_rate": 1.7562000000000002e-05, "loss": 0.2799, "step": 1220 }, { "epoch": 1.0594379239797567, "grad_norm": 1.875, "learning_rate": 1.7542000000000002e-05, "loss": 0.2766, "step": 1230 }, { "epoch": 1.068052115860881, "grad_norm": 1.96875, "learning_rate": 1.7522000000000003e-05, "loss": 0.2742, "step": 1240 }, { "epoch": 1.076666307742005, "grad_norm": 1.8671875, "learning_rate": 1.7502000000000003e-05, "loss": 0.2882, "step": 1250 }, { "epoch": 1.0852804996231291, "grad_norm": 1.8046875, "learning_rate": 1.7482e-05, "loss": 0.2882, "step": 1260 }, { "epoch": 1.0938946915042533, "grad_norm": 2.140625, "learning_rate": 1.7462e-05, "loss": 0.2828, "step": 1270 }, { "epoch": 1.1025088833853773, "grad_norm": 1.859375, "learning_rate": 1.7442e-05, "loss": 0.2732, "step": 1280 }, { "epoch": 1.1111230752665016, "grad_norm": 1.65625, "learning_rate": 1.7422e-05, "loss": 0.2798, "step": 1290 }, { "epoch": 1.1197372671476258, "grad_norm": 1.8671875, "learning_rate": 1.7402e-05, "loss": 0.293, "step": 1300 }, { "epoch": 1.1283514590287498, "grad_norm": 1.9609375, "learning_rate": 1.7382e-05, "loss": 0.2971, "step": 1310 }, { "epoch": 1.136965650909874, "grad_norm": 1.953125, "learning_rate": 1.7362e-05, "loss": 0.2673, "step": 1320 }, { "epoch": 1.1455798427909982, "grad_norm": 1.75, "learning_rate": 1.7342e-05, "loss": 0.2875, "step": 1330 }, { "epoch": 1.1541940346721222, "grad_norm": 1.6640625, "learning_rate": 1.7322e-05, "loss": 0.2723, "step": 1340 }, { "epoch": 1.1628082265532464, "grad_norm": 1.9296875, "learning_rate": 1.7302e-05, "loss": 0.3001, "step": 1350 }, { "epoch": 1.1714224184343707, "grad_norm": 1.6875, "learning_rate": 1.7282e-05, "loss": 0.2878, "step": 1360 }, { "epoch": 1.180036610315495, "grad_norm": 1.5546875, "learning_rate": 1.7262000000000002e-05, "loss": 0.2663, "step": 1370 }, { "epoch": 1.188650802196619, "grad_norm": 1.90625, "learning_rate": 1.7242000000000002e-05, "loss": 0.3061, "step": 1380 }, { "epoch": 1.1972649940777431, "grad_norm": 1.84375, "learning_rate": 1.7222000000000002e-05, "loss": 0.3015, "step": 1390 }, { "epoch": 1.2058791859588673, "grad_norm": 1.8125, "learning_rate": 1.7202000000000002e-05, "loss": 0.2825, "step": 1400 }, { "epoch": 1.2144933778399913, "grad_norm": 1.859375, "learning_rate": 1.7182000000000003e-05, "loss": 0.2818, "step": 1410 }, { "epoch": 1.2231075697211156, "grad_norm": 1.96875, "learning_rate": 1.7162e-05, "loss": 0.2755, "step": 1420 }, { "epoch": 1.2317217616022398, "grad_norm": 1.7421875, "learning_rate": 1.7142e-05, "loss": 0.2897, "step": 1430 }, { "epoch": 1.2403359534833638, "grad_norm": 1.8515625, "learning_rate": 1.7122e-05, "loss": 0.2816, "step": 1440 }, { "epoch": 1.248950145364488, "grad_norm": 2.03125, "learning_rate": 1.7102e-05, "loss": 0.2798, "step": 1450 }, { "epoch": 1.2575643372456122, "grad_norm": 2.0, "learning_rate": 1.7082e-05, "loss": 0.2799, "step": 1460 }, { "epoch": 1.2661785291267362, "grad_norm": 1.6953125, "learning_rate": 1.7062e-05, "loss": 0.2843, "step": 1470 }, { "epoch": 1.2747927210078605, "grad_norm": 1.953125, "learning_rate": 1.7042e-05, "loss": 0.2876, "step": 1480 }, { "epoch": 1.2834069128889847, "grad_norm": 1.8203125, "learning_rate": 1.7022e-05, "loss": 0.2912, "step": 1490 }, { "epoch": 1.2920211047701087, "grad_norm": 2.0, "learning_rate": 1.7002e-05, "loss": 0.2811, "step": 1500 }, { "epoch": 1.300635296651233, "grad_norm": 1.875, "learning_rate": 1.6982e-05, "loss": 0.2944, "step": 1510 }, { "epoch": 1.3092494885323571, "grad_norm": 1.828125, "learning_rate": 1.6962e-05, "loss": 0.2796, "step": 1520 }, { "epoch": 1.3178636804134811, "grad_norm": 1.78125, "learning_rate": 1.6942e-05, "loss": 0.2857, "step": 1530 }, { "epoch": 1.3264778722946053, "grad_norm": 1.8359375, "learning_rate": 1.6922e-05, "loss": 0.278, "step": 1540 }, { "epoch": 1.3350920641757296, "grad_norm": 2.125, "learning_rate": 1.6902000000000002e-05, "loss": 0.2711, "step": 1550 }, { "epoch": 1.3437062560568536, "grad_norm": 1.859375, "learning_rate": 1.6882000000000002e-05, "loss": 0.2894, "step": 1560 }, { "epoch": 1.3523204479379778, "grad_norm": 1.71875, "learning_rate": 1.6862000000000002e-05, "loss": 0.2834, "step": 1570 }, { "epoch": 1.360934639819102, "grad_norm": 1.90625, "learning_rate": 1.6842e-05, "loss": 0.2748, "step": 1580 }, { "epoch": 1.369548831700226, "grad_norm": 1.734375, "learning_rate": 1.6822e-05, "loss": 0.2904, "step": 1590 }, { "epoch": 1.3781630235813502, "grad_norm": 1.65625, "learning_rate": 1.6802e-05, "loss": 0.2816, "step": 1600 }, { "epoch": 1.3867772154624745, "grad_norm": 1.90625, "learning_rate": 1.6782e-05, "loss": 0.2993, "step": 1610 }, { "epoch": 1.3953914073435985, "grad_norm": 1.96875, "learning_rate": 1.6762e-05, "loss": 0.2747, "step": 1620 }, { "epoch": 1.4040055992247227, "grad_norm": 1.90625, "learning_rate": 1.6742e-05, "loss": 0.2814, "step": 1630 }, { "epoch": 1.412619791105847, "grad_norm": 1.90625, "learning_rate": 1.6722e-05, "loss": 0.2777, "step": 1640 }, { "epoch": 1.421233982986971, "grad_norm": 2.125, "learning_rate": 1.6702e-05, "loss": 0.2789, "step": 1650 }, { "epoch": 1.4298481748680951, "grad_norm": 1.703125, "learning_rate": 1.6682e-05, "loss": 0.2876, "step": 1660 }, { "epoch": 1.4384623667492193, "grad_norm": 1.828125, "learning_rate": 1.6662e-05, "loss": 0.2855, "step": 1670 }, { "epoch": 1.4470765586303436, "grad_norm": 1.6953125, "learning_rate": 1.6642e-05, "loss": 0.2823, "step": 1680 }, { "epoch": 1.4556907505114676, "grad_norm": 1.6875, "learning_rate": 1.6622e-05, "loss": 0.2759, "step": 1690 }, { "epoch": 1.4643049423925918, "grad_norm": 1.734375, "learning_rate": 1.6602e-05, "loss": 0.2804, "step": 1700 }, { "epoch": 1.472919134273716, "grad_norm": 1.71875, "learning_rate": 1.6582e-05, "loss": 0.2772, "step": 1710 }, { "epoch": 1.4815333261548402, "grad_norm": 2.078125, "learning_rate": 1.6562e-05, "loss": 0.2834, "step": 1720 }, { "epoch": 1.4901475180359642, "grad_norm": 1.765625, "learning_rate": 1.6542000000000002e-05, "loss": 0.2788, "step": 1730 }, { "epoch": 1.4987617099170885, "grad_norm": 1.8359375, "learning_rate": 1.6522e-05, "loss": 0.2796, "step": 1740 }, { "epoch": 1.5073759017982127, "grad_norm": 1.7578125, "learning_rate": 1.6502e-05, "loss": 0.2764, "step": 1750 }, { "epoch": 1.5159900936793367, "grad_norm": 1.859375, "learning_rate": 1.6482000000000002e-05, "loss": 0.2893, "step": 1760 }, { "epoch": 1.524604285560461, "grad_norm": 1.9140625, "learning_rate": 1.6462000000000003e-05, "loss": 0.2868, "step": 1770 }, { "epoch": 1.5332184774415851, "grad_norm": 2.046875, "learning_rate": 1.6442000000000003e-05, "loss": 0.2801, "step": 1780 }, { "epoch": 1.5418326693227091, "grad_norm": 1.78125, "learning_rate": 1.6422000000000003e-05, "loss": 0.2845, "step": 1790 }, { "epoch": 1.5504468612038333, "grad_norm": 1.703125, "learning_rate": 1.6402000000000003e-05, "loss": 0.2814, "step": 1800 }, { "epoch": 1.5590610530849576, "grad_norm": 1.765625, "learning_rate": 1.6382000000000003e-05, "loss": 0.2707, "step": 1810 }, { "epoch": 1.5676752449660816, "grad_norm": 1.8046875, "learning_rate": 1.6362000000000004e-05, "loss": 0.2809, "step": 1820 }, { "epoch": 1.5762894368472058, "grad_norm": 1.7578125, "learning_rate": 1.6342000000000004e-05, "loss": 0.2875, "step": 1830 }, { "epoch": 1.58490362872833, "grad_norm": 1.734375, "learning_rate": 1.6322e-05, "loss": 0.292, "step": 1840 }, { "epoch": 1.593517820609454, "grad_norm": 1.71875, "learning_rate": 1.6302e-05, "loss": 0.2954, "step": 1850 }, { "epoch": 1.6021320124905782, "grad_norm": 1.796875, "learning_rate": 1.6282e-05, "loss": 0.2823, "step": 1860 }, { "epoch": 1.6107462043717025, "grad_norm": 1.6796875, "learning_rate": 1.6262e-05, "loss": 0.2822, "step": 1870 }, { "epoch": 1.6193603962528265, "grad_norm": 1.8515625, "learning_rate": 1.6242e-05, "loss": 0.2776, "step": 1880 }, { "epoch": 1.6279745881339507, "grad_norm": 1.7890625, "learning_rate": 1.6222e-05, "loss": 0.2798, "step": 1890 }, { "epoch": 1.636588780015075, "grad_norm": 2.0625, "learning_rate": 1.6202000000000002e-05, "loss": 0.2873, "step": 1900 }, { "epoch": 1.645202971896199, "grad_norm": 1.8203125, "learning_rate": 1.6182000000000002e-05, "loss": 0.2783, "step": 1910 }, { "epoch": 1.6538171637773231, "grad_norm": 1.796875, "learning_rate": 1.6162000000000002e-05, "loss": 0.2847, "step": 1920 }, { "epoch": 1.6624313556584474, "grad_norm": 2.046875, "learning_rate": 1.6142000000000002e-05, "loss": 0.2917, "step": 1930 }, { "epoch": 1.6710455475395714, "grad_norm": 1.796875, "learning_rate": 1.6122000000000003e-05, "loss": 0.2682, "step": 1940 }, { "epoch": 1.6796597394206956, "grad_norm": 2.109375, "learning_rate": 1.6102000000000003e-05, "loss": 0.2837, "step": 1950 }, { "epoch": 1.6882739313018198, "grad_norm": 1.78125, "learning_rate": 1.6082000000000003e-05, "loss": 0.2852, "step": 1960 }, { "epoch": 1.6968881231829438, "grad_norm": 1.8515625, "learning_rate": 1.6062000000000003e-05, "loss": 0.2896, "step": 1970 }, { "epoch": 1.705502315064068, "grad_norm": 1.9765625, "learning_rate": 1.6042000000000003e-05, "loss": 0.2827, "step": 1980 }, { "epoch": 1.7141165069451922, "grad_norm": 1.75, "learning_rate": 1.6022000000000003e-05, "loss": 0.2725, "step": 1990 }, { "epoch": 1.7227306988263162, "grad_norm": 1.8828125, "learning_rate": 1.6002000000000004e-05, "loss": 0.2835, "step": 2000 }, { "epoch": 1.7313448907074405, "grad_norm": 1.859375, "learning_rate": 1.5982e-05, "loss": 0.2779, "step": 2010 }, { "epoch": 1.7399590825885647, "grad_norm": 1.8046875, "learning_rate": 1.5962e-05, "loss": 0.2856, "step": 2020 }, { "epoch": 1.7485732744696887, "grad_norm": 1.9453125, "learning_rate": 1.5942e-05, "loss": 0.2835, "step": 2030 }, { "epoch": 1.757187466350813, "grad_norm": 1.7734375, "learning_rate": 1.5922e-05, "loss": 0.2777, "step": 2040 }, { "epoch": 1.7658016582319371, "grad_norm": 1.8671875, "learning_rate": 1.5902e-05, "loss": 0.2787, "step": 2050 }, { "epoch": 1.7744158501130611, "grad_norm": 1.7890625, "learning_rate": 1.5882e-05, "loss": 0.2842, "step": 2060 }, { "epoch": 1.7830300419941856, "grad_norm": 1.75, "learning_rate": 1.5862e-05, "loss": 0.2654, "step": 2070 }, { "epoch": 1.7916442338753096, "grad_norm": 1.875, "learning_rate": 1.5842000000000002e-05, "loss": 0.2701, "step": 2080 }, { "epoch": 1.8002584257564336, "grad_norm": 1.8203125, "learning_rate": 1.5822000000000002e-05, "loss": 0.2709, "step": 2090 }, { "epoch": 1.808872617637558, "grad_norm": 1.8046875, "learning_rate": 1.5802000000000002e-05, "loss": 0.2781, "step": 2100 }, { "epoch": 1.817486809518682, "grad_norm": 1.875, "learning_rate": 1.5782000000000002e-05, "loss": 0.2823, "step": 2110 }, { "epoch": 1.826101001399806, "grad_norm": 1.875, "learning_rate": 1.5762000000000003e-05, "loss": 0.2747, "step": 2120 }, { "epoch": 1.8347151932809305, "grad_norm": 1.8828125, "learning_rate": 1.5742000000000003e-05, "loss": 0.2731, "step": 2130 }, { "epoch": 1.8433293851620545, "grad_norm": 1.71875, "learning_rate": 1.5722000000000003e-05, "loss": 0.2738, "step": 2140 }, { "epoch": 1.8519435770431787, "grad_norm": 1.8671875, "learning_rate": 1.5702000000000003e-05, "loss": 0.2887, "step": 2150 }, { "epoch": 1.860557768924303, "grad_norm": 2.046875, "learning_rate": 1.5682000000000003e-05, "loss": 0.2841, "step": 2160 }, { "epoch": 1.869171960805427, "grad_norm": 2.015625, "learning_rate": 1.5662e-05, "loss": 0.2916, "step": 2170 }, { "epoch": 1.8777861526865511, "grad_norm": 1.7734375, "learning_rate": 1.5642e-05, "loss": 0.2747, "step": 2180 }, { "epoch": 1.8864003445676754, "grad_norm": 1.7109375, "learning_rate": 1.5622e-05, "loss": 0.2769, "step": 2190 }, { "epoch": 1.8950145364487994, "grad_norm": 1.7421875, "learning_rate": 1.5602e-05, "loss": 0.2798, "step": 2200 }, { "epoch": 1.9036287283299236, "grad_norm": 1.8125, "learning_rate": 1.5582e-05, "loss": 0.2791, "step": 2210 }, { "epoch": 1.9122429202110478, "grad_norm": 1.9296875, "learning_rate": 1.5562e-05, "loss": 0.2721, "step": 2220 }, { "epoch": 1.9208571120921718, "grad_norm": 2.015625, "learning_rate": 1.5542e-05, "loss": 0.2778, "step": 2230 }, { "epoch": 1.929471303973296, "grad_norm": 1.7265625, "learning_rate": 1.5522e-05, "loss": 0.2792, "step": 2240 }, { "epoch": 1.9380854958544202, "grad_norm": 1.671875, "learning_rate": 1.5502e-05, "loss": 0.2819, "step": 2250 }, { "epoch": 1.9466996877355442, "grad_norm": 1.8515625, "learning_rate": 1.5482000000000002e-05, "loss": 0.2691, "step": 2260 }, { "epoch": 1.9553138796166685, "grad_norm": 1.6015625, "learning_rate": 1.5462000000000002e-05, "loss": 0.2731, "step": 2270 }, { "epoch": 1.9639280714977927, "grad_norm": 1.734375, "learning_rate": 1.5442000000000002e-05, "loss": 0.2709, "step": 2280 }, { "epoch": 1.9725422633789167, "grad_norm": 2.0625, "learning_rate": 1.5422000000000002e-05, "loss": 0.288, "step": 2290 }, { "epoch": 1.981156455260041, "grad_norm": 1.8046875, "learning_rate": 1.5402000000000003e-05, "loss": 0.2807, "step": 2300 }, { "epoch": 1.9897706471411651, "grad_norm": 1.8671875, "learning_rate": 1.5382000000000003e-05, "loss": 0.2769, "step": 2310 }, { "epoch": 1.9983848390222891, "grad_norm": 1.90625, "learning_rate": 1.5362000000000003e-05, "loss": 0.2855, "step": 2320 }, { "epoch": 2.006891353504899, "grad_norm": 1.875, "learning_rate": 1.5342e-05, "loss": 0.2429, "step": 2330 }, { "epoch": 2.0155055453860236, "grad_norm": 2.0, "learning_rate": 1.5322e-05, "loss": 0.2166, "step": 2340 }, { "epoch": 2.0241197372671476, "grad_norm": 1.8125, "learning_rate": 1.5302e-05, "loss": 0.2022, "step": 2350 }, { "epoch": 2.0327339291482716, "grad_norm": 1.8984375, "learning_rate": 1.5282e-05, "loss": 0.1995, "step": 2360 }, { "epoch": 2.041348121029396, "grad_norm": 1.8828125, "learning_rate": 1.5262e-05, "loss": 0.1997, "step": 2370 }, { "epoch": 2.04996231291052, "grad_norm": 2.109375, "learning_rate": 1.5242e-05, "loss": 0.2031, "step": 2380 }, { "epoch": 2.0585765047916444, "grad_norm": 1.9609375, "learning_rate": 1.5222000000000001e-05, "loss": 0.21, "step": 2390 }, { "epoch": 2.0671906966727684, "grad_norm": 2.15625, "learning_rate": 1.5202000000000001e-05, "loss": 0.2159, "step": 2400 }, { "epoch": 2.0758048885538924, "grad_norm": 2.03125, "learning_rate": 1.5182000000000001e-05, "loss": 0.1967, "step": 2410 }, { "epoch": 2.084419080435017, "grad_norm": 2.140625, "learning_rate": 1.5162000000000002e-05, "loss": 0.2168, "step": 2420 }, { "epoch": 2.093033272316141, "grad_norm": 1.921875, "learning_rate": 1.5142000000000002e-05, "loss": 0.2028, "step": 2430 }, { "epoch": 2.101647464197265, "grad_norm": 2.015625, "learning_rate": 1.5122000000000002e-05, "loss": 0.1958, "step": 2440 }, { "epoch": 2.1102616560783893, "grad_norm": 2.046875, "learning_rate": 1.5102e-05, "loss": 0.2105, "step": 2450 }, { "epoch": 2.1188758479595133, "grad_norm": 1.953125, "learning_rate": 1.5082e-05, "loss": 0.2027, "step": 2460 }, { "epoch": 2.1274900398406373, "grad_norm": 2.09375, "learning_rate": 1.5062e-05, "loss": 0.207, "step": 2470 }, { "epoch": 2.136104231721762, "grad_norm": 2.0, "learning_rate": 1.5042000000000001e-05, "loss": 0.2125, "step": 2480 }, { "epoch": 2.144718423602886, "grad_norm": 2.078125, "learning_rate": 1.5022000000000001e-05, "loss": 0.2052, "step": 2490 }, { "epoch": 2.15333261548401, "grad_norm": 2.15625, "learning_rate": 1.5002000000000001e-05, "loss": 0.2057, "step": 2500 }, { "epoch": 2.1619468073651342, "grad_norm": 2.109375, "learning_rate": 1.4982000000000002e-05, "loss": 0.2159, "step": 2510 }, { "epoch": 2.1705609992462582, "grad_norm": 1.9453125, "learning_rate": 1.4962000000000002e-05, "loss": 0.2088, "step": 2520 }, { "epoch": 2.1791751911273822, "grad_norm": 2.125, "learning_rate": 1.4942e-05, "loss": 0.2128, "step": 2530 }, { "epoch": 2.1877893830085067, "grad_norm": 1.984375, "learning_rate": 1.4922e-05, "loss": 0.2143, "step": 2540 }, { "epoch": 2.1964035748896307, "grad_norm": 2.109375, "learning_rate": 1.4902e-05, "loss": 0.2098, "step": 2550 }, { "epoch": 2.2050177667707547, "grad_norm": 2.1875, "learning_rate": 1.4882e-05, "loss": 0.2027, "step": 2560 }, { "epoch": 2.213631958651879, "grad_norm": 1.9765625, "learning_rate": 1.4862000000000001e-05, "loss": 0.2082, "step": 2570 }, { "epoch": 2.222246150533003, "grad_norm": 2.03125, "learning_rate": 1.4842000000000001e-05, "loss": 0.2132, "step": 2580 }, { "epoch": 2.230860342414127, "grad_norm": 2.078125, "learning_rate": 1.4822000000000001e-05, "loss": 0.2008, "step": 2590 }, { "epoch": 2.2394745342952516, "grad_norm": 2.1875, "learning_rate": 1.4802000000000002e-05, "loss": 0.207, "step": 2600 }, { "epoch": 2.2480887261763756, "grad_norm": 1.9765625, "learning_rate": 1.4782e-05, "loss": 0.2129, "step": 2610 }, { "epoch": 2.2567029180574996, "grad_norm": 2.375, "learning_rate": 1.4762e-05, "loss": 0.2085, "step": 2620 }, { "epoch": 2.265317109938624, "grad_norm": 2.140625, "learning_rate": 1.4742e-05, "loss": 0.216, "step": 2630 }, { "epoch": 2.273931301819748, "grad_norm": 1.984375, "learning_rate": 1.4722e-05, "loss": 0.2133, "step": 2640 }, { "epoch": 2.282545493700872, "grad_norm": 2.34375, "learning_rate": 1.4702000000000001e-05, "loss": 0.2032, "step": 2650 }, { "epoch": 2.2911596855819965, "grad_norm": 2.203125, "learning_rate": 1.4682000000000001e-05, "loss": 0.2248, "step": 2660 }, { "epoch": 2.2997738774631205, "grad_norm": 2.25, "learning_rate": 1.4662000000000001e-05, "loss": 0.2056, "step": 2670 }, { "epoch": 2.3083880693442445, "grad_norm": 2.046875, "learning_rate": 1.4642000000000001e-05, "loss": 0.2115, "step": 2680 }, { "epoch": 2.317002261225369, "grad_norm": 2.296875, "learning_rate": 1.4622e-05, "loss": 0.1984, "step": 2690 }, { "epoch": 2.325616453106493, "grad_norm": 2.140625, "learning_rate": 1.4602e-05, "loss": 0.2122, "step": 2700 }, { "epoch": 2.334230644987617, "grad_norm": 2.09375, "learning_rate": 1.4582e-05, "loss": 0.2094, "step": 2710 }, { "epoch": 2.3428448368687413, "grad_norm": 2.109375, "learning_rate": 1.4562e-05, "loss": 0.2225, "step": 2720 }, { "epoch": 2.3514590287498653, "grad_norm": 2.421875, "learning_rate": 1.4542e-05, "loss": 0.2131, "step": 2730 }, { "epoch": 2.36007322063099, "grad_norm": 2.40625, "learning_rate": 1.4522000000000001e-05, "loss": 0.2118, "step": 2740 }, { "epoch": 2.368687412512114, "grad_norm": 2.078125, "learning_rate": 1.4502000000000001e-05, "loss": 0.2148, "step": 2750 }, { "epoch": 2.377301604393238, "grad_norm": 2.375, "learning_rate": 1.4482000000000001e-05, "loss": 0.214, "step": 2760 }, { "epoch": 2.385915796274362, "grad_norm": 2.078125, "learning_rate": 1.4462e-05, "loss": 0.2028, "step": 2770 }, { "epoch": 2.3945299881554862, "grad_norm": 2.25, "learning_rate": 1.4442e-05, "loss": 0.2073, "step": 2780 }, { "epoch": 2.4031441800366102, "grad_norm": 1.9609375, "learning_rate": 1.4422e-05, "loss": 0.2145, "step": 2790 }, { "epoch": 2.4117583719177347, "grad_norm": 2.90625, "learning_rate": 1.4402e-05, "loss": 0.2069, "step": 2800 }, { "epoch": 2.4203725637988587, "grad_norm": 1.8984375, "learning_rate": 1.4382e-05, "loss": 0.2216, "step": 2810 }, { "epoch": 2.4289867556799827, "grad_norm": 2.03125, "learning_rate": 1.4362e-05, "loss": 0.2117, "step": 2820 }, { "epoch": 2.4376009475611067, "grad_norm": 1.9609375, "learning_rate": 1.4342000000000001e-05, "loss": 0.219, "step": 2830 }, { "epoch": 2.446215139442231, "grad_norm": 2.28125, "learning_rate": 1.4322000000000001e-05, "loss": 0.2061, "step": 2840 }, { "epoch": 2.454829331323355, "grad_norm": 2.03125, "learning_rate": 1.4302e-05, "loss": 0.2182, "step": 2850 }, { "epoch": 2.4634435232044796, "grad_norm": 2.21875, "learning_rate": 1.4282e-05, "loss": 0.2135, "step": 2860 }, { "epoch": 2.4720577150856036, "grad_norm": 2.125, "learning_rate": 1.4262e-05, "loss": 0.215, "step": 2870 }, { "epoch": 2.4806719069667276, "grad_norm": 2.015625, "learning_rate": 1.4242e-05, "loss": 0.2105, "step": 2880 }, { "epoch": 2.489286098847852, "grad_norm": 2.078125, "learning_rate": 1.4222e-05, "loss": 0.2067, "step": 2890 }, { "epoch": 2.497900290728976, "grad_norm": 2.359375, "learning_rate": 1.4202e-05, "loss": 0.2145, "step": 2900 }, { "epoch": 2.5065144826101, "grad_norm": 2.328125, "learning_rate": 1.4182e-05, "loss": 0.2191, "step": 2910 }, { "epoch": 2.5151286744912245, "grad_norm": 2.375, "learning_rate": 1.4162000000000001e-05, "loss": 0.2068, "step": 2920 }, { "epoch": 2.5237428663723485, "grad_norm": 2.015625, "learning_rate": 1.4142e-05, "loss": 0.2025, "step": 2930 }, { "epoch": 2.5323570582534725, "grad_norm": 2.140625, "learning_rate": 1.4122e-05, "loss": 0.2089, "step": 2940 }, { "epoch": 2.540971250134597, "grad_norm": 2.296875, "learning_rate": 1.4102e-05, "loss": 0.2167, "step": 2950 }, { "epoch": 2.549585442015721, "grad_norm": 2.140625, "learning_rate": 1.4082e-05, "loss": 0.2049, "step": 2960 }, { "epoch": 2.558199633896845, "grad_norm": 1.984375, "learning_rate": 1.4062e-05, "loss": 0.2034, "step": 2970 }, { "epoch": 2.5668138257779693, "grad_norm": 1.9296875, "learning_rate": 1.4042e-05, "loss": 0.2095, "step": 2980 }, { "epoch": 2.5754280176590933, "grad_norm": 2.34375, "learning_rate": 1.4022e-05, "loss": 0.2047, "step": 2990 }, { "epoch": 2.5840422095402173, "grad_norm": 2.234375, "learning_rate": 1.4002e-05, "loss": 0.2055, "step": 3000 }, { "epoch": 2.592656401421342, "grad_norm": 2.203125, "learning_rate": 1.3982000000000003e-05, "loss": 0.211, "step": 3010 }, { "epoch": 2.601270593302466, "grad_norm": 1.90625, "learning_rate": 1.3962000000000003e-05, "loss": 0.2009, "step": 3020 }, { "epoch": 2.6098847851835902, "grad_norm": 2.28125, "learning_rate": 1.3942000000000001e-05, "loss": 0.2173, "step": 3030 }, { "epoch": 2.6184989770647142, "grad_norm": 2.25, "learning_rate": 1.3922000000000002e-05, "loss": 0.2102, "step": 3040 }, { "epoch": 2.6271131689458382, "grad_norm": 1.9140625, "learning_rate": 1.3902000000000002e-05, "loss": 0.2102, "step": 3050 }, { "epoch": 2.6357273608269622, "grad_norm": 2.34375, "learning_rate": 1.3882000000000002e-05, "loss": 0.213, "step": 3060 }, { "epoch": 2.6443415527080867, "grad_norm": 2.0, "learning_rate": 1.3862000000000002e-05, "loss": 0.2117, "step": 3070 }, { "epoch": 2.6529557445892107, "grad_norm": 2.15625, "learning_rate": 1.3842000000000002e-05, "loss": 0.2142, "step": 3080 }, { "epoch": 2.661569936470335, "grad_norm": 2.203125, "learning_rate": 1.3822000000000003e-05, "loss": 0.202, "step": 3090 }, { "epoch": 2.670184128351459, "grad_norm": 2.671875, "learning_rate": 1.3802000000000003e-05, "loss": 0.2133, "step": 3100 }, { "epoch": 2.678798320232583, "grad_norm": 2.25, "learning_rate": 1.3782000000000001e-05, "loss": 0.2086, "step": 3110 }, { "epoch": 2.687412512113707, "grad_norm": 1.984375, "learning_rate": 1.3762000000000001e-05, "loss": 0.2032, "step": 3120 }, { "epoch": 2.6960267039948316, "grad_norm": 2.40625, "learning_rate": 1.3742000000000002e-05, "loss": 0.2157, "step": 3130 }, { "epoch": 2.7046408958759556, "grad_norm": 1.90625, "learning_rate": 1.3722000000000002e-05, "loss": 0.2139, "step": 3140 }, { "epoch": 2.71325508775708, "grad_norm": 2.125, "learning_rate": 1.3702000000000002e-05, "loss": 0.2177, "step": 3150 }, { "epoch": 2.721869279638204, "grad_norm": 2.1875, "learning_rate": 1.3682000000000002e-05, "loss": 0.2082, "step": 3160 }, { "epoch": 2.730483471519328, "grad_norm": 2.109375, "learning_rate": 1.3662000000000002e-05, "loss": 0.2093, "step": 3170 }, { "epoch": 2.739097663400452, "grad_norm": 1.984375, "learning_rate": 1.3642000000000003e-05, "loss": 0.2054, "step": 3180 }, { "epoch": 2.7477118552815765, "grad_norm": 2.34375, "learning_rate": 1.3622000000000003e-05, "loss": 0.2007, "step": 3190 }, { "epoch": 2.7563260471627005, "grad_norm": 2.34375, "learning_rate": 1.3602000000000001e-05, "loss": 0.2109, "step": 3200 }, { "epoch": 2.764940239043825, "grad_norm": 2.203125, "learning_rate": 1.3582000000000001e-05, "loss": 0.2106, "step": 3210 }, { "epoch": 2.773554430924949, "grad_norm": 2.515625, "learning_rate": 1.3562000000000002e-05, "loss": 0.2101, "step": 3220 }, { "epoch": 2.782168622806073, "grad_norm": 2.109375, "learning_rate": 1.3542000000000002e-05, "loss": 0.216, "step": 3230 }, { "epoch": 2.790782814687197, "grad_norm": 2.109375, "learning_rate": 1.3522000000000002e-05, "loss": 0.2103, "step": 3240 }, { "epoch": 2.7993970065683214, "grad_norm": 2.015625, "learning_rate": 1.3502000000000002e-05, "loss": 0.2083, "step": 3250 }, { "epoch": 2.8080111984494454, "grad_norm": 2.203125, "learning_rate": 1.3482000000000002e-05, "loss": 0.2124, "step": 3260 }, { "epoch": 2.81662539033057, "grad_norm": 2.546875, "learning_rate": 1.3462000000000003e-05, "loss": 0.2114, "step": 3270 }, { "epoch": 2.825239582211694, "grad_norm": 2.0, "learning_rate": 1.3442000000000001e-05, "loss": 0.2122, "step": 3280 }, { "epoch": 2.833853774092818, "grad_norm": 2.203125, "learning_rate": 1.3422000000000001e-05, "loss": 0.217, "step": 3290 }, { "epoch": 2.842467965973942, "grad_norm": 2.34375, "learning_rate": 1.3402000000000001e-05, "loss": 0.2137, "step": 3300 }, { "epoch": 2.8510821578550662, "grad_norm": 2.0, "learning_rate": 1.3382000000000002e-05, "loss": 0.2171, "step": 3310 }, { "epoch": 2.8596963497361902, "grad_norm": 2.09375, "learning_rate": 1.3362000000000002e-05, "loss": 0.21, "step": 3320 }, { "epoch": 2.8683105416173147, "grad_norm": 2.203125, "learning_rate": 1.3342000000000002e-05, "loss": 0.2083, "step": 3330 }, { "epoch": 2.8769247334984387, "grad_norm": 2.171875, "learning_rate": 1.3322000000000002e-05, "loss": 0.2178, "step": 3340 }, { "epoch": 2.8855389253795627, "grad_norm": 2.015625, "learning_rate": 1.3302000000000002e-05, "loss": 0.2086, "step": 3350 }, { "epoch": 2.894153117260687, "grad_norm": 1.953125, "learning_rate": 1.3282000000000001e-05, "loss": 0.2144, "step": 3360 }, { "epoch": 2.902767309141811, "grad_norm": 2.09375, "learning_rate": 1.3262000000000001e-05, "loss": 0.2067, "step": 3370 }, { "epoch": 2.911381501022935, "grad_norm": 2.125, "learning_rate": 1.3242000000000001e-05, "loss": 0.2104, "step": 3380 }, { "epoch": 2.9199956929040596, "grad_norm": 2.25, "learning_rate": 1.3222000000000001e-05, "loss": 0.2135, "step": 3390 }, { "epoch": 2.9286098847851836, "grad_norm": 2.328125, "learning_rate": 1.3202000000000002e-05, "loss": 0.2095, "step": 3400 }, { "epoch": 2.9372240766663076, "grad_norm": 2.234375, "learning_rate": 1.3182000000000002e-05, "loss": 0.2147, "step": 3410 }, { "epoch": 2.945838268547432, "grad_norm": 2.484375, "learning_rate": 1.3162000000000002e-05, "loss": 0.2146, "step": 3420 }, { "epoch": 2.954452460428556, "grad_norm": 2.28125, "learning_rate": 1.3142000000000002e-05, "loss": 0.2137, "step": 3430 }, { "epoch": 2.9630666523096805, "grad_norm": 2.09375, "learning_rate": 1.3122e-05, "loss": 0.2111, "step": 3440 }, { "epoch": 2.9716808441908045, "grad_norm": 2.15625, "learning_rate": 1.3102000000000001e-05, "loss": 0.2194, "step": 3450 }, { "epoch": 2.9802950360719285, "grad_norm": 2.265625, "learning_rate": 1.3082000000000001e-05, "loss": 0.21, "step": 3460 }, { "epoch": 2.9889092279530525, "grad_norm": 2.109375, "learning_rate": 1.3062000000000001e-05, "loss": 0.2004, "step": 3470 }, { "epoch": 2.997523419834177, "grad_norm": 2.15625, "learning_rate": 1.3042000000000002e-05, "loss": 0.212, "step": 3480 }, { "epoch": 3.006029934316787, "grad_norm": 2.328125, "learning_rate": 1.3022000000000002e-05, "loss": 0.1743, "step": 3490 }, { "epoch": 3.014644126197911, "grad_norm": 2.28125, "learning_rate": 1.3002000000000002e-05, "loss": 0.1524, "step": 3500 }, { "epoch": 3.0232583180790353, "grad_norm": 3.359375, "learning_rate": 1.2982000000000002e-05, "loss": 0.1476, "step": 3510 }, { "epoch": 3.0318725099601593, "grad_norm": 2.359375, "learning_rate": 1.2962e-05, "loss": 0.1408, "step": 3520 }, { "epoch": 3.0404867018412833, "grad_norm": 3.03125, "learning_rate": 1.2942e-05, "loss": 0.1495, "step": 3530 }, { "epoch": 3.049100893722408, "grad_norm": 2.296875, "learning_rate": 1.2922000000000001e-05, "loss": 0.1457, "step": 3540 }, { "epoch": 3.057715085603532, "grad_norm": 2.71875, "learning_rate": 1.2902000000000001e-05, "loss": 0.1483, "step": 3550 }, { "epoch": 3.066329277484656, "grad_norm": 2.234375, "learning_rate": 1.2882000000000001e-05, "loss": 0.1451, "step": 3560 }, { "epoch": 3.0749434693657802, "grad_norm": 2.5, "learning_rate": 1.2862000000000002e-05, "loss": 0.1378, "step": 3570 }, { "epoch": 3.0835576612469042, "grad_norm": 2.53125, "learning_rate": 1.2842000000000002e-05, "loss": 0.1539, "step": 3580 }, { "epoch": 3.0921718531280282, "grad_norm": 2.46875, "learning_rate": 1.2822000000000002e-05, "loss": 0.1503, "step": 3590 }, { "epoch": 3.1007860450091527, "grad_norm": 2.15625, "learning_rate": 1.2802e-05, "loss": 0.147, "step": 3600 }, { "epoch": 3.1094002368902767, "grad_norm": 2.53125, "learning_rate": 1.2782e-05, "loss": 0.1447, "step": 3610 }, { "epoch": 3.1180144287714007, "grad_norm": 2.5625, "learning_rate": 1.2762e-05, "loss": 0.1568, "step": 3620 }, { "epoch": 3.126628620652525, "grad_norm": 2.640625, "learning_rate": 1.2742000000000001e-05, "loss": 0.1397, "step": 3630 }, { "epoch": 3.135242812533649, "grad_norm": 2.546875, "learning_rate": 1.2722000000000001e-05, "loss": 0.1347, "step": 3640 }, { "epoch": 3.143857004414773, "grad_norm": 2.578125, "learning_rate": 1.2702000000000001e-05, "loss": 0.1487, "step": 3650 }, { "epoch": 3.1524711962958976, "grad_norm": 2.296875, "learning_rate": 1.2682000000000002e-05, "loss": 0.1458, "step": 3660 }, { "epoch": 3.1610853881770216, "grad_norm": 2.359375, "learning_rate": 1.2662000000000002e-05, "loss": 0.139, "step": 3670 }, { "epoch": 3.169699580058146, "grad_norm": 2.46875, "learning_rate": 1.2642e-05, "loss": 0.141, "step": 3680 }, { "epoch": 3.17831377193927, "grad_norm": 2.640625, "learning_rate": 1.2622e-05, "loss": 0.1355, "step": 3690 }, { "epoch": 3.186927963820394, "grad_norm": 2.703125, "learning_rate": 1.2602e-05, "loss": 0.149, "step": 3700 }, { "epoch": 3.1955421557015184, "grad_norm": 2.546875, "learning_rate": 1.2582e-05, "loss": 0.1485, "step": 3710 }, { "epoch": 3.2041563475826425, "grad_norm": 2.4375, "learning_rate": 1.2562000000000001e-05, "loss": 0.1475, "step": 3720 }, { "epoch": 3.2127705394637665, "grad_norm": 2.671875, "learning_rate": 1.2542000000000001e-05, "loss": 0.1469, "step": 3730 }, { "epoch": 3.221384731344891, "grad_norm": 2.625, "learning_rate": 1.2522000000000001e-05, "loss": 0.1448, "step": 3740 }, { "epoch": 3.229998923226015, "grad_norm": 2.1875, "learning_rate": 1.2502000000000002e-05, "loss": 0.1454, "step": 3750 }, { "epoch": 3.238613115107139, "grad_norm": 2.234375, "learning_rate": 1.2482e-05, "loss": 0.1437, "step": 3760 }, { "epoch": 3.2472273069882633, "grad_norm": 2.515625, "learning_rate": 1.2462e-05, "loss": 0.1521, "step": 3770 }, { "epoch": 3.2558414988693873, "grad_norm": 2.40625, "learning_rate": 1.2442e-05, "loss": 0.1462, "step": 3780 }, { "epoch": 3.2644556907505113, "grad_norm": 2.84375, "learning_rate": 1.2422e-05, "loss": 0.1528, "step": 3790 }, { "epoch": 3.273069882631636, "grad_norm": 2.5, "learning_rate": 1.2402000000000001e-05, "loss": 0.1438, "step": 3800 }, { "epoch": 3.28168407451276, "grad_norm": 2.71875, "learning_rate": 1.2382000000000001e-05, "loss": 0.1481, "step": 3810 }, { "epoch": 3.290298266393884, "grad_norm": 2.59375, "learning_rate": 1.2362000000000001e-05, "loss": 0.1521, "step": 3820 }, { "epoch": 3.2989124582750082, "grad_norm": 2.375, "learning_rate": 1.2342000000000001e-05, "loss": 0.1453, "step": 3830 }, { "epoch": 3.3075266501561322, "grad_norm": 2.75, "learning_rate": 1.2322e-05, "loss": 0.1453, "step": 3840 }, { "epoch": 3.3161408420372562, "grad_norm": 2.328125, "learning_rate": 1.2302e-05, "loss": 0.1512, "step": 3850 }, { "epoch": 3.3247550339183807, "grad_norm": 2.484375, "learning_rate": 1.2282e-05, "loss": 0.1484, "step": 3860 }, { "epoch": 3.3333692257995047, "grad_norm": 2.6875, "learning_rate": 1.2262e-05, "loss": 0.1374, "step": 3870 }, { "epoch": 3.3419834176806287, "grad_norm": 2.515625, "learning_rate": 1.2242e-05, "loss": 0.149, "step": 3880 }, { "epoch": 3.350597609561753, "grad_norm": 2.75, "learning_rate": 1.2222000000000001e-05, "loss": 0.151, "step": 3890 }, { "epoch": 3.359211801442877, "grad_norm": 2.328125, "learning_rate": 1.2202000000000001e-05, "loss": 0.143, "step": 3900 }, { "epoch": 3.367825993324001, "grad_norm": 2.6875, "learning_rate": 1.2182000000000001e-05, "loss": 0.1484, "step": 3910 }, { "epoch": 3.3764401852051256, "grad_norm": 2.234375, "learning_rate": 1.2162e-05, "loss": 0.1532, "step": 3920 }, { "epoch": 3.3850543770862496, "grad_norm": 2.640625, "learning_rate": 1.2142e-05, "loss": 0.1472, "step": 3930 }, { "epoch": 3.3936685689673736, "grad_norm": 2.5, "learning_rate": 1.2122e-05, "loss": 0.1448, "step": 3940 }, { "epoch": 3.402282760848498, "grad_norm": 2.421875, "learning_rate": 1.2102e-05, "loss": 0.1548, "step": 3950 }, { "epoch": 3.410896952729622, "grad_norm": 2.734375, "learning_rate": 1.2082e-05, "loss": 0.1372, "step": 3960 }, { "epoch": 3.419511144610746, "grad_norm": 2.921875, "learning_rate": 1.2062e-05, "loss": 0.1466, "step": 3970 }, { "epoch": 3.4281253364918705, "grad_norm": 2.421875, "learning_rate": 1.2042000000000001e-05, "loss": 0.1406, "step": 3980 }, { "epoch": 3.4367395283729945, "grad_norm": 2.765625, "learning_rate": 1.2022000000000001e-05, "loss": 0.1516, "step": 3990 }, { "epoch": 3.4453537202541185, "grad_norm": 2.59375, "learning_rate": 1.2002e-05, "loss": 0.1491, "step": 4000 }, { "epoch": 3.453967912135243, "grad_norm": 2.6875, "learning_rate": 1.1982e-05, "loss": 0.146, "step": 4010 }, { "epoch": 3.462582104016367, "grad_norm": 2.53125, "learning_rate": 1.1962e-05, "loss": 0.1438, "step": 4020 }, { "epoch": 3.4711962958974913, "grad_norm": 2.96875, "learning_rate": 1.1942e-05, "loss": 0.1372, "step": 4030 }, { "epoch": 3.4798104877786153, "grad_norm": 2.359375, "learning_rate": 1.1922e-05, "loss": 0.1501, "step": 4040 }, { "epoch": 3.4884246796597393, "grad_norm": 2.71875, "learning_rate": 1.1902e-05, "loss": 0.1515, "step": 4050 }, { "epoch": 3.4970388715408633, "grad_norm": 2.796875, "learning_rate": 1.1882e-05, "loss": 0.151, "step": 4060 }, { "epoch": 3.505653063421988, "grad_norm": 2.53125, "learning_rate": 1.1862000000000001e-05, "loss": 0.1468, "step": 4070 }, { "epoch": 3.514267255303112, "grad_norm": 2.6875, "learning_rate": 1.1842e-05, "loss": 0.1477, "step": 4080 }, { "epoch": 3.5228814471842362, "grad_norm": 2.375, "learning_rate": 1.1822e-05, "loss": 0.1516, "step": 4090 }, { "epoch": 3.5314956390653602, "grad_norm": 2.5625, "learning_rate": 1.1802e-05, "loss": 0.145, "step": 4100 }, { "epoch": 3.5401098309464842, "grad_norm": 2.671875, "learning_rate": 1.1782e-05, "loss": 0.1434, "step": 4110 }, { "epoch": 3.5487240228276082, "grad_norm": 2.515625, "learning_rate": 1.1762e-05, "loss": 0.1424, "step": 4120 }, { "epoch": 3.5573382147087327, "grad_norm": 2.140625, "learning_rate": 1.1742e-05, "loss": 0.1433, "step": 4130 }, { "epoch": 3.5659524065898567, "grad_norm": 2.578125, "learning_rate": 1.1722e-05, "loss": 0.1511, "step": 4140 }, { "epoch": 3.574566598470981, "grad_norm": 2.6875, "learning_rate": 1.1702e-05, "loss": 0.149, "step": 4150 }, { "epoch": 3.583180790352105, "grad_norm": 2.6875, "learning_rate": 1.1682e-05, "loss": 0.151, "step": 4160 }, { "epoch": 3.591794982233229, "grad_norm": 2.796875, "learning_rate": 1.1662e-05, "loss": 0.1412, "step": 4170 }, { "epoch": 3.600409174114353, "grad_norm": 2.359375, "learning_rate": 1.1642e-05, "loss": 0.1461, "step": 4180 }, { "epoch": 3.6090233659954776, "grad_norm": 2.546875, "learning_rate": 1.1622e-05, "loss": 0.1481, "step": 4190 }, { "epoch": 3.6176375578766016, "grad_norm": 2.484375, "learning_rate": 1.1602e-05, "loss": 0.1535, "step": 4200 }, { "epoch": 3.626251749757726, "grad_norm": 2.5, "learning_rate": 1.1582e-05, "loss": 0.1579, "step": 4210 }, { "epoch": 3.63486594163885, "grad_norm": 3.078125, "learning_rate": 1.1562e-05, "loss": 0.1528, "step": 4220 }, { "epoch": 3.643480133519974, "grad_norm": 2.625, "learning_rate": 1.1542e-05, "loss": 0.1436, "step": 4230 }, { "epoch": 3.6520943254010985, "grad_norm": 2.78125, "learning_rate": 1.1521999999999999e-05, "loss": 0.1488, "step": 4240 }, { "epoch": 3.6607085172822225, "grad_norm": 2.53125, "learning_rate": 1.1502e-05, "loss": 0.1404, "step": 4250 }, { "epoch": 3.6693227091633465, "grad_norm": 2.5625, "learning_rate": 1.1482000000000001e-05, "loss": 0.1584, "step": 4260 }, { "epoch": 3.677936901044471, "grad_norm": 2.65625, "learning_rate": 1.1462000000000001e-05, "loss": 0.1429, "step": 4270 }, { "epoch": 3.686551092925595, "grad_norm": 2.9375, "learning_rate": 1.1442000000000002e-05, "loss": 0.1478, "step": 4280 }, { "epoch": 3.695165284806719, "grad_norm": 2.28125, "learning_rate": 1.1422000000000002e-05, "loss": 0.1495, "step": 4290 }, { "epoch": 3.7037794766878434, "grad_norm": 2.65625, "learning_rate": 1.1402000000000002e-05, "loss": 0.1487, "step": 4300 }, { "epoch": 3.7123936685689674, "grad_norm": 2.75, "learning_rate": 1.1382000000000002e-05, "loss": 0.1515, "step": 4310 }, { "epoch": 3.721007860450092, "grad_norm": 2.4375, "learning_rate": 1.1362000000000002e-05, "loss": 0.1524, "step": 4320 }, { "epoch": 3.729622052331216, "grad_norm": 2.65625, "learning_rate": 1.1342000000000003e-05, "loss": 0.1506, "step": 4330 }, { "epoch": 3.73823624421234, "grad_norm": 2.609375, "learning_rate": 1.1322000000000001e-05, "loss": 0.1448, "step": 4340 }, { "epoch": 3.746850436093464, "grad_norm": 2.390625, "learning_rate": 1.1302000000000001e-05, "loss": 0.1444, "step": 4350 }, { "epoch": 3.7554646279745882, "grad_norm": 2.203125, "learning_rate": 1.1282000000000001e-05, "loss": 0.1483, "step": 4360 }, { "epoch": 3.7640788198557122, "grad_norm": 2.953125, "learning_rate": 1.1262000000000002e-05, "loss": 0.1524, "step": 4370 }, { "epoch": 3.7726930117368367, "grad_norm": 2.640625, "learning_rate": 1.1242000000000002e-05, "loss": 0.1553, "step": 4380 }, { "epoch": 3.7813072036179607, "grad_norm": 2.484375, "learning_rate": 1.1222000000000002e-05, "loss": 0.156, "step": 4390 }, { "epoch": 3.7899213954990847, "grad_norm": 2.59375, "learning_rate": 1.1202000000000002e-05, "loss": 0.1543, "step": 4400 }, { "epoch": 3.7985355873802087, "grad_norm": 2.515625, "learning_rate": 1.1182000000000002e-05, "loss": 0.1521, "step": 4410 }, { "epoch": 3.807149779261333, "grad_norm": 3.046875, "learning_rate": 1.1162000000000003e-05, "loss": 0.1515, "step": 4420 }, { "epoch": 3.815763971142457, "grad_norm": 2.703125, "learning_rate": 1.1142000000000001e-05, "loss": 0.1527, "step": 4430 }, { "epoch": 3.8243781630235816, "grad_norm": 3.015625, "learning_rate": 1.1122000000000001e-05, "loss": 0.1543, "step": 4440 }, { "epoch": 3.8329923549047056, "grad_norm": 2.546875, "learning_rate": 1.1102000000000001e-05, "loss": 0.1461, "step": 4450 }, { "epoch": 3.8416065467858296, "grad_norm": 3.03125, "learning_rate": 1.1082000000000002e-05, "loss": 0.1448, "step": 4460 }, { "epoch": 3.8502207386669536, "grad_norm": 2.453125, "learning_rate": 1.1062000000000002e-05, "loss": 0.1462, "step": 4470 }, { "epoch": 3.858834930548078, "grad_norm": 2.59375, "learning_rate": 1.1042000000000002e-05, "loss": 0.146, "step": 4480 }, { "epoch": 3.867449122429202, "grad_norm": 2.84375, "learning_rate": 1.1022000000000002e-05, "loss": 0.1416, "step": 4490 }, { "epoch": 3.8760633143103265, "grad_norm": 2.359375, "learning_rate": 1.1002000000000002e-05, "loss": 0.156, "step": 4500 }, { "epoch": 3.8846775061914505, "grad_norm": 3.140625, "learning_rate": 1.0982000000000001e-05, "loss": 0.1462, "step": 4510 }, { "epoch": 3.8932916980725745, "grad_norm": 2.578125, "learning_rate": 1.0962000000000001e-05, "loss": 0.1447, "step": 4520 }, { "epoch": 3.9019058899536985, "grad_norm": 2.78125, "learning_rate": 1.0942000000000001e-05, "loss": 0.1467, "step": 4530 }, { "epoch": 3.910520081834823, "grad_norm": 2.53125, "learning_rate": 1.0922000000000001e-05, "loss": 0.1452, "step": 4540 }, { "epoch": 3.919134273715947, "grad_norm": 2.609375, "learning_rate": 1.0902000000000002e-05, "loss": 0.1519, "step": 4550 }, { "epoch": 3.9277484655970714, "grad_norm": 2.296875, "learning_rate": 1.0882000000000002e-05, "loss": 0.1485, "step": 4560 }, { "epoch": 3.9363626574781954, "grad_norm": 2.921875, "learning_rate": 1.0862000000000002e-05, "loss": 0.1473, "step": 4570 }, { "epoch": 3.9449768493593194, "grad_norm": 2.78125, "learning_rate": 1.0842000000000002e-05, "loss": 0.1462, "step": 4580 }, { "epoch": 3.9535910412404434, "grad_norm": 2.5625, "learning_rate": 1.0822e-05, "loss": 0.1472, "step": 4590 }, { "epoch": 3.962205233121568, "grad_norm": 2.484375, "learning_rate": 1.0802000000000001e-05, "loss": 0.1497, "step": 4600 }, { "epoch": 3.970819425002692, "grad_norm": 2.421875, "learning_rate": 1.0782000000000001e-05, "loss": 0.1474, "step": 4610 }, { "epoch": 3.9794336168838162, "grad_norm": 2.46875, "learning_rate": 1.0762000000000001e-05, "loss": 0.1477, "step": 4620 }, { "epoch": 3.9880478087649402, "grad_norm": 2.734375, "learning_rate": 1.0742000000000002e-05, "loss": 0.1508, "step": 4630 }, { "epoch": 3.9966620006460643, "grad_norm": 2.515625, "learning_rate": 1.0722000000000002e-05, "loss": 0.1449, "step": 4640 }, { "epoch": 4.005168515128674, "grad_norm": 2.328125, "learning_rate": 1.0702000000000002e-05, "loss": 0.1182, "step": 4650 }, { "epoch": 4.013782707009798, "grad_norm": 2.75, "learning_rate": 1.0682000000000002e-05, "loss": 0.1047, "step": 4660 }, { "epoch": 4.022396898890923, "grad_norm": 3.0625, "learning_rate": 1.0662e-05, "loss": 0.0972, "step": 4670 }, { "epoch": 4.031011090772047, "grad_norm": 3.109375, "learning_rate": 1.0642e-05, "loss": 0.099, "step": 4680 }, { "epoch": 4.039625282653171, "grad_norm": 2.609375, "learning_rate": 1.0622000000000001e-05, "loss": 0.0977, "step": 4690 }, { "epoch": 4.048239474534295, "grad_norm": 2.515625, "learning_rate": 1.0602000000000001e-05, "loss": 0.1003, "step": 4700 }, { "epoch": 4.056853666415419, "grad_norm": 2.625, "learning_rate": 1.0582000000000001e-05, "loss": 0.0964, "step": 4710 }, { "epoch": 4.065467858296543, "grad_norm": 2.625, "learning_rate": 1.0562000000000002e-05, "loss": 0.0969, "step": 4720 }, { "epoch": 4.074082050177668, "grad_norm": 3.234375, "learning_rate": 1.0542000000000002e-05, "loss": 0.1009, "step": 4730 }, { "epoch": 4.082696242058792, "grad_norm": 2.625, "learning_rate": 1.0522000000000002e-05, "loss": 0.1039, "step": 4740 }, { "epoch": 4.091310433939916, "grad_norm": 2.65625, "learning_rate": 1.0502e-05, "loss": 0.102, "step": 4750 }, { "epoch": 4.09992462582104, "grad_norm": 2.71875, "learning_rate": 1.0482e-05, "loss": 0.0998, "step": 4760 }, { "epoch": 4.108538817702164, "grad_norm": 2.46875, "learning_rate": 1.0462e-05, "loss": 0.1001, "step": 4770 }, { "epoch": 4.117153009583289, "grad_norm": 2.390625, "learning_rate": 1.0442000000000001e-05, "loss": 0.0929, "step": 4780 }, { "epoch": 4.125767201464413, "grad_norm": 2.78125, "learning_rate": 1.0422000000000001e-05, "loss": 0.0971, "step": 4790 }, { "epoch": 4.134381393345537, "grad_norm": 2.65625, "learning_rate": 1.0402000000000001e-05, "loss": 0.0997, "step": 4800 }, { "epoch": 4.142995585226661, "grad_norm": 2.515625, "learning_rate": 1.0382000000000002e-05, "loss": 0.1048, "step": 4810 }, { "epoch": 4.151609777107785, "grad_norm": 2.84375, "learning_rate": 1.0362000000000002e-05, "loss": 0.1024, "step": 4820 }, { "epoch": 4.160223968988909, "grad_norm": 2.828125, "learning_rate": 1.0342e-05, "loss": 0.1015, "step": 4830 }, { "epoch": 4.168838160870034, "grad_norm": 2.875, "learning_rate": 1.0322e-05, "loss": 0.1015, "step": 4840 }, { "epoch": 4.177452352751158, "grad_norm": 3.15625, "learning_rate": 1.0302e-05, "loss": 0.0962, "step": 4850 }, { "epoch": 4.186066544632282, "grad_norm": 2.84375, "learning_rate": 1.0282e-05, "loss": 0.1004, "step": 4860 }, { "epoch": 4.194680736513406, "grad_norm": 2.8125, "learning_rate": 1.0262000000000001e-05, "loss": 0.0913, "step": 4870 }, { "epoch": 4.20329492839453, "grad_norm": 4.0, "learning_rate": 1.0242000000000001e-05, "loss": 0.0974, "step": 4880 }, { "epoch": 4.211909120275654, "grad_norm": 2.875, "learning_rate": 1.0222000000000001e-05, "loss": 0.1002, "step": 4890 }, { "epoch": 4.220523312156779, "grad_norm": 2.546875, "learning_rate": 1.0202000000000002e-05, "loss": 0.1032, "step": 4900 }, { "epoch": 4.229137504037903, "grad_norm": 3.0, "learning_rate": 1.0182e-05, "loss": 0.0937, "step": 4910 }, { "epoch": 4.237751695919027, "grad_norm": 2.921875, "learning_rate": 1.0162e-05, "loss": 0.0974, "step": 4920 }, { "epoch": 4.246365887800151, "grad_norm": 2.703125, "learning_rate": 1.0142e-05, "loss": 0.1001, "step": 4930 }, { "epoch": 4.254980079681275, "grad_norm": 3.578125, "learning_rate": 1.0122e-05, "loss": 0.1008, "step": 4940 }, { "epoch": 4.263594271562399, "grad_norm": 2.578125, "learning_rate": 1.0102000000000001e-05, "loss": 0.0994, "step": 4950 }, { "epoch": 4.272208463443524, "grad_norm": 2.359375, "learning_rate": 1.0082000000000001e-05, "loss": 0.1013, "step": 4960 }, { "epoch": 4.280822655324648, "grad_norm": 3.4375, "learning_rate": 1.0062000000000001e-05, "loss": 0.0959, "step": 4970 }, { "epoch": 4.289436847205772, "grad_norm": 3.59375, "learning_rate": 1.0042000000000001e-05, "loss": 0.0992, "step": 4980 }, { "epoch": 4.298051039086896, "grad_norm": 2.734375, "learning_rate": 1.0022e-05, "loss": 0.0962, "step": 4990 }, { "epoch": 4.30666523096802, "grad_norm": 2.890625, "learning_rate": 1.0002e-05, "loss": 0.1062, "step": 5000 }, { "epoch": 4.315279422849144, "grad_norm": 2.71875, "learning_rate": 9.982e-06, "loss": 0.0995, "step": 5010 }, { "epoch": 4.3238936147302685, "grad_norm": 2.828125, "learning_rate": 9.962e-06, "loss": 0.1022, "step": 5020 }, { "epoch": 4.3325078066113925, "grad_norm": 2.6875, "learning_rate": 9.942e-06, "loss": 0.0996, "step": 5030 }, { "epoch": 4.3411219984925165, "grad_norm": 3.09375, "learning_rate": 9.922000000000001e-06, "loss": 0.0993, "step": 5040 }, { "epoch": 4.3497361903736405, "grad_norm": 2.84375, "learning_rate": 9.902000000000001e-06, "loss": 0.1029, "step": 5050 }, { "epoch": 4.3583503822547645, "grad_norm": 2.90625, "learning_rate": 9.882000000000001e-06, "loss": 0.1003, "step": 5060 }, { "epoch": 4.3669645741358885, "grad_norm": 2.984375, "learning_rate": 9.862e-06, "loss": 0.0979, "step": 5070 }, { "epoch": 4.375578766017013, "grad_norm": 2.734375, "learning_rate": 9.842e-06, "loss": 0.1003, "step": 5080 }, { "epoch": 4.384192957898137, "grad_norm": 2.34375, "learning_rate": 9.822e-06, "loss": 0.0981, "step": 5090 }, { "epoch": 4.392807149779261, "grad_norm": 2.90625, "learning_rate": 9.802e-06, "loss": 0.098, "step": 5100 }, { "epoch": 4.401421341660385, "grad_norm": 3.265625, "learning_rate": 9.782e-06, "loss": 0.1007, "step": 5110 }, { "epoch": 4.410035533541509, "grad_norm": 2.75, "learning_rate": 9.762e-06, "loss": 0.0968, "step": 5120 }, { "epoch": 4.418649725422634, "grad_norm": 2.75, "learning_rate": 9.742000000000001e-06, "loss": 0.1031, "step": 5130 }, { "epoch": 4.427263917303758, "grad_norm": 3.078125, "learning_rate": 9.722000000000001e-06, "loss": 0.0996, "step": 5140 }, { "epoch": 4.435878109184882, "grad_norm": 2.828125, "learning_rate": 9.702e-06, "loss": 0.0982, "step": 5150 }, { "epoch": 4.444492301066006, "grad_norm": 2.609375, "learning_rate": 9.682e-06, "loss": 0.0991, "step": 5160 }, { "epoch": 4.45310649294713, "grad_norm": 3.296875, "learning_rate": 9.662e-06, "loss": 0.0999, "step": 5170 }, { "epoch": 4.461720684828254, "grad_norm": 3.109375, "learning_rate": 9.642e-06, "loss": 0.1042, "step": 5180 }, { "epoch": 4.470334876709378, "grad_norm": 2.765625, "learning_rate": 9.622000000000002e-06, "loss": 0.1016, "step": 5190 }, { "epoch": 4.478949068590503, "grad_norm": 2.5, "learning_rate": 9.602e-06, "loss": 0.0992, "step": 5200 }, { "epoch": 4.487563260471627, "grad_norm": 3.140625, "learning_rate": 9.582e-06, "loss": 0.0986, "step": 5210 }, { "epoch": 4.496177452352751, "grad_norm": 2.53125, "learning_rate": 9.562000000000001e-06, "loss": 0.0995, "step": 5220 }, { "epoch": 4.504791644233875, "grad_norm": 3.25, "learning_rate": 9.542000000000001e-06, "loss": 0.1039, "step": 5230 }, { "epoch": 4.513405836114999, "grad_norm": 2.890625, "learning_rate": 9.522000000000001e-06, "loss": 0.1017, "step": 5240 }, { "epoch": 4.522020027996124, "grad_norm": 2.671875, "learning_rate": 9.502000000000002e-06, "loss": 0.1018, "step": 5250 }, { "epoch": 4.530634219877248, "grad_norm": 3.203125, "learning_rate": 9.482000000000002e-06, "loss": 0.1021, "step": 5260 }, { "epoch": 4.539248411758372, "grad_norm": 2.5625, "learning_rate": 9.462000000000002e-06, "loss": 0.1007, "step": 5270 }, { "epoch": 4.547862603639496, "grad_norm": 2.359375, "learning_rate": 9.442e-06, "loss": 0.0974, "step": 5280 }, { "epoch": 4.55647679552062, "grad_norm": 2.5625, "learning_rate": 9.422e-06, "loss": 0.0994, "step": 5290 }, { "epoch": 4.565090987401744, "grad_norm": 2.890625, "learning_rate": 9.402e-06, "loss": 0.1027, "step": 5300 }, { "epoch": 4.573705179282869, "grad_norm": 3.015625, "learning_rate": 9.382000000000001e-06, "loss": 0.1081, "step": 5310 }, { "epoch": 4.582319371163993, "grad_norm": 3.015625, "learning_rate": 9.362000000000001e-06, "loss": 0.0977, "step": 5320 }, { "epoch": 4.590933563045117, "grad_norm": 2.75, "learning_rate": 9.342000000000001e-06, "loss": 0.1057, "step": 5330 }, { "epoch": 4.599547754926241, "grad_norm": 3.0, "learning_rate": 9.322000000000002e-06, "loss": 0.1008, "step": 5340 }, { "epoch": 4.608161946807365, "grad_norm": 2.9375, "learning_rate": 9.302000000000002e-06, "loss": 0.102, "step": 5350 }, { "epoch": 4.616776138688489, "grad_norm": 3.015625, "learning_rate": 9.282e-06, "loss": 0.106, "step": 5360 }, { "epoch": 4.625390330569614, "grad_norm": 3.015625, "learning_rate": 9.262e-06, "loss": 0.1025, "step": 5370 }, { "epoch": 4.634004522450738, "grad_norm": 2.59375, "learning_rate": 9.242e-06, "loss": 0.0968, "step": 5380 }, { "epoch": 4.642618714331862, "grad_norm": 2.6875, "learning_rate": 9.222e-06, "loss": 0.0951, "step": 5390 }, { "epoch": 4.651232906212986, "grad_norm": 2.71875, "learning_rate": 9.202000000000001e-06, "loss": 0.1027, "step": 5400 }, { "epoch": 4.65984709809411, "grad_norm": 2.515625, "learning_rate": 9.182000000000001e-06, "loss": 0.1014, "step": 5410 }, { "epoch": 4.668461289975234, "grad_norm": 2.9375, "learning_rate": 9.162000000000001e-06, "loss": 0.1017, "step": 5420 }, { "epoch": 4.677075481856359, "grad_norm": 3.046875, "learning_rate": 9.142000000000002e-06, "loss": 0.1019, "step": 5430 }, { "epoch": 4.685689673737483, "grad_norm": 2.828125, "learning_rate": 9.122e-06, "loss": 0.1034, "step": 5440 }, { "epoch": 4.694303865618607, "grad_norm": 2.671875, "learning_rate": 9.102e-06, "loss": 0.098, "step": 5450 }, { "epoch": 4.702918057499731, "grad_norm": 3.203125, "learning_rate": 9.082e-06, "loss": 0.0971, "step": 5460 }, { "epoch": 4.711532249380855, "grad_norm": 2.78125, "learning_rate": 9.062e-06, "loss": 0.0989, "step": 5470 }, { "epoch": 4.72014644126198, "grad_norm": 2.78125, "learning_rate": 9.042e-06, "loss": 0.0987, "step": 5480 }, { "epoch": 4.728760633143104, "grad_norm": 3.046875, "learning_rate": 9.022000000000001e-06, "loss": 0.0983, "step": 5490 }, { "epoch": 4.737374825024228, "grad_norm": 2.890625, "learning_rate": 9.002000000000001e-06, "loss": 0.102, "step": 5500 }, { "epoch": 4.745989016905352, "grad_norm": 2.71875, "learning_rate": 8.982000000000001e-06, "loss": 0.1022, "step": 5510 }, { "epoch": 4.754603208786476, "grad_norm": 2.8125, "learning_rate": 8.962e-06, "loss": 0.0997, "step": 5520 }, { "epoch": 4.7632174006676, "grad_norm": 2.625, "learning_rate": 8.942e-06, "loss": 0.1032, "step": 5530 }, { "epoch": 4.771831592548724, "grad_norm": 2.71875, "learning_rate": 8.922e-06, "loss": 0.0995, "step": 5540 }, { "epoch": 4.7804457844298485, "grad_norm": 2.90625, "learning_rate": 8.902e-06, "loss": 0.1053, "step": 5550 }, { "epoch": 4.7890599763109725, "grad_norm": 2.59375, "learning_rate": 8.882e-06, "loss": 0.0929, "step": 5560 }, { "epoch": 4.7976741681920965, "grad_norm": 2.734375, "learning_rate": 8.862000000000001e-06, "loss": 0.1015, "step": 5570 }, { "epoch": 4.8062883600732205, "grad_norm": 2.90625, "learning_rate": 8.842000000000001e-06, "loss": 0.0987, "step": 5580 }, { "epoch": 4.8149025519543445, "grad_norm": 2.859375, "learning_rate": 8.822000000000001e-06, "loss": 0.1039, "step": 5590 }, { "epoch": 4.823516743835469, "grad_norm": 3.34375, "learning_rate": 8.802e-06, "loss": 0.102, "step": 5600 }, { "epoch": 4.832130935716593, "grad_norm": 2.9375, "learning_rate": 8.782e-06, "loss": 0.1002, "step": 5610 }, { "epoch": 4.840745127597717, "grad_norm": 2.828125, "learning_rate": 8.762e-06, "loss": 0.1023, "step": 5620 }, { "epoch": 4.849359319478841, "grad_norm": 2.46875, "learning_rate": 8.742e-06, "loss": 0.1007, "step": 5630 }, { "epoch": 4.857973511359965, "grad_norm": 2.890625, "learning_rate": 8.722e-06, "loss": 0.1036, "step": 5640 }, { "epoch": 4.866587703241089, "grad_norm": 2.765625, "learning_rate": 8.702e-06, "loss": 0.1044, "step": 5650 }, { "epoch": 4.875201895122213, "grad_norm": 2.765625, "learning_rate": 8.682000000000001e-06, "loss": 0.0943, "step": 5660 }, { "epoch": 4.883816087003338, "grad_norm": 2.609375, "learning_rate": 8.662000000000001e-06, "loss": 0.0982, "step": 5670 }, { "epoch": 4.892430278884462, "grad_norm": 3.125, "learning_rate": 8.642e-06, "loss": 0.1033, "step": 5680 }, { "epoch": 4.901044470765586, "grad_norm": 2.3125, "learning_rate": 8.622e-06, "loss": 0.1034, "step": 5690 }, { "epoch": 4.90965866264671, "grad_norm": 2.9375, "learning_rate": 8.602e-06, "loss": 0.0999, "step": 5700 }, { "epoch": 4.918272854527834, "grad_norm": 3.03125, "learning_rate": 8.582e-06, "loss": 0.1048, "step": 5710 }, { "epoch": 4.926887046408959, "grad_norm": 3.75, "learning_rate": 8.562e-06, "loss": 0.0956, "step": 5720 }, { "epoch": 4.935501238290083, "grad_norm": 2.609375, "learning_rate": 8.542e-06, "loss": 0.1041, "step": 5730 }, { "epoch": 4.944115430171207, "grad_norm": 2.6875, "learning_rate": 8.522e-06, "loss": 0.1029, "step": 5740 }, { "epoch": 4.952729622052331, "grad_norm": 2.609375, "learning_rate": 8.502000000000001e-06, "loss": 0.1009, "step": 5750 }, { "epoch": 4.961343813933455, "grad_norm": 2.515625, "learning_rate": 8.482e-06, "loss": 0.1065, "step": 5760 }, { "epoch": 4.969958005814579, "grad_norm": 2.78125, "learning_rate": 8.462e-06, "loss": 0.0968, "step": 5770 }, { "epoch": 4.978572197695704, "grad_norm": 2.96875, "learning_rate": 8.442e-06, "loss": 0.1061, "step": 5780 }, { "epoch": 4.987186389576828, "grad_norm": 2.65625, "learning_rate": 8.422e-06, "loss": 0.0998, "step": 5790 }, { "epoch": 4.995800581457952, "grad_norm": 2.8125, "learning_rate": 8.402e-06, "loss": 0.1004, "step": 5800 }, { "epoch": 5.004307095940562, "grad_norm": 2.09375, "learning_rate": 8.382e-06, "loss": 0.0826, "step": 5810 }, { "epoch": 5.012921287821686, "grad_norm": 2.9375, "learning_rate": 8.362e-06, "loss": 0.0723, "step": 5820 }, { "epoch": 5.02153547970281, "grad_norm": 2.4375, "learning_rate": 8.342e-06, "loss": 0.0668, "step": 5830 }, { "epoch": 5.030149671583935, "grad_norm": 2.8125, "learning_rate": 8.322000000000001e-06, "loss": 0.072, "step": 5840 }, { "epoch": 5.038763863465059, "grad_norm": 2.828125, "learning_rate": 8.302000000000001e-06, "loss": 0.0673, "step": 5850 }, { "epoch": 5.047378055346183, "grad_norm": 3.03125, "learning_rate": 8.282000000000001e-06, "loss": 0.0656, "step": 5860 }, { "epoch": 5.055992247227307, "grad_norm": 3.21875, "learning_rate": 8.262000000000002e-06, "loss": 0.0721, "step": 5870 }, { "epoch": 5.064606439108431, "grad_norm": 2.953125, "learning_rate": 8.242000000000002e-06, "loss": 0.0737, "step": 5880 }, { "epoch": 5.073220630989555, "grad_norm": 2.796875, "learning_rate": 8.222000000000002e-06, "loss": 0.068, "step": 5890 }, { "epoch": 5.08183482287068, "grad_norm": 2.921875, "learning_rate": 8.202e-06, "loss": 0.0682, "step": 5900 }, { "epoch": 5.090449014751804, "grad_norm": 3.09375, "learning_rate": 8.182e-06, "loss": 0.0677, "step": 5910 }, { "epoch": 5.099063206632928, "grad_norm": 2.71875, "learning_rate": 8.162e-06, "loss": 0.0686, "step": 5920 }, { "epoch": 5.107677398514052, "grad_norm": 2.5625, "learning_rate": 8.142000000000001e-06, "loss": 0.0688, "step": 5930 }, { "epoch": 5.116291590395176, "grad_norm": 2.4375, "learning_rate": 8.122000000000001e-06, "loss": 0.072, "step": 5940 }, { "epoch": 5.1249057822763, "grad_norm": 2.703125, "learning_rate": 8.102000000000001e-06, "loss": 0.0707, "step": 5950 }, { "epoch": 5.133519974157425, "grad_norm": 2.734375, "learning_rate": 8.082000000000002e-06, "loss": 0.0733, "step": 5960 }, { "epoch": 5.142134166038549, "grad_norm": 2.515625, "learning_rate": 8.062000000000002e-06, "loss": 0.0726, "step": 5970 }, { "epoch": 5.150748357919673, "grad_norm": 3.109375, "learning_rate": 8.042e-06, "loss": 0.069, "step": 5980 }, { "epoch": 5.159362549800797, "grad_norm": 2.640625, "learning_rate": 8.022e-06, "loss": 0.0699, "step": 5990 }, { "epoch": 5.167976741681921, "grad_norm": 3.046875, "learning_rate": 8.002e-06, "loss": 0.0704, "step": 6000 }, { "epoch": 5.176590933563045, "grad_norm": 2.515625, "learning_rate": 7.982e-06, "loss": 0.0676, "step": 6010 }, { "epoch": 5.18520512544417, "grad_norm": 2.859375, "learning_rate": 7.962000000000001e-06, "loss": 0.0674, "step": 6020 }, { "epoch": 5.193819317325294, "grad_norm": 2.265625, "learning_rate": 7.942000000000001e-06, "loss": 0.0721, "step": 6030 }, { "epoch": 5.202433509206418, "grad_norm": 3.140625, "learning_rate": 7.922000000000001e-06, "loss": 0.0727, "step": 6040 }, { "epoch": 5.211047701087542, "grad_norm": 2.703125, "learning_rate": 7.902000000000002e-06, "loss": 0.071, "step": 6050 }, { "epoch": 5.219661892968666, "grad_norm": 3.5, "learning_rate": 7.882e-06, "loss": 0.0682, "step": 6060 }, { "epoch": 5.2282760848497905, "grad_norm": 2.703125, "learning_rate": 7.862e-06, "loss": 0.0708, "step": 6070 }, { "epoch": 5.2368902767309145, "grad_norm": 3.015625, "learning_rate": 7.842e-06, "loss": 0.0695, "step": 6080 }, { "epoch": 5.2455044686120385, "grad_norm": 2.421875, "learning_rate": 7.822e-06, "loss": 0.0731, "step": 6090 }, { "epoch": 5.2541186604931625, "grad_norm": 3.140625, "learning_rate": 7.802000000000001e-06, "loss": 0.0695, "step": 6100 }, { "epoch": 5.2627328523742865, "grad_norm": 3.046875, "learning_rate": 7.782000000000001e-06, "loss": 0.0765, "step": 6110 }, { "epoch": 5.2713470442554105, "grad_norm": 2.984375, "learning_rate": 7.762000000000001e-06, "loss": 0.0702, "step": 6120 }, { "epoch": 5.279961236136535, "grad_norm": 3.015625, "learning_rate": 7.742000000000001e-06, "loss": 0.0698, "step": 6130 }, { "epoch": 5.288575428017659, "grad_norm": 2.75, "learning_rate": 7.722e-06, "loss": 0.0724, "step": 6140 }, { "epoch": 5.297189619898783, "grad_norm": 2.46875, "learning_rate": 7.702e-06, "loss": 0.0708, "step": 6150 }, { "epoch": 5.305803811779907, "grad_norm": 2.78125, "learning_rate": 7.682e-06, "loss": 0.0703, "step": 6160 }, { "epoch": 5.314418003661031, "grad_norm": 2.53125, "learning_rate": 7.662e-06, "loss": 0.0704, "step": 6170 }, { "epoch": 5.323032195542155, "grad_norm": 2.5625, "learning_rate": 7.642e-06, "loss": 0.0674, "step": 6180 }, { "epoch": 5.33164638742328, "grad_norm": 4.5, "learning_rate": 7.622000000000001e-06, "loss": 0.077, "step": 6190 }, { "epoch": 5.340260579304404, "grad_norm": 2.921875, "learning_rate": 7.602e-06, "loss": 0.0722, "step": 6200 }, { "epoch": 5.348874771185528, "grad_norm": 2.5625, "learning_rate": 7.582e-06, "loss": 0.0728, "step": 6210 }, { "epoch": 5.357488963066652, "grad_norm": 2.609375, "learning_rate": 7.562000000000001e-06, "loss": 0.0709, "step": 6220 }, { "epoch": 5.366103154947776, "grad_norm": 3.4375, "learning_rate": 7.542000000000001e-06, "loss": 0.069, "step": 6230 }, { "epoch": 5.3747173468289, "grad_norm": 3.109375, "learning_rate": 7.522e-06, "loss": 0.0717, "step": 6240 }, { "epoch": 5.383331538710025, "grad_norm": 2.53125, "learning_rate": 7.502e-06, "loss": 0.0677, "step": 6250 }, { "epoch": 5.391945730591149, "grad_norm": 2.96875, "learning_rate": 7.4820000000000005e-06, "loss": 0.0706, "step": 6260 }, { "epoch": 5.400559922472273, "grad_norm": 3.015625, "learning_rate": 7.462000000000001e-06, "loss": 0.0753, "step": 6270 }, { "epoch": 5.409174114353397, "grad_norm": 3.3125, "learning_rate": 7.442e-06, "loss": 0.0698, "step": 6280 }, { "epoch": 5.417788306234521, "grad_norm": 2.953125, "learning_rate": 7.422e-06, "loss": 0.0733, "step": 6290 }, { "epoch": 5.426402498115645, "grad_norm": 3.171875, "learning_rate": 7.4020000000000005e-06, "loss": 0.0672, "step": 6300 }, { "epoch": 5.43501668999677, "grad_norm": 2.6875, "learning_rate": 7.382000000000001e-06, "loss": 0.0701, "step": 6310 }, { "epoch": 5.443630881877894, "grad_norm": 2.765625, "learning_rate": 7.362e-06, "loss": 0.0677, "step": 6320 }, { "epoch": 5.452245073759018, "grad_norm": 2.953125, "learning_rate": 7.342e-06, "loss": 0.0715, "step": 6330 }, { "epoch": 5.460859265640142, "grad_norm": 3.09375, "learning_rate": 7.322e-06, "loss": 0.0713, "step": 6340 }, { "epoch": 5.469473457521266, "grad_norm": 2.59375, "learning_rate": 7.3020000000000006e-06, "loss": 0.0697, "step": 6350 }, { "epoch": 5.47808764940239, "grad_norm": 2.734375, "learning_rate": 7.282e-06, "loss": 0.0686, "step": 6360 }, { "epoch": 5.486701841283515, "grad_norm": 2.78125, "learning_rate": 7.262e-06, "loss": 0.0685, "step": 6370 }, { "epoch": 5.495316033164639, "grad_norm": 2.625, "learning_rate": 7.242e-06, "loss": 0.0697, "step": 6380 }, { "epoch": 5.503930225045763, "grad_norm": 2.8125, "learning_rate": 7.2220000000000005e-06, "loss": 0.0684, "step": 6390 }, { "epoch": 5.512544416926887, "grad_norm": 2.296875, "learning_rate": 7.202e-06, "loss": 0.0674, "step": 6400 }, { "epoch": 5.521158608808011, "grad_norm": 3.21875, "learning_rate": 7.182e-06, "loss": 0.0728, "step": 6410 }, { "epoch": 5.529772800689136, "grad_norm": 2.953125, "learning_rate": 7.162e-06, "loss": 0.0706, "step": 6420 }, { "epoch": 5.53838699257026, "grad_norm": 2.6875, "learning_rate": 7.142e-06, "loss": 0.0725, "step": 6430 }, { "epoch": 5.547001184451384, "grad_norm": 2.375, "learning_rate": 7.1220000000000014e-06, "loss": 0.07, "step": 6440 }, { "epoch": 5.555615376332508, "grad_norm": 2.671875, "learning_rate": 7.102000000000001e-06, "loss": 0.0682, "step": 6450 }, { "epoch": 5.564229568213632, "grad_norm": 3.046875, "learning_rate": 7.082000000000001e-06, "loss": 0.0715, "step": 6460 }, { "epoch": 5.572843760094756, "grad_norm": 2.515625, "learning_rate": 7.062000000000001e-06, "loss": 0.0715, "step": 6470 }, { "epoch": 5.58145795197588, "grad_norm": 2.859375, "learning_rate": 7.042000000000001e-06, "loss": 0.0702, "step": 6480 }, { "epoch": 5.590072143857005, "grad_norm": 2.765625, "learning_rate": 7.022000000000001e-06, "loss": 0.0729, "step": 6490 }, { "epoch": 5.598686335738129, "grad_norm": 2.984375, "learning_rate": 7.002000000000001e-06, "loss": 0.0734, "step": 6500 }, { "epoch": 5.607300527619253, "grad_norm": 3.15625, "learning_rate": 6.982000000000001e-06, "loss": 0.0785, "step": 6510 }, { "epoch": 5.615914719500377, "grad_norm": 2.875, "learning_rate": 6.962000000000001e-06, "loss": 0.0727, "step": 6520 }, { "epoch": 5.624528911381501, "grad_norm": 2.890625, "learning_rate": 6.942000000000001e-06, "loss": 0.0703, "step": 6530 }, { "epoch": 5.633143103262626, "grad_norm": 3.453125, "learning_rate": 6.922000000000001e-06, "loss": 0.0698, "step": 6540 }, { "epoch": 5.64175729514375, "grad_norm": 2.953125, "learning_rate": 6.902000000000001e-06, "loss": 0.0667, "step": 6550 }, { "epoch": 5.650371487024874, "grad_norm": 3.5625, "learning_rate": 6.882000000000001e-06, "loss": 0.0707, "step": 6560 }, { "epoch": 5.658985678905998, "grad_norm": 2.84375, "learning_rate": 6.8620000000000005e-06, "loss": 0.0696, "step": 6570 }, { "epoch": 5.667599870787122, "grad_norm": 2.671875, "learning_rate": 6.842000000000001e-06, "loss": 0.0726, "step": 6580 }, { "epoch": 5.676214062668246, "grad_norm": 2.609375, "learning_rate": 6.822000000000001e-06, "loss": 0.0737, "step": 6590 }, { "epoch": 5.6848282545493705, "grad_norm": 2.875, "learning_rate": 6.802000000000001e-06, "loss": 0.0752, "step": 6600 }, { "epoch": 5.6934424464304945, "grad_norm": 2.34375, "learning_rate": 6.7820000000000005e-06, "loss": 0.0711, "step": 6610 }, { "epoch": 5.7020566383116185, "grad_norm": 3.0625, "learning_rate": 6.762000000000001e-06, "loss": 0.0715, "step": 6620 }, { "epoch": 5.7106708301927425, "grad_norm": 3.140625, "learning_rate": 6.742000000000001e-06, "loss": 0.0715, "step": 6630 }, { "epoch": 5.7192850220738665, "grad_norm": 2.96875, "learning_rate": 6.722000000000001e-06, "loss": 0.0706, "step": 6640 }, { "epoch": 5.7278992139549905, "grad_norm": 2.875, "learning_rate": 6.702e-06, "loss": 0.0671, "step": 6650 }, { "epoch": 5.736513405836115, "grad_norm": 2.984375, "learning_rate": 6.6820000000000006e-06, "loss": 0.0754, "step": 6660 }, { "epoch": 5.745127597717239, "grad_norm": 2.734375, "learning_rate": 6.662000000000001e-06, "loss": 0.0702, "step": 6670 }, { "epoch": 5.753741789598363, "grad_norm": 2.84375, "learning_rate": 6.642000000000001e-06, "loss": 0.071, "step": 6680 }, { "epoch": 5.762355981479487, "grad_norm": 2.84375, "learning_rate": 6.622e-06, "loss": 0.0724, "step": 6690 }, { "epoch": 5.770970173360611, "grad_norm": 3.21875, "learning_rate": 6.6020000000000005e-06, "loss": 0.0739, "step": 6700 }, { "epoch": 5.779584365241735, "grad_norm": 3.203125, "learning_rate": 6.582000000000001e-06, "loss": 0.0713, "step": 6710 }, { "epoch": 5.78819855712286, "grad_norm": 2.6875, "learning_rate": 6.562000000000001e-06, "loss": 0.0691, "step": 6720 }, { "epoch": 5.796812749003984, "grad_norm": 2.75, "learning_rate": 6.542e-06, "loss": 0.0713, "step": 6730 }, { "epoch": 5.805426940885108, "grad_norm": 2.6875, "learning_rate": 6.522e-06, "loss": 0.0685, "step": 6740 }, { "epoch": 5.814041132766232, "grad_norm": 3.46875, "learning_rate": 6.502000000000001e-06, "loss": 0.0728, "step": 6750 }, { "epoch": 5.822655324647356, "grad_norm": 3.03125, "learning_rate": 6.482000000000001e-06, "loss": 0.0722, "step": 6760 }, { "epoch": 5.831269516528481, "grad_norm": 2.59375, "learning_rate": 6.462e-06, "loss": 0.0687, "step": 6770 }, { "epoch": 5.839883708409605, "grad_norm": 3.0, "learning_rate": 6.442e-06, "loss": 0.0682, "step": 6780 }, { "epoch": 5.848497900290729, "grad_norm": 2.40625, "learning_rate": 6.4220000000000005e-06, "loss": 0.0674, "step": 6790 }, { "epoch": 5.857112092171853, "grad_norm": 2.703125, "learning_rate": 6.402000000000001e-06, "loss": 0.0743, "step": 6800 }, { "epoch": 5.865726284052977, "grad_norm": 2.734375, "learning_rate": 6.382e-06, "loss": 0.071, "step": 6810 }, { "epoch": 5.874340475934101, "grad_norm": 3.0625, "learning_rate": 6.362e-06, "loss": 0.0688, "step": 6820 }, { "epoch": 5.882954667815225, "grad_norm": 2.734375, "learning_rate": 6.3420000000000004e-06, "loss": 0.0696, "step": 6830 }, { "epoch": 5.89156885969635, "grad_norm": 2.875, "learning_rate": 6.322000000000001e-06, "loss": 0.0698, "step": 6840 }, { "epoch": 5.900183051577474, "grad_norm": 2.53125, "learning_rate": 6.302e-06, "loss": 0.0688, "step": 6850 }, { "epoch": 5.908797243458598, "grad_norm": 3.0, "learning_rate": 6.282e-06, "loss": 0.0704, "step": 6860 }, { "epoch": 5.917411435339722, "grad_norm": 3.0, "learning_rate": 6.262e-06, "loss": 0.0742, "step": 6870 }, { "epoch": 5.926025627220846, "grad_norm": 2.46875, "learning_rate": 6.2420000000000005e-06, "loss": 0.0656, "step": 6880 }, { "epoch": 5.934639819101971, "grad_norm": 2.8125, "learning_rate": 6.222e-06, "loss": 0.072, "step": 6890 }, { "epoch": 5.943254010983095, "grad_norm": 2.921875, "learning_rate": 6.202e-06, "loss": 0.0668, "step": 6900 }, { "epoch": 5.951868202864219, "grad_norm": 2.765625, "learning_rate": 6.182e-06, "loss": 0.0706, "step": 6910 }, { "epoch": 5.960482394745343, "grad_norm": 2.921875, "learning_rate": 6.1620000000000005e-06, "loss": 0.0685, "step": 6920 }, { "epoch": 5.969096586626467, "grad_norm": 2.703125, "learning_rate": 6.142e-06, "loss": 0.074, "step": 6930 }, { "epoch": 5.977710778507591, "grad_norm": 3.40625, "learning_rate": 6.122e-06, "loss": 0.0713, "step": 6940 }, { "epoch": 5.986324970388715, "grad_norm": 2.390625, "learning_rate": 6.102e-06, "loss": 0.0719, "step": 6950 }, { "epoch": 5.99493916226984, "grad_norm": 3.0, "learning_rate": 6.082e-06, "loss": 0.0704, "step": 6960 }, { "epoch": 6.00344567675245, "grad_norm": 2.0, "learning_rate": 6.062e-06, "loss": 0.0591, "step": 6970 }, { "epoch": 6.012059868633574, "grad_norm": 2.421875, "learning_rate": 6.042e-06, "loss": 0.0506, "step": 6980 }, { "epoch": 6.020674060514698, "grad_norm": 2.859375, "learning_rate": 6.022e-06, "loss": 0.055, "step": 6990 }, { "epoch": 6.029288252395822, "grad_norm": 2.28125, "learning_rate": 6.002e-06, "loss": 0.0564, "step": 7000 }, { "epoch": 6.037902444276947, "grad_norm": 2.546875, "learning_rate": 5.982e-06, "loss": 0.0567, "step": 7010 }, { "epoch": 6.046516636158071, "grad_norm": 2.5, "learning_rate": 5.962e-06, "loss": 0.056, "step": 7020 }, { "epoch": 6.055130828039195, "grad_norm": 2.640625, "learning_rate": 5.942e-06, "loss": 0.0564, "step": 7030 }, { "epoch": 6.063745019920319, "grad_norm": 2.171875, "learning_rate": 5.922e-06, "loss": 0.0511, "step": 7040 }, { "epoch": 6.072359211801443, "grad_norm": 2.5625, "learning_rate": 5.9019999999999996e-06, "loss": 0.0535, "step": 7050 }, { "epoch": 6.080973403682567, "grad_norm": 2.546875, "learning_rate": 5.882e-06, "loss": 0.0558, "step": 7060 }, { "epoch": 6.089587595563692, "grad_norm": 2.609375, "learning_rate": 5.862000000000001e-06, "loss": 0.0526, "step": 7070 }, { "epoch": 6.098201787444816, "grad_norm": 2.25, "learning_rate": 5.842000000000001e-06, "loss": 0.0541, "step": 7080 }, { "epoch": 6.10681597932594, "grad_norm": 3.328125, "learning_rate": 5.822000000000001e-06, "loss": 0.0552, "step": 7090 }, { "epoch": 6.115430171207064, "grad_norm": 2.453125, "learning_rate": 5.802000000000001e-06, "loss": 0.0551, "step": 7100 }, { "epoch": 6.124044363088188, "grad_norm": 2.171875, "learning_rate": 5.782000000000001e-06, "loss": 0.0528, "step": 7110 }, { "epoch": 6.132658554969312, "grad_norm": 2.453125, "learning_rate": 5.762000000000001e-06, "loss": 0.0528, "step": 7120 }, { "epoch": 6.1412727468504364, "grad_norm": 2.125, "learning_rate": 5.742000000000001e-06, "loss": 0.0538, "step": 7130 }, { "epoch": 6.1498869387315604, "grad_norm": 2.171875, "learning_rate": 5.722000000000001e-06, "loss": 0.0521, "step": 7140 }, { "epoch": 6.1585011306126844, "grad_norm": 2.78125, "learning_rate": 5.702000000000001e-06, "loss": 0.0557, "step": 7150 }, { "epoch": 6.1671153224938084, "grad_norm": 2.6875, "learning_rate": 5.682000000000001e-06, "loss": 0.0536, "step": 7160 }, { "epoch": 6.1757295143749324, "grad_norm": 2.234375, "learning_rate": 5.662000000000001e-06, "loss": 0.0526, "step": 7170 }, { "epoch": 6.1843437062560564, "grad_norm": 2.953125, "learning_rate": 5.642000000000001e-06, "loss": 0.0547, "step": 7180 }, { "epoch": 6.192957898137181, "grad_norm": 2.40625, "learning_rate": 5.6220000000000006e-06, "loss": 0.0508, "step": 7190 }, { "epoch": 6.201572090018305, "grad_norm": 2.453125, "learning_rate": 5.602000000000001e-06, "loss": 0.0541, "step": 7200 }, { "epoch": 6.210186281899429, "grad_norm": 2.953125, "learning_rate": 5.582000000000001e-06, "loss": 0.0547, "step": 7210 }, { "epoch": 6.218800473780553, "grad_norm": 2.703125, "learning_rate": 5.562000000000001e-06, "loss": 0.056, "step": 7220 }, { "epoch": 6.227414665661677, "grad_norm": 2.734375, "learning_rate": 5.5420000000000005e-06, "loss": 0.0538, "step": 7230 }, { "epoch": 6.236028857542801, "grad_norm": 2.1875, "learning_rate": 5.522000000000001e-06, "loss": 0.0491, "step": 7240 }, { "epoch": 6.244643049423926, "grad_norm": 2.390625, "learning_rate": 5.502000000000001e-06, "loss": 0.0532, "step": 7250 }, { "epoch": 6.25325724130505, "grad_norm": 2.265625, "learning_rate": 5.482000000000001e-06, "loss": 0.0496, "step": 7260 }, { "epoch": 6.261871433186174, "grad_norm": 2.734375, "learning_rate": 5.462e-06, "loss": 0.0587, "step": 7270 }, { "epoch": 6.270485625067298, "grad_norm": 2.546875, "learning_rate": 5.442000000000001e-06, "loss": 0.055, "step": 7280 }, { "epoch": 6.279099816948422, "grad_norm": 2.4375, "learning_rate": 5.422000000000001e-06, "loss": 0.0559, "step": 7290 }, { "epoch": 6.287714008829546, "grad_norm": 2.59375, "learning_rate": 5.402000000000001e-06, "loss": 0.0572, "step": 7300 }, { "epoch": 6.296328200710671, "grad_norm": 2.421875, "learning_rate": 5.382e-06, "loss": 0.0516, "step": 7310 }, { "epoch": 6.304942392591795, "grad_norm": 2.21875, "learning_rate": 5.3620000000000005e-06, "loss": 0.05, "step": 7320 }, { "epoch": 6.313556584472919, "grad_norm": 2.734375, "learning_rate": 5.342000000000001e-06, "loss": 0.0534, "step": 7330 }, { "epoch": 6.322170776354043, "grad_norm": 2.21875, "learning_rate": 5.322000000000001e-06, "loss": 0.056, "step": 7340 }, { "epoch": 6.330784968235167, "grad_norm": 2.6875, "learning_rate": 5.302e-06, "loss": 0.0542, "step": 7350 }, { "epoch": 6.339399160116292, "grad_norm": 2.703125, "learning_rate": 5.282e-06, "loss": 0.0573, "step": 7360 }, { "epoch": 6.348013351997416, "grad_norm": 2.921875, "learning_rate": 5.262000000000001e-06, "loss": 0.0518, "step": 7370 }, { "epoch": 6.35662754387854, "grad_norm": 2.5, "learning_rate": 5.242000000000001e-06, "loss": 0.0567, "step": 7380 }, { "epoch": 6.365241735759664, "grad_norm": 2.859375, "learning_rate": 5.222e-06, "loss": 0.0563, "step": 7390 }, { "epoch": 6.373855927640788, "grad_norm": 2.40625, "learning_rate": 5.202e-06, "loss": 0.0534, "step": 7400 }, { "epoch": 6.382470119521912, "grad_norm": 2.734375, "learning_rate": 5.1820000000000005e-06, "loss": 0.0515, "step": 7410 }, { "epoch": 6.391084311403037, "grad_norm": 2.71875, "learning_rate": 5.162000000000001e-06, "loss": 0.0568, "step": 7420 }, { "epoch": 6.399698503284161, "grad_norm": 3.140625, "learning_rate": 5.142e-06, "loss": 0.0527, "step": 7430 }, { "epoch": 6.408312695165285, "grad_norm": 2.671875, "learning_rate": 5.122e-06, "loss": 0.0528, "step": 7440 }, { "epoch": 6.416926887046409, "grad_norm": 3.515625, "learning_rate": 5.1020000000000004e-06, "loss": 0.0553, "step": 7450 }, { "epoch": 6.425541078927533, "grad_norm": 2.6875, "learning_rate": 5.082000000000001e-06, "loss": 0.0531, "step": 7460 }, { "epoch": 6.434155270808657, "grad_norm": 2.046875, "learning_rate": 5.062e-06, "loss": 0.05, "step": 7470 }, { "epoch": 6.442769462689782, "grad_norm": 3.25, "learning_rate": 5.042e-06, "loss": 0.0562, "step": 7480 }, { "epoch": 6.451383654570906, "grad_norm": 2.828125, "learning_rate": 5.022e-06, "loss": 0.0531, "step": 7490 }, { "epoch": 6.45999784645203, "grad_norm": 2.625, "learning_rate": 5.0020000000000006e-06, "loss": 0.0545, "step": 7500 }, { "epoch": 6.468612038333154, "grad_norm": 2.4375, "learning_rate": 4.982e-06, "loss": 0.0528, "step": 7510 }, { "epoch": 6.477226230214278, "grad_norm": 2.296875, "learning_rate": 4.962e-06, "loss": 0.0519, "step": 7520 }, { "epoch": 6.485840422095402, "grad_norm": 2.40625, "learning_rate": 4.942e-06, "loss": 0.0548, "step": 7530 }, { "epoch": 6.494454613976527, "grad_norm": 3.390625, "learning_rate": 4.9220000000000005e-06, "loss": 0.0534, "step": 7540 }, { "epoch": 6.503068805857651, "grad_norm": 2.625, "learning_rate": 4.902000000000001e-06, "loss": 0.0516, "step": 7550 }, { "epoch": 6.511682997738775, "grad_norm": 2.609375, "learning_rate": 4.882000000000001e-06, "loss": 0.055, "step": 7560 }, { "epoch": 6.520297189619899, "grad_norm": 2.265625, "learning_rate": 4.862e-06, "loss": 0.0508, "step": 7570 }, { "epoch": 6.528911381501023, "grad_norm": 2.59375, "learning_rate": 4.842e-06, "loss": 0.0557, "step": 7580 }, { "epoch": 6.537525573382148, "grad_norm": 2.765625, "learning_rate": 4.822000000000001e-06, "loss": 0.0564, "step": 7590 }, { "epoch": 6.546139765263272, "grad_norm": 2.140625, "learning_rate": 4.802000000000001e-06, "loss": 0.0519, "step": 7600 }, { "epoch": 6.554753957144396, "grad_norm": 2.53125, "learning_rate": 4.782e-06, "loss": 0.056, "step": 7610 }, { "epoch": 6.56336814902552, "grad_norm": 2.546875, "learning_rate": 4.762e-06, "loss": 0.0554, "step": 7620 }, { "epoch": 6.571982340906644, "grad_norm": 2.703125, "learning_rate": 4.7420000000000005e-06, "loss": 0.0538, "step": 7630 }, { "epoch": 6.580596532787768, "grad_norm": 2.59375, "learning_rate": 4.722000000000001e-06, "loss": 0.0527, "step": 7640 }, { "epoch": 6.589210724668892, "grad_norm": 2.765625, "learning_rate": 4.702e-06, "loss": 0.0558, "step": 7650 }, { "epoch": 6.5978249165500165, "grad_norm": 2.4375, "learning_rate": 4.682e-06, "loss": 0.0506, "step": 7660 }, { "epoch": 6.6064391084311405, "grad_norm": 3.4375, "learning_rate": 4.6620000000000004e-06, "loss": 0.055, "step": 7670 }, { "epoch": 6.6150533003122645, "grad_norm": 2.484375, "learning_rate": 4.642000000000001e-06, "loss": 0.0535, "step": 7680 }, { "epoch": 6.6236674921933885, "grad_norm": 2.65625, "learning_rate": 4.622e-06, "loss": 0.0564, "step": 7690 }, { "epoch": 6.6322816840745125, "grad_norm": 2.6875, "learning_rate": 4.602e-06, "loss": 0.0567, "step": 7700 }, { "epoch": 6.640895875955637, "grad_norm": 2.84375, "learning_rate": 4.582e-06, "loss": 0.0557, "step": 7710 }, { "epoch": 6.649510067836761, "grad_norm": 2.625, "learning_rate": 4.5620000000000005e-06, "loss": 0.0533, "step": 7720 }, { "epoch": 6.658124259717885, "grad_norm": 2.1875, "learning_rate": 4.542e-06, "loss": 0.0548, "step": 7730 }, { "epoch": 6.666738451599009, "grad_norm": 2.734375, "learning_rate": 4.522e-06, "loss": 0.0543, "step": 7740 }, { "epoch": 6.675352643480133, "grad_norm": 2.484375, "learning_rate": 4.502e-06, "loss": 0.0498, "step": 7750 }, { "epoch": 6.683966835361257, "grad_norm": 2.640625, "learning_rate": 4.4820000000000005e-06, "loss": 0.0559, "step": 7760 }, { "epoch": 6.692581027242381, "grad_norm": 2.796875, "learning_rate": 4.462e-06, "loss": 0.0556, "step": 7770 }, { "epoch": 6.701195219123506, "grad_norm": 2.3125, "learning_rate": 4.442e-06, "loss": 0.051, "step": 7780 }, { "epoch": 6.70980941100463, "grad_norm": 2.65625, "learning_rate": 4.422e-06, "loss": 0.0555, "step": 7790 }, { "epoch": 6.718423602885754, "grad_norm": 2.5625, "learning_rate": 4.402e-06, "loss": 0.0522, "step": 7800 }, { "epoch": 6.727037794766878, "grad_norm": 2.265625, "learning_rate": 4.382e-06, "loss": 0.0555, "step": 7810 }, { "epoch": 6.735651986648002, "grad_norm": 4.125, "learning_rate": 4.362e-06, "loss": 0.0519, "step": 7820 }, { "epoch": 6.744266178529127, "grad_norm": 3.25, "learning_rate": 4.342e-06, "loss": 0.0538, "step": 7830 }, { "epoch": 6.752880370410251, "grad_norm": 3.328125, "learning_rate": 4.322e-06, "loss": 0.056, "step": 7840 }, { "epoch": 6.761494562291375, "grad_norm": 2.734375, "learning_rate": 4.3020000000000005e-06, "loss": 0.0538, "step": 7850 }, { "epoch": 6.770108754172499, "grad_norm": 2.171875, "learning_rate": 4.282000000000001e-06, "loss": 0.0513, "step": 7860 }, { "epoch": 6.778722946053623, "grad_norm": 2.921875, "learning_rate": 4.262000000000001e-06, "loss": 0.0548, "step": 7870 }, { "epoch": 6.787337137934747, "grad_norm": 3.0625, "learning_rate": 4.242e-06, "loss": 0.0538, "step": 7880 }, { "epoch": 6.795951329815872, "grad_norm": 2.75, "learning_rate": 4.222e-06, "loss": 0.0537, "step": 7890 }, { "epoch": 6.804565521696996, "grad_norm": 2.859375, "learning_rate": 4.202000000000001e-06, "loss": 0.0582, "step": 7900 }, { "epoch": 6.81317971357812, "grad_norm": 2.421875, "learning_rate": 4.182000000000001e-06, "loss": 0.0577, "step": 7910 }, { "epoch": 6.821793905459244, "grad_norm": 2.75, "learning_rate": 4.162e-06, "loss": 0.0564, "step": 7920 }, { "epoch": 6.830408097340368, "grad_norm": 2.9375, "learning_rate": 4.142e-06, "loss": 0.0551, "step": 7930 }, { "epoch": 6.839022289221492, "grad_norm": 2.734375, "learning_rate": 4.1220000000000005e-06, "loss": 0.0501, "step": 7940 }, { "epoch": 6.847636481102617, "grad_norm": 2.515625, "learning_rate": 4.102000000000001e-06, "loss": 0.0498, "step": 7950 }, { "epoch": 6.856250672983741, "grad_norm": 3.125, "learning_rate": 4.082e-06, "loss": 0.0512, "step": 7960 }, { "epoch": 6.864864864864865, "grad_norm": 2.5, "learning_rate": 4.062e-06, "loss": 0.052, "step": 7970 }, { "epoch": 6.873479056745989, "grad_norm": 2.84375, "learning_rate": 4.0420000000000004e-06, "loss": 0.0563, "step": 7980 }, { "epoch": 6.882093248627113, "grad_norm": 2.671875, "learning_rate": 4.022000000000001e-06, "loss": 0.0526, "step": 7990 }, { "epoch": 6.890707440508237, "grad_norm": 2.640625, "learning_rate": 4.002e-06, "loss": 0.0556, "step": 8000 }, { "epoch": 6.899321632389362, "grad_norm": 2.8125, "learning_rate": 3.982e-06, "loss": 0.0553, "step": 8010 }, { "epoch": 6.907935824270486, "grad_norm": 2.828125, "learning_rate": 3.962e-06, "loss": 0.0538, "step": 8020 }, { "epoch": 6.91655001615161, "grad_norm": 3.046875, "learning_rate": 3.9420000000000005e-06, "loss": 0.056, "step": 8030 }, { "epoch": 6.925164208032734, "grad_norm": 3.203125, "learning_rate": 3.922e-06, "loss": 0.0553, "step": 8040 }, { "epoch": 6.933778399913858, "grad_norm": 2.796875, "learning_rate": 3.902e-06, "loss": 0.054, "step": 8050 }, { "epoch": 6.942392591794983, "grad_norm": 2.390625, "learning_rate": 3.882e-06, "loss": 0.056, "step": 8060 }, { "epoch": 6.951006783676107, "grad_norm": 2.296875, "learning_rate": 3.8620000000000005e-06, "loss": 0.055, "step": 8070 }, { "epoch": 6.959620975557231, "grad_norm": 2.6875, "learning_rate": 3.842e-06, "loss": 0.0542, "step": 8080 }, { "epoch": 6.968235167438355, "grad_norm": 2.5, "learning_rate": 3.822e-06, "loss": 0.052, "step": 8090 }, { "epoch": 6.976849359319479, "grad_norm": 2.34375, "learning_rate": 3.802e-06, "loss": 0.0528, "step": 8100 }, { "epoch": 6.985463551200603, "grad_norm": 2.875, "learning_rate": 3.782e-06, "loss": 0.0551, "step": 8110 }, { "epoch": 6.994077743081727, "grad_norm": 2.46875, "learning_rate": 3.762e-06, "loss": 0.0545, "step": 8120 }, { "epoch": 7.0025842575643376, "grad_norm": 2.46875, "learning_rate": 3.742e-06, "loss": 0.0536, "step": 8130 }, { "epoch": 7.0111984494454616, "grad_norm": 1.9296875, "learning_rate": 3.722e-06, "loss": 0.0441, "step": 8140 }, { "epoch": 7.019812641326586, "grad_norm": 2.234375, "learning_rate": 3.702e-06, "loss": 0.0494, "step": 8150 }, { "epoch": 7.02842683320771, "grad_norm": 2.546875, "learning_rate": 3.6820000000000005e-06, "loss": 0.0473, "step": 8160 }, { "epoch": 7.037041025088834, "grad_norm": 2.546875, "learning_rate": 3.6620000000000007e-06, "loss": 0.0443, "step": 8170 }, { "epoch": 7.0456552169699584, "grad_norm": 2.25, "learning_rate": 3.6420000000000005e-06, "loss": 0.0454, "step": 8180 }, { "epoch": 7.0542694088510824, "grad_norm": 2.484375, "learning_rate": 3.6220000000000006e-06, "loss": 0.0465, "step": 8190 }, { "epoch": 7.0628836007322064, "grad_norm": 2.5625, "learning_rate": 3.6020000000000004e-06, "loss": 0.0463, "step": 8200 }, { "epoch": 7.0714977926133304, "grad_norm": 2.375, "learning_rate": 3.5820000000000006e-06, "loss": 0.0454, "step": 8210 }, { "epoch": 7.0801119844944544, "grad_norm": 2.421875, "learning_rate": 3.5620000000000004e-06, "loss": 0.0453, "step": 8220 }, { "epoch": 7.0887261763755784, "grad_norm": 1.984375, "learning_rate": 3.5420000000000006e-06, "loss": 0.0459, "step": 8230 }, { "epoch": 7.097340368256703, "grad_norm": 2.53125, "learning_rate": 3.5220000000000003e-06, "loss": 0.0462, "step": 8240 }, { "epoch": 7.105954560137827, "grad_norm": 2.921875, "learning_rate": 3.5020000000000005e-06, "loss": 0.0486, "step": 8250 }, { "epoch": 7.114568752018951, "grad_norm": 2.390625, "learning_rate": 3.4820000000000003e-06, "loss": 0.0436, "step": 8260 }, { "epoch": 7.123182943900075, "grad_norm": 2.375, "learning_rate": 3.4620000000000005e-06, "loss": 0.0462, "step": 8270 }, { "epoch": 7.131797135781199, "grad_norm": 2.21875, "learning_rate": 3.4420000000000002e-06, "loss": 0.045, "step": 8280 }, { "epoch": 7.140411327662323, "grad_norm": 2.65625, "learning_rate": 3.4220000000000004e-06, "loss": 0.0461, "step": 8290 }, { "epoch": 7.149025519543448, "grad_norm": 2.09375, "learning_rate": 3.402e-06, "loss": 0.0453, "step": 8300 }, { "epoch": 7.157639711424572, "grad_norm": 2.375, "learning_rate": 3.3820000000000004e-06, "loss": 0.045, "step": 8310 }, { "epoch": 7.166253903305696, "grad_norm": 2.46875, "learning_rate": 3.362e-06, "loss": 0.0483, "step": 8320 }, { "epoch": 7.17486809518682, "grad_norm": 2.25, "learning_rate": 3.3420000000000004e-06, "loss": 0.0452, "step": 8330 }, { "epoch": 7.183482287067944, "grad_norm": 2.40625, "learning_rate": 3.322e-06, "loss": 0.0485, "step": 8340 }, { "epoch": 7.192096478949068, "grad_norm": 2.28125, "learning_rate": 3.3020000000000003e-06, "loss": 0.0485, "step": 8350 }, { "epoch": 7.200710670830193, "grad_norm": 2.125, "learning_rate": 3.282e-06, "loss": 0.0461, "step": 8360 }, { "epoch": 7.209324862711317, "grad_norm": 2.5625, "learning_rate": 3.2620000000000003e-06, "loss": 0.0498, "step": 8370 }, { "epoch": 7.217939054592441, "grad_norm": 2.296875, "learning_rate": 3.242e-06, "loss": 0.0469, "step": 8380 }, { "epoch": 7.226553246473565, "grad_norm": 1.9609375, "learning_rate": 3.2220000000000002e-06, "loss": 0.0428, "step": 8390 }, { "epoch": 7.235167438354689, "grad_norm": 2.328125, "learning_rate": 3.202e-06, "loss": 0.0449, "step": 8400 }, { "epoch": 7.243781630235813, "grad_norm": 2.09375, "learning_rate": 3.182e-06, "loss": 0.0441, "step": 8410 }, { "epoch": 7.252395822116938, "grad_norm": 2.4375, "learning_rate": 3.162e-06, "loss": 0.0452, "step": 8420 }, { "epoch": 7.261010013998062, "grad_norm": 2.578125, "learning_rate": 3.142e-06, "loss": 0.0458, "step": 8430 }, { "epoch": 7.269624205879186, "grad_norm": 2.390625, "learning_rate": 3.122e-06, "loss": 0.0421, "step": 8440 }, { "epoch": 7.27823839776031, "grad_norm": 2.65625, "learning_rate": 3.102e-06, "loss": 0.0443, "step": 8450 }, { "epoch": 7.286852589641434, "grad_norm": 2.734375, "learning_rate": 3.082e-06, "loss": 0.0446, "step": 8460 }, { "epoch": 7.295466781522558, "grad_norm": 2.515625, "learning_rate": 3.0620000000000005e-06, "loss": 0.0449, "step": 8470 }, { "epoch": 7.304080973403683, "grad_norm": 2.15625, "learning_rate": 3.0420000000000007e-06, "loss": 0.0413, "step": 8480 }, { "epoch": 7.312695165284807, "grad_norm": 2.46875, "learning_rate": 3.0220000000000005e-06, "loss": 0.0456, "step": 8490 }, { "epoch": 7.321309357165931, "grad_norm": 2.5, "learning_rate": 3.0020000000000006e-06, "loss": 0.0459, "step": 8500 }, { "epoch": 7.329923549047055, "grad_norm": 2.140625, "learning_rate": 2.9820000000000004e-06, "loss": 0.0477, "step": 8510 }, { "epoch": 7.338537740928179, "grad_norm": 2.375, "learning_rate": 2.9620000000000006e-06, "loss": 0.0462, "step": 8520 }, { "epoch": 7.347151932809304, "grad_norm": 2.3125, "learning_rate": 2.9420000000000004e-06, "loss": 0.0428, "step": 8530 }, { "epoch": 7.355766124690428, "grad_norm": 2.359375, "learning_rate": 2.9220000000000006e-06, "loss": 0.0438, "step": 8540 }, { "epoch": 7.364380316571552, "grad_norm": 2.65625, "learning_rate": 2.9020000000000003e-06, "loss": 0.0451, "step": 8550 }, { "epoch": 7.372994508452676, "grad_norm": 2.4375, "learning_rate": 2.8820000000000005e-06, "loss": 0.0468, "step": 8560 }, { "epoch": 7.3816087003338, "grad_norm": 2.171875, "learning_rate": 2.8620000000000003e-06, "loss": 0.0462, "step": 8570 }, { "epoch": 7.390222892214924, "grad_norm": 2.546875, "learning_rate": 2.8420000000000005e-06, "loss": 0.0465, "step": 8580 }, { "epoch": 7.398837084096048, "grad_norm": 2.375, "learning_rate": 2.8220000000000003e-06, "loss": 0.0471, "step": 8590 }, { "epoch": 7.407451275977173, "grad_norm": 2.296875, "learning_rate": 2.8020000000000004e-06, "loss": 0.0473, "step": 8600 }, { "epoch": 7.416065467858297, "grad_norm": 2.625, "learning_rate": 2.7820000000000002e-06, "loss": 0.0486, "step": 8610 }, { "epoch": 7.424679659739421, "grad_norm": 2.40625, "learning_rate": 2.7620000000000004e-06, "loss": 0.0457, "step": 8620 }, { "epoch": 7.433293851620545, "grad_norm": 2.0625, "learning_rate": 2.742e-06, "loss": 0.0481, "step": 8630 }, { "epoch": 7.441908043501669, "grad_norm": 2.6875, "learning_rate": 2.7220000000000004e-06, "loss": 0.0452, "step": 8640 }, { "epoch": 7.450522235382794, "grad_norm": 2.265625, "learning_rate": 2.702e-06, "loss": 0.0428, "step": 8650 }, { "epoch": 7.459136427263918, "grad_norm": 2.40625, "learning_rate": 2.6820000000000003e-06, "loss": 0.0508, "step": 8660 }, { "epoch": 7.467750619145042, "grad_norm": 2.28125, "learning_rate": 2.662e-06, "loss": 0.0443, "step": 8670 }, { "epoch": 7.476364811026166, "grad_norm": 2.5, "learning_rate": 2.6420000000000003e-06, "loss": 0.047, "step": 8680 }, { "epoch": 7.48497900290729, "grad_norm": 2.609375, "learning_rate": 2.622e-06, "loss": 0.0457, "step": 8690 }, { "epoch": 7.493593194788414, "grad_norm": 2.5625, "learning_rate": 2.6020000000000002e-06, "loss": 0.0467, "step": 8700 }, { "epoch": 7.502207386669538, "grad_norm": 2.59375, "learning_rate": 2.582e-06, "loss": 0.0468, "step": 8710 }, { "epoch": 7.5108215785506625, "grad_norm": 2.078125, "learning_rate": 2.562e-06, "loss": 0.0447, "step": 8720 }, { "epoch": 7.5194357704317865, "grad_norm": 2.0625, "learning_rate": 2.542e-06, "loss": 0.0449, "step": 8730 }, { "epoch": 7.5280499623129105, "grad_norm": 2.46875, "learning_rate": 2.522e-06, "loss": 0.0466, "step": 8740 }, { "epoch": 7.5366641541940345, "grad_norm": 2.3125, "learning_rate": 2.502e-06, "loss": 0.0429, "step": 8750 }, { "epoch": 7.5452783460751585, "grad_norm": 2.859375, "learning_rate": 2.482e-06, "loss": 0.0474, "step": 8760 }, { "epoch": 7.553892537956283, "grad_norm": 2.5625, "learning_rate": 2.4620000000000003e-06, "loss": 0.0469, "step": 8770 }, { "epoch": 7.562506729837407, "grad_norm": 2.421875, "learning_rate": 2.442e-06, "loss": 0.0467, "step": 8780 }, { "epoch": 7.571120921718531, "grad_norm": 2.8125, "learning_rate": 2.4220000000000003e-06, "loss": 0.0497, "step": 8790 }, { "epoch": 7.579735113599655, "grad_norm": 2.28125, "learning_rate": 2.402e-06, "loss": 0.045, "step": 8800 }, { "epoch": 7.588349305480779, "grad_norm": 2.953125, "learning_rate": 2.3820000000000002e-06, "loss": 0.0472, "step": 8810 }, { "epoch": 7.596963497361903, "grad_norm": 2.859375, "learning_rate": 2.362e-06, "loss": 0.0495, "step": 8820 }, { "epoch": 7.605577689243028, "grad_norm": 2.40625, "learning_rate": 2.342e-06, "loss": 0.0441, "step": 8830 }, { "epoch": 7.614191881124152, "grad_norm": 2.078125, "learning_rate": 2.322e-06, "loss": 0.0466, "step": 8840 }, { "epoch": 7.622806073005276, "grad_norm": 2.671875, "learning_rate": 2.302e-06, "loss": 0.0447, "step": 8850 }, { "epoch": 7.6314202648864, "grad_norm": 2.3125, "learning_rate": 2.282e-06, "loss": 0.0469, "step": 8860 }, { "epoch": 7.640034456767524, "grad_norm": 3.34375, "learning_rate": 2.262e-06, "loss": 0.0475, "step": 8870 }, { "epoch": 7.648648648648649, "grad_norm": 2.515625, "learning_rate": 2.2420000000000003e-06, "loss": 0.0457, "step": 8880 }, { "epoch": 7.657262840529773, "grad_norm": 2.4375, "learning_rate": 2.222e-06, "loss": 0.0437, "step": 8890 }, { "epoch": 7.665877032410897, "grad_norm": 2.25, "learning_rate": 2.2020000000000003e-06, "loss": 0.0467, "step": 8900 }, { "epoch": 7.674491224292021, "grad_norm": 2.75, "learning_rate": 2.182e-06, "loss": 0.0472, "step": 8910 }, { "epoch": 7.683105416173145, "grad_norm": 2.84375, "learning_rate": 2.1620000000000002e-06, "loss": 0.0477, "step": 8920 }, { "epoch": 7.691719608054269, "grad_norm": 2.875, "learning_rate": 2.142e-06, "loss": 0.0469, "step": 8930 }, { "epoch": 7.700333799935393, "grad_norm": 2.609375, "learning_rate": 2.122e-06, "loss": 0.0481, "step": 8940 }, { "epoch": 7.708947991816518, "grad_norm": 2.984375, "learning_rate": 2.102e-06, "loss": 0.0464, "step": 8950 }, { "epoch": 7.717562183697642, "grad_norm": 2.578125, "learning_rate": 2.082e-06, "loss": 0.0469, "step": 8960 }, { "epoch": 7.726176375578766, "grad_norm": 2.515625, "learning_rate": 2.062e-06, "loss": 0.045, "step": 8970 }, { "epoch": 7.73479056745989, "grad_norm": 2.4375, "learning_rate": 2.042e-06, "loss": 0.0434, "step": 8980 }, { "epoch": 7.743404759341014, "grad_norm": 2.78125, "learning_rate": 2.022e-06, "loss": 0.0437, "step": 8990 }, { "epoch": 7.752018951222139, "grad_norm": 2.59375, "learning_rate": 2.002e-06, "loss": 0.0469, "step": 9000 }, { "epoch": 7.760633143103263, "grad_norm": 2.453125, "learning_rate": 1.982e-06, "loss": 0.0458, "step": 9010 }, { "epoch": 7.769247334984387, "grad_norm": 2.609375, "learning_rate": 1.9620000000000004e-06, "loss": 0.049, "step": 9020 }, { "epoch": 7.777861526865511, "grad_norm": 2.390625, "learning_rate": 1.942e-06, "loss": 0.0472, "step": 9030 }, { "epoch": 7.786475718746635, "grad_norm": 2.1875, "learning_rate": 1.9220000000000004e-06, "loss": 0.0474, "step": 9040 }, { "epoch": 7.795089910627759, "grad_norm": 2.609375, "learning_rate": 1.9020000000000002e-06, "loss": 0.0456, "step": 9050 }, { "epoch": 7.803704102508883, "grad_norm": 2.296875, "learning_rate": 1.8820000000000001e-06, "loss": 0.0427, "step": 9060 }, { "epoch": 7.812318294390008, "grad_norm": 2.71875, "learning_rate": 1.8620000000000001e-06, "loss": 0.0441, "step": 9070 }, { "epoch": 7.820932486271132, "grad_norm": 2.296875, "learning_rate": 1.8420000000000001e-06, "loss": 0.0435, "step": 9080 }, { "epoch": 7.829546678152256, "grad_norm": 2.4375, "learning_rate": 1.822e-06, "loss": 0.0489, "step": 9090 }, { "epoch": 7.83816087003338, "grad_norm": 2.453125, "learning_rate": 1.802e-06, "loss": 0.0443, "step": 9100 }, { "epoch": 7.846775061914504, "grad_norm": 2.578125, "learning_rate": 1.782e-06, "loss": 0.045, "step": 9110 }, { "epoch": 7.855389253795629, "grad_norm": 2.765625, "learning_rate": 1.762e-06, "loss": 0.0422, "step": 9120 }, { "epoch": 7.864003445676753, "grad_norm": 2.546875, "learning_rate": 1.742e-06, "loss": 0.0454, "step": 9130 }, { "epoch": 7.872617637557877, "grad_norm": 2.46875, "learning_rate": 1.722e-06, "loss": 0.0441, "step": 9140 }, { "epoch": 7.881231829439001, "grad_norm": 2.4375, "learning_rate": 1.702e-06, "loss": 0.046, "step": 9150 }, { "epoch": 7.889846021320125, "grad_norm": 2.71875, "learning_rate": 1.682e-06, "loss": 0.0467, "step": 9160 }, { "epoch": 7.898460213201249, "grad_norm": 2.75, "learning_rate": 1.662e-06, "loss": 0.0486, "step": 9170 }, { "epoch": 7.907074405082374, "grad_norm": 2.640625, "learning_rate": 1.6420000000000003e-06, "loss": 0.0475, "step": 9180 }, { "epoch": 7.915688596963498, "grad_norm": 2.40625, "learning_rate": 1.6220000000000003e-06, "loss": 0.0476, "step": 9190 }, { "epoch": 7.924302788844622, "grad_norm": 2.234375, "learning_rate": 1.6020000000000003e-06, "loss": 0.0425, "step": 9200 }, { "epoch": 7.932916980725746, "grad_norm": 2.28125, "learning_rate": 1.5820000000000003e-06, "loss": 0.0447, "step": 9210 }, { "epoch": 7.94153117260687, "grad_norm": 2.109375, "learning_rate": 1.5620000000000002e-06, "loss": 0.0484, "step": 9220 }, { "epoch": 7.950145364487994, "grad_norm": 2.46875, "learning_rate": 1.5420000000000002e-06, "loss": 0.0455, "step": 9230 }, { "epoch": 7.9587595563691185, "grad_norm": 2.703125, "learning_rate": 1.5220000000000002e-06, "loss": 0.0462, "step": 9240 }, { "epoch": 7.9673737482502425, "grad_norm": 2.328125, "learning_rate": 1.5020000000000002e-06, "loss": 0.045, "step": 9250 }, { "epoch": 7.9759879401313665, "grad_norm": 2.34375, "learning_rate": 1.4820000000000002e-06, "loss": 0.0447, "step": 9260 }, { "epoch": 7.9846021320124905, "grad_norm": 3.34375, "learning_rate": 1.4620000000000001e-06, "loss": 0.0472, "step": 9270 }, { "epoch": 7.9932163238936145, "grad_norm": 2.578125, "learning_rate": 1.4420000000000001e-06, "loss": 0.047, "step": 9280 }, { "epoch": 8.001722838376224, "grad_norm": 2.03125, "learning_rate": 1.4220000000000001e-06, "loss": 0.0444, "step": 9290 }, { "epoch": 8.010337030257348, "grad_norm": 2.59375, "learning_rate": 1.402e-06, "loss": 0.0444, "step": 9300 }, { "epoch": 8.018951222138472, "grad_norm": 1.953125, "learning_rate": 1.382e-06, "loss": 0.0397, "step": 9310 }, { "epoch": 8.027565414019596, "grad_norm": 2.40625, "learning_rate": 1.362e-06, "loss": 0.0425, "step": 9320 }, { "epoch": 8.036179605900722, "grad_norm": 2.1875, "learning_rate": 1.3420000000000002e-06, "loss": 0.043, "step": 9330 }, { "epoch": 8.044793797781846, "grad_norm": 2.390625, "learning_rate": 1.3220000000000002e-06, "loss": 0.0451, "step": 9340 }, { "epoch": 8.05340798966297, "grad_norm": 2.25, "learning_rate": 1.3020000000000002e-06, "loss": 0.0442, "step": 9350 }, { "epoch": 8.062022181544094, "grad_norm": 1.96875, "learning_rate": 1.2820000000000002e-06, "loss": 0.0428, "step": 9360 }, { "epoch": 8.070636373425218, "grad_norm": 2.171875, "learning_rate": 1.2620000000000002e-06, "loss": 0.0441, "step": 9370 }, { "epoch": 8.079250565306342, "grad_norm": 2.390625, "learning_rate": 1.2420000000000001e-06, "loss": 0.045, "step": 9380 }, { "epoch": 8.087864757187466, "grad_norm": 2.421875, "learning_rate": 1.2220000000000001e-06, "loss": 0.0443, "step": 9390 }, { "epoch": 8.09647894906859, "grad_norm": 2.515625, "learning_rate": 1.202e-06, "loss": 0.0422, "step": 9400 }, { "epoch": 8.105093140949714, "grad_norm": 2.28125, "learning_rate": 1.182e-06, "loss": 0.0417, "step": 9410 }, { "epoch": 8.113707332830838, "grad_norm": 2.28125, "learning_rate": 1.162e-06, "loss": 0.0432, "step": 9420 }, { "epoch": 8.122321524711962, "grad_norm": 2.453125, "learning_rate": 1.142e-06, "loss": 0.0419, "step": 9430 }, { "epoch": 8.130935716593086, "grad_norm": 2.109375, "learning_rate": 1.122e-06, "loss": 0.0416, "step": 9440 }, { "epoch": 8.139549908474212, "grad_norm": 2.359375, "learning_rate": 1.1020000000000002e-06, "loss": 0.0422, "step": 9450 }, { "epoch": 8.148164100355336, "grad_norm": 2.09375, "learning_rate": 1.0820000000000002e-06, "loss": 0.0455, "step": 9460 }, { "epoch": 8.15677829223646, "grad_norm": 2.609375, "learning_rate": 1.0620000000000002e-06, "loss": 0.0455, "step": 9470 }, { "epoch": 8.165392484117584, "grad_norm": 2.296875, "learning_rate": 1.0420000000000001e-06, "loss": 0.0441, "step": 9480 }, { "epoch": 8.174006675998708, "grad_norm": 2.484375, "learning_rate": 1.0220000000000001e-06, "loss": 0.0433, "step": 9490 }, { "epoch": 8.182620867879832, "grad_norm": 2.453125, "learning_rate": 1.002e-06, "loss": 0.045, "step": 9500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.698799670922543e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }