{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17561912602044316, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00021952390752555395, "grad_norm": 129536.0, "learning_rate": 1.99775e-05, "loss": 11.4186, "step": 10 }, { "epoch": 0.0004390478150511079, "grad_norm": 154.0, "learning_rate": 1.9952500000000003e-05, "loss": 6.1068, "step": 20 }, { "epoch": 0.0006585717225766619, "grad_norm": 2.1875, "learning_rate": 1.9927500000000002e-05, "loss": 4.8048, "step": 30 }, { "epoch": 0.0008780956301022158, "grad_norm": 1.640625, "learning_rate": 1.99025e-05, "loss": 0.2014, "step": 40 }, { "epoch": 0.0010976195376277698, "grad_norm": 1.4921875, "learning_rate": 1.98775e-05, "loss": 0.2177, "step": 50 }, { "epoch": 0.0013171434451533237, "grad_norm": 1.3125, "learning_rate": 1.98525e-05, "loss": 0.2286, "step": 60 }, { "epoch": 0.0015366673526788777, "grad_norm": 0.67578125, "learning_rate": 1.9827500000000003e-05, "loss": 0.1803, "step": 70 }, { "epoch": 0.0017561912602044316, "grad_norm": 1.1328125, "learning_rate": 1.9802500000000002e-05, "loss": 0.1814, "step": 80 }, { "epoch": 0.0019757151677299856, "grad_norm": 1.1953125, "learning_rate": 1.97775e-05, "loss": 0.1881, "step": 90 }, { "epoch": 0.0021952390752555395, "grad_norm": 0.74609375, "learning_rate": 1.97525e-05, "loss": 0.1877, "step": 100 }, { "epoch": 0.0024147629827810935, "grad_norm": 0.98046875, "learning_rate": 1.97275e-05, "loss": 0.1892, "step": 110 }, { "epoch": 0.0026342868903066474, "grad_norm": 0.92578125, "learning_rate": 1.9702500000000003e-05, "loss": 0.1921, "step": 120 }, { "epoch": 0.0028538107978322014, "grad_norm": 1.25, "learning_rate": 1.9677500000000003e-05, "loss": 0.2261, "step": 130 }, { "epoch": 0.0030733347053577553, "grad_norm": 0.82421875, "learning_rate": 1.9652500000000002e-05, "loss": 0.2294, "step": 140 }, { "epoch": 0.0032928586128833093, "grad_norm": 0.89453125, "learning_rate": 1.96275e-05, "loss": 0.1871, "step": 150 }, { "epoch": 0.0035123825204088632, "grad_norm": 0.83203125, "learning_rate": 1.96025e-05, "loss": 0.1888, "step": 160 }, { "epoch": 0.003731906427934417, "grad_norm": 1.0390625, "learning_rate": 1.9577500000000004e-05, "loss": 0.1925, "step": 170 }, { "epoch": 0.003951430335459971, "grad_norm": 0.95703125, "learning_rate": 1.95525e-05, "loss": 0.1974, "step": 180 }, { "epoch": 0.004170954242985525, "grad_norm": 1.2421875, "learning_rate": 1.9527500000000002e-05, "loss": 0.2109, "step": 190 }, { "epoch": 0.004390478150511079, "grad_norm": 0.734375, "learning_rate": 1.9502500000000002e-05, "loss": 0.2015, "step": 200 }, { "epoch": 0.004610002058036633, "grad_norm": 0.953125, "learning_rate": 1.94775e-05, "loss": 0.1631, "step": 210 }, { "epoch": 0.004829525965562187, "grad_norm": 0.6484375, "learning_rate": 1.94525e-05, "loss": 0.195, "step": 220 }, { "epoch": 0.005049049873087741, "grad_norm": 0.95703125, "learning_rate": 1.94275e-05, "loss": 0.1882, "step": 230 }, { "epoch": 0.005268573780613295, "grad_norm": 1.1875, "learning_rate": 1.9402500000000003e-05, "loss": 0.1956, "step": 240 }, { "epoch": 0.005488097688138849, "grad_norm": 0.9453125, "learning_rate": 1.9377500000000002e-05, "loss": 0.2236, "step": 250 }, { "epoch": 0.005707621595664403, "grad_norm": 0.65625, "learning_rate": 1.93525e-05, "loss": 0.1906, "step": 260 }, { "epoch": 0.005927145503189957, "grad_norm": 0.79296875, "learning_rate": 1.93275e-05, "loss": 0.2174, "step": 270 }, { "epoch": 0.006146669410715511, "grad_norm": 1.1875, "learning_rate": 1.93025e-05, "loss": 0.1898, "step": 280 }, { "epoch": 0.006366193318241065, "grad_norm": 0.79296875, "learning_rate": 1.9277500000000003e-05, "loss": 0.2021, "step": 290 }, { "epoch": 0.0065857172257666186, "grad_norm": 0.8125, "learning_rate": 1.92525e-05, "loss": 0.192, "step": 300 }, { "epoch": 0.0068052411332921725, "grad_norm": 0.8359375, "learning_rate": 1.9227500000000002e-05, "loss": 0.2299, "step": 310 }, { "epoch": 0.0070247650408177265, "grad_norm": 0.90234375, "learning_rate": 1.92025e-05, "loss": 0.2137, "step": 320 }, { "epoch": 0.00724428894834328, "grad_norm": 0.8359375, "learning_rate": 1.91775e-05, "loss": 0.1918, "step": 330 }, { "epoch": 0.007463812855868834, "grad_norm": 0.796875, "learning_rate": 1.91525e-05, "loss": 0.2303, "step": 340 }, { "epoch": 0.007683336763394388, "grad_norm": 0.68359375, "learning_rate": 1.91275e-05, "loss": 0.1957, "step": 350 }, { "epoch": 0.007902860670919942, "grad_norm": 1.1015625, "learning_rate": 1.9102500000000002e-05, "loss": 0.2029, "step": 360 }, { "epoch": 0.008122384578445496, "grad_norm": 0.87109375, "learning_rate": 1.90775e-05, "loss": 0.219, "step": 370 }, { "epoch": 0.00834190848597105, "grad_norm": 0.8828125, "learning_rate": 1.90525e-05, "loss": 0.2189, "step": 380 }, { "epoch": 0.008561432393496604, "grad_norm": 1.125, "learning_rate": 1.90275e-05, "loss": 0.2014, "step": 390 }, { "epoch": 0.008780956301022158, "grad_norm": 0.69921875, "learning_rate": 1.9002500000000003e-05, "loss": 0.1757, "step": 400 }, { "epoch": 0.009000480208547712, "grad_norm": 0.6640625, "learning_rate": 1.8977500000000003e-05, "loss": 0.2146, "step": 410 }, { "epoch": 0.009220004116073266, "grad_norm": 0.7578125, "learning_rate": 1.8952500000000002e-05, "loss": 0.1767, "step": 420 }, { "epoch": 0.00943952802359882, "grad_norm": 0.9140625, "learning_rate": 1.89275e-05, "loss": 0.1913, "step": 430 }, { "epoch": 0.009659051931124374, "grad_norm": 0.890625, "learning_rate": 1.89025e-05, "loss": 0.1938, "step": 440 }, { "epoch": 0.009878575838649928, "grad_norm": 0.9765625, "learning_rate": 1.8877500000000003e-05, "loss": 0.1898, "step": 450 }, { "epoch": 0.010098099746175482, "grad_norm": 1.0078125, "learning_rate": 1.8852500000000003e-05, "loss": 0.214, "step": 460 }, { "epoch": 0.010317623653701036, "grad_norm": 0.85546875, "learning_rate": 1.8827500000000002e-05, "loss": 0.191, "step": 470 }, { "epoch": 0.01053714756122659, "grad_norm": 0.71484375, "learning_rate": 1.88025e-05, "loss": 0.207, "step": 480 }, { "epoch": 0.010756671468752144, "grad_norm": 1.2734375, "learning_rate": 1.87775e-05, "loss": 0.2191, "step": 490 }, { "epoch": 0.010976195376277698, "grad_norm": 1.1171875, "learning_rate": 1.8752500000000004e-05, "loss": 0.2271, "step": 500 }, { "epoch": 0.011195719283803252, "grad_norm": 0.703125, "learning_rate": 1.87275e-05, "loss": 0.1823, "step": 510 }, { "epoch": 0.011415243191328805, "grad_norm": 1.0859375, "learning_rate": 1.8702500000000003e-05, "loss": 0.2054, "step": 520 }, { "epoch": 0.01163476709885436, "grad_norm": 0.765625, "learning_rate": 1.8677500000000002e-05, "loss": 0.2131, "step": 530 }, { "epoch": 0.011854291006379913, "grad_norm": 0.73046875, "learning_rate": 1.86525e-05, "loss": 0.1569, "step": 540 }, { "epoch": 0.012073814913905467, "grad_norm": 1.0078125, "learning_rate": 1.86275e-05, "loss": 0.1877, "step": 550 }, { "epoch": 0.012293338821431021, "grad_norm": 0.97265625, "learning_rate": 1.86025e-05, "loss": 0.2228, "step": 560 }, { "epoch": 0.012512862728956575, "grad_norm": 0.98046875, "learning_rate": 1.8577500000000003e-05, "loss": 0.2096, "step": 570 }, { "epoch": 0.01273238663648213, "grad_norm": 0.7421875, "learning_rate": 1.8552500000000002e-05, "loss": 0.1861, "step": 580 }, { "epoch": 0.012951910544007683, "grad_norm": 0.60546875, "learning_rate": 1.8527500000000002e-05, "loss": 0.1832, "step": 590 }, { "epoch": 0.013171434451533237, "grad_norm": 0.90234375, "learning_rate": 1.85025e-05, "loss": 0.2182, "step": 600 }, { "epoch": 0.013390958359058791, "grad_norm": 0.828125, "learning_rate": 1.84775e-05, "loss": 0.2285, "step": 610 }, { "epoch": 0.013610482266584345, "grad_norm": 0.703125, "learning_rate": 1.8452500000000003e-05, "loss": 0.2024, "step": 620 }, { "epoch": 0.013830006174109899, "grad_norm": 1.3203125, "learning_rate": 1.8427500000000003e-05, "loss": 0.2115, "step": 630 }, { "epoch": 0.014049530081635453, "grad_norm": 1.03125, "learning_rate": 1.8402500000000002e-05, "loss": 0.227, "step": 640 }, { "epoch": 0.014269053989161007, "grad_norm": 0.73828125, "learning_rate": 1.83775e-05, "loss": 0.185, "step": 650 }, { "epoch": 0.01448857789668656, "grad_norm": 0.78125, "learning_rate": 1.83525e-05, "loss": 0.1754, "step": 660 }, { "epoch": 0.014708101804212115, "grad_norm": 0.7578125, "learning_rate": 1.8327500000000004e-05, "loss": 0.2198, "step": 670 }, { "epoch": 0.014927625711737669, "grad_norm": 0.8046875, "learning_rate": 1.83025e-05, "loss": 0.1966, "step": 680 }, { "epoch": 0.015147149619263223, "grad_norm": 0.9140625, "learning_rate": 1.8277500000000002e-05, "loss": 0.187, "step": 690 }, { "epoch": 0.015366673526788777, "grad_norm": 1.0625, "learning_rate": 1.8252500000000002e-05, "loss": 0.2088, "step": 700 }, { "epoch": 0.01558619743431433, "grad_norm": 0.85546875, "learning_rate": 1.82275e-05, "loss": 0.2132, "step": 710 }, { "epoch": 0.015805721341839885, "grad_norm": 0.93359375, "learning_rate": 1.82025e-05, "loss": 0.2149, "step": 720 }, { "epoch": 0.01602524524936544, "grad_norm": 0.87109375, "learning_rate": 1.81775e-05, "loss": 0.1931, "step": 730 }, { "epoch": 0.016244769156890992, "grad_norm": 0.6875, "learning_rate": 1.8152500000000003e-05, "loss": 0.1721, "step": 740 }, { "epoch": 0.016464293064416546, "grad_norm": 0.78125, "learning_rate": 1.8127500000000002e-05, "loss": 0.1911, "step": 750 }, { "epoch": 0.0166838169719421, "grad_norm": 0.95703125, "learning_rate": 1.81025e-05, "loss": 0.2294, "step": 760 }, { "epoch": 0.016903340879467654, "grad_norm": 1.0546875, "learning_rate": 1.80775e-05, "loss": 0.1867, "step": 770 }, { "epoch": 0.017122864786993208, "grad_norm": 0.58984375, "learning_rate": 1.80525e-05, "loss": 0.1997, "step": 780 }, { "epoch": 0.017342388694518762, "grad_norm": 1.015625, "learning_rate": 1.8027500000000003e-05, "loss": 0.2095, "step": 790 }, { "epoch": 0.017561912602044316, "grad_norm": 1.0078125, "learning_rate": 1.80025e-05, "loss": 0.1862, "step": 800 }, { "epoch": 0.01778143650956987, "grad_norm": 0.75, "learning_rate": 1.7977500000000002e-05, "loss": 0.1971, "step": 810 }, { "epoch": 0.018000960417095424, "grad_norm": 1.046875, "learning_rate": 1.79525e-05, "loss": 0.193, "step": 820 }, { "epoch": 0.018220484324620978, "grad_norm": 0.65234375, "learning_rate": 1.79275e-05, "loss": 0.2195, "step": 830 }, { "epoch": 0.018440008232146532, "grad_norm": 0.68359375, "learning_rate": 1.79025e-05, "loss": 0.1954, "step": 840 }, { "epoch": 0.018659532139672086, "grad_norm": 0.76953125, "learning_rate": 1.78775e-05, "loss": 0.1546, "step": 850 }, { "epoch": 0.01887905604719764, "grad_norm": 0.80078125, "learning_rate": 1.7852500000000002e-05, "loss": 0.188, "step": 860 }, { "epoch": 0.019098579954723194, "grad_norm": 0.91796875, "learning_rate": 1.78275e-05, "loss": 0.2128, "step": 870 }, { "epoch": 0.019318103862248748, "grad_norm": 0.71875, "learning_rate": 1.78025e-05, "loss": 0.1846, "step": 880 }, { "epoch": 0.0195376277697743, "grad_norm": 0.734375, "learning_rate": 1.77775e-05, "loss": 0.1938, "step": 890 }, { "epoch": 0.019757151677299856, "grad_norm": 0.66796875, "learning_rate": 1.77525e-05, "loss": 0.1781, "step": 900 }, { "epoch": 0.01997667558482541, "grad_norm": 0.98046875, "learning_rate": 1.7727500000000003e-05, "loss": 0.1848, "step": 910 }, { "epoch": 0.020196199492350964, "grad_norm": 0.81640625, "learning_rate": 1.7702500000000002e-05, "loss": 0.1739, "step": 920 }, { "epoch": 0.020415723399876518, "grad_norm": 1.09375, "learning_rate": 1.76775e-05, "loss": 0.1878, "step": 930 }, { "epoch": 0.02063524730740207, "grad_norm": 1.078125, "learning_rate": 1.76525e-05, "loss": 0.2028, "step": 940 }, { "epoch": 0.020854771214927625, "grad_norm": 0.828125, "learning_rate": 1.76275e-05, "loss": 0.1897, "step": 950 }, { "epoch": 0.02107429512245318, "grad_norm": 0.671875, "learning_rate": 1.7602500000000003e-05, "loss": 0.1748, "step": 960 }, { "epoch": 0.021293819029978733, "grad_norm": 0.84375, "learning_rate": 1.75775e-05, "loss": 0.1997, "step": 970 }, { "epoch": 0.021513342937504287, "grad_norm": 0.78125, "learning_rate": 1.7552500000000002e-05, "loss": 0.1989, "step": 980 }, { "epoch": 0.02173286684502984, "grad_norm": 0.64453125, "learning_rate": 1.75275e-05, "loss": 0.1836, "step": 990 }, { "epoch": 0.021952390752555395, "grad_norm": 0.91015625, "learning_rate": 1.7502500000000004e-05, "loss": 0.2198, "step": 1000 }, { "epoch": 0.02217191466008095, "grad_norm": 0.96484375, "learning_rate": 1.74775e-05, "loss": 0.1847, "step": 1010 }, { "epoch": 0.022391438567606503, "grad_norm": 1.015625, "learning_rate": 1.7452500000000003e-05, "loss": 0.1954, "step": 1020 }, { "epoch": 0.022610962475132057, "grad_norm": 0.71484375, "learning_rate": 1.7427500000000002e-05, "loss": 0.2007, "step": 1030 }, { "epoch": 0.02283048638265761, "grad_norm": 0.5546875, "learning_rate": 1.74025e-05, "loss": 0.2068, "step": 1040 }, { "epoch": 0.023050010290183165, "grad_norm": 0.78125, "learning_rate": 1.73775e-05, "loss": 0.1898, "step": 1050 }, { "epoch": 0.02326953419770872, "grad_norm": 0.64453125, "learning_rate": 1.73525e-05, "loss": 0.1661, "step": 1060 }, { "epoch": 0.023489058105234273, "grad_norm": 0.7265625, "learning_rate": 1.7327500000000003e-05, "loss": 0.1745, "step": 1070 }, { "epoch": 0.023708582012759827, "grad_norm": 0.60546875, "learning_rate": 1.7302500000000002e-05, "loss": 0.1691, "step": 1080 }, { "epoch": 0.02392810592028538, "grad_norm": 0.94921875, "learning_rate": 1.7277500000000002e-05, "loss": 0.204, "step": 1090 }, { "epoch": 0.024147629827810935, "grad_norm": 0.69140625, "learning_rate": 1.72525e-05, "loss": 0.1775, "step": 1100 }, { "epoch": 0.02436715373533649, "grad_norm": 0.8203125, "learning_rate": 1.72275e-05, "loss": 0.1953, "step": 1110 }, { "epoch": 0.024586677642862043, "grad_norm": 0.78515625, "learning_rate": 1.7202500000000003e-05, "loss": 0.1916, "step": 1120 }, { "epoch": 0.024806201550387597, "grad_norm": 0.6640625, "learning_rate": 1.7177500000000003e-05, "loss": 0.1762, "step": 1130 }, { "epoch": 0.02502572545791315, "grad_norm": 1.0078125, "learning_rate": 1.7152500000000002e-05, "loss": 0.2267, "step": 1140 }, { "epoch": 0.025245249365438704, "grad_norm": 0.84765625, "learning_rate": 1.71275e-05, "loss": 0.1924, "step": 1150 }, { "epoch": 0.02546477327296426, "grad_norm": 0.91796875, "learning_rate": 1.71025e-05, "loss": 0.2081, "step": 1160 }, { "epoch": 0.025684297180489812, "grad_norm": 0.734375, "learning_rate": 1.7077500000000004e-05, "loss": 0.1693, "step": 1170 }, { "epoch": 0.025903821088015366, "grad_norm": 0.8046875, "learning_rate": 1.70525e-05, "loss": 0.2278, "step": 1180 }, { "epoch": 0.02612334499554092, "grad_norm": 0.7109375, "learning_rate": 1.7027500000000003e-05, "loss": 0.1819, "step": 1190 }, { "epoch": 0.026342868903066474, "grad_norm": 0.671875, "learning_rate": 1.7002500000000002e-05, "loss": 0.147, "step": 1200 }, { "epoch": 0.026562392810592028, "grad_norm": 0.8671875, "learning_rate": 1.69775e-05, "loss": 0.1857, "step": 1210 }, { "epoch": 0.026781916718117582, "grad_norm": 0.7890625, "learning_rate": 1.69525e-05, "loss": 0.1665, "step": 1220 }, { "epoch": 0.027001440625643136, "grad_norm": 0.8984375, "learning_rate": 1.69275e-05, "loss": 0.2032, "step": 1230 }, { "epoch": 0.02722096453316869, "grad_norm": 0.6171875, "learning_rate": 1.6902500000000003e-05, "loss": 0.182, "step": 1240 }, { "epoch": 0.027440488440694244, "grad_norm": 0.65234375, "learning_rate": 1.6877500000000002e-05, "loss": 0.1936, "step": 1250 }, { "epoch": 0.027660012348219798, "grad_norm": 0.84765625, "learning_rate": 1.6852500000000002e-05, "loss": 0.1971, "step": 1260 }, { "epoch": 0.027879536255745352, "grad_norm": 0.91015625, "learning_rate": 1.68275e-05, "loss": 0.1824, "step": 1270 }, { "epoch": 0.028099060163270906, "grad_norm": 0.65234375, "learning_rate": 1.68025e-05, "loss": 0.1731, "step": 1280 }, { "epoch": 0.02831858407079646, "grad_norm": 0.83203125, "learning_rate": 1.6777500000000003e-05, "loss": 0.1818, "step": 1290 }, { "epoch": 0.028538107978322014, "grad_norm": 0.95703125, "learning_rate": 1.67525e-05, "loss": 0.1959, "step": 1300 }, { "epoch": 0.028757631885847568, "grad_norm": 0.96875, "learning_rate": 1.6727500000000002e-05, "loss": 0.1777, "step": 1310 }, { "epoch": 0.02897715579337312, "grad_norm": 0.7734375, "learning_rate": 1.67025e-05, "loss": 0.1719, "step": 1320 }, { "epoch": 0.029196679700898676, "grad_norm": 0.64453125, "learning_rate": 1.66775e-05, "loss": 0.1985, "step": 1330 }, { "epoch": 0.02941620360842423, "grad_norm": 0.96875, "learning_rate": 1.66525e-05, "loss": 0.1841, "step": 1340 }, { "epoch": 0.029635727515949783, "grad_norm": 0.78125, "learning_rate": 1.66275e-05, "loss": 0.2174, "step": 1350 }, { "epoch": 0.029855251423475337, "grad_norm": 0.80078125, "learning_rate": 1.6602500000000002e-05, "loss": 0.1925, "step": 1360 }, { "epoch": 0.03007477533100089, "grad_norm": 0.78515625, "learning_rate": 1.6577500000000002e-05, "loss": 0.1686, "step": 1370 }, { "epoch": 0.030294299238526445, "grad_norm": 0.765625, "learning_rate": 1.65525e-05, "loss": 0.1877, "step": 1380 }, { "epoch": 0.030513823146052, "grad_norm": 0.9140625, "learning_rate": 1.65275e-05, "loss": 0.2048, "step": 1390 }, { "epoch": 0.030733347053577553, "grad_norm": 0.52734375, "learning_rate": 1.65025e-05, "loss": 0.2031, "step": 1400 }, { "epoch": 0.030952870961103107, "grad_norm": 0.8984375, "learning_rate": 1.6477500000000003e-05, "loss": 0.217, "step": 1410 }, { "epoch": 0.03117239486862866, "grad_norm": 0.78125, "learning_rate": 1.6452500000000002e-05, "loss": 0.1783, "step": 1420 }, { "epoch": 0.031391918776154215, "grad_norm": 0.80078125, "learning_rate": 1.64275e-05, "loss": 0.1761, "step": 1430 }, { "epoch": 0.03161144268367977, "grad_norm": 0.83203125, "learning_rate": 1.64025e-05, "loss": 0.182, "step": 1440 }, { "epoch": 0.03183096659120532, "grad_norm": 0.48828125, "learning_rate": 1.63775e-05, "loss": 0.1732, "step": 1450 }, { "epoch": 0.03205049049873088, "grad_norm": 0.87109375, "learning_rate": 1.6352500000000003e-05, "loss": 0.1951, "step": 1460 }, { "epoch": 0.03227001440625643, "grad_norm": 1.0078125, "learning_rate": 1.63275e-05, "loss": 0.2427, "step": 1470 }, { "epoch": 0.032489538313781985, "grad_norm": 0.78125, "learning_rate": 1.6302500000000002e-05, "loss": 0.1904, "step": 1480 }, { "epoch": 0.03270906222130754, "grad_norm": 0.81640625, "learning_rate": 1.62775e-05, "loss": 0.1938, "step": 1490 }, { "epoch": 0.03292858612883309, "grad_norm": 0.59765625, "learning_rate": 1.62525e-05, "loss": 0.1673, "step": 1500 }, { "epoch": 0.03314811003635865, "grad_norm": 1.109375, "learning_rate": 1.62275e-05, "loss": 0.2362, "step": 1510 }, { "epoch": 0.0333676339438842, "grad_norm": 0.85546875, "learning_rate": 1.62025e-05, "loss": 0.1716, "step": 1520 }, { "epoch": 0.033587157851409755, "grad_norm": 0.9375, "learning_rate": 1.6177500000000002e-05, "loss": 0.1872, "step": 1530 }, { "epoch": 0.03380668175893531, "grad_norm": 0.95703125, "learning_rate": 1.61525e-05, "loss": 0.2068, "step": 1540 }, { "epoch": 0.03402620566646086, "grad_norm": 0.78515625, "learning_rate": 1.61275e-05, "loss": 0.1902, "step": 1550 }, { "epoch": 0.034245729573986416, "grad_norm": 0.72265625, "learning_rate": 1.61025e-05, "loss": 0.1866, "step": 1560 }, { "epoch": 0.03446525348151197, "grad_norm": 0.703125, "learning_rate": 1.60775e-05, "loss": 0.1694, "step": 1570 }, { "epoch": 0.034684777389037524, "grad_norm": 0.57421875, "learning_rate": 1.6052500000000003e-05, "loss": 0.1915, "step": 1580 }, { "epoch": 0.03490430129656308, "grad_norm": 0.671875, "learning_rate": 1.60275e-05, "loss": 0.2041, "step": 1590 }, { "epoch": 0.03512382520408863, "grad_norm": 1.1484375, "learning_rate": 1.60025e-05, "loss": 0.2597, "step": 1600 }, { "epoch": 0.035343349111614186, "grad_norm": 0.73046875, "learning_rate": 1.59775e-05, "loss": 0.2025, "step": 1610 }, { "epoch": 0.03556287301913974, "grad_norm": 0.76171875, "learning_rate": 1.5952500000000004e-05, "loss": 0.178, "step": 1620 }, { "epoch": 0.035782396926665294, "grad_norm": 1.1015625, "learning_rate": 1.5927500000000003e-05, "loss": 0.2145, "step": 1630 }, { "epoch": 0.03600192083419085, "grad_norm": 0.625, "learning_rate": 1.5902500000000002e-05, "loss": 0.2196, "step": 1640 }, { "epoch": 0.0362214447417164, "grad_norm": 0.8125, "learning_rate": 1.5877500000000002e-05, "loss": 0.236, "step": 1650 }, { "epoch": 0.036440968649241956, "grad_norm": 1.0, "learning_rate": 1.58525e-05, "loss": 0.1927, "step": 1660 }, { "epoch": 0.03666049255676751, "grad_norm": 0.8203125, "learning_rate": 1.5827500000000004e-05, "loss": 0.1991, "step": 1670 }, { "epoch": 0.036880016464293064, "grad_norm": 1.171875, "learning_rate": 1.58025e-05, "loss": 0.2282, "step": 1680 }, { "epoch": 0.03709954037181862, "grad_norm": 0.78515625, "learning_rate": 1.5777500000000003e-05, "loss": 0.1799, "step": 1690 }, { "epoch": 0.03731906427934417, "grad_norm": 0.81640625, "learning_rate": 1.5752500000000002e-05, "loss": 0.1936, "step": 1700 }, { "epoch": 0.037538588186869726, "grad_norm": 0.74609375, "learning_rate": 1.57275e-05, "loss": 0.1886, "step": 1710 }, { "epoch": 0.03775811209439528, "grad_norm": 0.765625, "learning_rate": 1.57025e-05, "loss": 0.1771, "step": 1720 }, { "epoch": 0.037977636001920834, "grad_norm": 0.84765625, "learning_rate": 1.56775e-05, "loss": 0.1853, "step": 1730 }, { "epoch": 0.03819715990944639, "grad_norm": 0.90625, "learning_rate": 1.5652500000000003e-05, "loss": 0.1987, "step": 1740 }, { "epoch": 0.03841668381697194, "grad_norm": 0.734375, "learning_rate": 1.5627500000000002e-05, "loss": 0.2049, "step": 1750 }, { "epoch": 0.038636207724497496, "grad_norm": 0.69921875, "learning_rate": 1.5602500000000002e-05, "loss": 0.1691, "step": 1760 }, { "epoch": 0.03885573163202305, "grad_norm": 0.83984375, "learning_rate": 1.55775e-05, "loss": 0.1981, "step": 1770 }, { "epoch": 0.0390752555395486, "grad_norm": 0.69140625, "learning_rate": 1.55525e-05, "loss": 0.1842, "step": 1780 }, { "epoch": 0.03929477944707416, "grad_norm": 1.0546875, "learning_rate": 1.5527500000000003e-05, "loss": 0.1696, "step": 1790 }, { "epoch": 0.03951430335459971, "grad_norm": 1.0234375, "learning_rate": 1.55025e-05, "loss": 0.2018, "step": 1800 }, { "epoch": 0.039733827262125265, "grad_norm": 0.6953125, "learning_rate": 1.5477500000000002e-05, "loss": 0.201, "step": 1810 }, { "epoch": 0.03995335116965082, "grad_norm": 0.65234375, "learning_rate": 1.54525e-05, "loss": 0.1855, "step": 1820 }, { "epoch": 0.04017287507717637, "grad_norm": 0.7578125, "learning_rate": 1.54275e-05, "loss": 0.1679, "step": 1830 }, { "epoch": 0.04039239898470193, "grad_norm": 0.875, "learning_rate": 1.54025e-05, "loss": 0.1916, "step": 1840 }, { "epoch": 0.04061192289222748, "grad_norm": 0.75, "learning_rate": 1.53775e-05, "loss": 0.1715, "step": 1850 }, { "epoch": 0.040831446799753035, "grad_norm": 0.68359375, "learning_rate": 1.5352500000000003e-05, "loss": 0.1899, "step": 1860 }, { "epoch": 0.04105097070727859, "grad_norm": 0.78515625, "learning_rate": 1.5327500000000002e-05, "loss": 0.193, "step": 1870 }, { "epoch": 0.04127049461480414, "grad_norm": 0.7109375, "learning_rate": 1.53025e-05, "loss": 0.1822, "step": 1880 }, { "epoch": 0.0414900185223297, "grad_norm": 0.66015625, "learning_rate": 1.52775e-05, "loss": 0.1502, "step": 1890 }, { "epoch": 0.04170954242985525, "grad_norm": 0.9140625, "learning_rate": 1.5252500000000002e-05, "loss": 0.1909, "step": 1900 }, { "epoch": 0.041929066337380805, "grad_norm": 0.73046875, "learning_rate": 1.5227500000000001e-05, "loss": 0.1853, "step": 1910 }, { "epoch": 0.04214859024490636, "grad_norm": 0.7578125, "learning_rate": 1.5202500000000002e-05, "loss": 0.1905, "step": 1920 }, { "epoch": 0.04236811415243191, "grad_norm": 0.80078125, "learning_rate": 1.51775e-05, "loss": 0.1877, "step": 1930 }, { "epoch": 0.04258763805995747, "grad_norm": 0.5859375, "learning_rate": 1.5152500000000001e-05, "loss": 0.1947, "step": 1940 }, { "epoch": 0.04280716196748302, "grad_norm": 0.76953125, "learning_rate": 1.5127500000000002e-05, "loss": 0.1985, "step": 1950 }, { "epoch": 0.043026685875008575, "grad_norm": 0.71875, "learning_rate": 1.5102500000000002e-05, "loss": 0.1846, "step": 1960 }, { "epoch": 0.04324620978253413, "grad_norm": 0.734375, "learning_rate": 1.5077500000000001e-05, "loss": 0.2175, "step": 1970 }, { "epoch": 0.04346573369005968, "grad_norm": 0.7109375, "learning_rate": 1.50525e-05, "loss": 0.203, "step": 1980 }, { "epoch": 0.043685257597585236, "grad_norm": 0.609375, "learning_rate": 1.5027500000000001e-05, "loss": 0.1688, "step": 1990 }, { "epoch": 0.04390478150511079, "grad_norm": 0.9375, "learning_rate": 1.5002500000000002e-05, "loss": 0.1951, "step": 2000 }, { "epoch": 0.044124305412636344, "grad_norm": 0.91796875, "learning_rate": 1.49775e-05, "loss": 0.1931, "step": 2010 }, { "epoch": 0.0443438293201619, "grad_norm": 0.671875, "learning_rate": 1.4952500000000001e-05, "loss": 0.2017, "step": 2020 }, { "epoch": 0.04456335322768745, "grad_norm": 0.98828125, "learning_rate": 1.49275e-05, "loss": 0.2095, "step": 2030 }, { "epoch": 0.044782877135213006, "grad_norm": 0.7265625, "learning_rate": 1.4902500000000002e-05, "loss": 0.1814, "step": 2040 }, { "epoch": 0.04500240104273856, "grad_norm": 0.8828125, "learning_rate": 1.48775e-05, "loss": 0.1912, "step": 2050 }, { "epoch": 0.045221924950264114, "grad_norm": 0.71875, "learning_rate": 1.48525e-05, "loss": 0.1799, "step": 2060 }, { "epoch": 0.04544144885778967, "grad_norm": 0.64453125, "learning_rate": 1.4827500000000002e-05, "loss": 0.1754, "step": 2070 }, { "epoch": 0.04566097276531522, "grad_norm": 0.72265625, "learning_rate": 1.4802500000000003e-05, "loss": 0.2019, "step": 2080 }, { "epoch": 0.045880496672840776, "grad_norm": 0.7109375, "learning_rate": 1.47775e-05, "loss": 0.1767, "step": 2090 }, { "epoch": 0.04610002058036633, "grad_norm": 0.71875, "learning_rate": 1.4752500000000001e-05, "loss": 0.1762, "step": 2100 }, { "epoch": 0.046319544487891884, "grad_norm": 0.74609375, "learning_rate": 1.4727500000000001e-05, "loss": 0.2086, "step": 2110 }, { "epoch": 0.04653906839541744, "grad_norm": 1.03125, "learning_rate": 1.4702500000000002e-05, "loss": 0.2083, "step": 2120 }, { "epoch": 0.04675859230294299, "grad_norm": 0.80859375, "learning_rate": 1.4677500000000003e-05, "loss": 0.1774, "step": 2130 }, { "epoch": 0.046978116210468546, "grad_norm": 0.78125, "learning_rate": 1.46525e-05, "loss": 0.1908, "step": 2140 }, { "epoch": 0.0471976401179941, "grad_norm": 0.91015625, "learning_rate": 1.4627500000000002e-05, "loss": 0.2223, "step": 2150 }, { "epoch": 0.047417164025519654, "grad_norm": 1.046875, "learning_rate": 1.4602500000000001e-05, "loss": 0.2084, "step": 2160 }, { "epoch": 0.04763668793304521, "grad_norm": 1.0859375, "learning_rate": 1.4577500000000002e-05, "loss": 0.2029, "step": 2170 }, { "epoch": 0.04785621184057076, "grad_norm": 0.78515625, "learning_rate": 1.45525e-05, "loss": 0.1641, "step": 2180 }, { "epoch": 0.048075735748096315, "grad_norm": 0.83203125, "learning_rate": 1.4527500000000001e-05, "loss": 0.2053, "step": 2190 }, { "epoch": 0.04829525965562187, "grad_norm": 0.80859375, "learning_rate": 1.4502500000000002e-05, "loss": 0.1783, "step": 2200 }, { "epoch": 0.04851478356314742, "grad_norm": 0.984375, "learning_rate": 1.4477500000000002e-05, "loss": 0.1733, "step": 2210 }, { "epoch": 0.04873430747067298, "grad_norm": 0.80859375, "learning_rate": 1.4452500000000001e-05, "loss": 0.1852, "step": 2220 }, { "epoch": 0.04895383137819853, "grad_norm": 0.76953125, "learning_rate": 1.44275e-05, "loss": 0.1626, "step": 2230 }, { "epoch": 0.049173355285724085, "grad_norm": 0.76953125, "learning_rate": 1.4402500000000001e-05, "loss": 0.1804, "step": 2240 }, { "epoch": 0.04939287919324964, "grad_norm": 0.8203125, "learning_rate": 1.4377500000000003e-05, "loss": 0.1895, "step": 2250 }, { "epoch": 0.04961240310077519, "grad_norm": 0.8984375, "learning_rate": 1.43525e-05, "loss": 0.1911, "step": 2260 }, { "epoch": 0.04983192700830075, "grad_norm": 0.9765625, "learning_rate": 1.4327500000000001e-05, "loss": 0.1903, "step": 2270 }, { "epoch": 0.0500514509158263, "grad_norm": 0.890625, "learning_rate": 1.43025e-05, "loss": 0.2024, "step": 2280 }, { "epoch": 0.050270974823351855, "grad_norm": 0.6171875, "learning_rate": 1.4277500000000002e-05, "loss": 0.1669, "step": 2290 }, { "epoch": 0.05049049873087741, "grad_norm": 0.99609375, "learning_rate": 1.42525e-05, "loss": 0.1819, "step": 2300 }, { "epoch": 0.05071002263840296, "grad_norm": 0.7734375, "learning_rate": 1.42275e-05, "loss": 0.1808, "step": 2310 }, { "epoch": 0.05092954654592852, "grad_norm": 0.57421875, "learning_rate": 1.4202500000000002e-05, "loss": 0.1809, "step": 2320 }, { "epoch": 0.05114907045345407, "grad_norm": 0.94921875, "learning_rate": 1.4177500000000001e-05, "loss": 0.177, "step": 2330 }, { "epoch": 0.051368594360979625, "grad_norm": 0.671875, "learning_rate": 1.41525e-05, "loss": 0.1882, "step": 2340 }, { "epoch": 0.05158811826850518, "grad_norm": 0.79296875, "learning_rate": 1.41275e-05, "loss": 0.1705, "step": 2350 }, { "epoch": 0.05180764217603073, "grad_norm": 0.75390625, "learning_rate": 1.4102500000000001e-05, "loss": 0.1671, "step": 2360 }, { "epoch": 0.05202716608355629, "grad_norm": 0.79296875, "learning_rate": 1.4077500000000002e-05, "loss": 0.1622, "step": 2370 }, { "epoch": 0.05224668999108184, "grad_norm": 0.64453125, "learning_rate": 1.4052500000000001e-05, "loss": 0.1847, "step": 2380 }, { "epoch": 0.052466213898607394, "grad_norm": 0.75390625, "learning_rate": 1.40275e-05, "loss": 0.1789, "step": 2390 }, { "epoch": 0.05268573780613295, "grad_norm": 0.66015625, "learning_rate": 1.4002500000000002e-05, "loss": 0.1815, "step": 2400 }, { "epoch": 0.0529052617136585, "grad_norm": 0.94921875, "learning_rate": 1.3977500000000001e-05, "loss": 0.2352, "step": 2410 }, { "epoch": 0.053124785621184056, "grad_norm": 0.69140625, "learning_rate": 1.3952500000000002e-05, "loss": 0.1681, "step": 2420 }, { "epoch": 0.05334430952870961, "grad_norm": 0.828125, "learning_rate": 1.39275e-05, "loss": 0.2157, "step": 2430 }, { "epoch": 0.053563833436235164, "grad_norm": 0.76953125, "learning_rate": 1.3902500000000001e-05, "loss": 0.1829, "step": 2440 }, { "epoch": 0.05378335734376072, "grad_norm": 0.89453125, "learning_rate": 1.3877500000000002e-05, "loss": 0.1778, "step": 2450 }, { "epoch": 0.05400288125128627, "grad_norm": 0.66015625, "learning_rate": 1.3852500000000002e-05, "loss": 0.2, "step": 2460 }, { "epoch": 0.054222405158811826, "grad_norm": 0.75, "learning_rate": 1.3827500000000001e-05, "loss": 0.1975, "step": 2470 }, { "epoch": 0.05444192906633738, "grad_norm": 0.87109375, "learning_rate": 1.38025e-05, "loss": 0.1732, "step": 2480 }, { "epoch": 0.054661452973862934, "grad_norm": 0.6796875, "learning_rate": 1.3777500000000001e-05, "loss": 0.2128, "step": 2490 }, { "epoch": 0.05488097688138849, "grad_norm": 0.7578125, "learning_rate": 1.3752500000000003e-05, "loss": 0.2146, "step": 2500 }, { "epoch": 0.05510050078891404, "grad_norm": 0.73046875, "learning_rate": 1.37275e-05, "loss": 0.1893, "step": 2510 }, { "epoch": 0.055320024696439596, "grad_norm": 0.76953125, "learning_rate": 1.3702500000000001e-05, "loss": 0.2053, "step": 2520 }, { "epoch": 0.05553954860396515, "grad_norm": 0.71875, "learning_rate": 1.36775e-05, "loss": 0.1867, "step": 2530 }, { "epoch": 0.055759072511490704, "grad_norm": 0.83203125, "learning_rate": 1.3652500000000002e-05, "loss": 0.2008, "step": 2540 }, { "epoch": 0.05597859641901626, "grad_norm": 0.86328125, "learning_rate": 1.36275e-05, "loss": 0.1968, "step": 2550 }, { "epoch": 0.05619812032654181, "grad_norm": 1.03125, "learning_rate": 1.36025e-05, "loss": 0.199, "step": 2560 }, { "epoch": 0.056417644234067366, "grad_norm": 0.58984375, "learning_rate": 1.3577500000000002e-05, "loss": 0.1708, "step": 2570 }, { "epoch": 0.05663716814159292, "grad_norm": 0.83203125, "learning_rate": 1.3552500000000001e-05, "loss": 0.1923, "step": 2580 }, { "epoch": 0.056856692049118474, "grad_norm": 0.8046875, "learning_rate": 1.35275e-05, "loss": 0.2035, "step": 2590 }, { "epoch": 0.05707621595664403, "grad_norm": 1.03125, "learning_rate": 1.35025e-05, "loss": 0.2349, "step": 2600 }, { "epoch": 0.05729573986416958, "grad_norm": 0.74609375, "learning_rate": 1.3477500000000001e-05, "loss": 0.1965, "step": 2610 }, { "epoch": 0.057515263771695135, "grad_norm": 0.6796875, "learning_rate": 1.3452500000000002e-05, "loss": 0.1683, "step": 2620 }, { "epoch": 0.05773478767922069, "grad_norm": 1.015625, "learning_rate": 1.3427500000000001e-05, "loss": 0.2099, "step": 2630 }, { "epoch": 0.05795431158674624, "grad_norm": 0.80859375, "learning_rate": 1.3402500000000001e-05, "loss": 0.1802, "step": 2640 }, { "epoch": 0.0581738354942718, "grad_norm": 0.88671875, "learning_rate": 1.33775e-05, "loss": 0.1875, "step": 2650 }, { "epoch": 0.05839335940179735, "grad_norm": 0.7578125, "learning_rate": 1.3352500000000001e-05, "loss": 0.1849, "step": 2660 }, { "epoch": 0.058612883309322905, "grad_norm": 0.80859375, "learning_rate": 1.3327500000000002e-05, "loss": 0.1549, "step": 2670 }, { "epoch": 0.05883240721684846, "grad_norm": 0.87109375, "learning_rate": 1.33025e-05, "loss": 0.1936, "step": 2680 }, { "epoch": 0.05905193112437401, "grad_norm": 0.80078125, "learning_rate": 1.3277500000000001e-05, "loss": 0.1891, "step": 2690 }, { "epoch": 0.05927145503189957, "grad_norm": 0.75, "learning_rate": 1.3252500000000002e-05, "loss": 0.1879, "step": 2700 }, { "epoch": 0.05949097893942512, "grad_norm": 0.71484375, "learning_rate": 1.3227500000000002e-05, "loss": 0.1838, "step": 2710 }, { "epoch": 0.059710502846950675, "grad_norm": 0.67578125, "learning_rate": 1.3202500000000001e-05, "loss": 0.1807, "step": 2720 }, { "epoch": 0.05993002675447623, "grad_norm": 0.58203125, "learning_rate": 1.31775e-05, "loss": 0.1519, "step": 2730 }, { "epoch": 0.06014955066200178, "grad_norm": 0.87109375, "learning_rate": 1.3152500000000002e-05, "loss": 0.1843, "step": 2740 }, { "epoch": 0.06036907456952734, "grad_norm": 0.8515625, "learning_rate": 1.3127500000000003e-05, "loss": 0.2063, "step": 2750 }, { "epoch": 0.06058859847705289, "grad_norm": 0.7265625, "learning_rate": 1.31025e-05, "loss": 0.1785, "step": 2760 }, { "epoch": 0.060808122384578445, "grad_norm": 0.71875, "learning_rate": 1.3077500000000001e-05, "loss": 0.1707, "step": 2770 }, { "epoch": 0.061027646292104, "grad_norm": 1.03125, "learning_rate": 1.30525e-05, "loss": 0.1761, "step": 2780 }, { "epoch": 0.06124717019962955, "grad_norm": 0.67578125, "learning_rate": 1.3027500000000002e-05, "loss": 0.1782, "step": 2790 }, { "epoch": 0.061466694107155107, "grad_norm": 0.8671875, "learning_rate": 1.30025e-05, "loss": 0.1988, "step": 2800 }, { "epoch": 0.06168621801468066, "grad_norm": 0.78125, "learning_rate": 1.29775e-05, "loss": 0.1701, "step": 2810 }, { "epoch": 0.061905741922206214, "grad_norm": 0.83984375, "learning_rate": 1.2952500000000002e-05, "loss": 0.2079, "step": 2820 }, { "epoch": 0.06212526582973177, "grad_norm": 0.9609375, "learning_rate": 1.2927500000000001e-05, "loss": 0.1982, "step": 2830 }, { "epoch": 0.06234478973725732, "grad_norm": 0.68359375, "learning_rate": 1.29025e-05, "loss": 0.1798, "step": 2840 }, { "epoch": 0.06256431364478288, "grad_norm": 0.953125, "learning_rate": 1.28775e-05, "loss": 0.1582, "step": 2850 }, { "epoch": 0.06278383755230843, "grad_norm": 0.81640625, "learning_rate": 1.2852500000000001e-05, "loss": 0.2233, "step": 2860 }, { "epoch": 0.06300336145983398, "grad_norm": 0.875, "learning_rate": 1.2827500000000002e-05, "loss": 0.1952, "step": 2870 }, { "epoch": 0.06322288536735954, "grad_norm": 0.83984375, "learning_rate": 1.2802500000000002e-05, "loss": 0.1854, "step": 2880 }, { "epoch": 0.06344240927488509, "grad_norm": 0.671875, "learning_rate": 1.2777500000000001e-05, "loss": 0.205, "step": 2890 }, { "epoch": 0.06366193318241065, "grad_norm": 0.8125, "learning_rate": 1.27525e-05, "loss": 0.1899, "step": 2900 }, { "epoch": 0.0638814570899362, "grad_norm": 0.7109375, "learning_rate": 1.2727500000000001e-05, "loss": 0.1874, "step": 2910 }, { "epoch": 0.06410098099746175, "grad_norm": 0.6875, "learning_rate": 1.2702500000000002e-05, "loss": 0.186, "step": 2920 }, { "epoch": 0.06432050490498731, "grad_norm": 0.56640625, "learning_rate": 1.26775e-05, "loss": 0.1619, "step": 2930 }, { "epoch": 0.06454002881251286, "grad_norm": 0.64453125, "learning_rate": 1.2652500000000001e-05, "loss": 0.1959, "step": 2940 }, { "epoch": 0.06475955272003842, "grad_norm": 0.76953125, "learning_rate": 1.26275e-05, "loss": 0.1919, "step": 2950 }, { "epoch": 0.06497907662756397, "grad_norm": 1.25, "learning_rate": 1.2602500000000002e-05, "loss": 0.1998, "step": 2960 }, { "epoch": 0.06519860053508952, "grad_norm": 0.83984375, "learning_rate": 1.25775e-05, "loss": 0.2058, "step": 2970 }, { "epoch": 0.06541812444261508, "grad_norm": 0.55859375, "learning_rate": 1.25525e-05, "loss": 0.1861, "step": 2980 }, { "epoch": 0.06563764835014063, "grad_norm": 0.84765625, "learning_rate": 1.2527500000000002e-05, "loss": 0.2199, "step": 2990 }, { "epoch": 0.06585717225766619, "grad_norm": 0.73046875, "learning_rate": 1.2502500000000003e-05, "loss": 0.1804, "step": 3000 }, { "epoch": 0.06607669616519174, "grad_norm": 0.5703125, "learning_rate": 1.24775e-05, "loss": 0.1742, "step": 3010 }, { "epoch": 0.0662962200727173, "grad_norm": 0.73828125, "learning_rate": 1.2452500000000001e-05, "loss": 0.1782, "step": 3020 }, { "epoch": 0.06651574398024285, "grad_norm": 0.921875, "learning_rate": 1.2427500000000001e-05, "loss": 0.1822, "step": 3030 }, { "epoch": 0.0667352678877684, "grad_norm": 0.6640625, "learning_rate": 1.2402500000000002e-05, "loss": 0.1902, "step": 3040 }, { "epoch": 0.06695479179529396, "grad_norm": 2.34375, "learning_rate": 1.23775e-05, "loss": 0.1937, "step": 3050 }, { "epoch": 0.06717431570281951, "grad_norm": 0.58203125, "learning_rate": 1.23525e-05, "loss": 0.1919, "step": 3060 }, { "epoch": 0.06739383961034506, "grad_norm": 5.625, "learning_rate": 1.2327500000000002e-05, "loss": 0.1763, "step": 3070 }, { "epoch": 0.06761336351787062, "grad_norm": 0.7578125, "learning_rate": 1.2302500000000001e-05, "loss": 0.1956, "step": 3080 }, { "epoch": 0.06783288742539617, "grad_norm": 1.171875, "learning_rate": 1.22775e-05, "loss": 0.2159, "step": 3090 }, { "epoch": 0.06805241133292173, "grad_norm": 0.76171875, "learning_rate": 1.22525e-05, "loss": 0.1775, "step": 3100 }, { "epoch": 0.06827193524044728, "grad_norm": 0.71875, "learning_rate": 1.2227500000000001e-05, "loss": 0.1617, "step": 3110 }, { "epoch": 0.06849145914797283, "grad_norm": 0.81640625, "learning_rate": 1.2202500000000002e-05, "loss": 0.1725, "step": 3120 }, { "epoch": 0.06871098305549839, "grad_norm": 0.78125, "learning_rate": 1.2177500000000002e-05, "loss": 0.2037, "step": 3130 }, { "epoch": 0.06893050696302394, "grad_norm": 0.9609375, "learning_rate": 1.2152500000000001e-05, "loss": 0.21, "step": 3140 }, { "epoch": 0.0691500308705495, "grad_norm": 0.6796875, "learning_rate": 1.21275e-05, "loss": 0.1816, "step": 3150 }, { "epoch": 0.06936955477807505, "grad_norm": 0.83984375, "learning_rate": 1.2102500000000001e-05, "loss": 0.1811, "step": 3160 }, { "epoch": 0.0695890786856006, "grad_norm": 0.8828125, "learning_rate": 1.2077500000000003e-05, "loss": 0.1659, "step": 3170 }, { "epoch": 0.06980860259312616, "grad_norm": 0.83203125, "learning_rate": 1.20525e-05, "loss": 0.1924, "step": 3180 }, { "epoch": 0.07002812650065171, "grad_norm": 0.796875, "learning_rate": 1.2027500000000001e-05, "loss": 0.1891, "step": 3190 }, { "epoch": 0.07024765040817726, "grad_norm": 0.82421875, "learning_rate": 1.20025e-05, "loss": 0.1836, "step": 3200 }, { "epoch": 0.07046717431570282, "grad_norm": 0.71875, "learning_rate": 1.1977500000000002e-05, "loss": 0.2071, "step": 3210 }, { "epoch": 0.07068669822322837, "grad_norm": 0.80859375, "learning_rate": 1.19525e-05, "loss": 0.2144, "step": 3220 }, { "epoch": 0.07090622213075393, "grad_norm": 0.74609375, "learning_rate": 1.19275e-05, "loss": 0.1881, "step": 3230 }, { "epoch": 0.07112574603827948, "grad_norm": 0.7890625, "learning_rate": 1.1902500000000002e-05, "loss": 0.1751, "step": 3240 }, { "epoch": 0.07134526994580503, "grad_norm": 0.78515625, "learning_rate": 1.1877500000000001e-05, "loss": 0.1844, "step": 3250 }, { "epoch": 0.07156479385333059, "grad_norm": 1.0, "learning_rate": 1.18525e-05, "loss": 0.177, "step": 3260 }, { "epoch": 0.07178431776085614, "grad_norm": 0.84375, "learning_rate": 1.18275e-05, "loss": 0.1831, "step": 3270 }, { "epoch": 0.0720038416683817, "grad_norm": 0.79296875, "learning_rate": 1.1802500000000001e-05, "loss": 0.1793, "step": 3280 }, { "epoch": 0.07222336557590725, "grad_norm": 0.7578125, "learning_rate": 1.1777500000000002e-05, "loss": 0.1824, "step": 3290 }, { "epoch": 0.0724428894834328, "grad_norm": 0.875, "learning_rate": 1.17525e-05, "loss": 0.2064, "step": 3300 }, { "epoch": 0.07266241339095836, "grad_norm": 0.85546875, "learning_rate": 1.17275e-05, "loss": 0.1824, "step": 3310 }, { "epoch": 0.07288193729848391, "grad_norm": 0.55859375, "learning_rate": 1.1702500000000002e-05, "loss": 0.1792, "step": 3320 }, { "epoch": 0.07310146120600947, "grad_norm": 1.0703125, "learning_rate": 1.1677500000000001e-05, "loss": 0.1882, "step": 3330 }, { "epoch": 0.07332098511353502, "grad_norm": 0.73828125, "learning_rate": 1.16525e-05, "loss": 0.1945, "step": 3340 }, { "epoch": 0.07354050902106057, "grad_norm": 0.82421875, "learning_rate": 1.16275e-05, "loss": 0.2059, "step": 3350 }, { "epoch": 0.07376003292858613, "grad_norm": 0.8515625, "learning_rate": 1.1602500000000001e-05, "loss": 0.2108, "step": 3360 }, { "epoch": 0.07397955683611168, "grad_norm": 0.87890625, "learning_rate": 1.1577500000000002e-05, "loss": 0.1833, "step": 3370 }, { "epoch": 0.07419908074363724, "grad_norm": 0.96484375, "learning_rate": 1.1552500000000002e-05, "loss": 0.1719, "step": 3380 }, { "epoch": 0.07441860465116279, "grad_norm": 0.7578125, "learning_rate": 1.1527500000000001e-05, "loss": 0.191, "step": 3390 }, { "epoch": 0.07463812855868834, "grad_norm": 0.67578125, "learning_rate": 1.15025e-05, "loss": 0.1933, "step": 3400 }, { "epoch": 0.0748576524662139, "grad_norm": 0.73046875, "learning_rate": 1.1477500000000001e-05, "loss": 0.2048, "step": 3410 }, { "epoch": 0.07507717637373945, "grad_norm": 0.78515625, "learning_rate": 1.1452500000000003e-05, "loss": 0.1669, "step": 3420 }, { "epoch": 0.075296700281265, "grad_norm": 0.7109375, "learning_rate": 1.14275e-05, "loss": 0.1684, "step": 3430 }, { "epoch": 0.07551622418879056, "grad_norm": 0.64453125, "learning_rate": 1.1402500000000001e-05, "loss": 0.1744, "step": 3440 }, { "epoch": 0.07573574809631611, "grad_norm": 0.7578125, "learning_rate": 1.13775e-05, "loss": 0.1822, "step": 3450 }, { "epoch": 0.07595527200384167, "grad_norm": 0.66015625, "learning_rate": 1.1352500000000002e-05, "loss": 0.1808, "step": 3460 }, { "epoch": 0.07617479591136722, "grad_norm": 0.65234375, "learning_rate": 1.13275e-05, "loss": 0.1872, "step": 3470 }, { "epoch": 0.07639431981889278, "grad_norm": 0.74609375, "learning_rate": 1.13025e-05, "loss": 0.192, "step": 3480 }, { "epoch": 0.07661384372641833, "grad_norm": 0.76171875, "learning_rate": 1.1277500000000002e-05, "loss": 0.219, "step": 3490 }, { "epoch": 0.07683336763394388, "grad_norm": 0.984375, "learning_rate": 1.1252500000000001e-05, "loss": 0.2029, "step": 3500 }, { "epoch": 0.07705289154146944, "grad_norm": 1.421875, "learning_rate": 1.12275e-05, "loss": 0.1787, "step": 3510 }, { "epoch": 0.07727241544899499, "grad_norm": 0.5859375, "learning_rate": 1.12025e-05, "loss": 0.1613, "step": 3520 }, { "epoch": 0.07749193935652054, "grad_norm": 1.0546875, "learning_rate": 1.1177500000000001e-05, "loss": 0.1823, "step": 3530 }, { "epoch": 0.0777114632640461, "grad_norm": 0.84375, "learning_rate": 1.1152500000000002e-05, "loss": 0.1823, "step": 3540 }, { "epoch": 0.07793098717157165, "grad_norm": 0.7421875, "learning_rate": 1.11275e-05, "loss": 0.194, "step": 3550 }, { "epoch": 0.0781505110790972, "grad_norm": 0.921875, "learning_rate": 1.1102500000000001e-05, "loss": 0.202, "step": 3560 }, { "epoch": 0.07837003498662276, "grad_norm": 0.69921875, "learning_rate": 1.10775e-05, "loss": 0.1599, "step": 3570 }, { "epoch": 0.07858955889414831, "grad_norm": 1.1484375, "learning_rate": 1.1052500000000001e-05, "loss": 0.183, "step": 3580 }, { "epoch": 0.07880908280167387, "grad_norm": 0.8203125, "learning_rate": 1.1027499999999999e-05, "loss": 0.1697, "step": 3590 }, { "epoch": 0.07902860670919942, "grad_norm": 1.1484375, "learning_rate": 1.10025e-05, "loss": 0.1872, "step": 3600 }, { "epoch": 0.07924813061672498, "grad_norm": 0.890625, "learning_rate": 1.0977500000000001e-05, "loss": 0.2008, "step": 3610 }, { "epoch": 0.07946765452425053, "grad_norm": 1.1015625, "learning_rate": 1.0952500000000002e-05, "loss": 0.1857, "step": 3620 }, { "epoch": 0.07968717843177608, "grad_norm": 0.79296875, "learning_rate": 1.0927500000000002e-05, "loss": 0.2013, "step": 3630 }, { "epoch": 0.07990670233930164, "grad_norm": 0.73828125, "learning_rate": 1.0902500000000001e-05, "loss": 0.1996, "step": 3640 }, { "epoch": 0.08012622624682719, "grad_norm": 0.65234375, "learning_rate": 1.08775e-05, "loss": 0.1866, "step": 3650 }, { "epoch": 0.08034575015435275, "grad_norm": 0.953125, "learning_rate": 1.0852500000000002e-05, "loss": 0.1854, "step": 3660 }, { "epoch": 0.0805652740618783, "grad_norm": 0.73828125, "learning_rate": 1.0827500000000003e-05, "loss": 0.2041, "step": 3670 }, { "epoch": 0.08078479796940385, "grad_norm": 1.046875, "learning_rate": 1.08025e-05, "loss": 0.2103, "step": 3680 }, { "epoch": 0.08100432187692941, "grad_norm": 0.76953125, "learning_rate": 1.0777500000000001e-05, "loss": 0.1689, "step": 3690 }, { "epoch": 0.08122384578445496, "grad_norm": 0.9140625, "learning_rate": 1.07525e-05, "loss": 0.2075, "step": 3700 }, { "epoch": 0.08144336969198052, "grad_norm": 1.015625, "learning_rate": 1.0727500000000002e-05, "loss": 0.1958, "step": 3710 }, { "epoch": 0.08166289359950607, "grad_norm": 0.78515625, "learning_rate": 1.07025e-05, "loss": 0.2133, "step": 3720 }, { "epoch": 0.08188241750703162, "grad_norm": 0.703125, "learning_rate": 1.06775e-05, "loss": 0.2199, "step": 3730 }, { "epoch": 0.08210194141455718, "grad_norm": 0.640625, "learning_rate": 1.0652500000000002e-05, "loss": 0.1816, "step": 3740 }, { "epoch": 0.08232146532208273, "grad_norm": 0.75, "learning_rate": 1.0627500000000001e-05, "loss": 0.1618, "step": 3750 }, { "epoch": 0.08254098922960829, "grad_norm": 0.6796875, "learning_rate": 1.06025e-05, "loss": 0.2193, "step": 3760 }, { "epoch": 0.08276051313713384, "grad_norm": 0.71875, "learning_rate": 1.05775e-05, "loss": 0.1626, "step": 3770 }, { "epoch": 0.0829800370446594, "grad_norm": 1.015625, "learning_rate": 1.0552500000000001e-05, "loss": 0.1873, "step": 3780 }, { "epoch": 0.08319956095218495, "grad_norm": 0.7578125, "learning_rate": 1.0527500000000002e-05, "loss": 0.2129, "step": 3790 }, { "epoch": 0.0834190848597105, "grad_norm": 0.921875, "learning_rate": 1.05025e-05, "loss": 0.1906, "step": 3800 }, { "epoch": 0.08363860876723606, "grad_norm": 0.71484375, "learning_rate": 1.0477500000000001e-05, "loss": 0.1711, "step": 3810 }, { "epoch": 0.08385813267476161, "grad_norm": 1.1640625, "learning_rate": 1.04525e-05, "loss": 0.1813, "step": 3820 }, { "epoch": 0.08407765658228716, "grad_norm": 0.8671875, "learning_rate": 1.0427500000000001e-05, "loss": 0.1996, "step": 3830 }, { "epoch": 0.08429718048981272, "grad_norm": 0.68359375, "learning_rate": 1.0402499999999999e-05, "loss": 0.1536, "step": 3840 }, { "epoch": 0.08451670439733827, "grad_norm": 0.7265625, "learning_rate": 1.03775e-05, "loss": 0.1617, "step": 3850 }, { "epoch": 0.08473622830486383, "grad_norm": 0.90234375, "learning_rate": 1.0352500000000001e-05, "loss": 0.1874, "step": 3860 }, { "epoch": 0.08495575221238938, "grad_norm": 0.79296875, "learning_rate": 1.03275e-05, "loss": 0.2133, "step": 3870 }, { "epoch": 0.08517527611991493, "grad_norm": 0.9296875, "learning_rate": 1.0302500000000002e-05, "loss": 0.1796, "step": 3880 }, { "epoch": 0.08539480002744049, "grad_norm": 0.671875, "learning_rate": 1.02775e-05, "loss": 0.1698, "step": 3890 }, { "epoch": 0.08561432393496604, "grad_norm": 0.765625, "learning_rate": 1.02525e-05, "loss": 0.1938, "step": 3900 }, { "epoch": 0.0858338478424916, "grad_norm": 0.83203125, "learning_rate": 1.0227500000000002e-05, "loss": 0.1816, "step": 3910 }, { "epoch": 0.08605337175001715, "grad_norm": 0.7890625, "learning_rate": 1.0202500000000003e-05, "loss": 0.1677, "step": 3920 }, { "epoch": 0.0862728956575427, "grad_norm": 0.65234375, "learning_rate": 1.01775e-05, "loss": 0.1956, "step": 3930 }, { "epoch": 0.08649241956506826, "grad_norm": 0.80859375, "learning_rate": 1.0152500000000001e-05, "loss": 0.1769, "step": 3940 }, { "epoch": 0.08671194347259381, "grad_norm": 0.84765625, "learning_rate": 1.0127500000000001e-05, "loss": 0.2014, "step": 3950 }, { "epoch": 0.08693146738011936, "grad_norm": 0.6484375, "learning_rate": 1.0102500000000002e-05, "loss": 0.2199, "step": 3960 }, { "epoch": 0.08715099128764492, "grad_norm": 0.75, "learning_rate": 1.00775e-05, "loss": 0.1815, "step": 3970 }, { "epoch": 0.08737051519517047, "grad_norm": 0.8828125, "learning_rate": 1.00525e-05, "loss": 0.1797, "step": 3980 }, { "epoch": 0.08759003910269603, "grad_norm": 0.83984375, "learning_rate": 1.0027500000000002e-05, "loss": 0.2142, "step": 3990 }, { "epoch": 0.08780956301022158, "grad_norm": 0.8125, "learning_rate": 1.0002500000000001e-05, "loss": 0.2015, "step": 4000 }, { "epoch": 0.08802908691774713, "grad_norm": 0.703125, "learning_rate": 9.9775e-06, "loss": 0.1653, "step": 4010 }, { "epoch": 0.08824861082527269, "grad_norm": 0.76171875, "learning_rate": 9.9525e-06, "loss": 0.1784, "step": 4020 }, { "epoch": 0.08846813473279824, "grad_norm": 1.1015625, "learning_rate": 9.927500000000001e-06, "loss": 0.1915, "step": 4030 }, { "epoch": 0.0886876586403238, "grad_norm": 0.76953125, "learning_rate": 9.9025e-06, "loss": 0.1997, "step": 4040 }, { "epoch": 0.08890718254784935, "grad_norm": 0.7265625, "learning_rate": 9.877500000000002e-06, "loss": 0.1816, "step": 4050 }, { "epoch": 0.0891267064553749, "grad_norm": 0.7734375, "learning_rate": 9.852500000000001e-06, "loss": 0.1954, "step": 4060 }, { "epoch": 0.08934623036290046, "grad_norm": 0.63671875, "learning_rate": 9.8275e-06, "loss": 0.1532, "step": 4070 }, { "epoch": 0.08956575427042601, "grad_norm": 0.7265625, "learning_rate": 9.8025e-06, "loss": 0.2117, "step": 4080 }, { "epoch": 0.08978527817795157, "grad_norm": 0.7265625, "learning_rate": 9.7775e-06, "loss": 0.1957, "step": 4090 }, { "epoch": 0.09000480208547712, "grad_norm": 1.0546875, "learning_rate": 9.7525e-06, "loss": 0.1683, "step": 4100 }, { "epoch": 0.09022432599300267, "grad_norm": 0.671875, "learning_rate": 9.727500000000001e-06, "loss": 0.202, "step": 4110 }, { "epoch": 0.09044384990052823, "grad_norm": 0.92578125, "learning_rate": 9.7025e-06, "loss": 0.1806, "step": 4120 }, { "epoch": 0.09066337380805378, "grad_norm": 0.671875, "learning_rate": 9.6775e-06, "loss": 0.1941, "step": 4130 }, { "epoch": 0.09088289771557934, "grad_norm": 0.9453125, "learning_rate": 9.652500000000001e-06, "loss": 0.1824, "step": 4140 }, { "epoch": 0.09110242162310489, "grad_norm": 0.66015625, "learning_rate": 9.6275e-06, "loss": 0.1489, "step": 4150 }, { "epoch": 0.09132194553063044, "grad_norm": 0.78515625, "learning_rate": 9.602500000000002e-06, "loss": 0.1578, "step": 4160 }, { "epoch": 0.091541469438156, "grad_norm": 1.1015625, "learning_rate": 9.577500000000001e-06, "loss": 0.1837, "step": 4170 }, { "epoch": 0.09176099334568155, "grad_norm": 0.75, "learning_rate": 9.5525e-06, "loss": 0.1943, "step": 4180 }, { "epoch": 0.0919805172532071, "grad_norm": 0.6875, "learning_rate": 9.5275e-06, "loss": 0.1938, "step": 4190 }, { "epoch": 0.09220004116073266, "grad_norm": 0.890625, "learning_rate": 9.502500000000001e-06, "loss": 0.1722, "step": 4200 }, { "epoch": 0.09241956506825821, "grad_norm": 0.72265625, "learning_rate": 9.4775e-06, "loss": 0.1893, "step": 4210 }, { "epoch": 0.09263908897578377, "grad_norm": 0.8515625, "learning_rate": 9.452500000000001e-06, "loss": 0.1843, "step": 4220 }, { "epoch": 0.09285861288330932, "grad_norm": 0.75, "learning_rate": 9.4275e-06, "loss": 0.2091, "step": 4230 }, { "epoch": 0.09307813679083488, "grad_norm": 0.83984375, "learning_rate": 9.402500000000002e-06, "loss": 0.1744, "step": 4240 }, { "epoch": 0.09329766069836043, "grad_norm": 0.9921875, "learning_rate": 9.377500000000001e-06, "loss": 0.1934, "step": 4250 }, { "epoch": 0.09351718460588598, "grad_norm": 0.88671875, "learning_rate": 9.3525e-06, "loss": 0.1982, "step": 4260 }, { "epoch": 0.09373670851341154, "grad_norm": 0.60546875, "learning_rate": 9.3275e-06, "loss": 0.1705, "step": 4270 }, { "epoch": 0.09395623242093709, "grad_norm": 0.7890625, "learning_rate": 9.302500000000001e-06, "loss": 0.1701, "step": 4280 }, { "epoch": 0.09417575632846265, "grad_norm": 0.74609375, "learning_rate": 9.2775e-06, "loss": 0.1956, "step": 4290 }, { "epoch": 0.0943952802359882, "grad_norm": 0.6796875, "learning_rate": 9.252500000000002e-06, "loss": 0.1792, "step": 4300 }, { "epoch": 0.09461480414351375, "grad_norm": 0.90625, "learning_rate": 9.227500000000001e-06, "loss": 0.1744, "step": 4310 }, { "epoch": 0.09483432805103931, "grad_norm": 0.8203125, "learning_rate": 9.2025e-06, "loss": 0.2185, "step": 4320 }, { "epoch": 0.09505385195856486, "grad_norm": 0.9453125, "learning_rate": 9.1775e-06, "loss": 0.1655, "step": 4330 }, { "epoch": 0.09527337586609042, "grad_norm": 0.94921875, "learning_rate": 9.152500000000001e-06, "loss": 0.2083, "step": 4340 }, { "epoch": 0.09549289977361597, "grad_norm": 1.078125, "learning_rate": 9.1275e-06, "loss": 0.2039, "step": 4350 }, { "epoch": 0.09571242368114152, "grad_norm": 0.6171875, "learning_rate": 9.102500000000001e-06, "loss": 0.1923, "step": 4360 }, { "epoch": 0.09593194758866708, "grad_norm": 0.59765625, "learning_rate": 9.0775e-06, "loss": 0.2013, "step": 4370 }, { "epoch": 0.09615147149619263, "grad_norm": 0.6328125, "learning_rate": 9.0525e-06, "loss": 0.1856, "step": 4380 }, { "epoch": 0.09637099540371818, "grad_norm": 0.75, "learning_rate": 9.027500000000001e-06, "loss": 0.2029, "step": 4390 }, { "epoch": 0.09659051931124374, "grad_norm": 0.77734375, "learning_rate": 9.0025e-06, "loss": 0.1753, "step": 4400 }, { "epoch": 0.09681004321876929, "grad_norm": 0.62109375, "learning_rate": 8.977500000000002e-06, "loss": 0.1767, "step": 4410 }, { "epoch": 0.09702956712629485, "grad_norm": 0.64453125, "learning_rate": 8.952500000000001e-06, "loss": 0.1727, "step": 4420 }, { "epoch": 0.0972490910338204, "grad_norm": 0.71484375, "learning_rate": 8.9275e-06, "loss": 0.1802, "step": 4430 }, { "epoch": 0.09746861494134595, "grad_norm": 1.015625, "learning_rate": 8.9025e-06, "loss": 0.217, "step": 4440 }, { "epoch": 0.09768813884887151, "grad_norm": 0.79296875, "learning_rate": 8.877500000000001e-06, "loss": 0.1644, "step": 4450 }, { "epoch": 0.09790766275639706, "grad_norm": 0.7890625, "learning_rate": 8.8525e-06, "loss": 0.195, "step": 4460 }, { "epoch": 0.09812718666392262, "grad_norm": 0.68359375, "learning_rate": 8.827500000000001e-06, "loss": 0.1872, "step": 4470 }, { "epoch": 0.09834671057144817, "grad_norm": 0.83203125, "learning_rate": 8.802500000000001e-06, "loss": 0.196, "step": 4480 }, { "epoch": 0.09856623447897372, "grad_norm": 0.59765625, "learning_rate": 8.7775e-06, "loss": 0.186, "step": 4490 }, { "epoch": 0.09878575838649928, "grad_norm": 0.76171875, "learning_rate": 8.7525e-06, "loss": 0.1871, "step": 4500 }, { "epoch": 0.09900528229402483, "grad_norm": 0.84765625, "learning_rate": 8.7275e-06, "loss": 0.189, "step": 4510 }, { "epoch": 0.09922480620155039, "grad_norm": 0.6875, "learning_rate": 8.7025e-06, "loss": 0.1596, "step": 4520 }, { "epoch": 0.09944433010907594, "grad_norm": 0.96484375, "learning_rate": 8.677500000000001e-06, "loss": 0.1781, "step": 4530 }, { "epoch": 0.0996638540166015, "grad_norm": 0.93359375, "learning_rate": 8.6525e-06, "loss": 0.1879, "step": 4540 }, { "epoch": 0.09988337792412705, "grad_norm": 0.9765625, "learning_rate": 8.627500000000002e-06, "loss": 0.2002, "step": 4550 }, { "epoch": 0.1001029018316526, "grad_norm": 1.0078125, "learning_rate": 8.602500000000001e-06, "loss": 0.2004, "step": 4560 }, { "epoch": 0.10032242573917816, "grad_norm": 0.6328125, "learning_rate": 8.5775e-06, "loss": 0.1844, "step": 4570 }, { "epoch": 0.10054194964670371, "grad_norm": 0.56640625, "learning_rate": 8.5525e-06, "loss": 0.1701, "step": 4580 }, { "epoch": 0.10076147355422926, "grad_norm": 0.66015625, "learning_rate": 8.527500000000001e-06, "loss": 0.2176, "step": 4590 }, { "epoch": 0.10098099746175482, "grad_norm": 0.6796875, "learning_rate": 8.5025e-06, "loss": 0.1736, "step": 4600 }, { "epoch": 0.10120052136928037, "grad_norm": 0.6796875, "learning_rate": 8.477500000000001e-06, "loss": 0.1813, "step": 4610 }, { "epoch": 0.10142004527680593, "grad_norm": 0.96484375, "learning_rate": 8.4525e-06, "loss": 0.1878, "step": 4620 }, { "epoch": 0.10163956918433148, "grad_norm": 0.85546875, "learning_rate": 8.4275e-06, "loss": 0.1796, "step": 4630 }, { "epoch": 0.10185909309185703, "grad_norm": 0.8359375, "learning_rate": 8.402500000000001e-06, "loss": 0.1702, "step": 4640 }, { "epoch": 0.10207861699938259, "grad_norm": 0.7109375, "learning_rate": 8.3775e-06, "loss": 0.1689, "step": 4650 }, { "epoch": 0.10229814090690814, "grad_norm": 0.86328125, "learning_rate": 8.352500000000002e-06, "loss": 0.1922, "step": 4660 }, { "epoch": 0.1025176648144337, "grad_norm": 0.77734375, "learning_rate": 8.327500000000001e-06, "loss": 0.1813, "step": 4670 }, { "epoch": 0.10273718872195925, "grad_norm": 0.65625, "learning_rate": 8.3025e-06, "loss": 0.1486, "step": 4680 }, { "epoch": 0.1029567126294848, "grad_norm": 0.734375, "learning_rate": 8.2775e-06, "loss": 0.1606, "step": 4690 }, { "epoch": 0.10317623653701036, "grad_norm": 0.7890625, "learning_rate": 8.252500000000001e-06, "loss": 0.2014, "step": 4700 }, { "epoch": 0.10339576044453591, "grad_norm": 0.6015625, "learning_rate": 8.2275e-06, "loss": 0.1757, "step": 4710 }, { "epoch": 0.10361528435206147, "grad_norm": 0.9140625, "learning_rate": 8.202500000000002e-06, "loss": 0.1895, "step": 4720 }, { "epoch": 0.10383480825958702, "grad_norm": 0.6484375, "learning_rate": 8.177500000000001e-06, "loss": 0.1753, "step": 4730 }, { "epoch": 0.10405433216711257, "grad_norm": 0.75, "learning_rate": 8.1525e-06, "loss": 0.1705, "step": 4740 }, { "epoch": 0.10427385607463813, "grad_norm": 0.796875, "learning_rate": 8.1275e-06, "loss": 0.217, "step": 4750 }, { "epoch": 0.10449337998216368, "grad_norm": 0.671875, "learning_rate": 8.1025e-06, "loss": 0.2155, "step": 4760 }, { "epoch": 0.10471290388968924, "grad_norm": 0.8671875, "learning_rate": 8.0775e-06, "loss": 0.1972, "step": 4770 }, { "epoch": 0.10493242779721479, "grad_norm": 0.859375, "learning_rate": 8.052500000000001e-06, "loss": 0.1927, "step": 4780 }, { "epoch": 0.10515195170474034, "grad_norm": 1.078125, "learning_rate": 8.0275e-06, "loss": 0.2067, "step": 4790 }, { "epoch": 0.1053714756122659, "grad_norm": 0.70703125, "learning_rate": 8.0025e-06, "loss": 0.1802, "step": 4800 }, { "epoch": 0.10559099951979145, "grad_norm": 0.76171875, "learning_rate": 7.9775e-06, "loss": 0.1808, "step": 4810 }, { "epoch": 0.105810523427317, "grad_norm": 0.75390625, "learning_rate": 7.9525e-06, "loss": 0.1641, "step": 4820 }, { "epoch": 0.10603004733484256, "grad_norm": 0.84375, "learning_rate": 7.9275e-06, "loss": 0.175, "step": 4830 }, { "epoch": 0.10624957124236811, "grad_norm": 1.1640625, "learning_rate": 7.902500000000001e-06, "loss": 0.1984, "step": 4840 }, { "epoch": 0.10646909514989367, "grad_norm": 1.0, "learning_rate": 7.8775e-06, "loss": 0.182, "step": 4850 }, { "epoch": 0.10668861905741922, "grad_norm": 0.66796875, "learning_rate": 7.852500000000001e-06, "loss": 0.1785, "step": 4860 }, { "epoch": 0.10690814296494477, "grad_norm": 0.71484375, "learning_rate": 7.827500000000001e-06, "loss": 0.1917, "step": 4870 }, { "epoch": 0.10712766687247033, "grad_norm": 0.671875, "learning_rate": 7.8025e-06, "loss": 0.1747, "step": 4880 }, { "epoch": 0.10734719077999588, "grad_norm": 0.921875, "learning_rate": 7.777500000000001e-06, "loss": 0.1806, "step": 4890 }, { "epoch": 0.10756671468752144, "grad_norm": 0.73046875, "learning_rate": 7.7525e-06, "loss": 0.1672, "step": 4900 }, { "epoch": 0.10778623859504699, "grad_norm": 0.78515625, "learning_rate": 7.727500000000002e-06, "loss": 0.2223, "step": 4910 }, { "epoch": 0.10800576250257254, "grad_norm": 0.671875, "learning_rate": 7.702500000000001e-06, "loss": 0.1946, "step": 4920 }, { "epoch": 0.1082252864100981, "grad_norm": 0.69921875, "learning_rate": 7.6775e-06, "loss": 0.1682, "step": 4930 }, { "epoch": 0.10844481031762365, "grad_norm": 0.63671875, "learning_rate": 7.6525e-06, "loss": 0.1494, "step": 4940 }, { "epoch": 0.1086643342251492, "grad_norm": 0.9296875, "learning_rate": 7.627500000000001e-06, "loss": 0.1856, "step": 4950 }, { "epoch": 0.10888385813267476, "grad_norm": 0.87890625, "learning_rate": 7.6025000000000005e-06, "loss": 0.1922, "step": 4960 }, { "epoch": 0.10910338204020031, "grad_norm": 0.7421875, "learning_rate": 7.577500000000001e-06, "loss": 0.1919, "step": 4970 }, { "epoch": 0.10932290594772587, "grad_norm": 0.76953125, "learning_rate": 7.5525e-06, "loss": 0.1758, "step": 4980 }, { "epoch": 0.10954242985525142, "grad_norm": 0.73046875, "learning_rate": 7.527500000000001e-06, "loss": 0.2053, "step": 4990 }, { "epoch": 0.10976195376277698, "grad_norm": 0.81640625, "learning_rate": 7.502500000000001e-06, "loss": 0.1742, "step": 5000 }, { "epoch": 0.10998147767030253, "grad_norm": 0.9140625, "learning_rate": 7.477500000000001e-06, "loss": 0.2089, "step": 5010 }, { "epoch": 0.11020100157782808, "grad_norm": 0.84765625, "learning_rate": 7.4525e-06, "loss": 0.1788, "step": 5020 }, { "epoch": 0.11042052548535364, "grad_norm": 0.9375, "learning_rate": 7.4275000000000005e-06, "loss": 0.1981, "step": 5030 }, { "epoch": 0.11064004939287919, "grad_norm": 1.0078125, "learning_rate": 7.4025e-06, "loss": 0.1708, "step": 5040 }, { "epoch": 0.11085957330040475, "grad_norm": 0.7421875, "learning_rate": 7.377500000000001e-06, "loss": 0.1621, "step": 5050 }, { "epoch": 0.1110790972079303, "grad_norm": 0.80078125, "learning_rate": 7.3525e-06, "loss": 0.1805, "step": 5060 }, { "epoch": 0.11129862111545585, "grad_norm": 0.8046875, "learning_rate": 7.3275000000000006e-06, "loss": 0.201, "step": 5070 }, { "epoch": 0.11151814502298141, "grad_norm": 0.72265625, "learning_rate": 7.3025e-06, "loss": 0.2034, "step": 5080 }, { "epoch": 0.11173766893050696, "grad_norm": 0.8671875, "learning_rate": 7.277500000000001e-06, "loss": 0.1858, "step": 5090 }, { "epoch": 0.11195719283803252, "grad_norm": 0.75390625, "learning_rate": 7.2525000000000004e-06, "loss": 0.2008, "step": 5100 }, { "epoch": 0.11217671674555807, "grad_norm": 0.81640625, "learning_rate": 7.227500000000001e-06, "loss": 0.1954, "step": 5110 }, { "epoch": 0.11239624065308362, "grad_norm": 0.859375, "learning_rate": 7.2025e-06, "loss": 0.1638, "step": 5120 }, { "epoch": 0.11261576456060918, "grad_norm": 0.80078125, "learning_rate": 7.1775e-06, "loss": 0.1845, "step": 5130 }, { "epoch": 0.11283528846813473, "grad_norm": 0.80859375, "learning_rate": 7.152500000000001e-06, "loss": 0.1953, "step": 5140 }, { "epoch": 0.11305481237566029, "grad_norm": 0.640625, "learning_rate": 7.127500000000001e-06, "loss": 0.1937, "step": 5150 }, { "epoch": 0.11327433628318584, "grad_norm": 0.73828125, "learning_rate": 7.102500000000001e-06, "loss": 0.1736, "step": 5160 }, { "epoch": 0.1134938601907114, "grad_norm": 0.70703125, "learning_rate": 7.0775000000000004e-06, "loss": 0.2043, "step": 5170 }, { "epoch": 0.11371338409823695, "grad_norm": 0.8984375, "learning_rate": 7.052500000000001e-06, "loss": 0.1758, "step": 5180 }, { "epoch": 0.1139329080057625, "grad_norm": 0.93359375, "learning_rate": 7.0275e-06, "loss": 0.1895, "step": 5190 }, { "epoch": 0.11415243191328805, "grad_norm": 0.66796875, "learning_rate": 7.002500000000001e-06, "loss": 0.2119, "step": 5200 }, { "epoch": 0.11437195582081361, "grad_norm": 0.7734375, "learning_rate": 6.9775000000000005e-06, "loss": 0.1839, "step": 5210 }, { "epoch": 0.11459147972833916, "grad_norm": 0.91015625, "learning_rate": 6.952500000000001e-06, "loss": 0.186, "step": 5220 }, { "epoch": 0.11481100363586472, "grad_norm": 0.78125, "learning_rate": 6.9275e-06, "loss": 0.1887, "step": 5230 }, { "epoch": 0.11503052754339027, "grad_norm": 0.60546875, "learning_rate": 6.902500000000001e-06, "loss": 0.1916, "step": 5240 }, { "epoch": 0.11525005145091582, "grad_norm": 0.64453125, "learning_rate": 6.877500000000001e-06, "loss": 0.1755, "step": 5250 }, { "epoch": 0.11546957535844138, "grad_norm": 0.75390625, "learning_rate": 6.852500000000001e-06, "loss": 0.2005, "step": 5260 }, { "epoch": 0.11568909926596693, "grad_norm": 1.1796875, "learning_rate": 6.8275e-06, "loss": 0.229, "step": 5270 }, { "epoch": 0.11590862317349249, "grad_norm": 0.78125, "learning_rate": 6.8025000000000005e-06, "loss": 0.1747, "step": 5280 }, { "epoch": 0.11612814708101804, "grad_norm": 0.77734375, "learning_rate": 6.7775e-06, "loss": 0.1735, "step": 5290 }, { "epoch": 0.1163476709885436, "grad_norm": 0.90625, "learning_rate": 6.752500000000001e-06, "loss": 0.1971, "step": 5300 }, { "epoch": 0.11656719489606915, "grad_norm": 0.89453125, "learning_rate": 6.7275e-06, "loss": 0.1885, "step": 5310 }, { "epoch": 0.1167867188035947, "grad_norm": 0.73046875, "learning_rate": 6.702500000000001e-06, "loss": 0.1801, "step": 5320 }, { "epoch": 0.11700624271112026, "grad_norm": 0.7734375, "learning_rate": 6.6775e-06, "loss": 0.2179, "step": 5330 }, { "epoch": 0.11722576661864581, "grad_norm": 0.578125, "learning_rate": 6.6525e-06, "loss": 0.1607, "step": 5340 }, { "epoch": 0.11744529052617136, "grad_norm": 0.61328125, "learning_rate": 6.6275e-06, "loss": 0.1931, "step": 5350 }, { "epoch": 0.11766481443369692, "grad_norm": 0.85546875, "learning_rate": 6.602500000000001e-06, "loss": 0.2009, "step": 5360 }, { "epoch": 0.11788433834122247, "grad_norm": 0.5390625, "learning_rate": 6.5775e-06, "loss": 0.1551, "step": 5370 }, { "epoch": 0.11810386224874803, "grad_norm": 0.87109375, "learning_rate": 6.5525e-06, "loss": 0.1775, "step": 5380 }, { "epoch": 0.11832338615627358, "grad_norm": 0.6171875, "learning_rate": 6.5275000000000015e-06, "loss": 0.1673, "step": 5390 }, { "epoch": 0.11854291006379913, "grad_norm": 0.8203125, "learning_rate": 6.502500000000001e-06, "loss": 0.1785, "step": 5400 }, { "epoch": 0.11876243397132469, "grad_norm": 0.8515625, "learning_rate": 6.477500000000001e-06, "loss": 0.1902, "step": 5410 }, { "epoch": 0.11898195787885024, "grad_norm": 0.640625, "learning_rate": 6.4525000000000005e-06, "loss": 0.1794, "step": 5420 }, { "epoch": 0.1192014817863758, "grad_norm": 0.7265625, "learning_rate": 6.427500000000001e-06, "loss": 0.1954, "step": 5430 }, { "epoch": 0.11942100569390135, "grad_norm": 0.87890625, "learning_rate": 6.4025e-06, "loss": 0.21, "step": 5440 }, { "epoch": 0.1196405296014269, "grad_norm": 0.76171875, "learning_rate": 6.377500000000001e-06, "loss": 0.1887, "step": 5450 }, { "epoch": 0.11986005350895246, "grad_norm": 0.7109375, "learning_rate": 6.352500000000001e-06, "loss": 0.1928, "step": 5460 }, { "epoch": 0.12007957741647801, "grad_norm": 0.63671875, "learning_rate": 6.327500000000001e-06, "loss": 0.2024, "step": 5470 }, { "epoch": 0.12029910132400357, "grad_norm": 0.890625, "learning_rate": 6.3025e-06, "loss": 0.1855, "step": 5480 }, { "epoch": 0.12051862523152912, "grad_norm": 0.92578125, "learning_rate": 6.2775000000000005e-06, "loss": 0.1693, "step": 5490 }, { "epoch": 0.12073814913905467, "grad_norm": 0.94140625, "learning_rate": 6.2525e-06, "loss": 0.1777, "step": 5500 }, { "epoch": 0.12095767304658023, "grad_norm": 0.76953125, "learning_rate": 6.227500000000001e-06, "loss": 0.221, "step": 5510 }, { "epoch": 0.12117719695410578, "grad_norm": 0.6953125, "learning_rate": 6.2025e-06, "loss": 0.1692, "step": 5520 }, { "epoch": 0.12139672086163134, "grad_norm": 0.77734375, "learning_rate": 6.1775000000000006e-06, "loss": 0.192, "step": 5530 }, { "epoch": 0.12161624476915689, "grad_norm": 0.6484375, "learning_rate": 6.1525e-06, "loss": 0.1882, "step": 5540 }, { "epoch": 0.12183576867668244, "grad_norm": 0.98046875, "learning_rate": 6.127500000000001e-06, "loss": 0.2051, "step": 5550 }, { "epoch": 0.122055292584208, "grad_norm": 0.9609375, "learning_rate": 6.1025000000000004e-06, "loss": 0.2132, "step": 5560 }, { "epoch": 0.12227481649173355, "grad_norm": 0.76953125, "learning_rate": 6.077500000000001e-06, "loss": 0.1776, "step": 5570 }, { "epoch": 0.1224943403992591, "grad_norm": 0.83984375, "learning_rate": 6.0525e-06, "loss": 0.2029, "step": 5580 }, { "epoch": 0.12271386430678466, "grad_norm": 0.796875, "learning_rate": 6.0275e-06, "loss": 0.209, "step": 5590 }, { "epoch": 0.12293338821431021, "grad_norm": 0.5859375, "learning_rate": 6.0025e-06, "loss": 0.1967, "step": 5600 }, { "epoch": 0.12315291212183577, "grad_norm": 1.0, "learning_rate": 5.977500000000001e-06, "loss": 0.2383, "step": 5610 }, { "epoch": 0.12337243602936132, "grad_norm": 0.6875, "learning_rate": 5.9525e-06, "loss": 0.163, "step": 5620 }, { "epoch": 0.12359195993688687, "grad_norm": 0.578125, "learning_rate": 5.9275e-06, "loss": 0.196, "step": 5630 }, { "epoch": 0.12381148384441243, "grad_norm": 0.76171875, "learning_rate": 5.902500000000001e-06, "loss": 0.2017, "step": 5640 }, { "epoch": 0.12403100775193798, "grad_norm": 0.82421875, "learning_rate": 5.8775e-06, "loss": 0.1859, "step": 5650 }, { "epoch": 0.12425053165946354, "grad_norm": 0.6875, "learning_rate": 5.852500000000001e-06, "loss": 0.2002, "step": 5660 }, { "epoch": 0.12447005556698909, "grad_norm": 0.71875, "learning_rate": 5.8275000000000005e-06, "loss": 0.1784, "step": 5670 }, { "epoch": 0.12468957947451464, "grad_norm": 0.765625, "learning_rate": 5.802500000000001e-06, "loss": 0.1809, "step": 5680 }, { "epoch": 0.1249091033820402, "grad_norm": 0.51953125, "learning_rate": 5.7775e-06, "loss": 0.1663, "step": 5690 }, { "epoch": 0.12512862728956575, "grad_norm": 0.76953125, "learning_rate": 5.752500000000001e-06, "loss": 0.173, "step": 5700 }, { "epoch": 0.1253481511970913, "grad_norm": 0.7109375, "learning_rate": 5.727500000000001e-06, "loss": 0.1743, "step": 5710 }, { "epoch": 0.12556767510461686, "grad_norm": 0.90234375, "learning_rate": 5.702500000000001e-06, "loss": 0.2364, "step": 5720 }, { "epoch": 0.12578719901214241, "grad_norm": 0.69140625, "learning_rate": 5.6775e-06, "loss": 0.212, "step": 5730 }, { "epoch": 0.12600672291966797, "grad_norm": 0.76171875, "learning_rate": 5.6525000000000005e-06, "loss": 0.1757, "step": 5740 }, { "epoch": 0.12622624682719352, "grad_norm": 0.84765625, "learning_rate": 5.6275e-06, "loss": 0.1868, "step": 5750 }, { "epoch": 0.12644577073471908, "grad_norm": 0.84375, "learning_rate": 5.602500000000001e-06, "loss": 0.1901, "step": 5760 }, { "epoch": 0.12666529464224463, "grad_norm": 0.66796875, "learning_rate": 5.5775e-06, "loss": 0.2302, "step": 5770 }, { "epoch": 0.12688481854977018, "grad_norm": 0.61328125, "learning_rate": 5.552500000000001e-06, "loss": 0.1639, "step": 5780 }, { "epoch": 0.12710434245729574, "grad_norm": 1.4453125, "learning_rate": 5.5275e-06, "loss": 0.1778, "step": 5790 }, { "epoch": 0.1273238663648213, "grad_norm": 0.85546875, "learning_rate": 5.5025e-06, "loss": 0.1727, "step": 5800 }, { "epoch": 0.12754339027234685, "grad_norm": 0.5859375, "learning_rate": 5.4775e-06, "loss": 0.1731, "step": 5810 }, { "epoch": 0.1277629141798724, "grad_norm": 0.6171875, "learning_rate": 5.452500000000001e-06, "loss": 0.1762, "step": 5820 }, { "epoch": 0.12798243808739795, "grad_norm": 0.98828125, "learning_rate": 5.4275e-06, "loss": 0.1874, "step": 5830 }, { "epoch": 0.1282019619949235, "grad_norm": 0.6328125, "learning_rate": 5.4025e-06, "loss": 0.199, "step": 5840 }, { "epoch": 0.12842148590244906, "grad_norm": 0.8984375, "learning_rate": 5.3775e-06, "loss": 0.1935, "step": 5850 }, { "epoch": 0.12864100980997462, "grad_norm": 0.875, "learning_rate": 5.352500000000001e-06, "loss": 0.1862, "step": 5860 }, { "epoch": 0.12886053371750017, "grad_norm": 0.73046875, "learning_rate": 5.3275e-06, "loss": 0.1693, "step": 5870 }, { "epoch": 0.12908005762502572, "grad_norm": 0.87109375, "learning_rate": 5.3025000000000005e-06, "loss": 0.1972, "step": 5880 }, { "epoch": 0.12929958153255128, "grad_norm": 0.65234375, "learning_rate": 5.277500000000001e-06, "loss": 0.1742, "step": 5890 }, { "epoch": 0.12951910544007683, "grad_norm": 0.62109375, "learning_rate": 5.2525e-06, "loss": 0.1765, "step": 5900 }, { "epoch": 0.12973862934760239, "grad_norm": 0.97265625, "learning_rate": 5.227500000000001e-06, "loss": 0.1713, "step": 5910 }, { "epoch": 0.12995815325512794, "grad_norm": 0.7109375, "learning_rate": 5.202500000000001e-06, "loss": 0.1647, "step": 5920 }, { "epoch": 0.1301776771626535, "grad_norm": 0.984375, "learning_rate": 5.177500000000001e-06, "loss": 0.1795, "step": 5930 }, { "epoch": 0.13039720107017905, "grad_norm": 0.71875, "learning_rate": 5.1525e-06, "loss": 0.2107, "step": 5940 }, { "epoch": 0.1306167249777046, "grad_norm": 0.96875, "learning_rate": 5.1275000000000005e-06, "loss": 0.1919, "step": 5950 }, { "epoch": 0.13083624888523016, "grad_norm": 0.7109375, "learning_rate": 5.1025e-06, "loss": 0.1755, "step": 5960 }, { "epoch": 0.1310557727927557, "grad_norm": 0.5625, "learning_rate": 5.077500000000001e-06, "loss": 0.1673, "step": 5970 }, { "epoch": 0.13127529670028126, "grad_norm": 0.88671875, "learning_rate": 5.0525e-06, "loss": 0.2152, "step": 5980 }, { "epoch": 0.13149482060780682, "grad_norm": 1.1015625, "learning_rate": 5.0275000000000006e-06, "loss": 0.2161, "step": 5990 }, { "epoch": 0.13171434451533237, "grad_norm": 0.83984375, "learning_rate": 5.0025e-06, "loss": 0.1849, "step": 6000 }, { "epoch": 0.13193386842285793, "grad_norm": 0.8125, "learning_rate": 4.977500000000001e-06, "loss": 0.1786, "step": 6010 }, { "epoch": 0.13215339233038348, "grad_norm": 0.8515625, "learning_rate": 4.9525000000000004e-06, "loss": 0.1818, "step": 6020 }, { "epoch": 0.13237291623790903, "grad_norm": 0.6796875, "learning_rate": 4.927500000000001e-06, "loss": 0.182, "step": 6030 }, { "epoch": 0.1325924401454346, "grad_norm": 0.8359375, "learning_rate": 4.902500000000001e-06, "loss": 0.1778, "step": 6040 }, { "epoch": 0.13281196405296014, "grad_norm": 0.7109375, "learning_rate": 4.8775e-06, "loss": 0.165, "step": 6050 }, { "epoch": 0.1330314879604857, "grad_norm": 0.875, "learning_rate": 4.8525000000000006e-06, "loss": 0.2036, "step": 6060 }, { "epoch": 0.13325101186801125, "grad_norm": 0.74609375, "learning_rate": 4.827500000000001e-06, "loss": 0.1749, "step": 6070 }, { "epoch": 0.1334705357755368, "grad_norm": 0.83984375, "learning_rate": 4.8025e-06, "loss": 0.1979, "step": 6080 }, { "epoch": 0.13369005968306236, "grad_norm": 0.75390625, "learning_rate": 4.7775e-06, "loss": 0.1883, "step": 6090 }, { "epoch": 0.1339095835905879, "grad_norm": 0.73828125, "learning_rate": 4.752500000000001e-06, "loss": 0.1742, "step": 6100 }, { "epoch": 0.13412910749811346, "grad_norm": 0.84765625, "learning_rate": 4.7275e-06, "loss": 0.1704, "step": 6110 }, { "epoch": 0.13434863140563902, "grad_norm": 0.984375, "learning_rate": 4.7025e-06, "loss": 0.1963, "step": 6120 }, { "epoch": 0.13456815531316457, "grad_norm": 0.90625, "learning_rate": 4.6775000000000005e-06, "loss": 0.1935, "step": 6130 }, { "epoch": 0.13478767922069013, "grad_norm": 0.62109375, "learning_rate": 4.652500000000001e-06, "loss": 0.1756, "step": 6140 }, { "epoch": 0.13500720312821568, "grad_norm": 0.78515625, "learning_rate": 4.6275e-06, "loss": 0.1922, "step": 6150 }, { "epoch": 0.13522672703574123, "grad_norm": 0.9765625, "learning_rate": 4.6025e-06, "loss": 0.2005, "step": 6160 }, { "epoch": 0.1354462509432668, "grad_norm": 0.5625, "learning_rate": 4.577500000000001e-06, "loss": 0.184, "step": 6170 }, { "epoch": 0.13566577485079234, "grad_norm": 0.83984375, "learning_rate": 4.5525e-06, "loss": 0.1865, "step": 6180 }, { "epoch": 0.1358852987583179, "grad_norm": 0.703125, "learning_rate": 4.5275e-06, "loss": 0.201, "step": 6190 }, { "epoch": 0.13610482266584345, "grad_norm": 0.57421875, "learning_rate": 4.5025000000000005e-06, "loss": 0.2129, "step": 6200 }, { "epoch": 0.136324346573369, "grad_norm": 0.81640625, "learning_rate": 4.4775e-06, "loss": 0.2103, "step": 6210 }, { "epoch": 0.13654387048089456, "grad_norm": 0.8671875, "learning_rate": 4.4525e-06, "loss": 0.1933, "step": 6220 }, { "epoch": 0.1367633943884201, "grad_norm": 1.0234375, "learning_rate": 4.4275e-06, "loss": 0.1837, "step": 6230 }, { "epoch": 0.13698291829594567, "grad_norm": 0.72265625, "learning_rate": 4.4025e-06, "loss": 0.1696, "step": 6240 }, { "epoch": 0.13720244220347122, "grad_norm": 0.9140625, "learning_rate": 4.3775e-06, "loss": 0.1885, "step": 6250 }, { "epoch": 0.13742196611099677, "grad_norm": 0.7421875, "learning_rate": 4.3525e-06, "loss": 0.175, "step": 6260 }, { "epoch": 0.13764149001852233, "grad_norm": 0.67578125, "learning_rate": 4.3275000000000005e-06, "loss": 0.1905, "step": 6270 }, { "epoch": 0.13786101392604788, "grad_norm": 0.7109375, "learning_rate": 4.302500000000001e-06, "loss": 0.1854, "step": 6280 }, { "epoch": 0.13808053783357344, "grad_norm": 0.7734375, "learning_rate": 4.2775e-06, "loss": 0.2116, "step": 6290 }, { "epoch": 0.138300061741099, "grad_norm": 0.8671875, "learning_rate": 4.2525e-06, "loss": 0.184, "step": 6300 }, { "epoch": 0.13851958564862454, "grad_norm": 0.67578125, "learning_rate": 4.227500000000001e-06, "loss": 0.1831, "step": 6310 }, { "epoch": 0.1387391095561501, "grad_norm": 0.703125, "learning_rate": 4.202500000000001e-06, "loss": 0.1765, "step": 6320 }, { "epoch": 0.13895863346367565, "grad_norm": 0.67578125, "learning_rate": 4.1775e-06, "loss": 0.1742, "step": 6330 }, { "epoch": 0.1391781573712012, "grad_norm": 0.87890625, "learning_rate": 4.1525000000000005e-06, "loss": 0.2031, "step": 6340 }, { "epoch": 0.13939768127872676, "grad_norm": 0.6953125, "learning_rate": 4.127500000000001e-06, "loss": 0.1976, "step": 6350 }, { "epoch": 0.1396172051862523, "grad_norm": 0.625, "learning_rate": 4.1025e-06, "loss": 0.1701, "step": 6360 }, { "epoch": 0.13983672909377787, "grad_norm": 0.77734375, "learning_rate": 4.0775e-06, "loss": 0.1827, "step": 6370 }, { "epoch": 0.14005625300130342, "grad_norm": 0.8828125, "learning_rate": 4.052500000000001e-06, "loss": 0.1767, "step": 6380 }, { "epoch": 0.14027577690882898, "grad_norm": 0.56640625, "learning_rate": 4.0275e-06, "loss": 0.1869, "step": 6390 }, { "epoch": 0.14049530081635453, "grad_norm": 0.78125, "learning_rate": 4.0025e-06, "loss": 0.1954, "step": 6400 }, { "epoch": 0.14071482472388008, "grad_norm": 0.76953125, "learning_rate": 3.9775000000000005e-06, "loss": 0.1762, "step": 6410 }, { "epoch": 0.14093434863140564, "grad_norm": 0.953125, "learning_rate": 3.9525e-06, "loss": 0.1865, "step": 6420 }, { "epoch": 0.1411538725389312, "grad_norm": 0.59375, "learning_rate": 3.9275e-06, "loss": 0.1816, "step": 6430 }, { "epoch": 0.14137339644645674, "grad_norm": 0.765625, "learning_rate": 3.9025e-06, "loss": 0.1757, "step": 6440 }, { "epoch": 0.1415929203539823, "grad_norm": 0.5234375, "learning_rate": 3.8775000000000006e-06, "loss": 0.1836, "step": 6450 }, { "epoch": 0.14181244426150785, "grad_norm": 0.81640625, "learning_rate": 3.8525e-06, "loss": 0.1953, "step": 6460 }, { "epoch": 0.1420319681690334, "grad_norm": 0.67578125, "learning_rate": 3.8275e-06, "loss": 0.1605, "step": 6470 }, { "epoch": 0.14225149207655896, "grad_norm": 0.86328125, "learning_rate": 3.8025e-06, "loss": 0.1865, "step": 6480 }, { "epoch": 0.14247101598408451, "grad_norm": 0.65234375, "learning_rate": 3.7775000000000003e-06, "loss": 0.1746, "step": 6490 }, { "epoch": 0.14269053989161007, "grad_norm": 0.59375, "learning_rate": 3.7525e-06, "loss": 0.1572, "step": 6500 }, { "epoch": 0.14291006379913562, "grad_norm": 0.65234375, "learning_rate": 3.7275000000000007e-06, "loss": 0.1942, "step": 6510 }, { "epoch": 0.14312958770666118, "grad_norm": 0.8125, "learning_rate": 3.7025000000000005e-06, "loss": 0.1841, "step": 6520 }, { "epoch": 0.14334911161418673, "grad_norm": 0.64453125, "learning_rate": 3.6775000000000004e-06, "loss": 0.1964, "step": 6530 }, { "epoch": 0.14356863552171228, "grad_norm": 0.66796875, "learning_rate": 3.6525000000000006e-06, "loss": 0.198, "step": 6540 }, { "epoch": 0.14378815942923784, "grad_norm": 0.72265625, "learning_rate": 3.6275000000000004e-06, "loss": 0.1773, "step": 6550 }, { "epoch": 0.1440076833367634, "grad_norm": 0.78125, "learning_rate": 3.6025000000000002e-06, "loss": 0.1699, "step": 6560 }, { "epoch": 0.14422720724428895, "grad_norm": 0.73046875, "learning_rate": 3.5775000000000005e-06, "loss": 0.2117, "step": 6570 }, { "epoch": 0.1444467311518145, "grad_norm": 0.859375, "learning_rate": 3.5525000000000003e-06, "loss": 0.1783, "step": 6580 }, { "epoch": 0.14466625505934005, "grad_norm": 0.546875, "learning_rate": 3.5275000000000005e-06, "loss": 0.1608, "step": 6590 }, { "epoch": 0.1448857789668656, "grad_norm": 1.109375, "learning_rate": 3.5025000000000003e-06, "loss": 0.1933, "step": 6600 }, { "epoch": 0.14510530287439116, "grad_norm": 0.69921875, "learning_rate": 3.4775e-06, "loss": 0.2031, "step": 6610 }, { "epoch": 0.14532482678191672, "grad_norm": 1.09375, "learning_rate": 3.4525000000000004e-06, "loss": 0.188, "step": 6620 }, { "epoch": 0.14554435068944227, "grad_norm": 0.94921875, "learning_rate": 3.4275000000000002e-06, "loss": 0.1767, "step": 6630 }, { "epoch": 0.14576387459696782, "grad_norm": 0.59765625, "learning_rate": 3.4025000000000005e-06, "loss": 0.1888, "step": 6640 }, { "epoch": 0.14598339850449338, "grad_norm": 1.0546875, "learning_rate": 3.3775000000000003e-06, "loss": 0.1918, "step": 6650 }, { "epoch": 0.14620292241201893, "grad_norm": 0.6328125, "learning_rate": 3.3525e-06, "loss": 0.167, "step": 6660 }, { "epoch": 0.14642244631954449, "grad_norm": 0.6328125, "learning_rate": 3.3275000000000003e-06, "loss": 0.1635, "step": 6670 }, { "epoch": 0.14664197022707004, "grad_norm": 0.9140625, "learning_rate": 3.3025e-06, "loss": 0.2107, "step": 6680 }, { "epoch": 0.1468614941345956, "grad_norm": 0.84375, "learning_rate": 3.2775e-06, "loss": 0.1872, "step": 6690 }, { "epoch": 0.14708101804212115, "grad_norm": 0.78515625, "learning_rate": 3.2525e-06, "loss": 0.1627, "step": 6700 }, { "epoch": 0.1473005419496467, "grad_norm": 0.79296875, "learning_rate": 3.2275e-06, "loss": 0.1499, "step": 6710 }, { "epoch": 0.14752006585717226, "grad_norm": 0.69921875, "learning_rate": 3.2025000000000003e-06, "loss": 0.1921, "step": 6720 }, { "epoch": 0.1477395897646978, "grad_norm": 0.78125, "learning_rate": 3.1775e-06, "loss": 0.1811, "step": 6730 }, { "epoch": 0.14795911367222336, "grad_norm": 0.8515625, "learning_rate": 3.1525e-06, "loss": 0.176, "step": 6740 }, { "epoch": 0.14817863757974892, "grad_norm": 0.8828125, "learning_rate": 3.1275e-06, "loss": 0.2066, "step": 6750 }, { "epoch": 0.14839816148727447, "grad_norm": 0.5859375, "learning_rate": 3.1025000000000004e-06, "loss": 0.1424, "step": 6760 }, { "epoch": 0.14861768539480003, "grad_norm": 0.65234375, "learning_rate": 3.0775000000000006e-06, "loss": 0.1997, "step": 6770 }, { "epoch": 0.14883720930232558, "grad_norm": 0.7890625, "learning_rate": 3.0525000000000004e-06, "loss": 0.1976, "step": 6780 }, { "epoch": 0.14905673320985113, "grad_norm": 0.640625, "learning_rate": 3.0275000000000002e-06, "loss": 0.1596, "step": 6790 }, { "epoch": 0.1492762571173767, "grad_norm": 0.63671875, "learning_rate": 3.0025000000000005e-06, "loss": 0.1694, "step": 6800 }, { "epoch": 0.14949578102490224, "grad_norm": 0.8828125, "learning_rate": 2.9775000000000003e-06, "loss": 0.1774, "step": 6810 }, { "epoch": 0.1497153049324278, "grad_norm": 0.90234375, "learning_rate": 2.9525000000000005e-06, "loss": 0.1849, "step": 6820 }, { "epoch": 0.14993482883995335, "grad_norm": 0.69140625, "learning_rate": 2.9275000000000003e-06, "loss": 0.2215, "step": 6830 }, { "epoch": 0.1501543527474789, "grad_norm": 0.7578125, "learning_rate": 2.9025e-06, "loss": 0.1916, "step": 6840 }, { "epoch": 0.15037387665500446, "grad_norm": 1.109375, "learning_rate": 2.8775000000000004e-06, "loss": 0.185, "step": 6850 }, { "epoch": 0.15059340056253, "grad_norm": 0.81640625, "learning_rate": 2.8525000000000002e-06, "loss": 0.1826, "step": 6860 }, { "epoch": 0.15081292447005556, "grad_norm": 0.78515625, "learning_rate": 2.8275e-06, "loss": 0.1935, "step": 6870 }, { "epoch": 0.15103244837758112, "grad_norm": 0.765625, "learning_rate": 2.8025000000000003e-06, "loss": 0.1683, "step": 6880 }, { "epoch": 0.15125197228510667, "grad_norm": 1.078125, "learning_rate": 2.7775e-06, "loss": 0.2083, "step": 6890 }, { "epoch": 0.15147149619263223, "grad_norm": 0.87109375, "learning_rate": 2.7525000000000003e-06, "loss": 0.1656, "step": 6900 }, { "epoch": 0.15169102010015778, "grad_norm": 0.62890625, "learning_rate": 2.7275e-06, "loss": 0.1748, "step": 6910 }, { "epoch": 0.15191054400768333, "grad_norm": 0.7578125, "learning_rate": 2.7025e-06, "loss": 0.2087, "step": 6920 }, { "epoch": 0.1521300679152089, "grad_norm": 0.7421875, "learning_rate": 2.6775e-06, "loss": 0.1721, "step": 6930 }, { "epoch": 0.15234959182273444, "grad_norm": 0.86328125, "learning_rate": 2.6525e-06, "loss": 0.2098, "step": 6940 }, { "epoch": 0.15256911573026, "grad_norm": 0.921875, "learning_rate": 2.6275000000000003e-06, "loss": 0.1765, "step": 6950 }, { "epoch": 0.15278863963778555, "grad_norm": 0.640625, "learning_rate": 2.6025e-06, "loss": 0.1839, "step": 6960 }, { "epoch": 0.1530081635453111, "grad_norm": 0.6953125, "learning_rate": 2.5775e-06, "loss": 0.1648, "step": 6970 }, { "epoch": 0.15322768745283666, "grad_norm": 0.50390625, "learning_rate": 2.5525e-06, "loss": 0.1808, "step": 6980 }, { "epoch": 0.1534472113603622, "grad_norm": 0.68359375, "learning_rate": 2.5275e-06, "loss": 0.1903, "step": 6990 }, { "epoch": 0.15366673526788777, "grad_norm": 0.61328125, "learning_rate": 2.5024999999999998e-06, "loss": 0.1867, "step": 7000 }, { "epoch": 0.15388625917541332, "grad_norm": 0.70703125, "learning_rate": 2.4775e-06, "loss": 0.1942, "step": 7010 }, { "epoch": 0.15410578308293887, "grad_norm": 0.73828125, "learning_rate": 2.4525000000000002e-06, "loss": 0.1753, "step": 7020 }, { "epoch": 0.15432530699046443, "grad_norm": 0.6796875, "learning_rate": 2.4275e-06, "loss": 0.1916, "step": 7030 }, { "epoch": 0.15454483089798998, "grad_norm": 0.765625, "learning_rate": 2.4025000000000003e-06, "loss": 0.1735, "step": 7040 }, { "epoch": 0.15476435480551554, "grad_norm": 0.78515625, "learning_rate": 2.3775e-06, "loss": 0.1675, "step": 7050 }, { "epoch": 0.1549838787130411, "grad_norm": 0.7265625, "learning_rate": 2.3525e-06, "loss": 0.176, "step": 7060 }, { "epoch": 0.15520340262056664, "grad_norm": 0.81640625, "learning_rate": 2.3275e-06, "loss": 0.1785, "step": 7070 }, { "epoch": 0.1554229265280922, "grad_norm": 0.82421875, "learning_rate": 2.3025000000000004e-06, "loss": 0.1981, "step": 7080 }, { "epoch": 0.15564245043561775, "grad_norm": 0.7109375, "learning_rate": 2.2775000000000002e-06, "loss": 0.2026, "step": 7090 }, { "epoch": 0.1558619743431433, "grad_norm": 0.7109375, "learning_rate": 2.2525e-06, "loss": 0.1676, "step": 7100 }, { "epoch": 0.15608149825066886, "grad_norm": 1.0546875, "learning_rate": 2.2275000000000003e-06, "loss": 0.1657, "step": 7110 }, { "epoch": 0.1563010221581944, "grad_norm": 0.6953125, "learning_rate": 2.2025e-06, "loss": 0.1702, "step": 7120 }, { "epoch": 0.15652054606571997, "grad_norm": 0.7578125, "learning_rate": 2.1775000000000003e-06, "loss": 0.1788, "step": 7130 }, { "epoch": 0.15674006997324552, "grad_norm": 0.671875, "learning_rate": 2.1525e-06, "loss": 0.1713, "step": 7140 }, { "epoch": 0.15695959388077108, "grad_norm": 0.5078125, "learning_rate": 2.1275e-06, "loss": 0.1754, "step": 7150 }, { "epoch": 0.15717911778829663, "grad_norm": 0.8046875, "learning_rate": 2.1025e-06, "loss": 0.1924, "step": 7160 }, { "epoch": 0.15739864169582218, "grad_norm": 0.6328125, "learning_rate": 2.0775e-06, "loss": 0.1997, "step": 7170 }, { "epoch": 0.15761816560334774, "grad_norm": 0.74609375, "learning_rate": 2.0525000000000003e-06, "loss": 0.1917, "step": 7180 }, { "epoch": 0.1578376895108733, "grad_norm": 0.85546875, "learning_rate": 2.0275000000000005e-06, "loss": 0.2014, "step": 7190 }, { "epoch": 0.15805721341839885, "grad_norm": 0.8125, "learning_rate": 2.0025000000000003e-06, "loss": 0.1756, "step": 7200 }, { "epoch": 0.1582767373259244, "grad_norm": 0.62890625, "learning_rate": 1.9775e-06, "loss": 0.1767, "step": 7210 }, { "epoch": 0.15849626123344995, "grad_norm": 0.84765625, "learning_rate": 1.9525000000000004e-06, "loss": 0.1863, "step": 7220 }, { "epoch": 0.1587157851409755, "grad_norm": 1.0234375, "learning_rate": 1.9275e-06, "loss": 0.2036, "step": 7230 }, { "epoch": 0.15893530904850106, "grad_norm": 0.84765625, "learning_rate": 1.9025000000000002e-06, "loss": 0.1922, "step": 7240 }, { "epoch": 0.15915483295602662, "grad_norm": 1.1171875, "learning_rate": 1.8775000000000002e-06, "loss": 0.1937, "step": 7250 }, { "epoch": 0.15937435686355217, "grad_norm": 0.828125, "learning_rate": 1.8525e-06, "loss": 0.2084, "step": 7260 }, { "epoch": 0.15959388077107772, "grad_norm": 0.87890625, "learning_rate": 1.8275e-06, "loss": 0.2239, "step": 7270 }, { "epoch": 0.15981340467860328, "grad_norm": 0.6640625, "learning_rate": 1.8025000000000001e-06, "loss": 0.1826, "step": 7280 }, { "epoch": 0.16003292858612883, "grad_norm": 0.93359375, "learning_rate": 1.7775000000000001e-06, "loss": 0.1847, "step": 7290 }, { "epoch": 0.16025245249365438, "grad_norm": 0.83203125, "learning_rate": 1.7525e-06, "loss": 0.2061, "step": 7300 }, { "epoch": 0.16047197640117994, "grad_norm": 0.69921875, "learning_rate": 1.7275e-06, "loss": 0.1872, "step": 7310 }, { "epoch": 0.1606915003087055, "grad_norm": 0.66015625, "learning_rate": 1.7025000000000002e-06, "loss": 0.1826, "step": 7320 }, { "epoch": 0.16091102421623105, "grad_norm": 1.109375, "learning_rate": 1.6775000000000002e-06, "loss": 0.1821, "step": 7330 }, { "epoch": 0.1611305481237566, "grad_norm": 0.7734375, "learning_rate": 1.6525000000000003e-06, "loss": 0.1842, "step": 7340 }, { "epoch": 0.16135007203128215, "grad_norm": 0.62109375, "learning_rate": 1.6275e-06, "loss": 0.1754, "step": 7350 }, { "epoch": 0.1615695959388077, "grad_norm": 0.828125, "learning_rate": 1.6025000000000001e-06, "loss": 0.1928, "step": 7360 }, { "epoch": 0.16178911984633326, "grad_norm": 0.91015625, "learning_rate": 1.5775000000000001e-06, "loss": 0.1871, "step": 7370 }, { "epoch": 0.16200864375385882, "grad_norm": 0.859375, "learning_rate": 1.5525000000000002e-06, "loss": 0.2064, "step": 7380 }, { "epoch": 0.16222816766138437, "grad_norm": 1.1484375, "learning_rate": 1.5275000000000002e-06, "loss": 0.2008, "step": 7390 }, { "epoch": 0.16244769156890992, "grad_norm": 0.6875, "learning_rate": 1.5025e-06, "loss": 0.1788, "step": 7400 }, { "epoch": 0.16266721547643548, "grad_norm": 0.8828125, "learning_rate": 1.4775e-06, "loss": 0.1762, "step": 7410 }, { "epoch": 0.16288673938396103, "grad_norm": 0.79296875, "learning_rate": 1.4525e-06, "loss": 0.1807, "step": 7420 }, { "epoch": 0.1631062632914866, "grad_norm": 0.82421875, "learning_rate": 1.4275e-06, "loss": 0.2052, "step": 7430 }, { "epoch": 0.16332578719901214, "grad_norm": 0.7578125, "learning_rate": 1.4025000000000003e-06, "loss": 0.1669, "step": 7440 }, { "epoch": 0.1635453111065377, "grad_norm": 0.7578125, "learning_rate": 1.3775000000000002e-06, "loss": 0.1858, "step": 7450 }, { "epoch": 0.16376483501406325, "grad_norm": 0.6953125, "learning_rate": 1.3525000000000002e-06, "loss": 0.1636, "step": 7460 }, { "epoch": 0.1639843589215888, "grad_norm": 0.72265625, "learning_rate": 1.3275000000000002e-06, "loss": 0.1912, "step": 7470 }, { "epoch": 0.16420388282911436, "grad_norm": 0.83984375, "learning_rate": 1.3025000000000002e-06, "loss": 0.2127, "step": 7480 }, { "epoch": 0.1644234067366399, "grad_norm": 0.80859375, "learning_rate": 1.2775e-06, "loss": 0.1856, "step": 7490 }, { "epoch": 0.16464293064416546, "grad_norm": 0.72265625, "learning_rate": 1.2525e-06, "loss": 0.1888, "step": 7500 }, { "epoch": 0.16486245455169102, "grad_norm": 0.9296875, "learning_rate": 1.2275000000000001e-06, "loss": 0.2093, "step": 7510 }, { "epoch": 0.16508197845921657, "grad_norm": 1.1953125, "learning_rate": 1.2025000000000001e-06, "loss": 0.1947, "step": 7520 }, { "epoch": 0.16530150236674213, "grad_norm": 1.015625, "learning_rate": 1.1775e-06, "loss": 0.2203, "step": 7530 }, { "epoch": 0.16552102627426768, "grad_norm": 0.67578125, "learning_rate": 1.1525000000000002e-06, "loss": 0.1957, "step": 7540 }, { "epoch": 0.16574055018179323, "grad_norm": 0.60546875, "learning_rate": 1.1275000000000002e-06, "loss": 0.1815, "step": 7550 }, { "epoch": 0.1659600740893188, "grad_norm": 0.7734375, "learning_rate": 1.1025e-06, "loss": 0.163, "step": 7560 }, { "epoch": 0.16617959799684434, "grad_norm": 0.73828125, "learning_rate": 1.0775e-06, "loss": 0.167, "step": 7570 }, { "epoch": 0.1663991219043699, "grad_norm": 0.69921875, "learning_rate": 1.0525e-06, "loss": 0.1934, "step": 7580 }, { "epoch": 0.16661864581189545, "grad_norm": 0.75390625, "learning_rate": 1.0275000000000001e-06, "loss": 0.1982, "step": 7590 }, { "epoch": 0.166838169719421, "grad_norm": 0.75390625, "learning_rate": 1.0025000000000001e-06, "loss": 0.202, "step": 7600 }, { "epoch": 0.16705769362694656, "grad_norm": 0.55078125, "learning_rate": 9.775000000000002e-07, "loss": 0.1615, "step": 7610 }, { "epoch": 0.1672772175344721, "grad_norm": 0.8359375, "learning_rate": 9.525000000000001e-07, "loss": 0.2037, "step": 7620 }, { "epoch": 0.16749674144199767, "grad_norm": 0.9453125, "learning_rate": 9.275000000000001e-07, "loss": 0.2211, "step": 7630 }, { "epoch": 0.16771626534952322, "grad_norm": 0.66796875, "learning_rate": 9.025e-07, "loss": 0.1871, "step": 7640 }, { "epoch": 0.16793578925704877, "grad_norm": 0.84375, "learning_rate": 8.775000000000001e-07, "loss": 0.2264, "step": 7650 }, { "epoch": 0.16815531316457433, "grad_norm": 0.62890625, "learning_rate": 8.525000000000001e-07, "loss": 0.1951, "step": 7660 }, { "epoch": 0.16837483707209988, "grad_norm": 0.88671875, "learning_rate": 8.275000000000001e-07, "loss": 0.1819, "step": 7670 }, { "epoch": 0.16859436097962543, "grad_norm": 0.70703125, "learning_rate": 8.025e-07, "loss": 0.1665, "step": 7680 }, { "epoch": 0.168813884887151, "grad_norm": 0.7734375, "learning_rate": 7.775000000000001e-07, "loss": 0.1548, "step": 7690 }, { "epoch": 0.16903340879467654, "grad_norm": 0.8125, "learning_rate": 7.525e-07, "loss": 0.1992, "step": 7700 }, { "epoch": 0.1692529327022021, "grad_norm": 0.69140625, "learning_rate": 7.275e-07, "loss": 0.1725, "step": 7710 }, { "epoch": 0.16947245660972765, "grad_norm": 0.65234375, "learning_rate": 7.025000000000002e-07, "loss": 0.1665, "step": 7720 }, { "epoch": 0.1696919805172532, "grad_norm": 0.8125, "learning_rate": 6.775000000000001e-07, "loss": 0.1567, "step": 7730 }, { "epoch": 0.16991150442477876, "grad_norm": 0.984375, "learning_rate": 6.525000000000001e-07, "loss": 0.1834, "step": 7740 }, { "epoch": 0.1701310283323043, "grad_norm": 0.70703125, "learning_rate": 6.275e-07, "loss": 0.1979, "step": 7750 }, { "epoch": 0.17035055223982987, "grad_norm": 1.0546875, "learning_rate": 6.025000000000001e-07, "loss": 0.2028, "step": 7760 }, { "epoch": 0.17057007614735542, "grad_norm": 1.1484375, "learning_rate": 5.775000000000001e-07, "loss": 0.181, "step": 7770 }, { "epoch": 0.17078960005488097, "grad_norm": 0.6796875, "learning_rate": 5.525e-07, "loss": 0.1798, "step": 7780 }, { "epoch": 0.17100912396240653, "grad_norm": 1.1171875, "learning_rate": 5.275e-07, "loss": 0.1906, "step": 7790 }, { "epoch": 0.17122864786993208, "grad_norm": 0.7265625, "learning_rate": 5.025000000000001e-07, "loss": 0.1689, "step": 7800 }, { "epoch": 0.17144817177745764, "grad_norm": 0.796875, "learning_rate": 4.775000000000001e-07, "loss": 0.1893, "step": 7810 }, { "epoch": 0.1716676956849832, "grad_norm": 0.7265625, "learning_rate": 4.525e-07, "loss": 0.1728, "step": 7820 }, { "epoch": 0.17188721959250874, "grad_norm": 0.66015625, "learning_rate": 4.275000000000001e-07, "loss": 0.1853, "step": 7830 }, { "epoch": 0.1721067435000343, "grad_norm": 0.84765625, "learning_rate": 4.0250000000000006e-07, "loss": 0.1792, "step": 7840 }, { "epoch": 0.17232626740755985, "grad_norm": 0.82421875, "learning_rate": 3.7750000000000004e-07, "loss": 0.1823, "step": 7850 }, { "epoch": 0.1725457913150854, "grad_norm": 0.80078125, "learning_rate": 3.525e-07, "loss": 0.1791, "step": 7860 }, { "epoch": 0.17276531522261096, "grad_norm": 1.09375, "learning_rate": 3.2750000000000004e-07, "loss": 0.192, "step": 7870 }, { "epoch": 0.17298483913013651, "grad_norm": 0.875, "learning_rate": 3.025e-07, "loss": 0.1608, "step": 7880 }, { "epoch": 0.17320436303766207, "grad_norm": 0.79296875, "learning_rate": 2.7750000000000004e-07, "loss": 0.1995, "step": 7890 }, { "epoch": 0.17342388694518762, "grad_norm": 0.6796875, "learning_rate": 2.525e-07, "loss": 0.186, "step": 7900 }, { "epoch": 0.17364341085271318, "grad_norm": 0.79296875, "learning_rate": 2.2750000000000002e-07, "loss": 0.1908, "step": 7910 }, { "epoch": 0.17386293476023873, "grad_norm": 0.66015625, "learning_rate": 2.0250000000000002e-07, "loss": 0.1637, "step": 7920 }, { "epoch": 0.17408245866776428, "grad_norm": 0.671875, "learning_rate": 1.775e-07, "loss": 0.1692, "step": 7930 }, { "epoch": 0.17430198257528984, "grad_norm": 0.61328125, "learning_rate": 1.5250000000000002e-07, "loss": 0.1555, "step": 7940 }, { "epoch": 0.1745215064828154, "grad_norm": 0.640625, "learning_rate": 1.275e-07, "loss": 0.1798, "step": 7950 }, { "epoch": 0.17474103039034095, "grad_norm": 0.81640625, "learning_rate": 1.0250000000000001e-07, "loss": 0.2014, "step": 7960 }, { "epoch": 0.1749605542978665, "grad_norm": 0.95703125, "learning_rate": 7.750000000000001e-08, "loss": 0.1988, "step": 7970 }, { "epoch": 0.17518007820539205, "grad_norm": 0.89453125, "learning_rate": 5.250000000000001e-08, "loss": 0.2301, "step": 7980 }, { "epoch": 0.1753996021129176, "grad_norm": 0.73828125, "learning_rate": 2.75e-08, "loss": 0.1639, "step": 7990 }, { "epoch": 0.17561912602044316, "grad_norm": 0.74609375, "learning_rate": 2.5e-09, "loss": 0.1793, "step": 8000 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5267428972077793e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }