diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,115256 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 16459, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.075703262652652e-05, + "grad_norm": 0.17771193385124207, + "learning_rate": 0.0, + "loss": 1.1983, + "step": 1 + }, + { + "epoch": 0.00012151406525305304, + "grad_norm": 0.6649877429008484, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.434, + "step": 2 + }, + { + "epoch": 0.00018227109787957957, + "grad_norm": 0.1839398741722107, + "learning_rate": 4.000000000000001e-06, + "loss": 1.2097, + "step": 3 + }, + { + "epoch": 0.0002430281305061061, + "grad_norm": 0.3777402341365814, + "learning_rate": 6e-06, + "loss": 1.4124, + "step": 4 + }, + { + "epoch": 0.0003037851631326326, + "grad_norm": 0.2884521484375, + "learning_rate": 8.000000000000001e-06, + "loss": 1.42, + "step": 5 + }, + { + "epoch": 0.00036454219575915913, + "grad_norm": 0.18382205069065094, + "learning_rate": 1e-05, + "loss": 1.2358, + "step": 6 + }, + { + "epoch": 0.00042529922838568566, + "grad_norm": 0.8757884502410889, + "learning_rate": 1.2e-05, + "loss": 1.4333, + "step": 7 + }, + { + "epoch": 0.0004860562610122122, + "grad_norm": 0.2029954046010971, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.2286, + "step": 8 + }, + { + "epoch": 0.0005468132936387387, + "grad_norm": 0.6275981068611145, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3836, + "step": 9 + }, + { + "epoch": 0.0006075703262652652, + "grad_norm": 0.2643689513206482, + "learning_rate": 1.8e-05, + "loss": 1.2047, + "step": 10 + }, + { + "epoch": 0.0006683273588917917, + "grad_norm": 0.2377251386642456, + "learning_rate": 2e-05, + "loss": 1.2234, + "step": 11 + }, + { + "epoch": 0.0007290843915183183, + "grad_norm": 0.27713561058044434, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.2786, + "step": 12 + }, + { + "epoch": 0.0007898414241448448, + "grad_norm": 0.25062233209609985, + "learning_rate": 2.4e-05, + "loss": 1.2505, + "step": 13 + }, + { + "epoch": 0.0008505984567713713, + "grad_norm": 0.2383335530757904, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.358, + "step": 14 + }, + { + "epoch": 0.0009113554893978978, + "grad_norm": 0.3477482795715332, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.4309, + "step": 15 + }, + { + "epoch": 0.0009721125220244244, + "grad_norm": 0.24556300044059753, + "learning_rate": 3e-05, + "loss": 1.2673, + "step": 16 + }, + { + "epoch": 0.0010328695546509509, + "grad_norm": 1.1669777631759644, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.3584, + "step": 17 + }, + { + "epoch": 0.0010936265872774774, + "grad_norm": 0.26149648427963257, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.3964, + "step": 18 + }, + { + "epoch": 0.001154383619904004, + "grad_norm": 0.30708009004592896, + "learning_rate": 3.6e-05, + "loss": 1.3155, + "step": 19 + }, + { + "epoch": 0.0012151406525305304, + "grad_norm": 0.23968864977359772, + "learning_rate": 3.8e-05, + "loss": 1.2109, + "step": 20 + }, + { + "epoch": 0.001275897685157057, + "grad_norm": 0.2607705593109131, + "learning_rate": 4e-05, + "loss": 1.3489, + "step": 21 + }, + { + "epoch": 0.0013366547177835835, + "grad_norm": 0.2762501835823059, + "learning_rate": 4.2e-05, + "loss": 1.2211, + "step": 22 + }, + { + "epoch": 0.00139741175041011, + "grad_norm": 0.3239821195602417, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.4063, + "step": 23 + }, + { + "epoch": 0.0014581687830366365, + "grad_norm": 0.27468156814575195, + "learning_rate": 4.600000000000001e-05, + "loss": 1.2292, + "step": 24 + }, + { + "epoch": 0.001518925815663163, + "grad_norm": 0.34043198823928833, + "learning_rate": 4.8e-05, + "loss": 1.2227, + "step": 25 + }, + { + "epoch": 0.0015796828482896896, + "grad_norm": 0.23437948524951935, + "learning_rate": 5e-05, + "loss": 1.2727, + "step": 26 + }, + { + "epoch": 0.001640439880916216, + "grad_norm": 0.2744443416595459, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.2517, + "step": 27 + }, + { + "epoch": 0.0017011969135427426, + "grad_norm": 0.3189443051815033, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.1987, + "step": 28 + }, + { + "epoch": 0.0017619539461692691, + "grad_norm": 0.36634373664855957, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.2931, + "step": 29 + }, + { + "epoch": 0.0018227109787957957, + "grad_norm": 0.21552708745002747, + "learning_rate": 5.8e-05, + "loss": 1.19, + "step": 30 + }, + { + "epoch": 0.0018834680114223222, + "grad_norm": 0.22331136465072632, + "learning_rate": 6e-05, + "loss": 1.166, + "step": 31 + }, + { + "epoch": 0.0019442250440488487, + "grad_norm": 0.27115359902381897, + "learning_rate": 6.2e-05, + "loss": 1.2394, + "step": 32 + }, + { + "epoch": 0.002004982076675375, + "grad_norm": 0.24723933637142181, + "learning_rate": 6.400000000000001e-05, + "loss": 1.298, + "step": 33 + }, + { + "epoch": 0.0020657391093019018, + "grad_norm": 0.6054807305335999, + "learning_rate": 6.6e-05, + "loss": 1.2711, + "step": 34 + }, + { + "epoch": 0.002126496141928428, + "grad_norm": 0.2657296657562256, + "learning_rate": 6.800000000000001e-05, + "loss": 1.1846, + "step": 35 + }, + { + "epoch": 0.002187253174554955, + "grad_norm": 0.5996095538139343, + "learning_rate": 7e-05, + "loss": 1.4208, + "step": 36 + }, + { + "epoch": 0.002248010207181481, + "grad_norm": 0.20939430594444275, + "learning_rate": 7.2e-05, + "loss": 1.21, + "step": 37 + }, + { + "epoch": 0.002308767239808008, + "grad_norm": 0.5714825987815857, + "learning_rate": 7.4e-05, + "loss": 1.2827, + "step": 38 + }, + { + "epoch": 0.002369524272434534, + "grad_norm": 0.9815083742141724, + "learning_rate": 7.6e-05, + "loss": 1.1884, + "step": 39 + }, + { + "epoch": 0.002430281305061061, + "grad_norm": 1.6414822340011597, + "learning_rate": 7.800000000000001e-05, + "loss": 1.4738, + "step": 40 + }, + { + "epoch": 0.002491038337687587, + "grad_norm": 0.3299960792064667, + "learning_rate": 8e-05, + "loss": 1.2818, + "step": 41 + }, + { + "epoch": 0.002551795370314114, + "grad_norm": 0.2786610424518585, + "learning_rate": 8.2e-05, + "loss": 1.3226, + "step": 42 + }, + { + "epoch": 0.0026125524029406402, + "grad_norm": 0.48672959208488464, + "learning_rate": 8.4e-05, + "loss": 1.3591, + "step": 43 + }, + { + "epoch": 0.002673309435567167, + "grad_norm": 0.2563237249851227, + "learning_rate": 8.6e-05, + "loss": 1.2016, + "step": 44 + }, + { + "epoch": 0.0027340664681936933, + "grad_norm": 0.31436944007873535, + "learning_rate": 8.800000000000001e-05, + "loss": 1.3627, + "step": 45 + }, + { + "epoch": 0.00279482350082022, + "grad_norm": 0.4340488016605377, + "learning_rate": 9e-05, + "loss": 1.3916, + "step": 46 + }, + { + "epoch": 0.0028555805334467463, + "grad_norm": 0.30021172761917114, + "learning_rate": 9.200000000000001e-05, + "loss": 1.1985, + "step": 47 + }, + { + "epoch": 0.002916337566073273, + "grad_norm": 0.587597668170929, + "learning_rate": 9.4e-05, + "loss": 1.3836, + "step": 48 + }, + { + "epoch": 0.0029770945986997994, + "grad_norm": 0.2984046936035156, + "learning_rate": 9.6e-05, + "loss": 1.2871, + "step": 49 + }, + { + "epoch": 0.003037851631326326, + "grad_norm": 1.8778438568115234, + "learning_rate": 9.8e-05, + "loss": 1.2312, + "step": 50 + }, + { + "epoch": 0.0030986086639528524, + "grad_norm": 0.548705518245697, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 51 + }, + { + "epoch": 0.003159365696579379, + "grad_norm": 0.5713818073272705, + "learning_rate": 9.999999908362017e-05, + "loss": 1.2052, + "step": 52 + }, + { + "epoch": 0.0032201227292059055, + "grad_norm": 0.4120612144470215, + "learning_rate": 9.999999633448071e-05, + "loss": 1.3071, + "step": 53 + }, + { + "epoch": 0.003280879761832432, + "grad_norm": 0.48224759101867676, + "learning_rate": 9.999999175258171e-05, + "loss": 1.2323, + "step": 54 + }, + { + "epoch": 0.0033416367944589585, + "grad_norm": 0.5659412741661072, + "learning_rate": 9.999998533792336e-05, + "loss": 1.203, + "step": 55 + }, + { + "epoch": 0.0034023938270854853, + "grad_norm": 0.34445783495903015, + "learning_rate": 9.999997709050588e-05, + "loss": 1.2022, + "step": 56 + }, + { + "epoch": 0.0034631508597120116, + "grad_norm": 0.957758903503418, + "learning_rate": 9.999996701032956e-05, + "loss": 1.2781, + "step": 57 + }, + { + "epoch": 0.0035239078923385383, + "grad_norm": 0.4782249331474304, + "learning_rate": 9.99999550973948e-05, + "loss": 1.166, + "step": 58 + }, + { + "epoch": 0.0035846649249650646, + "grad_norm": 0.4631136655807495, + "learning_rate": 9.999994135170202e-05, + "loss": 1.1847, + "step": 59 + }, + { + "epoch": 0.0036454219575915913, + "grad_norm": 0.49536243081092834, + "learning_rate": 9.999992577325172e-05, + "loss": 1.3205, + "step": 60 + }, + { + "epoch": 0.0037061789902181176, + "grad_norm": 0.6254922151565552, + "learning_rate": 9.999990836204447e-05, + "loss": 1.175, + "step": 61 + }, + { + "epoch": 0.0037669360228446444, + "grad_norm": 0.4348151385784149, + "learning_rate": 9.999988911808093e-05, + "loss": 1.306, + "step": 62 + }, + { + "epoch": 0.0038276930554711707, + "grad_norm": 0.3686026930809021, + "learning_rate": 9.999986804136179e-05, + "loss": 1.2343, + "step": 63 + }, + { + "epoch": 0.0038884500880976974, + "grad_norm": 0.26751378178596497, + "learning_rate": 9.99998451318878e-05, + "loss": 1.266, + "step": 64 + }, + { + "epoch": 0.003949207120724224, + "grad_norm": 0.555131733417511, + "learning_rate": 9.999982038965985e-05, + "loss": 1.1944, + "step": 65 + }, + { + "epoch": 0.00400996415335075, + "grad_norm": 0.2945740520954132, + "learning_rate": 9.999979381467879e-05, + "loss": 1.1458, + "step": 66 + }, + { + "epoch": 0.004070721185977277, + "grad_norm": 0.5865735411643982, + "learning_rate": 9.999976540694564e-05, + "loss": 1.2297, + "step": 67 + }, + { + "epoch": 0.0041314782186038035, + "grad_norm": 0.5849476456642151, + "learning_rate": 9.999973516646142e-05, + "loss": 1.3176, + "step": 68 + }, + { + "epoch": 0.00419223525123033, + "grad_norm": 0.3707224726676941, + "learning_rate": 9.999970309322725e-05, + "loss": 1.2165, + "step": 69 + }, + { + "epoch": 0.004252992283856856, + "grad_norm": 0.6674245595932007, + "learning_rate": 9.999966918724429e-05, + "loss": 1.248, + "step": 70 + }, + { + "epoch": 0.004313749316483383, + "grad_norm": 0.33753934502601624, + "learning_rate": 9.999963344851379e-05, + "loss": 1.3501, + "step": 71 + }, + { + "epoch": 0.00437450634910991, + "grad_norm": 0.6501384377479553, + "learning_rate": 9.999959587703706e-05, + "loss": 1.2387, + "step": 72 + }, + { + "epoch": 0.004435263381736436, + "grad_norm": 0.380902498960495, + "learning_rate": 9.999955647281549e-05, + "loss": 1.1812, + "step": 73 + }, + { + "epoch": 0.004496020414362962, + "grad_norm": 0.5059203505516052, + "learning_rate": 9.999951523585051e-05, + "loss": 1.2051, + "step": 74 + }, + { + "epoch": 0.004556777446989489, + "grad_norm": 0.21010912954807281, + "learning_rate": 9.999947216614363e-05, + "loss": 1.1709, + "step": 75 + }, + { + "epoch": 0.004617534479616016, + "grad_norm": 1.1988906860351562, + "learning_rate": 9.999942726369644e-05, + "loss": 1.1707, + "step": 76 + }, + { + "epoch": 0.004678291512242542, + "grad_norm": 0.3643610179424286, + "learning_rate": 9.999938052851057e-05, + "loss": 1.2246, + "step": 77 + }, + { + "epoch": 0.004739048544869068, + "grad_norm": 0.2887507379055023, + "learning_rate": 9.999933196058775e-05, + "loss": 1.2145, + "step": 78 + }, + { + "epoch": 0.0047998055774955955, + "grad_norm": 0.8529403209686279, + "learning_rate": 9.999928155992975e-05, + "loss": 1.2869, + "step": 79 + }, + { + "epoch": 0.004860562610122122, + "grad_norm": 0.4480118155479431, + "learning_rate": 9.99992293265384e-05, + "loss": 1.3645, + "step": 80 + }, + { + "epoch": 0.004921319642748648, + "grad_norm": 0.5300185680389404, + "learning_rate": 9.999917526041567e-05, + "loss": 1.3036, + "step": 81 + }, + { + "epoch": 0.004982076675375174, + "grad_norm": 0.5703990459442139, + "learning_rate": 9.999911936156348e-05, + "loss": 1.1698, + "step": 82 + }, + { + "epoch": 0.005042833708001702, + "grad_norm": 0.4402383267879486, + "learning_rate": 9.999906162998392e-05, + "loss": 1.2819, + "step": 83 + }, + { + "epoch": 0.005103590740628228, + "grad_norm": 0.3625530004501343, + "learning_rate": 9.999900206567906e-05, + "loss": 1.1667, + "step": 84 + }, + { + "epoch": 0.005164347773254754, + "grad_norm": 0.31713375449180603, + "learning_rate": 9.999894066865115e-05, + "loss": 1.1938, + "step": 85 + }, + { + "epoch": 0.0052251048058812805, + "grad_norm": 0.49715203046798706, + "learning_rate": 9.999887743890239e-05, + "loss": 1.3341, + "step": 86 + }, + { + "epoch": 0.005285861838507808, + "grad_norm": 0.3231082856655121, + "learning_rate": 9.999881237643512e-05, + "loss": 1.3648, + "step": 87 + }, + { + "epoch": 0.005346618871134334, + "grad_norm": 0.22051608562469482, + "learning_rate": 9.999874548125172e-05, + "loss": 1.218, + "step": 88 + }, + { + "epoch": 0.00540737590376086, + "grad_norm": 0.7233831882476807, + "learning_rate": 9.999867675335464e-05, + "loss": 1.181, + "step": 89 + }, + { + "epoch": 0.005468132936387387, + "grad_norm": 0.24952976405620575, + "learning_rate": 9.999860619274639e-05, + "loss": 1.2167, + "step": 90 + }, + { + "epoch": 0.005528889969013914, + "grad_norm": 0.3820326030254364, + "learning_rate": 9.999853379942956e-05, + "loss": 1.304, + "step": 91 + }, + { + "epoch": 0.00558964700164044, + "grad_norm": 0.28423139452934265, + "learning_rate": 9.999845957340684e-05, + "loss": 1.1986, + "step": 92 + }, + { + "epoch": 0.005650404034266966, + "grad_norm": 10.655237197875977, + "learning_rate": 9.999838351468088e-05, + "loss": 1.1906, + "step": 93 + }, + { + "epoch": 0.005711161066893493, + "grad_norm": 0.5088858604431152, + "learning_rate": 9.999830562325453e-05, + "loss": 1.3004, + "step": 94 + }, + { + "epoch": 0.00577191809952002, + "grad_norm": 0.4481561481952667, + "learning_rate": 9.999822589913062e-05, + "loss": 1.2209, + "step": 95 + }, + { + "epoch": 0.005832675132146546, + "grad_norm": 0.23201344907283783, + "learning_rate": 9.999814434231207e-05, + "loss": 1.1614, + "step": 96 + }, + { + "epoch": 0.0058934321647730725, + "grad_norm": 0.5318001508712769, + "learning_rate": 9.999806095280189e-05, + "loss": 1.1645, + "step": 97 + }, + { + "epoch": 0.005954189197399599, + "grad_norm": 0.19209295511245728, + "learning_rate": 9.99979757306031e-05, + "loss": 1.1684, + "step": 98 + }, + { + "epoch": 0.006014946230026126, + "grad_norm": 0.4901897609233856, + "learning_rate": 9.999788867571887e-05, + "loss": 1.1833, + "step": 99 + }, + { + "epoch": 0.006075703262652652, + "grad_norm": 0.24492308497428894, + "learning_rate": 9.999779978815233e-05, + "loss": 1.2093, + "step": 100 + }, + { + "epoch": 0.0061364602952791785, + "grad_norm": 0.3037416636943817, + "learning_rate": 9.99977090679068e-05, + "loss": 1.1944, + "step": 101 + }, + { + "epoch": 0.006197217327905705, + "grad_norm": 0.38511979579925537, + "learning_rate": 9.999761651498559e-05, + "loss": 1.2449, + "step": 102 + }, + { + "epoch": 0.006257974360532232, + "grad_norm": 45.19050979614258, + "learning_rate": 9.999752212939206e-05, + "loss": 1.3232, + "step": 103 + }, + { + "epoch": 0.006318731393158758, + "grad_norm": 0.3553546369075775, + "learning_rate": 9.99974259111297e-05, + "loss": 1.2232, + "step": 104 + }, + { + "epoch": 0.006379488425785285, + "grad_norm": 0.23319929838180542, + "learning_rate": 9.999732786020203e-05, + "loss": 1.1975, + "step": 105 + }, + { + "epoch": 0.006440245458411811, + "grad_norm": 0.21722131967544556, + "learning_rate": 9.999722797661264e-05, + "loss": 1.2408, + "step": 106 + }, + { + "epoch": 0.006501002491038338, + "grad_norm": 0.3070448338985443, + "learning_rate": 9.99971262603652e-05, + "loss": 1.1694, + "step": 107 + }, + { + "epoch": 0.006561759523664864, + "grad_norm": 0.20213374495506287, + "learning_rate": 9.999702271146343e-05, + "loss": 1.2252, + "step": 108 + }, + { + "epoch": 0.006622516556291391, + "grad_norm": 0.872248649597168, + "learning_rate": 9.999691732991112e-05, + "loss": 1.1857, + "step": 109 + }, + { + "epoch": 0.006683273588917917, + "grad_norm": 0.2687833309173584, + "learning_rate": 9.999681011571215e-05, + "loss": 1.1348, + "step": 110 + }, + { + "epoch": 0.006744030621544444, + "grad_norm": 0.22979538142681122, + "learning_rate": 9.999670106887045e-05, + "loss": 1.319, + "step": 111 + }, + { + "epoch": 0.0068047876541709705, + "grad_norm": 0.3538469970226288, + "learning_rate": 9.999659018938998e-05, + "loss": 1.3412, + "step": 112 + }, + { + "epoch": 0.006865544686797497, + "grad_norm": 0.23582477867603302, + "learning_rate": 9.999647747727485e-05, + "loss": 1.1568, + "step": 113 + }, + { + "epoch": 0.006926301719424023, + "grad_norm": 0.2528184652328491, + "learning_rate": 9.999636293252919e-05, + "loss": 1.1557, + "step": 114 + }, + { + "epoch": 0.006987058752050549, + "grad_norm": 0.22521843016147614, + "learning_rate": 9.999624655515716e-05, + "loss": 1.1874, + "step": 115 + }, + { + "epoch": 0.007047815784677077, + "grad_norm": 0.26984483003616333, + "learning_rate": 9.999612834516305e-05, + "loss": 1.3546, + "step": 116 + }, + { + "epoch": 0.007108572817303603, + "grad_norm": 0.357096403837204, + "learning_rate": 9.99960083025512e-05, + "loss": 1.3785, + "step": 117 + }, + { + "epoch": 0.007169329849930129, + "grad_norm": 0.41239115595817566, + "learning_rate": 9.9995886427326e-05, + "loss": 1.1813, + "step": 118 + }, + { + "epoch": 0.0072300868825566555, + "grad_norm": 0.19457732141017914, + "learning_rate": 9.999576271949192e-05, + "loss": 1.2222, + "step": 119 + }, + { + "epoch": 0.007290843915183183, + "grad_norm": 0.298249214887619, + "learning_rate": 9.999563717905349e-05, + "loss": 1.2833, + "step": 120 + }, + { + "epoch": 0.007351600947809709, + "grad_norm": 0.3192598223686218, + "learning_rate": 9.999550980601531e-05, + "loss": 1.2762, + "step": 121 + }, + { + "epoch": 0.007412357980436235, + "grad_norm": 0.2641714811325073, + "learning_rate": 9.999538060038207e-05, + "loss": 1.2404, + "step": 122 + }, + { + "epoch": 0.007473115013062762, + "grad_norm": 0.21821774542331696, + "learning_rate": 9.999524956215848e-05, + "loss": 1.1696, + "step": 123 + }, + { + "epoch": 0.007533872045689289, + "grad_norm": 0.21299785375595093, + "learning_rate": 9.999511669134935e-05, + "loss": 1.1902, + "step": 124 + }, + { + "epoch": 0.007594629078315815, + "grad_norm": 0.2785874605178833, + "learning_rate": 9.999498198795955e-05, + "loss": 1.2195, + "step": 125 + }, + { + "epoch": 0.007655386110942341, + "grad_norm": 0.293491929769516, + "learning_rate": 9.999484545199405e-05, + "loss": 1.2029, + "step": 126 + }, + { + "epoch": 0.007716143143568868, + "grad_norm": 0.3122383952140808, + "learning_rate": 9.999470708345779e-05, + "loss": 1.1709, + "step": 127 + }, + { + "epoch": 0.007776900176195395, + "grad_norm": 0.3170028328895569, + "learning_rate": 9.99945668823559e-05, + "loss": 1.1759, + "step": 128 + }, + { + "epoch": 0.007837657208821921, + "grad_norm": 0.3421311676502228, + "learning_rate": 9.999442484869348e-05, + "loss": 1.3083, + "step": 129 + }, + { + "epoch": 0.007898414241448447, + "grad_norm": 0.2029871642589569, + "learning_rate": 9.999428098247576e-05, + "loss": 1.2978, + "step": 130 + }, + { + "epoch": 0.007959171274074974, + "grad_norm": 0.17570683360099792, + "learning_rate": 9.999413528370803e-05, + "loss": 1.1686, + "step": 131 + }, + { + "epoch": 0.0080199283067015, + "grad_norm": 0.1681767851114273, + "learning_rate": 9.999398775239559e-05, + "loss": 1.1594, + "step": 132 + }, + { + "epoch": 0.008080685339328026, + "grad_norm": 0.22254763543605804, + "learning_rate": 9.999383838854385e-05, + "loss": 1.1966, + "step": 133 + }, + { + "epoch": 0.008141442371954554, + "grad_norm": 0.39164310693740845, + "learning_rate": 9.999368719215831e-05, + "loss": 1.2692, + "step": 134 + }, + { + "epoch": 0.00820219940458108, + "grad_norm": 0.2754465341567993, + "learning_rate": 9.999353416324452e-05, + "loss": 1.2483, + "step": 135 + }, + { + "epoch": 0.008262956437207607, + "grad_norm": 0.33278337121009827, + "learning_rate": 9.999337930180805e-05, + "loss": 1.2654, + "step": 136 + }, + { + "epoch": 0.008323713469834133, + "grad_norm": 0.5331653356552124, + "learning_rate": 9.999322260785461e-05, + "loss": 1.3173, + "step": 137 + }, + { + "epoch": 0.00838447050246066, + "grad_norm": 0.387779176235199, + "learning_rate": 9.999306408138994e-05, + "loss": 1.1948, + "step": 138 + }, + { + "epoch": 0.008445227535087186, + "grad_norm": 0.434356153011322, + "learning_rate": 9.999290372241982e-05, + "loss": 1.1932, + "step": 139 + }, + { + "epoch": 0.008505984567713712, + "grad_norm": 53.942237854003906, + "learning_rate": 9.999274153095016e-05, + "loss": 1.2247, + "step": 140 + }, + { + "epoch": 0.008566741600340239, + "grad_norm": 0.5120449066162109, + "learning_rate": 9.99925775069869e-05, + "loss": 1.2842, + "step": 141 + }, + { + "epoch": 0.008627498632966767, + "grad_norm": 0.40591004490852356, + "learning_rate": 9.999241165053605e-05, + "loss": 1.2218, + "step": 142 + }, + { + "epoch": 0.008688255665593293, + "grad_norm": 0.35961678624153137, + "learning_rate": 9.999224396160367e-05, + "loss": 1.3417, + "step": 143 + }, + { + "epoch": 0.00874901269821982, + "grad_norm": 0.2422172874212265, + "learning_rate": 9.999207444019594e-05, + "loss": 1.2508, + "step": 144 + }, + { + "epoch": 0.008809769730846346, + "grad_norm": 0.21179278194904327, + "learning_rate": 9.999190308631906e-05, + "loss": 1.2071, + "step": 145 + }, + { + "epoch": 0.008870526763472872, + "grad_norm": 0.22785666584968567, + "learning_rate": 9.999172989997931e-05, + "loss": 1.2072, + "step": 146 + }, + { + "epoch": 0.008931283796099398, + "grad_norm": 0.22191546857357025, + "learning_rate": 9.999155488118302e-05, + "loss": 1.1528, + "step": 147 + }, + { + "epoch": 0.008992040828725924, + "grad_norm": 0.4478309750556946, + "learning_rate": 9.999137802993664e-05, + "loss": 1.168, + "step": 148 + }, + { + "epoch": 0.00905279786135245, + "grad_norm": 0.2741773724555969, + "learning_rate": 9.999119934624663e-05, + "loss": 1.2232, + "step": 149 + }, + { + "epoch": 0.009113554893978979, + "grad_norm": 0.2959984242916107, + "learning_rate": 9.999101883011954e-05, + "loss": 1.2353, + "step": 150 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 0.24366505444049835, + "learning_rate": 9.9990836481562e-05, + "loss": 1.201, + "step": 151 + }, + { + "epoch": 0.009235068959232031, + "grad_norm": 0.27018189430236816, + "learning_rate": 9.99906523005807e-05, + "loss": 1.2382, + "step": 152 + }, + { + "epoch": 0.009295825991858558, + "grad_norm": 0.5289650559425354, + "learning_rate": 9.999046628718234e-05, + "loss": 1.2516, + "step": 153 + }, + { + "epoch": 0.009356583024485084, + "grad_norm": 0.23208869993686676, + "learning_rate": 9.99902784413738e-05, + "loss": 1.185, + "step": 154 + }, + { + "epoch": 0.00941734005711161, + "grad_norm": 0.28683531284332275, + "learning_rate": 9.999008876316194e-05, + "loss": 1.2603, + "step": 155 + }, + { + "epoch": 0.009478097089738137, + "grad_norm": 0.2525317072868347, + "learning_rate": 9.99898972525537e-05, + "loss": 1.1543, + "step": 156 + }, + { + "epoch": 0.009538854122364663, + "grad_norm": 0.5868335366249084, + "learning_rate": 9.998970390955613e-05, + "loss": 1.232, + "step": 157 + }, + { + "epoch": 0.009599611154991191, + "grad_norm": 0.2296774834394455, + "learning_rate": 9.99895087341763e-05, + "loss": 1.1516, + "step": 158 + }, + { + "epoch": 0.009660368187617717, + "grad_norm": 0.3657951354980469, + "learning_rate": 9.998931172642134e-05, + "loss": 1.1734, + "step": 159 + }, + { + "epoch": 0.009721125220244244, + "grad_norm": 0.2197127789258957, + "learning_rate": 9.998911288629852e-05, + "loss": 1.2657, + "step": 160 + }, + { + "epoch": 0.00978188225287077, + "grad_norm": 0.18814058601856232, + "learning_rate": 9.99889122138151e-05, + "loss": 1.181, + "step": 161 + }, + { + "epoch": 0.009842639285497296, + "grad_norm": 0.19529810547828674, + "learning_rate": 9.998870970897844e-05, + "loss": 1.1395, + "step": 162 + }, + { + "epoch": 0.009903396318123822, + "grad_norm": 2.752793788909912, + "learning_rate": 9.998850537179596e-05, + "loss": 1.1917, + "step": 163 + }, + { + "epoch": 0.009964153350750349, + "grad_norm": 0.29561033844947815, + "learning_rate": 9.998829920227514e-05, + "loss": 1.2864, + "step": 164 + }, + { + "epoch": 0.010024910383376875, + "grad_norm": 0.4728116989135742, + "learning_rate": 9.998809120042358e-05, + "loss": 1.1622, + "step": 165 + }, + { + "epoch": 0.010085667416003403, + "grad_norm": 0.3741132318973541, + "learning_rate": 9.998788136624885e-05, + "loss": 1.2486, + "step": 166 + }, + { + "epoch": 0.01014642444862993, + "grad_norm": 0.32585614919662476, + "learning_rate": 9.998766969975868e-05, + "loss": 1.1889, + "step": 167 + }, + { + "epoch": 0.010207181481256456, + "grad_norm": 0.38850677013397217, + "learning_rate": 9.99874562009608e-05, + "loss": 1.139, + "step": 168 + }, + { + "epoch": 0.010267938513882982, + "grad_norm": 0.23338811099529266, + "learning_rate": 9.998724086986307e-05, + "loss": 1.1367, + "step": 169 + }, + { + "epoch": 0.010328695546509508, + "grad_norm": 0.3551323711872101, + "learning_rate": 9.998702370647335e-05, + "loss": 1.2448, + "step": 170 + }, + { + "epoch": 0.010389452579136035, + "grad_norm": 0.3741970956325531, + "learning_rate": 9.998680471079962e-05, + "loss": 1.2648, + "step": 171 + }, + { + "epoch": 0.010450209611762561, + "grad_norm": 0.31997615098953247, + "learning_rate": 9.99865838828499e-05, + "loss": 1.2447, + "step": 172 + }, + { + "epoch": 0.010510966644389087, + "grad_norm": 0.24562925100326538, + "learning_rate": 9.998636122263228e-05, + "loss": 1.1752, + "step": 173 + }, + { + "epoch": 0.010571723677015615, + "grad_norm": 0.44380733370780945, + "learning_rate": 9.998613673015494e-05, + "loss": 1.263, + "step": 174 + }, + { + "epoch": 0.010632480709642142, + "grad_norm": 0.2343125343322754, + "learning_rate": 9.998591040542608e-05, + "loss": 1.193, + "step": 175 + }, + { + "epoch": 0.010693237742268668, + "grad_norm": 0.3768921196460724, + "learning_rate": 9.998568224845402e-05, + "loss": 1.2294, + "step": 176 + }, + { + "epoch": 0.010753994774895194, + "grad_norm": 0.2599649131298065, + "learning_rate": 9.998545225924711e-05, + "loss": 1.211, + "step": 177 + }, + { + "epoch": 0.01081475180752172, + "grad_norm": 0.35759758949279785, + "learning_rate": 9.99852204378138e-05, + "loss": 1.2009, + "step": 178 + }, + { + "epoch": 0.010875508840148247, + "grad_norm": 0.18380869925022125, + "learning_rate": 9.998498678416254e-05, + "loss": 1.1803, + "step": 179 + }, + { + "epoch": 0.010936265872774773, + "grad_norm": 0.2130039930343628, + "learning_rate": 9.998475129830196e-05, + "loss": 1.1625, + "step": 180 + }, + { + "epoch": 0.0109970229054013, + "grad_norm": 0.23620057106018066, + "learning_rate": 9.998451398024066e-05, + "loss": 1.2213, + "step": 181 + }, + { + "epoch": 0.011057779938027828, + "grad_norm": 0.3125370740890503, + "learning_rate": 9.998427482998732e-05, + "loss": 1.1596, + "step": 182 + }, + { + "epoch": 0.011118536970654354, + "grad_norm": 0.21282735466957092, + "learning_rate": 9.998403384755074e-05, + "loss": 1.1924, + "step": 183 + }, + { + "epoch": 0.01117929400328088, + "grad_norm": 0.19769485294818878, + "learning_rate": 9.998379103293973e-05, + "loss": 1.1965, + "step": 184 + }, + { + "epoch": 0.011240051035907406, + "grad_norm": 0.21187029778957367, + "learning_rate": 9.99835463861632e-05, + "loss": 1.1526, + "step": 185 + }, + { + "epoch": 0.011300808068533933, + "grad_norm": 0.39983147382736206, + "learning_rate": 9.998329990723011e-05, + "loss": 1.2193, + "step": 186 + }, + { + "epoch": 0.011361565101160459, + "grad_norm": 0.3108305335044861, + "learning_rate": 9.998305159614951e-05, + "loss": 1.2, + "step": 187 + }, + { + "epoch": 0.011422322133786985, + "grad_norm": 3.6537888050079346, + "learning_rate": 9.998280145293049e-05, + "loss": 1.2986, + "step": 188 + }, + { + "epoch": 0.011483079166413512, + "grad_norm": 0.378499835729599, + "learning_rate": 9.998254947758221e-05, + "loss": 1.2226, + "step": 189 + }, + { + "epoch": 0.01154383619904004, + "grad_norm": 0.2172241359949112, + "learning_rate": 9.998229567011393e-05, + "loss": 1.1442, + "step": 190 + }, + { + "epoch": 0.011604593231666566, + "grad_norm": 0.2371995598077774, + "learning_rate": 9.998204003053492e-05, + "loss": 1.2858, + "step": 191 + }, + { + "epoch": 0.011665350264293092, + "grad_norm": 0.2743389904499054, + "learning_rate": 9.99817825588546e-05, + "loss": 1.2653, + "step": 192 + }, + { + "epoch": 0.011726107296919619, + "grad_norm": 0.4419936239719391, + "learning_rate": 9.998152325508236e-05, + "loss": 1.241, + "step": 193 + }, + { + "epoch": 0.011786864329546145, + "grad_norm": 2.6996798515319824, + "learning_rate": 9.998126211922773e-05, + "loss": 1.2827, + "step": 194 + }, + { + "epoch": 0.011847621362172671, + "grad_norm": 0.5182858109474182, + "learning_rate": 9.998099915130029e-05, + "loss": 1.248, + "step": 195 + }, + { + "epoch": 0.011908378394799198, + "grad_norm": 0.22894130647182465, + "learning_rate": 9.998073435130963e-05, + "loss": 1.1438, + "step": 196 + }, + { + "epoch": 0.011969135427425724, + "grad_norm": 0.592929482460022, + "learning_rate": 9.998046771926553e-05, + "loss": 1.3242, + "step": 197 + }, + { + "epoch": 0.012029892460052252, + "grad_norm": 0.21945858001708984, + "learning_rate": 9.99801992551777e-05, + "loss": 1.1592, + "step": 198 + }, + { + "epoch": 0.012090649492678778, + "grad_norm": 0.40205755829811096, + "learning_rate": 9.997992895905601e-05, + "loss": 1.2715, + "step": 199 + }, + { + "epoch": 0.012151406525305304, + "grad_norm": 31.714807510375977, + "learning_rate": 9.997965683091037e-05, + "loss": 1.1869, + "step": 200 + }, + { + "epoch": 0.01221216355793183, + "grad_norm": 0.3603743016719818, + "learning_rate": 9.997938287075076e-05, + "loss": 1.2231, + "step": 201 + }, + { + "epoch": 0.012272920590558357, + "grad_norm": 0.33039894700050354, + "learning_rate": 9.997910707858718e-05, + "loss": 1.2133, + "step": 202 + }, + { + "epoch": 0.012333677623184883, + "grad_norm": 0.22338835895061493, + "learning_rate": 9.997882945442978e-05, + "loss": 1.1859, + "step": 203 + }, + { + "epoch": 0.01239443465581141, + "grad_norm": 0.41773277521133423, + "learning_rate": 9.997854999828873e-05, + "loss": 1.2061, + "step": 204 + }, + { + "epoch": 0.012455191688437936, + "grad_norm": 0.2586667537689209, + "learning_rate": 9.997826871017427e-05, + "loss": 1.2613, + "step": 205 + }, + { + "epoch": 0.012515948721064464, + "grad_norm": 0.4832109212875366, + "learning_rate": 9.99779855900967e-05, + "loss": 1.1931, + "step": 206 + }, + { + "epoch": 0.01257670575369099, + "grad_norm": 0.19160710275173187, + "learning_rate": 9.997770063806641e-05, + "loss": 1.1408, + "step": 207 + }, + { + "epoch": 0.012637462786317517, + "grad_norm": 0.36751052737236023, + "learning_rate": 9.997741385409385e-05, + "loss": 1.2612, + "step": 208 + }, + { + "epoch": 0.012698219818944043, + "grad_norm": 1.9435746669769287, + "learning_rate": 9.997712523818952e-05, + "loss": 1.1441, + "step": 209 + }, + { + "epoch": 0.01275897685157057, + "grad_norm": 0.3401590585708618, + "learning_rate": 9.997683479036401e-05, + "loss": 1.1435, + "step": 210 + }, + { + "epoch": 0.012819733884197096, + "grad_norm": 2.203845977783203, + "learning_rate": 9.997654251062795e-05, + "loss": 1.1754, + "step": 211 + }, + { + "epoch": 0.012880490916823622, + "grad_norm": 2.6514179706573486, + "learning_rate": 9.997624839899208e-05, + "loss": 1.1724, + "step": 212 + }, + { + "epoch": 0.012941247949450148, + "grad_norm": 0.378422349691391, + "learning_rate": 9.997595245546716e-05, + "loss": 1.1723, + "step": 213 + }, + { + "epoch": 0.013002004982076676, + "grad_norm": 0.21302397549152374, + "learning_rate": 9.997565468006404e-05, + "loss": 1.2132, + "step": 214 + }, + { + "epoch": 0.013062762014703203, + "grad_norm": 0.5641165971755981, + "learning_rate": 9.997535507279363e-05, + "loss": 1.2206, + "step": 215 + }, + { + "epoch": 0.013123519047329729, + "grad_norm": 0.22931545972824097, + "learning_rate": 9.997505363366693e-05, + "loss": 1.1798, + "step": 216 + }, + { + "epoch": 0.013184276079956255, + "grad_norm": 0.2969755530357361, + "learning_rate": 9.997475036269497e-05, + "loss": 1.1653, + "step": 217 + }, + { + "epoch": 0.013245033112582781, + "grad_norm": 0.318168580532074, + "learning_rate": 9.997444525988888e-05, + "loss": 1.2642, + "step": 218 + }, + { + "epoch": 0.013305790145209308, + "grad_norm": 0.22343435883522034, + "learning_rate": 9.997413832525984e-05, + "loss": 1.2654, + "step": 219 + }, + { + "epoch": 0.013366547177835834, + "grad_norm": 0.25345805287361145, + "learning_rate": 9.997382955881911e-05, + "loss": 1.2097, + "step": 220 + }, + { + "epoch": 0.01342730421046236, + "grad_norm": 0.24147434532642365, + "learning_rate": 9.997351896057799e-05, + "loss": 1.1422, + "step": 221 + }, + { + "epoch": 0.013488061243088888, + "grad_norm": 0.29657846689224243, + "learning_rate": 9.997320653054787e-05, + "loss": 1.1875, + "step": 222 + }, + { + "epoch": 0.013548818275715415, + "grad_norm": 0.22879545390605927, + "learning_rate": 9.99728922687402e-05, + "loss": 1.2242, + "step": 223 + }, + { + "epoch": 0.013609575308341941, + "grad_norm": 0.19556359946727753, + "learning_rate": 9.997257617516651e-05, + "loss": 1.189, + "step": 224 + }, + { + "epoch": 0.013670332340968467, + "grad_norm": 0.3813997507095337, + "learning_rate": 9.997225824983838e-05, + "loss": 1.2457, + "step": 225 + }, + { + "epoch": 0.013731089373594994, + "grad_norm": 0.17610788345336914, + "learning_rate": 9.997193849276746e-05, + "loss": 1.1453, + "step": 226 + }, + { + "epoch": 0.01379184640622152, + "grad_norm": 0.38481786847114563, + "learning_rate": 9.997161690396549e-05, + "loss": 1.2695, + "step": 227 + }, + { + "epoch": 0.013852603438848046, + "grad_norm": 0.19644692540168762, + "learning_rate": 9.997129348344423e-05, + "loss": 1.1992, + "step": 228 + }, + { + "epoch": 0.013913360471474573, + "grad_norm": 0.2646336853504181, + "learning_rate": 9.997096823121555e-05, + "loss": 1.2049, + "step": 229 + }, + { + "epoch": 0.013974117504101099, + "grad_norm": 0.22506456077098846, + "learning_rate": 9.997064114729138e-05, + "loss": 1.1622, + "step": 230 + }, + { + "epoch": 0.014034874536727627, + "grad_norm": 0.3764677345752716, + "learning_rate": 9.997031223168368e-05, + "loss": 1.2147, + "step": 231 + }, + { + "epoch": 0.014095631569354153, + "grad_norm": 0.3747536242008209, + "learning_rate": 9.996998148440454e-05, + "loss": 1.1736, + "step": 232 + }, + { + "epoch": 0.01415638860198068, + "grad_norm": 0.26718929409980774, + "learning_rate": 9.996964890546607e-05, + "loss": 1.1771, + "step": 233 + }, + { + "epoch": 0.014217145634607206, + "grad_norm": 0.6982794404029846, + "learning_rate": 9.996931449488045e-05, + "loss": 1.2514, + "step": 234 + }, + { + "epoch": 0.014277902667233732, + "grad_norm": 0.4008491635322571, + "learning_rate": 9.996897825265996e-05, + "loss": 1.2765, + "step": 235 + }, + { + "epoch": 0.014338659699860258, + "grad_norm": 0.21541957557201385, + "learning_rate": 9.99686401788169e-05, + "loss": 1.1965, + "step": 236 + }, + { + "epoch": 0.014399416732486785, + "grad_norm": 0.20242953300476074, + "learning_rate": 9.996830027336368e-05, + "loss": 1.2024, + "step": 237 + }, + { + "epoch": 0.014460173765113311, + "grad_norm": 0.41439831256866455, + "learning_rate": 9.996795853631278e-05, + "loss": 1.2789, + "step": 238 + }, + { + "epoch": 0.014520930797739839, + "grad_norm": 0.23059511184692383, + "learning_rate": 9.996761496767668e-05, + "loss": 1.1712, + "step": 239 + }, + { + "epoch": 0.014581687830366365, + "grad_norm": 0.19941109418869019, + "learning_rate": 9.9967269567468e-05, + "loss": 1.1966, + "step": 240 + }, + { + "epoch": 0.014642444862992892, + "grad_norm": 0.26879042387008667, + "learning_rate": 9.996692233569938e-05, + "loss": 1.2386, + "step": 241 + }, + { + "epoch": 0.014703201895619418, + "grad_norm": 0.19818615913391113, + "learning_rate": 9.996657327238357e-05, + "loss": 1.1954, + "step": 242 + }, + { + "epoch": 0.014763958928245944, + "grad_norm": 0.22947187721729279, + "learning_rate": 9.996622237753336e-05, + "loss": 1.1885, + "step": 243 + }, + { + "epoch": 0.01482471596087247, + "grad_norm": 0.458009272813797, + "learning_rate": 9.996586965116161e-05, + "loss": 1.3335, + "step": 244 + }, + { + "epoch": 0.014885472993498997, + "grad_norm": 0.30162313580513, + "learning_rate": 9.996551509328127e-05, + "loss": 1.23, + "step": 245 + }, + { + "epoch": 0.014946230026125523, + "grad_norm": 0.2162998914718628, + "learning_rate": 9.996515870390529e-05, + "loss": 1.1318, + "step": 246 + }, + { + "epoch": 0.015006987058752051, + "grad_norm": 0.2238757312297821, + "learning_rate": 9.996480048304677e-05, + "loss": 1.1285, + "step": 247 + }, + { + "epoch": 0.015067744091378578, + "grad_norm": 0.20800887048244476, + "learning_rate": 9.996444043071883e-05, + "loss": 1.1329, + "step": 248 + }, + { + "epoch": 0.015128501124005104, + "grad_norm": 0.18456752598285675, + "learning_rate": 9.996407854693467e-05, + "loss": 1.1904, + "step": 249 + }, + { + "epoch": 0.01518925815663163, + "grad_norm": 0.20270679891109467, + "learning_rate": 9.996371483170754e-05, + "loss": 1.1364, + "step": 250 + }, + { + "epoch": 0.015250015189258156, + "grad_norm": 0.3495061993598938, + "learning_rate": 9.99633492850508e-05, + "loss": 1.3883, + "step": 251 + }, + { + "epoch": 0.015310772221884683, + "grad_norm": 0.26616501808166504, + "learning_rate": 9.996298190697783e-05, + "loss": 1.2393, + "step": 252 + }, + { + "epoch": 0.015371529254511209, + "grad_norm": 0.20503906905651093, + "learning_rate": 9.996261269750211e-05, + "loss": 1.1654, + "step": 253 + }, + { + "epoch": 0.015432286287137735, + "grad_norm": 0.26730602979660034, + "learning_rate": 9.996224165663714e-05, + "loss": 1.2569, + "step": 254 + }, + { + "epoch": 0.015493043319764263, + "grad_norm": 0.47674667835235596, + "learning_rate": 9.996186878439657e-05, + "loss": 1.3208, + "step": 255 + }, + { + "epoch": 0.01555380035239079, + "grad_norm": 0.26248225569725037, + "learning_rate": 9.996149408079403e-05, + "loss": 1.2583, + "step": 256 + }, + { + "epoch": 0.015614557385017316, + "grad_norm": 0.4549104571342468, + "learning_rate": 9.996111754584327e-05, + "loss": 1.241, + "step": 257 + }, + { + "epoch": 0.015675314417643842, + "grad_norm": 0.21723133325576782, + "learning_rate": 9.996073917955808e-05, + "loss": 1.1343, + "step": 258 + }, + { + "epoch": 0.01573607145027037, + "grad_norm": 0.48765337467193604, + "learning_rate": 9.996035898195236e-05, + "loss": 1.1317, + "step": 259 + }, + { + "epoch": 0.015796828482896895, + "grad_norm": 0.26611945033073425, + "learning_rate": 9.995997695304e-05, + "loss": 1.1473, + "step": 260 + }, + { + "epoch": 0.015857585515523423, + "grad_norm": 0.4465886354446411, + "learning_rate": 9.995959309283505e-05, + "loss": 1.3175, + "step": 261 + }, + { + "epoch": 0.015918342548149948, + "grad_norm": 2.0013575553894043, + "learning_rate": 9.995920740135154e-05, + "loss": 1.1756, + "step": 262 + }, + { + "epoch": 0.015979099580776476, + "grad_norm": 0.2904813587665558, + "learning_rate": 9.995881987860363e-05, + "loss": 1.14, + "step": 263 + }, + { + "epoch": 0.016039856613403, + "grad_norm": 0.27571505308151245, + "learning_rate": 9.995843052460552e-05, + "loss": 1.2668, + "step": 264 + }, + { + "epoch": 0.016100613646029528, + "grad_norm": 10.14664077758789, + "learning_rate": 9.995803933937149e-05, + "loss": 1.233, + "step": 265 + }, + { + "epoch": 0.016161370678656053, + "grad_norm": 0.2483414113521576, + "learning_rate": 9.995764632291586e-05, + "loss": 1.1974, + "step": 266 + }, + { + "epoch": 0.01622212771128258, + "grad_norm": 0.2465965896844864, + "learning_rate": 9.995725147525306e-05, + "loss": 1.2868, + "step": 267 + }, + { + "epoch": 0.01628288474390911, + "grad_norm": 0.3032623827457428, + "learning_rate": 9.995685479639754e-05, + "loss": 1.1648, + "step": 268 + }, + { + "epoch": 0.016343641776535633, + "grad_norm": 0.17899607121944427, + "learning_rate": 9.995645628636385e-05, + "loss": 1.1919, + "step": 269 + }, + { + "epoch": 0.01640439880916216, + "grad_norm": 0.19266390800476074, + "learning_rate": 9.995605594516661e-05, + "loss": 1.1668, + "step": 270 + }, + { + "epoch": 0.016465155841788686, + "grad_norm": 0.3554879128932953, + "learning_rate": 9.995565377282048e-05, + "loss": 1.1938, + "step": 271 + }, + { + "epoch": 0.016525912874415214, + "grad_norm": 0.23894625902175903, + "learning_rate": 9.995524976934018e-05, + "loss": 1.1835, + "step": 272 + }, + { + "epoch": 0.01658666990704174, + "grad_norm": 0.17239512503147125, + "learning_rate": 9.995484393474057e-05, + "loss": 1.1438, + "step": 273 + }, + { + "epoch": 0.016647426939668267, + "grad_norm": 0.23845449090003967, + "learning_rate": 9.995443626903651e-05, + "loss": 1.2587, + "step": 274 + }, + { + "epoch": 0.016708183972294795, + "grad_norm": 0.31088948249816895, + "learning_rate": 9.995402677224291e-05, + "loss": 1.1911, + "step": 275 + }, + { + "epoch": 0.01676894100492132, + "grad_norm": 0.2854573428630829, + "learning_rate": 9.995361544437481e-05, + "loss": 1.2585, + "step": 276 + }, + { + "epoch": 0.016829698037547847, + "grad_norm": 0.5123035311698914, + "learning_rate": 9.995320228544727e-05, + "loss": 1.4816, + "step": 277 + }, + { + "epoch": 0.016890455070174372, + "grad_norm": 0.2312437742948532, + "learning_rate": 9.995278729547546e-05, + "loss": 1.1743, + "step": 278 + }, + { + "epoch": 0.0169512121028009, + "grad_norm": 0.6658782958984375, + "learning_rate": 9.995237047447458e-05, + "loss": 1.3736, + "step": 279 + }, + { + "epoch": 0.017011969135427425, + "grad_norm": 0.45647940039634705, + "learning_rate": 9.995195182245988e-05, + "loss": 1.2589, + "step": 280 + }, + { + "epoch": 0.017072726168053953, + "grad_norm": 0.37576019763946533, + "learning_rate": 9.995153133944676e-05, + "loss": 1.3273, + "step": 281 + }, + { + "epoch": 0.017133483200680477, + "grad_norm": 0.3769761323928833, + "learning_rate": 9.995110902545058e-05, + "loss": 1.2635, + "step": 282 + }, + { + "epoch": 0.017194240233307005, + "grad_norm": 0.7584442496299744, + "learning_rate": 9.995068488048685e-05, + "loss": 1.1655, + "step": 283 + }, + { + "epoch": 0.017254997265933533, + "grad_norm": 0.5158006548881531, + "learning_rate": 9.995025890457113e-05, + "loss": 1.1605, + "step": 284 + }, + { + "epoch": 0.017315754298560058, + "grad_norm": 0.3699159026145935, + "learning_rate": 9.994983109771898e-05, + "loss": 1.1622, + "step": 285 + }, + { + "epoch": 0.017376511331186586, + "grad_norm": 0.23761245608329773, + "learning_rate": 9.994940145994614e-05, + "loss": 1.1539, + "step": 286 + }, + { + "epoch": 0.01743726836381311, + "grad_norm": 0.47396376729011536, + "learning_rate": 9.994896999126832e-05, + "loss": 1.127, + "step": 287 + }, + { + "epoch": 0.01749802539643964, + "grad_norm": 0.20256276428699493, + "learning_rate": 9.994853669170136e-05, + "loss": 1.1673, + "step": 288 + }, + { + "epoch": 0.017558782429066163, + "grad_norm": 0.4062318205833435, + "learning_rate": 9.994810156126114e-05, + "loss": 1.1567, + "step": 289 + }, + { + "epoch": 0.01761953946169269, + "grad_norm": 0.41786348819732666, + "learning_rate": 9.994766459996359e-05, + "loss": 1.2449, + "step": 290 + }, + { + "epoch": 0.01768029649431922, + "grad_norm": 0.28585192561149597, + "learning_rate": 9.994722580782475e-05, + "loss": 1.168, + "step": 291 + }, + { + "epoch": 0.017741053526945744, + "grad_norm": 0.3699677586555481, + "learning_rate": 9.994678518486067e-05, + "loss": 1.1523, + "step": 292 + }, + { + "epoch": 0.01780181055957227, + "grad_norm": 0.300614595413208, + "learning_rate": 9.994634273108755e-05, + "loss": 1.1457, + "step": 293 + }, + { + "epoch": 0.017862567592198796, + "grad_norm": 0.33786723017692566, + "learning_rate": 9.994589844652157e-05, + "loss": 1.154, + "step": 294 + }, + { + "epoch": 0.017923324624825324, + "grad_norm": 0.43694934248924255, + "learning_rate": 9.994545233117903e-05, + "loss": 1.3118, + "step": 295 + }, + { + "epoch": 0.01798408165745185, + "grad_norm": 0.5097596645355225, + "learning_rate": 9.994500438507628e-05, + "loss": 1.3259, + "step": 296 + }, + { + "epoch": 0.018044838690078377, + "grad_norm": 0.3348499536514282, + "learning_rate": 9.994455460822975e-05, + "loss": 1.1655, + "step": 297 + }, + { + "epoch": 0.0181055957227049, + "grad_norm": 33.572723388671875, + "learning_rate": 9.994410300065591e-05, + "loss": 1.1179, + "step": 298 + }, + { + "epoch": 0.01816635275533143, + "grad_norm": 0.3271271288394928, + "learning_rate": 9.994364956237133e-05, + "loss": 1.3444, + "step": 299 + }, + { + "epoch": 0.018227109787957958, + "grad_norm": 0.27192381024360657, + "learning_rate": 9.994319429339259e-05, + "loss": 1.3966, + "step": 300 + }, + { + "epoch": 0.018287866820584482, + "grad_norm": 0.2065957635641098, + "learning_rate": 9.994273719373643e-05, + "loss": 1.1691, + "step": 301 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 0.20475585758686066, + "learning_rate": 9.99422782634196e-05, + "loss": 1.1543, + "step": 302 + }, + { + "epoch": 0.018409380885837535, + "grad_norm": 0.2624063193798065, + "learning_rate": 9.994181750245887e-05, + "loss": 1.199, + "step": 303 + }, + { + "epoch": 0.018470137918464063, + "grad_norm": 2.3660545349121094, + "learning_rate": 9.99413549108712e-05, + "loss": 1.1645, + "step": 304 + }, + { + "epoch": 0.018530894951090587, + "grad_norm": 0.3732457160949707, + "learning_rate": 9.99408904886735e-05, + "loss": 1.2981, + "step": 305 + }, + { + "epoch": 0.018591651983717115, + "grad_norm": 0.23340417444705963, + "learning_rate": 9.99404242358828e-05, + "loss": 1.1844, + "step": 306 + }, + { + "epoch": 0.018652409016343643, + "grad_norm": 0.8010504245758057, + "learning_rate": 9.993995615251622e-05, + "loss": 1.1564, + "step": 307 + }, + { + "epoch": 0.018713166048970168, + "grad_norm": 0.33755290508270264, + "learning_rate": 9.993948623859086e-05, + "loss": 1.2193, + "step": 308 + }, + { + "epoch": 0.018773923081596696, + "grad_norm": 0.7690300345420837, + "learning_rate": 9.993901449412399e-05, + "loss": 1.3164, + "step": 309 + }, + { + "epoch": 0.01883468011422322, + "grad_norm": 0.22133295238018036, + "learning_rate": 9.993854091913291e-05, + "loss": 1.1487, + "step": 310 + }, + { + "epoch": 0.01889543714684975, + "grad_norm": 0.42820850014686584, + "learning_rate": 9.993806551363494e-05, + "loss": 1.1758, + "step": 311 + }, + { + "epoch": 0.018956194179476273, + "grad_norm": 0.5701407194137573, + "learning_rate": 9.993758827764752e-05, + "loss": 1.41, + "step": 312 + }, + { + "epoch": 0.0190169512121028, + "grad_norm": 0.2954898178577423, + "learning_rate": 9.993710921118816e-05, + "loss": 1.3422, + "step": 313 + }, + { + "epoch": 0.019077708244729326, + "grad_norm": 0.34081703424453735, + "learning_rate": 9.993662831427442e-05, + "loss": 1.1599, + "step": 314 + }, + { + "epoch": 0.019138465277355854, + "grad_norm": 0.47216105461120605, + "learning_rate": 9.99361455869239e-05, + "loss": 1.291, + "step": 315 + }, + { + "epoch": 0.019199222309982382, + "grad_norm": 0.2706839144229889, + "learning_rate": 9.993566102915432e-05, + "loss": 1.1446, + "step": 316 + }, + { + "epoch": 0.019259979342608907, + "grad_norm": 0.5909138917922974, + "learning_rate": 9.993517464098344e-05, + "loss": 1.2405, + "step": 317 + }, + { + "epoch": 0.019320736375235435, + "grad_norm": 0.2069237381219864, + "learning_rate": 9.993468642242905e-05, + "loss": 1.1611, + "step": 318 + }, + { + "epoch": 0.01938149340786196, + "grad_norm": 0.3580915331840515, + "learning_rate": 9.99341963735091e-05, + "loss": 1.1563, + "step": 319 + }, + { + "epoch": 0.019442250440488487, + "grad_norm": 0.33375483751296997, + "learning_rate": 9.993370449424153e-05, + "loss": 1.2603, + "step": 320 + }, + { + "epoch": 0.01950300747311501, + "grad_norm": 0.4273125231266022, + "learning_rate": 9.993321078464437e-05, + "loss": 1.4076, + "step": 321 + }, + { + "epoch": 0.01956376450574154, + "grad_norm": 0.38784006237983704, + "learning_rate": 9.99327152447357e-05, + "loss": 1.1459, + "step": 322 + }, + { + "epoch": 0.019624521538368068, + "grad_norm": 0.34369736909866333, + "learning_rate": 9.993221787453373e-05, + "loss": 1.2008, + "step": 323 + }, + { + "epoch": 0.019685278570994592, + "grad_norm": 0.3320353627204895, + "learning_rate": 9.993171867405664e-05, + "loss": 1.1748, + "step": 324 + }, + { + "epoch": 0.01974603560362112, + "grad_norm": 0.31289252638816833, + "learning_rate": 9.993121764332276e-05, + "loss": 1.1769, + "step": 325 + }, + { + "epoch": 0.019806792636247645, + "grad_norm": 0.44145724177360535, + "learning_rate": 9.993071478235044e-05, + "loss": 1.2588, + "step": 326 + }, + { + "epoch": 0.019867549668874173, + "grad_norm": 0.7167911529541016, + "learning_rate": 9.993021009115812e-05, + "loss": 1.1599, + "step": 327 + }, + { + "epoch": 0.019928306701500698, + "grad_norm": 0.4565293490886688, + "learning_rate": 9.99297035697643e-05, + "loss": 1.1647, + "step": 328 + }, + { + "epoch": 0.019989063734127226, + "grad_norm": 0.2835087776184082, + "learning_rate": 9.992919521818756e-05, + "loss": 1.2806, + "step": 329 + }, + { + "epoch": 0.02004982076675375, + "grad_norm": 1.6273994445800781, + "learning_rate": 9.99286850364465e-05, + "loss": 1.1873, + "step": 330 + }, + { + "epoch": 0.020110577799380278, + "grad_norm": 0.3168855905532837, + "learning_rate": 9.992817302455983e-05, + "loss": 1.1684, + "step": 331 + }, + { + "epoch": 0.020171334832006806, + "grad_norm": 0.5479095578193665, + "learning_rate": 9.992765918254634e-05, + "loss": 1.1279, + "step": 332 + }, + { + "epoch": 0.02023209186463333, + "grad_norm": 0.5789514780044556, + "learning_rate": 9.992714351042485e-05, + "loss": 1.4497, + "step": 333 + }, + { + "epoch": 0.02029284889725986, + "grad_norm": 0.2367178201675415, + "learning_rate": 9.992662600821427e-05, + "loss": 1.2091, + "step": 334 + }, + { + "epoch": 0.020353605929886383, + "grad_norm": 0.3046312630176544, + "learning_rate": 9.992610667593355e-05, + "loss": 1.2742, + "step": 335 + }, + { + "epoch": 0.02041436296251291, + "grad_norm": 0.32870152592658997, + "learning_rate": 9.992558551360174e-05, + "loss": 1.2338, + "step": 336 + }, + { + "epoch": 0.020475119995139436, + "grad_norm": 0.21014073491096497, + "learning_rate": 9.992506252123796e-05, + "loss": 1.1617, + "step": 337 + }, + { + "epoch": 0.020535877027765964, + "grad_norm": 0.2694036662578583, + "learning_rate": 9.992453769886135e-05, + "loss": 1.1296, + "step": 338 + }, + { + "epoch": 0.02059663406039249, + "grad_norm": 0.21450552344322205, + "learning_rate": 9.992401104649116e-05, + "loss": 1.2217, + "step": 339 + }, + { + "epoch": 0.020657391093019017, + "grad_norm": 0.25186142325401306, + "learning_rate": 9.99234825641467e-05, + "loss": 1.3127, + "step": 340 + }, + { + "epoch": 0.020718148125645545, + "grad_norm": 0.3109813928604126, + "learning_rate": 9.992295225184733e-05, + "loss": 1.2242, + "step": 341 + }, + { + "epoch": 0.02077890515827207, + "grad_norm": 0.21320192515850067, + "learning_rate": 9.99224201096125e-05, + "loss": 1.262, + "step": 342 + }, + { + "epoch": 0.020839662190898597, + "grad_norm": 0.3940616250038147, + "learning_rate": 9.99218861374617e-05, + "loss": 1.2604, + "step": 343 + }, + { + "epoch": 0.020900419223525122, + "grad_norm": 0.4618244767189026, + "learning_rate": 9.992135033541454e-05, + "loss": 1.2799, + "step": 344 + }, + { + "epoch": 0.02096117625615165, + "grad_norm": 0.22235806286334991, + "learning_rate": 9.992081270349061e-05, + "loss": 1.144, + "step": 345 + }, + { + "epoch": 0.021021933288778175, + "grad_norm": 0.27402716875076294, + "learning_rate": 9.992027324170965e-05, + "loss": 1.2279, + "step": 346 + }, + { + "epoch": 0.021082690321404703, + "grad_norm": 0.24074974656105042, + "learning_rate": 9.991973195009143e-05, + "loss": 1.1355, + "step": 347 + }, + { + "epoch": 0.02114344735403123, + "grad_norm": 0.21464583277702332, + "learning_rate": 9.991918882865576e-05, + "loss": 1.2164, + "step": 348 + }, + { + "epoch": 0.021204204386657755, + "grad_norm": 0.2532672882080078, + "learning_rate": 9.99186438774226e-05, + "loss": 1.1653, + "step": 349 + }, + { + "epoch": 0.021264961419284283, + "grad_norm": 0.19243644177913666, + "learning_rate": 9.991809709641189e-05, + "loss": 1.1504, + "step": 350 + }, + { + "epoch": 0.021325718451910808, + "grad_norm": 0.17330235242843628, + "learning_rate": 9.99175484856437e-05, + "loss": 1.1787, + "step": 351 + }, + { + "epoch": 0.021386475484537336, + "grad_norm": 0.44046974182128906, + "learning_rate": 9.99169980451381e-05, + "loss": 1.2375, + "step": 352 + }, + { + "epoch": 0.02144723251716386, + "grad_norm": 0.27522218227386475, + "learning_rate": 9.99164457749153e-05, + "loss": 1.261, + "step": 353 + }, + { + "epoch": 0.02150798954979039, + "grad_norm": 0.24426224827766418, + "learning_rate": 9.991589167499553e-05, + "loss": 1.2295, + "step": 354 + }, + { + "epoch": 0.021568746582416913, + "grad_norm": 0.1877678632736206, + "learning_rate": 9.991533574539909e-05, + "loss": 1.1229, + "step": 355 + }, + { + "epoch": 0.02162950361504344, + "grad_norm": 0.18022331595420837, + "learning_rate": 9.991477798614638e-05, + "loss": 1.1579, + "step": 356 + }, + { + "epoch": 0.02169026064766997, + "grad_norm": 0.2182506024837494, + "learning_rate": 9.991421839725783e-05, + "loss": 1.2418, + "step": 357 + }, + { + "epoch": 0.021751017680296494, + "grad_norm": 0.20350974798202515, + "learning_rate": 9.991365697875397e-05, + "loss": 1.1571, + "step": 358 + }, + { + "epoch": 0.021811774712923022, + "grad_norm": 0.17868244647979736, + "learning_rate": 9.991309373065533e-05, + "loss": 1.1354, + "step": 359 + }, + { + "epoch": 0.021872531745549546, + "grad_norm": 0.23201099038124084, + "learning_rate": 9.991252865298262e-05, + "loss": 1.2203, + "step": 360 + }, + { + "epoch": 0.021933288778176074, + "grad_norm": 0.9139298796653748, + "learning_rate": 9.991196174575653e-05, + "loss": 1.3402, + "step": 361 + }, + { + "epoch": 0.0219940458108026, + "grad_norm": 0.3012075424194336, + "learning_rate": 9.991139300899784e-05, + "loss": 1.4223, + "step": 362 + }, + { + "epoch": 0.022054802843429127, + "grad_norm": 0.43885961174964905, + "learning_rate": 9.991082244272736e-05, + "loss": 1.2401, + "step": 363 + }, + { + "epoch": 0.022115559876055655, + "grad_norm": 0.2859572768211365, + "learning_rate": 9.991025004696605e-05, + "loss": 1.1391, + "step": 364 + }, + { + "epoch": 0.02217631690868218, + "grad_norm": 0.3498673737049103, + "learning_rate": 9.990967582173489e-05, + "loss": 1.2427, + "step": 365 + }, + { + "epoch": 0.022237073941308708, + "grad_norm": 0.3147677183151245, + "learning_rate": 9.99090997670549e-05, + "loss": 1.1636, + "step": 366 + }, + { + "epoch": 0.022297830973935232, + "grad_norm": 0.19915129244327545, + "learning_rate": 9.990852188294721e-05, + "loss": 1.1531, + "step": 367 + }, + { + "epoch": 0.02235858800656176, + "grad_norm": 0.22790472209453583, + "learning_rate": 9.990794216943302e-05, + "loss": 1.144, + "step": 368 + }, + { + "epoch": 0.022419345039188285, + "grad_norm": 2.5829055309295654, + "learning_rate": 9.990736062653354e-05, + "loss": 1.1795, + "step": 369 + }, + { + "epoch": 0.022480102071814813, + "grad_norm": 0.263721764087677, + "learning_rate": 9.990677725427013e-05, + "loss": 1.2126, + "step": 370 + }, + { + "epoch": 0.022540859104441337, + "grad_norm": 0.8752437829971313, + "learning_rate": 9.990619205266414e-05, + "loss": 1.1374, + "step": 371 + }, + { + "epoch": 0.022601616137067865, + "grad_norm": 0.25581830739974976, + "learning_rate": 9.990560502173704e-05, + "loss": 1.2511, + "step": 372 + }, + { + "epoch": 0.022662373169694393, + "grad_norm": 0.2347150593996048, + "learning_rate": 9.990501616151034e-05, + "loss": 1.1987, + "step": 373 + }, + { + "epoch": 0.022723130202320918, + "grad_norm": 0.25042441487312317, + "learning_rate": 9.990442547200562e-05, + "loss": 1.1607, + "step": 374 + }, + { + "epoch": 0.022783887234947446, + "grad_norm": 0.2432336062192917, + "learning_rate": 9.990383295324454e-05, + "loss": 1.4697, + "step": 375 + }, + { + "epoch": 0.02284464426757397, + "grad_norm": 0.20277556777000427, + "learning_rate": 9.990323860524882e-05, + "loss": 1.1355, + "step": 376 + }, + { + "epoch": 0.0229054013002005, + "grad_norm": 0.2945151925086975, + "learning_rate": 9.990264242804026e-05, + "loss": 1.3467, + "step": 377 + }, + { + "epoch": 0.022966158332827023, + "grad_norm": 0.2620909810066223, + "learning_rate": 9.990204442164066e-05, + "loss": 1.2339, + "step": 378 + }, + { + "epoch": 0.02302691536545355, + "grad_norm": 0.3506776988506317, + "learning_rate": 9.9901444586072e-05, + "loss": 1.2258, + "step": 379 + }, + { + "epoch": 0.02308767239808008, + "grad_norm": 0.3582307994365692, + "learning_rate": 9.990084292135624e-05, + "loss": 1.1352, + "step": 380 + }, + { + "epoch": 0.023148429430706604, + "grad_norm": 14.608827590942383, + "learning_rate": 9.990023942751543e-05, + "loss": 1.3026, + "step": 381 + }, + { + "epoch": 0.023209186463333132, + "grad_norm": 0.32785990834236145, + "learning_rate": 9.989963410457171e-05, + "loss": 1.2796, + "step": 382 + }, + { + "epoch": 0.023269943495959657, + "grad_norm": 0.24141253530979156, + "learning_rate": 9.989902695254725e-05, + "loss": 1.2581, + "step": 383 + }, + { + "epoch": 0.023330700528586185, + "grad_norm": 0.32767367362976074, + "learning_rate": 9.989841797146431e-05, + "loss": 1.211, + "step": 384 + }, + { + "epoch": 0.02339145756121271, + "grad_norm": 0.28206735849380493, + "learning_rate": 9.989780716134522e-05, + "loss": 1.2436, + "step": 385 + }, + { + "epoch": 0.023452214593839237, + "grad_norm": 0.2510630190372467, + "learning_rate": 9.989719452221235e-05, + "loss": 1.235, + "step": 386 + }, + { + "epoch": 0.023512971626465762, + "grad_norm": 0.36102914810180664, + "learning_rate": 9.989658005408818e-05, + "loss": 1.2017, + "step": 387 + }, + { + "epoch": 0.02357372865909229, + "grad_norm": 0.3127259612083435, + "learning_rate": 9.989596375699522e-05, + "loss": 1.1618, + "step": 388 + }, + { + "epoch": 0.023634485691718818, + "grad_norm": 2.716749429702759, + "learning_rate": 9.989534563095607e-05, + "loss": 1.362, + "step": 389 + }, + { + "epoch": 0.023695242724345342, + "grad_norm": 0.2603667974472046, + "learning_rate": 9.989472567599338e-05, + "loss": 1.2619, + "step": 390 + }, + { + "epoch": 0.02375599975697187, + "grad_norm": 0.37386372685432434, + "learning_rate": 9.989410389212987e-05, + "loss": 1.2361, + "step": 391 + }, + { + "epoch": 0.023816756789598395, + "grad_norm": 0.3464028239250183, + "learning_rate": 9.989348027938834e-05, + "loss": 1.146, + "step": 392 + }, + { + "epoch": 0.023877513822224923, + "grad_norm": 0.40520069003105164, + "learning_rate": 9.989285483779164e-05, + "loss": 1.3619, + "step": 393 + }, + { + "epoch": 0.023938270854851448, + "grad_norm": 0.18270514905452728, + "learning_rate": 9.989222756736271e-05, + "loss": 1.1226, + "step": 394 + }, + { + "epoch": 0.023999027887477976, + "grad_norm": 0.278956800699234, + "learning_rate": 9.989159846812454e-05, + "loss": 1.2004, + "step": 395 + }, + { + "epoch": 0.024059784920104504, + "grad_norm": 0.38345643877983093, + "learning_rate": 9.989096754010018e-05, + "loss": 1.1555, + "step": 396 + }, + { + "epoch": 0.02412054195273103, + "grad_norm": 0.297656774520874, + "learning_rate": 9.989033478331276e-05, + "loss": 1.2252, + "step": 397 + }, + { + "epoch": 0.024181298985357556, + "grad_norm": 23.014734268188477, + "learning_rate": 9.988970019778547e-05, + "loss": 1.217, + "step": 398 + }, + { + "epoch": 0.02424205601798408, + "grad_norm": 0.3520590364933014, + "learning_rate": 9.988906378354158e-05, + "loss": 1.1651, + "step": 399 + }, + { + "epoch": 0.02430281305061061, + "grad_norm": 0.8311512470245361, + "learning_rate": 9.988842554060441e-05, + "loss": 1.1466, + "step": 400 + }, + { + "epoch": 0.024363570083237134, + "grad_norm": 0.7144541144371033, + "learning_rate": 9.988778546899738e-05, + "loss": 1.125, + "step": 401 + }, + { + "epoch": 0.02442432711586366, + "grad_norm": 0.40504345297813416, + "learning_rate": 9.98871435687439e-05, + "loss": 1.1774, + "step": 402 + }, + { + "epoch": 0.024485084148490186, + "grad_norm": 0.2774827778339386, + "learning_rate": 9.988649983986755e-05, + "loss": 1.1722, + "step": 403 + }, + { + "epoch": 0.024545841181116714, + "grad_norm": 0.3524899184703827, + "learning_rate": 9.98858542823919e-05, + "loss": 1.2614, + "step": 404 + }, + { + "epoch": 0.024606598213743242, + "grad_norm": 0.2749498188495636, + "learning_rate": 9.98852068963406e-05, + "loss": 1.1467, + "step": 405 + }, + { + "epoch": 0.024667355246369767, + "grad_norm": 0.2550564110279083, + "learning_rate": 9.988455768173741e-05, + "loss": 1.2173, + "step": 406 + }, + { + "epoch": 0.024728112278996295, + "grad_norm": 0.32169127464294434, + "learning_rate": 9.988390663860613e-05, + "loss": 1.256, + "step": 407 + }, + { + "epoch": 0.02478886931162282, + "grad_norm": 0.26087912917137146, + "learning_rate": 9.988325376697059e-05, + "loss": 1.1355, + "step": 408 + }, + { + "epoch": 0.024849626344249347, + "grad_norm": 0.3460897207260132, + "learning_rate": 9.988259906685475e-05, + "loss": 1.147, + "step": 409 + }, + { + "epoch": 0.024910383376875872, + "grad_norm": 0.23557282984256744, + "learning_rate": 9.98819425382826e-05, + "loss": 1.2472, + "step": 410 + }, + { + "epoch": 0.0249711404095024, + "grad_norm": 0.3029102683067322, + "learning_rate": 9.988128418127822e-05, + "loss": 1.1614, + "step": 411 + }, + { + "epoch": 0.025031897442128928, + "grad_norm": 0.401030957698822, + "learning_rate": 9.98806239958657e-05, + "loss": 1.2113, + "step": 412 + }, + { + "epoch": 0.025092654474755453, + "grad_norm": 0.21606552600860596, + "learning_rate": 9.987996198206928e-05, + "loss": 1.0921, + "step": 413 + }, + { + "epoch": 0.02515341150738198, + "grad_norm": 0.30670490860939026, + "learning_rate": 9.987929813991321e-05, + "loss": 1.2477, + "step": 414 + }, + { + "epoch": 0.025214168540008505, + "grad_norm": 0.2819117605686188, + "learning_rate": 9.987863246942181e-05, + "loss": 1.1571, + "step": 415 + }, + { + "epoch": 0.025274925572635033, + "grad_norm": 0.44100797176361084, + "learning_rate": 9.987796497061952e-05, + "loss": 1.2278, + "step": 416 + }, + { + "epoch": 0.025335682605261558, + "grad_norm": 0.3788920044898987, + "learning_rate": 9.987729564353078e-05, + "loss": 1.2841, + "step": 417 + }, + { + "epoch": 0.025396439637888086, + "grad_norm": 0.6525524258613586, + "learning_rate": 9.987662448818011e-05, + "loss": 1.1318, + "step": 418 + }, + { + "epoch": 0.02545719667051461, + "grad_norm": 0.5220158696174622, + "learning_rate": 9.987595150459215e-05, + "loss": 1.2277, + "step": 419 + }, + { + "epoch": 0.02551795370314114, + "grad_norm": 0.43092164397239685, + "learning_rate": 9.987527669279152e-05, + "loss": 1.16, + "step": 420 + }, + { + "epoch": 0.025578710735767667, + "grad_norm": 0.5233555436134338, + "learning_rate": 9.9874600052803e-05, + "loss": 1.2413, + "step": 421 + }, + { + "epoch": 0.02563946776839419, + "grad_norm": 0.20341508090496063, + "learning_rate": 9.987392158465138e-05, + "loss": 1.1414, + "step": 422 + }, + { + "epoch": 0.02570022480102072, + "grad_norm": 0.5091098546981812, + "learning_rate": 9.987324128836151e-05, + "loss": 1.1614, + "step": 423 + }, + { + "epoch": 0.025760981833647244, + "grad_norm": 0.2747238576412201, + "learning_rate": 9.987255916395835e-05, + "loss": 1.2224, + "step": 424 + }, + { + "epoch": 0.025821738866273772, + "grad_norm": 0.4567953646183014, + "learning_rate": 9.987187521146688e-05, + "loss": 1.224, + "step": 425 + }, + { + "epoch": 0.025882495898900296, + "grad_norm": 0.37994566559791565, + "learning_rate": 9.98711894309122e-05, + "loss": 1.1301, + "step": 426 + }, + { + "epoch": 0.025943252931526824, + "grad_norm": 0.22085264325141907, + "learning_rate": 9.987050182231943e-05, + "loss": 1.1586, + "step": 427 + }, + { + "epoch": 0.026004009964153352, + "grad_norm": 0.41410860419273376, + "learning_rate": 9.986981238571376e-05, + "loss": 1.2118, + "step": 428 + }, + { + "epoch": 0.026064766996779877, + "grad_norm": 2.435889959335327, + "learning_rate": 9.986912112112049e-05, + "loss": 1.1195, + "step": 429 + }, + { + "epoch": 0.026125524029406405, + "grad_norm": 0.35806381702423096, + "learning_rate": 9.986842802856494e-05, + "loss": 1.1783, + "step": 430 + }, + { + "epoch": 0.02618628106203293, + "grad_norm": 0.2825682461261749, + "learning_rate": 9.986773310807254e-05, + "loss": 1.2659, + "step": 431 + }, + { + "epoch": 0.026247038094659458, + "grad_norm": 0.17404058575630188, + "learning_rate": 9.986703635966872e-05, + "loss": 1.1721, + "step": 432 + }, + { + "epoch": 0.026307795127285982, + "grad_norm": 0.27772650122642517, + "learning_rate": 9.986633778337905e-05, + "loss": 1.2585, + "step": 433 + }, + { + "epoch": 0.02636855215991251, + "grad_norm": 0.17797604203224182, + "learning_rate": 9.986563737922914e-05, + "loss": 1.138, + "step": 434 + }, + { + "epoch": 0.026429309192539035, + "grad_norm": 0.319357305765152, + "learning_rate": 9.986493514724464e-05, + "loss": 1.2481, + "step": 435 + }, + { + "epoch": 0.026490066225165563, + "grad_norm": 0.24233309924602509, + "learning_rate": 9.98642310874513e-05, + "loss": 1.2086, + "step": 436 + }, + { + "epoch": 0.02655082325779209, + "grad_norm": 44.90375900268555, + "learning_rate": 9.986352519987494e-05, + "loss": 1.2702, + "step": 437 + }, + { + "epoch": 0.026611580290418616, + "grad_norm": 0.24607115983963013, + "learning_rate": 9.986281748454143e-05, + "loss": 1.1617, + "step": 438 + }, + { + "epoch": 0.026672337323045144, + "grad_norm": 0.21750274300575256, + "learning_rate": 9.986210794147671e-05, + "loss": 1.1653, + "step": 439 + }, + { + "epoch": 0.026733094355671668, + "grad_norm": 0.23555466532707214, + "learning_rate": 9.986139657070677e-05, + "loss": 1.1698, + "step": 440 + }, + { + "epoch": 0.026793851388298196, + "grad_norm": 0.1842479705810547, + "learning_rate": 9.98606833722577e-05, + "loss": 1.1573, + "step": 441 + }, + { + "epoch": 0.02685460842092472, + "grad_norm": 0.21280412375926971, + "learning_rate": 9.985996834615566e-05, + "loss": 1.1245, + "step": 442 + }, + { + "epoch": 0.02691536545355125, + "grad_norm": 0.24020196497440338, + "learning_rate": 9.985925149242683e-05, + "loss": 1.2301, + "step": 443 + }, + { + "epoch": 0.026976122486177777, + "grad_norm": 0.4025554656982422, + "learning_rate": 9.98585328110975e-05, + "loss": 1.3422, + "step": 444 + }, + { + "epoch": 0.0270368795188043, + "grad_norm": 0.20919737219810486, + "learning_rate": 9.985781230219402e-05, + "loss": 1.222, + "step": 445 + }, + { + "epoch": 0.02709763655143083, + "grad_norm": 0.2558465301990509, + "learning_rate": 9.985708996574278e-05, + "loss": 1.2962, + "step": 446 + }, + { + "epoch": 0.027158393584057354, + "grad_norm": 0.30489349365234375, + "learning_rate": 9.985636580177026e-05, + "loss": 1.4212, + "step": 447 + }, + { + "epoch": 0.027219150616683882, + "grad_norm": 0.17291104793548584, + "learning_rate": 9.985563981030304e-05, + "loss": 1.1398, + "step": 448 + }, + { + "epoch": 0.027279907649310407, + "grad_norm": 0.2797357141971588, + "learning_rate": 9.98549119913677e-05, + "loss": 1.194, + "step": 449 + }, + { + "epoch": 0.027340664681936935, + "grad_norm": 0.20370079576969147, + "learning_rate": 9.985418234499094e-05, + "loss": 1.156, + "step": 450 + }, + { + "epoch": 0.02740142171456346, + "grad_norm": 0.19331230223178864, + "learning_rate": 9.985345087119946e-05, + "loss": 1.1384, + "step": 451 + }, + { + "epoch": 0.027462178747189987, + "grad_norm": 0.3319239616394043, + "learning_rate": 9.98527175700201e-05, + "loss": 1.266, + "step": 452 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 0.30400246381759644, + "learning_rate": 9.985198244147978e-05, + "loss": 1.1891, + "step": 453 + }, + { + "epoch": 0.02758369281244304, + "grad_norm": 0.21529674530029297, + "learning_rate": 9.985124548560536e-05, + "loss": 1.1589, + "step": 454 + }, + { + "epoch": 0.027644449845069568, + "grad_norm": 0.605178952217102, + "learning_rate": 9.985050670242393e-05, + "loss": 1.2537, + "step": 455 + }, + { + "epoch": 0.027705206877696092, + "grad_norm": 0.27420905232429504, + "learning_rate": 9.984976609196253e-05, + "loss": 1.234, + "step": 456 + }, + { + "epoch": 0.02776596391032262, + "grad_norm": 0.19000773131847382, + "learning_rate": 9.984902365424833e-05, + "loss": 1.1405, + "step": 457 + }, + { + "epoch": 0.027826720942949145, + "grad_norm": 0.19945290684700012, + "learning_rate": 9.984827938930852e-05, + "loss": 1.2119, + "step": 458 + }, + { + "epoch": 0.027887477975575673, + "grad_norm": 0.22797727584838867, + "learning_rate": 9.98475332971704e-05, + "loss": 1.2138, + "step": 459 + }, + { + "epoch": 0.027948235008202198, + "grad_norm": 0.26123619079589844, + "learning_rate": 9.98467853778613e-05, + "loss": 1.1181, + "step": 460 + }, + { + "epoch": 0.028008992040828726, + "grad_norm": 0.19567619264125824, + "learning_rate": 9.984603563140866e-05, + "loss": 1.1587, + "step": 461 + }, + { + "epoch": 0.028069749073455254, + "grad_norm": 0.3084138333797455, + "learning_rate": 9.984528405783994e-05, + "loss": 1.2768, + "step": 462 + }, + { + "epoch": 0.02813050610608178, + "grad_norm": 0.24002857506275177, + "learning_rate": 9.984453065718271e-05, + "loss": 1.1468, + "step": 463 + }, + { + "epoch": 0.028191263138708306, + "grad_norm": 0.2294125258922577, + "learning_rate": 9.984377542946458e-05, + "loss": 1.2839, + "step": 464 + }, + { + "epoch": 0.02825202017133483, + "grad_norm": 0.26755377650260925, + "learning_rate": 9.98430183747132e-05, + "loss": 1.1732, + "step": 465 + }, + { + "epoch": 0.02831277720396136, + "grad_norm": 0.22083140909671783, + "learning_rate": 9.984225949295636e-05, + "loss": 1.1504, + "step": 466 + }, + { + "epoch": 0.028373534236587884, + "grad_norm": 0.15084177255630493, + "learning_rate": 9.984149878422186e-05, + "loss": 1.1279, + "step": 467 + }, + { + "epoch": 0.02843429126921441, + "grad_norm": 0.32440218329429626, + "learning_rate": 9.98407362485376e-05, + "loss": 1.1775, + "step": 468 + }, + { + "epoch": 0.02849504830184094, + "grad_norm": 0.22640341520309448, + "learning_rate": 9.98399718859315e-05, + "loss": 1.2064, + "step": 469 + }, + { + "epoch": 0.028555805334467464, + "grad_norm": 0.2065805196762085, + "learning_rate": 9.983920569643161e-05, + "loss": 1.1692, + "step": 470 + }, + { + "epoch": 0.028616562367093992, + "grad_norm": 0.5118774175643921, + "learning_rate": 9.9838437680066e-05, + "loss": 1.2046, + "step": 471 + }, + { + "epoch": 0.028677319399720517, + "grad_norm": 0.3265882432460785, + "learning_rate": 9.983766783686284e-05, + "loss": 1.2811, + "step": 472 + }, + { + "epoch": 0.028738076432347045, + "grad_norm": 0.377768337726593, + "learning_rate": 9.983689616685031e-05, + "loss": 1.3418, + "step": 473 + }, + { + "epoch": 0.02879883346497357, + "grad_norm": 0.3566953241825104, + "learning_rate": 9.983612267005671e-05, + "loss": 1.2245, + "step": 474 + }, + { + "epoch": 0.028859590497600097, + "grad_norm": 0.3818965554237366, + "learning_rate": 9.983534734651042e-05, + "loss": 1.1651, + "step": 475 + }, + { + "epoch": 0.028920347530226622, + "grad_norm": 0.3248099684715271, + "learning_rate": 9.983457019623983e-05, + "loss": 1.1557, + "step": 476 + }, + { + "epoch": 0.02898110456285315, + "grad_norm": 0.22260965406894684, + "learning_rate": 9.983379121927345e-05, + "loss": 1.1751, + "step": 477 + }, + { + "epoch": 0.029041861595479678, + "grad_norm": 0.29320624470710754, + "learning_rate": 9.98330104156398e-05, + "loss": 1.2332, + "step": 478 + }, + { + "epoch": 0.029102618628106203, + "grad_norm": 0.2943146526813507, + "learning_rate": 9.983222778536754e-05, + "loss": 1.1922, + "step": 479 + }, + { + "epoch": 0.02916337566073273, + "grad_norm": 0.2977176010608673, + "learning_rate": 9.983144332848533e-05, + "loss": 1.265, + "step": 480 + }, + { + "epoch": 0.029224132693359255, + "grad_norm": 2.0611252784729004, + "learning_rate": 9.983065704502193e-05, + "loss": 1.2488, + "step": 481 + }, + { + "epoch": 0.029284889725985783, + "grad_norm": 0.325751394033432, + "learning_rate": 9.982986893500617e-05, + "loss": 1.1734, + "step": 482 + }, + { + "epoch": 0.029345646758612308, + "grad_norm": 0.20634117722511292, + "learning_rate": 9.982907899846692e-05, + "loss": 1.1138, + "step": 483 + }, + { + "epoch": 0.029406403791238836, + "grad_norm": 0.27968963980674744, + "learning_rate": 9.982828723543316e-05, + "loss": 1.34, + "step": 484 + }, + { + "epoch": 0.029467160823865364, + "grad_norm": 0.23482371866703033, + "learning_rate": 9.98274936459339e-05, + "loss": 1.2572, + "step": 485 + }, + { + "epoch": 0.02952791785649189, + "grad_norm": 0.2467680126428604, + "learning_rate": 9.982669822999822e-05, + "loss": 1.144, + "step": 486 + }, + { + "epoch": 0.029588674889118417, + "grad_norm": 0.2581479549407959, + "learning_rate": 9.982590098765528e-05, + "loss": 1.3201, + "step": 487 + }, + { + "epoch": 0.02964943192174494, + "grad_norm": 0.528778612613678, + "learning_rate": 9.982510191893433e-05, + "loss": 1.201, + "step": 488 + }, + { + "epoch": 0.02971018895437147, + "grad_norm": 0.194417804479599, + "learning_rate": 9.982430102386462e-05, + "loss": 1.1689, + "step": 489 + }, + { + "epoch": 0.029770945986997994, + "grad_norm": 0.7083895802497864, + "learning_rate": 9.982349830247553e-05, + "loss": 1.2267, + "step": 490 + }, + { + "epoch": 0.029831703019624522, + "grad_norm": 0.2740737199783325, + "learning_rate": 9.982269375479648e-05, + "loss": 1.1455, + "step": 491 + }, + { + "epoch": 0.029892460052251046, + "grad_norm": 0.3193904757499695, + "learning_rate": 9.982188738085697e-05, + "loss": 1.274, + "step": 492 + }, + { + "epoch": 0.029953217084877574, + "grad_norm": 0.24167534708976746, + "learning_rate": 9.982107918068652e-05, + "loss": 1.2912, + "step": 493 + }, + { + "epoch": 0.030013974117504102, + "grad_norm": 0.3258008062839508, + "learning_rate": 9.98202691543148e-05, + "loss": 1.1588, + "step": 494 + }, + { + "epoch": 0.030074731150130627, + "grad_norm": 0.2044263482093811, + "learning_rate": 9.98194573017715e-05, + "loss": 1.1273, + "step": 495 + }, + { + "epoch": 0.030135488182757155, + "grad_norm": 0.17838792502880096, + "learning_rate": 9.981864362308634e-05, + "loss": 1.1111, + "step": 496 + }, + { + "epoch": 0.03019624521538368, + "grad_norm": 0.4175994396209717, + "learning_rate": 9.981782811828918e-05, + "loss": 1.1661, + "step": 497 + }, + { + "epoch": 0.030257002248010208, + "grad_norm": 0.4278128445148468, + "learning_rate": 9.981701078740988e-05, + "loss": 1.1671, + "step": 498 + }, + { + "epoch": 0.030317759280636732, + "grad_norm": 0.3493889272212982, + "learning_rate": 9.981619163047843e-05, + "loss": 1.2339, + "step": 499 + }, + { + "epoch": 0.03037851631326326, + "grad_norm": 0.37411755323410034, + "learning_rate": 9.981537064752486e-05, + "loss": 1.3125, + "step": 500 + }, + { + "epoch": 0.03043927334588979, + "grad_norm": 0.2385791838169098, + "learning_rate": 9.981454783857925e-05, + "loss": 1.2324, + "step": 501 + }, + { + "epoch": 0.030500030378516313, + "grad_norm": 0.20261655747890472, + "learning_rate": 9.981372320367175e-05, + "loss": 1.1406, + "step": 502 + }, + { + "epoch": 0.03056078741114284, + "grad_norm": 0.2751544713973999, + "learning_rate": 9.98128967428326e-05, + "loss": 1.1737, + "step": 503 + }, + { + "epoch": 0.030621544443769366, + "grad_norm": 0.4848704934120178, + "learning_rate": 9.981206845609209e-05, + "loss": 1.3857, + "step": 504 + }, + { + "epoch": 0.030682301476395894, + "grad_norm": 0.48026302456855774, + "learning_rate": 9.981123834348058e-05, + "loss": 1.2668, + "step": 505 + }, + { + "epoch": 0.030743058509022418, + "grad_norm": 0.33266085386276245, + "learning_rate": 9.98104064050285e-05, + "loss": 1.1248, + "step": 506 + }, + { + "epoch": 0.030803815541648946, + "grad_norm": 0.25252965092658997, + "learning_rate": 9.980957264076635e-05, + "loss": 1.2272, + "step": 507 + }, + { + "epoch": 0.03086457257427547, + "grad_norm": 0.3843928873538971, + "learning_rate": 9.980873705072468e-05, + "loss": 1.2356, + "step": 508 + }, + { + "epoch": 0.030925329606902, + "grad_norm": 0.33188146352767944, + "learning_rate": 9.980789963493415e-05, + "loss": 1.2464, + "step": 509 + }, + { + "epoch": 0.030986086639528527, + "grad_norm": 0.20660185813903809, + "learning_rate": 9.98070603934254e-05, + "loss": 1.1742, + "step": 510 + }, + { + "epoch": 0.03104684367215505, + "grad_norm": 0.23412880301475525, + "learning_rate": 9.980621932622924e-05, + "loss": 1.1094, + "step": 511 + }, + { + "epoch": 0.03110760070478158, + "grad_norm": 0.27089810371398926, + "learning_rate": 9.980537643337648e-05, + "loss": 1.1472, + "step": 512 + }, + { + "epoch": 0.031168357737408104, + "grad_norm": 0.6027984023094177, + "learning_rate": 9.980453171489802e-05, + "loss": 1.1834, + "step": 513 + }, + { + "epoch": 0.031229114770034632, + "grad_norm": 0.22745102643966675, + "learning_rate": 9.980368517082481e-05, + "loss": 1.1557, + "step": 514 + }, + { + "epoch": 0.03128987180266116, + "grad_norm": 0.31312257051467896, + "learning_rate": 9.980283680118792e-05, + "loss": 1.388, + "step": 515 + }, + { + "epoch": 0.031350628835287685, + "grad_norm": 2.9024300575256348, + "learning_rate": 9.98019866060184e-05, + "loss": 1.2529, + "step": 516 + }, + { + "epoch": 0.03141138586791421, + "grad_norm": 0.2904795706272125, + "learning_rate": 9.980113458534744e-05, + "loss": 1.1286, + "step": 517 + }, + { + "epoch": 0.03147214290054074, + "grad_norm": 18.508773803710938, + "learning_rate": 9.980028073920627e-05, + "loss": 1.1558, + "step": 518 + }, + { + "epoch": 0.03153289993316726, + "grad_norm": 0.5735429525375366, + "learning_rate": 9.979942506762617e-05, + "loss": 1.3104, + "step": 519 + }, + { + "epoch": 0.03159365696579379, + "grad_norm": 0.885894775390625, + "learning_rate": 9.979856757063853e-05, + "loss": 1.1641, + "step": 520 + }, + { + "epoch": 0.03165441399842032, + "grad_norm": 0.30875757336616516, + "learning_rate": 9.979770824827477e-05, + "loss": 1.1308, + "step": 521 + }, + { + "epoch": 0.031715171031046846, + "grad_norm": 0.39021605253219604, + "learning_rate": 9.979684710056639e-05, + "loss": 1.1799, + "step": 522 + }, + { + "epoch": 0.03177592806367337, + "grad_norm": 0.23148266971111298, + "learning_rate": 9.979598412754493e-05, + "loss": 1.1633, + "step": 523 + }, + { + "epoch": 0.031836685096299895, + "grad_norm": 0.2037530392408371, + "learning_rate": 9.979511932924209e-05, + "loss": 1.2202, + "step": 524 + }, + { + "epoch": 0.03189744212892642, + "grad_norm": 0.23318496346473694, + "learning_rate": 9.979425270568949e-05, + "loss": 1.1552, + "step": 525 + }, + { + "epoch": 0.03195819916155295, + "grad_norm": 0.22681386768817902, + "learning_rate": 9.979338425691894e-05, + "loss": 1.2651, + "step": 526 + }, + { + "epoch": 0.03201895619417948, + "grad_norm": 0.2016787976026535, + "learning_rate": 9.979251398296228e-05, + "loss": 1.2703, + "step": 527 + }, + { + "epoch": 0.032079713226806, + "grad_norm": 0.3406883478164673, + "learning_rate": 9.979164188385138e-05, + "loss": 1.1342, + "step": 528 + }, + { + "epoch": 0.03214047025943253, + "grad_norm": 0.19631391763687134, + "learning_rate": 9.979076795961821e-05, + "loss": 1.1635, + "step": 529 + }, + { + "epoch": 0.032201227292059056, + "grad_norm": 0.6007776856422424, + "learning_rate": 9.978989221029484e-05, + "loss": 1.1395, + "step": 530 + }, + { + "epoch": 0.032261984324685584, + "grad_norm": 0.17776396870613098, + "learning_rate": 9.978901463591335e-05, + "loss": 1.1325, + "step": 531 + }, + { + "epoch": 0.032322741357312106, + "grad_norm": 0.21798866987228394, + "learning_rate": 9.978813523650589e-05, + "loss": 1.1477, + "step": 532 + }, + { + "epoch": 0.032383498389938634, + "grad_norm": 0.28931427001953125, + "learning_rate": 9.978725401210471e-05, + "loss": 1.171, + "step": 533 + }, + { + "epoch": 0.03244425542256516, + "grad_norm": 0.28167280554771423, + "learning_rate": 9.97863709627421e-05, + "loss": 1.2423, + "step": 534 + }, + { + "epoch": 0.03250501245519169, + "grad_norm": 0.27362319827079773, + "learning_rate": 9.978548608845046e-05, + "loss": 1.1502, + "step": 535 + }, + { + "epoch": 0.03256576948781822, + "grad_norm": 0.2425500899553299, + "learning_rate": 9.97845993892622e-05, + "loss": 1.1928, + "step": 536 + }, + { + "epoch": 0.03262652652044474, + "grad_norm": 0.3945496380329132, + "learning_rate": 9.978371086520983e-05, + "loss": 1.1624, + "step": 537 + }, + { + "epoch": 0.03268728355307127, + "grad_norm": 0.3749012351036072, + "learning_rate": 9.97828205163259e-05, + "loss": 1.1653, + "step": 538 + }, + { + "epoch": 0.032748040585697795, + "grad_norm": 0.23593762516975403, + "learning_rate": 9.978192834264306e-05, + "loss": 1.1626, + "step": 539 + }, + { + "epoch": 0.03280879761832432, + "grad_norm": 0.3769228756427765, + "learning_rate": 9.978103434419403e-05, + "loss": 1.1896, + "step": 540 + }, + { + "epoch": 0.03286955465095085, + "grad_norm": 0.1705779880285263, + "learning_rate": 9.978013852101155e-05, + "loss": 1.1551, + "step": 541 + }, + { + "epoch": 0.03293031168357737, + "grad_norm": 0.20579193532466888, + "learning_rate": 9.977924087312849e-05, + "loss": 1.1785, + "step": 542 + }, + { + "epoch": 0.0329910687162039, + "grad_norm": 0.22284187376499176, + "learning_rate": 9.977834140057771e-05, + "loss": 1.2733, + "step": 543 + }, + { + "epoch": 0.03305182574883043, + "grad_norm": 0.23286156356334686, + "learning_rate": 9.977744010339222e-05, + "loss": 1.2651, + "step": 544 + }, + { + "epoch": 0.033112582781456956, + "grad_norm": 0.20310372114181519, + "learning_rate": 9.977653698160503e-05, + "loss": 1.1101, + "step": 545 + }, + { + "epoch": 0.03317333981408348, + "grad_norm": 0.3925718367099762, + "learning_rate": 9.977563203524926e-05, + "loss": 1.3441, + "step": 546 + }, + { + "epoch": 0.033234096846710005, + "grad_norm": 0.36995238065719604, + "learning_rate": 9.977472526435809e-05, + "loss": 1.222, + "step": 547 + }, + { + "epoch": 0.03329485387933653, + "grad_norm": 0.269410640001297, + "learning_rate": 9.977381666896472e-05, + "loss": 1.1506, + "step": 548 + }, + { + "epoch": 0.03335561091196306, + "grad_norm": 0.3384246230125427, + "learning_rate": 9.977290624910249e-05, + "loss": 1.1627, + "step": 549 + }, + { + "epoch": 0.03341636794458959, + "grad_norm": 1.7741787433624268, + "learning_rate": 9.977199400480477e-05, + "loss": 1.1497, + "step": 550 + }, + { + "epoch": 0.03347712497721611, + "grad_norm": 0.5351099371910095, + "learning_rate": 9.977107993610497e-05, + "loss": 1.1993, + "step": 551 + }, + { + "epoch": 0.03353788200984264, + "grad_norm": 0.4378606379032135, + "learning_rate": 9.977016404303663e-05, + "loss": 1.232, + "step": 552 + }, + { + "epoch": 0.03359863904246917, + "grad_norm": 0.4066349267959595, + "learning_rate": 9.97692463256333e-05, + "loss": 1.1599, + "step": 553 + }, + { + "epoch": 0.033659396075095695, + "grad_norm": 1.2139794826507568, + "learning_rate": 9.976832678392862e-05, + "loss": 1.3566, + "step": 554 + }, + { + "epoch": 0.033720153107722216, + "grad_norm": 0.38028234243392944, + "learning_rate": 9.976740541795632e-05, + "loss": 1.2715, + "step": 555 + }, + { + "epoch": 0.033780910140348744, + "grad_norm": 0.3841686248779297, + "learning_rate": 9.976648222775013e-05, + "loss": 1.1234, + "step": 556 + }, + { + "epoch": 0.03384166717297527, + "grad_norm": 0.28865474462509155, + "learning_rate": 9.976555721334392e-05, + "loss": 1.1951, + "step": 557 + }, + { + "epoch": 0.0339024242056018, + "grad_norm": 0.46802064776420593, + "learning_rate": 9.976463037477162e-05, + "loss": 1.1814, + "step": 558 + }, + { + "epoch": 0.03396318123822833, + "grad_norm": 0.3400562107563019, + "learning_rate": 9.976370171206714e-05, + "loss": 1.2694, + "step": 559 + }, + { + "epoch": 0.03402393827085485, + "grad_norm": 0.24644635617733002, + "learning_rate": 9.976277122526457e-05, + "loss": 1.2131, + "step": 560 + }, + { + "epoch": 0.03408469530348138, + "grad_norm": 0.35238033533096313, + "learning_rate": 9.976183891439801e-05, + "loss": 1.2933, + "step": 561 + }, + { + "epoch": 0.034145452336107905, + "grad_norm": 0.5878807902336121, + "learning_rate": 9.976090477950161e-05, + "loss": 1.1863, + "step": 562 + }, + { + "epoch": 0.03420620936873443, + "grad_norm": 0.23976045846939087, + "learning_rate": 9.975996882060963e-05, + "loss": 1.1726, + "step": 563 + }, + { + "epoch": 0.034266966401360954, + "grad_norm": 0.45904335379600525, + "learning_rate": 9.975903103775638e-05, + "loss": 1.256, + "step": 564 + }, + { + "epoch": 0.03432772343398748, + "grad_norm": 0.22525691986083984, + "learning_rate": 9.975809143097622e-05, + "loss": 1.1364, + "step": 565 + }, + { + "epoch": 0.03438848046661401, + "grad_norm": 0.31694698333740234, + "learning_rate": 9.975715000030361e-05, + "loss": 1.2514, + "step": 566 + }, + { + "epoch": 0.03444923749924054, + "grad_norm": 0.46745648980140686, + "learning_rate": 9.975620674577303e-05, + "loss": 1.1186, + "step": 567 + }, + { + "epoch": 0.034509994531867066, + "grad_norm": 0.5622209310531616, + "learning_rate": 9.97552616674191e-05, + "loss": 1.2989, + "step": 568 + }, + { + "epoch": 0.03457075156449359, + "grad_norm": 0.2631570100784302, + "learning_rate": 9.975431476527642e-05, + "loss": 1.1795, + "step": 569 + }, + { + "epoch": 0.034631508597120116, + "grad_norm": 0.706419825553894, + "learning_rate": 9.975336603937972e-05, + "loss": 1.1334, + "step": 570 + }, + { + "epoch": 0.034692265629746644, + "grad_norm": 0.234999418258667, + "learning_rate": 9.975241548976377e-05, + "loss": 1.1608, + "step": 571 + }, + { + "epoch": 0.03475302266237317, + "grad_norm": 0.3765067458152771, + "learning_rate": 9.975146311646342e-05, + "loss": 1.2703, + "step": 572 + }, + { + "epoch": 0.03481377969499969, + "grad_norm": 0.3500179946422577, + "learning_rate": 9.975050891951356e-05, + "loss": 1.1921, + "step": 573 + }, + { + "epoch": 0.03487453672762622, + "grad_norm": 0.3293748199939728, + "learning_rate": 9.97495528989492e-05, + "loss": 1.1633, + "step": 574 + }, + { + "epoch": 0.03493529376025275, + "grad_norm": 0.24921773374080658, + "learning_rate": 9.974859505480534e-05, + "loss": 1.1629, + "step": 575 + }, + { + "epoch": 0.03499605079287928, + "grad_norm": 0.553653359413147, + "learning_rate": 9.974763538711713e-05, + "loss": 1.1635, + "step": 576 + }, + { + "epoch": 0.035056807825505805, + "grad_norm": 0.25096818804740906, + "learning_rate": 9.97466738959197e-05, + "loss": 1.1521, + "step": 577 + }, + { + "epoch": 0.035117564858132326, + "grad_norm": 0.7093678712844849, + "learning_rate": 9.974571058124836e-05, + "loss": 1.1843, + "step": 578 + }, + { + "epoch": 0.035178321890758854, + "grad_norm": 0.4011084735393524, + "learning_rate": 9.974474544313835e-05, + "loss": 1.2213, + "step": 579 + }, + { + "epoch": 0.03523907892338538, + "grad_norm": 0.2578686773777008, + "learning_rate": 9.974377848162511e-05, + "loss": 1.2518, + "step": 580 + }, + { + "epoch": 0.03529983595601191, + "grad_norm": 0.5746710896492004, + "learning_rate": 9.974280969674403e-05, + "loss": 1.1599, + "step": 581 + }, + { + "epoch": 0.03536059298863844, + "grad_norm": 0.46072524785995483, + "learning_rate": 9.974183908853065e-05, + "loss": 1.1874, + "step": 582 + }, + { + "epoch": 0.03542135002126496, + "grad_norm": 0.3364895284175873, + "learning_rate": 9.974086665702055e-05, + "loss": 1.2154, + "step": 583 + }, + { + "epoch": 0.03548210705389149, + "grad_norm": 0.6634140610694885, + "learning_rate": 9.973989240224936e-05, + "loss": 1.2645, + "step": 584 + }, + { + "epoch": 0.035542864086518015, + "grad_norm": 0.18739502131938934, + "learning_rate": 9.97389163242528e-05, + "loss": 1.1574, + "step": 585 + }, + { + "epoch": 0.03560362111914454, + "grad_norm": 0.3704637587070465, + "learning_rate": 9.973793842306667e-05, + "loss": 1.1632, + "step": 586 + }, + { + "epoch": 0.035664378151771065, + "grad_norm": 0.406078964471817, + "learning_rate": 9.973695869872676e-05, + "loss": 1.1416, + "step": 587 + }, + { + "epoch": 0.03572513518439759, + "grad_norm": 0.22447341680526733, + "learning_rate": 9.973597715126904e-05, + "loss": 1.1105, + "step": 588 + }, + { + "epoch": 0.03578589221702412, + "grad_norm": 0.20255105197429657, + "learning_rate": 9.973499378072945e-05, + "loss": 1.1235, + "step": 589 + }, + { + "epoch": 0.03584664924965065, + "grad_norm": 0.35011032223701477, + "learning_rate": 9.973400858714406e-05, + "loss": 1.1137, + "step": 590 + }, + { + "epoch": 0.03590740628227718, + "grad_norm": 0.33853498101234436, + "learning_rate": 9.973302157054897e-05, + "loss": 1.3758, + "step": 591 + }, + { + "epoch": 0.0359681633149037, + "grad_norm": 0.2971729040145874, + "learning_rate": 9.973203273098035e-05, + "loss": 1.1714, + "step": 592 + }, + { + "epoch": 0.036028920347530226, + "grad_norm": 0.35486486554145813, + "learning_rate": 9.973104206847446e-05, + "loss": 1.2644, + "step": 593 + }, + { + "epoch": 0.036089677380156754, + "grad_norm": 0.382726788520813, + "learning_rate": 9.973004958306761e-05, + "loss": 1.1486, + "step": 594 + }, + { + "epoch": 0.03615043441278328, + "grad_norm": 0.28380724787712097, + "learning_rate": 9.972905527479621e-05, + "loss": 1.1434, + "step": 595 + }, + { + "epoch": 0.0362111914454098, + "grad_norm": 0.27264660596847534, + "learning_rate": 9.972805914369665e-05, + "loss": 1.1639, + "step": 596 + }, + { + "epoch": 0.03627194847803633, + "grad_norm": 0.30375102162361145, + "learning_rate": 9.972706118980546e-05, + "loss": 1.1345, + "step": 597 + }, + { + "epoch": 0.03633270551066286, + "grad_norm": 0.20119057595729828, + "learning_rate": 9.972606141315924e-05, + "loss": 1.1644, + "step": 598 + }, + { + "epoch": 0.03639346254328939, + "grad_norm": 1.647966980934143, + "learning_rate": 9.972505981379464e-05, + "loss": 1.1781, + "step": 599 + }, + { + "epoch": 0.036454219575915915, + "grad_norm": 0.41259098052978516, + "learning_rate": 9.972405639174834e-05, + "loss": 1.2058, + "step": 600 + }, + { + "epoch": 0.036514976608542436, + "grad_norm": 10.903308868408203, + "learning_rate": 9.972305114705715e-05, + "loss": 1.2472, + "step": 601 + }, + { + "epoch": 0.036575733641168964, + "grad_norm": 0.8497883677482605, + "learning_rate": 9.972204407975792e-05, + "loss": 1.3055, + "step": 602 + }, + { + "epoch": 0.03663649067379549, + "grad_norm": 0.2843227684497833, + "learning_rate": 9.972103518988753e-05, + "loss": 1.3834, + "step": 603 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 0.24960339069366455, + "learning_rate": 9.9720024477483e-05, + "loss": 1.1357, + "step": 604 + }, + { + "epoch": 0.03675800473904854, + "grad_norm": 0.3230237662792206, + "learning_rate": 9.971901194258135e-05, + "loss": 1.2343, + "step": 605 + }, + { + "epoch": 0.03681876177167507, + "grad_norm": 0.4179176092147827, + "learning_rate": 9.971799758521972e-05, + "loss": 1.1919, + "step": 606 + }, + { + "epoch": 0.0368795188043016, + "grad_norm": 0.1874077171087265, + "learning_rate": 9.971698140543528e-05, + "loss": 1.1471, + "step": 607 + }, + { + "epoch": 0.036940275836928126, + "grad_norm": 0.28784072399139404, + "learning_rate": 9.971596340326526e-05, + "loss": 1.2477, + "step": 608 + }, + { + "epoch": 0.037001032869554654, + "grad_norm": 0.44791775941848755, + "learning_rate": 9.971494357874701e-05, + "loss": 1.2427, + "step": 609 + }, + { + "epoch": 0.037061789902181175, + "grad_norm": 0.19704169034957886, + "learning_rate": 9.971392193191789e-05, + "loss": 1.1317, + "step": 610 + }, + { + "epoch": 0.0371225469348077, + "grad_norm": 0.760901689529419, + "learning_rate": 9.971289846281535e-05, + "loss": 1.2597, + "step": 611 + }, + { + "epoch": 0.03718330396743423, + "grad_norm": 0.24059328436851501, + "learning_rate": 9.97118731714769e-05, + "loss": 1.3124, + "step": 612 + }, + { + "epoch": 0.03724406100006076, + "grad_norm": 0.46775710582733154, + "learning_rate": 9.971084605794013e-05, + "loss": 1.1538, + "step": 613 + }, + { + "epoch": 0.03730481803268729, + "grad_norm": 0.2856523096561432, + "learning_rate": 9.97098171222427e-05, + "loss": 1.2097, + "step": 614 + }, + { + "epoch": 0.03736557506531381, + "grad_norm": 0.21430863440036774, + "learning_rate": 9.970878636442231e-05, + "loss": 1.1704, + "step": 615 + }, + { + "epoch": 0.037426332097940336, + "grad_norm": 0.2931556701660156, + "learning_rate": 9.970775378451674e-05, + "loss": 1.1479, + "step": 616 + }, + { + "epoch": 0.037487089130566864, + "grad_norm": 0.45134103298187256, + "learning_rate": 9.970671938256386e-05, + "loss": 1.4385, + "step": 617 + }, + { + "epoch": 0.03754784616319339, + "grad_norm": 0.28096574544906616, + "learning_rate": 9.970568315860156e-05, + "loss": 1.189, + "step": 618 + }, + { + "epoch": 0.03760860319581991, + "grad_norm": 0.5512173175811768, + "learning_rate": 9.970464511266784e-05, + "loss": 1.2531, + "step": 619 + }, + { + "epoch": 0.03766936022844644, + "grad_norm": 0.16908709704875946, + "learning_rate": 9.970360524480076e-05, + "loss": 1.1393, + "step": 620 + }, + { + "epoch": 0.03773011726107297, + "grad_norm": 0.4691760540008545, + "learning_rate": 9.97025635550384e-05, + "loss": 1.2468, + "step": 621 + }, + { + "epoch": 0.0377908742936995, + "grad_norm": 0.4211539924144745, + "learning_rate": 9.970152004341897e-05, + "loss": 1.1406, + "step": 622 + }, + { + "epoch": 0.037851631326326025, + "grad_norm": 0.2903730869293213, + "learning_rate": 9.970047470998071e-05, + "loss": 1.2194, + "step": 623 + }, + { + "epoch": 0.037912388358952546, + "grad_norm": 0.7190700173377991, + "learning_rate": 9.969942755476198e-05, + "loss": 1.2489, + "step": 624 + }, + { + "epoch": 0.037973145391579075, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.969837857780109e-05, + "loss": 1.4563, + "step": 625 + }, + { + "epoch": 0.0380339024242056, + "grad_norm": 0.250702828168869, + "learning_rate": 9.969732777913653e-05, + "loss": 1.1457, + "step": 626 + }, + { + "epoch": 0.03809465945683213, + "grad_norm": 0.24044571816921234, + "learning_rate": 9.969627515880682e-05, + "loss": 1.1348, + "step": 627 + }, + { + "epoch": 0.03815541648945865, + "grad_norm": 0.23596933484077454, + "learning_rate": 9.969522071685054e-05, + "loss": 1.1084, + "step": 628 + }, + { + "epoch": 0.03821617352208518, + "grad_norm": 0.39057591557502747, + "learning_rate": 9.969416445330634e-05, + "loss": 1.1658, + "step": 629 + }, + { + "epoch": 0.03827693055471171, + "grad_norm": 0.19264733791351318, + "learning_rate": 9.969310636821294e-05, + "loss": 1.2475, + "step": 630 + }, + { + "epoch": 0.038337687587338236, + "grad_norm": 0.23034612834453583, + "learning_rate": 9.96920464616091e-05, + "loss": 1.1769, + "step": 631 + }, + { + "epoch": 0.038398444619964764, + "grad_norm": 0.21617116034030914, + "learning_rate": 9.96909847335337e-05, + "loss": 1.1668, + "step": 632 + }, + { + "epoch": 0.038459201652591285, + "grad_norm": 0.2674920856952667, + "learning_rate": 9.968992118402567e-05, + "loss": 1.2422, + "step": 633 + }, + { + "epoch": 0.03851995868521781, + "grad_norm": 0.30965131521224976, + "learning_rate": 9.968885581312397e-05, + "loss": 1.1773, + "step": 634 + }, + { + "epoch": 0.03858071571784434, + "grad_norm": 0.20929965376853943, + "learning_rate": 9.968778862086763e-05, + "loss": 1.2276, + "step": 635 + }, + { + "epoch": 0.03864147275047087, + "grad_norm": 0.28453242778778076, + "learning_rate": 9.968671960729581e-05, + "loss": 1.1192, + "step": 636 + }, + { + "epoch": 0.03870222978309739, + "grad_norm": 0.29931753873825073, + "learning_rate": 9.968564877244768e-05, + "loss": 1.2148, + "step": 637 + }, + { + "epoch": 0.03876298681572392, + "grad_norm": 0.4244219660758972, + "learning_rate": 9.968457611636248e-05, + "loss": 1.2368, + "step": 638 + }, + { + "epoch": 0.038823743848350446, + "grad_norm": 0.27939629554748535, + "learning_rate": 9.968350163907956e-05, + "loss": 1.0929, + "step": 639 + }, + { + "epoch": 0.038884500880976974, + "grad_norm": 0.23533456027507782, + "learning_rate": 9.968242534063827e-05, + "loss": 1.1855, + "step": 640 + }, + { + "epoch": 0.0389452579136035, + "grad_norm": 0.22506773471832275, + "learning_rate": 9.968134722107807e-05, + "loss": 1.319, + "step": 641 + }, + { + "epoch": 0.03900601494623002, + "grad_norm": 0.3768306374549866, + "learning_rate": 9.968026728043849e-05, + "loss": 1.1154, + "step": 642 + }, + { + "epoch": 0.03906677197885655, + "grad_norm": 0.2744412422180176, + "learning_rate": 9.96791855187591e-05, + "loss": 1.3277, + "step": 643 + }, + { + "epoch": 0.03912752901148308, + "grad_norm": 0.6380423307418823, + "learning_rate": 9.967810193607958e-05, + "loss": 1.1232, + "step": 644 + }, + { + "epoch": 0.03918828604410961, + "grad_norm": 0.2580445408821106, + "learning_rate": 9.967701653243963e-05, + "loss": 1.2685, + "step": 645 + }, + { + "epoch": 0.039249043076736136, + "grad_norm": 0.33161017298698425, + "learning_rate": 9.967592930787902e-05, + "loss": 1.1529, + "step": 646 + }, + { + "epoch": 0.03930980010936266, + "grad_norm": 0.21009348332881927, + "learning_rate": 9.967484026243763e-05, + "loss": 1.1526, + "step": 647 + }, + { + "epoch": 0.039370557141989185, + "grad_norm": 0.22496499121189117, + "learning_rate": 9.967374939615537e-05, + "loss": 1.1729, + "step": 648 + }, + { + "epoch": 0.03943131417461571, + "grad_norm": 0.2177799940109253, + "learning_rate": 9.967265670907222e-05, + "loss": 1.2111, + "step": 649 + }, + { + "epoch": 0.03949207120724224, + "grad_norm": 0.19962714612483978, + "learning_rate": 9.967156220122824e-05, + "loss": 1.1794, + "step": 650 + }, + { + "epoch": 0.03955282823986876, + "grad_norm": 0.3001613914966583, + "learning_rate": 9.967046587266354e-05, + "loss": 1.2756, + "step": 651 + }, + { + "epoch": 0.03961358527249529, + "grad_norm": 0.17096281051635742, + "learning_rate": 9.966936772341832e-05, + "loss": 1.2438, + "step": 652 + }, + { + "epoch": 0.03967434230512182, + "grad_norm": 0.3118416666984558, + "learning_rate": 9.966826775353281e-05, + "loss": 1.2722, + "step": 653 + }, + { + "epoch": 0.039735099337748346, + "grad_norm": 0.1986151784658432, + "learning_rate": 9.966716596304737e-05, + "loss": 1.1658, + "step": 654 + }, + { + "epoch": 0.039795856370374874, + "grad_norm": 0.2683010697364807, + "learning_rate": 9.966606235200234e-05, + "loss": 1.1902, + "step": 655 + }, + { + "epoch": 0.039856613403001395, + "grad_norm": 0.20436552166938782, + "learning_rate": 9.96649569204382e-05, + "loss": 1.2356, + "step": 656 + }, + { + "epoch": 0.03991737043562792, + "grad_norm": 0.31484872102737427, + "learning_rate": 9.966384966839547e-05, + "loss": 1.2013, + "step": 657 + }, + { + "epoch": 0.03997812746825445, + "grad_norm": 0.1810244768857956, + "learning_rate": 9.966274059591472e-05, + "loss": 1.1815, + "step": 658 + }, + { + "epoch": 0.04003888450088098, + "grad_norm": 0.21846987307071686, + "learning_rate": 9.966162970303663e-05, + "loss": 1.1703, + "step": 659 + }, + { + "epoch": 0.0400996415335075, + "grad_norm": 0.17850124835968018, + "learning_rate": 9.96605169898019e-05, + "loss": 1.1477, + "step": 660 + }, + { + "epoch": 0.04016039856613403, + "grad_norm": 0.23468884825706482, + "learning_rate": 9.965940245625131e-05, + "loss": 1.1484, + "step": 661 + }, + { + "epoch": 0.040221155598760557, + "grad_norm": 0.1681394726037979, + "learning_rate": 9.965828610242574e-05, + "loss": 1.1739, + "step": 662 + }, + { + "epoch": 0.040281912631387085, + "grad_norm": 0.3256566822528839, + "learning_rate": 9.965716792836606e-05, + "loss": 1.3391, + "step": 663 + }, + { + "epoch": 0.04034266966401361, + "grad_norm": 0.31519389152526855, + "learning_rate": 9.965604793411331e-05, + "loss": 1.3248, + "step": 664 + }, + { + "epoch": 0.040403426696640134, + "grad_norm": 0.38986799120903015, + "learning_rate": 9.965492611970854e-05, + "loss": 1.1911, + "step": 665 + }, + { + "epoch": 0.04046418372926666, + "grad_norm": 0.2666741907596588, + "learning_rate": 9.965380248519284e-05, + "loss": 1.1738, + "step": 666 + }, + { + "epoch": 0.04052494076189319, + "grad_norm": 4.861932277679443, + "learning_rate": 9.965267703060743e-05, + "loss": 1.2675, + "step": 667 + }, + { + "epoch": 0.04058569779451972, + "grad_norm": 0.2190086990594864, + "learning_rate": 9.965154975599352e-05, + "loss": 1.2847, + "step": 668 + }, + { + "epoch": 0.04064645482714624, + "grad_norm": 0.300801545381546, + "learning_rate": 9.965042066139248e-05, + "loss": 1.1476, + "step": 669 + }, + { + "epoch": 0.04070721185977277, + "grad_norm": 0.1510702222585678, + "learning_rate": 9.964928974684565e-05, + "loss": 1.1339, + "step": 670 + }, + { + "epoch": 0.040767968892399295, + "grad_norm": 0.250924289226532, + "learning_rate": 9.964815701239452e-05, + "loss": 1.2956, + "step": 671 + }, + { + "epoch": 0.04082872592502582, + "grad_norm": 0.3033612072467804, + "learning_rate": 9.964702245808059e-05, + "loss": 1.0902, + "step": 672 + }, + { + "epoch": 0.04088948295765235, + "grad_norm": 0.22391510009765625, + "learning_rate": 9.964588608394547e-05, + "loss": 1.3219, + "step": 673 + }, + { + "epoch": 0.04095023999027887, + "grad_norm": 0.28877395391464233, + "learning_rate": 9.96447478900308e-05, + "loss": 1.1541, + "step": 674 + }, + { + "epoch": 0.0410109970229054, + "grad_norm": 0.21077710390090942, + "learning_rate": 9.964360787637827e-05, + "loss": 1.2123, + "step": 675 + }, + { + "epoch": 0.04107175405553193, + "grad_norm": 0.2088579684495926, + "learning_rate": 9.964246604302974e-05, + "loss": 1.1197, + "step": 676 + }, + { + "epoch": 0.041132511088158456, + "grad_norm": 0.2369183748960495, + "learning_rate": 9.964132239002699e-05, + "loss": 1.2081, + "step": 677 + }, + { + "epoch": 0.04119326812078498, + "grad_norm": 0.35517820715904236, + "learning_rate": 9.964017691741198e-05, + "loss": 1.1635, + "step": 678 + }, + { + "epoch": 0.041254025153411505, + "grad_norm": 0.2329184114933014, + "learning_rate": 9.96390296252267e-05, + "loss": 1.1153, + "step": 679 + }, + { + "epoch": 0.041314782186038033, + "grad_norm": 0.249697744846344, + "learning_rate": 9.963788051351318e-05, + "loss": 1.308, + "step": 680 + }, + { + "epoch": 0.04137553921866456, + "grad_norm": 0.2712596356868744, + "learning_rate": 9.963672958231356e-05, + "loss": 1.2595, + "step": 681 + }, + { + "epoch": 0.04143629625129109, + "grad_norm": 0.15390761196613312, + "learning_rate": 9.963557683167003e-05, + "loss": 1.133, + "step": 682 + }, + { + "epoch": 0.04149705328391761, + "grad_norm": 0.31179946660995483, + "learning_rate": 9.963442226162482e-05, + "loss": 1.1638, + "step": 683 + }, + { + "epoch": 0.04155781031654414, + "grad_norm": 0.25089114904403687, + "learning_rate": 9.963326587222028e-05, + "loss": 1.1421, + "step": 684 + }, + { + "epoch": 0.04161856734917067, + "grad_norm": 0.2573951184749603, + "learning_rate": 9.96321076634988e-05, + "loss": 1.1881, + "step": 685 + }, + { + "epoch": 0.041679324381797195, + "grad_norm": 12.048274993896484, + "learning_rate": 9.96309476355028e-05, + "loss": 1.1963, + "step": 686 + }, + { + "epoch": 0.04174008141442372, + "grad_norm": 0.5792553424835205, + "learning_rate": 9.962978578827483e-05, + "loss": 1.1683, + "step": 687 + }, + { + "epoch": 0.041800838447050244, + "grad_norm": 0.26494306325912476, + "learning_rate": 9.962862212185746e-05, + "loss": 1.3472, + "step": 688 + }, + { + "epoch": 0.04186159547967677, + "grad_norm": 0.28178900480270386, + "learning_rate": 9.962745663629335e-05, + "loss": 1.2154, + "step": 689 + }, + { + "epoch": 0.0419223525123033, + "grad_norm": 0.340414434671402, + "learning_rate": 9.962628933162524e-05, + "loss": 1.2354, + "step": 690 + }, + { + "epoch": 0.04198310954492983, + "grad_norm": 0.24672427773475647, + "learning_rate": 9.962512020789588e-05, + "loss": 1.1686, + "step": 691 + }, + { + "epoch": 0.04204386657755635, + "grad_norm": 0.22035415470600128, + "learning_rate": 9.962394926514817e-05, + "loss": 1.1783, + "step": 692 + }, + { + "epoch": 0.04210462361018288, + "grad_norm": 0.4015302062034607, + "learning_rate": 9.9622776503425e-05, + "loss": 1.2413, + "step": 693 + }, + { + "epoch": 0.042165380642809405, + "grad_norm": 0.18768566846847534, + "learning_rate": 9.962160192276935e-05, + "loss": 1.1275, + "step": 694 + }, + { + "epoch": 0.04222613767543593, + "grad_norm": 0.3100888729095459, + "learning_rate": 9.962042552322431e-05, + "loss": 1.1383, + "step": 695 + }, + { + "epoch": 0.04228689470806246, + "grad_norm": 0.2774471938610077, + "learning_rate": 9.961924730483297e-05, + "loss": 1.2108, + "step": 696 + }, + { + "epoch": 0.04234765174068898, + "grad_norm": 0.510339617729187, + "learning_rate": 9.961806726763854e-05, + "loss": 1.2698, + "step": 697 + }, + { + "epoch": 0.04240840877331551, + "grad_norm": 0.4789173901081085, + "learning_rate": 9.961688541168424e-05, + "loss": 1.1169, + "step": 698 + }, + { + "epoch": 0.04246916580594204, + "grad_norm": 0.20484888553619385, + "learning_rate": 9.961570173701344e-05, + "loss": 1.1606, + "step": 699 + }, + { + "epoch": 0.04252992283856857, + "grad_norm": 0.30656594038009644, + "learning_rate": 9.961451624366949e-05, + "loss": 1.13, + "step": 700 + }, + { + "epoch": 0.04259067987119509, + "grad_norm": 0.33675429224967957, + "learning_rate": 9.961332893169586e-05, + "loss": 1.3475, + "step": 701 + }, + { + "epoch": 0.042651436903821616, + "grad_norm": 0.3515656888484955, + "learning_rate": 9.961213980113606e-05, + "loss": 1.1764, + "step": 702 + }, + { + "epoch": 0.042712193936448144, + "grad_norm": 0.43863317370414734, + "learning_rate": 9.96109488520337e-05, + "loss": 1.1448, + "step": 703 + }, + { + "epoch": 0.04277295096907467, + "grad_norm": 0.3262401521205902, + "learning_rate": 9.960975608443241e-05, + "loss": 1.1955, + "step": 704 + }, + { + "epoch": 0.0428337080017012, + "grad_norm": 0.28044047951698303, + "learning_rate": 9.960856149837592e-05, + "loss": 1.1635, + "step": 705 + }, + { + "epoch": 0.04289446503432772, + "grad_norm": 0.2565014362335205, + "learning_rate": 9.960736509390804e-05, + "loss": 1.1502, + "step": 706 + }, + { + "epoch": 0.04295522206695425, + "grad_norm": 0.2596074640750885, + "learning_rate": 9.960616687107258e-05, + "loss": 1.1355, + "step": 707 + }, + { + "epoch": 0.04301597909958078, + "grad_norm": 0.3365881145000458, + "learning_rate": 9.96049668299135e-05, + "loss": 1.1826, + "step": 708 + }, + { + "epoch": 0.043076736132207305, + "grad_norm": 0.20425531268119812, + "learning_rate": 9.960376497047476e-05, + "loss": 1.1858, + "step": 709 + }, + { + "epoch": 0.043137493164833826, + "grad_norm": 0.27772635221481323, + "learning_rate": 9.960256129280043e-05, + "loss": 1.1995, + "step": 710 + }, + { + "epoch": 0.043198250197460354, + "grad_norm": 0.9875108003616333, + "learning_rate": 9.960135579693462e-05, + "loss": 1.1236, + "step": 711 + }, + { + "epoch": 0.04325900723008688, + "grad_norm": 0.21602267026901245, + "learning_rate": 9.960014848292154e-05, + "loss": 1.1126, + "step": 712 + }, + { + "epoch": 0.04331976426271341, + "grad_norm": 0.19521121680736542, + "learning_rate": 9.959893935080542e-05, + "loss": 1.1219, + "step": 713 + }, + { + "epoch": 0.04338052129533994, + "grad_norm": 0.19386346638202667, + "learning_rate": 9.95977284006306e-05, + "loss": 1.2103, + "step": 714 + }, + { + "epoch": 0.04344127832796646, + "grad_norm": 0.2295754998922348, + "learning_rate": 9.959651563244146e-05, + "loss": 1.1571, + "step": 715 + }, + { + "epoch": 0.04350203536059299, + "grad_norm": 0.20171459019184113, + "learning_rate": 9.959530104628245e-05, + "loss": 1.1399, + "step": 716 + }, + { + "epoch": 0.043562792393219515, + "grad_norm": 0.216019868850708, + "learning_rate": 9.959408464219809e-05, + "loss": 1.248, + "step": 717 + }, + { + "epoch": 0.043623549425846044, + "grad_norm": 0.386947900056839, + "learning_rate": 9.959286642023297e-05, + "loss": 1.1545, + "step": 718 + }, + { + "epoch": 0.04368430645847257, + "grad_norm": 0.33577942848205566, + "learning_rate": 9.959164638043175e-05, + "loss": 1.1484, + "step": 719 + }, + { + "epoch": 0.04374506349109909, + "grad_norm": 0.24220316112041473, + "learning_rate": 9.959042452283914e-05, + "loss": 1.3661, + "step": 720 + }, + { + "epoch": 0.04380582052372562, + "grad_norm": 0.2601158916950226, + "learning_rate": 9.958920084749993e-05, + "loss": 1.2403, + "step": 721 + }, + { + "epoch": 0.04386657755635215, + "grad_norm": 0.29173552989959717, + "learning_rate": 9.958797535445899e-05, + "loss": 1.1332, + "step": 722 + }, + { + "epoch": 0.04392733458897868, + "grad_norm": 0.17891642451286316, + "learning_rate": 9.958674804376123e-05, + "loss": 1.1169, + "step": 723 + }, + { + "epoch": 0.0439880916216052, + "grad_norm": 0.3143637478351593, + "learning_rate": 9.958551891545161e-05, + "loss": 1.227, + "step": 724 + }, + { + "epoch": 0.044048848654231726, + "grad_norm": 0.22817392647266388, + "learning_rate": 9.958428796957523e-05, + "loss": 1.1594, + "step": 725 + }, + { + "epoch": 0.044109605686858254, + "grad_norm": 0.14670951664447784, + "learning_rate": 9.95830552061772e-05, + "loss": 1.1391, + "step": 726 + }, + { + "epoch": 0.04417036271948478, + "grad_norm": 0.1939353346824646, + "learning_rate": 9.958182062530268e-05, + "loss": 1.137, + "step": 727 + }, + { + "epoch": 0.04423111975211131, + "grad_norm": 0.22217488288879395, + "learning_rate": 9.958058422699695e-05, + "loss": 1.1166, + "step": 728 + }, + { + "epoch": 0.04429187678473783, + "grad_norm": 0.3429369032382965, + "learning_rate": 9.957934601130529e-05, + "loss": 1.337, + "step": 729 + }, + { + "epoch": 0.04435263381736436, + "grad_norm": 0.3005068898200989, + "learning_rate": 9.957810597827316e-05, + "loss": 1.262, + "step": 730 + }, + { + "epoch": 0.04441339084999089, + "grad_norm": 0.3939972519874573, + "learning_rate": 9.957686412794595e-05, + "loss": 1.1021, + "step": 731 + }, + { + "epoch": 0.044474147882617415, + "grad_norm": 0.20464569330215454, + "learning_rate": 9.957562046036921e-05, + "loss": 1.2641, + "step": 732 + }, + { + "epoch": 0.044534904915243936, + "grad_norm": 0.3335719406604767, + "learning_rate": 9.957437497558852e-05, + "loss": 1.1632, + "step": 733 + }, + { + "epoch": 0.044595661947870464, + "grad_norm": 0.35726863145828247, + "learning_rate": 9.957312767364953e-05, + "loss": 1.2758, + "step": 734 + }, + { + "epoch": 0.04465641898049699, + "grad_norm": 0.2093924731016159, + "learning_rate": 9.957187855459798e-05, + "loss": 1.1587, + "step": 735 + }, + { + "epoch": 0.04471717601312352, + "grad_norm": 0.3535689115524292, + "learning_rate": 9.957062761847962e-05, + "loss": 1.2929, + "step": 736 + }, + { + "epoch": 0.04477793304575005, + "grad_norm": 0.1873035728931427, + "learning_rate": 9.956937486534033e-05, + "loss": 1.1691, + "step": 737 + }, + { + "epoch": 0.04483869007837657, + "grad_norm": 0.22934308648109436, + "learning_rate": 9.956812029522602e-05, + "loss": 1.1608, + "step": 738 + }, + { + "epoch": 0.0448994471110031, + "grad_norm": 0.4806212782859802, + "learning_rate": 9.956686390818268e-05, + "loss": 1.3248, + "step": 739 + }, + { + "epoch": 0.044960204143629626, + "grad_norm": 0.38668379187583923, + "learning_rate": 9.956560570425635e-05, + "loss": 1.358, + "step": 740 + }, + { + "epoch": 0.045020961176256154, + "grad_norm": 0.3260784149169922, + "learning_rate": 9.956434568349318e-05, + "loss": 1.1787, + "step": 741 + }, + { + "epoch": 0.045081718208882675, + "grad_norm": 0.20555247366428375, + "learning_rate": 9.956308384593933e-05, + "loss": 1.1206, + "step": 742 + }, + { + "epoch": 0.0451424752415092, + "grad_norm": 0.21444964408874512, + "learning_rate": 9.956182019164106e-05, + "loss": 1.2573, + "step": 743 + }, + { + "epoch": 0.04520323227413573, + "grad_norm": 0.20223313570022583, + "learning_rate": 9.956055472064469e-05, + "loss": 1.1326, + "step": 744 + }, + { + "epoch": 0.04526398930676226, + "grad_norm": 0.17892780900001526, + "learning_rate": 9.955928743299662e-05, + "loss": 1.1954, + "step": 745 + }, + { + "epoch": 0.04532474633938879, + "grad_norm": 0.1672462522983551, + "learning_rate": 9.955801832874328e-05, + "loss": 1.1484, + "step": 746 + }, + { + "epoch": 0.04538550337201531, + "grad_norm": 0.36391040682792664, + "learning_rate": 9.955674740793119e-05, + "loss": 1.2516, + "step": 747 + }, + { + "epoch": 0.045446260404641836, + "grad_norm": 0.38535046577453613, + "learning_rate": 9.955547467060696e-05, + "loss": 1.2676, + "step": 748 + }, + { + "epoch": 0.045507017437268364, + "grad_norm": 0.27136051654815674, + "learning_rate": 9.955420011681722e-05, + "loss": 1.1743, + "step": 749 + }, + { + "epoch": 0.04556777446989489, + "grad_norm": 0.19951479136943817, + "learning_rate": 9.955292374660869e-05, + "loss": 1.1381, + "step": 750 + }, + { + "epoch": 0.04562853150252142, + "grad_norm": 0.1706264764070511, + "learning_rate": 9.955164556002819e-05, + "loss": 1.1095, + "step": 751 + }, + { + "epoch": 0.04568928853514794, + "grad_norm": 0.2307135909795761, + "learning_rate": 9.955036555712252e-05, + "loss": 1.1944, + "step": 752 + }, + { + "epoch": 0.04575004556777447, + "grad_norm": 0.5656225085258484, + "learning_rate": 9.954908373793861e-05, + "loss": 1.5714, + "step": 753 + }, + { + "epoch": 0.045810802600401, + "grad_norm": 0.26978978514671326, + "learning_rate": 9.954780010252347e-05, + "loss": 1.1083, + "step": 754 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 0.21326936781406403, + "learning_rate": 9.954651465092414e-05, + "loss": 1.124, + "step": 755 + }, + { + "epoch": 0.04593231666565405, + "grad_norm": 0.4203796088695526, + "learning_rate": 9.954522738318776e-05, + "loss": 1.2392, + "step": 756 + }, + { + "epoch": 0.045993073698280575, + "grad_norm": 0.27247968316078186, + "learning_rate": 9.954393829936146e-05, + "loss": 1.1137, + "step": 757 + }, + { + "epoch": 0.0460538307309071, + "grad_norm": 0.31686800718307495, + "learning_rate": 9.954264739949255e-05, + "loss": 1.2118, + "step": 758 + }, + { + "epoch": 0.04611458776353363, + "grad_norm": 0.3327178359031677, + "learning_rate": 9.954135468362831e-05, + "loss": 1.1996, + "step": 759 + }, + { + "epoch": 0.04617534479616016, + "grad_norm": 0.228616401553154, + "learning_rate": 9.954006015181614e-05, + "loss": 1.272, + "step": 760 + }, + { + "epoch": 0.04623610182878668, + "grad_norm": 0.5685213208198547, + "learning_rate": 9.95387638041035e-05, + "loss": 1.0946, + "step": 761 + }, + { + "epoch": 0.04629685886141321, + "grad_norm": 0.40294498205184937, + "learning_rate": 9.953746564053788e-05, + "loss": 1.1519, + "step": 762 + }, + { + "epoch": 0.046357615894039736, + "grad_norm": 0.3475242853164673, + "learning_rate": 9.953616566116689e-05, + "loss": 1.2288, + "step": 763 + }, + { + "epoch": 0.046418372926666264, + "grad_norm": 0.2769295275211334, + "learning_rate": 9.953486386603816e-05, + "loss": 1.107, + "step": 764 + }, + { + "epoch": 0.046479129959292785, + "grad_norm": 0.19877159595489502, + "learning_rate": 9.953356025519945e-05, + "loss": 1.1911, + "step": 765 + }, + { + "epoch": 0.04653988699191931, + "grad_norm": 0.32342344522476196, + "learning_rate": 9.953225482869849e-05, + "loss": 1.2127, + "step": 766 + }, + { + "epoch": 0.04660064402454584, + "grad_norm": 0.4708094596862793, + "learning_rate": 9.953094758658315e-05, + "loss": 1.1604, + "step": 767 + }, + { + "epoch": 0.04666140105717237, + "grad_norm": 1.619902491569519, + "learning_rate": 9.952963852890137e-05, + "loss": 1.1045, + "step": 768 + }, + { + "epoch": 0.0467221580897989, + "grad_norm": 0.6743023991584778, + "learning_rate": 9.95283276557011e-05, + "loss": 1.2388, + "step": 769 + }, + { + "epoch": 0.04678291512242542, + "grad_norm": 0.2565208375453949, + "learning_rate": 9.952701496703043e-05, + "loss": 1.1911, + "step": 770 + }, + { + "epoch": 0.046843672155051946, + "grad_norm": 0.5599068999290466, + "learning_rate": 9.952570046293743e-05, + "loss": 1.189, + "step": 771 + }, + { + "epoch": 0.046904429187678474, + "grad_norm": 0.22475752234458923, + "learning_rate": 9.952438414347032e-05, + "loss": 1.1731, + "step": 772 + }, + { + "epoch": 0.046965186220305, + "grad_norm": 0.18163125216960907, + "learning_rate": 9.952306600867734e-05, + "loss": 1.1635, + "step": 773 + }, + { + "epoch": 0.047025943252931524, + "grad_norm": 0.23896318674087524, + "learning_rate": 9.952174605860678e-05, + "loss": 1.2031, + "step": 774 + }, + { + "epoch": 0.04708670028555805, + "grad_norm": 0.23537391424179077, + "learning_rate": 9.952042429330705e-05, + "loss": 1.1295, + "step": 775 + }, + { + "epoch": 0.04714745731818458, + "grad_norm": 0.17851340770721436, + "learning_rate": 9.951910071282662e-05, + "loss": 1.138, + "step": 776 + }, + { + "epoch": 0.04720821435081111, + "grad_norm": 0.25157350301742554, + "learning_rate": 9.951777531721396e-05, + "loss": 1.1467, + "step": 777 + }, + { + "epoch": 0.047268971383437636, + "grad_norm": 0.19467604160308838, + "learning_rate": 9.951644810651767e-05, + "loss": 1.2312, + "step": 778 + }, + { + "epoch": 0.04732972841606416, + "grad_norm": 0.34383106231689453, + "learning_rate": 9.951511908078644e-05, + "loss": 1.3084, + "step": 779 + }, + { + "epoch": 0.047390485448690685, + "grad_norm": 0.2738543450832367, + "learning_rate": 9.951378824006891e-05, + "loss": 1.2193, + "step": 780 + }, + { + "epoch": 0.04745124248131721, + "grad_norm": 0.3195165991783142, + "learning_rate": 9.951245558441391e-05, + "loss": 1.2006, + "step": 781 + }, + { + "epoch": 0.04751199951394374, + "grad_norm": 0.20014366507530212, + "learning_rate": 9.951112111387029e-05, + "loss": 1.148, + "step": 782 + }, + { + "epoch": 0.04757275654657027, + "grad_norm": 0.7697231769561768, + "learning_rate": 9.950978482848693e-05, + "loss": 1.2827, + "step": 783 + }, + { + "epoch": 0.04763351357919679, + "grad_norm": 0.21268193423748016, + "learning_rate": 9.950844672831288e-05, + "loss": 1.2048, + "step": 784 + }, + { + "epoch": 0.04769427061182332, + "grad_norm": 0.20504657924175262, + "learning_rate": 9.950710681339712e-05, + "loss": 1.114, + "step": 785 + }, + { + "epoch": 0.047755027644449846, + "grad_norm": 0.2534680664539337, + "learning_rate": 9.95057650837888e-05, + "loss": 1.1585, + "step": 786 + }, + { + "epoch": 0.047815784677076374, + "grad_norm": 0.23410208523273468, + "learning_rate": 9.950442153953707e-05, + "loss": 1.1851, + "step": 787 + }, + { + "epoch": 0.047876541709702895, + "grad_norm": 0.21911850571632385, + "learning_rate": 9.950307618069122e-05, + "loss": 1.1967, + "step": 788 + }, + { + "epoch": 0.04793729874232942, + "grad_norm": 0.2831370234489441, + "learning_rate": 9.950172900730053e-05, + "loss": 1.2854, + "step": 789 + }, + { + "epoch": 0.04799805577495595, + "grad_norm": 0.20728982985019684, + "learning_rate": 9.950038001941441e-05, + "loss": 1.2317, + "step": 790 + }, + { + "epoch": 0.04805881280758248, + "grad_norm": 0.32236918807029724, + "learning_rate": 9.949902921708228e-05, + "loss": 1.2141, + "step": 791 + }, + { + "epoch": 0.04811956984020901, + "grad_norm": 0.2485806941986084, + "learning_rate": 9.949767660035368e-05, + "loss": 1.1434, + "step": 792 + }, + { + "epoch": 0.04818032687283553, + "grad_norm": 0.19720609486103058, + "learning_rate": 9.949632216927815e-05, + "loss": 1.1489, + "step": 793 + }, + { + "epoch": 0.04824108390546206, + "grad_norm": 0.2985354959964752, + "learning_rate": 9.949496592390539e-05, + "loss": 1.2711, + "step": 794 + }, + { + "epoch": 0.048301840938088585, + "grad_norm": 0.2601931393146515, + "learning_rate": 9.949360786428508e-05, + "loss": 1.1099, + "step": 795 + }, + { + "epoch": 0.04836259797071511, + "grad_norm": 0.21803413331508636, + "learning_rate": 9.949224799046699e-05, + "loss": 1.3067, + "step": 796 + }, + { + "epoch": 0.048423355003341634, + "grad_norm": 0.1801166832447052, + "learning_rate": 9.9490886302501e-05, + "loss": 1.1214, + "step": 797 + }, + { + "epoch": 0.04848411203596816, + "grad_norm": 0.16227313876152039, + "learning_rate": 9.948952280043701e-05, + "loss": 1.1485, + "step": 798 + }, + { + "epoch": 0.04854486906859469, + "grad_norm": 0.1986556202173233, + "learning_rate": 9.948815748432499e-05, + "loss": 1.1811, + "step": 799 + }, + { + "epoch": 0.04860562610122122, + "grad_norm": 0.2063964605331421, + "learning_rate": 9.948679035421499e-05, + "loss": 1.1029, + "step": 800 + }, + { + "epoch": 0.048666383133847746, + "grad_norm": 0.21714679896831512, + "learning_rate": 9.948542141015714e-05, + "loss": 1.2234, + "step": 801 + }, + { + "epoch": 0.04872714016647427, + "grad_norm": 0.19539695978164673, + "learning_rate": 9.948405065220156e-05, + "loss": 1.1159, + "step": 802 + }, + { + "epoch": 0.048787897199100795, + "grad_norm": 0.8304261565208435, + "learning_rate": 9.948267808039857e-05, + "loss": 1.1999, + "step": 803 + }, + { + "epoch": 0.04884865423172732, + "grad_norm": 0.2534937858581543, + "learning_rate": 9.948130369479844e-05, + "loss": 1.1281, + "step": 804 + }, + { + "epoch": 0.04890941126435385, + "grad_norm": 0.6079438328742981, + "learning_rate": 9.947992749545158e-05, + "loss": 1.1047, + "step": 805 + }, + { + "epoch": 0.04897016829698037, + "grad_norm": 0.7486149668693542, + "learning_rate": 9.94785494824084e-05, + "loss": 1.3219, + "step": 806 + }, + { + "epoch": 0.0490309253296069, + "grad_norm": 0.40212923288345337, + "learning_rate": 9.947716965571941e-05, + "loss": 1.1158, + "step": 807 + }, + { + "epoch": 0.04909168236223343, + "grad_norm": 0.32530421018600464, + "learning_rate": 9.947578801543522e-05, + "loss": 1.1595, + "step": 808 + }, + { + "epoch": 0.049152439394859956, + "grad_norm": 0.3300507962703705, + "learning_rate": 9.947440456160645e-05, + "loss": 1.1426, + "step": 809 + }, + { + "epoch": 0.049213196427486484, + "grad_norm": 0.2577906548976898, + "learning_rate": 9.947301929428384e-05, + "loss": 1.1482, + "step": 810 + }, + { + "epoch": 0.049273953460113006, + "grad_norm": 93.94365692138672, + "learning_rate": 9.94716322135181e-05, + "loss": 1.1666, + "step": 811 + }, + { + "epoch": 0.049334710492739534, + "grad_norm": 1.0377168655395508, + "learning_rate": 9.947024331936015e-05, + "loss": 1.1194, + "step": 812 + }, + { + "epoch": 0.04939546752536606, + "grad_norm": 0.2052767127752304, + "learning_rate": 9.946885261186086e-05, + "loss": 1.1368, + "step": 813 + }, + { + "epoch": 0.04945622455799259, + "grad_norm": 0.28331485390663147, + "learning_rate": 9.946746009107121e-05, + "loss": 1.126, + "step": 814 + }, + { + "epoch": 0.04951698159061911, + "grad_norm": 0.2663123309612274, + "learning_rate": 9.946606575704227e-05, + "loss": 1.1422, + "step": 815 + }, + { + "epoch": 0.04957773862324564, + "grad_norm": 0.3345920741558075, + "learning_rate": 9.946466960982512e-05, + "loss": 1.225, + "step": 816 + }, + { + "epoch": 0.04963849565587217, + "grad_norm": 0.27361583709716797, + "learning_rate": 9.946327164947094e-05, + "loss": 1.1195, + "step": 817 + }, + { + "epoch": 0.049699252688498695, + "grad_norm": 0.42706793546676636, + "learning_rate": 9.946187187603097e-05, + "loss": 1.157, + "step": 818 + }, + { + "epoch": 0.04976000972112522, + "grad_norm": 0.20006580650806427, + "learning_rate": 9.946047028955651e-05, + "loss": 1.2072, + "step": 819 + }, + { + "epoch": 0.049820766753751744, + "grad_norm": 0.33667150139808655, + "learning_rate": 9.945906689009898e-05, + "loss": 1.2, + "step": 820 + }, + { + "epoch": 0.04988152378637827, + "grad_norm": 0.1654760092496872, + "learning_rate": 9.94576616777098e-05, + "loss": 1.1349, + "step": 821 + }, + { + "epoch": 0.0499422808190048, + "grad_norm": 0.2234649658203125, + "learning_rate": 9.945625465244044e-05, + "loss": 1.1722, + "step": 822 + }, + { + "epoch": 0.05000303785163133, + "grad_norm": 0.2198670357465744, + "learning_rate": 9.945484581434253e-05, + "loss": 1.1624, + "step": 823 + }, + { + "epoch": 0.050063794884257856, + "grad_norm": 0.3813166916370392, + "learning_rate": 9.945343516346768e-05, + "loss": 1.1479, + "step": 824 + }, + { + "epoch": 0.05012455191688438, + "grad_norm": 0.17954643070697784, + "learning_rate": 9.94520226998676e-05, + "loss": 1.168, + "step": 825 + }, + { + "epoch": 0.050185308949510905, + "grad_norm": 0.3486316204071045, + "learning_rate": 9.945060842359408e-05, + "loss": 1.1609, + "step": 826 + }, + { + "epoch": 0.05024606598213743, + "grad_norm": 0.18670342862606049, + "learning_rate": 9.944919233469894e-05, + "loss": 1.2275, + "step": 827 + }, + { + "epoch": 0.05030682301476396, + "grad_norm": 0.42580702900886536, + "learning_rate": 9.944777443323409e-05, + "loss": 1.2689, + "step": 828 + }, + { + "epoch": 0.05036758004739048, + "grad_norm": 0.1723891943693161, + "learning_rate": 9.944635471925153e-05, + "loss": 1.1456, + "step": 829 + }, + { + "epoch": 0.05042833708001701, + "grad_norm": 0.4415867328643799, + "learning_rate": 9.944493319280326e-05, + "loss": 1.171, + "step": 830 + }, + { + "epoch": 0.05048909411264354, + "grad_norm": 0.18282508850097656, + "learning_rate": 9.944350985394142e-05, + "loss": 1.1425, + "step": 831 + }, + { + "epoch": 0.05054985114527007, + "grad_norm": 5.300233364105225, + "learning_rate": 9.944208470271816e-05, + "loss": 1.1379, + "step": 832 + }, + { + "epoch": 0.050610608177896595, + "grad_norm": 0.4158828854560852, + "learning_rate": 9.944065773918574e-05, + "loss": 1.1716, + "step": 833 + }, + { + "epoch": 0.050671365210523116, + "grad_norm": 0.9802979230880737, + "learning_rate": 9.943922896339644e-05, + "loss": 1.2526, + "step": 834 + }, + { + "epoch": 0.050732122243149644, + "grad_norm": 0.15631850063800812, + "learning_rate": 9.943779837540266e-05, + "loss": 1.1664, + "step": 835 + }, + { + "epoch": 0.05079287927577617, + "grad_norm": 0.39770379662513733, + "learning_rate": 9.94363659752568e-05, + "loss": 1.1575, + "step": 836 + }, + { + "epoch": 0.0508536363084027, + "grad_norm": 0.2634415030479431, + "learning_rate": 9.943493176301142e-05, + "loss": 1.1118, + "step": 837 + }, + { + "epoch": 0.05091439334102922, + "grad_norm": 0.4317116439342499, + "learning_rate": 9.943349573871905e-05, + "loss": 1.4348, + "step": 838 + }, + { + "epoch": 0.05097515037365575, + "grad_norm": 0.4688839316368103, + "learning_rate": 9.943205790243233e-05, + "loss": 1.0981, + "step": 839 + }, + { + "epoch": 0.05103590740628228, + "grad_norm": 0.3700367510318756, + "learning_rate": 9.943061825420398e-05, + "loss": 1.168, + "step": 840 + }, + { + "epoch": 0.051096664438908805, + "grad_norm": 0.18428249657154083, + "learning_rate": 9.942917679408676e-05, + "loss": 1.2174, + "step": 841 + }, + { + "epoch": 0.05115742147153533, + "grad_norm": 0.1904333531856537, + "learning_rate": 9.94277335221335e-05, + "loss": 1.1403, + "step": 842 + }, + { + "epoch": 0.051218178504161854, + "grad_norm": 0.23415088653564453, + "learning_rate": 9.942628843839714e-05, + "loss": 1.1146, + "step": 843 + }, + { + "epoch": 0.05127893553678838, + "grad_norm": 0.17599831521511078, + "learning_rate": 9.94248415429306e-05, + "loss": 1.1422, + "step": 844 + }, + { + "epoch": 0.05133969256941491, + "grad_norm": 0.3157031238079071, + "learning_rate": 9.942339283578694e-05, + "loss": 1.1025, + "step": 845 + }, + { + "epoch": 0.05140044960204144, + "grad_norm": 0.27406126260757446, + "learning_rate": 9.942194231701929e-05, + "loss": 1.3205, + "step": 846 + }, + { + "epoch": 0.05146120663466796, + "grad_norm": 0.534218966960907, + "learning_rate": 9.942048998668077e-05, + "loss": 1.2346, + "step": 847 + }, + { + "epoch": 0.05152196366729449, + "grad_norm": 0.24832171201705933, + "learning_rate": 9.941903584482463e-05, + "loss": 1.2309, + "step": 848 + }, + { + "epoch": 0.051582720699921016, + "grad_norm": 0.3907499313354492, + "learning_rate": 9.941757989150419e-05, + "loss": 1.2425, + "step": 849 + }, + { + "epoch": 0.051643477732547544, + "grad_norm": 0.17559818923473358, + "learning_rate": 9.94161221267728e-05, + "loss": 1.1128, + "step": 850 + }, + { + "epoch": 0.05170423476517407, + "grad_norm": 0.7275607585906982, + "learning_rate": 9.94146625506839e-05, + "loss": 1.3044, + "step": 851 + }, + { + "epoch": 0.05176499179780059, + "grad_norm": 0.37362074851989746, + "learning_rate": 9.9413201163291e-05, + "loss": 1.2876, + "step": 852 + }, + { + "epoch": 0.05182574883042712, + "grad_norm": 0.3065381348133087, + "learning_rate": 9.941173796464767e-05, + "loss": 1.1509, + "step": 853 + }, + { + "epoch": 0.05188650586305365, + "grad_norm": 0.19520434737205505, + "learning_rate": 9.941027295480752e-05, + "loss": 1.1182, + "step": 854 + }, + { + "epoch": 0.05194726289568018, + "grad_norm": 0.26167190074920654, + "learning_rate": 9.940880613382427e-05, + "loss": 1.217, + "step": 855 + }, + { + "epoch": 0.052008019928306705, + "grad_norm": 0.3808404505252838, + "learning_rate": 9.940733750175166e-05, + "loss": 1.2523, + "step": 856 + }, + { + "epoch": 0.052068776960933226, + "grad_norm": 0.3537200391292572, + "learning_rate": 9.940586705864357e-05, + "loss": 1.2519, + "step": 857 + }, + { + "epoch": 0.052129533993559754, + "grad_norm": 0.2563754916191101, + "learning_rate": 9.940439480455386e-05, + "loss": 1.1331, + "step": 858 + }, + { + "epoch": 0.05219029102618628, + "grad_norm": 0.3466266393661499, + "learning_rate": 9.940292073953652e-05, + "loss": 1.1245, + "step": 859 + }, + { + "epoch": 0.05225104805881281, + "grad_norm": 0.4410223662853241, + "learning_rate": 9.940144486364556e-05, + "loss": 1.0973, + "step": 860 + }, + { + "epoch": 0.05231180509143933, + "grad_norm": 0.15505509078502655, + "learning_rate": 9.93999671769351e-05, + "loss": 1.1085, + "step": 861 + }, + { + "epoch": 0.05237256212406586, + "grad_norm": 0.3102237582206726, + "learning_rate": 9.939848767945927e-05, + "loss": 1.1551, + "step": 862 + }, + { + "epoch": 0.05243331915669239, + "grad_norm": 0.2113533914089203, + "learning_rate": 9.939700637127235e-05, + "loss": 1.1252, + "step": 863 + }, + { + "epoch": 0.052494076189318915, + "grad_norm": 0.37543654441833496, + "learning_rate": 9.93955232524286e-05, + "loss": 1.0799, + "step": 864 + }, + { + "epoch": 0.05255483322194544, + "grad_norm": 0.18159475922584534, + "learning_rate": 9.93940383229824e-05, + "loss": 1.1994, + "step": 865 + }, + { + "epoch": 0.052615590254571964, + "grad_norm": 0.2303076982498169, + "learning_rate": 9.939255158298819e-05, + "loss": 1.1605, + "step": 866 + }, + { + "epoch": 0.05267634728719849, + "grad_norm": 0.20846877992153168, + "learning_rate": 9.939106303250044e-05, + "loss": 1.1441, + "step": 867 + }, + { + "epoch": 0.05273710431982502, + "grad_norm": 0.215975821018219, + "learning_rate": 9.938957267157374e-05, + "loss": 1.1633, + "step": 868 + }, + { + "epoch": 0.05279786135245155, + "grad_norm": 0.18180370330810547, + "learning_rate": 9.93880805002627e-05, + "loss": 1.1145, + "step": 869 + }, + { + "epoch": 0.05285861838507807, + "grad_norm": 0.4241054654121399, + "learning_rate": 9.938658651862201e-05, + "loss": 1.2631, + "step": 870 + }, + { + "epoch": 0.0529193754177046, + "grad_norm": 0.48412010073661804, + "learning_rate": 9.938509072670646e-05, + "loss": 1.1602, + "step": 871 + }, + { + "epoch": 0.052980132450331126, + "grad_norm": 0.24033743143081665, + "learning_rate": 9.938359312457087e-05, + "loss": 1.2714, + "step": 872 + }, + { + "epoch": 0.053040889482957654, + "grad_norm": 0.2354423701763153, + "learning_rate": 9.938209371227011e-05, + "loss": 1.1154, + "step": 873 + }, + { + "epoch": 0.05310164651558418, + "grad_norm": 0.19644786417484283, + "learning_rate": 9.938059248985916e-05, + "loss": 1.1415, + "step": 874 + }, + { + "epoch": 0.0531624035482107, + "grad_norm": 0.1432892233133316, + "learning_rate": 9.937908945739307e-05, + "loss": 1.114, + "step": 875 + }, + { + "epoch": 0.05322316058083723, + "grad_norm": 0.30041730403900146, + "learning_rate": 9.93775846149269e-05, + "loss": 1.2071, + "step": 876 + }, + { + "epoch": 0.05328391761346376, + "grad_norm": 0.24715879559516907, + "learning_rate": 9.937607796251582e-05, + "loss": 1.2008, + "step": 877 + }, + { + "epoch": 0.05334467464609029, + "grad_norm": 0.2856390178203583, + "learning_rate": 9.937456950021504e-05, + "loss": 1.1197, + "step": 878 + }, + { + "epoch": 0.05340543167871681, + "grad_norm": 0.589713454246521, + "learning_rate": 9.937305922807989e-05, + "loss": 1.1317, + "step": 879 + }, + { + "epoch": 0.053466188711343336, + "grad_norm": 0.2540833055973053, + "learning_rate": 9.937154714616572e-05, + "loss": 1.2142, + "step": 880 + }, + { + "epoch": 0.053526945743969864, + "grad_norm": 0.37999507784843445, + "learning_rate": 9.937003325452792e-05, + "loss": 1.1927, + "step": 881 + }, + { + "epoch": 0.05358770277659639, + "grad_norm": 0.2898063659667969, + "learning_rate": 9.936851755322203e-05, + "loss": 1.1661, + "step": 882 + }, + { + "epoch": 0.05364845980922292, + "grad_norm": 0.20970571041107178, + "learning_rate": 9.936700004230358e-05, + "loss": 1.0903, + "step": 883 + }, + { + "epoch": 0.05370921684184944, + "grad_norm": 0.5133906602859497, + "learning_rate": 9.93654807218282e-05, + "loss": 1.194, + "step": 884 + }, + { + "epoch": 0.05376997387447597, + "grad_norm": 0.2385426014661789, + "learning_rate": 9.936395959185158e-05, + "loss": 1.2504, + "step": 885 + }, + { + "epoch": 0.0538307309071025, + "grad_norm": 1.2981534004211426, + "learning_rate": 9.936243665242948e-05, + "loss": 1.1405, + "step": 886 + }, + { + "epoch": 0.053891487939729026, + "grad_norm": 0.4661676287651062, + "learning_rate": 9.936091190361772e-05, + "loss": 1.1215, + "step": 887 + }, + { + "epoch": 0.053952244972355554, + "grad_norm": 0.20790480077266693, + "learning_rate": 9.93593853454722e-05, + "loss": 1.2572, + "step": 888 + }, + { + "epoch": 0.054013002004982075, + "grad_norm": 0.23346084356307983, + "learning_rate": 9.935785697804887e-05, + "loss": 1.1955, + "step": 889 + }, + { + "epoch": 0.0540737590376086, + "grad_norm": 0.31828176975250244, + "learning_rate": 9.935632680140374e-05, + "loss": 1.1165, + "step": 890 + }, + { + "epoch": 0.05413451607023513, + "grad_norm": 0.2110651433467865, + "learning_rate": 9.935479481559291e-05, + "loss": 1.2794, + "step": 891 + }, + { + "epoch": 0.05419527310286166, + "grad_norm": 0.2790307402610779, + "learning_rate": 9.935326102067255e-05, + "loss": 1.131, + "step": 892 + }, + { + "epoch": 0.05425603013548818, + "grad_norm": 0.2054932862520218, + "learning_rate": 9.935172541669886e-05, + "loss": 1.0983, + "step": 893 + }, + { + "epoch": 0.05431678716811471, + "grad_norm": 0.23212817311286926, + "learning_rate": 9.935018800372813e-05, + "loss": 1.1096, + "step": 894 + }, + { + "epoch": 0.054377544200741236, + "grad_norm": 0.19015608727931976, + "learning_rate": 9.934864878181673e-05, + "loss": 1.1842, + "step": 895 + }, + { + "epoch": 0.054438301233367764, + "grad_norm": 0.48252880573272705, + "learning_rate": 9.934710775102107e-05, + "loss": 1.2139, + "step": 896 + }, + { + "epoch": 0.05449905826599429, + "grad_norm": 0.2064215987920761, + "learning_rate": 9.934556491139764e-05, + "loss": 1.2348, + "step": 897 + }, + { + "epoch": 0.05455981529862081, + "grad_norm": 0.15281763672828674, + "learning_rate": 9.934402026300298e-05, + "loss": 1.1138, + "step": 898 + }, + { + "epoch": 0.05462057233124734, + "grad_norm": 0.32750558853149414, + "learning_rate": 9.934247380589371e-05, + "loss": 1.2344, + "step": 899 + }, + { + "epoch": 0.05468132936387387, + "grad_norm": 0.21798034012317657, + "learning_rate": 9.934092554012655e-05, + "loss": 1.1157, + "step": 900 + }, + { + "epoch": 0.0547420863965004, + "grad_norm": 0.20733627676963806, + "learning_rate": 9.933937546575822e-05, + "loss": 1.132, + "step": 901 + }, + { + "epoch": 0.05480284342912692, + "grad_norm": 0.2235470861196518, + "learning_rate": 9.933782358284553e-05, + "loss": 1.1446, + "step": 902 + }, + { + "epoch": 0.054863600461753446, + "grad_norm": 0.1842838078737259, + "learning_rate": 9.93362698914454e-05, + "loss": 1.1237, + "step": 903 + }, + { + "epoch": 0.054924357494379974, + "grad_norm": 0.42488864064216614, + "learning_rate": 9.933471439161475e-05, + "loss": 1.1635, + "step": 904 + }, + { + "epoch": 0.0549851145270065, + "grad_norm": 0.15811462700366974, + "learning_rate": 9.93331570834106e-05, + "loss": 1.1065, + "step": 905 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 0.39768218994140625, + "learning_rate": 9.933159796689005e-05, + "loss": 1.2619, + "step": 906 + }, + { + "epoch": 0.05510662859225955, + "grad_norm": 0.29817453026771545, + "learning_rate": 9.933003704211024e-05, + "loss": 1.1998, + "step": 907 + }, + { + "epoch": 0.05516738562488608, + "grad_norm": 0.2882938086986542, + "learning_rate": 9.93284743091284e-05, + "loss": 1.132, + "step": 908 + }, + { + "epoch": 0.05522814265751261, + "grad_norm": 0.21322005987167358, + "learning_rate": 9.932690976800177e-05, + "loss": 1.1996, + "step": 909 + }, + { + "epoch": 0.055288899690139136, + "grad_norm": 0.29061540961265564, + "learning_rate": 9.932534341878774e-05, + "loss": 1.3895, + "step": 910 + }, + { + "epoch": 0.05534965672276566, + "grad_norm": 0.37948229908943176, + "learning_rate": 9.932377526154373e-05, + "loss": 1.2223, + "step": 911 + }, + { + "epoch": 0.055410413755392185, + "grad_norm": 0.17468896508216858, + "learning_rate": 9.932220529632719e-05, + "loss": 1.0767, + "step": 912 + }, + { + "epoch": 0.05547117078801871, + "grad_norm": 0.432624876499176, + "learning_rate": 9.932063352319567e-05, + "loss": 1.1254, + "step": 913 + }, + { + "epoch": 0.05553192782064524, + "grad_norm": 0.25776907801628113, + "learning_rate": 9.931905994220681e-05, + "loss": 1.2517, + "step": 914 + }, + { + "epoch": 0.05559268485327177, + "grad_norm": 0.19674532115459442, + "learning_rate": 9.931748455341828e-05, + "loss": 1.1216, + "step": 915 + }, + { + "epoch": 0.05565344188589829, + "grad_norm": 0.23850758373737335, + "learning_rate": 9.93159073568878e-05, + "loss": 1.1144, + "step": 916 + }, + { + "epoch": 0.05571419891852482, + "grad_norm": 0.18916510045528412, + "learning_rate": 9.931432835267322e-05, + "loss": 1.1824, + "step": 917 + }, + { + "epoch": 0.055774955951151346, + "grad_norm": 0.35642287135124207, + "learning_rate": 9.931274754083239e-05, + "loss": 1.1282, + "step": 918 + }, + { + "epoch": 0.055835712983777874, + "grad_norm": 0.22358252108097076, + "learning_rate": 9.931116492142328e-05, + "loss": 1.1392, + "step": 919 + }, + { + "epoch": 0.055896470016404395, + "grad_norm": 0.35258379578590393, + "learning_rate": 9.930958049450389e-05, + "loss": 1.195, + "step": 920 + }, + { + "epoch": 0.05595722704903092, + "grad_norm": 2.5283114910125732, + "learning_rate": 9.930799426013229e-05, + "loss": 1.1529, + "step": 921 + }, + { + "epoch": 0.05601798408165745, + "grad_norm": 0.38866716623306274, + "learning_rate": 9.930640621836662e-05, + "loss": 1.122, + "step": 922 + }, + { + "epoch": 0.05607874111428398, + "grad_norm": 0.2504761815071106, + "learning_rate": 9.93048163692651e-05, + "loss": 1.1247, + "step": 923 + }, + { + "epoch": 0.05613949814691051, + "grad_norm": 0.31465429067611694, + "learning_rate": 9.930322471288603e-05, + "loss": 1.1066, + "step": 924 + }, + { + "epoch": 0.05620025517953703, + "grad_norm": 0.38542166352272034, + "learning_rate": 9.93016312492877e-05, + "loss": 1.1648, + "step": 925 + }, + { + "epoch": 0.05626101221216356, + "grad_norm": 0.2616014778614044, + "learning_rate": 9.930003597852855e-05, + "loss": 1.1363, + "step": 926 + }, + { + "epoch": 0.056321769244790085, + "grad_norm": 0.4364415109157562, + "learning_rate": 9.929843890066705e-05, + "loss": 1.2609, + "step": 927 + }, + { + "epoch": 0.05638252627741661, + "grad_norm": 0.9745305776596069, + "learning_rate": 9.929684001576176e-05, + "loss": 1.2179, + "step": 928 + }, + { + "epoch": 0.05644328331004314, + "grad_norm": 0.28503185510635376, + "learning_rate": 9.929523932387126e-05, + "loss": 1.0999, + "step": 929 + }, + { + "epoch": 0.05650404034266966, + "grad_norm": 0.2756706476211548, + "learning_rate": 9.929363682505424e-05, + "loss": 1.2154, + "step": 930 + }, + { + "epoch": 0.05656479737529619, + "grad_norm": 0.22424040734767914, + "learning_rate": 9.929203251936942e-05, + "loss": 1.1516, + "step": 931 + }, + { + "epoch": 0.05662555440792272, + "grad_norm": 0.2764267027378082, + "learning_rate": 9.929042640687565e-05, + "loss": 1.1456, + "step": 932 + }, + { + "epoch": 0.056686311440549246, + "grad_norm": 0.2318948656320572, + "learning_rate": 9.928881848763174e-05, + "loss": 1.2542, + "step": 933 + }, + { + "epoch": 0.05674706847317577, + "grad_norm": 0.1942901760339737, + "learning_rate": 9.928720876169668e-05, + "loss": 1.1262, + "step": 934 + }, + { + "epoch": 0.056807825505802295, + "grad_norm": 0.2438351958990097, + "learning_rate": 9.928559722912944e-05, + "loss": 1.1353, + "step": 935 + }, + { + "epoch": 0.05686858253842882, + "grad_norm": 0.2117772400379181, + "learning_rate": 9.928398388998911e-05, + "loss": 1.1261, + "step": 936 + }, + { + "epoch": 0.05692933957105535, + "grad_norm": 2.341944932937622, + "learning_rate": 9.928236874433485e-05, + "loss": 1.1439, + "step": 937 + }, + { + "epoch": 0.05699009660368188, + "grad_norm": 0.3281532824039459, + "learning_rate": 9.92807517922258e-05, + "loss": 1.1972, + "step": 938 + }, + { + "epoch": 0.0570508536363084, + "grad_norm": 0.17223651707172394, + "learning_rate": 9.92791330337213e-05, + "loss": 1.0931, + "step": 939 + }, + { + "epoch": 0.05711161066893493, + "grad_norm": 0.5078747868537903, + "learning_rate": 9.927751246888064e-05, + "loss": 1.4061, + "step": 940 + }, + { + "epoch": 0.057172367701561456, + "grad_norm": 0.2017948180437088, + "learning_rate": 9.927589009776322e-05, + "loss": 1.1117, + "step": 941 + }, + { + "epoch": 0.057233124734187985, + "grad_norm": 0.2222599983215332, + "learning_rate": 9.927426592042854e-05, + "loss": 1.1915, + "step": 942 + }, + { + "epoch": 0.057293881766814506, + "grad_norm": 0.19689585268497467, + "learning_rate": 9.927263993693612e-05, + "loss": 1.1632, + "step": 943 + }, + { + "epoch": 0.057354638799441034, + "grad_norm": 0.38269299268722534, + "learning_rate": 9.927101214734555e-05, + "loss": 1.2244, + "step": 944 + }, + { + "epoch": 0.05741539583206756, + "grad_norm": 0.2522115707397461, + "learning_rate": 9.926938255171652e-05, + "loss": 1.2768, + "step": 945 + }, + { + "epoch": 0.05747615286469409, + "grad_norm": 0.22336071729660034, + "learning_rate": 9.926775115010873e-05, + "loss": 1.1996, + "step": 946 + }, + { + "epoch": 0.05753690989732062, + "grad_norm": 0.27844613790512085, + "learning_rate": 9.9266117942582e-05, + "loss": 1.1432, + "step": 947 + }, + { + "epoch": 0.05759766692994714, + "grad_norm": 0.23788277804851532, + "learning_rate": 9.926448292919622e-05, + "loss": 1.2991, + "step": 948 + }, + { + "epoch": 0.05765842396257367, + "grad_norm": 0.24779294431209564, + "learning_rate": 9.926284611001126e-05, + "loss": 1.2684, + "step": 949 + }, + { + "epoch": 0.057719180995200195, + "grad_norm": 0.7948906421661377, + "learning_rate": 9.926120748508719e-05, + "loss": 1.3878, + "step": 950 + }, + { + "epoch": 0.05777993802782672, + "grad_norm": 0.24091163277626038, + "learning_rate": 9.925956705448401e-05, + "loss": 1.1803, + "step": 951 + }, + { + "epoch": 0.057840695060453244, + "grad_norm": 0.4433269500732422, + "learning_rate": 9.925792481826188e-05, + "loss": 1.2907, + "step": 952 + }, + { + "epoch": 0.05790145209307977, + "grad_norm": 0.1771276295185089, + "learning_rate": 9.9256280776481e-05, + "loss": 1.1229, + "step": 953 + }, + { + "epoch": 0.0579622091257063, + "grad_norm": 0.17962491512298584, + "learning_rate": 9.925463492920163e-05, + "loss": 1.1335, + "step": 954 + }, + { + "epoch": 0.05802296615833283, + "grad_norm": 0.1824798583984375, + "learning_rate": 9.92529872764841e-05, + "loss": 1.1585, + "step": 955 + }, + { + "epoch": 0.058083723190959356, + "grad_norm": 0.34185677766799927, + "learning_rate": 9.925133781838878e-05, + "loss": 1.1331, + "step": 956 + }, + { + "epoch": 0.05814448022358588, + "grad_norm": 0.1740175038576126, + "learning_rate": 9.924968655497617e-05, + "loss": 1.1418, + "step": 957 + }, + { + "epoch": 0.058205237256212405, + "grad_norm": 0.4179982841014862, + "learning_rate": 9.924803348630678e-05, + "loss": 1.2057, + "step": 958 + }, + { + "epoch": 0.05826599428883893, + "grad_norm": 0.3665924370288849, + "learning_rate": 9.924637861244121e-05, + "loss": 1.3906, + "step": 959 + }, + { + "epoch": 0.05832675132146546, + "grad_norm": 0.6215278506278992, + "learning_rate": 9.92447219334401e-05, + "loss": 1.269, + "step": 960 + }, + { + "epoch": 0.05838750835409199, + "grad_norm": 0.24217715859413147, + "learning_rate": 9.924306344936418e-05, + "loss": 1.1126, + "step": 961 + }, + { + "epoch": 0.05844826538671851, + "grad_norm": 0.5626233220100403, + "learning_rate": 9.924140316027426e-05, + "loss": 1.277, + "step": 962 + }, + { + "epoch": 0.05850902241934504, + "grad_norm": 0.1770382672548294, + "learning_rate": 9.92397410662312e-05, + "loss": 1.1785, + "step": 963 + }, + { + "epoch": 0.05856977945197157, + "grad_norm": 2.129831314086914, + "learning_rate": 9.92380771672959e-05, + "loss": 1.188, + "step": 964 + }, + { + "epoch": 0.058630536484598095, + "grad_norm": 0.3123109042644501, + "learning_rate": 9.923641146352938e-05, + "loss": 1.2407, + "step": 965 + }, + { + "epoch": 0.058691293517224616, + "grad_norm": 0.22422635555267334, + "learning_rate": 9.923474395499265e-05, + "loss": 1.2625, + "step": 966 + }, + { + "epoch": 0.058752050549851144, + "grad_norm": 0.6242901682853699, + "learning_rate": 9.923307464174689e-05, + "loss": 1.1412, + "step": 967 + }, + { + "epoch": 0.05881280758247767, + "grad_norm": 0.17358358204364777, + "learning_rate": 9.923140352385325e-05, + "loss": 1.0924, + "step": 968 + }, + { + "epoch": 0.0588735646151042, + "grad_norm": 0.20891009271144867, + "learning_rate": 9.922973060137301e-05, + "loss": 1.17, + "step": 969 + }, + { + "epoch": 0.05893432164773073, + "grad_norm": 0.4106835126876831, + "learning_rate": 9.922805587436748e-05, + "loss": 1.1235, + "step": 970 + }, + { + "epoch": 0.05899507868035725, + "grad_norm": 0.23005026578903198, + "learning_rate": 9.922637934289802e-05, + "loss": 1.1308, + "step": 971 + }, + { + "epoch": 0.05905583571298378, + "grad_norm": 0.19174528121948242, + "learning_rate": 9.922470100702615e-05, + "loss": 1.1512, + "step": 972 + }, + { + "epoch": 0.059116592745610305, + "grad_norm": 0.1874125599861145, + "learning_rate": 9.922302086681332e-05, + "loss": 1.1523, + "step": 973 + }, + { + "epoch": 0.05917734977823683, + "grad_norm": 0.5835013389587402, + "learning_rate": 9.922133892232115e-05, + "loss": 1.1181, + "step": 974 + }, + { + "epoch": 0.059238106810863354, + "grad_norm": 0.2347194254398346, + "learning_rate": 9.921965517361129e-05, + "loss": 1.1478, + "step": 975 + }, + { + "epoch": 0.05929886384348988, + "grad_norm": 0.40479937195777893, + "learning_rate": 9.921796962074545e-05, + "loss": 1.1101, + "step": 976 + }, + { + "epoch": 0.05935962087611641, + "grad_norm": 0.6876659393310547, + "learning_rate": 9.921628226378543e-05, + "loss": 1.1141, + "step": 977 + }, + { + "epoch": 0.05942037790874294, + "grad_norm": 0.3584234416484833, + "learning_rate": 9.921459310279306e-05, + "loss": 1.2632, + "step": 978 + }, + { + "epoch": 0.059481134941369466, + "grad_norm": 0.2755407989025116, + "learning_rate": 9.921290213783027e-05, + "loss": 1.2233, + "step": 979 + }, + { + "epoch": 0.05954189197399599, + "grad_norm": 0.3321934640407562, + "learning_rate": 9.921120936895904e-05, + "loss": 1.1421, + "step": 980 + }, + { + "epoch": 0.059602649006622516, + "grad_norm": 0.4014555513858795, + "learning_rate": 9.920951479624143e-05, + "loss": 1.1824, + "step": 981 + }, + { + "epoch": 0.059663406039249044, + "grad_norm": 0.24272578954696655, + "learning_rate": 9.920781841973955e-05, + "loss": 1.1869, + "step": 982 + }, + { + "epoch": 0.05972416307187557, + "grad_norm": 0.24495117366313934, + "learning_rate": 9.920612023951554e-05, + "loss": 1.1139, + "step": 983 + }, + { + "epoch": 0.05978492010450209, + "grad_norm": 1.0873308181762695, + "learning_rate": 9.920442025563171e-05, + "loss": 1.2191, + "step": 984 + }, + { + "epoch": 0.05984567713712862, + "grad_norm": 0.20630523562431335, + "learning_rate": 9.920271846815032e-05, + "loss": 1.1433, + "step": 985 + }, + { + "epoch": 0.05990643416975515, + "grad_norm": 0.36086443066596985, + "learning_rate": 9.92010148771338e-05, + "loss": 1.2207, + "step": 986 + }, + { + "epoch": 0.05996719120238168, + "grad_norm": 0.2526913583278656, + "learning_rate": 9.919930948264456e-05, + "loss": 1.1345, + "step": 987 + }, + { + "epoch": 0.060027948235008205, + "grad_norm": 0.3324904143810272, + "learning_rate": 9.91976022847451e-05, + "loss": 1.105, + "step": 988 + }, + { + "epoch": 0.060088705267634726, + "grad_norm": 0.2521546185016632, + "learning_rate": 9.919589328349805e-05, + "loss": 1.2172, + "step": 989 + }, + { + "epoch": 0.060149462300261254, + "grad_norm": 0.2850136160850525, + "learning_rate": 9.919418247896602e-05, + "loss": 1.3185, + "step": 990 + }, + { + "epoch": 0.06021021933288778, + "grad_norm": 0.24153576791286469, + "learning_rate": 9.919246987121169e-05, + "loss": 1.1532, + "step": 991 + }, + { + "epoch": 0.06027097636551431, + "grad_norm": 0.17544732987880707, + "learning_rate": 9.91907554602979e-05, + "loss": 1.0686, + "step": 992 + }, + { + "epoch": 0.06033173339814084, + "grad_norm": 1.1323429346084595, + "learning_rate": 9.918903924628744e-05, + "loss": 1.1567, + "step": 993 + }, + { + "epoch": 0.06039249043076736, + "grad_norm": 0.23179759085178375, + "learning_rate": 9.918732122924326e-05, + "loss": 1.1906, + "step": 994 + }, + { + "epoch": 0.06045324746339389, + "grad_norm": 0.20516924560070038, + "learning_rate": 9.91856014092283e-05, + "loss": 1.1259, + "step": 995 + }, + { + "epoch": 0.060514004496020415, + "grad_norm": 0.18624304234981537, + "learning_rate": 9.91838797863056e-05, + "loss": 1.1802, + "step": 996 + }, + { + "epoch": 0.06057476152864694, + "grad_norm": 0.4013606607913971, + "learning_rate": 9.91821563605383e-05, + "loss": 1.0975, + "step": 997 + }, + { + "epoch": 0.060635518561273465, + "grad_norm": 0.5464007258415222, + "learning_rate": 9.918043113198955e-05, + "loss": 1.3132, + "step": 998 + }, + { + "epoch": 0.06069627559389999, + "grad_norm": 0.18759983777999878, + "learning_rate": 9.917870410072258e-05, + "loss": 1.2185, + "step": 999 + }, + { + "epoch": 0.06075703262652652, + "grad_norm": 0.1758163571357727, + "learning_rate": 9.917697526680072e-05, + "loss": 1.1618, + "step": 1000 + }, + { + "epoch": 0.06081778965915305, + "grad_norm": 0.18332824110984802, + "learning_rate": 9.917524463028731e-05, + "loss": 1.1097, + "step": 1001 + }, + { + "epoch": 0.06087854669177958, + "grad_norm": 0.16281358897686005, + "learning_rate": 9.91735121912458e-05, + "loss": 1.139, + "step": 1002 + }, + { + "epoch": 0.0609393037244061, + "grad_norm": 0.1521851271390915, + "learning_rate": 9.91717779497397e-05, + "loss": 1.1511, + "step": 1003 + }, + { + "epoch": 0.061000060757032626, + "grad_norm": 0.17558541893959045, + "learning_rate": 9.917004190583259e-05, + "loss": 1.0927, + "step": 1004 + }, + { + "epoch": 0.061060817789659154, + "grad_norm": 0.17117518186569214, + "learning_rate": 9.916830405958808e-05, + "loss": 1.1405, + "step": 1005 + }, + { + "epoch": 0.06112157482228568, + "grad_norm": 0.3612566888332367, + "learning_rate": 9.916656441106989e-05, + "loss": 1.2507, + "step": 1006 + }, + { + "epoch": 0.0611823318549122, + "grad_norm": 0.17844192683696747, + "learning_rate": 9.916482296034175e-05, + "loss": 1.1921, + "step": 1007 + }, + { + "epoch": 0.06124308888753873, + "grad_norm": 0.29219844937324524, + "learning_rate": 9.916307970746754e-05, + "loss": 1.2054, + "step": 1008 + }, + { + "epoch": 0.06130384592016526, + "grad_norm": 0.16926388442516327, + "learning_rate": 9.916133465251114e-05, + "loss": 1.1033, + "step": 1009 + }, + { + "epoch": 0.06136460295279179, + "grad_norm": 0.22389206290245056, + "learning_rate": 9.915958779553652e-05, + "loss": 1.1324, + "step": 1010 + }, + { + "epoch": 0.061425359985418315, + "grad_norm": 0.21128493547439575, + "learning_rate": 9.91578391366077e-05, + "loss": 1.2314, + "step": 1011 + }, + { + "epoch": 0.061486117018044836, + "grad_norm": 0.20081967115402222, + "learning_rate": 9.915608867578878e-05, + "loss": 1.1144, + "step": 1012 + }, + { + "epoch": 0.061546874050671364, + "grad_norm": 0.25086820125579834, + "learning_rate": 9.915433641314394e-05, + "loss": 1.1498, + "step": 1013 + }, + { + "epoch": 0.06160763108329789, + "grad_norm": 0.14907516539096832, + "learning_rate": 9.915258234873739e-05, + "loss": 1.1583, + "step": 1014 + }, + { + "epoch": 0.06166838811592442, + "grad_norm": 0.2024078518152237, + "learning_rate": 9.915082648263343e-05, + "loss": 1.1158, + "step": 1015 + }, + { + "epoch": 0.06172914514855094, + "grad_norm": 0.29856154322624207, + "learning_rate": 9.914906881489643e-05, + "loss": 1.2516, + "step": 1016 + }, + { + "epoch": 0.06178990218117747, + "grad_norm": 5.346932888031006, + "learning_rate": 9.914730934559083e-05, + "loss": 1.0974, + "step": 1017 + }, + { + "epoch": 0.061850659213804, + "grad_norm": 0.35687682032585144, + "learning_rate": 9.914554807478108e-05, + "loss": 1.1446, + "step": 1018 + }, + { + "epoch": 0.061911416246430526, + "grad_norm": 0.1732306331396103, + "learning_rate": 9.914378500253176e-05, + "loss": 1.2398, + "step": 1019 + }, + { + "epoch": 0.061972173279057054, + "grad_norm": 0.2185167372226715, + "learning_rate": 9.914202012890751e-05, + "loss": 1.14, + "step": 1020 + }, + { + "epoch": 0.062032930311683575, + "grad_norm": 0.19063043594360352, + "learning_rate": 9.914025345397304e-05, + "loss": 1.1288, + "step": 1021 + }, + { + "epoch": 0.0620936873443101, + "grad_norm": 0.14763686060905457, + "learning_rate": 9.913848497779306e-05, + "loss": 1.1335, + "step": 1022 + }, + { + "epoch": 0.06215444437693663, + "grad_norm": 0.23980973660945892, + "learning_rate": 9.91367147004324e-05, + "loss": 1.183, + "step": 1023 + }, + { + "epoch": 0.06221520140956316, + "grad_norm": 0.212006077170372, + "learning_rate": 9.913494262195597e-05, + "loss": 1.2123, + "step": 1024 + }, + { + "epoch": 0.06227595844218968, + "grad_norm": 5.338240623474121, + "learning_rate": 9.913316874242874e-05, + "loss": 1.1678, + "step": 1025 + }, + { + "epoch": 0.06233671547481621, + "grad_norm": 0.25872495770454407, + "learning_rate": 9.91313930619157e-05, + "loss": 1.2411, + "step": 1026 + }, + { + "epoch": 0.062397472507442736, + "grad_norm": 0.3396739363670349, + "learning_rate": 9.912961558048195e-05, + "loss": 1.1413, + "step": 1027 + }, + { + "epoch": 0.062458229540069264, + "grad_norm": 0.22687095403671265, + "learning_rate": 9.912783629819264e-05, + "loss": 1.2663, + "step": 1028 + }, + { + "epoch": 0.06251898657269579, + "grad_norm": 0.43820720911026, + "learning_rate": 9.912605521511301e-05, + "loss": 1.2936, + "step": 1029 + }, + { + "epoch": 0.06257974360532231, + "grad_norm": 0.30604350566864014, + "learning_rate": 9.912427233130832e-05, + "loss": 1.1777, + "step": 1030 + }, + { + "epoch": 0.06264050063794885, + "grad_norm": 0.2056499719619751, + "learning_rate": 9.912248764684393e-05, + "loss": 1.1757, + "step": 1031 + }, + { + "epoch": 0.06270125767057537, + "grad_norm": 0.43428412079811096, + "learning_rate": 9.912070116178526e-05, + "loss": 1.0841, + "step": 1032 + }, + { + "epoch": 0.06276201470320189, + "grad_norm": 0.219242662191391, + "learning_rate": 9.91189128761978e-05, + "loss": 1.138, + "step": 1033 + }, + { + "epoch": 0.06282277173582843, + "grad_norm": 0.20234309136867523, + "learning_rate": 9.911712279014709e-05, + "loss": 1.1664, + "step": 1034 + }, + { + "epoch": 0.06288352876845495, + "grad_norm": 0.2271445393562317, + "learning_rate": 9.911533090369876e-05, + "loss": 1.323, + "step": 1035 + }, + { + "epoch": 0.06294428580108148, + "grad_norm": 0.2031880021095276, + "learning_rate": 9.911353721691848e-05, + "loss": 1.1477, + "step": 1036 + }, + { + "epoch": 0.063005042833708, + "grad_norm": 0.1386653035879135, + "learning_rate": 9.911174172987199e-05, + "loss": 1.111, + "step": 1037 + }, + { + "epoch": 0.06306579986633452, + "grad_norm": 0.23107148706912994, + "learning_rate": 9.910994444262512e-05, + "loss": 1.2201, + "step": 1038 + }, + { + "epoch": 0.06312655689896106, + "grad_norm": 0.14069907367229462, + "learning_rate": 9.910814535524372e-05, + "loss": 1.1235, + "step": 1039 + }, + { + "epoch": 0.06318731393158758, + "grad_norm": 0.18750937283039093, + "learning_rate": 9.910634446779379e-05, + "loss": 1.1225, + "step": 1040 + }, + { + "epoch": 0.0632480709642141, + "grad_norm": 0.2907664179801941, + "learning_rate": 9.91045417803413e-05, + "loss": 1.1408, + "step": 1041 + }, + { + "epoch": 0.06330882799684064, + "grad_norm": 0.22893452644348145, + "learning_rate": 9.910273729295235e-05, + "loss": 1.1074, + "step": 1042 + }, + { + "epoch": 0.06336958502946716, + "grad_norm": 0.7455631494522095, + "learning_rate": 9.910093100569305e-05, + "loss": 1.1824, + "step": 1043 + }, + { + "epoch": 0.06343034206209369, + "grad_norm": 0.17006048560142517, + "learning_rate": 9.909912291862966e-05, + "loss": 1.1108, + "step": 1044 + }, + { + "epoch": 0.06349109909472021, + "grad_norm": 0.17600016295909882, + "learning_rate": 9.909731303182841e-05, + "loss": 1.1277, + "step": 1045 + }, + { + "epoch": 0.06355185612734673, + "grad_norm": 0.2076251208782196, + "learning_rate": 9.909550134535566e-05, + "loss": 1.1285, + "step": 1046 + }, + { + "epoch": 0.06361261315997327, + "grad_norm": 0.2192397117614746, + "learning_rate": 9.909368785927783e-05, + "loss": 1.4415, + "step": 1047 + }, + { + "epoch": 0.06367337019259979, + "grad_norm": 0.4094569683074951, + "learning_rate": 9.909187257366137e-05, + "loss": 1.1842, + "step": 1048 + }, + { + "epoch": 0.06373412722522633, + "grad_norm": 0.1654943972826004, + "learning_rate": 9.909005548857284e-05, + "loss": 1.1334, + "step": 1049 + }, + { + "epoch": 0.06379488425785285, + "grad_norm": 0.31593313813209534, + "learning_rate": 9.908823660407884e-05, + "loss": 1.1099, + "step": 1050 + }, + { + "epoch": 0.06385564129047937, + "grad_norm": 0.5052706003189087, + "learning_rate": 9.908641592024603e-05, + "loss": 1.1977, + "step": 1051 + }, + { + "epoch": 0.0639163983231059, + "grad_norm": 0.17280252277851105, + "learning_rate": 9.908459343714116e-05, + "loss": 1.1342, + "step": 1052 + }, + { + "epoch": 0.06397715535573242, + "grad_norm": 0.34011808037757874, + "learning_rate": 9.908276915483104e-05, + "loss": 1.0999, + "step": 1053 + }, + { + "epoch": 0.06403791238835896, + "grad_norm": 0.20396362245082855, + "learning_rate": 9.908094307338251e-05, + "loss": 1.1962, + "step": 1054 + }, + { + "epoch": 0.06409866942098548, + "grad_norm": 0.21117782592773438, + "learning_rate": 9.907911519286254e-05, + "loss": 1.1473, + "step": 1055 + }, + { + "epoch": 0.064159426453612, + "grad_norm": 0.22127699851989746, + "learning_rate": 9.90772855133381e-05, + "loss": 1.1186, + "step": 1056 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 0.2519165575504303, + "learning_rate": 9.907545403487629e-05, + "loss": 1.2315, + "step": 1057 + }, + { + "epoch": 0.06428094051886506, + "grad_norm": 0.17791754007339478, + "learning_rate": 9.907362075754422e-05, + "loss": 1.1335, + "step": 1058 + }, + { + "epoch": 0.06434169755149159, + "grad_norm": 0.45089244842529297, + "learning_rate": 9.907178568140909e-05, + "loss": 1.1355, + "step": 1059 + }, + { + "epoch": 0.06440245458411811, + "grad_norm": 0.19349536299705505, + "learning_rate": 9.906994880653818e-05, + "loss": 1.1226, + "step": 1060 + }, + { + "epoch": 0.06446321161674463, + "grad_norm": 0.41839855909347534, + "learning_rate": 9.90681101329988e-05, + "loss": 1.308, + "step": 1061 + }, + { + "epoch": 0.06452396864937117, + "grad_norm": 0.2293994426727295, + "learning_rate": 9.906626966085837e-05, + "loss": 1.1184, + "step": 1062 + }, + { + "epoch": 0.06458472568199769, + "grad_norm": 0.19593217968940735, + "learning_rate": 9.906442739018433e-05, + "loss": 1.1379, + "step": 1063 + }, + { + "epoch": 0.06464548271462421, + "grad_norm": 0.26556774973869324, + "learning_rate": 9.906258332104424e-05, + "loss": 1.1278, + "step": 1064 + }, + { + "epoch": 0.06470623974725075, + "grad_norm": 0.37029293179512024, + "learning_rate": 9.906073745350565e-05, + "loss": 1.1897, + "step": 1065 + }, + { + "epoch": 0.06476699677987727, + "grad_norm": 0.29654473066329956, + "learning_rate": 9.905888978763627e-05, + "loss": 1.1712, + "step": 1066 + }, + { + "epoch": 0.0648277538125038, + "grad_norm": 0.24746425449848175, + "learning_rate": 9.905704032350378e-05, + "loss": 1.1696, + "step": 1067 + }, + { + "epoch": 0.06488851084513032, + "grad_norm": 0.2603473961353302, + "learning_rate": 9.9055189061176e-05, + "loss": 1.1313, + "step": 1068 + }, + { + "epoch": 0.06494926787775684, + "grad_norm": 0.21180987358093262, + "learning_rate": 9.905333600072079e-05, + "loss": 1.1054, + "step": 1069 + }, + { + "epoch": 0.06501002491038338, + "grad_norm": 0.23979009687900543, + "learning_rate": 9.905148114220606e-05, + "loss": 1.2503, + "step": 1070 + }, + { + "epoch": 0.0650707819430099, + "grad_norm": 0.19146323204040527, + "learning_rate": 9.904962448569981e-05, + "loss": 1.1472, + "step": 1071 + }, + { + "epoch": 0.06513153897563644, + "grad_norm": 0.1779710352420807, + "learning_rate": 9.904776603127009e-05, + "loss": 1.1426, + "step": 1072 + }, + { + "epoch": 0.06519229600826296, + "grad_norm": 0.21693828701972961, + "learning_rate": 9.904590577898503e-05, + "loss": 1.157, + "step": 1073 + }, + { + "epoch": 0.06525305304088948, + "grad_norm": 0.16904886066913605, + "learning_rate": 9.904404372891281e-05, + "loss": 1.0952, + "step": 1074 + }, + { + "epoch": 0.06531381007351601, + "grad_norm": 0.18482241034507751, + "learning_rate": 9.90421798811217e-05, + "loss": 1.1501, + "step": 1075 + }, + { + "epoch": 0.06537456710614253, + "grad_norm": 0.19291286170482635, + "learning_rate": 9.904031423567998e-05, + "loss": 1.1276, + "step": 1076 + }, + { + "epoch": 0.06543532413876907, + "grad_norm": 0.16024720668792725, + "learning_rate": 9.903844679265608e-05, + "loss": 1.0988, + "step": 1077 + }, + { + "epoch": 0.06549608117139559, + "grad_norm": 0.27875852584838867, + "learning_rate": 9.903657755211844e-05, + "loss": 1.2271, + "step": 1078 + }, + { + "epoch": 0.06555683820402211, + "grad_norm": 0.31460678577423096, + "learning_rate": 9.903470651413555e-05, + "loss": 1.2298, + "step": 1079 + }, + { + "epoch": 0.06561759523664865, + "grad_norm": 0.35740309953689575, + "learning_rate": 9.903283367877603e-05, + "loss": 1.1621, + "step": 1080 + }, + { + "epoch": 0.06567835226927517, + "grad_norm": 0.4617520570755005, + "learning_rate": 9.903095904610851e-05, + "loss": 1.1541, + "step": 1081 + }, + { + "epoch": 0.0657391093019017, + "grad_norm": 0.2352830469608307, + "learning_rate": 9.902908261620172e-05, + "loss": 1.1506, + "step": 1082 + }, + { + "epoch": 0.06579986633452822, + "grad_norm": 0.5313760638237, + "learning_rate": 9.902720438912441e-05, + "loss": 1.317, + "step": 1083 + }, + { + "epoch": 0.06586062336715474, + "grad_norm": 0.332553505897522, + "learning_rate": 9.902532436494547e-05, + "loss": 1.1067, + "step": 1084 + }, + { + "epoch": 0.06592138039978128, + "grad_norm": 0.20040388405323029, + "learning_rate": 9.902344254373377e-05, + "loss": 1.1594, + "step": 1085 + }, + { + "epoch": 0.0659821374324078, + "grad_norm": 0.2770216763019562, + "learning_rate": 9.902155892555831e-05, + "loss": 1.2575, + "step": 1086 + }, + { + "epoch": 0.06604289446503432, + "grad_norm": 0.3603607416152954, + "learning_rate": 9.901967351048813e-05, + "loss": 1.1843, + "step": 1087 + }, + { + "epoch": 0.06610365149766086, + "grad_norm": 0.16854630410671234, + "learning_rate": 9.901778629859235e-05, + "loss": 1.1679, + "step": 1088 + }, + { + "epoch": 0.06616440853028738, + "grad_norm": 0.3734396696090698, + "learning_rate": 9.901589728994014e-05, + "loss": 1.1973, + "step": 1089 + }, + { + "epoch": 0.06622516556291391, + "grad_norm": 0.396348237991333, + "learning_rate": 9.901400648460074e-05, + "loss": 1.1766, + "step": 1090 + }, + { + "epoch": 0.06628592259554043, + "grad_norm": 0.2696155607700348, + "learning_rate": 9.901211388264345e-05, + "loss": 1.1673, + "step": 1091 + }, + { + "epoch": 0.06634667962816695, + "grad_norm": 0.3086203634738922, + "learning_rate": 9.901021948413765e-05, + "loss": 1.1411, + "step": 1092 + }, + { + "epoch": 0.06640743666079349, + "grad_norm": 0.17181465029716492, + "learning_rate": 9.90083232891528e-05, + "loss": 1.1331, + "step": 1093 + }, + { + "epoch": 0.06646819369342001, + "grad_norm": 0.18774396181106567, + "learning_rate": 9.900642529775836e-05, + "loss": 1.1222, + "step": 1094 + }, + { + "epoch": 0.06652895072604655, + "grad_norm": 0.4674195349216461, + "learning_rate": 9.900452551002395e-05, + "loss": 1.4539, + "step": 1095 + }, + { + "epoch": 0.06658970775867307, + "grad_norm": 0.2495420128107071, + "learning_rate": 9.900262392601918e-05, + "loss": 1.2424, + "step": 1096 + }, + { + "epoch": 0.06665046479129959, + "grad_norm": 0.22591935098171234, + "learning_rate": 9.900072054581375e-05, + "loss": 1.1439, + "step": 1097 + }, + { + "epoch": 0.06671122182392612, + "grad_norm": 0.2648542523384094, + "learning_rate": 9.899881536947744e-05, + "loss": 1.2879, + "step": 1098 + }, + { + "epoch": 0.06677197885655264, + "grad_norm": 0.2803839147090912, + "learning_rate": 9.899690839708008e-05, + "loss": 1.2386, + "step": 1099 + }, + { + "epoch": 0.06683273588917918, + "grad_norm": 0.18792013823986053, + "learning_rate": 9.899499962869157e-05, + "loss": 1.224, + "step": 1100 + }, + { + "epoch": 0.0668934929218057, + "grad_norm": 0.211750790476799, + "learning_rate": 9.899308906438189e-05, + "loss": 1.2128, + "step": 1101 + }, + { + "epoch": 0.06695424995443222, + "grad_norm": 0.2044948935508728, + "learning_rate": 9.899117670422104e-05, + "loss": 1.122, + "step": 1102 + }, + { + "epoch": 0.06701500698705876, + "grad_norm": 0.2009739726781845, + "learning_rate": 9.898926254827916e-05, + "loss": 1.2161, + "step": 1103 + }, + { + "epoch": 0.06707576401968528, + "grad_norm": 0.2129359394311905, + "learning_rate": 9.898734659662638e-05, + "loss": 1.1392, + "step": 1104 + }, + { + "epoch": 0.0671365210523118, + "grad_norm": 0.2559523582458496, + "learning_rate": 9.898542884933295e-05, + "loss": 1.1593, + "step": 1105 + }, + { + "epoch": 0.06719727808493833, + "grad_norm": 0.2548045516014099, + "learning_rate": 9.898350930646916e-05, + "loss": 1.2174, + "step": 1106 + }, + { + "epoch": 0.06725803511756485, + "grad_norm": 0.17254139482975006, + "learning_rate": 9.898158796810535e-05, + "loss": 1.1374, + "step": 1107 + }, + { + "epoch": 0.06731879215019139, + "grad_norm": 0.17324994504451752, + "learning_rate": 9.897966483431198e-05, + "loss": 1.1529, + "step": 1108 + }, + { + "epoch": 0.06737954918281791, + "grad_norm": 0.22996747493743896, + "learning_rate": 9.897773990515952e-05, + "loss": 1.1929, + "step": 1109 + }, + { + "epoch": 0.06744030621544443, + "grad_norm": 0.18637020885944366, + "learning_rate": 9.897581318071853e-05, + "loss": 1.1107, + "step": 1110 + }, + { + "epoch": 0.06750106324807097, + "grad_norm": 0.17861661314964294, + "learning_rate": 9.897388466105966e-05, + "loss": 1.1971, + "step": 1111 + }, + { + "epoch": 0.06756182028069749, + "grad_norm": 0.5430263876914978, + "learning_rate": 9.897195434625356e-05, + "loss": 1.206, + "step": 1112 + }, + { + "epoch": 0.06762257731332402, + "grad_norm": 0.16525183618068695, + "learning_rate": 9.897002223637103e-05, + "loss": 1.1195, + "step": 1113 + }, + { + "epoch": 0.06768333434595054, + "grad_norm": 0.17113755643367767, + "learning_rate": 9.896808833148285e-05, + "loss": 1.1206, + "step": 1114 + }, + { + "epoch": 0.06774409137857706, + "grad_norm": 0.20011554658412933, + "learning_rate": 9.896615263165994e-05, + "loss": 1.1031, + "step": 1115 + }, + { + "epoch": 0.0678048484112036, + "grad_norm": 0.2237466424703598, + "learning_rate": 9.896421513697323e-05, + "loss": 1.1701, + "step": 1116 + }, + { + "epoch": 0.06786560544383012, + "grad_norm": 0.7923113107681274, + "learning_rate": 9.896227584749375e-05, + "loss": 1.3552, + "step": 1117 + }, + { + "epoch": 0.06792636247645666, + "grad_norm": 0.16993427276611328, + "learning_rate": 9.89603347632926e-05, + "loss": 1.1334, + "step": 1118 + }, + { + "epoch": 0.06798711950908318, + "grad_norm": 0.4024191200733185, + "learning_rate": 9.89583918844409e-05, + "loss": 1.1952, + "step": 1119 + }, + { + "epoch": 0.0680478765417097, + "grad_norm": 0.5233056545257568, + "learning_rate": 9.895644721100989e-05, + "loss": 1.2922, + "step": 1120 + }, + { + "epoch": 0.06810863357433623, + "grad_norm": 0.24080172181129456, + "learning_rate": 9.895450074307084e-05, + "loss": 1.0954, + "step": 1121 + }, + { + "epoch": 0.06816939060696275, + "grad_norm": 0.2148400843143463, + "learning_rate": 9.89525524806951e-05, + "loss": 1.1461, + "step": 1122 + }, + { + "epoch": 0.06823014763958929, + "grad_norm": 0.21003541350364685, + "learning_rate": 9.895060242395408e-05, + "loss": 1.1404, + "step": 1123 + }, + { + "epoch": 0.06829090467221581, + "grad_norm": 0.27357712388038635, + "learning_rate": 9.894865057291928e-05, + "loss": 1.1488, + "step": 1124 + }, + { + "epoch": 0.06835166170484233, + "grad_norm": 0.19512055814266205, + "learning_rate": 9.894669692766223e-05, + "loss": 1.2444, + "step": 1125 + }, + { + "epoch": 0.06841241873746887, + "grad_norm": 0.2192995697259903, + "learning_rate": 9.894474148825454e-05, + "loss": 1.1388, + "step": 1126 + }, + { + "epoch": 0.06847317577009539, + "grad_norm": 0.6291006207466125, + "learning_rate": 9.89427842547679e-05, + "loss": 1.2148, + "step": 1127 + }, + { + "epoch": 0.06853393280272191, + "grad_norm": 0.22833654284477234, + "learning_rate": 9.894082522727403e-05, + "loss": 1.1385, + "step": 1128 + }, + { + "epoch": 0.06859468983534844, + "grad_norm": 0.389053076505661, + "learning_rate": 9.893886440584476e-05, + "loss": 1.2045, + "step": 1129 + }, + { + "epoch": 0.06865544686797496, + "grad_norm": 0.3422658443450928, + "learning_rate": 9.893690179055194e-05, + "loss": 1.0926, + "step": 1130 + }, + { + "epoch": 0.0687162039006015, + "grad_norm": 0.297583669424057, + "learning_rate": 9.893493738146754e-05, + "loss": 1.2369, + "step": 1131 + }, + { + "epoch": 0.06877696093322802, + "grad_norm": 0.34056973457336426, + "learning_rate": 9.893297117866355e-05, + "loss": 1.1381, + "step": 1132 + }, + { + "epoch": 0.06883771796585454, + "grad_norm": 0.1922212690114975, + "learning_rate": 9.893100318221204e-05, + "loss": 1.1479, + "step": 1133 + }, + { + "epoch": 0.06889847499848108, + "grad_norm": 0.26063457131385803, + "learning_rate": 9.892903339218515e-05, + "loss": 1.1112, + "step": 1134 + }, + { + "epoch": 0.0689592320311076, + "grad_norm": 0.2509213387966156, + "learning_rate": 9.892706180865509e-05, + "loss": 1.1531, + "step": 1135 + }, + { + "epoch": 0.06901998906373413, + "grad_norm": 0.25920236110687256, + "learning_rate": 9.89250884316941e-05, + "loss": 1.2092, + "step": 1136 + }, + { + "epoch": 0.06908074609636065, + "grad_norm": 0.28422781825065613, + "learning_rate": 9.892311326137455e-05, + "loss": 1.1242, + "step": 1137 + }, + { + "epoch": 0.06914150312898718, + "grad_norm": 0.30794256925582886, + "learning_rate": 9.892113629776884e-05, + "loss": 1.2653, + "step": 1138 + }, + { + "epoch": 0.06920226016161371, + "grad_norm": 0.3708270788192749, + "learning_rate": 9.891915754094941e-05, + "loss": 1.1174, + "step": 1139 + }, + { + "epoch": 0.06926301719424023, + "grad_norm": 0.284943550825119, + "learning_rate": 9.891717699098882e-05, + "loss": 1.3187, + "step": 1140 + }, + { + "epoch": 0.06932377422686677, + "grad_norm": 0.15996769070625305, + "learning_rate": 9.891519464795961e-05, + "loss": 1.1042, + "step": 1141 + }, + { + "epoch": 0.06938453125949329, + "grad_norm": 0.24439401924610138, + "learning_rate": 9.891321051193452e-05, + "loss": 1.1945, + "step": 1142 + }, + { + "epoch": 0.06944528829211981, + "grad_norm": 3.1704530715942383, + "learning_rate": 9.891122458298626e-05, + "loss": 1.2771, + "step": 1143 + }, + { + "epoch": 0.06950604532474634, + "grad_norm": 0.47090238332748413, + "learning_rate": 9.890923686118758e-05, + "loss": 1.0811, + "step": 1144 + }, + { + "epoch": 0.06956680235737286, + "grad_norm": 0.194844052195549, + "learning_rate": 9.890724734661137e-05, + "loss": 1.1444, + "step": 1145 + }, + { + "epoch": 0.06962755938999939, + "grad_norm": 0.2976624071598053, + "learning_rate": 9.890525603933057e-05, + "loss": 1.1794, + "step": 1146 + }, + { + "epoch": 0.06968831642262592, + "grad_norm": 0.42262864112854004, + "learning_rate": 9.890326293941813e-05, + "loss": 1.0882, + "step": 1147 + }, + { + "epoch": 0.06974907345525244, + "grad_norm": 0.418404757976532, + "learning_rate": 9.890126804694715e-05, + "loss": 1.2888, + "step": 1148 + }, + { + "epoch": 0.06980983048787898, + "grad_norm": 0.26866504549980164, + "learning_rate": 9.889927136199074e-05, + "loss": 1.3024, + "step": 1149 + }, + { + "epoch": 0.0698705875205055, + "grad_norm": 0.27440404891967773, + "learning_rate": 9.889727288462208e-05, + "loss": 1.149, + "step": 1150 + }, + { + "epoch": 0.06993134455313202, + "grad_norm": 0.18479019403457642, + "learning_rate": 9.889527261491444e-05, + "loss": 1.1183, + "step": 1151 + }, + { + "epoch": 0.06999210158575855, + "grad_norm": 0.22347456216812134, + "learning_rate": 9.889327055294112e-05, + "loss": 1.3339, + "step": 1152 + }, + { + "epoch": 0.07005285861838507, + "grad_norm": 0.21663208305835724, + "learning_rate": 9.889126669877553e-05, + "loss": 1.1024, + "step": 1153 + }, + { + "epoch": 0.07011361565101161, + "grad_norm": 0.1680421233177185, + "learning_rate": 9.88892610524911e-05, + "loss": 1.1227, + "step": 1154 + }, + { + "epoch": 0.07017437268363813, + "grad_norm": 0.42199599742889404, + "learning_rate": 9.888725361416136e-05, + "loss": 1.2934, + "step": 1155 + }, + { + "epoch": 0.07023512971626465, + "grad_norm": 0.18065239489078522, + "learning_rate": 9.888524438385988e-05, + "loss": 1.1768, + "step": 1156 + }, + { + "epoch": 0.07029588674889119, + "grad_norm": 0.2552657127380371, + "learning_rate": 9.888323336166033e-05, + "loss": 1.2412, + "step": 1157 + }, + { + "epoch": 0.07035664378151771, + "grad_norm": 0.23475800454616547, + "learning_rate": 9.88812205476364e-05, + "loss": 1.0921, + "step": 1158 + }, + { + "epoch": 0.07041740081414424, + "grad_norm": 0.18801848590373993, + "learning_rate": 9.88792059418619e-05, + "loss": 1.1756, + "step": 1159 + }, + { + "epoch": 0.07047815784677076, + "grad_norm": 0.2849925756454468, + "learning_rate": 9.887718954441064e-05, + "loss": 1.2694, + "step": 1160 + }, + { + "epoch": 0.07053891487939729, + "grad_norm": 0.2758830189704895, + "learning_rate": 9.887517135535656e-05, + "loss": 1.0912, + "step": 1161 + }, + { + "epoch": 0.07059967191202382, + "grad_norm": 0.1720307469367981, + "learning_rate": 9.887315137477362e-05, + "loss": 1.1549, + "step": 1162 + }, + { + "epoch": 0.07066042894465034, + "grad_norm": 0.2587137222290039, + "learning_rate": 9.887112960273587e-05, + "loss": 1.1093, + "step": 1163 + }, + { + "epoch": 0.07072118597727688, + "grad_norm": 0.17992408573627472, + "learning_rate": 9.886910603931743e-05, + "loss": 1.0915, + "step": 1164 + }, + { + "epoch": 0.0707819430099034, + "grad_norm": 0.5324072241783142, + "learning_rate": 9.886708068459244e-05, + "loss": 1.1718, + "step": 1165 + }, + { + "epoch": 0.07084270004252992, + "grad_norm": 0.19721661508083344, + "learning_rate": 9.886505353863518e-05, + "loss": 1.1445, + "step": 1166 + }, + { + "epoch": 0.07090345707515645, + "grad_norm": 0.3066786527633667, + "learning_rate": 9.886302460151992e-05, + "loss": 1.2077, + "step": 1167 + }, + { + "epoch": 0.07096421410778297, + "grad_norm": 0.4767993688583374, + "learning_rate": 9.886099387332107e-05, + "loss": 1.4113, + "step": 1168 + }, + { + "epoch": 0.0710249711404095, + "grad_norm": 0.35383138060569763, + "learning_rate": 9.885896135411304e-05, + "loss": 1.1748, + "step": 1169 + }, + { + "epoch": 0.07108572817303603, + "grad_norm": 0.23907434940338135, + "learning_rate": 9.885692704397032e-05, + "loss": 1.209, + "step": 1170 + }, + { + "epoch": 0.07114648520566255, + "grad_norm": 0.5726443529129028, + "learning_rate": 9.885489094296751e-05, + "loss": 1.2388, + "step": 1171 + }, + { + "epoch": 0.07120724223828909, + "grad_norm": 0.19819119572639465, + "learning_rate": 9.885285305117922e-05, + "loss": 1.2159, + "step": 1172 + }, + { + "epoch": 0.07126799927091561, + "grad_norm": 0.2896987497806549, + "learning_rate": 9.885081336868018e-05, + "loss": 1.2073, + "step": 1173 + }, + { + "epoch": 0.07132875630354213, + "grad_norm": 0.20259325206279755, + "learning_rate": 9.884877189554511e-05, + "loss": 1.166, + "step": 1174 + }, + { + "epoch": 0.07138951333616866, + "grad_norm": 0.21030500531196594, + "learning_rate": 9.884672863184888e-05, + "loss": 1.2611, + "step": 1175 + }, + { + "epoch": 0.07145027036879519, + "grad_norm": 0.19030259549617767, + "learning_rate": 9.884468357766636e-05, + "loss": 1.1742, + "step": 1176 + }, + { + "epoch": 0.07151102740142172, + "grad_norm": 0.3621237277984619, + "learning_rate": 9.884263673307253e-05, + "loss": 1.1848, + "step": 1177 + }, + { + "epoch": 0.07157178443404824, + "grad_norm": 0.2247137874364853, + "learning_rate": 9.884058809814239e-05, + "loss": 1.1659, + "step": 1178 + }, + { + "epoch": 0.07163254146667476, + "grad_norm": 0.2524459660053253, + "learning_rate": 9.883853767295108e-05, + "loss": 1.1831, + "step": 1179 + }, + { + "epoch": 0.0716932984993013, + "grad_norm": 0.29188045859336853, + "learning_rate": 9.883648545757372e-05, + "loss": 1.1499, + "step": 1180 + }, + { + "epoch": 0.07175405553192782, + "grad_norm": 0.21408967673778534, + "learning_rate": 9.883443145208553e-05, + "loss": 1.196, + "step": 1181 + }, + { + "epoch": 0.07181481256455435, + "grad_norm": 0.5212968587875366, + "learning_rate": 9.883237565656182e-05, + "loss": 1.1737, + "step": 1182 + }, + { + "epoch": 0.07187556959718087, + "grad_norm": 0.24218496680259705, + "learning_rate": 9.883031807107797e-05, + "loss": 1.2154, + "step": 1183 + }, + { + "epoch": 0.0719363266298074, + "grad_norm": 2.173280715942383, + "learning_rate": 9.882825869570934e-05, + "loss": 1.2281, + "step": 1184 + }, + { + "epoch": 0.07199708366243393, + "grad_norm": 0.2711932361125946, + "learning_rate": 9.882619753053146e-05, + "loss": 1.1877, + "step": 1185 + }, + { + "epoch": 0.07205784069506045, + "grad_norm": 0.2696588337421417, + "learning_rate": 9.882413457561988e-05, + "loss": 1.2172, + "step": 1186 + }, + { + "epoch": 0.07211859772768699, + "grad_norm": 0.2311020791530609, + "learning_rate": 9.88220698310502e-05, + "loss": 1.2176, + "step": 1187 + }, + { + "epoch": 0.07217935476031351, + "grad_norm": 0.2556701898574829, + "learning_rate": 9.88200032968981e-05, + "loss": 1.0911, + "step": 1188 + }, + { + "epoch": 0.07224011179294003, + "grad_norm": 0.30819210410118103, + "learning_rate": 9.881793497323937e-05, + "loss": 1.2522, + "step": 1189 + }, + { + "epoch": 0.07230086882556656, + "grad_norm": 0.3643665611743927, + "learning_rate": 9.881586486014979e-05, + "loss": 1.0996, + "step": 1190 + }, + { + "epoch": 0.07236162585819308, + "grad_norm": 0.37965983152389526, + "learning_rate": 9.881379295770524e-05, + "loss": 1.1983, + "step": 1191 + }, + { + "epoch": 0.0724223828908196, + "grad_norm": 0.21066147089004517, + "learning_rate": 9.881171926598167e-05, + "loss": 1.1675, + "step": 1192 + }, + { + "epoch": 0.07248313992344614, + "grad_norm": 0.41477102041244507, + "learning_rate": 9.88096437850551e-05, + "loss": 1.1229, + "step": 1193 + }, + { + "epoch": 0.07254389695607266, + "grad_norm": 0.25262150168418884, + "learning_rate": 9.880756651500162e-05, + "loss": 1.1047, + "step": 1194 + }, + { + "epoch": 0.0726046539886992, + "grad_norm": 0.3302343487739563, + "learning_rate": 9.880548745589733e-05, + "loss": 1.0916, + "step": 1195 + }, + { + "epoch": 0.07266541102132572, + "grad_norm": 0.1993912160396576, + "learning_rate": 9.880340660781848e-05, + "loss": 1.2562, + "step": 1196 + }, + { + "epoch": 0.07272616805395224, + "grad_norm": 0.215809166431427, + "learning_rate": 9.880132397084132e-05, + "loss": 1.1034, + "step": 1197 + }, + { + "epoch": 0.07278692508657877, + "grad_norm": 1.1583436727523804, + "learning_rate": 9.87992395450422e-05, + "loss": 1.1674, + "step": 1198 + }, + { + "epoch": 0.0728476821192053, + "grad_norm": 0.2466304749250412, + "learning_rate": 9.87971533304975e-05, + "loss": 1.1824, + "step": 1199 + }, + { + "epoch": 0.07290843915183183, + "grad_norm": 0.3802541494369507, + "learning_rate": 9.879506532728375e-05, + "loss": 1.1907, + "step": 1200 + }, + { + "epoch": 0.07296919618445835, + "grad_norm": 0.25770261883735657, + "learning_rate": 9.879297553547742e-05, + "loss": 1.1257, + "step": 1201 + }, + { + "epoch": 0.07302995321708487, + "grad_norm": 0.18711112439632416, + "learning_rate": 9.879088395515516e-05, + "loss": 1.1677, + "step": 1202 + }, + { + "epoch": 0.07309071024971141, + "grad_norm": 0.22568635642528534, + "learning_rate": 9.87887905863936e-05, + "loss": 1.1684, + "step": 1203 + }, + { + "epoch": 0.07315146728233793, + "grad_norm": 0.28566041588783264, + "learning_rate": 9.878669542926951e-05, + "loss": 1.2044, + "step": 1204 + }, + { + "epoch": 0.07321222431496446, + "grad_norm": 0.2669213116168976, + "learning_rate": 9.878459848385965e-05, + "loss": 1.2526, + "step": 1205 + }, + { + "epoch": 0.07327298134759098, + "grad_norm": 0.2798711061477661, + "learning_rate": 9.87824997502409e-05, + "loss": 1.1711, + "step": 1206 + }, + { + "epoch": 0.0733337383802175, + "grad_norm": 0.2689701318740845, + "learning_rate": 9.87803992284902e-05, + "loss": 1.1288, + "step": 1207 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 0.21430884301662445, + "learning_rate": 9.877829691868454e-05, + "loss": 1.1165, + "step": 1208 + }, + { + "epoch": 0.07345525244547056, + "grad_norm": 0.18101859092712402, + "learning_rate": 9.877619282090099e-05, + "loss": 1.1032, + "step": 1209 + }, + { + "epoch": 0.07351600947809708, + "grad_norm": 0.9025177359580994, + "learning_rate": 9.877408693521663e-05, + "loss": 1.1539, + "step": 1210 + }, + { + "epoch": 0.07357676651072362, + "grad_norm": 0.37415724992752075, + "learning_rate": 9.877197926170872e-05, + "loss": 1.1166, + "step": 1211 + }, + { + "epoch": 0.07363752354335014, + "grad_norm": 0.8708227872848511, + "learning_rate": 9.876986980045446e-05, + "loss": 1.2109, + "step": 1212 + }, + { + "epoch": 0.07369828057597667, + "grad_norm": 0.4024532735347748, + "learning_rate": 9.876775855153122e-05, + "loss": 1.2499, + "step": 1213 + }, + { + "epoch": 0.0737590376086032, + "grad_norm": 0.21525508165359497, + "learning_rate": 9.876564551501635e-05, + "loss": 1.1584, + "step": 1214 + }, + { + "epoch": 0.07381979464122972, + "grad_norm": 0.28530144691467285, + "learning_rate": 9.87635306909873e-05, + "loss": 1.2519, + "step": 1215 + }, + { + "epoch": 0.07388055167385625, + "grad_norm": 0.17295902967453003, + "learning_rate": 9.876141407952163e-05, + "loss": 1.1868, + "step": 1216 + }, + { + "epoch": 0.07394130870648277, + "grad_norm": 12.117770195007324, + "learning_rate": 9.87592956806969e-05, + "loss": 1.2399, + "step": 1217 + }, + { + "epoch": 0.07400206573910931, + "grad_norm": 0.23098962008953094, + "learning_rate": 9.875717549459075e-05, + "loss": 1.132, + "step": 1218 + }, + { + "epoch": 0.07406282277173583, + "grad_norm": 0.5400723814964294, + "learning_rate": 9.875505352128091e-05, + "loss": 1.0978, + "step": 1219 + }, + { + "epoch": 0.07412357980436235, + "grad_norm": 0.17768548429012299, + "learning_rate": 9.875292976084516e-05, + "loss": 1.1184, + "step": 1220 + }, + { + "epoch": 0.07418433683698888, + "grad_norm": 0.1565571129322052, + "learning_rate": 9.875080421336135e-05, + "loss": 1.1045, + "step": 1221 + }, + { + "epoch": 0.0742450938696154, + "grad_norm": 0.1982605904340744, + "learning_rate": 9.874867687890739e-05, + "loss": 1.2224, + "step": 1222 + }, + { + "epoch": 0.07430585090224194, + "grad_norm": 0.1820371150970459, + "learning_rate": 9.874654775756125e-05, + "loss": 1.1732, + "step": 1223 + }, + { + "epoch": 0.07436660793486846, + "grad_norm": 0.1819758266210556, + "learning_rate": 9.874441684940099e-05, + "loss": 1.1459, + "step": 1224 + }, + { + "epoch": 0.07442736496749498, + "grad_norm": 0.15922798216342926, + "learning_rate": 9.874228415450469e-05, + "loss": 1.1319, + "step": 1225 + }, + { + "epoch": 0.07448812200012152, + "grad_norm": 0.29691576957702637, + "learning_rate": 9.874014967295055e-05, + "loss": 1.4323, + "step": 1226 + }, + { + "epoch": 0.07454887903274804, + "grad_norm": 0.2321590930223465, + "learning_rate": 9.873801340481681e-05, + "loss": 1.2083, + "step": 1227 + }, + { + "epoch": 0.07460963606537457, + "grad_norm": 0.22122740745544434, + "learning_rate": 9.873587535018177e-05, + "loss": 1.1489, + "step": 1228 + }, + { + "epoch": 0.0746703930980011, + "grad_norm": 0.24191616475582123, + "learning_rate": 9.87337355091238e-05, + "loss": 1.2007, + "step": 1229 + }, + { + "epoch": 0.07473115013062762, + "grad_norm": 0.1998981088399887, + "learning_rate": 9.873159388172132e-05, + "loss": 1.1998, + "step": 1230 + }, + { + "epoch": 0.07479190716325415, + "grad_norm": 0.22300809621810913, + "learning_rate": 9.872945046805284e-05, + "loss": 1.1431, + "step": 1231 + }, + { + "epoch": 0.07485266419588067, + "grad_norm": 0.17642222344875336, + "learning_rate": 9.872730526819696e-05, + "loss": 1.1344, + "step": 1232 + }, + { + "epoch": 0.0749134212285072, + "grad_norm": 0.2079276293516159, + "learning_rate": 9.872515828223228e-05, + "loss": 1.2673, + "step": 1233 + }, + { + "epoch": 0.07497417826113373, + "grad_norm": 0.19985605776309967, + "learning_rate": 9.87230095102375e-05, + "loss": 1.106, + "step": 1234 + }, + { + "epoch": 0.07503493529376025, + "grad_norm": 0.22975324094295502, + "learning_rate": 9.872085895229138e-05, + "loss": 1.2679, + "step": 1235 + }, + { + "epoch": 0.07509569232638678, + "grad_norm": 0.41138115525245667, + "learning_rate": 9.871870660847276e-05, + "loss": 1.1804, + "step": 1236 + }, + { + "epoch": 0.0751564493590133, + "grad_norm": 0.23521828651428223, + "learning_rate": 9.871655247886053e-05, + "loss": 1.198, + "step": 1237 + }, + { + "epoch": 0.07521720639163983, + "grad_norm": 0.1866956353187561, + "learning_rate": 9.871439656353367e-05, + "loss": 1.1982, + "step": 1238 + }, + { + "epoch": 0.07527796342426636, + "grad_norm": 0.27496084570884705, + "learning_rate": 9.871223886257117e-05, + "loss": 1.2967, + "step": 1239 + }, + { + "epoch": 0.07533872045689288, + "grad_norm": 0.3111632764339447, + "learning_rate": 9.871007937605214e-05, + "loss": 1.101, + "step": 1240 + }, + { + "epoch": 0.07539947748951942, + "grad_norm": 0.16673867404460907, + "learning_rate": 9.870791810405575e-05, + "loss": 1.1584, + "step": 1241 + }, + { + "epoch": 0.07546023452214594, + "grad_norm": 0.538335919380188, + "learning_rate": 9.870575504666119e-05, + "loss": 1.1885, + "step": 1242 + }, + { + "epoch": 0.07552099155477246, + "grad_norm": 0.18732623755931854, + "learning_rate": 9.870359020394779e-05, + "loss": 1.1032, + "step": 1243 + }, + { + "epoch": 0.075581748587399, + "grad_norm": 0.15791022777557373, + "learning_rate": 9.870142357599485e-05, + "loss": 1.1374, + "step": 1244 + }, + { + "epoch": 0.07564250562002552, + "grad_norm": 0.19111013412475586, + "learning_rate": 9.869925516288182e-05, + "loss": 1.1167, + "step": 1245 + }, + { + "epoch": 0.07570326265265205, + "grad_norm": 0.22212238609790802, + "learning_rate": 9.869708496468819e-05, + "loss": 1.1865, + "step": 1246 + }, + { + "epoch": 0.07576401968527857, + "grad_norm": 0.20679518580436707, + "learning_rate": 9.869491298149348e-05, + "loss": 1.1974, + "step": 1247 + }, + { + "epoch": 0.07582477671790509, + "grad_norm": 0.40185847878456116, + "learning_rate": 9.869273921337734e-05, + "loss": 1.2046, + "step": 1248 + }, + { + "epoch": 0.07588553375053163, + "grad_norm": 0.2737756371498108, + "learning_rate": 9.869056366041942e-05, + "loss": 1.1058, + "step": 1249 + }, + { + "epoch": 0.07594629078315815, + "grad_norm": 0.24144308269023895, + "learning_rate": 9.868838632269947e-05, + "loss": 1.165, + "step": 1250 + }, + { + "epoch": 0.07600704781578467, + "grad_norm": 0.4306756854057312, + "learning_rate": 9.868620720029734e-05, + "loss": 1.2316, + "step": 1251 + }, + { + "epoch": 0.0760678048484112, + "grad_norm": 0.18132753670215607, + "learning_rate": 9.868402629329284e-05, + "loss": 1.1462, + "step": 1252 + }, + { + "epoch": 0.07612856188103773, + "grad_norm": 0.1788402497768402, + "learning_rate": 9.868184360176596e-05, + "loss": 1.1362, + "step": 1253 + }, + { + "epoch": 0.07618931891366426, + "grad_norm": 0.2483174353837967, + "learning_rate": 9.86796591257967e-05, + "loss": 1.1175, + "step": 1254 + }, + { + "epoch": 0.07625007594629078, + "grad_norm": 0.2149389535188675, + "learning_rate": 9.867747286546512e-05, + "loss": 1.2059, + "step": 1255 + }, + { + "epoch": 0.0763108329789173, + "grad_norm": 0.20216621458530426, + "learning_rate": 9.867528482085135e-05, + "loss": 1.1647, + "step": 1256 + }, + { + "epoch": 0.07637159001154384, + "grad_norm": 0.24809569120407104, + "learning_rate": 9.867309499203562e-05, + "loss": 1.138, + "step": 1257 + }, + { + "epoch": 0.07643234704417036, + "grad_norm": 0.2221045196056366, + "learning_rate": 9.867090337909819e-05, + "loss": 1.1432, + "step": 1258 + }, + { + "epoch": 0.0764931040767969, + "grad_norm": 0.3315385580062866, + "learning_rate": 9.866870998211937e-05, + "loss": 1.1068, + "step": 1259 + }, + { + "epoch": 0.07655386110942342, + "grad_norm": 0.18087199330329895, + "learning_rate": 9.866651480117958e-05, + "loss": 1.221, + "step": 1260 + }, + { + "epoch": 0.07661461814204994, + "grad_norm": 0.14501695334911346, + "learning_rate": 9.86643178363593e-05, + "loss": 1.1216, + "step": 1261 + }, + { + "epoch": 0.07667537517467647, + "grad_norm": 0.19891618192195892, + "learning_rate": 9.866211908773903e-05, + "loss": 1.212, + "step": 1262 + }, + { + "epoch": 0.07673613220730299, + "grad_norm": 0.21249403059482574, + "learning_rate": 9.865991855539939e-05, + "loss": 1.1338, + "step": 1263 + }, + { + "epoch": 0.07679688923992953, + "grad_norm": 0.16505753993988037, + "learning_rate": 9.865771623942102e-05, + "loss": 1.1192, + "step": 1264 + }, + { + "epoch": 0.07685764627255605, + "grad_norm": 0.3122229278087616, + "learning_rate": 9.865551213988467e-05, + "loss": 1.1952, + "step": 1265 + }, + { + "epoch": 0.07691840330518257, + "grad_norm": 0.18771880865097046, + "learning_rate": 9.86533062568711e-05, + "loss": 1.2009, + "step": 1266 + }, + { + "epoch": 0.0769791603378091, + "grad_norm": 0.21376119554042816, + "learning_rate": 9.86510985904612e-05, + "loss": 1.1342, + "step": 1267 + }, + { + "epoch": 0.07703991737043563, + "grad_norm": 0.16033343970775604, + "learning_rate": 9.864888914073586e-05, + "loss": 1.1222, + "step": 1268 + }, + { + "epoch": 0.07710067440306216, + "grad_norm": 1.6344711780548096, + "learning_rate": 9.864667790777609e-05, + "loss": 1.121, + "step": 1269 + }, + { + "epoch": 0.07716143143568868, + "grad_norm": 0.2040167599916458, + "learning_rate": 9.864446489166294e-05, + "loss": 1.1286, + "step": 1270 + }, + { + "epoch": 0.0772221884683152, + "grad_norm": 0.3274246156215668, + "learning_rate": 9.864225009247751e-05, + "loss": 1.1375, + "step": 1271 + }, + { + "epoch": 0.07728294550094174, + "grad_norm": 0.14763733744621277, + "learning_rate": 9.864003351030104e-05, + "loss": 1.1291, + "step": 1272 + }, + { + "epoch": 0.07734370253356826, + "grad_norm": 0.2939620018005371, + "learning_rate": 9.863781514521471e-05, + "loss": 1.0642, + "step": 1273 + }, + { + "epoch": 0.07740445956619478, + "grad_norm": 0.20359748601913452, + "learning_rate": 9.863559499729988e-05, + "loss": 1.1845, + "step": 1274 + }, + { + "epoch": 0.07746521659882132, + "grad_norm": 0.17832833528518677, + "learning_rate": 9.86333730666379e-05, + "loss": 1.1146, + "step": 1275 + }, + { + "epoch": 0.07752597363144784, + "grad_norm": 0.18401280045509338, + "learning_rate": 9.863114935331023e-05, + "loss": 1.228, + "step": 1276 + }, + { + "epoch": 0.07758673066407437, + "grad_norm": 0.46494609117507935, + "learning_rate": 9.86289238573984e-05, + "loss": 1.255, + "step": 1277 + }, + { + "epoch": 0.07764748769670089, + "grad_norm": 0.251230925321579, + "learning_rate": 9.862669657898395e-05, + "loss": 1.2977, + "step": 1278 + }, + { + "epoch": 0.07770824472932741, + "grad_norm": 0.23302753269672394, + "learning_rate": 9.862446751814854e-05, + "loss": 1.2235, + "step": 1279 + }, + { + "epoch": 0.07776900176195395, + "grad_norm": 0.16729407012462616, + "learning_rate": 9.862223667497388e-05, + "loss": 1.1208, + "step": 1280 + }, + { + "epoch": 0.07782975879458047, + "grad_norm": 0.21287155151367188, + "learning_rate": 9.862000404954172e-05, + "loss": 1.1885, + "step": 1281 + }, + { + "epoch": 0.077890515827207, + "grad_norm": 0.37683218717575073, + "learning_rate": 9.861776964193392e-05, + "loss": 1.3295, + "step": 1282 + }, + { + "epoch": 0.07795127285983353, + "grad_norm": 0.23647919297218323, + "learning_rate": 9.861553345223238e-05, + "loss": 1.0999, + "step": 1283 + }, + { + "epoch": 0.07801202989246005, + "grad_norm": 0.24196778237819672, + "learning_rate": 9.861329548051908e-05, + "loss": 1.1301, + "step": 1284 + }, + { + "epoch": 0.07807278692508658, + "grad_norm": 0.22322338819503784, + "learning_rate": 9.861105572687602e-05, + "loss": 1.1164, + "step": 1285 + }, + { + "epoch": 0.0781335439577131, + "grad_norm": 0.17207689583301544, + "learning_rate": 9.860881419138531e-05, + "loss": 1.1383, + "step": 1286 + }, + { + "epoch": 0.07819430099033964, + "grad_norm": 0.19567154347896576, + "learning_rate": 9.860657087412912e-05, + "loss": 1.1383, + "step": 1287 + }, + { + "epoch": 0.07825505802296616, + "grad_norm": 0.20552359521389008, + "learning_rate": 9.86043257751897e-05, + "loss": 1.1251, + "step": 1288 + }, + { + "epoch": 0.07831581505559268, + "grad_norm": 0.16334939002990723, + "learning_rate": 9.860207889464929e-05, + "loss": 1.1959, + "step": 1289 + }, + { + "epoch": 0.07837657208821922, + "grad_norm": 0.196927011013031, + "learning_rate": 9.859983023259031e-05, + "loss": 1.2013, + "step": 1290 + }, + { + "epoch": 0.07843732912084574, + "grad_norm": 0.1954898089170456, + "learning_rate": 9.859757978909516e-05, + "loss": 1.2278, + "step": 1291 + }, + { + "epoch": 0.07849808615347227, + "grad_norm": 0.18258541822433472, + "learning_rate": 9.859532756424632e-05, + "loss": 1.2141, + "step": 1292 + }, + { + "epoch": 0.07855884318609879, + "grad_norm": 9.932856559753418, + "learning_rate": 9.859307355812637e-05, + "loss": 1.1842, + "step": 1293 + }, + { + "epoch": 0.07861960021872531, + "grad_norm": 0.31945687532424927, + "learning_rate": 9.859081777081789e-05, + "loss": 1.2044, + "step": 1294 + }, + { + "epoch": 0.07868035725135185, + "grad_norm": 0.2624160647392273, + "learning_rate": 9.858856020240362e-05, + "loss": 1.071, + "step": 1295 + }, + { + "epoch": 0.07874111428397837, + "grad_norm": 0.257998526096344, + "learning_rate": 9.858630085296625e-05, + "loss": 1.1729, + "step": 1296 + }, + { + "epoch": 0.07880187131660489, + "grad_norm": 0.3425275385379791, + "learning_rate": 9.858403972258865e-05, + "loss": 1.1558, + "step": 1297 + }, + { + "epoch": 0.07886262834923143, + "grad_norm": 0.2115568369626999, + "learning_rate": 9.858177681135368e-05, + "loss": 1.1376, + "step": 1298 + }, + { + "epoch": 0.07892338538185795, + "grad_norm": 0.1707857847213745, + "learning_rate": 9.85795121193443e-05, + "loss": 1.0875, + "step": 1299 + }, + { + "epoch": 0.07898414241448448, + "grad_norm": 0.38808855414390564, + "learning_rate": 9.85772456466435e-05, + "loss": 1.0925, + "step": 1300 + }, + { + "epoch": 0.079044899447111, + "grad_norm": 0.17025871574878693, + "learning_rate": 9.85749773933344e-05, + "loss": 1.0948, + "step": 1301 + }, + { + "epoch": 0.07910565647973752, + "grad_norm": 0.23683875799179077, + "learning_rate": 9.857270735950008e-05, + "loss": 1.1932, + "step": 1302 + }, + { + "epoch": 0.07916641351236406, + "grad_norm": 42.265953063964844, + "learning_rate": 9.857043554522379e-05, + "loss": 1.1412, + "step": 1303 + }, + { + "epoch": 0.07922717054499058, + "grad_norm": 0.22707150876522064, + "learning_rate": 9.85681619505888e-05, + "loss": 1.128, + "step": 1304 + }, + { + "epoch": 0.07928792757761711, + "grad_norm": 0.23970550298690796, + "learning_rate": 9.856588657567845e-05, + "loss": 1.1108, + "step": 1305 + }, + { + "epoch": 0.07934868461024364, + "grad_norm": 0.16838587820529938, + "learning_rate": 9.856360942057615e-05, + "loss": 1.1313, + "step": 1306 + }, + { + "epoch": 0.07940944164287016, + "grad_norm": 0.21601083874702454, + "learning_rate": 9.856133048536533e-05, + "loss": 1.1223, + "step": 1307 + }, + { + "epoch": 0.07947019867549669, + "grad_norm": 0.2647416293621063, + "learning_rate": 9.855904977012958e-05, + "loss": 1.2414, + "step": 1308 + }, + { + "epoch": 0.07953095570812321, + "grad_norm": 0.2626563012599945, + "learning_rate": 9.855676727495246e-05, + "loss": 1.0582, + "step": 1309 + }, + { + "epoch": 0.07959171274074975, + "grad_norm": 0.2018452137708664, + "learning_rate": 9.855448299991765e-05, + "loss": 1.127, + "step": 1310 + }, + { + "epoch": 0.07965246977337627, + "grad_norm": 0.21179260313510895, + "learning_rate": 9.855219694510889e-05, + "loss": 1.1301, + "step": 1311 + }, + { + "epoch": 0.07971322680600279, + "grad_norm": 0.20376284420490265, + "learning_rate": 9.854990911060995e-05, + "loss": 1.086, + "step": 1312 + }, + { + "epoch": 0.07977398383862933, + "grad_norm": 0.1992858648300171, + "learning_rate": 9.854761949650473e-05, + "loss": 1.1166, + "step": 1313 + }, + { + "epoch": 0.07983474087125585, + "grad_norm": 0.4276573956012726, + "learning_rate": 9.854532810287711e-05, + "loss": 1.2091, + "step": 1314 + }, + { + "epoch": 0.07989549790388237, + "grad_norm": 0.9072816371917725, + "learning_rate": 9.854303492981111e-05, + "loss": 1.1063, + "step": 1315 + }, + { + "epoch": 0.0799562549365089, + "grad_norm": 0.29177671670913696, + "learning_rate": 9.854073997739079e-05, + "loss": 1.1673, + "step": 1316 + }, + { + "epoch": 0.08001701196913542, + "grad_norm": 0.2633453607559204, + "learning_rate": 9.853844324570027e-05, + "loss": 1.1838, + "step": 1317 + }, + { + "epoch": 0.08007776900176196, + "grad_norm": 0.349187970161438, + "learning_rate": 9.853614473482371e-05, + "loss": 1.2875, + "step": 1318 + }, + { + "epoch": 0.08013852603438848, + "grad_norm": 0.22905027866363525, + "learning_rate": 9.85338444448454e-05, + "loss": 1.153, + "step": 1319 + }, + { + "epoch": 0.080199283067015, + "grad_norm": 0.17674872279167175, + "learning_rate": 9.853154237584965e-05, + "loss": 1.1263, + "step": 1320 + }, + { + "epoch": 0.08026004009964154, + "grad_norm": 0.16252605617046356, + "learning_rate": 9.852923852792081e-05, + "loss": 1.0811, + "step": 1321 + }, + { + "epoch": 0.08032079713226806, + "grad_norm": 0.2902568578720093, + "learning_rate": 9.852693290114335e-05, + "loss": 1.2009, + "step": 1322 + }, + { + "epoch": 0.08038155416489459, + "grad_norm": 0.20993369817733765, + "learning_rate": 9.85246254956018e-05, + "loss": 1.1819, + "step": 1323 + }, + { + "epoch": 0.08044231119752111, + "grad_norm": 0.36674124002456665, + "learning_rate": 9.852231631138071e-05, + "loss": 1.0939, + "step": 1324 + }, + { + "epoch": 0.08050306823014763, + "grad_norm": 0.2898842394351959, + "learning_rate": 9.852000534856474e-05, + "loss": 1.1931, + "step": 1325 + }, + { + "epoch": 0.08056382526277417, + "grad_norm": 0.36723166704177856, + "learning_rate": 9.851769260723862e-05, + "loss": 1.1553, + "step": 1326 + }, + { + "epoch": 0.08062458229540069, + "grad_norm": 0.2708528935909271, + "learning_rate": 9.851537808748706e-05, + "loss": 1.139, + "step": 1327 + }, + { + "epoch": 0.08068533932802723, + "grad_norm": 0.2334737330675125, + "learning_rate": 9.851306178939497e-05, + "loss": 1.095, + "step": 1328 + }, + { + "epoch": 0.08074609636065375, + "grad_norm": 0.17848482728004456, + "learning_rate": 9.85107437130472e-05, + "loss": 1.1491, + "step": 1329 + }, + { + "epoch": 0.08080685339328027, + "grad_norm": 0.22593189775943756, + "learning_rate": 9.850842385852876e-05, + "loss": 1.088, + "step": 1330 + }, + { + "epoch": 0.0808676104259068, + "grad_norm": 0.21066004037857056, + "learning_rate": 9.850610222592465e-05, + "loss": 1.3048, + "step": 1331 + }, + { + "epoch": 0.08092836745853332, + "grad_norm": 0.24408243596553802, + "learning_rate": 9.850377881531999e-05, + "loss": 1.1438, + "step": 1332 + }, + { + "epoch": 0.08098912449115986, + "grad_norm": 0.18779738247394562, + "learning_rate": 9.850145362679994e-05, + "loss": 1.1229, + "step": 1333 + }, + { + "epoch": 0.08104988152378638, + "grad_norm": 0.2079487442970276, + "learning_rate": 9.849912666044975e-05, + "loss": 1.1505, + "step": 1334 + }, + { + "epoch": 0.0811106385564129, + "grad_norm": 0.4151386022567749, + "learning_rate": 9.84967979163547e-05, + "loss": 1.4287, + "step": 1335 + }, + { + "epoch": 0.08117139558903944, + "grad_norm": 0.14635898172855377, + "learning_rate": 9.849446739460012e-05, + "loss": 1.0805, + "step": 1336 + }, + { + "epoch": 0.08123215262166596, + "grad_norm": 0.17209403216838837, + "learning_rate": 9.849213509527148e-05, + "loss": 1.0971, + "step": 1337 + }, + { + "epoch": 0.08129290965429248, + "grad_norm": 0.17887446284294128, + "learning_rate": 9.848980101845426e-05, + "loss": 1.1353, + "step": 1338 + }, + { + "epoch": 0.08135366668691901, + "grad_norm": 0.30196571350097656, + "learning_rate": 9.848746516423399e-05, + "loss": 1.1899, + "step": 1339 + }, + { + "epoch": 0.08141442371954553, + "grad_norm": 0.2856433391571045, + "learning_rate": 9.848512753269635e-05, + "loss": 1.2318, + "step": 1340 + }, + { + "epoch": 0.08147518075217207, + "grad_norm": 0.3166598677635193, + "learning_rate": 9.848278812392696e-05, + "loss": 1.1151, + "step": 1341 + }, + { + "epoch": 0.08153593778479859, + "grad_norm": 0.165154367685318, + "learning_rate": 9.848044693801159e-05, + "loss": 1.0753, + "step": 1342 + }, + { + "epoch": 0.08159669481742511, + "grad_norm": 0.18315227329730988, + "learning_rate": 9.847810397503609e-05, + "loss": 1.1308, + "step": 1343 + }, + { + "epoch": 0.08165745185005165, + "grad_norm": 0.20149187743663788, + "learning_rate": 9.84757592350863e-05, + "loss": 1.1014, + "step": 1344 + }, + { + "epoch": 0.08171820888267817, + "grad_norm": 0.1828075647354126, + "learning_rate": 9.84734127182482e-05, + "loss": 1.1288, + "step": 1345 + }, + { + "epoch": 0.0817789659153047, + "grad_norm": 0.36319729685783386, + "learning_rate": 9.847106442460778e-05, + "loss": 1.1111, + "step": 1346 + }, + { + "epoch": 0.08183972294793122, + "grad_norm": 0.389381468296051, + "learning_rate": 9.846871435425112e-05, + "loss": 1.2138, + "step": 1347 + }, + { + "epoch": 0.08190047998055774, + "grad_norm": 0.2722962498664856, + "learning_rate": 9.846636250726438e-05, + "loss": 1.1577, + "step": 1348 + }, + { + "epoch": 0.08196123701318428, + "grad_norm": 0.16517791152000427, + "learning_rate": 9.846400888373374e-05, + "loss": 1.1192, + "step": 1349 + }, + { + "epoch": 0.0820219940458108, + "grad_norm": 0.2437630146741867, + "learning_rate": 9.84616534837455e-05, + "loss": 1.1106, + "step": 1350 + }, + { + "epoch": 0.08208275107843734, + "grad_norm": 0.21548691391944885, + "learning_rate": 9.845929630738598e-05, + "loss": 1.1218, + "step": 1351 + }, + { + "epoch": 0.08214350811106386, + "grad_norm": 0.374189168214798, + "learning_rate": 9.845693735474158e-05, + "loss": 1.1204, + "step": 1352 + }, + { + "epoch": 0.08220426514369038, + "grad_norm": 0.201692134141922, + "learning_rate": 9.845457662589877e-05, + "loss": 1.2067, + "step": 1353 + }, + { + "epoch": 0.08226502217631691, + "grad_norm": 0.23874618113040924, + "learning_rate": 9.845221412094409e-05, + "loss": 1.1763, + "step": 1354 + }, + { + "epoch": 0.08232577920894343, + "grad_norm": 0.15880019962787628, + "learning_rate": 9.844984983996415e-05, + "loss": 1.1304, + "step": 1355 + }, + { + "epoch": 0.08238653624156995, + "grad_norm": 0.17314967513084412, + "learning_rate": 9.84474837830456e-05, + "loss": 1.1418, + "step": 1356 + }, + { + "epoch": 0.08244729327419649, + "grad_norm": 0.21444889903068542, + "learning_rate": 9.844511595027515e-05, + "loss": 1.1605, + "step": 1357 + }, + { + "epoch": 0.08250805030682301, + "grad_norm": 0.17126399278640747, + "learning_rate": 9.844274634173962e-05, + "loss": 1.1315, + "step": 1358 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 0.7597689032554626, + "learning_rate": 9.844037495752584e-05, + "loss": 1.1185, + "step": 1359 + }, + { + "epoch": 0.08262956437207607, + "grad_norm": 0.17652194201946259, + "learning_rate": 9.843800179772078e-05, + "loss": 1.1937, + "step": 1360 + }, + { + "epoch": 0.08269032140470259, + "grad_norm": 0.22796188294887543, + "learning_rate": 9.843562686241139e-05, + "loss": 1.2028, + "step": 1361 + }, + { + "epoch": 0.08275107843732912, + "grad_norm": 0.20764288306236267, + "learning_rate": 9.843325015168474e-05, + "loss": 1.2556, + "step": 1362 + }, + { + "epoch": 0.08281183546995564, + "grad_norm": 0.25892022252082825, + "learning_rate": 9.843087166562795e-05, + "loss": 1.3303, + "step": 1363 + }, + { + "epoch": 0.08287259250258218, + "grad_norm": 0.19510486721992493, + "learning_rate": 9.842849140432817e-05, + "loss": 1.2397, + "step": 1364 + }, + { + "epoch": 0.0829333495352087, + "grad_norm": 0.23131868243217468, + "learning_rate": 9.84261093678727e-05, + "loss": 1.2398, + "step": 1365 + }, + { + "epoch": 0.08299410656783522, + "grad_norm": 0.3368610739707947, + "learning_rate": 9.842372555634882e-05, + "loss": 1.1568, + "step": 1366 + }, + { + "epoch": 0.08305486360046176, + "grad_norm": 0.16682179272174835, + "learning_rate": 9.842133996984393e-05, + "loss": 1.0998, + "step": 1367 + }, + { + "epoch": 0.08311562063308828, + "grad_norm": 0.2443765252828598, + "learning_rate": 9.841895260844546e-05, + "loss": 1.1174, + "step": 1368 + }, + { + "epoch": 0.08317637766571481, + "grad_norm": 0.5775284171104431, + "learning_rate": 9.841656347224091e-05, + "loss": 1.323, + "step": 1369 + }, + { + "epoch": 0.08323713469834133, + "grad_norm": 1.681067705154419, + "learning_rate": 9.841417256131788e-05, + "loss": 1.107, + "step": 1370 + }, + { + "epoch": 0.08329789173096785, + "grad_norm": 0.3213003873825073, + "learning_rate": 9.8411779875764e-05, + "loss": 1.1204, + "step": 1371 + }, + { + "epoch": 0.08335864876359439, + "grad_norm": 0.2900693714618683, + "learning_rate": 9.840938541566698e-05, + "loss": 1.2616, + "step": 1372 + }, + { + "epoch": 0.08341940579622091, + "grad_norm": 0.767281711101532, + "learning_rate": 9.840698918111455e-05, + "loss": 1.1139, + "step": 1373 + }, + { + "epoch": 0.08348016282884745, + "grad_norm": 0.2874467372894287, + "learning_rate": 9.84045911721946e-05, + "loss": 1.1888, + "step": 1374 + }, + { + "epoch": 0.08354091986147397, + "grad_norm": 2.1407675743103027, + "learning_rate": 9.8402191388995e-05, + "loss": 1.2438, + "step": 1375 + }, + { + "epoch": 0.08360167689410049, + "grad_norm": 0.2052859365940094, + "learning_rate": 9.83997898316037e-05, + "loss": 1.1505, + "step": 1376 + }, + { + "epoch": 0.08366243392672702, + "grad_norm": 0.25272366404533386, + "learning_rate": 9.839738650010877e-05, + "loss": 1.0833, + "step": 1377 + }, + { + "epoch": 0.08372319095935354, + "grad_norm": 0.25053277611732483, + "learning_rate": 9.839498139459827e-05, + "loss": 1.1944, + "step": 1378 + }, + { + "epoch": 0.08378394799198007, + "grad_norm": 0.2881149649620056, + "learning_rate": 9.839257451516037e-05, + "loss": 1.3879, + "step": 1379 + }, + { + "epoch": 0.0838447050246066, + "grad_norm": 0.262215256690979, + "learning_rate": 9.839016586188331e-05, + "loss": 1.1336, + "step": 1380 + }, + { + "epoch": 0.08390546205723312, + "grad_norm": 0.20073488354682922, + "learning_rate": 9.838775543485537e-05, + "loss": 1.1795, + "step": 1381 + }, + { + "epoch": 0.08396621908985966, + "grad_norm": 0.2712678611278534, + "learning_rate": 9.838534323416489e-05, + "loss": 1.1929, + "step": 1382 + }, + { + "epoch": 0.08402697612248618, + "grad_norm": 0.18539045751094818, + "learning_rate": 9.838292925990029e-05, + "loss": 1.1202, + "step": 1383 + }, + { + "epoch": 0.0840877331551127, + "grad_norm": 0.22492966055870056, + "learning_rate": 9.83805135121501e-05, + "loss": 1.1616, + "step": 1384 + }, + { + "epoch": 0.08414849018773923, + "grad_norm": 0.26752761006355286, + "learning_rate": 9.837809599100281e-05, + "loss": 1.2089, + "step": 1385 + }, + { + "epoch": 0.08420924722036575, + "grad_norm": 0.16586409509181976, + "learning_rate": 9.837567669654706e-05, + "loss": 1.0907, + "step": 1386 + }, + { + "epoch": 0.08427000425299229, + "grad_norm": 0.34646153450012207, + "learning_rate": 9.837325562887155e-05, + "loss": 1.1581, + "step": 1387 + }, + { + "epoch": 0.08433076128561881, + "grad_norm": 0.16896191239356995, + "learning_rate": 9.837083278806497e-05, + "loss": 1.095, + "step": 1388 + }, + { + "epoch": 0.08439151831824533, + "grad_norm": 111.09854125976562, + "learning_rate": 9.836840817421619e-05, + "loss": 1.1189, + "step": 1389 + }, + { + "epoch": 0.08445227535087187, + "grad_norm": 0.6128570437431335, + "learning_rate": 9.836598178741405e-05, + "loss": 1.1847, + "step": 1390 + }, + { + "epoch": 0.08451303238349839, + "grad_norm": 0.25501441955566406, + "learning_rate": 9.83635536277475e-05, + "loss": 1.0972, + "step": 1391 + }, + { + "epoch": 0.08457378941612492, + "grad_norm": 0.17326213419437408, + "learning_rate": 9.836112369530555e-05, + "loss": 1.1662, + "step": 1392 + }, + { + "epoch": 0.08463454644875144, + "grad_norm": 0.6709007024765015, + "learning_rate": 9.835869199017725e-05, + "loss": 1.0639, + "step": 1393 + }, + { + "epoch": 0.08469530348137796, + "grad_norm": 0.4226175844669342, + "learning_rate": 9.835625851245177e-05, + "loss": 1.1667, + "step": 1394 + }, + { + "epoch": 0.0847560605140045, + "grad_norm": 0.5547512173652649, + "learning_rate": 9.835382326221826e-05, + "loss": 1.197, + "step": 1395 + }, + { + "epoch": 0.08481681754663102, + "grad_norm": 0.27206289768218994, + "learning_rate": 9.835138623956603e-05, + "loss": 1.1978, + "step": 1396 + }, + { + "epoch": 0.08487757457925756, + "grad_norm": 0.2442013919353485, + "learning_rate": 9.834894744458438e-05, + "loss": 1.2557, + "step": 1397 + }, + { + "epoch": 0.08493833161188408, + "grad_norm": 0.30892157554626465, + "learning_rate": 9.834650687736272e-05, + "loss": 1.1426, + "step": 1398 + }, + { + "epoch": 0.0849990886445106, + "grad_norm": 0.1691623330116272, + "learning_rate": 9.83440645379905e-05, + "loss": 1.1941, + "step": 1399 + }, + { + "epoch": 0.08505984567713713, + "grad_norm": 0.17318879067897797, + "learning_rate": 9.834162042655726e-05, + "loss": 1.1134, + "step": 1400 + }, + { + "epoch": 0.08512060270976365, + "grad_norm": 0.21950559318065643, + "learning_rate": 9.833917454315258e-05, + "loss": 1.1139, + "step": 1401 + }, + { + "epoch": 0.08518135974239018, + "grad_norm": 0.1323520839214325, + "learning_rate": 9.83367268878661e-05, + "loss": 1.0788, + "step": 1402 + }, + { + "epoch": 0.08524211677501671, + "grad_norm": 0.15223145484924316, + "learning_rate": 9.833427746078756e-05, + "loss": 1.0954, + "step": 1403 + }, + { + "epoch": 0.08530287380764323, + "grad_norm": 0.15274764597415924, + "learning_rate": 9.833182626200675e-05, + "loss": 1.117, + "step": 1404 + }, + { + "epoch": 0.08536363084026977, + "grad_norm": 0.1928999423980713, + "learning_rate": 9.83293732916135e-05, + "loss": 1.0919, + "step": 1405 + }, + { + "epoch": 0.08542438787289629, + "grad_norm": 0.2198067456483841, + "learning_rate": 9.832691854969771e-05, + "loss": 1.1946, + "step": 1406 + }, + { + "epoch": 0.08548514490552281, + "grad_norm": 0.1498374044895172, + "learning_rate": 9.83244620363494e-05, + "loss": 1.1198, + "step": 1407 + }, + { + "epoch": 0.08554590193814934, + "grad_norm": 0.2062024027109146, + "learning_rate": 9.832200375165858e-05, + "loss": 1.2049, + "step": 1408 + }, + { + "epoch": 0.08560665897077586, + "grad_norm": 0.35424208641052246, + "learning_rate": 9.831954369571538e-05, + "loss": 1.1415, + "step": 1409 + }, + { + "epoch": 0.0856674160034024, + "grad_norm": 0.19721995294094086, + "learning_rate": 9.831708186860996e-05, + "loss": 1.1542, + "step": 1410 + }, + { + "epoch": 0.08572817303602892, + "grad_norm": 0.31192660331726074, + "learning_rate": 9.831461827043257e-05, + "loss": 1.1151, + "step": 1411 + }, + { + "epoch": 0.08578893006865544, + "grad_norm": 0.228634312748909, + "learning_rate": 9.831215290127351e-05, + "loss": 1.2332, + "step": 1412 + }, + { + "epoch": 0.08584968710128198, + "grad_norm": 0.25092756748199463, + "learning_rate": 9.830968576122315e-05, + "loss": 1.385, + "step": 1413 + }, + { + "epoch": 0.0859104441339085, + "grad_norm": 0.16668741405010223, + "learning_rate": 9.830721685037191e-05, + "loss": 1.1658, + "step": 1414 + }, + { + "epoch": 0.08597120116653503, + "grad_norm": 0.1422152817249298, + "learning_rate": 9.830474616881029e-05, + "loss": 1.0965, + "step": 1415 + }, + { + "epoch": 0.08603195819916155, + "grad_norm": 0.20908355712890625, + "learning_rate": 9.830227371662888e-05, + "loss": 1.057, + "step": 1416 + }, + { + "epoch": 0.08609271523178808, + "grad_norm": 0.262549489736557, + "learning_rate": 9.829979949391828e-05, + "loss": 1.1591, + "step": 1417 + }, + { + "epoch": 0.08615347226441461, + "grad_norm": 0.964087188243866, + "learning_rate": 9.82973235007692e-05, + "loss": 1.3327, + "step": 1418 + }, + { + "epoch": 0.08621422929704113, + "grad_norm": 0.34942829608917236, + "learning_rate": 9.82948457372724e-05, + "loss": 1.1333, + "step": 1419 + }, + { + "epoch": 0.08627498632966765, + "grad_norm": 0.1866210699081421, + "learning_rate": 9.829236620351868e-05, + "loss": 1.1201, + "step": 1420 + }, + { + "epoch": 0.08633574336229419, + "grad_norm": 0.4157951772212982, + "learning_rate": 9.828988489959894e-05, + "loss": 1.101, + "step": 1421 + }, + { + "epoch": 0.08639650039492071, + "grad_norm": 0.16485145688056946, + "learning_rate": 9.828740182560415e-05, + "loss": 1.1931, + "step": 1422 + }, + { + "epoch": 0.08645725742754724, + "grad_norm": 0.4265294075012207, + "learning_rate": 9.828491698162531e-05, + "loss": 1.1155, + "step": 1423 + }, + { + "epoch": 0.08651801446017376, + "grad_norm": 0.20402522385120392, + "learning_rate": 9.828243036775348e-05, + "loss": 1.1142, + "step": 1424 + }, + { + "epoch": 0.08657877149280029, + "grad_norm": 0.2030715048313141, + "learning_rate": 9.827994198407986e-05, + "loss": 1.1302, + "step": 1425 + }, + { + "epoch": 0.08663952852542682, + "grad_norm": 0.2869037687778473, + "learning_rate": 9.827745183069563e-05, + "loss": 1.1581, + "step": 1426 + }, + { + "epoch": 0.08670028555805334, + "grad_norm": 0.2697184383869171, + "learning_rate": 9.827495990769207e-05, + "loss": 1.2221, + "step": 1427 + }, + { + "epoch": 0.08676104259067988, + "grad_norm": 0.22515903413295746, + "learning_rate": 9.827246621516051e-05, + "loss": 1.1929, + "step": 1428 + }, + { + "epoch": 0.0868217996233064, + "grad_norm": 0.20299787819385529, + "learning_rate": 9.826997075319238e-05, + "loss": 1.1041, + "step": 1429 + }, + { + "epoch": 0.08688255665593292, + "grad_norm": 0.542178750038147, + "learning_rate": 9.826747352187915e-05, + "loss": 1.1418, + "step": 1430 + }, + { + "epoch": 0.08694331368855945, + "grad_norm": 1.657292366027832, + "learning_rate": 9.826497452131234e-05, + "loss": 1.1563, + "step": 1431 + }, + { + "epoch": 0.08700407072118597, + "grad_norm": 0.28403112292289734, + "learning_rate": 9.826247375158355e-05, + "loss": 1.1742, + "step": 1432 + }, + { + "epoch": 0.08706482775381251, + "grad_norm": 0.20294104516506195, + "learning_rate": 9.825997121278448e-05, + "loss": 1.1269, + "step": 1433 + }, + { + "epoch": 0.08712558478643903, + "grad_norm": 0.3333057165145874, + "learning_rate": 9.825746690500681e-05, + "loss": 1.0829, + "step": 1434 + }, + { + "epoch": 0.08718634181906555, + "grad_norm": 0.258637398481369, + "learning_rate": 9.825496082834238e-05, + "loss": 1.1133, + "step": 1435 + }, + { + "epoch": 0.08724709885169209, + "grad_norm": 0.18659645318984985, + "learning_rate": 9.825245298288301e-05, + "loss": 1.142, + "step": 1436 + }, + { + "epoch": 0.08730785588431861, + "grad_norm": 0.7480716705322266, + "learning_rate": 9.824994336872068e-05, + "loss": 1.1573, + "step": 1437 + }, + { + "epoch": 0.08736861291694514, + "grad_norm": 0.17065827548503876, + "learning_rate": 9.824743198594734e-05, + "loss": 1.1086, + "step": 1438 + }, + { + "epoch": 0.08742936994957166, + "grad_norm": 0.2011638879776001, + "learning_rate": 9.824491883465504e-05, + "loss": 1.2348, + "step": 1439 + }, + { + "epoch": 0.08749012698219819, + "grad_norm": 0.29686489701271057, + "learning_rate": 9.824240391493593e-05, + "loss": 1.1317, + "step": 1440 + }, + { + "epoch": 0.08755088401482472, + "grad_norm": 0.24617424607276917, + "learning_rate": 9.823988722688218e-05, + "loss": 1.182, + "step": 1441 + }, + { + "epoch": 0.08761164104745124, + "grad_norm": 0.3014746904373169, + "learning_rate": 9.823736877058602e-05, + "loss": 1.26, + "step": 1442 + }, + { + "epoch": 0.08767239808007776, + "grad_norm": 0.2874422073364258, + "learning_rate": 9.82348485461398e-05, + "loss": 1.0887, + "step": 1443 + }, + { + "epoch": 0.0877331551127043, + "grad_norm": 0.2400795817375183, + "learning_rate": 9.823232655363588e-05, + "loss": 1.266, + "step": 1444 + }, + { + "epoch": 0.08779391214533082, + "grad_norm": 0.18708005547523499, + "learning_rate": 9.822980279316671e-05, + "loss": 1.2317, + "step": 1445 + }, + { + "epoch": 0.08785466917795735, + "grad_norm": 0.26659032702445984, + "learning_rate": 9.82272772648248e-05, + "loss": 1.2477, + "step": 1446 + }, + { + "epoch": 0.08791542621058387, + "grad_norm": 0.18628554046154022, + "learning_rate": 9.82247499687027e-05, + "loss": 1.1306, + "step": 1447 + }, + { + "epoch": 0.0879761832432104, + "grad_norm": 0.2454700917005539, + "learning_rate": 9.822222090489307e-05, + "loss": 1.2603, + "step": 1448 + }, + { + "epoch": 0.08803694027583693, + "grad_norm": 0.18475788831710815, + "learning_rate": 9.821969007348863e-05, + "loss": 1.1994, + "step": 1449 + }, + { + "epoch": 0.08809769730846345, + "grad_norm": 3.490493059158325, + "learning_rate": 9.821715747458213e-05, + "loss": 1.1901, + "step": 1450 + }, + { + "epoch": 0.08815845434108999, + "grad_norm": 0.36890801787376404, + "learning_rate": 9.821462310826639e-05, + "loss": 1.2076, + "step": 1451 + }, + { + "epoch": 0.08821921137371651, + "grad_norm": 0.1761927753686905, + "learning_rate": 9.821208697463434e-05, + "loss": 1.2016, + "step": 1452 + }, + { + "epoch": 0.08827996840634303, + "grad_norm": 0.17505629360675812, + "learning_rate": 9.820954907377891e-05, + "loss": 1.1376, + "step": 1453 + }, + { + "epoch": 0.08834072543896956, + "grad_norm": 0.5224376916885376, + "learning_rate": 9.820700940579312e-05, + "loss": 1.1075, + "step": 1454 + }, + { + "epoch": 0.08840148247159609, + "grad_norm": 0.2143557369709015, + "learning_rate": 9.82044679707701e-05, + "loss": 1.1989, + "step": 1455 + }, + { + "epoch": 0.08846223950422262, + "grad_norm": 0.31814083456993103, + "learning_rate": 9.820192476880299e-05, + "loss": 1.2507, + "step": 1456 + }, + { + "epoch": 0.08852299653684914, + "grad_norm": 0.27409324049949646, + "learning_rate": 9.819937979998501e-05, + "loss": 1.1286, + "step": 1457 + }, + { + "epoch": 0.08858375356947566, + "grad_norm": 0.16574041545391083, + "learning_rate": 9.819683306440945e-05, + "loss": 1.0965, + "step": 1458 + }, + { + "epoch": 0.0886445106021022, + "grad_norm": 0.2751779556274414, + "learning_rate": 9.819428456216966e-05, + "loss": 1.2349, + "step": 1459 + }, + { + "epoch": 0.08870526763472872, + "grad_norm": 0.2155376523733139, + "learning_rate": 9.819173429335904e-05, + "loss": 1.1434, + "step": 1460 + }, + { + "epoch": 0.08876602466735524, + "grad_norm": 0.18793049454689026, + "learning_rate": 9.81891822580711e-05, + "loss": 1.1261, + "step": 1461 + }, + { + "epoch": 0.08882678169998177, + "grad_norm": 0.7266697883605957, + "learning_rate": 9.818662845639936e-05, + "loss": 1.1947, + "step": 1462 + }, + { + "epoch": 0.0888875387326083, + "grad_norm": 0.2719860374927521, + "learning_rate": 9.818407288843743e-05, + "loss": 1.3699, + "step": 1463 + }, + { + "epoch": 0.08894829576523483, + "grad_norm": 0.32996392250061035, + "learning_rate": 9.818151555427901e-05, + "loss": 1.1435, + "step": 1464 + }, + { + "epoch": 0.08900905279786135, + "grad_norm": 0.4552542269229889, + "learning_rate": 9.817895645401783e-05, + "loss": 1.2124, + "step": 1465 + }, + { + "epoch": 0.08906980983048787, + "grad_norm": 0.16603194177150726, + "learning_rate": 9.817639558774768e-05, + "loss": 1.1151, + "step": 1466 + }, + { + "epoch": 0.08913056686311441, + "grad_norm": 0.25212615728378296, + "learning_rate": 9.817383295556242e-05, + "loss": 1.2246, + "step": 1467 + }, + { + "epoch": 0.08919132389574093, + "grad_norm": 0.15637518465518951, + "learning_rate": 9.817126855755601e-05, + "loss": 1.1218, + "step": 1468 + }, + { + "epoch": 0.08925208092836746, + "grad_norm": 0.17580582201480865, + "learning_rate": 9.816870239382245e-05, + "loss": 1.1458, + "step": 1469 + }, + { + "epoch": 0.08931283796099398, + "grad_norm": 0.3365723490715027, + "learning_rate": 9.81661344644558e-05, + "loss": 1.1947, + "step": 1470 + }, + { + "epoch": 0.0893735949936205, + "grad_norm": 0.26322677731513977, + "learning_rate": 9.816356476955015e-05, + "loss": 1.158, + "step": 1471 + }, + { + "epoch": 0.08943435202624704, + "grad_norm": 0.14961378276348114, + "learning_rate": 9.816099330919975e-05, + "loss": 1.1121, + "step": 1472 + }, + { + "epoch": 0.08949510905887356, + "grad_norm": 0.2173193097114563, + "learning_rate": 9.815842008349882e-05, + "loss": 1.1972, + "step": 1473 + }, + { + "epoch": 0.0895558660915001, + "grad_norm": 0.18777205049991608, + "learning_rate": 9.815584509254169e-05, + "loss": 1.1275, + "step": 1474 + }, + { + "epoch": 0.08961662312412662, + "grad_norm": 0.23447135090827942, + "learning_rate": 9.815326833642278e-05, + "loss": 1.0941, + "step": 1475 + }, + { + "epoch": 0.08967738015675314, + "grad_norm": 0.26662781834602356, + "learning_rate": 9.815068981523647e-05, + "loss": 1.2104, + "step": 1476 + }, + { + "epoch": 0.08973813718937967, + "grad_norm": 2.838395357131958, + "learning_rate": 9.814810952907735e-05, + "loss": 1.2111, + "step": 1477 + }, + { + "epoch": 0.0897988942220062, + "grad_norm": 0.17212064564228058, + "learning_rate": 9.814552747803996e-05, + "loss": 1.1448, + "step": 1478 + }, + { + "epoch": 0.08985965125463273, + "grad_norm": 0.38833194971084595, + "learning_rate": 9.814294366221895e-05, + "loss": 1.182, + "step": 1479 + }, + { + "epoch": 0.08992040828725925, + "grad_norm": 0.44230154156684875, + "learning_rate": 9.814035808170902e-05, + "loss": 1.1896, + "step": 1480 + }, + { + "epoch": 0.08998116531988577, + "grad_norm": 0.24242104589939117, + "learning_rate": 9.813777073660498e-05, + "loss": 1.1827, + "step": 1481 + }, + { + "epoch": 0.09004192235251231, + "grad_norm": 0.1917129009962082, + "learning_rate": 9.813518162700164e-05, + "loss": 1.154, + "step": 1482 + }, + { + "epoch": 0.09010267938513883, + "grad_norm": 0.34480440616607666, + "learning_rate": 9.81325907529939e-05, + "loss": 1.1323, + "step": 1483 + }, + { + "epoch": 0.09016343641776535, + "grad_norm": 0.2701270580291748, + "learning_rate": 9.812999811467676e-05, + "loss": 1.1819, + "step": 1484 + }, + { + "epoch": 0.09022419345039188, + "grad_norm": 0.31518250703811646, + "learning_rate": 9.812740371214523e-05, + "loss": 1.0967, + "step": 1485 + }, + { + "epoch": 0.0902849504830184, + "grad_norm": 0.19874174892902374, + "learning_rate": 9.812480754549442e-05, + "loss": 1.1014, + "step": 1486 + }, + { + "epoch": 0.09034570751564494, + "grad_norm": 0.21445833146572113, + "learning_rate": 9.812220961481949e-05, + "loss": 1.172, + "step": 1487 + }, + { + "epoch": 0.09040646454827146, + "grad_norm": 0.21220004558563232, + "learning_rate": 9.811960992021565e-05, + "loss": 1.1239, + "step": 1488 + }, + { + "epoch": 0.09046722158089798, + "grad_norm": 0.23813070356845856, + "learning_rate": 9.811700846177823e-05, + "loss": 1.2943, + "step": 1489 + }, + { + "epoch": 0.09052797861352452, + "grad_norm": 0.2615263760089874, + "learning_rate": 9.811440523960254e-05, + "loss": 1.19, + "step": 1490 + }, + { + "epoch": 0.09058873564615104, + "grad_norm": 0.36815667152404785, + "learning_rate": 9.811180025378404e-05, + "loss": 1.1033, + "step": 1491 + }, + { + "epoch": 0.09064949267877757, + "grad_norm": 0.3342699706554413, + "learning_rate": 9.810919350441818e-05, + "loss": 1.1374, + "step": 1492 + }, + { + "epoch": 0.0907102497114041, + "grad_norm": 0.37105441093444824, + "learning_rate": 9.810658499160056e-05, + "loss": 1.2355, + "step": 1493 + }, + { + "epoch": 0.09077100674403062, + "grad_norm": 0.20406359434127808, + "learning_rate": 9.810397471542676e-05, + "loss": 1.2127, + "step": 1494 + }, + { + "epoch": 0.09083176377665715, + "grad_norm": 0.21318334341049194, + "learning_rate": 9.810136267599246e-05, + "loss": 1.125, + "step": 1495 + }, + { + "epoch": 0.09089252080928367, + "grad_norm": 0.2612029016017914, + "learning_rate": 9.809874887339343e-05, + "loss": 1.1731, + "step": 1496 + }, + { + "epoch": 0.09095327784191021, + "grad_norm": 0.23621846735477448, + "learning_rate": 9.809613330772544e-05, + "loss": 1.1748, + "step": 1497 + }, + { + "epoch": 0.09101403487453673, + "grad_norm": 0.2109217643737793, + "learning_rate": 9.809351597908441e-05, + "loss": 1.1202, + "step": 1498 + }, + { + "epoch": 0.09107479190716325, + "grad_norm": 0.24618826806545258, + "learning_rate": 9.809089688756623e-05, + "loss": 1.1282, + "step": 1499 + }, + { + "epoch": 0.09113554893978978, + "grad_norm": 0.15508413314819336, + "learning_rate": 9.808827603326695e-05, + "loss": 1.1079, + "step": 1500 + }, + { + "epoch": 0.0911963059724163, + "grad_norm": 0.24064220488071442, + "learning_rate": 9.808565341628261e-05, + "loss": 1.1677, + "step": 1501 + }, + { + "epoch": 0.09125706300504284, + "grad_norm": 0.16915874183177948, + "learning_rate": 9.808302903670935e-05, + "loss": 1.1159, + "step": 1502 + }, + { + "epoch": 0.09131782003766936, + "grad_norm": 0.2899004817008972, + "learning_rate": 9.808040289464336e-05, + "loss": 1.0971, + "step": 1503 + }, + { + "epoch": 0.09137857707029588, + "grad_norm": 0.13940022885799408, + "learning_rate": 9.807777499018092e-05, + "loss": 1.1236, + "step": 1504 + }, + { + "epoch": 0.09143933410292242, + "grad_norm": 0.16495467722415924, + "learning_rate": 9.807514532341832e-05, + "loss": 1.085, + "step": 1505 + }, + { + "epoch": 0.09150009113554894, + "grad_norm": 0.19143767654895782, + "learning_rate": 9.8072513894452e-05, + "loss": 1.1395, + "step": 1506 + }, + { + "epoch": 0.09156084816817546, + "grad_norm": 0.2714698612689972, + "learning_rate": 9.806988070337839e-05, + "loss": 1.2117, + "step": 1507 + }, + { + "epoch": 0.091621605200802, + "grad_norm": 0.18536613881587982, + "learning_rate": 9.8067245750294e-05, + "loss": 1.1944, + "step": 1508 + }, + { + "epoch": 0.09168236223342852, + "grad_norm": 0.3692673444747925, + "learning_rate": 9.806460903529544e-05, + "loss": 1.1712, + "step": 1509 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 0.1665329784154892, + "learning_rate": 9.806197055847934e-05, + "loss": 1.1485, + "step": 1510 + }, + { + "epoch": 0.09180387629868157, + "grad_norm": 0.24269534647464752, + "learning_rate": 9.805933031994241e-05, + "loss": 1.2609, + "step": 1511 + }, + { + "epoch": 0.0918646333313081, + "grad_norm": 0.19015014171600342, + "learning_rate": 9.805668831978145e-05, + "loss": 1.2201, + "step": 1512 + }, + { + "epoch": 0.09192539036393463, + "grad_norm": 2.2654175758361816, + "learning_rate": 9.805404455809328e-05, + "loss": 1.1309, + "step": 1513 + }, + { + "epoch": 0.09198614739656115, + "grad_norm": 0.24054457247257233, + "learning_rate": 9.805139903497482e-05, + "loss": 1.3016, + "step": 1514 + }, + { + "epoch": 0.09204690442918768, + "grad_norm": 0.1855025440454483, + "learning_rate": 9.804875175052304e-05, + "loss": 1.11, + "step": 1515 + }, + { + "epoch": 0.0921076614618142, + "grad_norm": 0.15947149693965912, + "learning_rate": 9.804610270483498e-05, + "loss": 1.075, + "step": 1516 + }, + { + "epoch": 0.09216841849444073, + "grad_norm": 0.27962183952331543, + "learning_rate": 9.804345189800774e-05, + "loss": 1.29, + "step": 1517 + }, + { + "epoch": 0.09222917552706726, + "grad_norm": 0.1883019655942917, + "learning_rate": 9.804079933013847e-05, + "loss": 1.1871, + "step": 1518 + }, + { + "epoch": 0.09228993255969378, + "grad_norm": 0.43887776136398315, + "learning_rate": 9.803814500132445e-05, + "loss": 1.1545, + "step": 1519 + }, + { + "epoch": 0.09235068959232032, + "grad_norm": 0.21325574815273285, + "learning_rate": 9.80354889116629e-05, + "loss": 1.0737, + "step": 1520 + }, + { + "epoch": 0.09241144662494684, + "grad_norm": 0.2204999029636383, + "learning_rate": 9.803283106125123e-05, + "loss": 1.166, + "step": 1521 + }, + { + "epoch": 0.09247220365757336, + "grad_norm": 0.5710244178771973, + "learning_rate": 9.803017145018687e-05, + "loss": 1.1305, + "step": 1522 + }, + { + "epoch": 0.0925329606901999, + "grad_norm": 0.5452705025672913, + "learning_rate": 9.802751007856728e-05, + "loss": 1.2947, + "step": 1523 + }, + { + "epoch": 0.09259371772282642, + "grad_norm": 0.28894880414009094, + "learning_rate": 9.802484694649001e-05, + "loss": 1.1546, + "step": 1524 + }, + { + "epoch": 0.09265447475545294, + "grad_norm": 0.25489091873168945, + "learning_rate": 9.802218205405272e-05, + "loss": 1.1156, + "step": 1525 + }, + { + "epoch": 0.09271523178807947, + "grad_norm": 0.2926221787929535, + "learning_rate": 9.801951540135304e-05, + "loss": 1.1505, + "step": 1526 + }, + { + "epoch": 0.09277598882070599, + "grad_norm": 0.2533493936061859, + "learning_rate": 9.801684698848874e-05, + "loss": 1.1537, + "step": 1527 + }, + { + "epoch": 0.09283674585333253, + "grad_norm": 0.2091417759656906, + "learning_rate": 9.801417681555765e-05, + "loss": 1.2596, + "step": 1528 + }, + { + "epoch": 0.09289750288595905, + "grad_norm": 0.3001818358898163, + "learning_rate": 9.801150488265762e-05, + "loss": 1.2709, + "step": 1529 + }, + { + "epoch": 0.09295825991858557, + "grad_norm": 0.23405836522579193, + "learning_rate": 9.80088311898866e-05, + "loss": 1.1304, + "step": 1530 + }, + { + "epoch": 0.0930190169512121, + "grad_norm": 0.20990322530269623, + "learning_rate": 9.80061557373426e-05, + "loss": 1.1991, + "step": 1531 + }, + { + "epoch": 0.09307977398383863, + "grad_norm": 0.198495551943779, + "learning_rate": 9.800347852512366e-05, + "loss": 1.087, + "step": 1532 + }, + { + "epoch": 0.09314053101646516, + "grad_norm": 0.17552435398101807, + "learning_rate": 9.800079955332795e-05, + "loss": 1.1108, + "step": 1533 + }, + { + "epoch": 0.09320128804909168, + "grad_norm": 0.36946913599967957, + "learning_rate": 9.799811882205366e-05, + "loss": 1.1175, + "step": 1534 + }, + { + "epoch": 0.0932620450817182, + "grad_norm": 0.33861300349235535, + "learning_rate": 9.799543633139904e-05, + "loss": 1.2174, + "step": 1535 + }, + { + "epoch": 0.09332280211434474, + "grad_norm": 0.1550939530134201, + "learning_rate": 9.799275208146242e-05, + "loss": 1.1529, + "step": 1536 + }, + { + "epoch": 0.09338355914697126, + "grad_norm": 0.26320332288742065, + "learning_rate": 9.79900660723422e-05, + "loss": 1.2943, + "step": 1537 + }, + { + "epoch": 0.0934443161795978, + "grad_norm": 0.24720866978168488, + "learning_rate": 9.798737830413683e-05, + "loss": 1.1566, + "step": 1538 + }, + { + "epoch": 0.09350507321222432, + "grad_norm": 0.17509174346923828, + "learning_rate": 9.798468877694484e-05, + "loss": 1.0469, + "step": 1539 + }, + { + "epoch": 0.09356583024485084, + "grad_norm": 0.16493506729602814, + "learning_rate": 9.79819974908648e-05, + "loss": 1.1515, + "step": 1540 + }, + { + "epoch": 0.09362658727747737, + "grad_norm": 0.18030470609664917, + "learning_rate": 9.797930444599537e-05, + "loss": 1.1241, + "step": 1541 + }, + { + "epoch": 0.09368734431010389, + "grad_norm": 0.3736095428466797, + "learning_rate": 9.797660964243527e-05, + "loss": 1.2003, + "step": 1542 + }, + { + "epoch": 0.09374810134273043, + "grad_norm": 0.5499204993247986, + "learning_rate": 9.797391308028325e-05, + "loss": 1.1973, + "step": 1543 + }, + { + "epoch": 0.09380885837535695, + "grad_norm": 0.17458085715770721, + "learning_rate": 9.797121475963819e-05, + "loss": 1.1444, + "step": 1544 + }, + { + "epoch": 0.09386961540798347, + "grad_norm": 2.378434658050537, + "learning_rate": 9.796851468059897e-05, + "loss": 1.1095, + "step": 1545 + }, + { + "epoch": 0.09393037244061, + "grad_norm": 0.20916523039340973, + "learning_rate": 9.796581284326458e-05, + "loss": 1.1683, + "step": 1546 + }, + { + "epoch": 0.09399112947323653, + "grad_norm": 0.22721506655216217, + "learning_rate": 9.796310924773404e-05, + "loss": 1.0673, + "step": 1547 + }, + { + "epoch": 0.09405188650586305, + "grad_norm": 0.19539077579975128, + "learning_rate": 9.796040389410645e-05, + "loss": 1.0904, + "step": 1548 + }, + { + "epoch": 0.09411264353848958, + "grad_norm": 0.23285514116287231, + "learning_rate": 9.795769678248101e-05, + "loss": 1.1746, + "step": 1549 + }, + { + "epoch": 0.0941734005711161, + "grad_norm": 0.36843621730804443, + "learning_rate": 9.795498791295692e-05, + "loss": 1.2558, + "step": 1550 + }, + { + "epoch": 0.09423415760374264, + "grad_norm": 0.24364718794822693, + "learning_rate": 9.795227728563346e-05, + "loss": 1.1049, + "step": 1551 + }, + { + "epoch": 0.09429491463636916, + "grad_norm": 0.1703343242406845, + "learning_rate": 9.794956490061002e-05, + "loss": 1.1199, + "step": 1552 + }, + { + "epoch": 0.09435567166899568, + "grad_norm": 0.3334558308124542, + "learning_rate": 9.794685075798602e-05, + "loss": 1.166, + "step": 1553 + }, + { + "epoch": 0.09441642870162222, + "grad_norm": 0.16733412444591522, + "learning_rate": 9.794413485786092e-05, + "loss": 1.1018, + "step": 1554 + }, + { + "epoch": 0.09447718573424874, + "grad_norm": 0.1812572330236435, + "learning_rate": 9.79414172003343e-05, + "loss": 1.1022, + "step": 1555 + }, + { + "epoch": 0.09453794276687527, + "grad_norm": 0.24859310686588287, + "learning_rate": 9.793869778550575e-05, + "loss": 1.2903, + "step": 1556 + }, + { + "epoch": 0.09459869979950179, + "grad_norm": 0.4420587718486786, + "learning_rate": 9.793597661347498e-05, + "loss": 1.1542, + "step": 1557 + }, + { + "epoch": 0.09465945683212831, + "grad_norm": 0.20983166992664337, + "learning_rate": 9.793325368434172e-05, + "loss": 1.2063, + "step": 1558 + }, + { + "epoch": 0.09472021386475485, + "grad_norm": 0.3158051669597626, + "learning_rate": 9.793052899820579e-05, + "loss": 1.1, + "step": 1559 + }, + { + "epoch": 0.09478097089738137, + "grad_norm": 0.2202339470386505, + "learning_rate": 9.792780255516704e-05, + "loss": 1.1121, + "step": 1560 + }, + { + "epoch": 0.0948417279300079, + "grad_norm": 0.3246478736400604, + "learning_rate": 9.792507435532543e-05, + "loss": 1.2683, + "step": 1561 + }, + { + "epoch": 0.09490248496263443, + "grad_norm": 0.5433441400527954, + "learning_rate": 9.792234439878095e-05, + "loss": 1.205, + "step": 1562 + }, + { + "epoch": 0.09496324199526095, + "grad_norm": 0.20413453876972198, + "learning_rate": 9.791961268563368e-05, + "loss": 1.2, + "step": 1563 + }, + { + "epoch": 0.09502399902788748, + "grad_norm": 0.20412945747375488, + "learning_rate": 9.791687921598374e-05, + "loss": 1.1271, + "step": 1564 + }, + { + "epoch": 0.095084756060514, + "grad_norm": 0.3125613033771515, + "learning_rate": 9.791414398993133e-05, + "loss": 1.0784, + "step": 1565 + }, + { + "epoch": 0.09514551309314054, + "grad_norm": 0.22349762916564941, + "learning_rate": 9.791140700757671e-05, + "loss": 1.2507, + "step": 1566 + }, + { + "epoch": 0.09520627012576706, + "grad_norm": 0.30813756585121155, + "learning_rate": 9.790866826902021e-05, + "loss": 1.0591, + "step": 1567 + }, + { + "epoch": 0.09526702715839358, + "grad_norm": 0.13589349389076233, + "learning_rate": 9.790592777436221e-05, + "loss": 1.1089, + "step": 1568 + }, + { + "epoch": 0.09532778419102012, + "grad_norm": 0.3689309358596802, + "learning_rate": 9.790318552370317e-05, + "loss": 1.263, + "step": 1569 + }, + { + "epoch": 0.09538854122364664, + "grad_norm": 0.26732349395751953, + "learning_rate": 9.79004415171436e-05, + "loss": 1.1818, + "step": 1570 + }, + { + "epoch": 0.09544929825627316, + "grad_norm": 0.16413229703903198, + "learning_rate": 9.78976957547841e-05, + "loss": 1.2115, + "step": 1571 + }, + { + "epoch": 0.09551005528889969, + "grad_norm": 0.49256080389022827, + "learning_rate": 9.789494823672529e-05, + "loss": 1.1012, + "step": 1572 + }, + { + "epoch": 0.09557081232152621, + "grad_norm": 0.23552145063877106, + "learning_rate": 9.78921989630679e-05, + "loss": 1.109, + "step": 1573 + }, + { + "epoch": 0.09563156935415275, + "grad_norm": 0.36763328313827515, + "learning_rate": 9.78894479339127e-05, + "loss": 1.1065, + "step": 1574 + }, + { + "epoch": 0.09569232638677927, + "grad_norm": 0.4475506544113159, + "learning_rate": 9.788669514936052e-05, + "loss": 1.2324, + "step": 1575 + }, + { + "epoch": 0.09575308341940579, + "grad_norm": 0.21985404193401337, + "learning_rate": 9.788394060951229e-05, + "loss": 1.0976, + "step": 1576 + }, + { + "epoch": 0.09581384045203233, + "grad_norm": 0.22121183574199677, + "learning_rate": 9.788118431446894e-05, + "loss": 1.1328, + "step": 1577 + }, + { + "epoch": 0.09587459748465885, + "grad_norm": 0.25006103515625, + "learning_rate": 9.787842626433154e-05, + "loss": 1.1504, + "step": 1578 + }, + { + "epoch": 0.09593535451728538, + "grad_norm": 0.18385586142539978, + "learning_rate": 9.787566645920116e-05, + "loss": 1.0625, + "step": 1579 + }, + { + "epoch": 0.0959961115499119, + "grad_norm": 0.2686677873134613, + "learning_rate": 9.787290489917899e-05, + "loss": 1.2092, + "step": 1580 + }, + { + "epoch": 0.09605686858253842, + "grad_norm": 0.14898186922073364, + "learning_rate": 9.787014158436623e-05, + "loss": 1.0999, + "step": 1581 + }, + { + "epoch": 0.09611762561516496, + "grad_norm": 0.2643880546092987, + "learning_rate": 9.786737651486416e-05, + "loss": 1.196, + "step": 1582 + }, + { + "epoch": 0.09617838264779148, + "grad_norm": 0.32479721307754517, + "learning_rate": 9.786460969077417e-05, + "loss": 1.2581, + "step": 1583 + }, + { + "epoch": 0.09623913968041801, + "grad_norm": 0.2819264829158783, + "learning_rate": 9.786184111219765e-05, + "loss": 1.0928, + "step": 1584 + }, + { + "epoch": 0.09629989671304454, + "grad_norm": 0.24512705206871033, + "learning_rate": 9.78590707792361e-05, + "loss": 1.2701, + "step": 1585 + }, + { + "epoch": 0.09636065374567106, + "grad_norm": 0.8442322015762329, + "learning_rate": 9.785629869199104e-05, + "loss": 1.1323, + "step": 1586 + }, + { + "epoch": 0.09642141077829759, + "grad_norm": 2.7841343879699707, + "learning_rate": 9.785352485056413e-05, + "loss": 1.0889, + "step": 1587 + }, + { + "epoch": 0.09648216781092411, + "grad_norm": 0.2092188447713852, + "learning_rate": 9.7850749255057e-05, + "loss": 1.2605, + "step": 1588 + }, + { + "epoch": 0.09654292484355063, + "grad_norm": 0.3294709324836731, + "learning_rate": 9.784797190557142e-05, + "loss": 1.1841, + "step": 1589 + }, + { + "epoch": 0.09660368187617717, + "grad_norm": 0.18616704642772675, + "learning_rate": 9.784519280220916e-05, + "loss": 1.0728, + "step": 1590 + }, + { + "epoch": 0.09666443890880369, + "grad_norm": 0.2453993260860443, + "learning_rate": 9.784241194507212e-05, + "loss": 1.1221, + "step": 1591 + }, + { + "epoch": 0.09672519594143023, + "grad_norm": 0.1521400660276413, + "learning_rate": 9.783962933426223e-05, + "loss": 1.117, + "step": 1592 + }, + { + "epoch": 0.09678595297405675, + "grad_norm": 0.33559900522232056, + "learning_rate": 9.783684496988147e-05, + "loss": 1.1055, + "step": 1593 + }, + { + "epoch": 0.09684671000668327, + "grad_norm": 0.4022634029388428, + "learning_rate": 9.783405885203192e-05, + "loss": 1.1465, + "step": 1594 + }, + { + "epoch": 0.0969074670393098, + "grad_norm": 0.34860312938690186, + "learning_rate": 9.783127098081569e-05, + "loss": 1.2242, + "step": 1595 + }, + { + "epoch": 0.09696822407193632, + "grad_norm": 0.3028814196586609, + "learning_rate": 9.782848135633497e-05, + "loss": 1.1923, + "step": 1596 + }, + { + "epoch": 0.09702898110456286, + "grad_norm": 0.1872156411409378, + "learning_rate": 9.782568997869203e-05, + "loss": 1.1541, + "step": 1597 + }, + { + "epoch": 0.09708973813718938, + "grad_norm": 0.2270052582025528, + "learning_rate": 9.782289684798917e-05, + "loss": 1.194, + "step": 1598 + }, + { + "epoch": 0.0971504951698159, + "grad_norm": 0.27103304862976074, + "learning_rate": 9.78201019643288e-05, + "loss": 1.1189, + "step": 1599 + }, + { + "epoch": 0.09721125220244244, + "grad_norm": 0.18456096947193146, + "learning_rate": 9.781730532781333e-05, + "loss": 1.1643, + "step": 1600 + }, + { + "epoch": 0.09727200923506896, + "grad_norm": 0.2039758712053299, + "learning_rate": 9.781450693854531e-05, + "loss": 1.1189, + "step": 1601 + }, + { + "epoch": 0.09733276626769549, + "grad_norm": 0.5236639976501465, + "learning_rate": 9.781170679662728e-05, + "loss": 1.1399, + "step": 1602 + }, + { + "epoch": 0.09739352330032201, + "grad_norm": 0.8051347136497498, + "learning_rate": 9.78089049021619e-05, + "loss": 1.2496, + "step": 1603 + }, + { + "epoch": 0.09745428033294853, + "grad_norm": 0.35441306233406067, + "learning_rate": 9.780610125525187e-05, + "loss": 1.2011, + "step": 1604 + }, + { + "epoch": 0.09751503736557507, + "grad_norm": 0.32944169640541077, + "learning_rate": 9.780329585599995e-05, + "loss": 1.088, + "step": 1605 + }, + { + "epoch": 0.09757579439820159, + "grad_norm": 0.42272084951400757, + "learning_rate": 9.7800488704509e-05, + "loss": 1.0473, + "step": 1606 + }, + { + "epoch": 0.09763655143082813, + "grad_norm": 0.19113437831401825, + "learning_rate": 9.779767980088189e-05, + "loss": 1.1319, + "step": 1607 + }, + { + "epoch": 0.09769730846345465, + "grad_norm": 0.30906257033348083, + "learning_rate": 9.779486914522157e-05, + "loss": 1.1885, + "step": 1608 + }, + { + "epoch": 0.09775806549608117, + "grad_norm": 0.29779139161109924, + "learning_rate": 9.779205673763109e-05, + "loss": 1.2956, + "step": 1609 + }, + { + "epoch": 0.0978188225287077, + "grad_norm": 0.22943831980228424, + "learning_rate": 9.778924257821353e-05, + "loss": 1.0899, + "step": 1610 + }, + { + "epoch": 0.09787957956133422, + "grad_norm": 0.15976007282733917, + "learning_rate": 9.778642666707206e-05, + "loss": 1.1268, + "step": 1611 + }, + { + "epoch": 0.09794033659396074, + "grad_norm": 0.2557831108570099, + "learning_rate": 9.778360900430987e-05, + "loss": 1.2081, + "step": 1612 + }, + { + "epoch": 0.09800109362658728, + "grad_norm": 0.16285130381584167, + "learning_rate": 9.778078959003027e-05, + "loss": 1.1314, + "step": 1613 + }, + { + "epoch": 0.0980618506592138, + "grad_norm": 0.3343181908130646, + "learning_rate": 9.777796842433658e-05, + "loss": 1.3532, + "step": 1614 + }, + { + "epoch": 0.09812260769184034, + "grad_norm": 0.7450612783432007, + "learning_rate": 9.777514550733224e-05, + "loss": 1.0902, + "step": 1615 + }, + { + "epoch": 0.09818336472446686, + "grad_norm": 0.25191569328308105, + "learning_rate": 9.777232083912069e-05, + "loss": 1.2048, + "step": 1616 + }, + { + "epoch": 0.09824412175709338, + "grad_norm": 1.6702781915664673, + "learning_rate": 9.776949441980549e-05, + "loss": 1.1222, + "step": 1617 + }, + { + "epoch": 0.09830487878971991, + "grad_norm": 0.1711432784795761, + "learning_rate": 9.776666624949023e-05, + "loss": 1.1741, + "step": 1618 + }, + { + "epoch": 0.09836563582234643, + "grad_norm": 0.392383337020874, + "learning_rate": 9.77638363282786e-05, + "loss": 1.1959, + "step": 1619 + }, + { + "epoch": 0.09842639285497297, + "grad_norm": 0.26078468561172485, + "learning_rate": 9.77610046562743e-05, + "loss": 1.1689, + "step": 1620 + }, + { + "epoch": 0.09848714988759949, + "grad_norm": 0.2494201809167862, + "learning_rate": 9.775817123358116e-05, + "loss": 1.2252, + "step": 1621 + }, + { + "epoch": 0.09854790692022601, + "grad_norm": 0.3646562099456787, + "learning_rate": 9.7755336060303e-05, + "loss": 1.186, + "step": 1622 + }, + { + "epoch": 0.09860866395285255, + "grad_norm": 0.240766704082489, + "learning_rate": 9.775249913654378e-05, + "loss": 1.2895, + "step": 1623 + }, + { + "epoch": 0.09866942098547907, + "grad_norm": 1.1815179586410522, + "learning_rate": 9.774966046240748e-05, + "loss": 1.251, + "step": 1624 + }, + { + "epoch": 0.0987301780181056, + "grad_norm": 0.5118951797485352, + "learning_rate": 9.774682003799813e-05, + "loss": 1.1115, + "step": 1625 + }, + { + "epoch": 0.09879093505073212, + "grad_norm": 0.28984978795051575, + "learning_rate": 9.774397786341988e-05, + "loss": 1.1508, + "step": 1626 + }, + { + "epoch": 0.09885169208335864, + "grad_norm": 0.2386028915643692, + "learning_rate": 9.774113393877688e-05, + "loss": 1.0675, + "step": 1627 + }, + { + "epoch": 0.09891244911598518, + "grad_norm": 0.2303244024515152, + "learning_rate": 9.77382882641734e-05, + "loss": 1.2194, + "step": 1628 + }, + { + "epoch": 0.0989732061486117, + "grad_norm": 0.2386283427476883, + "learning_rate": 9.773544083971372e-05, + "loss": 1.2006, + "step": 1629 + }, + { + "epoch": 0.09903396318123822, + "grad_norm": 0.36446717381477356, + "learning_rate": 9.773259166550224e-05, + "loss": 1.1489, + "step": 1630 + }, + { + "epoch": 0.09909472021386476, + "grad_norm": 0.6474947929382324, + "learning_rate": 9.772974074164338e-05, + "loss": 1.1415, + "step": 1631 + }, + { + "epoch": 0.09915547724649128, + "grad_norm": 0.3799828886985779, + "learning_rate": 9.772688806824164e-05, + "loss": 1.1391, + "step": 1632 + }, + { + "epoch": 0.09921623427911781, + "grad_norm": 0.26079803705215454, + "learning_rate": 9.772403364540161e-05, + "loss": 1.2086, + "step": 1633 + }, + { + "epoch": 0.09927699131174433, + "grad_norm": 3.875704050064087, + "learning_rate": 9.77211774732279e-05, + "loss": 1.1272, + "step": 1634 + }, + { + "epoch": 0.09933774834437085, + "grad_norm": 0.5074887871742249, + "learning_rate": 9.77183195518252e-05, + "loss": 1.1026, + "step": 1635 + }, + { + "epoch": 0.09939850537699739, + "grad_norm": 0.21204863488674164, + "learning_rate": 9.771545988129827e-05, + "loss": 1.2959, + "step": 1636 + }, + { + "epoch": 0.09945926240962391, + "grad_norm": 0.2473514825105667, + "learning_rate": 9.771259846175193e-05, + "loss": 1.1836, + "step": 1637 + }, + { + "epoch": 0.09952001944225045, + "grad_norm": 0.26578426361083984, + "learning_rate": 9.770973529329108e-05, + "loss": 1.0606, + "step": 1638 + }, + { + "epoch": 0.09958077647487697, + "grad_norm": 0.20983055233955383, + "learning_rate": 9.770687037602066e-05, + "loss": 1.0774, + "step": 1639 + }, + { + "epoch": 0.09964153350750349, + "grad_norm": 0.23893332481384277, + "learning_rate": 9.770400371004568e-05, + "loss": 1.1362, + "step": 1640 + }, + { + "epoch": 0.09970229054013002, + "grad_norm": 0.2511257827281952, + "learning_rate": 9.770113529547123e-05, + "loss": 1.2297, + "step": 1641 + }, + { + "epoch": 0.09976304757275654, + "grad_norm": 0.18219970166683197, + "learning_rate": 9.769826513240244e-05, + "loss": 1.208, + "step": 1642 + }, + { + "epoch": 0.09982380460538308, + "grad_norm": 0.3269709050655365, + "learning_rate": 9.769539322094453e-05, + "loss": 1.3274, + "step": 1643 + }, + { + "epoch": 0.0998845616380096, + "grad_norm": 0.3039088547229767, + "learning_rate": 9.769251956120276e-05, + "loss": 1.2201, + "step": 1644 + }, + { + "epoch": 0.09994531867063612, + "grad_norm": 0.29194459319114685, + "learning_rate": 9.768964415328245e-05, + "loss": 1.1301, + "step": 1645 + }, + { + "epoch": 0.10000607570326266, + "grad_norm": 0.2589316964149475, + "learning_rate": 9.768676699728903e-05, + "loss": 1.0832, + "step": 1646 + }, + { + "epoch": 0.10006683273588918, + "grad_norm": 0.2882067561149597, + "learning_rate": 9.768388809332796e-05, + "loss": 1.1025, + "step": 1647 + }, + { + "epoch": 0.10012758976851571, + "grad_norm": 0.21772560477256775, + "learning_rate": 9.768100744150473e-05, + "loss": 1.2394, + "step": 1648 + }, + { + "epoch": 0.10018834680114223, + "grad_norm": 0.206012561917305, + "learning_rate": 9.767812504192497e-05, + "loss": 1.0577, + "step": 1649 + }, + { + "epoch": 0.10024910383376875, + "grad_norm": 0.5202322006225586, + "learning_rate": 9.767524089469432e-05, + "loss": 1.2404, + "step": 1650 + }, + { + "epoch": 0.10030986086639529, + "grad_norm": 0.20388440787792206, + "learning_rate": 9.76723549999185e-05, + "loss": 1.064, + "step": 1651 + }, + { + "epoch": 0.10037061789902181, + "grad_norm": 0.16354244947433472, + "learning_rate": 9.766946735770328e-05, + "loss": 1.0964, + "step": 1652 + }, + { + "epoch": 0.10043137493164833, + "grad_norm": 0.17277511954307556, + "learning_rate": 9.766657796815453e-05, + "loss": 1.0597, + "step": 1653 + }, + { + "epoch": 0.10049213196427487, + "grad_norm": 0.2964755892753601, + "learning_rate": 9.766368683137815e-05, + "loss": 1.2554, + "step": 1654 + }, + { + "epoch": 0.10055288899690139, + "grad_norm": 0.16489538550376892, + "learning_rate": 9.766079394748012e-05, + "loss": 1.1278, + "step": 1655 + }, + { + "epoch": 0.10061364602952792, + "grad_norm": 0.30312579870224, + "learning_rate": 9.765789931656646e-05, + "loss": 1.2997, + "step": 1656 + }, + { + "epoch": 0.10067440306215444, + "grad_norm": 0.3645249307155609, + "learning_rate": 9.765500293874331e-05, + "loss": 1.3426, + "step": 1657 + }, + { + "epoch": 0.10073516009478097, + "grad_norm": 0.19458957016468048, + "learning_rate": 9.765210481411681e-05, + "loss": 1.1525, + "step": 1658 + }, + { + "epoch": 0.1007959171274075, + "grad_norm": 0.17949829995632172, + "learning_rate": 9.764920494279319e-05, + "loss": 1.123, + "step": 1659 + }, + { + "epoch": 0.10085667416003402, + "grad_norm": 0.25160884857177734, + "learning_rate": 9.764630332487876e-05, + "loss": 1.1265, + "step": 1660 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 0.2752400040626526, + "learning_rate": 9.764339996047987e-05, + "loss": 1.3065, + "step": 1661 + }, + { + "epoch": 0.10097818822528708, + "grad_norm": 0.2502821087837219, + "learning_rate": 9.764049484970293e-05, + "loss": 1.2764, + "step": 1662 + }, + { + "epoch": 0.1010389452579136, + "grad_norm": 0.20508714020252228, + "learning_rate": 9.763758799265446e-05, + "loss": 1.1136, + "step": 1663 + }, + { + "epoch": 0.10109970229054013, + "grad_norm": 0.2746257483959198, + "learning_rate": 9.763467938944098e-05, + "loss": 1.0847, + "step": 1664 + }, + { + "epoch": 0.10116045932316665, + "grad_norm": 0.21187755465507507, + "learning_rate": 9.763176904016913e-05, + "loss": 1.1526, + "step": 1665 + }, + { + "epoch": 0.10122121635579319, + "grad_norm": 0.27146077156066895, + "learning_rate": 9.762885694494558e-05, + "loss": 1.1298, + "step": 1666 + }, + { + "epoch": 0.10128197338841971, + "grad_norm": 0.19881564378738403, + "learning_rate": 9.762594310387707e-05, + "loss": 1.1272, + "step": 1667 + }, + { + "epoch": 0.10134273042104623, + "grad_norm": 0.23094390332698822, + "learning_rate": 9.762302751707039e-05, + "loss": 1.0704, + "step": 1668 + }, + { + "epoch": 0.10140348745367277, + "grad_norm": 0.18011099100112915, + "learning_rate": 9.762011018463246e-05, + "loss": 1.1111, + "step": 1669 + }, + { + "epoch": 0.10146424448629929, + "grad_norm": 0.15692360699176788, + "learning_rate": 9.761719110667019e-05, + "loss": 1.1067, + "step": 1670 + }, + { + "epoch": 0.10152500151892582, + "grad_norm": 0.23953652381896973, + "learning_rate": 9.761427028329056e-05, + "loss": 1.2114, + "step": 1671 + }, + { + "epoch": 0.10158575855155234, + "grad_norm": 0.2880992889404297, + "learning_rate": 9.761134771460064e-05, + "loss": 1.0508, + "step": 1672 + }, + { + "epoch": 0.10164651558417886, + "grad_norm": 1.0627996921539307, + "learning_rate": 9.760842340070758e-05, + "loss": 1.0736, + "step": 1673 + }, + { + "epoch": 0.1017072726168054, + "grad_norm": 0.19745668768882751, + "learning_rate": 9.760549734171856e-05, + "loss": 1.1869, + "step": 1674 + }, + { + "epoch": 0.10176802964943192, + "grad_norm": 0.23604899644851685, + "learning_rate": 9.760256953774083e-05, + "loss": 1.1255, + "step": 1675 + }, + { + "epoch": 0.10182878668205844, + "grad_norm": 0.2554061710834503, + "learning_rate": 9.759963998888171e-05, + "loss": 1.1129, + "step": 1676 + }, + { + "epoch": 0.10188954371468498, + "grad_norm": 0.35546228289604187, + "learning_rate": 9.759670869524858e-05, + "loss": 1.181, + "step": 1677 + }, + { + "epoch": 0.1019503007473115, + "grad_norm": 0.3503586947917938, + "learning_rate": 9.75937756569489e-05, + "loss": 1.25, + "step": 1678 + }, + { + "epoch": 0.10201105777993803, + "grad_norm": 0.20203061401844025, + "learning_rate": 9.759084087409017e-05, + "loss": 1.1654, + "step": 1679 + }, + { + "epoch": 0.10207181481256455, + "grad_norm": 0.36170125007629395, + "learning_rate": 9.758790434677998e-05, + "loss": 1.1405, + "step": 1680 + }, + { + "epoch": 0.10213257184519108, + "grad_norm": 0.4543344974517822, + "learning_rate": 9.758496607512594e-05, + "loss": 1.2559, + "step": 1681 + }, + { + "epoch": 0.10219332887781761, + "grad_norm": 0.20456837117671967, + "learning_rate": 9.758202605923579e-05, + "loss": 1.1705, + "step": 1682 + }, + { + "epoch": 0.10225408591044413, + "grad_norm": 0.22422146797180176, + "learning_rate": 9.757908429921725e-05, + "loss": 1.1844, + "step": 1683 + }, + { + "epoch": 0.10231484294307067, + "grad_norm": 0.2442871630191803, + "learning_rate": 9.757614079517821e-05, + "loss": 1.1151, + "step": 1684 + }, + { + "epoch": 0.10237559997569719, + "grad_norm": 0.1535910964012146, + "learning_rate": 9.75731955472265e-05, + "loss": 1.1399, + "step": 1685 + }, + { + "epoch": 0.10243635700832371, + "grad_norm": 0.25678959488868713, + "learning_rate": 9.757024855547014e-05, + "loss": 1.1288, + "step": 1686 + }, + { + "epoch": 0.10249711404095024, + "grad_norm": 0.3460509777069092, + "learning_rate": 9.756729982001712e-05, + "loss": 1.2216, + "step": 1687 + }, + { + "epoch": 0.10255787107357676, + "grad_norm": 0.17896756529808044, + "learning_rate": 9.756434934097552e-05, + "loss": 1.1684, + "step": 1688 + }, + { + "epoch": 0.1026186281062033, + "grad_norm": 0.2536429166793823, + "learning_rate": 9.756139711845349e-05, + "loss": 1.1846, + "step": 1689 + }, + { + "epoch": 0.10267938513882982, + "grad_norm": 0.33528488874435425, + "learning_rate": 9.755844315255928e-05, + "loss": 1.2888, + "step": 1690 + }, + { + "epoch": 0.10274014217145634, + "grad_norm": 0.18117205798625946, + "learning_rate": 9.755548744340112e-05, + "loss": 1.08, + "step": 1691 + }, + { + "epoch": 0.10280089920408288, + "grad_norm": 0.1998588591814041, + "learning_rate": 9.75525299910874e-05, + "loss": 1.0944, + "step": 1692 + }, + { + "epoch": 0.1028616562367094, + "grad_norm": 0.9771398901939392, + "learning_rate": 9.754957079572649e-05, + "loss": 1.1342, + "step": 1693 + }, + { + "epoch": 0.10292241326933592, + "grad_norm": 0.5298376679420471, + "learning_rate": 9.754660985742688e-05, + "loss": 1.2322, + "step": 1694 + }, + { + "epoch": 0.10298317030196245, + "grad_norm": 0.2937504053115845, + "learning_rate": 9.754364717629707e-05, + "loss": 1.0599, + "step": 1695 + }, + { + "epoch": 0.10304392733458898, + "grad_norm": 0.2285371869802475, + "learning_rate": 9.75406827524457e-05, + "loss": 1.1776, + "step": 1696 + }, + { + "epoch": 0.10310468436721551, + "grad_norm": 0.3292471468448639, + "learning_rate": 9.753771658598141e-05, + "loss": 1.134, + "step": 1697 + }, + { + "epoch": 0.10316544139984203, + "grad_norm": 0.2330576777458191, + "learning_rate": 9.753474867701294e-05, + "loss": 1.1176, + "step": 1698 + }, + { + "epoch": 0.10322619843246855, + "grad_norm": 0.24973610043525696, + "learning_rate": 9.753177902564905e-05, + "loss": 1.1582, + "step": 1699 + }, + { + "epoch": 0.10328695546509509, + "grad_norm": 0.9256431460380554, + "learning_rate": 9.752880763199864e-05, + "loss": 1.1076, + "step": 1700 + }, + { + "epoch": 0.10334771249772161, + "grad_norm": 0.26770707964897156, + "learning_rate": 9.752583449617058e-05, + "loss": 1.1696, + "step": 1701 + }, + { + "epoch": 0.10340846953034814, + "grad_norm": 4.23089075088501, + "learning_rate": 9.752285961827388e-05, + "loss": 1.1308, + "step": 1702 + }, + { + "epoch": 0.10346922656297466, + "grad_norm": 0.3109228312969208, + "learning_rate": 9.751988299841756e-05, + "loss": 1.1146, + "step": 1703 + }, + { + "epoch": 0.10352998359560119, + "grad_norm": 0.1922626793384552, + "learning_rate": 9.751690463671075e-05, + "loss": 1.0804, + "step": 1704 + }, + { + "epoch": 0.10359074062822772, + "grad_norm": 0.316914439201355, + "learning_rate": 9.751392453326262e-05, + "loss": 1.1219, + "step": 1705 + }, + { + "epoch": 0.10365149766085424, + "grad_norm": 0.5654424428939819, + "learning_rate": 9.751094268818239e-05, + "loss": 1.1409, + "step": 1706 + }, + { + "epoch": 0.10371225469348078, + "grad_norm": 0.31209778785705566, + "learning_rate": 9.750795910157939e-05, + "loss": 1.1141, + "step": 1707 + }, + { + "epoch": 0.1037730117261073, + "grad_norm": 0.19559776782989502, + "learning_rate": 9.750497377356296e-05, + "loss": 1.1685, + "step": 1708 + }, + { + "epoch": 0.10383376875873382, + "grad_norm": 0.19794543087482452, + "learning_rate": 9.750198670424254e-05, + "loss": 1.1754, + "step": 1709 + }, + { + "epoch": 0.10389452579136035, + "grad_norm": 0.23751486837863922, + "learning_rate": 9.749899789372759e-05, + "loss": 1.1905, + "step": 1710 + }, + { + "epoch": 0.10395528282398687, + "grad_norm": 0.27798542380332947, + "learning_rate": 9.749600734212771e-05, + "loss": 1.1134, + "step": 1711 + }, + { + "epoch": 0.10401603985661341, + "grad_norm": 0.237484872341156, + "learning_rate": 9.74930150495525e-05, + "loss": 1.0911, + "step": 1712 + }, + { + "epoch": 0.10407679688923993, + "grad_norm": 6.3565354347229, + "learning_rate": 9.749002101611164e-05, + "loss": 1.1332, + "step": 1713 + }, + { + "epoch": 0.10413755392186645, + "grad_norm": 0.19158251583576202, + "learning_rate": 9.748702524191489e-05, + "loss": 1.1536, + "step": 1714 + }, + { + "epoch": 0.10419831095449299, + "grad_norm": 0.19259385764598846, + "learning_rate": 9.748402772707204e-05, + "loss": 1.0904, + "step": 1715 + }, + { + "epoch": 0.10425906798711951, + "grad_norm": 0.28275877237319946, + "learning_rate": 9.748102847169296e-05, + "loss": 1.1084, + "step": 1716 + }, + { + "epoch": 0.10431982501974603, + "grad_norm": 0.19667786359786987, + "learning_rate": 9.747802747588763e-05, + "loss": 1.2508, + "step": 1717 + }, + { + "epoch": 0.10438058205237256, + "grad_norm": 0.4985334873199463, + "learning_rate": 9.747502473976602e-05, + "loss": 1.2777, + "step": 1718 + }, + { + "epoch": 0.10444133908499909, + "grad_norm": 0.16044367849826813, + "learning_rate": 9.74720202634382e-05, + "loss": 1.1079, + "step": 1719 + }, + { + "epoch": 0.10450209611762562, + "grad_norm": 0.21739163994789124, + "learning_rate": 9.74690140470143e-05, + "loss": 1.2065, + "step": 1720 + }, + { + "epoch": 0.10456285315025214, + "grad_norm": 0.22656330466270447, + "learning_rate": 9.746600609060451e-05, + "loss": 1.1168, + "step": 1721 + }, + { + "epoch": 0.10462361018287866, + "grad_norm": 0.18773066997528076, + "learning_rate": 9.746299639431909e-05, + "loss": 1.1428, + "step": 1722 + }, + { + "epoch": 0.1046843672155052, + "grad_norm": 0.2294616550207138, + "learning_rate": 9.745998495826836e-05, + "loss": 1.1423, + "step": 1723 + }, + { + "epoch": 0.10474512424813172, + "grad_norm": 0.2877761125564575, + "learning_rate": 9.745697178256273e-05, + "loss": 1.1063, + "step": 1724 + }, + { + "epoch": 0.10480588128075825, + "grad_norm": 0.34316951036453247, + "learning_rate": 9.74539568673126e-05, + "loss": 1.2422, + "step": 1725 + }, + { + "epoch": 0.10486663831338477, + "grad_norm": 13.001538276672363, + "learning_rate": 9.745094021262852e-05, + "loss": 1.0724, + "step": 1726 + }, + { + "epoch": 0.1049273953460113, + "grad_norm": 0.5038268566131592, + "learning_rate": 9.744792181862106e-05, + "loss": 1.2204, + "step": 1727 + }, + { + "epoch": 0.10498815237863783, + "grad_norm": 0.38262248039245605, + "learning_rate": 9.744490168540083e-05, + "loss": 1.1701, + "step": 1728 + }, + { + "epoch": 0.10504890941126435, + "grad_norm": 0.2696792483329773, + "learning_rate": 9.74418798130786e-05, + "loss": 1.2236, + "step": 1729 + }, + { + "epoch": 0.10510966644389089, + "grad_norm": 0.5042509436607361, + "learning_rate": 9.743885620176506e-05, + "loss": 1.1234, + "step": 1730 + }, + { + "epoch": 0.10517042347651741, + "grad_norm": 0.21859440207481384, + "learning_rate": 9.743583085157109e-05, + "loss": 1.2039, + "step": 1731 + }, + { + "epoch": 0.10523118050914393, + "grad_norm": 0.2642357051372528, + "learning_rate": 9.743280376260758e-05, + "loss": 1.2029, + "step": 1732 + }, + { + "epoch": 0.10529193754177046, + "grad_norm": 0.4985358417034149, + "learning_rate": 9.742977493498546e-05, + "loss": 1.102, + "step": 1733 + }, + { + "epoch": 0.10535269457439699, + "grad_norm": 0.17374558746814728, + "learning_rate": 9.742674436881578e-05, + "loss": 1.0923, + "step": 1734 + }, + { + "epoch": 0.1054134516070235, + "grad_norm": 0.35733723640441895, + "learning_rate": 9.742371206420962e-05, + "loss": 1.1821, + "step": 1735 + }, + { + "epoch": 0.10547420863965004, + "grad_norm": 0.4453296363353729, + "learning_rate": 9.742067802127812e-05, + "loss": 1.0994, + "step": 1736 + }, + { + "epoch": 0.10553496567227656, + "grad_norm": 0.193424254655838, + "learning_rate": 9.74176422401325e-05, + "loss": 1.2039, + "step": 1737 + }, + { + "epoch": 0.1055957227049031, + "grad_norm": 0.48401230573654175, + "learning_rate": 9.741460472088404e-05, + "loss": 1.2039, + "step": 1738 + }, + { + "epoch": 0.10565647973752962, + "grad_norm": 0.2332983911037445, + "learning_rate": 9.74115654636441e-05, + "loss": 1.1414, + "step": 1739 + }, + { + "epoch": 0.10571723677015614, + "grad_norm": 0.2843562662601471, + "learning_rate": 9.740852446852403e-05, + "loss": 1.092, + "step": 1740 + }, + { + "epoch": 0.10577799380278267, + "grad_norm": 0.21252241730690002, + "learning_rate": 9.740548173563536e-05, + "loss": 1.1014, + "step": 1741 + }, + { + "epoch": 0.1058387508354092, + "grad_norm": 0.3645320236682892, + "learning_rate": 9.740243726508957e-05, + "loss": 1.0982, + "step": 1742 + }, + { + "epoch": 0.10589950786803573, + "grad_norm": 0.2132883071899414, + "learning_rate": 9.739939105699829e-05, + "loss": 1.0891, + "step": 1743 + }, + { + "epoch": 0.10596026490066225, + "grad_norm": 0.19943320751190186, + "learning_rate": 9.739634311147318e-05, + "loss": 1.1056, + "step": 1744 + }, + { + "epoch": 0.10602102193328877, + "grad_norm": 0.19694775342941284, + "learning_rate": 9.739329342862593e-05, + "loss": 1.1578, + "step": 1745 + }, + { + "epoch": 0.10608177896591531, + "grad_norm": 0.32192084193229675, + "learning_rate": 9.739024200856836e-05, + "loss": 1.2166, + "step": 1746 + }, + { + "epoch": 0.10614253599854183, + "grad_norm": 0.29021573066711426, + "learning_rate": 9.738718885141231e-05, + "loss": 1.0897, + "step": 1747 + }, + { + "epoch": 0.10620329303116836, + "grad_norm": 0.18188691139221191, + "learning_rate": 9.73841339572697e-05, + "loss": 1.1592, + "step": 1748 + }, + { + "epoch": 0.10626405006379488, + "grad_norm": 0.4569520354270935, + "learning_rate": 9.738107732625249e-05, + "loss": 1.1454, + "step": 1749 + }, + { + "epoch": 0.1063248070964214, + "grad_norm": 0.27050137519836426, + "learning_rate": 9.737801895847273e-05, + "loss": 1.1302, + "step": 1750 + }, + { + "epoch": 0.10638556412904794, + "grad_norm": 0.16040939092636108, + "learning_rate": 9.737495885404253e-05, + "loss": 1.1544, + "step": 1751 + }, + { + "epoch": 0.10644632116167446, + "grad_norm": 0.15606291592121124, + "learning_rate": 9.737189701307405e-05, + "loss": 1.1086, + "step": 1752 + }, + { + "epoch": 0.106507078194301, + "grad_norm": 0.1992669552564621, + "learning_rate": 9.736883343567955e-05, + "loss": 1.0962, + "step": 1753 + }, + { + "epoch": 0.10656783522692752, + "grad_norm": 0.23568962514400482, + "learning_rate": 9.736576812197129e-05, + "loss": 1.2839, + "step": 1754 + }, + { + "epoch": 0.10662859225955404, + "grad_norm": 0.21118885278701782, + "learning_rate": 9.736270107206164e-05, + "loss": 1.345, + "step": 1755 + }, + { + "epoch": 0.10668934929218057, + "grad_norm": 0.3307506740093231, + "learning_rate": 9.735963228606303e-05, + "loss": 1.0882, + "step": 1756 + }, + { + "epoch": 0.1067501063248071, + "grad_norm": 0.19812536239624023, + "learning_rate": 9.735656176408793e-05, + "loss": 1.1122, + "step": 1757 + }, + { + "epoch": 0.10681086335743362, + "grad_norm": 0.24365496635437012, + "learning_rate": 9.735348950624895e-05, + "loss": 1.0807, + "step": 1758 + }, + { + "epoch": 0.10687162039006015, + "grad_norm": 0.18023565411567688, + "learning_rate": 9.735041551265862e-05, + "loss": 1.147, + "step": 1759 + }, + { + "epoch": 0.10693237742268667, + "grad_norm": 0.18322375416755676, + "learning_rate": 9.734733978342965e-05, + "loss": 1.114, + "step": 1760 + }, + { + "epoch": 0.10699313445531321, + "grad_norm": 0.1496008038520813, + "learning_rate": 9.734426231867482e-05, + "loss": 1.1278, + "step": 1761 + }, + { + "epoch": 0.10705389148793973, + "grad_norm": 0.23670625686645508, + "learning_rate": 9.734118311850688e-05, + "loss": 1.1621, + "step": 1762 + }, + { + "epoch": 0.10711464852056625, + "grad_norm": 0.5887247323989868, + "learning_rate": 9.733810218303873e-05, + "loss": 1.3251, + "step": 1763 + }, + { + "epoch": 0.10717540555319278, + "grad_norm": 0.35178157687187195, + "learning_rate": 9.733501951238329e-05, + "loss": 1.2094, + "step": 1764 + }, + { + "epoch": 0.1072361625858193, + "grad_norm": 0.19476966559886932, + "learning_rate": 9.733193510665358e-05, + "loss": 1.0805, + "step": 1765 + }, + { + "epoch": 0.10729691961844584, + "grad_norm": 0.20517274737358093, + "learning_rate": 9.732884896596262e-05, + "loss": 1.1121, + "step": 1766 + }, + { + "epoch": 0.10735767665107236, + "grad_norm": 0.1934758722782135, + "learning_rate": 9.732576109042355e-05, + "loss": 1.1029, + "step": 1767 + }, + { + "epoch": 0.10741843368369888, + "grad_norm": 0.28425684571266174, + "learning_rate": 9.732267148014955e-05, + "loss": 1.2216, + "step": 1768 + }, + { + "epoch": 0.10747919071632542, + "grad_norm": 0.9072685837745667, + "learning_rate": 9.73195801352539e-05, + "loss": 1.1507, + "step": 1769 + }, + { + "epoch": 0.10753994774895194, + "grad_norm": 0.4446607232093811, + "learning_rate": 9.731648705584989e-05, + "loss": 1.1356, + "step": 1770 + }, + { + "epoch": 0.10760070478157847, + "grad_norm": 1.4298312664031982, + "learning_rate": 9.73133922420509e-05, + "loss": 1.2759, + "step": 1771 + }, + { + "epoch": 0.107661461814205, + "grad_norm": 0.2696684002876282, + "learning_rate": 9.731029569397037e-05, + "loss": 1.2024, + "step": 1772 + }, + { + "epoch": 0.10772221884683152, + "grad_norm": 0.20106251537799835, + "learning_rate": 9.730719741172181e-05, + "loss": 1.2146, + "step": 1773 + }, + { + "epoch": 0.10778297587945805, + "grad_norm": 0.26841217279434204, + "learning_rate": 9.730409739541877e-05, + "loss": 1.3705, + "step": 1774 + }, + { + "epoch": 0.10784373291208457, + "grad_norm": 0.1646181046962738, + "learning_rate": 9.730099564517491e-05, + "loss": 1.1204, + "step": 1775 + }, + { + "epoch": 0.10790448994471111, + "grad_norm": 0.25667041540145874, + "learning_rate": 9.729789216110392e-05, + "loss": 1.0921, + "step": 1776 + }, + { + "epoch": 0.10796524697733763, + "grad_norm": 0.33628520369529724, + "learning_rate": 9.729478694331954e-05, + "loss": 1.1826, + "step": 1777 + }, + { + "epoch": 0.10802600400996415, + "grad_norm": 0.21840974688529968, + "learning_rate": 9.72916799919356e-05, + "loss": 1.2281, + "step": 1778 + }, + { + "epoch": 0.10808676104259068, + "grad_norm": 0.8014854788780212, + "learning_rate": 9.7288571307066e-05, + "loss": 1.2282, + "step": 1779 + }, + { + "epoch": 0.1081475180752172, + "grad_norm": 0.2203182727098465, + "learning_rate": 9.728546088882466e-05, + "loss": 1.1345, + "step": 1780 + }, + { + "epoch": 0.10820827510784373, + "grad_norm": 0.463958740234375, + "learning_rate": 9.728234873732562e-05, + "loss": 1.109, + "step": 1781 + }, + { + "epoch": 0.10826903214047026, + "grad_norm": 0.32588332891464233, + "learning_rate": 9.727923485268296e-05, + "loss": 1.165, + "step": 1782 + }, + { + "epoch": 0.10832978917309678, + "grad_norm": 0.2140774130821228, + "learning_rate": 9.727611923501079e-05, + "loss": 1.2313, + "step": 1783 + }, + { + "epoch": 0.10839054620572332, + "grad_norm": 0.2978186309337616, + "learning_rate": 9.727300188442334e-05, + "loss": 1.1756, + "step": 1784 + }, + { + "epoch": 0.10845130323834984, + "grad_norm": 0.18642206490039825, + "learning_rate": 9.726988280103485e-05, + "loss": 1.0946, + "step": 1785 + }, + { + "epoch": 0.10851206027097636, + "grad_norm": 0.2392071634531021, + "learning_rate": 9.726676198495968e-05, + "loss": 1.2295, + "step": 1786 + }, + { + "epoch": 0.1085728173036029, + "grad_norm": 0.3410801887512207, + "learning_rate": 9.726363943631223e-05, + "loss": 1.1238, + "step": 1787 + }, + { + "epoch": 0.10863357433622942, + "grad_norm": 0.19988597929477692, + "learning_rate": 9.726051515520692e-05, + "loss": 1.0833, + "step": 1788 + }, + { + "epoch": 0.10869433136885595, + "grad_norm": 0.48140695691108704, + "learning_rate": 9.725738914175831e-05, + "loss": 1.1641, + "step": 1789 + }, + { + "epoch": 0.10875508840148247, + "grad_norm": 0.25766366720199585, + "learning_rate": 9.725426139608094e-05, + "loss": 1.1248, + "step": 1790 + }, + { + "epoch": 0.108815845434109, + "grad_norm": 0.7419375777244568, + "learning_rate": 9.72511319182895e-05, + "loss": 1.2262, + "step": 1791 + }, + { + "epoch": 0.10887660246673553, + "grad_norm": 0.3252478241920471, + "learning_rate": 9.724800070849869e-05, + "loss": 1.1827, + "step": 1792 + }, + { + "epoch": 0.10893735949936205, + "grad_norm": 0.3237375020980835, + "learning_rate": 9.724486776682328e-05, + "loss": 1.3831, + "step": 1793 + }, + { + "epoch": 0.10899811653198858, + "grad_norm": 0.1870550662279129, + "learning_rate": 9.72417330933781e-05, + "loss": 1.1907, + "step": 1794 + }, + { + "epoch": 0.1090588735646151, + "grad_norm": 0.27788177132606506, + "learning_rate": 9.723859668827809e-05, + "loss": 1.1254, + "step": 1795 + }, + { + "epoch": 0.10911963059724163, + "grad_norm": 0.49869078397750854, + "learning_rate": 9.723545855163816e-05, + "loss": 1.2561, + "step": 1796 + }, + { + "epoch": 0.10918038762986816, + "grad_norm": 0.7547779083251953, + "learning_rate": 9.723231868357337e-05, + "loss": 1.1466, + "step": 1797 + }, + { + "epoch": 0.10924114466249468, + "grad_norm": 0.5552695393562317, + "learning_rate": 9.722917708419881e-05, + "loss": 1.111, + "step": 1798 + }, + { + "epoch": 0.1093019016951212, + "grad_norm": 0.17805175483226776, + "learning_rate": 9.722603375362963e-05, + "loss": 1.1214, + "step": 1799 + }, + { + "epoch": 0.10936265872774774, + "grad_norm": 0.4798164963722229, + "learning_rate": 9.722288869198106e-05, + "loss": 1.1838, + "step": 1800 + }, + { + "epoch": 0.10942341576037426, + "grad_norm": 0.5813899040222168, + "learning_rate": 9.721974189936837e-05, + "loss": 1.2165, + "step": 1801 + }, + { + "epoch": 0.1094841727930008, + "grad_norm": 0.17604577541351318, + "learning_rate": 9.721659337590693e-05, + "loss": 1.1176, + "step": 1802 + }, + { + "epoch": 0.10954492982562732, + "grad_norm": 0.9933179616928101, + "learning_rate": 9.721344312171213e-05, + "loss": 1.2117, + "step": 1803 + }, + { + "epoch": 0.10960568685825384, + "grad_norm": 0.3179173767566681, + "learning_rate": 9.721029113689943e-05, + "loss": 1.0823, + "step": 1804 + }, + { + "epoch": 0.10966644389088037, + "grad_norm": 1.0834643840789795, + "learning_rate": 9.72071374215844e-05, + "loss": 1.1301, + "step": 1805 + }, + { + "epoch": 0.10972720092350689, + "grad_norm": 0.29624152183532715, + "learning_rate": 9.72039819758826e-05, + "loss": 1.1041, + "step": 1806 + }, + { + "epoch": 0.10978795795613343, + "grad_norm": 0.3164976239204407, + "learning_rate": 9.720082479990974e-05, + "loss": 1.2096, + "step": 1807 + }, + { + "epoch": 0.10984871498875995, + "grad_norm": 0.25658395886421204, + "learning_rate": 9.719766589378152e-05, + "loss": 1.0777, + "step": 1808 + }, + { + "epoch": 0.10990947202138647, + "grad_norm": 0.3463870882987976, + "learning_rate": 9.719450525761373e-05, + "loss": 1.1936, + "step": 1809 + }, + { + "epoch": 0.109970229054013, + "grad_norm": 0.2338787317276001, + "learning_rate": 9.719134289152224e-05, + "loss": 1.1548, + "step": 1810 + }, + { + "epoch": 0.11003098608663953, + "grad_norm": 0.16285766661167145, + "learning_rate": 9.718817879562293e-05, + "loss": 1.1017, + "step": 1811 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 0.4239717423915863, + "learning_rate": 9.718501297003181e-05, + "loss": 1.2376, + "step": 1812 + }, + { + "epoch": 0.11015250015189258, + "grad_norm": 0.24120941758155823, + "learning_rate": 9.718184541486492e-05, + "loss": 1.097, + "step": 1813 + }, + { + "epoch": 0.1102132571845191, + "grad_norm": 0.3054282069206238, + "learning_rate": 9.717867613023838e-05, + "loss": 1.1179, + "step": 1814 + }, + { + "epoch": 0.11027401421714564, + "grad_norm": 0.2704208493232727, + "learning_rate": 9.717550511626834e-05, + "loss": 1.1473, + "step": 1815 + }, + { + "epoch": 0.11033477124977216, + "grad_norm": 0.500048816204071, + "learning_rate": 9.717233237307104e-05, + "loss": 1.2067, + "step": 1816 + }, + { + "epoch": 0.1103955282823987, + "grad_norm": 0.2887870967388153, + "learning_rate": 9.716915790076278e-05, + "loss": 1.2325, + "step": 1817 + }, + { + "epoch": 0.11045628531502522, + "grad_norm": 0.2627791166305542, + "learning_rate": 9.71659816994599e-05, + "loss": 1.1396, + "step": 1818 + }, + { + "epoch": 0.11051704234765174, + "grad_norm": 0.20322081446647644, + "learning_rate": 9.716280376927887e-05, + "loss": 1.1051, + "step": 1819 + }, + { + "epoch": 0.11057779938027827, + "grad_norm": 0.18145835399627686, + "learning_rate": 9.715962411033615e-05, + "loss": 1.1262, + "step": 1820 + }, + { + "epoch": 0.11063855641290479, + "grad_norm": 0.31508344411849976, + "learning_rate": 9.715644272274828e-05, + "loss": 1.2081, + "step": 1821 + }, + { + "epoch": 0.11069931344553131, + "grad_norm": 0.560475766658783, + "learning_rate": 9.715325960663188e-05, + "loss": 1.3645, + "step": 1822 + }, + { + "epoch": 0.11076007047815785, + "grad_norm": 0.1997685581445694, + "learning_rate": 9.715007476210364e-05, + "loss": 1.1891, + "step": 1823 + }, + { + "epoch": 0.11082082751078437, + "grad_norm": 0.29143574833869934, + "learning_rate": 9.714688818928031e-05, + "loss": 1.0803, + "step": 1824 + }, + { + "epoch": 0.1108815845434109, + "grad_norm": 0.2193433791399002, + "learning_rate": 9.714369988827867e-05, + "loss": 1.0965, + "step": 1825 + }, + { + "epoch": 0.11094234157603743, + "grad_norm": 0.6419479250907898, + "learning_rate": 9.71405098592156e-05, + "loss": 1.095, + "step": 1826 + }, + { + "epoch": 0.11100309860866395, + "grad_norm": 0.2430139183998108, + "learning_rate": 9.713731810220803e-05, + "loss": 1.0786, + "step": 1827 + }, + { + "epoch": 0.11106385564129048, + "grad_norm": 0.22759728133678436, + "learning_rate": 9.713412461737296e-05, + "loss": 1.1457, + "step": 1828 + }, + { + "epoch": 0.111124612673917, + "grad_norm": 0.3639973998069763, + "learning_rate": 9.713092940482743e-05, + "loss": 1.1163, + "step": 1829 + }, + { + "epoch": 0.11118536970654354, + "grad_norm": 8.10069465637207, + "learning_rate": 9.712773246468859e-05, + "loss": 1.2446, + "step": 1830 + }, + { + "epoch": 0.11124612673917006, + "grad_norm": 0.19557404518127441, + "learning_rate": 9.712453379707357e-05, + "loss": 1.1389, + "step": 1831 + }, + { + "epoch": 0.11130688377179658, + "grad_norm": 0.23012775182724, + "learning_rate": 9.712133340209969e-05, + "loss": 1.2565, + "step": 1832 + }, + { + "epoch": 0.11136764080442312, + "grad_norm": 0.19063954055309296, + "learning_rate": 9.711813127988422e-05, + "loss": 1.0712, + "step": 1833 + }, + { + "epoch": 0.11142839783704964, + "grad_norm": 0.44537022709846497, + "learning_rate": 9.711492743054453e-05, + "loss": 1.1444, + "step": 1834 + }, + { + "epoch": 0.11148915486967617, + "grad_norm": 0.146654412150383, + "learning_rate": 9.711172185419808e-05, + "loss": 1.1223, + "step": 1835 + }, + { + "epoch": 0.11154991190230269, + "grad_norm": 0.1751268208026886, + "learning_rate": 9.710851455096235e-05, + "loss": 1.1225, + "step": 1836 + }, + { + "epoch": 0.11161066893492921, + "grad_norm": 0.34237268567085266, + "learning_rate": 9.71053055209549e-05, + "loss": 1.0778, + "step": 1837 + }, + { + "epoch": 0.11167142596755575, + "grad_norm": 0.2107909619808197, + "learning_rate": 9.71020947642934e-05, + "loss": 1.1847, + "step": 1838 + }, + { + "epoch": 0.11173218300018227, + "grad_norm": 0.6270161867141724, + "learning_rate": 9.70988822810955e-05, + "loss": 1.0963, + "step": 1839 + }, + { + "epoch": 0.11179294003280879, + "grad_norm": 0.21378874778747559, + "learning_rate": 9.709566807147895e-05, + "loss": 1.1671, + "step": 1840 + }, + { + "epoch": 0.11185369706543533, + "grad_norm": 0.1499761939048767, + "learning_rate": 9.70924521355616e-05, + "loss": 1.0603, + "step": 1841 + }, + { + "epoch": 0.11191445409806185, + "grad_norm": 0.1642175018787384, + "learning_rate": 9.70892344734613e-05, + "loss": 1.1341, + "step": 1842 + }, + { + "epoch": 0.11197521113068838, + "grad_norm": 0.56465744972229, + "learning_rate": 9.708601508529602e-05, + "loss": 1.1082, + "step": 1843 + }, + { + "epoch": 0.1120359681633149, + "grad_norm": 0.19741006195545197, + "learning_rate": 9.708279397118376e-05, + "loss": 1.2651, + "step": 1844 + }, + { + "epoch": 0.11209672519594142, + "grad_norm": 0.18394073843955994, + "learning_rate": 9.707957113124257e-05, + "loss": 1.1114, + "step": 1845 + }, + { + "epoch": 0.11215748222856796, + "grad_norm": 0.2483065277338028, + "learning_rate": 9.707634656559061e-05, + "loss": 1.1527, + "step": 1846 + }, + { + "epoch": 0.11221823926119448, + "grad_norm": 0.2128303050994873, + "learning_rate": 9.707312027434608e-05, + "loss": 1.2226, + "step": 1847 + }, + { + "epoch": 0.11227899629382102, + "grad_norm": 0.22518791258335114, + "learning_rate": 9.706989225762721e-05, + "loss": 1.1515, + "step": 1848 + }, + { + "epoch": 0.11233975332644754, + "grad_norm": 0.31041276454925537, + "learning_rate": 9.706666251555234e-05, + "loss": 1.2707, + "step": 1849 + }, + { + "epoch": 0.11240051035907406, + "grad_norm": 0.18872512876987457, + "learning_rate": 9.706343104823987e-05, + "loss": 1.1886, + "step": 1850 + }, + { + "epoch": 0.11246126739170059, + "grad_norm": 0.4778657853603363, + "learning_rate": 9.706019785580823e-05, + "loss": 1.1597, + "step": 1851 + }, + { + "epoch": 0.11252202442432711, + "grad_norm": 0.20142699778079987, + "learning_rate": 9.705696293837594e-05, + "loss": 1.1902, + "step": 1852 + }, + { + "epoch": 0.11258278145695365, + "grad_norm": 0.179465651512146, + "learning_rate": 9.705372629606158e-05, + "loss": 1.1218, + "step": 1853 + }, + { + "epoch": 0.11264353848958017, + "grad_norm": 0.22530245780944824, + "learning_rate": 9.705048792898378e-05, + "loss": 1.1999, + "step": 1854 + }, + { + "epoch": 0.11270429552220669, + "grad_norm": 0.2807729244232178, + "learning_rate": 9.704724783726127e-05, + "loss": 1.1244, + "step": 1855 + }, + { + "epoch": 0.11276505255483323, + "grad_norm": 0.305794894695282, + "learning_rate": 9.704400602101278e-05, + "loss": 1.1369, + "step": 1856 + }, + { + "epoch": 0.11282580958745975, + "grad_norm": 0.37672796845436096, + "learning_rate": 9.704076248035717e-05, + "loss": 1.2643, + "step": 1857 + }, + { + "epoch": 0.11288656662008628, + "grad_norm": 0.19682687520980835, + "learning_rate": 9.703751721541329e-05, + "loss": 1.1059, + "step": 1858 + }, + { + "epoch": 0.1129473236527128, + "grad_norm": 0.15786024928092957, + "learning_rate": 9.703427022630015e-05, + "loss": 1.2051, + "step": 1859 + }, + { + "epoch": 0.11300808068533932, + "grad_norm": 0.5112740397453308, + "learning_rate": 9.703102151313674e-05, + "loss": 1.2165, + "step": 1860 + }, + { + "epoch": 0.11306883771796586, + "grad_norm": 0.3164134621620178, + "learning_rate": 9.702777107604215e-05, + "loss": 1.1781, + "step": 1861 + }, + { + "epoch": 0.11312959475059238, + "grad_norm": 0.18649594485759735, + "learning_rate": 9.70245189151355e-05, + "loss": 1.0863, + "step": 1862 + }, + { + "epoch": 0.1131903517832189, + "grad_norm": 0.19943934679031372, + "learning_rate": 9.702126503053603e-05, + "loss": 1.2801, + "step": 1863 + }, + { + "epoch": 0.11325110881584544, + "grad_norm": 0.29414230585098267, + "learning_rate": 9.701800942236301e-05, + "loss": 1.1029, + "step": 1864 + }, + { + "epoch": 0.11331186584847196, + "grad_norm": 0.2480049878358841, + "learning_rate": 9.701475209073576e-05, + "loss": 1.1546, + "step": 1865 + }, + { + "epoch": 0.11337262288109849, + "grad_norm": 0.2536844313144684, + "learning_rate": 9.701149303577368e-05, + "loss": 1.2017, + "step": 1866 + }, + { + "epoch": 0.11343337991372501, + "grad_norm": 0.20147861540317535, + "learning_rate": 9.700823225759623e-05, + "loss": 1.1049, + "step": 1867 + }, + { + "epoch": 0.11349413694635153, + "grad_norm": 5.159074306488037, + "learning_rate": 9.700496975632295e-05, + "loss": 1.1675, + "step": 1868 + }, + { + "epoch": 0.11355489397897807, + "grad_norm": 1.5400784015655518, + "learning_rate": 9.700170553207341e-05, + "loss": 1.1369, + "step": 1869 + }, + { + "epoch": 0.11361565101160459, + "grad_norm": 0.6182984709739685, + "learning_rate": 9.699843958496729e-05, + "loss": 1.146, + "step": 1870 + }, + { + "epoch": 0.11367640804423113, + "grad_norm": 0.19991667568683624, + "learning_rate": 9.699517191512426e-05, + "loss": 1.1584, + "step": 1871 + }, + { + "epoch": 0.11373716507685765, + "grad_norm": 0.35074588656425476, + "learning_rate": 9.699190252266414e-05, + "loss": 1.124, + "step": 1872 + }, + { + "epoch": 0.11379792210948417, + "grad_norm": 0.243534654378891, + "learning_rate": 9.698863140770673e-05, + "loss": 1.2255, + "step": 1873 + }, + { + "epoch": 0.1138586791421107, + "grad_norm": 0.20570048689842224, + "learning_rate": 9.698535857037197e-05, + "loss": 1.1179, + "step": 1874 + }, + { + "epoch": 0.11391943617473722, + "grad_norm": 0.2860731780529022, + "learning_rate": 9.698208401077979e-05, + "loss": 1.1203, + "step": 1875 + }, + { + "epoch": 0.11398019320736376, + "grad_norm": 0.4588428735733032, + "learning_rate": 9.697880772905026e-05, + "loss": 1.3248, + "step": 1876 + }, + { + "epoch": 0.11404095023999028, + "grad_norm": 0.3027544319629669, + "learning_rate": 9.697552972530344e-05, + "loss": 1.3643, + "step": 1877 + }, + { + "epoch": 0.1141017072726168, + "grad_norm": 0.24429982900619507, + "learning_rate": 9.697224999965948e-05, + "loss": 1.2928, + "step": 1878 + }, + { + "epoch": 0.11416246430524334, + "grad_norm": 0.2989386022090912, + "learning_rate": 9.696896855223865e-05, + "loss": 1.0822, + "step": 1879 + }, + { + "epoch": 0.11422322133786986, + "grad_norm": 0.2681278586387634, + "learning_rate": 9.696568538316117e-05, + "loss": 1.1431, + "step": 1880 + }, + { + "epoch": 0.11428397837049639, + "grad_norm": 14.011563301086426, + "learning_rate": 9.696240049254743e-05, + "loss": 1.1651, + "step": 1881 + }, + { + "epoch": 0.11434473540312291, + "grad_norm": 0.5634395480155945, + "learning_rate": 9.695911388051783e-05, + "loss": 1.1015, + "step": 1882 + }, + { + "epoch": 0.11440549243574943, + "grad_norm": 0.42623788118362427, + "learning_rate": 9.695582554719283e-05, + "loss": 1.0696, + "step": 1883 + }, + { + "epoch": 0.11446624946837597, + "grad_norm": 0.7773213386535645, + "learning_rate": 9.695253549269296e-05, + "loss": 1.1993, + "step": 1884 + }, + { + "epoch": 0.11452700650100249, + "grad_norm": 1.0263391733169556, + "learning_rate": 9.694924371713883e-05, + "loss": 1.1557, + "step": 1885 + }, + { + "epoch": 0.11458776353362901, + "grad_norm": 0.4339512288570404, + "learning_rate": 9.694595022065109e-05, + "loss": 1.2667, + "step": 1886 + }, + { + "epoch": 0.11464852056625555, + "grad_norm": 0.3500816524028778, + "learning_rate": 9.694265500335046e-05, + "loss": 1.2477, + "step": 1887 + }, + { + "epoch": 0.11470927759888207, + "grad_norm": 0.885793924331665, + "learning_rate": 9.693935806535776e-05, + "loss": 1.1568, + "step": 1888 + }, + { + "epoch": 0.1147700346315086, + "grad_norm": 0.6202738285064697, + "learning_rate": 9.693605940679381e-05, + "loss": 1.1341, + "step": 1889 + }, + { + "epoch": 0.11483079166413512, + "grad_norm": 0.3628444969654083, + "learning_rate": 9.693275902777953e-05, + "loss": 1.1351, + "step": 1890 + }, + { + "epoch": 0.11489154869676164, + "grad_norm": 0.6494534611701965, + "learning_rate": 9.692945692843589e-05, + "loss": 1.2245, + "step": 1891 + }, + { + "epoch": 0.11495230572938818, + "grad_norm": 0.6455859541893005, + "learning_rate": 9.692615310888394e-05, + "loss": 1.1446, + "step": 1892 + }, + { + "epoch": 0.1150130627620147, + "grad_norm": 0.48541146516799927, + "learning_rate": 9.692284756924477e-05, + "loss": 1.1475, + "step": 1893 + }, + { + "epoch": 0.11507381979464124, + "grad_norm": 0.4792094826698303, + "learning_rate": 9.691954030963957e-05, + "loss": 1.227, + "step": 1894 + }, + { + "epoch": 0.11513457682726776, + "grad_norm": 0.7279345393180847, + "learning_rate": 9.691623133018955e-05, + "loss": 1.3239, + "step": 1895 + }, + { + "epoch": 0.11519533385989428, + "grad_norm": 0.2750690281391144, + "learning_rate": 9.691292063101598e-05, + "loss": 1.1426, + "step": 1896 + }, + { + "epoch": 0.11525609089252081, + "grad_norm": 0.40976056456565857, + "learning_rate": 9.690960821224025e-05, + "loss": 1.106, + "step": 1897 + }, + { + "epoch": 0.11531684792514733, + "grad_norm": 0.6517239212989807, + "learning_rate": 9.690629407398376e-05, + "loss": 1.2699, + "step": 1898 + }, + { + "epoch": 0.11537760495777387, + "grad_norm": 0.2929058372974396, + "learning_rate": 9.6902978216368e-05, + "loss": 1.2669, + "step": 1899 + }, + { + "epoch": 0.11543836199040039, + "grad_norm": 0.4887823760509491, + "learning_rate": 9.68996606395145e-05, + "loss": 1.1478, + "step": 1900 + }, + { + "epoch": 0.11549911902302691, + "grad_norm": 0.2126196324825287, + "learning_rate": 9.689634134354489e-05, + "loss": 1.0822, + "step": 1901 + }, + { + "epoch": 0.11555987605565345, + "grad_norm": 0.19483217597007751, + "learning_rate": 9.68930203285808e-05, + "loss": 1.1242, + "step": 1902 + }, + { + "epoch": 0.11562063308827997, + "grad_norm": 0.37179771065711975, + "learning_rate": 9.688969759474399e-05, + "loss": 1.1922, + "step": 1903 + }, + { + "epoch": 0.11568139012090649, + "grad_norm": 0.37623727321624756, + "learning_rate": 9.688637314215625e-05, + "loss": 1.0962, + "step": 1904 + }, + { + "epoch": 0.11574214715353302, + "grad_norm": 0.25979217886924744, + "learning_rate": 9.688304697093947e-05, + "loss": 1.1665, + "step": 1905 + }, + { + "epoch": 0.11580290418615954, + "grad_norm": 0.15539799630641937, + "learning_rate": 9.68797190812155e-05, + "loss": 1.1105, + "step": 1906 + }, + { + "epoch": 0.11586366121878608, + "grad_norm": 0.1922324299812317, + "learning_rate": 9.687638947310638e-05, + "loss": 1.1682, + "step": 1907 + }, + { + "epoch": 0.1159244182514126, + "grad_norm": 1.510488510131836, + "learning_rate": 9.687305814673415e-05, + "loss": 1.117, + "step": 1908 + }, + { + "epoch": 0.11598517528403912, + "grad_norm": 14.29343032836914, + "learning_rate": 9.686972510222091e-05, + "loss": 1.1978, + "step": 1909 + }, + { + "epoch": 0.11604593231666566, + "grad_norm": 1.5797078609466553, + "learning_rate": 9.686639033968883e-05, + "loss": 1.1014, + "step": 1910 + }, + { + "epoch": 0.11610668934929218, + "grad_norm": 0.2675571143627167, + "learning_rate": 9.686305385926016e-05, + "loss": 1.1589, + "step": 1911 + }, + { + "epoch": 0.11616744638191871, + "grad_norm": 0.21127727627754211, + "learning_rate": 9.685971566105719e-05, + "loss": 1.094, + "step": 1912 + }, + { + "epoch": 0.11622820341454523, + "grad_norm": 0.3406842350959778, + "learning_rate": 9.685637574520228e-05, + "loss": 1.1041, + "step": 1913 + }, + { + "epoch": 0.11628896044717175, + "grad_norm": 0.23028457164764404, + "learning_rate": 9.685303411181788e-05, + "loss": 1.1838, + "step": 1914 + }, + { + "epoch": 0.11634971747979829, + "grad_norm": 0.19239498674869537, + "learning_rate": 9.684969076102643e-05, + "loss": 1.1053, + "step": 1915 + }, + { + "epoch": 0.11641047451242481, + "grad_norm": 0.3043873906135559, + "learning_rate": 9.684634569295053e-05, + "loss": 1.1838, + "step": 1916 + }, + { + "epoch": 0.11647123154505135, + "grad_norm": 0.24072453379631042, + "learning_rate": 9.684299890771275e-05, + "loss": 1.101, + "step": 1917 + }, + { + "epoch": 0.11653198857767787, + "grad_norm": 0.2098928540945053, + "learning_rate": 9.683965040543582e-05, + "loss": 1.2147, + "step": 1918 + }, + { + "epoch": 0.11659274561030439, + "grad_norm": 0.2557503283023834, + "learning_rate": 9.683630018624243e-05, + "loss": 1.0932, + "step": 1919 + }, + { + "epoch": 0.11665350264293092, + "grad_norm": 0.2447894662618637, + "learning_rate": 9.683294825025541e-05, + "loss": 1.2172, + "step": 1920 + }, + { + "epoch": 0.11671425967555744, + "grad_norm": 0.2724704146385193, + "learning_rate": 9.682959459759763e-05, + "loss": 1.1753, + "step": 1921 + }, + { + "epoch": 0.11677501670818398, + "grad_norm": 1.4174976348876953, + "learning_rate": 9.6826239228392e-05, + "loss": 1.1319, + "step": 1922 + }, + { + "epoch": 0.1168357737408105, + "grad_norm": 0.2176712453365326, + "learning_rate": 9.682288214276152e-05, + "loss": 1.1378, + "step": 1923 + }, + { + "epoch": 0.11689653077343702, + "grad_norm": 0.1698758900165558, + "learning_rate": 9.681952334082923e-05, + "loss": 1.0954, + "step": 1924 + }, + { + "epoch": 0.11695728780606356, + "grad_norm": 0.15976884961128235, + "learning_rate": 9.681616282271828e-05, + "loss": 1.0782, + "step": 1925 + }, + { + "epoch": 0.11701804483869008, + "grad_norm": 0.2583989202976227, + "learning_rate": 9.681280058855183e-05, + "loss": 1.0959, + "step": 1926 + }, + { + "epoch": 0.1170788018713166, + "grad_norm": 0.30633655190467834, + "learning_rate": 9.680943663845312e-05, + "loss": 1.1164, + "step": 1927 + }, + { + "epoch": 0.11713955890394313, + "grad_norm": 0.4438948333263397, + "learning_rate": 9.680607097254546e-05, + "loss": 1.1459, + "step": 1928 + }, + { + "epoch": 0.11720031593656965, + "grad_norm": 0.17000195384025574, + "learning_rate": 9.680270359095222e-05, + "loss": 1.0975, + "step": 1929 + }, + { + "epoch": 0.11726107296919619, + "grad_norm": 0.20205284655094147, + "learning_rate": 9.679933449379684e-05, + "loss": 1.2877, + "step": 1930 + }, + { + "epoch": 0.11732183000182271, + "grad_norm": 0.20819339156150818, + "learning_rate": 9.67959636812028e-05, + "loss": 1.137, + "step": 1931 + }, + { + "epoch": 0.11738258703444923, + "grad_norm": 0.20226801931858063, + "learning_rate": 9.679259115329369e-05, + "loss": 1.095, + "step": 1932 + }, + { + "epoch": 0.11744334406707577, + "grad_norm": 0.26550427079200745, + "learning_rate": 9.678921691019308e-05, + "loss": 1.2601, + "step": 1933 + }, + { + "epoch": 0.11750410109970229, + "grad_norm": 0.1841442584991455, + "learning_rate": 9.678584095202468e-05, + "loss": 1.161, + "step": 1934 + }, + { + "epoch": 0.11756485813232882, + "grad_norm": 0.2929319739341736, + "learning_rate": 9.678246327891224e-05, + "loss": 1.1133, + "step": 1935 + }, + { + "epoch": 0.11762561516495534, + "grad_norm": 0.3009735345840454, + "learning_rate": 9.677908389097959e-05, + "loss": 1.0937, + "step": 1936 + }, + { + "epoch": 0.11768637219758186, + "grad_norm": 0.2584429681301117, + "learning_rate": 9.677570278835055e-05, + "loss": 1.1119, + "step": 1937 + }, + { + "epoch": 0.1177471292302084, + "grad_norm": 0.3425711989402771, + "learning_rate": 9.677231997114909e-05, + "loss": 1.1299, + "step": 1938 + }, + { + "epoch": 0.11780788626283492, + "grad_norm": 0.23567362129688263, + "learning_rate": 9.676893543949921e-05, + "loss": 1.1459, + "step": 1939 + }, + { + "epoch": 0.11786864329546146, + "grad_norm": 0.19330216944217682, + "learning_rate": 9.676554919352494e-05, + "loss": 1.1361, + "step": 1940 + }, + { + "epoch": 0.11792940032808798, + "grad_norm": 0.5115233659744263, + "learning_rate": 9.676216123335045e-05, + "loss": 1.3027, + "step": 1941 + }, + { + "epoch": 0.1179901573607145, + "grad_norm": 0.18392927944660187, + "learning_rate": 9.675877155909988e-05, + "loss": 1.1172, + "step": 1942 + }, + { + "epoch": 0.11805091439334103, + "grad_norm": 0.3016204535961151, + "learning_rate": 9.675538017089752e-05, + "loss": 1.0995, + "step": 1943 + }, + { + "epoch": 0.11811167142596755, + "grad_norm": 0.48495960235595703, + "learning_rate": 9.675198706886765e-05, + "loss": 1.1967, + "step": 1944 + }, + { + "epoch": 0.11817242845859408, + "grad_norm": 0.20842774212360382, + "learning_rate": 9.674859225313465e-05, + "loss": 1.1686, + "step": 1945 + }, + { + "epoch": 0.11823318549122061, + "grad_norm": 0.5592312216758728, + "learning_rate": 9.674519572382297e-05, + "loss": 1.1757, + "step": 1946 + }, + { + "epoch": 0.11829394252384713, + "grad_norm": 1.581610083580017, + "learning_rate": 9.674179748105712e-05, + "loss": 1.3027, + "step": 1947 + }, + { + "epoch": 0.11835469955647367, + "grad_norm": 0.27288055419921875, + "learning_rate": 9.673839752496163e-05, + "loss": 1.3086, + "step": 1948 + }, + { + "epoch": 0.11841545658910019, + "grad_norm": 2.6161465644836426, + "learning_rate": 9.673499585566114e-05, + "loss": 1.0706, + "step": 1949 + }, + { + "epoch": 0.11847621362172671, + "grad_norm": 0.3142314553260803, + "learning_rate": 9.673159247328036e-05, + "loss": 1.1208, + "step": 1950 + }, + { + "epoch": 0.11853697065435324, + "grad_norm": 0.1905544102191925, + "learning_rate": 9.672818737794403e-05, + "loss": 1.1175, + "step": 1951 + }, + { + "epoch": 0.11859772768697976, + "grad_norm": 0.19226299226284027, + "learning_rate": 9.672478056977694e-05, + "loss": 1.1381, + "step": 1952 + }, + { + "epoch": 0.1186584847196063, + "grad_norm": 0.4684247672557831, + "learning_rate": 9.672137204890398e-05, + "loss": 1.1744, + "step": 1953 + }, + { + "epoch": 0.11871924175223282, + "grad_norm": 0.17875200510025024, + "learning_rate": 9.671796181545012e-05, + "loss": 1.1189, + "step": 1954 + }, + { + "epoch": 0.11877999878485934, + "grad_norm": 1.081351399421692, + "learning_rate": 9.671454986954035e-05, + "loss": 1.1078, + "step": 1955 + }, + { + "epoch": 0.11884075581748588, + "grad_norm": 0.19021007418632507, + "learning_rate": 9.671113621129969e-05, + "loss": 1.1212, + "step": 1956 + }, + { + "epoch": 0.1189015128501124, + "grad_norm": 0.22308015823364258, + "learning_rate": 9.670772084085331e-05, + "loss": 1.2717, + "step": 1957 + }, + { + "epoch": 0.11896226988273893, + "grad_norm": 0.21829305589199066, + "learning_rate": 9.670430375832641e-05, + "loss": 1.1336, + "step": 1958 + }, + { + "epoch": 0.11902302691536545, + "grad_norm": 0.19864550232887268, + "learning_rate": 9.670088496384422e-05, + "loss": 1.0512, + "step": 1959 + }, + { + "epoch": 0.11908378394799198, + "grad_norm": 0.14482282102108002, + "learning_rate": 9.669746445753207e-05, + "loss": 1.1212, + "step": 1960 + }, + { + "epoch": 0.11914454098061851, + "grad_norm": 0.1603849083185196, + "learning_rate": 9.669404223951533e-05, + "loss": 1.1138, + "step": 1961 + }, + { + "epoch": 0.11920529801324503, + "grad_norm": 0.2783275246620178, + "learning_rate": 9.669061830991943e-05, + "loss": 1.1294, + "step": 1962 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 0.17078639566898346, + "learning_rate": 9.668719266886992e-05, + "loss": 1.1165, + "step": 1963 + }, + { + "epoch": 0.11932681207849809, + "grad_norm": 0.35063353180885315, + "learning_rate": 9.668376531649233e-05, + "loss": 1.1364, + "step": 1964 + }, + { + "epoch": 0.11938756911112461, + "grad_norm": 0.2500370144844055, + "learning_rate": 9.668033625291229e-05, + "loss": 1.2439, + "step": 1965 + }, + { + "epoch": 0.11944832614375114, + "grad_norm": 0.18498098850250244, + "learning_rate": 9.667690547825549e-05, + "loss": 1.1212, + "step": 1966 + }, + { + "epoch": 0.11950908317637766, + "grad_norm": 0.3198544681072235, + "learning_rate": 9.667347299264772e-05, + "loss": 1.1064, + "step": 1967 + }, + { + "epoch": 0.11956984020900419, + "grad_norm": 0.1935877799987793, + "learning_rate": 9.667003879621477e-05, + "loss": 1.1498, + "step": 1968 + }, + { + "epoch": 0.11963059724163072, + "grad_norm": 0.2101842761039734, + "learning_rate": 9.666660288908251e-05, + "loss": 1.0786, + "step": 1969 + }, + { + "epoch": 0.11969135427425724, + "grad_norm": 0.204456627368927, + "learning_rate": 9.666316527137693e-05, + "loss": 1.0552, + "step": 1970 + }, + { + "epoch": 0.11975211130688378, + "grad_norm": 0.1915925294160843, + "learning_rate": 9.665972594322399e-05, + "loss": 1.0912, + "step": 1971 + }, + { + "epoch": 0.1198128683395103, + "grad_norm": 0.7430527806282043, + "learning_rate": 9.665628490474977e-05, + "loss": 1.2088, + "step": 1972 + }, + { + "epoch": 0.11987362537213682, + "grad_norm": 1.4529510736465454, + "learning_rate": 9.665284215608041e-05, + "loss": 1.1075, + "step": 1973 + }, + { + "epoch": 0.11993438240476335, + "grad_norm": 0.26790040731430054, + "learning_rate": 9.664939769734212e-05, + "loss": 1.2762, + "step": 1974 + }, + { + "epoch": 0.11999513943738987, + "grad_norm": 0.256966769695282, + "learning_rate": 9.664595152866113e-05, + "loss": 1.1003, + "step": 1975 + }, + { + "epoch": 0.12005589647001641, + "grad_norm": 1.6612577438354492, + "learning_rate": 9.664250365016376e-05, + "loss": 1.2115, + "step": 1976 + }, + { + "epoch": 0.12011665350264293, + "grad_norm": 0.26674968004226685, + "learning_rate": 9.663905406197642e-05, + "loss": 1.2234, + "step": 1977 + }, + { + "epoch": 0.12017741053526945, + "grad_norm": 0.24348914623260498, + "learning_rate": 9.663560276422553e-05, + "loss": 1.0615, + "step": 1978 + }, + { + "epoch": 0.12023816756789599, + "grad_norm": 0.18058128654956818, + "learning_rate": 9.66321497570376e-05, + "loss": 1.1416, + "step": 1979 + }, + { + "epoch": 0.12029892460052251, + "grad_norm": 0.14928728342056274, + "learning_rate": 9.662869504053923e-05, + "loss": 1.2116, + "step": 1980 + }, + { + "epoch": 0.12035968163314904, + "grad_norm": 0.4173123240470886, + "learning_rate": 9.6625238614857e-05, + "loss": 1.2366, + "step": 1981 + }, + { + "epoch": 0.12042043866577556, + "grad_norm": 0.26743918657302856, + "learning_rate": 9.662178048011765e-05, + "loss": 1.1249, + "step": 1982 + }, + { + "epoch": 0.12048119569840209, + "grad_norm": 0.25977104902267456, + "learning_rate": 9.661832063644794e-05, + "loss": 1.1114, + "step": 1983 + }, + { + "epoch": 0.12054195273102862, + "grad_norm": 6.254899024963379, + "learning_rate": 9.661485908397465e-05, + "loss": 1.1036, + "step": 1984 + }, + { + "epoch": 0.12060270976365514, + "grad_norm": 0.32930925488471985, + "learning_rate": 9.66113958228247e-05, + "loss": 1.0942, + "step": 1985 + }, + { + "epoch": 0.12066346679628168, + "grad_norm": 0.2594299912452698, + "learning_rate": 9.660793085312503e-05, + "loss": 1.294, + "step": 1986 + }, + { + "epoch": 0.1207242238289082, + "grad_norm": 0.21070104837417603, + "learning_rate": 9.660446417500265e-05, + "loss": 1.176, + "step": 1987 + }, + { + "epoch": 0.12078498086153472, + "grad_norm": 0.2816389203071594, + "learning_rate": 9.660099578858461e-05, + "loss": 1.1136, + "step": 1988 + }, + { + "epoch": 0.12084573789416125, + "grad_norm": 0.18064992129802704, + "learning_rate": 9.659752569399807e-05, + "loss": 1.0706, + "step": 1989 + }, + { + "epoch": 0.12090649492678777, + "grad_norm": 0.19540466368198395, + "learning_rate": 9.659405389137021e-05, + "loss": 1.0981, + "step": 1990 + }, + { + "epoch": 0.1209672519594143, + "grad_norm": 0.23018568754196167, + "learning_rate": 9.659058038082833e-05, + "loss": 1.1232, + "step": 1991 + }, + { + "epoch": 0.12102800899204083, + "grad_norm": 0.42158737778663635, + "learning_rate": 9.65871051624997e-05, + "loss": 1.2598, + "step": 1992 + }, + { + "epoch": 0.12108876602466735, + "grad_norm": 0.3112030625343323, + "learning_rate": 9.658362823651172e-05, + "loss": 1.186, + "step": 1993 + }, + { + "epoch": 0.12114952305729389, + "grad_norm": 0.4480117857456207, + "learning_rate": 9.658014960299185e-05, + "loss": 1.159, + "step": 1994 + }, + { + "epoch": 0.12121028008992041, + "grad_norm": 0.29857707023620605, + "learning_rate": 9.657666926206758e-05, + "loss": 1.1786, + "step": 1995 + }, + { + "epoch": 0.12127103712254693, + "grad_norm": 2.4238693714141846, + "learning_rate": 9.65731872138665e-05, + "loss": 1.164, + "step": 1996 + }, + { + "epoch": 0.12133179415517346, + "grad_norm": 0.29271450638771057, + "learning_rate": 9.656970345851625e-05, + "loss": 1.3141, + "step": 1997 + }, + { + "epoch": 0.12139255118779999, + "grad_norm": 0.14524228870868683, + "learning_rate": 9.656621799614452e-05, + "loss": 1.1148, + "step": 1998 + }, + { + "epoch": 0.12145330822042652, + "grad_norm": 0.18711404502391815, + "learning_rate": 9.656273082687905e-05, + "loss": 1.1084, + "step": 1999 + }, + { + "epoch": 0.12151406525305304, + "grad_norm": 0.36704376339912415, + "learning_rate": 9.655924195084768e-05, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 0.12157482228567956, + "grad_norm": 0.17453031241893768, + "learning_rate": 9.655575136817831e-05, + "loss": 1.3484, + "step": 2001 + }, + { + "epoch": 0.1216355793183061, + "grad_norm": 0.4107412099838257, + "learning_rate": 9.655225907899888e-05, + "loss": 1.0793, + "step": 2002 + }, + { + "epoch": 0.12169633635093262, + "grad_norm": 0.24669001996517181, + "learning_rate": 9.654876508343738e-05, + "loss": 1.0473, + "step": 2003 + }, + { + "epoch": 0.12175709338355915, + "grad_norm": 0.9859693050384521, + "learning_rate": 9.654526938162191e-05, + "loss": 1.1558, + "step": 2004 + }, + { + "epoch": 0.12181785041618567, + "grad_norm": 0.7443386316299438, + "learning_rate": 9.65417719736806e-05, + "loss": 1.1378, + "step": 2005 + }, + { + "epoch": 0.1218786074488122, + "grad_norm": 0.21646425127983093, + "learning_rate": 9.653827285974163e-05, + "loss": 1.1407, + "step": 2006 + }, + { + "epoch": 0.12193936448143873, + "grad_norm": 0.2408502846956253, + "learning_rate": 9.653477203993327e-05, + "loss": 1.1871, + "step": 2007 + }, + { + "epoch": 0.12200012151406525, + "grad_norm": 0.21233363449573517, + "learning_rate": 9.653126951438385e-05, + "loss": 1.1506, + "step": 2008 + }, + { + "epoch": 0.12206087854669177, + "grad_norm": 0.19248034060001373, + "learning_rate": 9.652776528322175e-05, + "loss": 1.1267, + "step": 2009 + }, + { + "epoch": 0.12212163557931831, + "grad_norm": 0.17353682219982147, + "learning_rate": 9.652425934657542e-05, + "loss": 1.2013, + "step": 2010 + }, + { + "epoch": 0.12218239261194483, + "grad_norm": 0.16145093739032745, + "learning_rate": 9.652075170457337e-05, + "loss": 1.1187, + "step": 2011 + }, + { + "epoch": 0.12224314964457136, + "grad_norm": 0.17668648064136505, + "learning_rate": 9.651724235734418e-05, + "loss": 1.1471, + "step": 2012 + }, + { + "epoch": 0.12230390667719789, + "grad_norm": 0.1906491369009018, + "learning_rate": 9.651373130501649e-05, + "loss": 1.1935, + "step": 2013 + }, + { + "epoch": 0.1223646637098244, + "grad_norm": 0.2117723822593689, + "learning_rate": 9.651021854771897e-05, + "loss": 1.1284, + "step": 2014 + }, + { + "epoch": 0.12242542074245094, + "grad_norm": 0.33802300691604614, + "learning_rate": 9.650670408558041e-05, + "loss": 1.1806, + "step": 2015 + }, + { + "epoch": 0.12248617777507746, + "grad_norm": 0.4124891459941864, + "learning_rate": 9.65031879187296e-05, + "loss": 1.2326, + "step": 2016 + }, + { + "epoch": 0.122546934807704, + "grad_norm": 3.347226858139038, + "learning_rate": 9.649967004729548e-05, + "loss": 1.1085, + "step": 2017 + }, + { + "epoch": 0.12260769184033052, + "grad_norm": 0.3927828371524811, + "learning_rate": 9.649615047140695e-05, + "loss": 1.1895, + "step": 2018 + }, + { + "epoch": 0.12266844887295704, + "grad_norm": 0.27255886793136597, + "learning_rate": 9.649262919119303e-05, + "loss": 1.1501, + "step": 2019 + }, + { + "epoch": 0.12272920590558357, + "grad_norm": 0.2866484522819519, + "learning_rate": 9.64891062067828e-05, + "loss": 1.1284, + "step": 2020 + }, + { + "epoch": 0.1227899629382101, + "grad_norm": 0.24547314643859863, + "learning_rate": 9.64855815183054e-05, + "loss": 1.1576, + "step": 2021 + }, + { + "epoch": 0.12285071997083663, + "grad_norm": 0.46214425563812256, + "learning_rate": 9.648205512589003e-05, + "loss": 1.0792, + "step": 2022 + }, + { + "epoch": 0.12291147700346315, + "grad_norm": 0.23865380883216858, + "learning_rate": 9.647852702966596e-05, + "loss": 1.1023, + "step": 2023 + }, + { + "epoch": 0.12297223403608967, + "grad_norm": 0.48760348558425903, + "learning_rate": 9.647499722976247e-05, + "loss": 1.1602, + "step": 2024 + }, + { + "epoch": 0.12303299106871621, + "grad_norm": 0.2786392569541931, + "learning_rate": 9.647146572630899e-05, + "loss": 1.1064, + "step": 2025 + }, + { + "epoch": 0.12309374810134273, + "grad_norm": 0.2121281921863556, + "learning_rate": 9.646793251943493e-05, + "loss": 1.1417, + "step": 2026 + }, + { + "epoch": 0.12315450513396926, + "grad_norm": 1.844231367111206, + "learning_rate": 9.646439760926984e-05, + "loss": 1.0822, + "step": 2027 + }, + { + "epoch": 0.12321526216659578, + "grad_norm": 0.2326965034008026, + "learning_rate": 9.646086099594326e-05, + "loss": 1.175, + "step": 2028 + }, + { + "epoch": 0.1232760191992223, + "grad_norm": 0.3406953811645508, + "learning_rate": 9.645732267958485e-05, + "loss": 1.3142, + "step": 2029 + }, + { + "epoch": 0.12333677623184884, + "grad_norm": 0.40433642268180847, + "learning_rate": 9.645378266032429e-05, + "loss": 1.2599, + "step": 2030 + }, + { + "epoch": 0.12339753326447536, + "grad_norm": 0.228359192609787, + "learning_rate": 9.645024093829136e-05, + "loss": 1.0942, + "step": 2031 + }, + { + "epoch": 0.12345829029710188, + "grad_norm": 0.2280920147895813, + "learning_rate": 9.644669751361587e-05, + "loss": 1.1415, + "step": 2032 + }, + { + "epoch": 0.12351904732972842, + "grad_norm": 0.45911288261413574, + "learning_rate": 9.644315238642769e-05, + "loss": 1.0738, + "step": 2033 + }, + { + "epoch": 0.12357980436235494, + "grad_norm": 0.21488778293132782, + "learning_rate": 9.643960555685679e-05, + "loss": 1.0921, + "step": 2034 + }, + { + "epoch": 0.12364056139498147, + "grad_norm": 0.5509289503097534, + "learning_rate": 9.643605702503318e-05, + "loss": 1.1341, + "step": 2035 + }, + { + "epoch": 0.123701318427608, + "grad_norm": 0.16387374699115753, + "learning_rate": 9.643250679108691e-05, + "loss": 1.1103, + "step": 2036 + }, + { + "epoch": 0.12376207546023452, + "grad_norm": 0.2071555256843567, + "learning_rate": 9.642895485514813e-05, + "loss": 1.1346, + "step": 2037 + }, + { + "epoch": 0.12382283249286105, + "grad_norm": 0.22651129961013794, + "learning_rate": 9.642540121734705e-05, + "loss": 1.1164, + "step": 2038 + }, + { + "epoch": 0.12388358952548757, + "grad_norm": 0.17861492931842804, + "learning_rate": 9.64218458778139e-05, + "loss": 1.1708, + "step": 2039 + }, + { + "epoch": 0.12394434655811411, + "grad_norm": 0.26989665627479553, + "learning_rate": 9.641828883667902e-05, + "loss": 1.1775, + "step": 2040 + }, + { + "epoch": 0.12400510359074063, + "grad_norm": 0.29572224617004395, + "learning_rate": 9.641473009407278e-05, + "loss": 1.1271, + "step": 2041 + }, + { + "epoch": 0.12406586062336715, + "grad_norm": 0.19515860080718994, + "learning_rate": 9.641116965012565e-05, + "loss": 1.1297, + "step": 2042 + }, + { + "epoch": 0.12412661765599368, + "grad_norm": 0.5475988984107971, + "learning_rate": 9.640760750496812e-05, + "loss": 1.3058, + "step": 2043 + }, + { + "epoch": 0.1241873746886202, + "grad_norm": 0.1982891708612442, + "learning_rate": 9.640404365873077e-05, + "loss": 1.0944, + "step": 2044 + }, + { + "epoch": 0.12424813172124674, + "grad_norm": 0.20374320447444916, + "learning_rate": 9.640047811154424e-05, + "loss": 1.2383, + "step": 2045 + }, + { + "epoch": 0.12430888875387326, + "grad_norm": 0.22014868259429932, + "learning_rate": 9.639691086353919e-05, + "loss": 1.1166, + "step": 2046 + }, + { + "epoch": 0.12436964578649978, + "grad_norm": 0.1554224193096161, + "learning_rate": 9.639334191484642e-05, + "loss": 1.1428, + "step": 2047 + }, + { + "epoch": 0.12443040281912632, + "grad_norm": 0.20734219253063202, + "learning_rate": 9.638977126559675e-05, + "loss": 1.1305, + "step": 2048 + }, + { + "epoch": 0.12449115985175284, + "grad_norm": 0.2932432293891907, + "learning_rate": 9.638619891592102e-05, + "loss": 1.1506, + "step": 2049 + }, + { + "epoch": 0.12455191688437936, + "grad_norm": 0.2742735743522644, + "learning_rate": 9.638262486595022e-05, + "loss": 1.0932, + "step": 2050 + }, + { + "epoch": 0.1246126739170059, + "grad_norm": 0.5033144354820251, + "learning_rate": 9.637904911581532e-05, + "loss": 1.171, + "step": 2051 + }, + { + "epoch": 0.12467343094963242, + "grad_norm": 0.22795073688030243, + "learning_rate": 9.637547166564741e-05, + "loss": 1.1873, + "step": 2052 + }, + { + "epoch": 0.12473418798225895, + "grad_norm": 0.2395406812429428, + "learning_rate": 9.637189251557766e-05, + "loss": 1.1115, + "step": 2053 + }, + { + "epoch": 0.12479494501488547, + "grad_norm": 0.31437185406684875, + "learning_rate": 9.636831166573719e-05, + "loss": 1.0799, + "step": 2054 + }, + { + "epoch": 0.124855702047512, + "grad_norm": 0.24858003854751587, + "learning_rate": 9.63647291162573e-05, + "loss": 1.1045, + "step": 2055 + }, + { + "epoch": 0.12491645908013853, + "grad_norm": 0.2202848494052887, + "learning_rate": 9.63611448672693e-05, + "loss": 1.1087, + "step": 2056 + }, + { + "epoch": 0.12497721611276505, + "grad_norm": 0.2237938791513443, + "learning_rate": 9.635755891890459e-05, + "loss": 1.1379, + "step": 2057 + }, + { + "epoch": 0.12503797314539158, + "grad_norm": 0.259045273065567, + "learning_rate": 9.635397127129458e-05, + "loss": 1.1036, + "step": 2058 + }, + { + "epoch": 0.12509873017801812, + "grad_norm": 0.42144864797592163, + "learning_rate": 9.63503819245708e-05, + "loss": 1.2534, + "step": 2059 + }, + { + "epoch": 0.12515948721064463, + "grad_norm": 0.3736537992954254, + "learning_rate": 9.634679087886481e-05, + "loss": 1.2059, + "step": 2060 + }, + { + "epoch": 0.12522024424327116, + "grad_norm": 0.3590710759162903, + "learning_rate": 9.634319813430826e-05, + "loss": 1.1077, + "step": 2061 + }, + { + "epoch": 0.1252810012758977, + "grad_norm": 0.21131914854049683, + "learning_rate": 9.633960369103279e-05, + "loss": 1.117, + "step": 2062 + }, + { + "epoch": 0.1253417583085242, + "grad_norm": 2.375375270843506, + "learning_rate": 9.63360075491702e-05, + "loss": 1.2209, + "step": 2063 + }, + { + "epoch": 0.12540251534115074, + "grad_norm": 0.25554004311561584, + "learning_rate": 9.63324097088523e-05, + "loss": 1.2378, + "step": 2064 + }, + { + "epoch": 0.12546327237377727, + "grad_norm": 0.18050119280815125, + "learning_rate": 9.632881017021098e-05, + "loss": 1.1429, + "step": 2065 + }, + { + "epoch": 0.12552402940640378, + "grad_norm": 0.3133840560913086, + "learning_rate": 9.632520893337815e-05, + "loss": 1.1511, + "step": 2066 + }, + { + "epoch": 0.12558478643903032, + "grad_norm": 0.16320054233074188, + "learning_rate": 9.632160599848585e-05, + "loss": 1.1112, + "step": 2067 + }, + { + "epoch": 0.12564554347165685, + "grad_norm": 0.1702062040567398, + "learning_rate": 9.63180013656661e-05, + "loss": 1.1993, + "step": 2068 + }, + { + "epoch": 0.12570630050428336, + "grad_norm": 0.18877971172332764, + "learning_rate": 9.631439503505108e-05, + "loss": 1.2219, + "step": 2069 + }, + { + "epoch": 0.1257670575369099, + "grad_norm": 0.21893955767154694, + "learning_rate": 9.631078700677296e-05, + "loss": 1.1868, + "step": 2070 + }, + { + "epoch": 0.12582781456953643, + "grad_norm": 0.186843603849411, + "learning_rate": 9.6307177280964e-05, + "loss": 1.095, + "step": 2071 + }, + { + "epoch": 0.12588857160216296, + "grad_norm": 0.21330058574676514, + "learning_rate": 9.630356585775647e-05, + "loss": 1.2843, + "step": 2072 + }, + { + "epoch": 0.12594932863478947, + "grad_norm": 2.1996421813964844, + "learning_rate": 9.62999527372828e-05, + "loss": 1.1012, + "step": 2073 + }, + { + "epoch": 0.126010085667416, + "grad_norm": 0.21556246280670166, + "learning_rate": 9.629633791967541e-05, + "loss": 1.0712, + "step": 2074 + }, + { + "epoch": 0.12607084270004254, + "grad_norm": 1.677614450454712, + "learning_rate": 9.629272140506682e-05, + "loss": 1.0929, + "step": 2075 + }, + { + "epoch": 0.12613159973266905, + "grad_norm": 0.1669522523880005, + "learning_rate": 9.628910319358956e-05, + "loss": 1.2191, + "step": 2076 + }, + { + "epoch": 0.12619235676529558, + "grad_norm": 0.21765844523906708, + "learning_rate": 9.62854832853763e-05, + "loss": 1.1005, + "step": 2077 + }, + { + "epoch": 0.12625311379792212, + "grad_norm": 14.662498474121094, + "learning_rate": 9.628186168055968e-05, + "loss": 1.1484, + "step": 2078 + }, + { + "epoch": 0.12631387083054862, + "grad_norm": 1.579979419708252, + "learning_rate": 9.627823837927247e-05, + "loss": 1.1292, + "step": 2079 + }, + { + "epoch": 0.12637462786317516, + "grad_norm": 0.18985305726528168, + "learning_rate": 9.627461338164751e-05, + "loss": 1.0702, + "step": 2080 + }, + { + "epoch": 0.1264353848958017, + "grad_norm": 0.25318366289138794, + "learning_rate": 9.627098668781763e-05, + "loss": 1.2087, + "step": 2081 + }, + { + "epoch": 0.1264961419284282, + "grad_norm": 0.20284001529216766, + "learning_rate": 9.62673582979158e-05, + "loss": 1.1284, + "step": 2082 + }, + { + "epoch": 0.12655689896105474, + "grad_norm": 0.22754044830799103, + "learning_rate": 9.626372821207501e-05, + "loss": 1.1194, + "step": 2083 + }, + { + "epoch": 0.12661765599368127, + "grad_norm": 0.2247520387172699, + "learning_rate": 9.62600964304283e-05, + "loss": 1.2736, + "step": 2084 + }, + { + "epoch": 0.1266784130263078, + "grad_norm": 0.1490245908498764, + "learning_rate": 9.625646295310884e-05, + "loss": 1.0571, + "step": 2085 + }, + { + "epoch": 0.12673917005893431, + "grad_norm": 0.22191564738750458, + "learning_rate": 9.625282778024978e-05, + "loss": 1.2504, + "step": 2086 + }, + { + "epoch": 0.12679992709156085, + "grad_norm": 0.26560288667678833, + "learning_rate": 9.624919091198437e-05, + "loss": 1.1647, + "step": 2087 + }, + { + "epoch": 0.12686068412418738, + "grad_norm": 1.2738986015319824, + "learning_rate": 9.624555234844595e-05, + "loss": 1.0918, + "step": 2088 + }, + { + "epoch": 0.1269214411568139, + "grad_norm": 0.16314783692359924, + "learning_rate": 9.624191208976785e-05, + "loss": 1.1497, + "step": 2089 + }, + { + "epoch": 0.12698219818944043, + "grad_norm": 0.2169427126646042, + "learning_rate": 9.623827013608351e-05, + "loss": 1.075, + "step": 2090 + }, + { + "epoch": 0.12704295522206696, + "grad_norm": 0.2520436942577362, + "learning_rate": 9.623462648752648e-05, + "loss": 1.1826, + "step": 2091 + }, + { + "epoch": 0.12710371225469347, + "grad_norm": 0.16898399591445923, + "learning_rate": 9.623098114423024e-05, + "loss": 1.1824, + "step": 2092 + }, + { + "epoch": 0.12716446928732, + "grad_norm": 0.1813206523656845, + "learning_rate": 9.622733410632847e-05, + "loss": 1.1155, + "step": 2093 + }, + { + "epoch": 0.12722522631994654, + "grad_norm": 0.16200028359889984, + "learning_rate": 9.622368537395483e-05, + "loss": 1.1357, + "step": 2094 + }, + { + "epoch": 0.12728598335257307, + "grad_norm": 0.14974044263362885, + "learning_rate": 9.622003494724307e-05, + "loss": 1.067, + "step": 2095 + }, + { + "epoch": 0.12734674038519958, + "grad_norm": 1.432485818862915, + "learning_rate": 9.621638282632698e-05, + "loss": 1.2379, + "step": 2096 + }, + { + "epoch": 0.12740749741782612, + "grad_norm": 0.38108500838279724, + "learning_rate": 9.621272901134046e-05, + "loss": 1.1973, + "step": 2097 + }, + { + "epoch": 0.12746825445045265, + "grad_norm": 0.19689838588237762, + "learning_rate": 9.620907350241742e-05, + "loss": 1.1828, + "step": 2098 + }, + { + "epoch": 0.12752901148307916, + "grad_norm": 0.17784243822097778, + "learning_rate": 9.620541629969187e-05, + "loss": 1.1185, + "step": 2099 + }, + { + "epoch": 0.1275897685157057, + "grad_norm": 0.21143406629562378, + "learning_rate": 9.620175740329784e-05, + "loss": 1.2074, + "step": 2100 + }, + { + "epoch": 0.12765052554833223, + "grad_norm": 0.18847842514514923, + "learning_rate": 9.619809681336947e-05, + "loss": 1.0765, + "step": 2101 + }, + { + "epoch": 0.12771128258095873, + "grad_norm": 1.8641302585601807, + "learning_rate": 9.619443453004092e-05, + "loss": 1.231, + "step": 2102 + }, + { + "epoch": 0.12777203961358527, + "grad_norm": 0.23724384605884552, + "learning_rate": 9.619077055344645e-05, + "loss": 1.1779, + "step": 2103 + }, + { + "epoch": 0.1278327966462118, + "grad_norm": 0.8766135573387146, + "learning_rate": 9.618710488372035e-05, + "loss": 1.3835, + "step": 2104 + }, + { + "epoch": 0.1278935536788383, + "grad_norm": 1.4700849056243896, + "learning_rate": 9.618343752099701e-05, + "loss": 1.2594, + "step": 2105 + }, + { + "epoch": 0.12795431071146485, + "grad_norm": 0.22571195662021637, + "learning_rate": 9.617976846541084e-05, + "loss": 1.1178, + "step": 2106 + }, + { + "epoch": 0.12801506774409138, + "grad_norm": 0.21347974240779877, + "learning_rate": 9.617609771709632e-05, + "loss": 1.1699, + "step": 2107 + }, + { + "epoch": 0.12807582477671792, + "grad_norm": 0.2740815281867981, + "learning_rate": 9.617242527618802e-05, + "loss": 1.1872, + "step": 2108 + }, + { + "epoch": 0.12813658180934442, + "grad_norm": 0.1986178308725357, + "learning_rate": 9.616875114282053e-05, + "loss": 1.1545, + "step": 2109 + }, + { + "epoch": 0.12819733884197096, + "grad_norm": 0.30879130959510803, + "learning_rate": 9.616507531712855e-05, + "loss": 1.1211, + "step": 2110 + }, + { + "epoch": 0.1282580958745975, + "grad_norm": 0.5088827013969421, + "learning_rate": 9.616139779924682e-05, + "loss": 1.2997, + "step": 2111 + }, + { + "epoch": 0.128318852907224, + "grad_norm": 0.33232057094573975, + "learning_rate": 9.615771858931013e-05, + "loss": 1.0808, + "step": 2112 + }, + { + "epoch": 0.12837960993985054, + "grad_norm": 8.197784423828125, + "learning_rate": 9.615403768745332e-05, + "loss": 1.2707, + "step": 2113 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 0.2203400433063507, + "learning_rate": 9.615035509381136e-05, + "loss": 1.1625, + "step": 2114 + }, + { + "epoch": 0.12850112400510358, + "grad_norm": 0.1596183031797409, + "learning_rate": 9.614667080851922e-05, + "loss": 1.1055, + "step": 2115 + }, + { + "epoch": 0.1285618810377301, + "grad_norm": 0.30698826909065247, + "learning_rate": 9.614298483171192e-05, + "loss": 1.2276, + "step": 2116 + }, + { + "epoch": 0.12862263807035665, + "grad_norm": 0.32907792925834656, + "learning_rate": 9.61392971635246e-05, + "loss": 1.1992, + "step": 2117 + }, + { + "epoch": 0.12868339510298318, + "grad_norm": 0.15932609140872955, + "learning_rate": 9.613560780409243e-05, + "loss": 1.1416, + "step": 2118 + }, + { + "epoch": 0.1287441521356097, + "grad_norm": 0.1595773547887802, + "learning_rate": 9.613191675355063e-05, + "loss": 1.1133, + "step": 2119 + }, + { + "epoch": 0.12880490916823623, + "grad_norm": 0.17283505201339722, + "learning_rate": 9.612822401203452e-05, + "loss": 1.1745, + "step": 2120 + }, + { + "epoch": 0.12886566620086276, + "grad_norm": 0.19520869851112366, + "learning_rate": 9.612452957967943e-05, + "loss": 1.0721, + "step": 2121 + }, + { + "epoch": 0.12892642323348927, + "grad_norm": 0.1592271476984024, + "learning_rate": 9.612083345662078e-05, + "loss": 1.0574, + "step": 2122 + }, + { + "epoch": 0.1289871802661158, + "grad_norm": 0.16798152029514313, + "learning_rate": 9.611713564299408e-05, + "loss": 1.2063, + "step": 2123 + }, + { + "epoch": 0.12904793729874234, + "grad_norm": 0.22492818534374237, + "learning_rate": 9.611343613893487e-05, + "loss": 1.2212, + "step": 2124 + }, + { + "epoch": 0.12910869433136885, + "grad_norm": 0.17656837403774261, + "learning_rate": 9.610973494457872e-05, + "loss": 1.1992, + "step": 2125 + }, + { + "epoch": 0.12916945136399538, + "grad_norm": 0.45715394616127014, + "learning_rate": 9.610603206006133e-05, + "loss": 1.1674, + "step": 2126 + }, + { + "epoch": 0.12923020839662192, + "grad_norm": 0.26578259468078613, + "learning_rate": 9.610232748551844e-05, + "loss": 1.2901, + "step": 2127 + }, + { + "epoch": 0.12929096542924842, + "grad_norm": 0.8173021078109741, + "learning_rate": 9.60986212210858e-05, + "loss": 1.1727, + "step": 2128 + }, + { + "epoch": 0.12935172246187496, + "grad_norm": 0.43204405903816223, + "learning_rate": 9.60949132668993e-05, + "loss": 1.2081, + "step": 2129 + }, + { + "epoch": 0.1294124794945015, + "grad_norm": 0.25303393602371216, + "learning_rate": 9.609120362309485e-05, + "loss": 1.078, + "step": 2130 + }, + { + "epoch": 0.12947323652712803, + "grad_norm": 1.1481194496154785, + "learning_rate": 9.608749228980841e-05, + "loss": 1.2162, + "step": 2131 + }, + { + "epoch": 0.12953399355975453, + "grad_norm": 0.9371265172958374, + "learning_rate": 9.608377926717604e-05, + "loss": 1.1349, + "step": 2132 + }, + { + "epoch": 0.12959475059238107, + "grad_norm": 0.33399972319602966, + "learning_rate": 9.608006455533383e-05, + "loss": 1.2676, + "step": 2133 + }, + { + "epoch": 0.1296555076250076, + "grad_norm": 0.1876756250858307, + "learning_rate": 9.607634815441795e-05, + "loss": 1.1356, + "step": 2134 + }, + { + "epoch": 0.1297162646576341, + "grad_norm": 0.22510530054569244, + "learning_rate": 9.607263006456462e-05, + "loss": 1.1162, + "step": 2135 + }, + { + "epoch": 0.12977702169026065, + "grad_norm": 0.1685524880886078, + "learning_rate": 9.606891028591013e-05, + "loss": 1.0784, + "step": 2136 + }, + { + "epoch": 0.12983777872288718, + "grad_norm": 0.1984611600637436, + "learning_rate": 9.606518881859083e-05, + "loss": 1.1671, + "step": 2137 + }, + { + "epoch": 0.1298985357555137, + "grad_norm": 0.16086313128471375, + "learning_rate": 9.606146566274312e-05, + "loss": 1.1053, + "step": 2138 + }, + { + "epoch": 0.12995929278814022, + "grad_norm": 0.19444610178470612, + "learning_rate": 9.60577408185035e-05, + "loss": 1.1507, + "step": 2139 + }, + { + "epoch": 0.13002004982076676, + "grad_norm": 0.22640952467918396, + "learning_rate": 9.605401428600846e-05, + "loss": 1.0466, + "step": 2140 + }, + { + "epoch": 0.1300808068533933, + "grad_norm": 0.5249062180519104, + "learning_rate": 9.605028606539466e-05, + "loss": 1.314, + "step": 2141 + }, + { + "epoch": 0.1301415638860198, + "grad_norm": 0.2367764264345169, + "learning_rate": 9.60465561567987e-05, + "loss": 1.2181, + "step": 2142 + }, + { + "epoch": 0.13020232091864634, + "grad_norm": 0.4680385887622833, + "learning_rate": 9.604282456035733e-05, + "loss": 1.2948, + "step": 2143 + }, + { + "epoch": 0.13026307795127287, + "grad_norm": 0.18987536430358887, + "learning_rate": 9.603909127620732e-05, + "loss": 1.1559, + "step": 2144 + }, + { + "epoch": 0.13032383498389938, + "grad_norm": 0.9202283024787903, + "learning_rate": 9.603535630448552e-05, + "loss": 1.085, + "step": 2145 + }, + { + "epoch": 0.1303845920165259, + "grad_norm": 0.25142955780029297, + "learning_rate": 9.603161964532885e-05, + "loss": 1.1354, + "step": 2146 + }, + { + "epoch": 0.13044534904915245, + "grad_norm": 0.34282171726226807, + "learning_rate": 9.602788129887426e-05, + "loss": 1.2372, + "step": 2147 + }, + { + "epoch": 0.13050610608177896, + "grad_norm": 0.3733201026916504, + "learning_rate": 9.602414126525876e-05, + "loss": 1.2886, + "step": 2148 + }, + { + "epoch": 0.1305668631144055, + "grad_norm": 0.2155725210905075, + "learning_rate": 9.60203995446195e-05, + "loss": 1.0738, + "step": 2149 + }, + { + "epoch": 0.13062762014703203, + "grad_norm": 0.2998858392238617, + "learning_rate": 9.601665613709357e-05, + "loss": 1.201, + "step": 2150 + }, + { + "epoch": 0.13068837717965853, + "grad_norm": 0.1856468766927719, + "learning_rate": 9.601291104281823e-05, + "loss": 1.1018, + "step": 2151 + }, + { + "epoch": 0.13074913421228507, + "grad_norm": 0.17658545076847076, + "learning_rate": 9.600916426193074e-05, + "loss": 1.1235, + "step": 2152 + }, + { + "epoch": 0.1308098912449116, + "grad_norm": 0.1622500866651535, + "learning_rate": 9.600541579456844e-05, + "loss": 1.1145, + "step": 2153 + }, + { + "epoch": 0.13087064827753814, + "grad_norm": 0.17708943784236908, + "learning_rate": 9.600166564086874e-05, + "loss": 1.0893, + "step": 2154 + }, + { + "epoch": 0.13093140531016464, + "grad_norm": 0.14311006665229797, + "learning_rate": 9.599791380096908e-05, + "loss": 1.1153, + "step": 2155 + }, + { + "epoch": 0.13099216234279118, + "grad_norm": 0.1791146993637085, + "learning_rate": 9.5994160275007e-05, + "loss": 1.2152, + "step": 2156 + }, + { + "epoch": 0.13105291937541771, + "grad_norm": 0.5301501750946045, + "learning_rate": 9.59904050631201e-05, + "loss": 1.1234, + "step": 2157 + }, + { + "epoch": 0.13111367640804422, + "grad_norm": 0.21681536734104156, + "learning_rate": 9.5986648165446e-05, + "loss": 1.2372, + "step": 2158 + }, + { + "epoch": 0.13117443344067076, + "grad_norm": 0.3805176317691803, + "learning_rate": 9.598288958212242e-05, + "loss": 1.1109, + "step": 2159 + }, + { + "epoch": 0.1312351904732973, + "grad_norm": 0.19702747464179993, + "learning_rate": 9.597912931328714e-05, + "loss": 1.2659, + "step": 2160 + }, + { + "epoch": 0.1312959475059238, + "grad_norm": 0.20077842473983765, + "learning_rate": 9.597536735907797e-05, + "loss": 1.1336, + "step": 2161 + }, + { + "epoch": 0.13135670453855033, + "grad_norm": 0.36625880002975464, + "learning_rate": 9.597160371963286e-05, + "loss": 1.2307, + "step": 2162 + }, + { + "epoch": 0.13141746157117687, + "grad_norm": 0.38198354840278625, + "learning_rate": 9.596783839508969e-05, + "loss": 1.2184, + "step": 2163 + }, + { + "epoch": 0.1314782186038034, + "grad_norm": 0.5571491122245789, + "learning_rate": 9.596407138558653e-05, + "loss": 1.0653, + "step": 2164 + }, + { + "epoch": 0.1315389756364299, + "grad_norm": 0.15978047251701355, + "learning_rate": 9.596030269126146e-05, + "loss": 1.1472, + "step": 2165 + }, + { + "epoch": 0.13159973266905645, + "grad_norm": 0.2738722562789917, + "learning_rate": 9.595653231225261e-05, + "loss": 1.2037, + "step": 2166 + }, + { + "epoch": 0.13166048970168298, + "grad_norm": 0.2074570208787918, + "learning_rate": 9.595276024869817e-05, + "loss": 1.1429, + "step": 2167 + }, + { + "epoch": 0.1317212467343095, + "grad_norm": 0.29405421018600464, + "learning_rate": 9.594898650073644e-05, + "loss": 1.1024, + "step": 2168 + }, + { + "epoch": 0.13178200376693602, + "grad_norm": 0.21341866254806519, + "learning_rate": 9.594521106850571e-05, + "loss": 1.0728, + "step": 2169 + }, + { + "epoch": 0.13184276079956256, + "grad_norm": 0.1805810034275055, + "learning_rate": 9.594143395214441e-05, + "loss": 1.1444, + "step": 2170 + }, + { + "epoch": 0.13190351783218907, + "grad_norm": 0.2784469723701477, + "learning_rate": 9.593765515179095e-05, + "loss": 1.1289, + "step": 2171 + }, + { + "epoch": 0.1319642748648156, + "grad_norm": 0.2613830268383026, + "learning_rate": 9.593387466758386e-05, + "loss": 1.1165, + "step": 2172 + }, + { + "epoch": 0.13202503189744214, + "grad_norm": 0.19762438535690308, + "learning_rate": 9.593009249966172e-05, + "loss": 1.0818, + "step": 2173 + }, + { + "epoch": 0.13208578893006864, + "grad_norm": 0.353282630443573, + "learning_rate": 9.592630864816317e-05, + "loss": 1.1068, + "step": 2174 + }, + { + "epoch": 0.13214654596269518, + "grad_norm": 0.1910041719675064, + "learning_rate": 9.59225231132269e-05, + "loss": 1.1169, + "step": 2175 + }, + { + "epoch": 0.1322073029953217, + "grad_norm": 0.19785964488983154, + "learning_rate": 9.591873589499166e-05, + "loss": 1.2027, + "step": 2176 + }, + { + "epoch": 0.13226806002794825, + "grad_norm": 0.3330136835575104, + "learning_rate": 9.591494699359627e-05, + "loss": 1.0565, + "step": 2177 + }, + { + "epoch": 0.13232881706057475, + "grad_norm": 0.23309539258480072, + "learning_rate": 9.591115640917963e-05, + "loss": 1.3634, + "step": 2178 + }, + { + "epoch": 0.1323895740932013, + "grad_norm": 0.743053138256073, + "learning_rate": 9.590736414188069e-05, + "loss": 1.2602, + "step": 2179 + }, + { + "epoch": 0.13245033112582782, + "grad_norm": 0.16662898659706116, + "learning_rate": 9.590357019183843e-05, + "loss": 1.119, + "step": 2180 + }, + { + "epoch": 0.13251108815845433, + "grad_norm": 0.4592370390892029, + "learning_rate": 9.589977455919193e-05, + "loss": 1.099, + "step": 2181 + }, + { + "epoch": 0.13257184519108087, + "grad_norm": 0.2229791283607483, + "learning_rate": 9.589597724408032e-05, + "loss": 1.1117, + "step": 2182 + }, + { + "epoch": 0.1326326022237074, + "grad_norm": 0.22165454924106598, + "learning_rate": 9.589217824664281e-05, + "loss": 1.0925, + "step": 2183 + }, + { + "epoch": 0.1326933592563339, + "grad_norm": 0.2257971316576004, + "learning_rate": 9.588837756701861e-05, + "loss": 1.2189, + "step": 2184 + }, + { + "epoch": 0.13275411628896044, + "grad_norm": 0.21456827223300934, + "learning_rate": 9.588457520534708e-05, + "loss": 1.1333, + "step": 2185 + }, + { + "epoch": 0.13281487332158698, + "grad_norm": 0.18152987957000732, + "learning_rate": 9.588077116176756e-05, + "loss": 1.1679, + "step": 2186 + }, + { + "epoch": 0.1328756303542135, + "grad_norm": 0.8997235298156738, + "learning_rate": 9.587696543641953e-05, + "loss": 1.2548, + "step": 2187 + }, + { + "epoch": 0.13293638738684002, + "grad_norm": 0.30472829937934875, + "learning_rate": 9.587315802944245e-05, + "loss": 1.2025, + "step": 2188 + }, + { + "epoch": 0.13299714441946656, + "grad_norm": 0.24143661558628082, + "learning_rate": 9.58693489409759e-05, + "loss": 1.1173, + "step": 2189 + }, + { + "epoch": 0.1330579014520931, + "grad_norm": 0.24073444306850433, + "learning_rate": 9.586553817115948e-05, + "loss": 1.0794, + "step": 2190 + }, + { + "epoch": 0.1331186584847196, + "grad_norm": 0.1910742074251175, + "learning_rate": 9.586172572013291e-05, + "loss": 1.1332, + "step": 2191 + }, + { + "epoch": 0.13317941551734613, + "grad_norm": 0.19061161577701569, + "learning_rate": 9.585791158803592e-05, + "loss": 1.1671, + "step": 2192 + }, + { + "epoch": 0.13324017254997267, + "grad_norm": 0.23281781375408173, + "learning_rate": 9.58540957750083e-05, + "loss": 1.1418, + "step": 2193 + }, + { + "epoch": 0.13330092958259918, + "grad_norm": 0.1959117352962494, + "learning_rate": 9.585027828118995e-05, + "loss": 1.125, + "step": 2194 + }, + { + "epoch": 0.1333616866152257, + "grad_norm": 0.13621953129768372, + "learning_rate": 9.584645910672078e-05, + "loss": 1.0938, + "step": 2195 + }, + { + "epoch": 0.13342244364785225, + "grad_norm": 0.31365418434143066, + "learning_rate": 9.58426382517408e-05, + "loss": 1.1188, + "step": 2196 + }, + { + "epoch": 0.13348320068047875, + "grad_norm": 0.2360733449459076, + "learning_rate": 9.583881571639005e-05, + "loss": 1.2033, + "step": 2197 + }, + { + "epoch": 0.1335439577131053, + "grad_norm": 0.5398380160331726, + "learning_rate": 9.583499150080865e-05, + "loss": 1.1213, + "step": 2198 + }, + { + "epoch": 0.13360471474573182, + "grad_norm": 0.2268095314502716, + "learning_rate": 9.583116560513678e-05, + "loss": 1.0736, + "step": 2199 + }, + { + "epoch": 0.13366547177835836, + "grad_norm": 0.2760641276836395, + "learning_rate": 9.582733802951467e-05, + "loss": 1.1001, + "step": 2200 + }, + { + "epoch": 0.13372622881098487, + "grad_norm": 0.2718813419342041, + "learning_rate": 9.582350877408262e-05, + "loss": 1.1135, + "step": 2201 + }, + { + "epoch": 0.1337869858436114, + "grad_norm": 0.16864360868930817, + "learning_rate": 9.5819677838981e-05, + "loss": 1.1534, + "step": 2202 + }, + { + "epoch": 0.13384774287623794, + "grad_norm": 0.1424243301153183, + "learning_rate": 9.581584522435024e-05, + "loss": 1.0753, + "step": 2203 + }, + { + "epoch": 0.13390849990886444, + "grad_norm": 0.33547699451446533, + "learning_rate": 9.581201093033082e-05, + "loss": 1.1696, + "step": 2204 + }, + { + "epoch": 0.13396925694149098, + "grad_norm": 0.1380401849746704, + "learning_rate": 9.580817495706327e-05, + "loss": 1.0863, + "step": 2205 + }, + { + "epoch": 0.1340300139741175, + "grad_norm": 0.3057708740234375, + "learning_rate": 9.580433730468823e-05, + "loss": 1.1465, + "step": 2206 + }, + { + "epoch": 0.13409077100674402, + "grad_norm": 0.16703514754772186, + "learning_rate": 9.580049797334634e-05, + "loss": 1.1367, + "step": 2207 + }, + { + "epoch": 0.13415152803937055, + "grad_norm": 0.2826164960861206, + "learning_rate": 9.579665696317835e-05, + "loss": 1.2703, + "step": 2208 + }, + { + "epoch": 0.1342122850719971, + "grad_norm": 0.19026175141334534, + "learning_rate": 9.579281427432504e-05, + "loss": 1.1272, + "step": 2209 + }, + { + "epoch": 0.1342730421046236, + "grad_norm": 0.33325132727622986, + "learning_rate": 9.578896990692727e-05, + "loss": 1.1676, + "step": 2210 + }, + { + "epoch": 0.13433379913725013, + "grad_norm": 0.2609262764453888, + "learning_rate": 9.578512386112596e-05, + "loss": 1.0918, + "step": 2211 + }, + { + "epoch": 0.13439455616987667, + "grad_norm": 0.26915979385375977, + "learning_rate": 9.578127613706209e-05, + "loss": 1.1017, + "step": 2212 + }, + { + "epoch": 0.1344553132025032, + "grad_norm": 0.3109450936317444, + "learning_rate": 9.577742673487668e-05, + "loss": 1.1867, + "step": 2213 + }, + { + "epoch": 0.1345160702351297, + "grad_norm": 0.38755548000335693, + "learning_rate": 9.577357565471086e-05, + "loss": 1.1065, + "step": 2214 + }, + { + "epoch": 0.13457682726775624, + "grad_norm": 0.18848983943462372, + "learning_rate": 9.576972289670578e-05, + "loss": 1.1081, + "step": 2215 + }, + { + "epoch": 0.13463758430038278, + "grad_norm": 0.5497241020202637, + "learning_rate": 9.576586846100264e-05, + "loss": 1.1253, + "step": 2216 + }, + { + "epoch": 0.13469834133300929, + "grad_norm": 0.3082449436187744, + "learning_rate": 9.576201234774275e-05, + "loss": 1.0867, + "step": 2217 + }, + { + "epoch": 0.13475909836563582, + "grad_norm": 0.26647523045539856, + "learning_rate": 9.575815455706748e-05, + "loss": 1.0775, + "step": 2218 + }, + { + "epoch": 0.13481985539826236, + "grad_norm": 0.30094531178474426, + "learning_rate": 9.575429508911817e-05, + "loss": 1.198, + "step": 2219 + }, + { + "epoch": 0.13488061243088886, + "grad_norm": 0.33741235733032227, + "learning_rate": 9.575043394403635e-05, + "loss": 1.2459, + "step": 2220 + }, + { + "epoch": 0.1349413694635154, + "grad_norm": 0.16895785927772522, + "learning_rate": 9.574657112196354e-05, + "loss": 1.1017, + "step": 2221 + }, + { + "epoch": 0.13500212649614193, + "grad_norm": 5.2270708084106445, + "learning_rate": 9.57427066230413e-05, + "loss": 1.0723, + "step": 2222 + }, + { + "epoch": 0.13506288352876847, + "grad_norm": 0.4301412105560303, + "learning_rate": 9.573884044741133e-05, + "loss": 1.1206, + "step": 2223 + }, + { + "epoch": 0.13512364056139498, + "grad_norm": 1.5419443845748901, + "learning_rate": 9.57349725952153e-05, + "loss": 1.1332, + "step": 2224 + }, + { + "epoch": 0.1351843975940215, + "grad_norm": 0.25238433480262756, + "learning_rate": 9.573110306659504e-05, + "loss": 1.1976, + "step": 2225 + }, + { + "epoch": 0.13524515462664805, + "grad_norm": 0.5974071025848389, + "learning_rate": 9.572723186169232e-05, + "loss": 1.1163, + "step": 2226 + }, + { + "epoch": 0.13530591165927455, + "grad_norm": 0.6074419617652893, + "learning_rate": 9.57233589806491e-05, + "loss": 1.0886, + "step": 2227 + }, + { + "epoch": 0.1353666686919011, + "grad_norm": 0.3090222179889679, + "learning_rate": 9.57194844236073e-05, + "loss": 1.0706, + "step": 2228 + }, + { + "epoch": 0.13542742572452762, + "grad_norm": 0.35570967197418213, + "learning_rate": 9.571560819070898e-05, + "loss": 1.1781, + "step": 2229 + }, + { + "epoch": 0.13548818275715413, + "grad_norm": 0.165644571185112, + "learning_rate": 9.57117302820962e-05, + "loss": 1.1643, + "step": 2230 + }, + { + "epoch": 0.13554893978978066, + "grad_norm": 6.318443775177002, + "learning_rate": 9.57078506979111e-05, + "loss": 1.1198, + "step": 2231 + }, + { + "epoch": 0.1356096968224072, + "grad_norm": 0.587719202041626, + "learning_rate": 9.570396943829592e-05, + "loss": 1.1641, + "step": 2232 + }, + { + "epoch": 0.1356704538550337, + "grad_norm": 0.34268298745155334, + "learning_rate": 9.570008650339288e-05, + "loss": 1.1181, + "step": 2233 + }, + { + "epoch": 0.13573121088766024, + "grad_norm": 0.49453502893447876, + "learning_rate": 9.569620189334435e-05, + "loss": 1.2803, + "step": 2234 + }, + { + "epoch": 0.13579196792028678, + "grad_norm": 0.7012412548065186, + "learning_rate": 9.56923156082927e-05, + "loss": 1.1048, + "step": 2235 + }, + { + "epoch": 0.1358527249529133, + "grad_norm": 0.8635398745536804, + "learning_rate": 9.568842764838041e-05, + "loss": 1.0926, + "step": 2236 + }, + { + "epoch": 0.13591348198553982, + "grad_norm": 0.42028212547302246, + "learning_rate": 9.568453801374995e-05, + "loss": 1.2891, + "step": 2237 + }, + { + "epoch": 0.13597423901816635, + "grad_norm": 0.851941704750061, + "learning_rate": 9.568064670454393e-05, + "loss": 1.0685, + "step": 2238 + }, + { + "epoch": 0.1360349960507929, + "grad_norm": 0.20819708704948425, + "learning_rate": 9.567675372090497e-05, + "loss": 1.1275, + "step": 2239 + }, + { + "epoch": 0.1360957530834194, + "grad_norm": 0.4963993430137634, + "learning_rate": 9.567285906297577e-05, + "loss": 1.2704, + "step": 2240 + }, + { + "epoch": 0.13615651011604593, + "grad_norm": 0.5887370109558105, + "learning_rate": 9.56689627308991e-05, + "loss": 1.1956, + "step": 2241 + }, + { + "epoch": 0.13621726714867247, + "grad_norm": 0.33166342973709106, + "learning_rate": 9.566506472481778e-05, + "loss": 1.0992, + "step": 2242 + }, + { + "epoch": 0.13627802418129897, + "grad_norm": 0.3141010105609894, + "learning_rate": 9.566116504487466e-05, + "loss": 1.1705, + "step": 2243 + }, + { + "epoch": 0.1363387812139255, + "grad_norm": 0.34887564182281494, + "learning_rate": 9.565726369121273e-05, + "loss": 1.1057, + "step": 2244 + }, + { + "epoch": 0.13639953824655204, + "grad_norm": 0.2421671450138092, + "learning_rate": 9.565336066397496e-05, + "loss": 1.0938, + "step": 2245 + }, + { + "epoch": 0.13646029527917858, + "grad_norm": 0.41549810767173767, + "learning_rate": 9.564945596330446e-05, + "loss": 1.3261, + "step": 2246 + }, + { + "epoch": 0.13652105231180509, + "grad_norm": 0.44896894693374634, + "learning_rate": 9.564554958934431e-05, + "loss": 1.2324, + "step": 2247 + }, + { + "epoch": 0.13658180934443162, + "grad_norm": 0.46312132477760315, + "learning_rate": 9.56416415422377e-05, + "loss": 1.1076, + "step": 2248 + }, + { + "epoch": 0.13664256637705816, + "grad_norm": 0.22303961217403412, + "learning_rate": 9.563773182212791e-05, + "loss": 1.1594, + "step": 2249 + }, + { + "epoch": 0.13670332340968466, + "grad_norm": 0.5063765645027161, + "learning_rate": 9.563382042915826e-05, + "loss": 1.0987, + "step": 2250 + }, + { + "epoch": 0.1367640804423112, + "grad_norm": 0.31590166687965393, + "learning_rate": 9.562990736347207e-05, + "loss": 1.1698, + "step": 2251 + }, + { + "epoch": 0.13682483747493773, + "grad_norm": 0.23123250901699066, + "learning_rate": 9.562599262521281e-05, + "loss": 1.1019, + "step": 2252 + }, + { + "epoch": 0.13688559450756424, + "grad_norm": 0.4999677538871765, + "learning_rate": 9.562207621452398e-05, + "loss": 1.2709, + "step": 2253 + }, + { + "epoch": 0.13694635154019077, + "grad_norm": 0.24811404943466187, + "learning_rate": 9.561815813154913e-05, + "loss": 1.1109, + "step": 2254 + }, + { + "epoch": 0.1370071085728173, + "grad_norm": 0.16323330998420715, + "learning_rate": 9.561423837643186e-05, + "loss": 1.1211, + "step": 2255 + }, + { + "epoch": 0.13706786560544382, + "grad_norm": 0.6281250715255737, + "learning_rate": 9.561031694931589e-05, + "loss": 1.1062, + "step": 2256 + }, + { + "epoch": 0.13712862263807035, + "grad_norm": 0.24023033678531647, + "learning_rate": 9.560639385034491e-05, + "loss": 1.1081, + "step": 2257 + }, + { + "epoch": 0.1371893796706969, + "grad_norm": 0.21369919180870056, + "learning_rate": 9.560246907966276e-05, + "loss": 1.0844, + "step": 2258 + }, + { + "epoch": 0.13725013670332342, + "grad_norm": 0.2748388350009918, + "learning_rate": 9.55985426374133e-05, + "loss": 1.0781, + "step": 2259 + }, + { + "epoch": 0.13731089373594993, + "grad_norm": 0.23953093588352203, + "learning_rate": 9.559461452374043e-05, + "loss": 1.1211, + "step": 2260 + }, + { + "epoch": 0.13737165076857646, + "grad_norm": 0.26806795597076416, + "learning_rate": 9.559068473878816e-05, + "loss": 1.0397, + "step": 2261 + }, + { + "epoch": 0.137432407801203, + "grad_norm": 0.5642307996749878, + "learning_rate": 9.558675328270053e-05, + "loss": 1.2112, + "step": 2262 + }, + { + "epoch": 0.1374931648338295, + "grad_norm": 0.23219093680381775, + "learning_rate": 9.558282015562165e-05, + "loss": 1.0983, + "step": 2263 + }, + { + "epoch": 0.13755392186645604, + "grad_norm": 0.4778085947036743, + "learning_rate": 9.557888535769568e-05, + "loss": 1.0925, + "step": 2264 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 0.24083127081394196, + "learning_rate": 9.557494888906685e-05, + "loss": 1.095, + "step": 2265 + }, + { + "epoch": 0.13767543593170908, + "grad_norm": 0.16870884597301483, + "learning_rate": 9.557101074987946e-05, + "loss": 1.1156, + "step": 2266 + }, + { + "epoch": 0.13773619296433562, + "grad_norm": 0.15064562857151031, + "learning_rate": 9.556707094027786e-05, + "loss": 1.1584, + "step": 2267 + }, + { + "epoch": 0.13779694999696215, + "grad_norm": 0.24823467433452606, + "learning_rate": 9.556312946040649e-05, + "loss": 1.3144, + "step": 2268 + }, + { + "epoch": 0.1378577070295887, + "grad_norm": 0.15248197317123413, + "learning_rate": 9.555918631040979e-05, + "loss": 1.1203, + "step": 2269 + }, + { + "epoch": 0.1379184640622152, + "grad_norm": 0.28081315755844116, + "learning_rate": 9.55552414904323e-05, + "loss": 1.1326, + "step": 2270 + }, + { + "epoch": 0.13797922109484173, + "grad_norm": 0.16592133045196533, + "learning_rate": 9.555129500061864e-05, + "loss": 1.1351, + "step": 2271 + }, + { + "epoch": 0.13803997812746827, + "grad_norm": 0.24708499014377594, + "learning_rate": 9.554734684111345e-05, + "loss": 1.1884, + "step": 2272 + }, + { + "epoch": 0.13810073516009477, + "grad_norm": 0.1949533224105835, + "learning_rate": 9.554339701206145e-05, + "loss": 1.2322, + "step": 2273 + }, + { + "epoch": 0.1381614921927213, + "grad_norm": 0.17027607560157776, + "learning_rate": 9.553944551360745e-05, + "loss": 1.1543, + "step": 2274 + }, + { + "epoch": 0.13822224922534784, + "grad_norm": 0.38415852189064026, + "learning_rate": 9.553549234589624e-05, + "loss": 1.2894, + "step": 2275 + }, + { + "epoch": 0.13828300625797435, + "grad_norm": 0.31658172607421875, + "learning_rate": 9.553153750907279e-05, + "loss": 1.105, + "step": 2276 + }, + { + "epoch": 0.13834376329060089, + "grad_norm": 0.22498324513435364, + "learning_rate": 9.552758100328202e-05, + "loss": 1.1804, + "step": 2277 + }, + { + "epoch": 0.13840452032322742, + "grad_norm": 0.5629101395606995, + "learning_rate": 9.552362282866896e-05, + "loss": 1.0719, + "step": 2278 + }, + { + "epoch": 0.13846527735585393, + "grad_norm": 0.3577674925327301, + "learning_rate": 9.55196629853787e-05, + "loss": 1.1355, + "step": 2279 + }, + { + "epoch": 0.13852603438848046, + "grad_norm": 0.16320045292377472, + "learning_rate": 9.551570147355642e-05, + "loss": 1.2234, + "step": 2280 + }, + { + "epoch": 0.138586791421107, + "grad_norm": 0.2317439317703247, + "learning_rate": 9.55117382933473e-05, + "loss": 1.1686, + "step": 2281 + }, + { + "epoch": 0.13864754845373353, + "grad_norm": 0.4535728693008423, + "learning_rate": 9.55077734448966e-05, + "loss": 1.2012, + "step": 2282 + }, + { + "epoch": 0.13870830548636004, + "grad_norm": 0.19711743295192719, + "learning_rate": 9.550380692834967e-05, + "loss": 1.1153, + "step": 2283 + }, + { + "epoch": 0.13876906251898657, + "grad_norm": 0.3493136465549469, + "learning_rate": 9.549983874385191e-05, + "loss": 1.1343, + "step": 2284 + }, + { + "epoch": 0.1388298195516131, + "grad_norm": 1.0998477935791016, + "learning_rate": 9.549586889154875e-05, + "loss": 1.1199, + "step": 2285 + }, + { + "epoch": 0.13889057658423962, + "grad_norm": 0.2433186024427414, + "learning_rate": 9.549189737158574e-05, + "loss": 1.0883, + "step": 2286 + }, + { + "epoch": 0.13895133361686615, + "grad_norm": 0.3650475740432739, + "learning_rate": 9.548792418410844e-05, + "loss": 1.1727, + "step": 2287 + }, + { + "epoch": 0.1390120906494927, + "grad_norm": 0.49776846170425415, + "learning_rate": 9.548394932926248e-05, + "loss": 1.2129, + "step": 2288 + }, + { + "epoch": 0.1390728476821192, + "grad_norm": 0.34604546427726746, + "learning_rate": 9.547997280719355e-05, + "loss": 1.1886, + "step": 2289 + }, + { + "epoch": 0.13913360471474573, + "grad_norm": 0.17261995375156403, + "learning_rate": 9.547599461804743e-05, + "loss": 1.1657, + "step": 2290 + }, + { + "epoch": 0.13919436174737226, + "grad_norm": 0.1966523379087448, + "learning_rate": 9.547201476196996e-05, + "loss": 1.1179, + "step": 2291 + }, + { + "epoch": 0.13925511877999877, + "grad_norm": 0.2130250483751297, + "learning_rate": 9.546803323910698e-05, + "loss": 1.0747, + "step": 2292 + }, + { + "epoch": 0.1393158758126253, + "grad_norm": 0.8317158222198486, + "learning_rate": 9.546405004960445e-05, + "loss": 1.0561, + "step": 2293 + }, + { + "epoch": 0.13937663284525184, + "grad_norm": 0.15411683917045593, + "learning_rate": 9.546006519360839e-05, + "loss": 1.0869, + "step": 2294 + }, + { + "epoch": 0.13943738987787838, + "grad_norm": 1.1167824268341064, + "learning_rate": 9.545607867126484e-05, + "loss": 1.1704, + "step": 2295 + }, + { + "epoch": 0.13949814691050488, + "grad_norm": 0.9352914690971375, + "learning_rate": 9.545209048271997e-05, + "loss": 1.1141, + "step": 2296 + }, + { + "epoch": 0.13955890394313142, + "grad_norm": 0.3974536061286926, + "learning_rate": 9.54481006281199e-05, + "loss": 1.2302, + "step": 2297 + }, + { + "epoch": 0.13961966097575795, + "grad_norm": 0.8662892580032349, + "learning_rate": 9.544410910761094e-05, + "loss": 1.15, + "step": 2298 + }, + { + "epoch": 0.13968041800838446, + "grad_norm": 0.1656741201877594, + "learning_rate": 9.544011592133936e-05, + "loss": 1.0683, + "step": 2299 + }, + { + "epoch": 0.139741175041011, + "grad_norm": 0.5774582028388977, + "learning_rate": 9.543612106945157e-05, + "loss": 1.076, + "step": 2300 + }, + { + "epoch": 0.13980193207363753, + "grad_norm": 0.24073489010334015, + "learning_rate": 9.543212455209396e-05, + "loss": 1.0723, + "step": 2301 + }, + { + "epoch": 0.13986268910626404, + "grad_norm": 0.6168996691703796, + "learning_rate": 9.542812636941306e-05, + "loss": 1.099, + "step": 2302 + }, + { + "epoch": 0.13992344613889057, + "grad_norm": 0.3081745207309723, + "learning_rate": 9.542412652155539e-05, + "loss": 1.1008, + "step": 2303 + }, + { + "epoch": 0.1399842031715171, + "grad_norm": 0.1635631024837494, + "learning_rate": 9.542012500866759e-05, + "loss": 1.1042, + "step": 2304 + }, + { + "epoch": 0.14004496020414364, + "grad_norm": 0.36980193853378296, + "learning_rate": 9.541612183089632e-05, + "loss": 1.191, + "step": 2305 + }, + { + "epoch": 0.14010571723677015, + "grad_norm": 0.277229905128479, + "learning_rate": 9.541211698838834e-05, + "loss": 1.056, + "step": 2306 + }, + { + "epoch": 0.14016647426939668, + "grad_norm": 0.20339277386665344, + "learning_rate": 9.540811048129041e-05, + "loss": 1.1423, + "step": 2307 + }, + { + "epoch": 0.14022723130202322, + "grad_norm": 0.21004325151443481, + "learning_rate": 9.540410230974943e-05, + "loss": 1.3987, + "step": 2308 + }, + { + "epoch": 0.14028798833464973, + "grad_norm": 0.30993664264678955, + "learning_rate": 9.54000924739123e-05, + "loss": 1.1818, + "step": 2309 + }, + { + "epoch": 0.14034874536727626, + "grad_norm": 1.4932667016983032, + "learning_rate": 9.539608097392602e-05, + "loss": 1.1422, + "step": 2310 + }, + { + "epoch": 0.1404095023999028, + "grad_norm": 0.3883330523967743, + "learning_rate": 9.539206780993761e-05, + "loss": 1.1709, + "step": 2311 + }, + { + "epoch": 0.1404702594325293, + "grad_norm": 0.24175414443016052, + "learning_rate": 9.538805298209416e-05, + "loss": 1.129, + "step": 2312 + }, + { + "epoch": 0.14053101646515584, + "grad_norm": 0.2146836668252945, + "learning_rate": 9.538403649054288e-05, + "loss": 1.0827, + "step": 2313 + }, + { + "epoch": 0.14059177349778237, + "grad_norm": 0.3535554111003876, + "learning_rate": 9.538001833543095e-05, + "loss": 1.1411, + "step": 2314 + }, + { + "epoch": 0.14065253053040888, + "grad_norm": 0.18057365715503693, + "learning_rate": 9.537599851690569e-05, + "loss": 1.1256, + "step": 2315 + }, + { + "epoch": 0.14071328756303542, + "grad_norm": 0.3745160400867462, + "learning_rate": 9.537197703511443e-05, + "loss": 1.1264, + "step": 2316 + }, + { + "epoch": 0.14077404459566195, + "grad_norm": 0.5218973159790039, + "learning_rate": 9.536795389020457e-05, + "loss": 1.2895, + "step": 2317 + }, + { + "epoch": 0.1408348016282885, + "grad_norm": 0.17800843715667725, + "learning_rate": 9.536392908232361e-05, + "loss": 1.1102, + "step": 2318 + }, + { + "epoch": 0.140895558660915, + "grad_norm": 0.22896233201026917, + "learning_rate": 9.535990261161904e-05, + "loss": 1.1925, + "step": 2319 + }, + { + "epoch": 0.14095631569354153, + "grad_norm": 0.34376412630081177, + "learning_rate": 9.535587447823849e-05, + "loss": 1.1925, + "step": 2320 + }, + { + "epoch": 0.14101707272616806, + "grad_norm": 0.147017240524292, + "learning_rate": 9.535184468232958e-05, + "loss": 1.132, + "step": 2321 + }, + { + "epoch": 0.14107782975879457, + "grad_norm": 0.3092877268791199, + "learning_rate": 9.534781322404005e-05, + "loss": 1.1574, + "step": 2322 + }, + { + "epoch": 0.1411385867914211, + "grad_norm": 0.15649278461933136, + "learning_rate": 9.534378010351766e-05, + "loss": 1.066, + "step": 2323 + }, + { + "epoch": 0.14119934382404764, + "grad_norm": 0.21558749675750732, + "learning_rate": 9.533974532091023e-05, + "loss": 1.0897, + "step": 2324 + }, + { + "epoch": 0.14126010085667415, + "grad_norm": 0.2511991560459137, + "learning_rate": 9.53357088763657e-05, + "loss": 1.1565, + "step": 2325 + }, + { + "epoch": 0.14132085788930068, + "grad_norm": 0.3122481405735016, + "learning_rate": 9.533167077003198e-05, + "loss": 1.0551, + "step": 2326 + }, + { + "epoch": 0.14138161492192722, + "grad_norm": 0.23617467284202576, + "learning_rate": 9.532763100205709e-05, + "loss": 1.1767, + "step": 2327 + }, + { + "epoch": 0.14144237195455375, + "grad_norm": 0.26131677627563477, + "learning_rate": 9.532358957258915e-05, + "loss": 1.208, + "step": 2328 + }, + { + "epoch": 0.14150312898718026, + "grad_norm": 0.1727125346660614, + "learning_rate": 9.531954648177625e-05, + "loss": 1.0769, + "step": 2329 + }, + { + "epoch": 0.1415638860198068, + "grad_norm": 2.5413877964019775, + "learning_rate": 9.531550172976664e-05, + "loss": 1.2453, + "step": 2330 + }, + { + "epoch": 0.14162464305243333, + "grad_norm": 0.22673800587654114, + "learning_rate": 9.531145531670853e-05, + "loss": 1.1197, + "step": 2331 + }, + { + "epoch": 0.14168540008505984, + "grad_norm": 0.16913120448589325, + "learning_rate": 9.530740724275028e-05, + "loss": 1.1916, + "step": 2332 + }, + { + "epoch": 0.14174615711768637, + "grad_norm": 0.31697878241539, + "learning_rate": 9.530335750804025e-05, + "loss": 1.2311, + "step": 2333 + }, + { + "epoch": 0.1418069141503129, + "grad_norm": 0.26434314250946045, + "learning_rate": 9.52993061127269e-05, + "loss": 1.1003, + "step": 2334 + }, + { + "epoch": 0.14186767118293941, + "grad_norm": 0.4128108322620392, + "learning_rate": 9.529525305695874e-05, + "loss": 1.1663, + "step": 2335 + }, + { + "epoch": 0.14192842821556595, + "grad_norm": 0.6479545831680298, + "learning_rate": 9.529119834088432e-05, + "loss": 1.0976, + "step": 2336 + }, + { + "epoch": 0.14198918524819248, + "grad_norm": 0.5405136942863464, + "learning_rate": 9.528714196465224e-05, + "loss": 1.2017, + "step": 2337 + }, + { + "epoch": 0.142049942280819, + "grad_norm": 0.7159753441810608, + "learning_rate": 9.528308392841126e-05, + "loss": 1.2827, + "step": 2338 + }, + { + "epoch": 0.14211069931344553, + "grad_norm": 0.26535531878471375, + "learning_rate": 9.527902423231005e-05, + "loss": 1.1856, + "step": 2339 + }, + { + "epoch": 0.14217145634607206, + "grad_norm": 1.685245156288147, + "learning_rate": 9.527496287649748e-05, + "loss": 1.1421, + "step": 2340 + }, + { + "epoch": 0.1422322133786986, + "grad_norm": 0.21378839015960693, + "learning_rate": 9.527089986112239e-05, + "loss": 1.1778, + "step": 2341 + }, + { + "epoch": 0.1422929704113251, + "grad_norm": 0.22931383550167084, + "learning_rate": 9.52668351863337e-05, + "loss": 1.1815, + "step": 2342 + }, + { + "epoch": 0.14235372744395164, + "grad_norm": 0.2110852152109146, + "learning_rate": 9.526276885228044e-05, + "loss": 1.0991, + "step": 2343 + }, + { + "epoch": 0.14241448447657817, + "grad_norm": 0.18488439917564392, + "learning_rate": 9.525870085911162e-05, + "loss": 1.3852, + "step": 2344 + }, + { + "epoch": 0.14247524150920468, + "grad_norm": 0.1488366723060608, + "learning_rate": 9.525463120697637e-05, + "loss": 1.0614, + "step": 2345 + }, + { + "epoch": 0.14253599854183122, + "grad_norm": 0.19828559458255768, + "learning_rate": 9.525055989602388e-05, + "loss": 1.298, + "step": 2346 + }, + { + "epoch": 0.14259675557445775, + "grad_norm": 0.16060969233512878, + "learning_rate": 9.524648692640336e-05, + "loss": 1.1147, + "step": 2347 + }, + { + "epoch": 0.14265751260708426, + "grad_norm": 1.2450884580612183, + "learning_rate": 9.524241229826411e-05, + "loss": 1.1099, + "step": 2348 + }, + { + "epoch": 0.1427182696397108, + "grad_norm": 0.28205931186676025, + "learning_rate": 9.523833601175551e-05, + "loss": 1.0847, + "step": 2349 + }, + { + "epoch": 0.14277902667233733, + "grad_norm": 0.206913024187088, + "learning_rate": 9.523425806702695e-05, + "loss": 1.0987, + "step": 2350 + }, + { + "epoch": 0.14283978370496386, + "grad_norm": 0.16962188482284546, + "learning_rate": 9.523017846422793e-05, + "loss": 1.1007, + "step": 2351 + }, + { + "epoch": 0.14290054073759037, + "grad_norm": 0.3351297676563263, + "learning_rate": 9.522609720350795e-05, + "loss": 1.2933, + "step": 2352 + }, + { + "epoch": 0.1429612977702169, + "grad_norm": 0.26097571849823, + "learning_rate": 9.522201428501664e-05, + "loss": 1.0871, + "step": 2353 + }, + { + "epoch": 0.14302205480284344, + "grad_norm": 0.23117025196552277, + "learning_rate": 9.521792970890366e-05, + "loss": 1.1786, + "step": 2354 + }, + { + "epoch": 0.14308281183546995, + "grad_norm": 0.29111966490745544, + "learning_rate": 9.521384347531874e-05, + "loss": 1.2099, + "step": 2355 + }, + { + "epoch": 0.14314356886809648, + "grad_norm": 0.18806366622447968, + "learning_rate": 9.520975558441164e-05, + "loss": 1.0685, + "step": 2356 + }, + { + "epoch": 0.14320432590072302, + "grad_norm": 0.1590515524148941, + "learning_rate": 9.520566603633221e-05, + "loss": 1.0795, + "step": 2357 + }, + { + "epoch": 0.14326508293334952, + "grad_norm": 0.172119602560997, + "learning_rate": 9.520157483123034e-05, + "loss": 1.18, + "step": 2358 + }, + { + "epoch": 0.14332583996597606, + "grad_norm": 0.1784110963344574, + "learning_rate": 9.519748196925601e-05, + "loss": 1.2058, + "step": 2359 + }, + { + "epoch": 0.1433865969986026, + "grad_norm": 0.17967002093791962, + "learning_rate": 9.519338745055926e-05, + "loss": 1.0634, + "step": 2360 + }, + { + "epoch": 0.1434473540312291, + "grad_norm": 0.30833303928375244, + "learning_rate": 9.518929127529016e-05, + "loss": 1.1298, + "step": 2361 + }, + { + "epoch": 0.14350811106385564, + "grad_norm": 0.162101611495018, + "learning_rate": 9.518519344359882e-05, + "loss": 1.1489, + "step": 2362 + }, + { + "epoch": 0.14356886809648217, + "grad_norm": 0.22001470625400543, + "learning_rate": 9.51810939556355e-05, + "loss": 1.1747, + "step": 2363 + }, + { + "epoch": 0.1436296251291087, + "grad_norm": 0.20032212138175964, + "learning_rate": 9.517699281155046e-05, + "loss": 1.1118, + "step": 2364 + }, + { + "epoch": 0.14369038216173521, + "grad_norm": 0.206923246383667, + "learning_rate": 9.5172890011494e-05, + "loss": 1.2583, + "step": 2365 + }, + { + "epoch": 0.14375113919436175, + "grad_norm": 0.14203689992427826, + "learning_rate": 9.516878555561652e-05, + "loss": 1.0626, + "step": 2366 + }, + { + "epoch": 0.14381189622698828, + "grad_norm": 2.838843822479248, + "learning_rate": 9.516467944406848e-05, + "loss": 1.1281, + "step": 2367 + }, + { + "epoch": 0.1438726532596148, + "grad_norm": 0.16107012331485748, + "learning_rate": 9.516057167700039e-05, + "loss": 1.1291, + "step": 2368 + }, + { + "epoch": 0.14393341029224133, + "grad_norm": 0.23750454187393188, + "learning_rate": 9.515646225456282e-05, + "loss": 1.1163, + "step": 2369 + }, + { + "epoch": 0.14399416732486786, + "grad_norm": 0.7830845713615417, + "learning_rate": 9.51523511769064e-05, + "loss": 1.1287, + "step": 2370 + }, + { + "epoch": 0.14405492435749437, + "grad_norm": 0.30437442660331726, + "learning_rate": 9.51482384441818e-05, + "loss": 1.046, + "step": 2371 + }, + { + "epoch": 0.1441156813901209, + "grad_norm": 0.17047496140003204, + "learning_rate": 9.51441240565398e-05, + "loss": 1.1627, + "step": 2372 + }, + { + "epoch": 0.14417643842274744, + "grad_norm": 0.18535156548023224, + "learning_rate": 9.514000801413122e-05, + "loss": 1.0805, + "step": 2373 + }, + { + "epoch": 0.14423719545537397, + "grad_norm": 0.21000000834465027, + "learning_rate": 9.513589031710691e-05, + "loss": 1.1576, + "step": 2374 + }, + { + "epoch": 0.14429795248800048, + "grad_norm": 0.22640864551067352, + "learning_rate": 9.513177096561781e-05, + "loss": 1.1064, + "step": 2375 + }, + { + "epoch": 0.14435870952062702, + "grad_norm": 0.20231229066848755, + "learning_rate": 9.512764995981493e-05, + "loss": 1.1338, + "step": 2376 + }, + { + "epoch": 0.14441946655325355, + "grad_norm": 0.1628715693950653, + "learning_rate": 9.512352729984932e-05, + "loss": 1.1069, + "step": 2377 + }, + { + "epoch": 0.14448022358588006, + "grad_norm": 0.3245583772659302, + "learning_rate": 9.51194029858721e-05, + "loss": 1.1173, + "step": 2378 + }, + { + "epoch": 0.1445409806185066, + "grad_norm": 0.21512292325496674, + "learning_rate": 9.511527701803442e-05, + "loss": 1.0938, + "step": 2379 + }, + { + "epoch": 0.14460173765113313, + "grad_norm": 0.184569850564003, + "learning_rate": 9.511114939648755e-05, + "loss": 1.1926, + "step": 2380 + }, + { + "epoch": 0.14466249468375963, + "grad_norm": 0.175813227891922, + "learning_rate": 9.51070201213828e-05, + "loss": 1.0953, + "step": 2381 + }, + { + "epoch": 0.14472325171638617, + "grad_norm": 0.21451465785503387, + "learning_rate": 9.510288919287149e-05, + "loss": 1.1846, + "step": 2382 + }, + { + "epoch": 0.1447840087490127, + "grad_norm": 0.15214553475379944, + "learning_rate": 9.509875661110506e-05, + "loss": 1.1163, + "step": 2383 + }, + { + "epoch": 0.1448447657816392, + "grad_norm": 0.1464489847421646, + "learning_rate": 9.509462237623499e-05, + "loss": 1.1324, + "step": 2384 + }, + { + "epoch": 0.14490552281426575, + "grad_norm": 0.14790543913841248, + "learning_rate": 9.509048648841281e-05, + "loss": 1.1133, + "step": 2385 + }, + { + "epoch": 0.14496627984689228, + "grad_norm": 0.23965315520763397, + "learning_rate": 9.508634894779014e-05, + "loss": 1.0818, + "step": 2386 + }, + { + "epoch": 0.14502703687951882, + "grad_norm": 0.18583054840564728, + "learning_rate": 9.508220975451864e-05, + "loss": 1.1308, + "step": 2387 + }, + { + "epoch": 0.14508779391214532, + "grad_norm": 0.960493266582489, + "learning_rate": 9.507806890875002e-05, + "loss": 1.173, + "step": 2388 + }, + { + "epoch": 0.14514855094477186, + "grad_norm": 0.16508913040161133, + "learning_rate": 9.507392641063607e-05, + "loss": 1.1809, + "step": 2389 + }, + { + "epoch": 0.1452093079773984, + "grad_norm": 0.233011394739151, + "learning_rate": 9.506978226032864e-05, + "loss": 1.1236, + "step": 2390 + }, + { + "epoch": 0.1452700650100249, + "grad_norm": 0.1477642059326172, + "learning_rate": 9.506563645797963e-05, + "loss": 1.1063, + "step": 2391 + }, + { + "epoch": 0.14533082204265144, + "grad_norm": 0.14913016557693481, + "learning_rate": 9.5061489003741e-05, + "loss": 1.132, + "step": 2392 + }, + { + "epoch": 0.14539157907527797, + "grad_norm": 0.1605728417634964, + "learning_rate": 9.505733989776478e-05, + "loss": 1.1114, + "step": 2393 + }, + { + "epoch": 0.14545233610790448, + "grad_norm": 0.24527214467525482, + "learning_rate": 9.505318914020307e-05, + "loss": 1.0602, + "step": 2394 + }, + { + "epoch": 0.145513093140531, + "grad_norm": 0.18539302051067352, + "learning_rate": 9.504903673120799e-05, + "loss": 1.1224, + "step": 2395 + }, + { + "epoch": 0.14557385017315755, + "grad_norm": 0.35246172547340393, + "learning_rate": 9.504488267093175e-05, + "loss": 1.0643, + "step": 2396 + }, + { + "epoch": 0.14563460720578406, + "grad_norm": 0.18054181337356567, + "learning_rate": 9.504072695952664e-05, + "loss": 1.1607, + "step": 2397 + }, + { + "epoch": 0.1456953642384106, + "grad_norm": 0.34346428513526917, + "learning_rate": 9.503656959714499e-05, + "loss": 1.1836, + "step": 2398 + }, + { + "epoch": 0.14575612127103713, + "grad_norm": 0.24999122321605682, + "learning_rate": 9.503241058393917e-05, + "loss": 1.1015, + "step": 2399 + }, + { + "epoch": 0.14581687830366366, + "grad_norm": 0.23386499285697937, + "learning_rate": 9.502824992006163e-05, + "loss": 1.1424, + "step": 2400 + }, + { + "epoch": 0.14587763533629017, + "grad_norm": 0.43365755677223206, + "learning_rate": 9.502408760566489e-05, + "loss": 1.3813, + "step": 2401 + }, + { + "epoch": 0.1459383923689167, + "grad_norm": 0.9056212306022644, + "learning_rate": 9.501992364090151e-05, + "loss": 1.1485, + "step": 2402 + }, + { + "epoch": 0.14599914940154324, + "grad_norm": 0.34531742334365845, + "learning_rate": 9.501575802592413e-05, + "loss": 1.1166, + "step": 2403 + }, + { + "epoch": 0.14605990643416975, + "grad_norm": 0.20710986852645874, + "learning_rate": 9.501159076088543e-05, + "loss": 1.1896, + "step": 2404 + }, + { + "epoch": 0.14612066346679628, + "grad_norm": 0.19179341197013855, + "learning_rate": 9.500742184593818e-05, + "loss": 1.1641, + "step": 2405 + }, + { + "epoch": 0.14618142049942282, + "grad_norm": 0.208840012550354, + "learning_rate": 9.500325128123517e-05, + "loss": 1.1568, + "step": 2406 + }, + { + "epoch": 0.14624217753204932, + "grad_norm": 0.3734762370586395, + "learning_rate": 9.49990790669293e-05, + "loss": 1.1741, + "step": 2407 + }, + { + "epoch": 0.14630293456467586, + "grad_norm": 0.19922985136508942, + "learning_rate": 9.499490520317349e-05, + "loss": 1.1229, + "step": 2408 + }, + { + "epoch": 0.1463636915973024, + "grad_norm": 0.22177734971046448, + "learning_rate": 9.499072969012073e-05, + "loss": 1.0884, + "step": 2409 + }, + { + "epoch": 0.14642444862992893, + "grad_norm": 0.17181159555912018, + "learning_rate": 9.498655252792406e-05, + "loss": 1.0948, + "step": 2410 + }, + { + "epoch": 0.14648520566255543, + "grad_norm": 0.2920275330543518, + "learning_rate": 9.498237371673663e-05, + "loss": 1.0722, + "step": 2411 + }, + { + "epoch": 0.14654596269518197, + "grad_norm": 0.15354323387145996, + "learning_rate": 9.497819325671159e-05, + "loss": 1.081, + "step": 2412 + }, + { + "epoch": 0.1466067197278085, + "grad_norm": 0.2565551996231079, + "learning_rate": 9.497401114800219e-05, + "loss": 1.233, + "step": 2413 + }, + { + "epoch": 0.146667476760435, + "grad_norm": 0.21972645819187164, + "learning_rate": 9.49698273907617e-05, + "loss": 1.0564, + "step": 2414 + }, + { + "epoch": 0.14672823379306155, + "grad_norm": 0.27179932594299316, + "learning_rate": 9.496564198514351e-05, + "loss": 1.263, + "step": 2415 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 0.21390116214752197, + "learning_rate": 9.496145493130102e-05, + "loss": 1.1002, + "step": 2416 + }, + { + "epoch": 0.1468497478583146, + "grad_norm": 0.19426123797893524, + "learning_rate": 9.49572662293877e-05, + "loss": 1.0568, + "step": 2417 + }, + { + "epoch": 0.14691050489094112, + "grad_norm": 3.3889048099517822, + "learning_rate": 9.495307587955711e-05, + "loss": 1.0866, + "step": 2418 + }, + { + "epoch": 0.14697126192356766, + "grad_norm": 0.18671445548534393, + "learning_rate": 9.494888388196282e-05, + "loss": 1.0914, + "step": 2419 + }, + { + "epoch": 0.14703201895619417, + "grad_norm": 0.17855724692344666, + "learning_rate": 9.494469023675851e-05, + "loss": 1.1418, + "step": 2420 + }, + { + "epoch": 0.1470927759888207, + "grad_norm": 0.1991187036037445, + "learning_rate": 9.49404949440979e-05, + "loss": 1.0787, + "step": 2421 + }, + { + "epoch": 0.14715353302144724, + "grad_norm": 0.21052385866641998, + "learning_rate": 9.493629800413474e-05, + "loss": 1.2927, + "step": 2422 + }, + { + "epoch": 0.14721429005407377, + "grad_norm": 0.25615546107292175, + "learning_rate": 9.49320994170229e-05, + "loss": 1.1977, + "step": 2423 + }, + { + "epoch": 0.14727504708670028, + "grad_norm": 0.3489145040512085, + "learning_rate": 9.492789918291627e-05, + "loss": 1.166, + "step": 2424 + }, + { + "epoch": 0.1473358041193268, + "grad_norm": 0.13200810551643372, + "learning_rate": 9.492369730196882e-05, + "loss": 1.0489, + "step": 2425 + }, + { + "epoch": 0.14739656115195335, + "grad_norm": 0.16536767780780792, + "learning_rate": 9.491949377433456e-05, + "loss": 1.1017, + "step": 2426 + }, + { + "epoch": 0.14745731818457986, + "grad_norm": 0.19354049861431122, + "learning_rate": 9.491528860016756e-05, + "loss": 1.1395, + "step": 2427 + }, + { + "epoch": 0.1475180752172064, + "grad_norm": 0.2011626660823822, + "learning_rate": 9.491108177962199e-05, + "loss": 1.1231, + "step": 2428 + }, + { + "epoch": 0.14757883224983293, + "grad_norm": 0.17097003757953644, + "learning_rate": 9.490687331285202e-05, + "loss": 1.0884, + "step": 2429 + }, + { + "epoch": 0.14763958928245943, + "grad_norm": 0.35095110535621643, + "learning_rate": 9.490266320001195e-05, + "loss": 1.0691, + "step": 2430 + }, + { + "epoch": 0.14770034631508597, + "grad_norm": 0.19442760944366455, + "learning_rate": 9.489845144125607e-05, + "loss": 1.1133, + "step": 2431 + }, + { + "epoch": 0.1477611033477125, + "grad_norm": 0.17445772886276245, + "learning_rate": 9.489423803673877e-05, + "loss": 1.0856, + "step": 2432 + }, + { + "epoch": 0.14782186038033904, + "grad_norm": 0.21538683772087097, + "learning_rate": 9.489002298661449e-05, + "loss": 1.1704, + "step": 2433 + }, + { + "epoch": 0.14788261741296554, + "grad_norm": 0.22413907945156097, + "learning_rate": 9.488580629103776e-05, + "loss": 1.0727, + "step": 2434 + }, + { + "epoch": 0.14794337444559208, + "grad_norm": 0.23690840601921082, + "learning_rate": 9.488158795016311e-05, + "loss": 1.1026, + "step": 2435 + }, + { + "epoch": 0.14800413147821861, + "grad_norm": 1.9819402694702148, + "learning_rate": 9.487736796414519e-05, + "loss": 1.1287, + "step": 2436 + }, + { + "epoch": 0.14806488851084512, + "grad_norm": 0.31615498661994934, + "learning_rate": 9.487314633313867e-05, + "loss": 1.2106, + "step": 2437 + }, + { + "epoch": 0.14812564554347166, + "grad_norm": 0.2629581093788147, + "learning_rate": 9.486892305729828e-05, + "loss": 1.1235, + "step": 2438 + }, + { + "epoch": 0.1481864025760982, + "grad_norm": 0.13540659844875336, + "learning_rate": 9.486469813677885e-05, + "loss": 1.0822, + "step": 2439 + }, + { + "epoch": 0.1482471596087247, + "grad_norm": 0.2806120812892914, + "learning_rate": 9.486047157173526e-05, + "loss": 1.0551, + "step": 2440 + }, + { + "epoch": 0.14830791664135123, + "grad_norm": 0.16001953184604645, + "learning_rate": 9.485624336232239e-05, + "loss": 1.1186, + "step": 2441 + }, + { + "epoch": 0.14836867367397777, + "grad_norm": 1.1844679117202759, + "learning_rate": 9.485201350869525e-05, + "loss": 1.1247, + "step": 2442 + }, + { + "epoch": 0.14842943070660428, + "grad_norm": 0.16722291707992554, + "learning_rate": 9.48477820110089e-05, + "loss": 1.1587, + "step": 2443 + }, + { + "epoch": 0.1484901877392308, + "grad_norm": 0.9222338795661926, + "learning_rate": 9.484354886941842e-05, + "loss": 1.1013, + "step": 2444 + }, + { + "epoch": 0.14855094477185735, + "grad_norm": 0.4270765781402588, + "learning_rate": 9.483931408407898e-05, + "loss": 1.129, + "step": 2445 + }, + { + "epoch": 0.14861170180448388, + "grad_norm": 0.25752943754196167, + "learning_rate": 9.483507765514583e-05, + "loss": 1.2061, + "step": 2446 + }, + { + "epoch": 0.1486724588371104, + "grad_norm": 0.2032468020915985, + "learning_rate": 9.483083958277423e-05, + "loss": 1.2155, + "step": 2447 + }, + { + "epoch": 0.14873321586973692, + "grad_norm": 0.17130106687545776, + "learning_rate": 9.482659986711955e-05, + "loss": 1.1266, + "step": 2448 + }, + { + "epoch": 0.14879397290236346, + "grad_norm": 0.17864754796028137, + "learning_rate": 9.482235850833719e-05, + "loss": 1.1025, + "step": 2449 + }, + { + "epoch": 0.14885472993498997, + "grad_norm": 0.2654895782470703, + "learning_rate": 9.48181155065826e-05, + "loss": 1.1251, + "step": 2450 + }, + { + "epoch": 0.1489154869676165, + "grad_norm": 0.13779211044311523, + "learning_rate": 9.481387086201132e-05, + "loss": 1.0993, + "step": 2451 + }, + { + "epoch": 0.14897624400024304, + "grad_norm": 0.17782364785671234, + "learning_rate": 9.480962457477896e-05, + "loss": 1.0823, + "step": 2452 + }, + { + "epoch": 0.14903700103286954, + "grad_norm": 0.2756407856941223, + "learning_rate": 9.480537664504115e-05, + "loss": 1.1324, + "step": 2453 + }, + { + "epoch": 0.14909775806549608, + "grad_norm": 0.22156783938407898, + "learning_rate": 9.480112707295359e-05, + "loss": 1.158, + "step": 2454 + }, + { + "epoch": 0.1491585150981226, + "grad_norm": 0.15597949922084808, + "learning_rate": 9.479687585867205e-05, + "loss": 1.1511, + "step": 2455 + }, + { + "epoch": 0.14921927213074915, + "grad_norm": 0.5653387308120728, + "learning_rate": 9.479262300235238e-05, + "loss": 1.0808, + "step": 2456 + }, + { + "epoch": 0.14928002916337565, + "grad_norm": 0.16475999355316162, + "learning_rate": 9.478836850415045e-05, + "loss": 1.2192, + "step": 2457 + }, + { + "epoch": 0.1493407861960022, + "grad_norm": 0.20751553773880005, + "learning_rate": 9.478411236422221e-05, + "loss": 1.0879, + "step": 2458 + }, + { + "epoch": 0.14940154322862872, + "grad_norm": 0.9479624032974243, + "learning_rate": 9.477985458272369e-05, + "loss": 1.1913, + "step": 2459 + }, + { + "epoch": 0.14946230026125523, + "grad_norm": 0.1658661663532257, + "learning_rate": 9.477559515981092e-05, + "loss": 1.1942, + "step": 2460 + }, + { + "epoch": 0.14952305729388177, + "grad_norm": 0.16715334355831146, + "learning_rate": 9.477133409564007e-05, + "loss": 1.2325, + "step": 2461 + }, + { + "epoch": 0.1495838143265083, + "grad_norm": 6.42448091506958, + "learning_rate": 9.476707139036731e-05, + "loss": 1.1289, + "step": 2462 + }, + { + "epoch": 0.1496445713591348, + "grad_norm": 0.20347461104393005, + "learning_rate": 9.47628070441489e-05, + "loss": 1.0632, + "step": 2463 + }, + { + "epoch": 0.14970532839176134, + "grad_norm": 0.27208903431892395, + "learning_rate": 9.475854105714115e-05, + "loss": 1.111, + "step": 2464 + }, + { + "epoch": 0.14976608542438788, + "grad_norm": 0.16360872983932495, + "learning_rate": 9.475427342950042e-05, + "loss": 1.1489, + "step": 2465 + }, + { + "epoch": 0.1498268424570144, + "grad_norm": 0.13911795616149902, + "learning_rate": 9.475000416138314e-05, + "loss": 1.0844, + "step": 2466 + }, + { + "epoch": 0.14988759948964092, + "grad_norm": 0.3688059151172638, + "learning_rate": 9.474573325294582e-05, + "loss": 1.084, + "step": 2467 + }, + { + "epoch": 0.14994835652226746, + "grad_norm": 0.17356690764427185, + "learning_rate": 9.4741460704345e-05, + "loss": 1.1703, + "step": 2468 + }, + { + "epoch": 0.150009113554894, + "grad_norm": 0.194352388381958, + "learning_rate": 9.473718651573726e-05, + "loss": 1.1525, + "step": 2469 + }, + { + "epoch": 0.1500698705875205, + "grad_norm": 0.1539538949728012, + "learning_rate": 9.473291068727933e-05, + "loss": 1.107, + "step": 2470 + }, + { + "epoch": 0.15013062762014703, + "grad_norm": 0.2254716455936432, + "learning_rate": 9.472863321912791e-05, + "loss": 1.2808, + "step": 2471 + }, + { + "epoch": 0.15019138465277357, + "grad_norm": 0.20095323026180267, + "learning_rate": 9.472435411143978e-05, + "loss": 1.1233, + "step": 2472 + }, + { + "epoch": 0.15025214168540008, + "grad_norm": 0.18226878345012665, + "learning_rate": 9.47200733643718e-05, + "loss": 1.2446, + "step": 2473 + }, + { + "epoch": 0.1503128987180266, + "grad_norm": 0.5279532074928284, + "learning_rate": 9.471579097808092e-05, + "loss": 1.1541, + "step": 2474 + }, + { + "epoch": 0.15037365575065315, + "grad_norm": 0.19655056297779083, + "learning_rate": 9.471150695272406e-05, + "loss": 1.1113, + "step": 2475 + }, + { + "epoch": 0.15043441278327965, + "grad_norm": 0.20901861786842346, + "learning_rate": 9.470722128845826e-05, + "loss": 1.0721, + "step": 2476 + }, + { + "epoch": 0.1504951698159062, + "grad_norm": 0.22774384915828705, + "learning_rate": 9.470293398544064e-05, + "loss": 1.2215, + "step": 2477 + }, + { + "epoch": 0.15055592684853272, + "grad_norm": 0.20482629537582397, + "learning_rate": 9.469864504382832e-05, + "loss": 1.114, + "step": 2478 + }, + { + "epoch": 0.15061668388115926, + "grad_norm": 1.307689905166626, + "learning_rate": 9.469435446377852e-05, + "loss": 1.1239, + "step": 2479 + }, + { + "epoch": 0.15067744091378577, + "grad_norm": 0.2381213754415512, + "learning_rate": 9.469006224544854e-05, + "loss": 1.131, + "step": 2480 + }, + { + "epoch": 0.1507381979464123, + "grad_norm": 0.14561453461647034, + "learning_rate": 9.468576838899567e-05, + "loss": 1.0768, + "step": 2481 + }, + { + "epoch": 0.15079895497903884, + "grad_norm": 0.20880720019340515, + "learning_rate": 9.468147289457732e-05, + "loss": 1.1832, + "step": 2482 + }, + { + "epoch": 0.15085971201166534, + "grad_norm": 0.6424425840377808, + "learning_rate": 9.467717576235095e-05, + "loss": 1.2518, + "step": 2483 + }, + { + "epoch": 0.15092046904429188, + "grad_norm": 0.20040066540241241, + "learning_rate": 9.467287699247409e-05, + "loss": 1.249, + "step": 2484 + }, + { + "epoch": 0.1509812260769184, + "grad_norm": 0.22147944569587708, + "learning_rate": 9.466857658510427e-05, + "loss": 1.1468, + "step": 2485 + }, + { + "epoch": 0.15104198310954492, + "grad_norm": 0.21854613721370697, + "learning_rate": 9.466427454039914e-05, + "loss": 1.083, + "step": 2486 + }, + { + "epoch": 0.15110274014217145, + "grad_norm": 0.2647652328014374, + "learning_rate": 9.465997085851639e-05, + "loss": 1.1411, + "step": 2487 + }, + { + "epoch": 0.151163497174798, + "grad_norm": 0.24156533181667328, + "learning_rate": 9.465566553961378e-05, + "loss": 1.1711, + "step": 2488 + }, + { + "epoch": 0.1512242542074245, + "grad_norm": 0.15108446776866913, + "learning_rate": 9.465135858384911e-05, + "loss": 1.0747, + "step": 2489 + }, + { + "epoch": 0.15128501124005103, + "grad_norm": 1.964840054512024, + "learning_rate": 9.464704999138027e-05, + "loss": 1.2311, + "step": 2490 + }, + { + "epoch": 0.15134576827267757, + "grad_norm": 0.3127102851867676, + "learning_rate": 9.464273976236517e-05, + "loss": 1.1105, + "step": 2491 + }, + { + "epoch": 0.1514065253053041, + "grad_norm": 0.2378004938364029, + "learning_rate": 9.463842789696184e-05, + "loss": 1.0764, + "step": 2492 + }, + { + "epoch": 0.1514672823379306, + "grad_norm": 0.14329063892364502, + "learning_rate": 9.463411439532828e-05, + "loss": 1.1309, + "step": 2493 + }, + { + "epoch": 0.15152803937055714, + "grad_norm": 0.23911572992801666, + "learning_rate": 9.462979925762266e-05, + "loss": 1.2079, + "step": 2494 + }, + { + "epoch": 0.15158879640318368, + "grad_norm": 0.19386433064937592, + "learning_rate": 9.462548248400309e-05, + "loss": 1.1623, + "step": 2495 + }, + { + "epoch": 0.15164955343581019, + "grad_norm": 0.21827779710292816, + "learning_rate": 9.462116407462784e-05, + "loss": 1.1149, + "step": 2496 + }, + { + "epoch": 0.15171031046843672, + "grad_norm": 0.2998878061771393, + "learning_rate": 9.46168440296552e-05, + "loss": 1.1089, + "step": 2497 + }, + { + "epoch": 0.15177106750106326, + "grad_norm": 0.3298884332180023, + "learning_rate": 9.461252234924351e-05, + "loss": 1.152, + "step": 2498 + }, + { + "epoch": 0.15183182453368976, + "grad_norm": 0.229386568069458, + "learning_rate": 9.460819903355121e-05, + "loss": 1.3153, + "step": 2499 + }, + { + "epoch": 0.1518925815663163, + "grad_norm": 0.27866312861442566, + "learning_rate": 9.460387408273675e-05, + "loss": 1.1676, + "step": 2500 + }, + { + "epoch": 0.15195333859894283, + "grad_norm": 0.2763281464576721, + "learning_rate": 9.459954749695865e-05, + "loss": 1.1258, + "step": 2501 + }, + { + "epoch": 0.15201409563156934, + "grad_norm": 0.34656262397766113, + "learning_rate": 9.459521927637551e-05, + "loss": 1.0938, + "step": 2502 + }, + { + "epoch": 0.15207485266419588, + "grad_norm": 1.4510091543197632, + "learning_rate": 9.4590889421146e-05, + "loss": 1.1494, + "step": 2503 + }, + { + "epoch": 0.1521356096968224, + "grad_norm": 3.499176263809204, + "learning_rate": 9.458655793142883e-05, + "loss": 1.1702, + "step": 2504 + }, + { + "epoch": 0.15219636672944895, + "grad_norm": 0.16883930563926697, + "learning_rate": 9.458222480738275e-05, + "loss": 1.1477, + "step": 2505 + }, + { + "epoch": 0.15225712376207545, + "grad_norm": 0.2272227555513382, + "learning_rate": 9.45778900491666e-05, + "loss": 1.1914, + "step": 2506 + }, + { + "epoch": 0.152317880794702, + "grad_norm": 0.3026867210865021, + "learning_rate": 9.457355365693927e-05, + "loss": 1.1802, + "step": 2507 + }, + { + "epoch": 0.15237863782732852, + "grad_norm": 0.14221596717834473, + "learning_rate": 9.456921563085972e-05, + "loss": 1.1192, + "step": 2508 + }, + { + "epoch": 0.15243939485995503, + "grad_norm": 0.20842139422893524, + "learning_rate": 9.456487597108694e-05, + "loss": 1.076, + "step": 2509 + }, + { + "epoch": 0.15250015189258156, + "grad_norm": 0.22788575291633606, + "learning_rate": 9.456053467778003e-05, + "loss": 1.0718, + "step": 2510 + }, + { + "epoch": 0.1525609089252081, + "grad_norm": 0.22824859619140625, + "learning_rate": 9.455619175109811e-05, + "loss": 1.1488, + "step": 2511 + }, + { + "epoch": 0.1526216659578346, + "grad_norm": 0.2251524180173874, + "learning_rate": 9.455184719120038e-05, + "loss": 1.1245, + "step": 2512 + }, + { + "epoch": 0.15268242299046114, + "grad_norm": 0.26367315649986267, + "learning_rate": 9.454750099824606e-05, + "loss": 1.0966, + "step": 2513 + }, + { + "epoch": 0.15274318002308768, + "grad_norm": 2.280456066131592, + "learning_rate": 9.454315317239448e-05, + "loss": 1.1042, + "step": 2514 + }, + { + "epoch": 0.1528039370557142, + "grad_norm": 0.23190902173519135, + "learning_rate": 9.453880371380503e-05, + "loss": 1.1163, + "step": 2515 + }, + { + "epoch": 0.15286469408834072, + "grad_norm": 0.2745177745819092, + "learning_rate": 9.453445262263711e-05, + "loss": 1.1081, + "step": 2516 + }, + { + "epoch": 0.15292545112096725, + "grad_norm": 1.4664905071258545, + "learning_rate": 9.453009989905022e-05, + "loss": 1.12, + "step": 2517 + }, + { + "epoch": 0.1529862081535938, + "grad_norm": 0.5669666528701782, + "learning_rate": 9.452574554320391e-05, + "loss": 1.1778, + "step": 2518 + }, + { + "epoch": 0.1530469651862203, + "grad_norm": 0.824489414691925, + "learning_rate": 9.452138955525779e-05, + "loss": 1.1576, + "step": 2519 + }, + { + "epoch": 0.15310772221884683, + "grad_norm": 0.6353117227554321, + "learning_rate": 9.451703193537153e-05, + "loss": 1.1855, + "step": 2520 + }, + { + "epoch": 0.15316847925147337, + "grad_norm": 1.1025758981704712, + "learning_rate": 9.451267268370486e-05, + "loss": 1.1288, + "step": 2521 + }, + { + "epoch": 0.15322923628409987, + "grad_norm": 0.4223443567752838, + "learning_rate": 9.450831180041757e-05, + "loss": 1.1644, + "step": 2522 + }, + { + "epoch": 0.1532899933167264, + "grad_norm": 0.2359962910413742, + "learning_rate": 9.450394928566951e-05, + "loss": 1.077, + "step": 2523 + }, + { + "epoch": 0.15335075034935294, + "grad_norm": 0.5190343856811523, + "learning_rate": 9.449958513962059e-05, + "loss": 1.0871, + "step": 2524 + }, + { + "epoch": 0.15341150738197945, + "grad_norm": 0.6135958433151245, + "learning_rate": 9.449521936243075e-05, + "loss": 1.1836, + "step": 2525 + }, + { + "epoch": 0.15347226441460599, + "grad_norm": 0.5102086663246155, + "learning_rate": 9.449085195426007e-05, + "loss": 1.1572, + "step": 2526 + }, + { + "epoch": 0.15353302144723252, + "grad_norm": 0.2285347729921341, + "learning_rate": 9.448648291526861e-05, + "loss": 1.1021, + "step": 2527 + }, + { + "epoch": 0.15359377847985906, + "grad_norm": 0.16917438805103302, + "learning_rate": 9.448211224561651e-05, + "loss": 1.0787, + "step": 2528 + }, + { + "epoch": 0.15365453551248556, + "grad_norm": 0.2284194976091385, + "learning_rate": 9.447773994546399e-05, + "loss": 1.1107, + "step": 2529 + }, + { + "epoch": 0.1537152925451121, + "grad_norm": 0.9874343276023865, + "learning_rate": 9.44733660149713e-05, + "loss": 1.2561, + "step": 2530 + }, + { + "epoch": 0.15377604957773863, + "grad_norm": 0.46491485834121704, + "learning_rate": 9.446899045429881e-05, + "loss": 1.272, + "step": 2531 + }, + { + "epoch": 0.15383680661036514, + "grad_norm": 0.18492628633975983, + "learning_rate": 9.446461326360687e-05, + "loss": 1.1473, + "step": 2532 + }, + { + "epoch": 0.15389756364299167, + "grad_norm": 0.26542723178863525, + "learning_rate": 9.446023444305592e-05, + "loss": 1.3215, + "step": 2533 + }, + { + "epoch": 0.1539583206756182, + "grad_norm": 0.32704147696495056, + "learning_rate": 9.44558539928065e-05, + "loss": 1.0896, + "step": 2534 + }, + { + "epoch": 0.15401907770824472, + "grad_norm": 0.3065110743045807, + "learning_rate": 9.445147191301917e-05, + "loss": 1.1356, + "step": 2535 + }, + { + "epoch": 0.15407983474087125, + "grad_norm": 0.23968765139579773, + "learning_rate": 9.444708820385451e-05, + "loss": 1.1387, + "step": 2536 + }, + { + "epoch": 0.1541405917734978, + "grad_norm": 0.26323044300079346, + "learning_rate": 9.444270286547328e-05, + "loss": 1.0891, + "step": 2537 + }, + { + "epoch": 0.15420134880612432, + "grad_norm": 0.21186158061027527, + "learning_rate": 9.443831589803616e-05, + "loss": 1.2058, + "step": 2538 + }, + { + "epoch": 0.15426210583875083, + "grad_norm": 0.2574732005596161, + "learning_rate": 9.443392730170398e-05, + "loss": 1.1316, + "step": 2539 + }, + { + "epoch": 0.15432286287137736, + "grad_norm": 0.22741980850696564, + "learning_rate": 9.442953707663763e-05, + "loss": 1.2031, + "step": 2540 + }, + { + "epoch": 0.1543836199040039, + "grad_norm": 0.3144490420818329, + "learning_rate": 9.442514522299797e-05, + "loss": 1.0981, + "step": 2541 + }, + { + "epoch": 0.1544443769366304, + "grad_norm": 0.2632023096084595, + "learning_rate": 9.442075174094606e-05, + "loss": 1.0804, + "step": 2542 + }, + { + "epoch": 0.15450513396925694, + "grad_norm": 0.279153972864151, + "learning_rate": 9.441635663064289e-05, + "loss": 1.0267, + "step": 2543 + }, + { + "epoch": 0.15456589100188348, + "grad_norm": 0.17136962711811066, + "learning_rate": 9.441195989224958e-05, + "loss": 1.1099, + "step": 2544 + }, + { + "epoch": 0.15462664803450998, + "grad_norm": 0.2738109827041626, + "learning_rate": 9.440756152592731e-05, + "loss": 1.144, + "step": 2545 + }, + { + "epoch": 0.15468740506713652, + "grad_norm": 0.3377268612384796, + "learning_rate": 9.440316153183729e-05, + "loss": 1.221, + "step": 2546 + }, + { + "epoch": 0.15474816209976305, + "grad_norm": 0.6182905435562134, + "learning_rate": 9.43987599101408e-05, + "loss": 1.3155, + "step": 2547 + }, + { + "epoch": 0.15480891913238956, + "grad_norm": 0.25432828068733215, + "learning_rate": 9.439435666099916e-05, + "loss": 1.1005, + "step": 2548 + }, + { + "epoch": 0.1548696761650161, + "grad_norm": 0.28780269622802734, + "learning_rate": 9.438995178457381e-05, + "loss": 1.2799, + "step": 2549 + }, + { + "epoch": 0.15493043319764263, + "grad_norm": 0.3011924922466278, + "learning_rate": 9.438554528102621e-05, + "loss": 1.1175, + "step": 2550 + }, + { + "epoch": 0.15499119023026917, + "grad_norm": 0.24541157484054565, + "learning_rate": 9.438113715051785e-05, + "loss": 1.1392, + "step": 2551 + }, + { + "epoch": 0.15505194726289567, + "grad_norm": 0.2185327410697937, + "learning_rate": 9.437672739321034e-05, + "loss": 1.0796, + "step": 2552 + }, + { + "epoch": 0.1551127042955222, + "grad_norm": 0.1773419827222824, + "learning_rate": 9.43723160092653e-05, + "loss": 1.0839, + "step": 2553 + }, + { + "epoch": 0.15517346132814874, + "grad_norm": 0.2325228601694107, + "learning_rate": 9.436790299884445e-05, + "loss": 1.1762, + "step": 2554 + }, + { + "epoch": 0.15523421836077525, + "grad_norm": 0.3388921916484833, + "learning_rate": 9.436348836210953e-05, + "loss": 1.1389, + "step": 2555 + }, + { + "epoch": 0.15529497539340179, + "grad_norm": 0.18111151456832886, + "learning_rate": 9.435907209922238e-05, + "loss": 1.0575, + "step": 2556 + }, + { + "epoch": 0.15535573242602832, + "grad_norm": 0.3273448050022125, + "learning_rate": 9.435465421034486e-05, + "loss": 1.5042, + "step": 2557 + }, + { + "epoch": 0.15541648945865483, + "grad_norm": 0.27869129180908203, + "learning_rate": 9.435023469563892e-05, + "loss": 1.1753, + "step": 2558 + }, + { + "epoch": 0.15547724649128136, + "grad_norm": 0.21914230287075043, + "learning_rate": 9.434581355526655e-05, + "loss": 1.1313, + "step": 2559 + }, + { + "epoch": 0.1555380035239079, + "grad_norm": 0.2693517506122589, + "learning_rate": 9.434139078938982e-05, + "loss": 1.196, + "step": 2560 + }, + { + "epoch": 0.15559876055653443, + "grad_norm": 0.31690874695777893, + "learning_rate": 9.433696639817082e-05, + "loss": 1.1136, + "step": 2561 + }, + { + "epoch": 0.15565951758916094, + "grad_norm": 0.28778785467147827, + "learning_rate": 9.433254038177178e-05, + "loss": 1.1675, + "step": 2562 + }, + { + "epoch": 0.15572027462178747, + "grad_norm": 0.32684969902038574, + "learning_rate": 9.432811274035486e-05, + "loss": 1.1399, + "step": 2563 + }, + { + "epoch": 0.155781031654414, + "grad_norm": 0.152581125497818, + "learning_rate": 9.432368347408244e-05, + "loss": 1.0687, + "step": 2564 + }, + { + "epoch": 0.15584178868704052, + "grad_norm": 0.22406163811683655, + "learning_rate": 9.431925258311682e-05, + "loss": 1.1608, + "step": 2565 + }, + { + "epoch": 0.15590254571966705, + "grad_norm": 0.19227343797683716, + "learning_rate": 9.431482006762042e-05, + "loss": 1.1161, + "step": 2566 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 0.22506600618362427, + "learning_rate": 9.431038592775575e-05, + "loss": 1.1232, + "step": 2567 + }, + { + "epoch": 0.1560240597849201, + "grad_norm": 0.43968236446380615, + "learning_rate": 9.430595016368528e-05, + "loss": 1.2479, + "step": 2568 + }, + { + "epoch": 0.15608481681754663, + "grad_norm": 0.6710606813430786, + "learning_rate": 9.430151277557167e-05, + "loss": 1.1688, + "step": 2569 + }, + { + "epoch": 0.15614557385017316, + "grad_norm": 0.2122555822134018, + "learning_rate": 9.429707376357753e-05, + "loss": 1.167, + "step": 2570 + }, + { + "epoch": 0.15620633088279967, + "grad_norm": 0.20122987031936646, + "learning_rate": 9.42926331278656e-05, + "loss": 1.1036, + "step": 2571 + }, + { + "epoch": 0.1562670879154262, + "grad_norm": 0.19168125092983246, + "learning_rate": 9.428819086859865e-05, + "loss": 1.1438, + "step": 2572 + }, + { + "epoch": 0.15632784494805274, + "grad_norm": 0.18105441331863403, + "learning_rate": 9.428374698593949e-05, + "loss": 1.236, + "step": 2573 + }, + { + "epoch": 0.15638860198067928, + "grad_norm": 0.1889175921678543, + "learning_rate": 9.427930148005102e-05, + "loss": 1.1755, + "step": 2574 + }, + { + "epoch": 0.15644935901330578, + "grad_norm": 0.17544624209403992, + "learning_rate": 9.42748543510962e-05, + "loss": 1.1175, + "step": 2575 + }, + { + "epoch": 0.15651011604593232, + "grad_norm": 0.21776503324508667, + "learning_rate": 9.427040559923804e-05, + "loss": 1.0939, + "step": 2576 + }, + { + "epoch": 0.15657087307855885, + "grad_norm": 0.24412862956523895, + "learning_rate": 9.426595522463961e-05, + "loss": 1.0695, + "step": 2577 + }, + { + "epoch": 0.15663163011118536, + "grad_norm": 0.16117967665195465, + "learning_rate": 9.426150322746404e-05, + "loss": 1.0902, + "step": 2578 + }, + { + "epoch": 0.1566923871438119, + "grad_norm": 0.15866626799106598, + "learning_rate": 9.425704960787449e-05, + "loss": 1.0782, + "step": 2579 + }, + { + "epoch": 0.15675314417643843, + "grad_norm": 0.19047687947750092, + "learning_rate": 9.425259436603425e-05, + "loss": 1.1106, + "step": 2580 + }, + { + "epoch": 0.15681390120906494, + "grad_norm": 0.19610057771205902, + "learning_rate": 9.42481375021066e-05, + "loss": 1.148, + "step": 2581 + }, + { + "epoch": 0.15687465824169147, + "grad_norm": 2.3571863174438477, + "learning_rate": 9.424367901625491e-05, + "loss": 1.0905, + "step": 2582 + }, + { + "epoch": 0.156935415274318, + "grad_norm": 0.17872482538223267, + "learning_rate": 9.423921890864263e-05, + "loss": 1.1368, + "step": 2583 + }, + { + "epoch": 0.15699617230694454, + "grad_norm": 0.2017214000225067, + "learning_rate": 9.423475717943321e-05, + "loss": 1.1529, + "step": 2584 + }, + { + "epoch": 0.15705692933957105, + "grad_norm": 0.15281355381011963, + "learning_rate": 9.423029382879023e-05, + "loss": 1.1106, + "step": 2585 + }, + { + "epoch": 0.15711768637219758, + "grad_norm": 0.42306381464004517, + "learning_rate": 9.422582885687728e-05, + "loss": 1.1726, + "step": 2586 + }, + { + "epoch": 0.15717844340482412, + "grad_norm": 0.17120933532714844, + "learning_rate": 9.422136226385802e-05, + "loss": 1.1587, + "step": 2587 + }, + { + "epoch": 0.15723920043745063, + "grad_norm": 0.26416444778442383, + "learning_rate": 9.42168940498962e-05, + "loss": 1.2959, + "step": 2588 + }, + { + "epoch": 0.15729995747007716, + "grad_norm": 0.7534836530685425, + "learning_rate": 9.421242421515554e-05, + "loss": 1.0588, + "step": 2589 + }, + { + "epoch": 0.1573607145027037, + "grad_norm": 0.2690413296222687, + "learning_rate": 9.420795275979997e-05, + "loss": 1.1023, + "step": 2590 + }, + { + "epoch": 0.1574214715353302, + "grad_norm": 0.1511676013469696, + "learning_rate": 9.420347968399332e-05, + "loss": 1.0809, + "step": 2591 + }, + { + "epoch": 0.15748222856795674, + "grad_norm": 0.14605583250522614, + "learning_rate": 9.419900498789958e-05, + "loss": 1.1054, + "step": 2592 + }, + { + "epoch": 0.15754298560058327, + "grad_norm": 0.1760273426771164, + "learning_rate": 9.419452867168277e-05, + "loss": 1.1226, + "step": 2593 + }, + { + "epoch": 0.15760374263320978, + "grad_norm": 7.571736812591553, + "learning_rate": 9.419005073550697e-05, + "loss": 1.2598, + "step": 2594 + }, + { + "epoch": 0.15766449966583632, + "grad_norm": 0.16633674502372742, + "learning_rate": 9.418557117953632e-05, + "loss": 1.0656, + "step": 2595 + }, + { + "epoch": 0.15772525669846285, + "grad_norm": 0.15302538871765137, + "learning_rate": 9.418109000393501e-05, + "loss": 1.0378, + "step": 2596 + }, + { + "epoch": 0.1577860137310894, + "grad_norm": 0.16779173910617828, + "learning_rate": 9.417660720886732e-05, + "loss": 1.1809, + "step": 2597 + }, + { + "epoch": 0.1578467707637159, + "grad_norm": 0.14601485431194305, + "learning_rate": 9.417212279449754e-05, + "loss": 1.0573, + "step": 2598 + }, + { + "epoch": 0.15790752779634243, + "grad_norm": 0.4659806489944458, + "learning_rate": 9.416763676099007e-05, + "loss": 1.2161, + "step": 2599 + }, + { + "epoch": 0.15796828482896896, + "grad_norm": 0.1553792655467987, + "learning_rate": 9.416314910850931e-05, + "loss": 1.1836, + "step": 2600 + }, + { + "epoch": 0.15802904186159547, + "grad_norm": 0.3346526026725769, + "learning_rate": 9.415865983721982e-05, + "loss": 1.1007, + "step": 2601 + }, + { + "epoch": 0.158089798894222, + "grad_norm": 0.18200501799583435, + "learning_rate": 9.41541689472861e-05, + "loss": 1.1573, + "step": 2602 + }, + { + "epoch": 0.15815055592684854, + "grad_norm": 0.19293729960918427, + "learning_rate": 9.414967643887279e-05, + "loss": 1.1278, + "step": 2603 + }, + { + "epoch": 0.15821131295947505, + "grad_norm": 0.21648606657981873, + "learning_rate": 9.414518231214455e-05, + "loss": 1.129, + "step": 2604 + }, + { + "epoch": 0.15827206999210158, + "grad_norm": 0.1847764402627945, + "learning_rate": 9.414068656726612e-05, + "loss": 1.104, + "step": 2605 + }, + { + "epoch": 0.15833282702472812, + "grad_norm": 0.2570077180862427, + "learning_rate": 9.41361892044023e-05, + "loss": 1.1204, + "step": 2606 + }, + { + "epoch": 0.15839358405735462, + "grad_norm": 0.14342860877513885, + "learning_rate": 9.413169022371793e-05, + "loss": 1.0479, + "step": 2607 + }, + { + "epoch": 0.15845434108998116, + "grad_norm": 0.781670093536377, + "learning_rate": 9.412718962537792e-05, + "loss": 1.0611, + "step": 2608 + }, + { + "epoch": 0.1585150981226077, + "grad_norm": 0.23704147338867188, + "learning_rate": 9.412268740954724e-05, + "loss": 1.2103, + "step": 2609 + }, + { + "epoch": 0.15857585515523423, + "grad_norm": 0.24569956958293915, + "learning_rate": 9.411818357639093e-05, + "loss": 1.1834, + "step": 2610 + }, + { + "epoch": 0.15863661218786074, + "grad_norm": 0.20215216279029846, + "learning_rate": 9.411367812607408e-05, + "loss": 1.1998, + "step": 2611 + }, + { + "epoch": 0.15869736922048727, + "grad_norm": 0.6925764679908752, + "learning_rate": 9.410917105876183e-05, + "loss": 1.2567, + "step": 2612 + }, + { + "epoch": 0.1587581262531138, + "grad_norm": 0.18702685832977295, + "learning_rate": 9.410466237461937e-05, + "loss": 1.1092, + "step": 2613 + }, + { + "epoch": 0.15881888328574031, + "grad_norm": 0.822773277759552, + "learning_rate": 9.4100152073812e-05, + "loss": 1.1646, + "step": 2614 + }, + { + "epoch": 0.15887964031836685, + "grad_norm": 0.27991822361946106, + "learning_rate": 9.409564015650502e-05, + "loss": 1.1806, + "step": 2615 + }, + { + "epoch": 0.15894039735099338, + "grad_norm": 0.2208004593849182, + "learning_rate": 9.409112662286385e-05, + "loss": 1.201, + "step": 2616 + }, + { + "epoch": 0.1590011543836199, + "grad_norm": 0.28630945086479187, + "learning_rate": 9.408661147305389e-05, + "loss": 1.1783, + "step": 2617 + }, + { + "epoch": 0.15906191141624643, + "grad_norm": 0.16344986855983734, + "learning_rate": 9.408209470724068e-05, + "loss": 1.074, + "step": 2618 + }, + { + "epoch": 0.15912266844887296, + "grad_norm": 0.9132686853408813, + "learning_rate": 9.407757632558975e-05, + "loss": 1.1918, + "step": 2619 + }, + { + "epoch": 0.1591834254814995, + "grad_norm": 0.24981817603111267, + "learning_rate": 9.407305632826675e-05, + "loss": 1.2166, + "step": 2620 + }, + { + "epoch": 0.159244182514126, + "grad_norm": 0.14491136372089386, + "learning_rate": 9.406853471543735e-05, + "loss": 1.0663, + "step": 2621 + }, + { + "epoch": 0.15930493954675254, + "grad_norm": 0.7831388711929321, + "learning_rate": 9.40640114872673e-05, + "loss": 1.0644, + "step": 2622 + }, + { + "epoch": 0.15936569657937907, + "grad_norm": 0.3581639230251312, + "learning_rate": 9.405948664392239e-05, + "loss": 1.0748, + "step": 2623 + }, + { + "epoch": 0.15942645361200558, + "grad_norm": 0.3864647150039673, + "learning_rate": 9.405496018556847e-05, + "loss": 1.075, + "step": 2624 + }, + { + "epoch": 0.15948721064463212, + "grad_norm": 0.21236863732337952, + "learning_rate": 9.405043211237147e-05, + "loss": 1.1033, + "step": 2625 + }, + { + "epoch": 0.15954796767725865, + "grad_norm": 0.24810928106307983, + "learning_rate": 9.404590242449738e-05, + "loss": 1.2299, + "step": 2626 + }, + { + "epoch": 0.15960872470988516, + "grad_norm": 0.16409023106098175, + "learning_rate": 9.40413711221122e-05, + "loss": 1.1018, + "step": 2627 + }, + { + "epoch": 0.1596694817425117, + "grad_norm": 0.319889098405838, + "learning_rate": 9.403683820538208e-05, + "loss": 1.2046, + "step": 2628 + }, + { + "epoch": 0.15973023877513823, + "grad_norm": 0.2668726146221161, + "learning_rate": 9.403230367447311e-05, + "loss": 1.1948, + "step": 2629 + }, + { + "epoch": 0.15979099580776474, + "grad_norm": 0.254044771194458, + "learning_rate": 9.402776752955155e-05, + "loss": 1.2068, + "step": 2630 + }, + { + "epoch": 0.15985175284039127, + "grad_norm": 0.23893888294696808, + "learning_rate": 9.402322977078366e-05, + "loss": 1.1305, + "step": 2631 + }, + { + "epoch": 0.1599125098730178, + "grad_norm": 0.18897725641727448, + "learning_rate": 9.401869039833577e-05, + "loss": 1.1491, + "step": 2632 + }, + { + "epoch": 0.15997326690564434, + "grad_norm": 0.2109520137310028, + "learning_rate": 9.40141494123743e-05, + "loss": 1.0789, + "step": 2633 + }, + { + "epoch": 0.16003402393827085, + "grad_norm": 0.417977899312973, + "learning_rate": 9.400960681306566e-05, + "loss": 1.4091, + "step": 2634 + }, + { + "epoch": 0.16009478097089738, + "grad_norm": 0.2826853394508362, + "learning_rate": 9.400506260057635e-05, + "loss": 1.149, + "step": 2635 + }, + { + "epoch": 0.16015553800352392, + "grad_norm": 1.1785639524459839, + "learning_rate": 9.4000516775073e-05, + "loss": 1.0705, + "step": 2636 + }, + { + "epoch": 0.16021629503615042, + "grad_norm": 0.15625526010990143, + "learning_rate": 9.399596933672217e-05, + "loss": 1.1168, + "step": 2637 + }, + { + "epoch": 0.16027705206877696, + "grad_norm": 0.3457401692867279, + "learning_rate": 9.399142028569061e-05, + "loss": 1.0788, + "step": 2638 + }, + { + "epoch": 0.1603378091014035, + "grad_norm": 0.2573912739753723, + "learning_rate": 9.398686962214502e-05, + "loss": 1.2239, + "step": 2639 + }, + { + "epoch": 0.16039856613403, + "grad_norm": 0.18996167182922363, + "learning_rate": 9.398231734625222e-05, + "loss": 1.0599, + "step": 2640 + }, + { + "epoch": 0.16045932316665654, + "grad_norm": 0.1542142927646637, + "learning_rate": 9.39777634581791e-05, + "loss": 1.0834, + "step": 2641 + }, + { + "epoch": 0.16052008019928307, + "grad_norm": 0.26509547233581543, + "learning_rate": 9.397320795809254e-05, + "loss": 1.3499, + "step": 2642 + }, + { + "epoch": 0.1605808372319096, + "grad_norm": 0.16394969820976257, + "learning_rate": 9.396865084615954e-05, + "loss": 1.0985, + "step": 2643 + }, + { + "epoch": 0.16064159426453611, + "grad_norm": 0.24615415930747986, + "learning_rate": 9.396409212254715e-05, + "loss": 1.1233, + "step": 2644 + }, + { + "epoch": 0.16070235129716265, + "grad_norm": 0.2274923473596573, + "learning_rate": 9.395953178742246e-05, + "loss": 1.1726, + "step": 2645 + }, + { + "epoch": 0.16076310832978918, + "grad_norm": 0.2049041986465454, + "learning_rate": 9.395496984095264e-05, + "loss": 1.0847, + "step": 2646 + }, + { + "epoch": 0.1608238653624157, + "grad_norm": 0.16849271953105927, + "learning_rate": 9.39504062833049e-05, + "loss": 1.2479, + "step": 2647 + }, + { + "epoch": 0.16088462239504223, + "grad_norm": 1.42002534866333, + "learning_rate": 9.394584111464653e-05, + "loss": 1.1433, + "step": 2648 + }, + { + "epoch": 0.16094537942766876, + "grad_norm": 0.21978677809238434, + "learning_rate": 9.394127433514485e-05, + "loss": 1.2329, + "step": 2649 + }, + { + "epoch": 0.16100613646029527, + "grad_norm": 0.2311258316040039, + "learning_rate": 9.393670594496727e-05, + "loss": 1.1341, + "step": 2650 + }, + { + "epoch": 0.1610668934929218, + "grad_norm": 0.21976394951343536, + "learning_rate": 9.393213594428124e-05, + "loss": 1.1124, + "step": 2651 + }, + { + "epoch": 0.16112765052554834, + "grad_norm": 0.13613387942314148, + "learning_rate": 9.392756433325426e-05, + "loss": 1.0568, + "step": 2652 + }, + { + "epoch": 0.16118840755817485, + "grad_norm": 0.4879782795906067, + "learning_rate": 9.392299111205396e-05, + "loss": 1.3004, + "step": 2653 + }, + { + "epoch": 0.16124916459080138, + "grad_norm": 0.32171717286109924, + "learning_rate": 9.39184162808479e-05, + "loss": 1.1321, + "step": 2654 + }, + { + "epoch": 0.16130992162342792, + "grad_norm": 0.24977348744869232, + "learning_rate": 9.39138398398038e-05, + "loss": 1.1745, + "step": 2655 + }, + { + "epoch": 0.16137067865605445, + "grad_norm": 0.21851065754890442, + "learning_rate": 9.390926178908943e-05, + "loss": 1.2122, + "step": 2656 + }, + { + "epoch": 0.16143143568868096, + "grad_norm": 0.34062761068344116, + "learning_rate": 9.390468212887255e-05, + "loss": 1.0949, + "step": 2657 + }, + { + "epoch": 0.1614921927213075, + "grad_norm": 0.3069348633289337, + "learning_rate": 9.39001008593211e-05, + "loss": 1.0964, + "step": 2658 + }, + { + "epoch": 0.16155294975393403, + "grad_norm": 0.19767002761363983, + "learning_rate": 9.389551798060294e-05, + "loss": 1.087, + "step": 2659 + }, + { + "epoch": 0.16161370678656053, + "grad_norm": 0.32879358530044556, + "learning_rate": 9.38909334928861e-05, + "loss": 1.1335, + "step": 2660 + }, + { + "epoch": 0.16167446381918707, + "grad_norm": 0.5914991497993469, + "learning_rate": 9.388634739633859e-05, + "loss": 1.092, + "step": 2661 + }, + { + "epoch": 0.1617352208518136, + "grad_norm": 1.1937721967697144, + "learning_rate": 9.388175969112856e-05, + "loss": 1.0999, + "step": 2662 + }, + { + "epoch": 0.1617959778844401, + "grad_norm": 0.2987099587917328, + "learning_rate": 9.387717037742412e-05, + "loss": 1.1449, + "step": 2663 + }, + { + "epoch": 0.16185673491706665, + "grad_norm": 0.18860803544521332, + "learning_rate": 9.387257945539354e-05, + "loss": 1.0914, + "step": 2664 + }, + { + "epoch": 0.16191749194969318, + "grad_norm": 0.29970529675483704, + "learning_rate": 9.386798692520507e-05, + "loss": 1.1046, + "step": 2665 + }, + { + "epoch": 0.16197824898231972, + "grad_norm": 0.24461442232131958, + "learning_rate": 9.386339278702704e-05, + "loss": 1.1449, + "step": 2666 + }, + { + "epoch": 0.16203900601494622, + "grad_norm": 0.22977766394615173, + "learning_rate": 9.385879704102789e-05, + "loss": 1.0855, + "step": 2667 + }, + { + "epoch": 0.16209976304757276, + "grad_norm": 0.2146521955728531, + "learning_rate": 9.385419968737605e-05, + "loss": 1.114, + "step": 2668 + }, + { + "epoch": 0.1621605200801993, + "grad_norm": 0.18897241353988647, + "learning_rate": 9.384960072624005e-05, + "loss": 1.1655, + "step": 2669 + }, + { + "epoch": 0.1622212771128258, + "grad_norm": 0.16305464506149292, + "learning_rate": 9.384500015778846e-05, + "loss": 1.1466, + "step": 2670 + }, + { + "epoch": 0.16228203414545234, + "grad_norm": 0.20688949525356293, + "learning_rate": 9.384039798218991e-05, + "loss": 1.1434, + "step": 2671 + }, + { + "epoch": 0.16234279117807887, + "grad_norm": 0.18620194494724274, + "learning_rate": 9.38357941996131e-05, + "loss": 1.1145, + "step": 2672 + }, + { + "epoch": 0.16240354821070538, + "grad_norm": 0.2123836874961853, + "learning_rate": 9.383118881022679e-05, + "loss": 1.1196, + "step": 2673 + }, + { + "epoch": 0.1624643052433319, + "grad_norm": 0.23215189576148987, + "learning_rate": 9.382658181419977e-05, + "loss": 1.2262, + "step": 2674 + }, + { + "epoch": 0.16252506227595845, + "grad_norm": 2.866469383239746, + "learning_rate": 9.382197321170092e-05, + "loss": 1.1225, + "step": 2675 + }, + { + "epoch": 0.16258581930858496, + "grad_norm": 0.2372300624847412, + "learning_rate": 9.381736300289917e-05, + "loss": 1.1578, + "step": 2676 + }, + { + "epoch": 0.1626465763412115, + "grad_norm": 0.30157554149627686, + "learning_rate": 9.38127511879635e-05, + "loss": 1.1298, + "step": 2677 + }, + { + "epoch": 0.16270733337383803, + "grad_norm": 2.6343443393707275, + "learning_rate": 9.380813776706298e-05, + "loss": 1.1643, + "step": 2678 + }, + { + "epoch": 0.16276809040646456, + "grad_norm": 0.24585436284542084, + "learning_rate": 9.380352274036672e-05, + "loss": 1.2431, + "step": 2679 + }, + { + "epoch": 0.16282884743909107, + "grad_norm": 0.24145565927028656, + "learning_rate": 9.379890610804385e-05, + "loss": 1.212, + "step": 2680 + }, + { + "epoch": 0.1628896044717176, + "grad_norm": 0.23217593133449554, + "learning_rate": 9.379428787026361e-05, + "loss": 1.1875, + "step": 2681 + }, + { + "epoch": 0.16295036150434414, + "grad_norm": 0.21196480095386505, + "learning_rate": 9.378966802719528e-05, + "loss": 1.0943, + "step": 2682 + }, + { + "epoch": 0.16301111853697065, + "grad_norm": 0.33148467540740967, + "learning_rate": 9.378504657900821e-05, + "loss": 1.0745, + "step": 2683 + }, + { + "epoch": 0.16307187556959718, + "grad_norm": 0.3658837378025055, + "learning_rate": 9.37804235258718e-05, + "loss": 1.1366, + "step": 2684 + }, + { + "epoch": 0.16313263260222371, + "grad_norm": 0.2708093225955963, + "learning_rate": 9.377579886795549e-05, + "loss": 1.1838, + "step": 2685 + }, + { + "epoch": 0.16319338963485022, + "grad_norm": 0.8164376616477966, + "learning_rate": 9.37711726054288e-05, + "loss": 1.2086, + "step": 2686 + }, + { + "epoch": 0.16325414666747676, + "grad_norm": 0.17520476877689362, + "learning_rate": 9.376654473846134e-05, + "loss": 1.1692, + "step": 2687 + }, + { + "epoch": 0.1633149037001033, + "grad_norm": 1.635270357131958, + "learning_rate": 9.376191526722272e-05, + "loss": 1.2304, + "step": 2688 + }, + { + "epoch": 0.16337566073272983, + "grad_norm": 0.19752848148345947, + "learning_rate": 9.375728419188264e-05, + "loss": 1.1778, + "step": 2689 + }, + { + "epoch": 0.16343641776535633, + "grad_norm": 0.5907459259033203, + "learning_rate": 9.375265151261083e-05, + "loss": 1.1286, + "step": 2690 + }, + { + "epoch": 0.16349717479798287, + "grad_norm": 0.5910320281982422, + "learning_rate": 9.374801722957714e-05, + "loss": 1.0852, + "step": 2691 + }, + { + "epoch": 0.1635579318306094, + "grad_norm": 0.19244112074375153, + "learning_rate": 9.374338134295141e-05, + "loss": 1.061, + "step": 2692 + }, + { + "epoch": 0.1636186888632359, + "grad_norm": 0.1832464039325714, + "learning_rate": 9.373874385290358e-05, + "loss": 1.1541, + "step": 2693 + }, + { + "epoch": 0.16367944589586245, + "grad_norm": 0.2991447150707245, + "learning_rate": 9.373410475960364e-05, + "loss": 1.0924, + "step": 2694 + }, + { + "epoch": 0.16374020292848898, + "grad_norm": 0.25456613302230835, + "learning_rate": 9.372946406322164e-05, + "loss": 1.0989, + "step": 2695 + }, + { + "epoch": 0.1638009599611155, + "grad_norm": 0.21387642621994019, + "learning_rate": 9.37248217639277e-05, + "loss": 1.1089, + "step": 2696 + }, + { + "epoch": 0.16386171699374202, + "grad_norm": 0.26866665482521057, + "learning_rate": 9.372017786189195e-05, + "loss": 1.2257, + "step": 2697 + }, + { + "epoch": 0.16392247402636856, + "grad_norm": 0.2666727602481842, + "learning_rate": 9.371553235728463e-05, + "loss": 1.1979, + "step": 2698 + }, + { + "epoch": 0.16398323105899507, + "grad_norm": 0.20347638428211212, + "learning_rate": 9.371088525027602e-05, + "loss": 1.1374, + "step": 2699 + }, + { + "epoch": 0.1640439880916216, + "grad_norm": 0.3014492690563202, + "learning_rate": 9.370623654103646e-05, + "loss": 1.1903, + "step": 2700 + }, + { + "epoch": 0.16410474512424814, + "grad_norm": 0.7942011952400208, + "learning_rate": 9.370158622973637e-05, + "loss": 1.1665, + "step": 2701 + }, + { + "epoch": 0.16416550215687467, + "grad_norm": 0.3673689067363739, + "learning_rate": 9.369693431654617e-05, + "loss": 1.1997, + "step": 2702 + }, + { + "epoch": 0.16422625918950118, + "grad_norm": 0.22094377875328064, + "learning_rate": 9.36922808016364e-05, + "loss": 1.1707, + "step": 2703 + }, + { + "epoch": 0.1642870162221277, + "grad_norm": 0.22264212369918823, + "learning_rate": 9.368762568517763e-05, + "loss": 1.0806, + "step": 2704 + }, + { + "epoch": 0.16434777325475425, + "grad_norm": 0.3930180072784424, + "learning_rate": 9.36829689673405e-05, + "loss": 1.0871, + "step": 2705 + }, + { + "epoch": 0.16440853028738076, + "grad_norm": 0.1813138872385025, + "learning_rate": 9.367831064829571e-05, + "loss": 1.127, + "step": 2706 + }, + { + "epoch": 0.1644692873200073, + "grad_norm": 2.13755464553833, + "learning_rate": 9.367365072821397e-05, + "loss": 1.167, + "step": 2707 + }, + { + "epoch": 0.16453004435263383, + "grad_norm": 0.2634475529193878, + "learning_rate": 9.366898920726615e-05, + "loss": 1.1304, + "step": 2708 + }, + { + "epoch": 0.16459080138526033, + "grad_norm": 0.1927611529827118, + "learning_rate": 9.366432608562308e-05, + "loss": 1.0681, + "step": 2709 + }, + { + "epoch": 0.16465155841788687, + "grad_norm": 0.1683140993118286, + "learning_rate": 9.365966136345569e-05, + "loss": 1.1105, + "step": 2710 + }, + { + "epoch": 0.1647123154505134, + "grad_norm": 0.23627561330795288, + "learning_rate": 9.365499504093498e-05, + "loss": 1.2114, + "step": 2711 + }, + { + "epoch": 0.1647730724831399, + "grad_norm": 0.696429967880249, + "learning_rate": 9.365032711823198e-05, + "loss": 1.3981, + "step": 2712 + }, + { + "epoch": 0.16483382951576644, + "grad_norm": 0.32340294122695923, + "learning_rate": 9.364565759551782e-05, + "loss": 1.1709, + "step": 2713 + }, + { + "epoch": 0.16489458654839298, + "grad_norm": 0.39964428544044495, + "learning_rate": 9.364098647296364e-05, + "loss": 1.1139, + "step": 2714 + }, + { + "epoch": 0.16495534358101951, + "grad_norm": 0.44005492329597473, + "learning_rate": 9.363631375074065e-05, + "loss": 1.131, + "step": 2715 + }, + { + "epoch": 0.16501610061364602, + "grad_norm": 0.7380490899085999, + "learning_rate": 9.363163942902016e-05, + "loss": 1.1971, + "step": 2716 + }, + { + "epoch": 0.16507685764627256, + "grad_norm": 0.45174384117126465, + "learning_rate": 9.362696350797348e-05, + "loss": 1.1194, + "step": 2717 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 0.31846198439598083, + "learning_rate": 9.362228598777203e-05, + "loss": 1.2118, + "step": 2718 + }, + { + "epoch": 0.1651983717115256, + "grad_norm": 0.24986636638641357, + "learning_rate": 9.361760686858725e-05, + "loss": 1.1776, + "step": 2719 + }, + { + "epoch": 0.16525912874415213, + "grad_norm": 0.20426741242408752, + "learning_rate": 9.361292615059065e-05, + "loss": 1.1528, + "step": 2720 + }, + { + "epoch": 0.16531988577677867, + "grad_norm": 0.19015654921531677, + "learning_rate": 9.360824383395383e-05, + "loss": 1.1477, + "step": 2721 + }, + { + "epoch": 0.16538064280940518, + "grad_norm": 0.2341787964105606, + "learning_rate": 9.360355991884839e-05, + "loss": 1.0444, + "step": 2722 + }, + { + "epoch": 0.1654413998420317, + "grad_norm": 0.34809285402297974, + "learning_rate": 9.359887440544605e-05, + "loss": 1.2855, + "step": 2723 + }, + { + "epoch": 0.16550215687465825, + "grad_norm": 0.20901837944984436, + "learning_rate": 9.359418729391852e-05, + "loss": 1.2398, + "step": 2724 + }, + { + "epoch": 0.16556291390728478, + "grad_norm": 0.20196299254894257, + "learning_rate": 9.358949858443764e-05, + "loss": 1.1097, + "step": 2725 + }, + { + "epoch": 0.1656236709399113, + "grad_norm": 0.3473243713378906, + "learning_rate": 9.358480827717526e-05, + "loss": 1.2743, + "step": 2726 + }, + { + "epoch": 0.16568442797253782, + "grad_norm": 0.22709813714027405, + "learning_rate": 9.358011637230331e-05, + "loss": 1.1176, + "step": 2727 + }, + { + "epoch": 0.16574518500516436, + "grad_norm": 0.9128461480140686, + "learning_rate": 9.357542286999377e-05, + "loss": 1.1398, + "step": 2728 + }, + { + "epoch": 0.16580594203779087, + "grad_norm": 0.31940165162086487, + "learning_rate": 9.357072777041869e-05, + "loss": 1.0853, + "step": 2729 + }, + { + "epoch": 0.1658666990704174, + "grad_norm": 0.3543567657470703, + "learning_rate": 9.356603107375015e-05, + "loss": 1.3009, + "step": 2730 + }, + { + "epoch": 0.16592745610304394, + "grad_norm": 0.3245181441307068, + "learning_rate": 9.356133278016033e-05, + "loss": 1.1683, + "step": 2731 + }, + { + "epoch": 0.16598821313567044, + "grad_norm": 0.2407148778438568, + "learning_rate": 9.355663288982141e-05, + "loss": 1.0895, + "step": 2732 + }, + { + "epoch": 0.16604897016829698, + "grad_norm": 0.476728230714798, + "learning_rate": 9.355193140290573e-05, + "loss": 1.3047, + "step": 2733 + }, + { + "epoch": 0.1661097272009235, + "grad_norm": 0.13437965512275696, + "learning_rate": 9.354722831958556e-05, + "loss": 1.0877, + "step": 2734 + }, + { + "epoch": 0.16617048423355002, + "grad_norm": 0.6698725819587708, + "learning_rate": 9.354252364003333e-05, + "loss": 1.2105, + "step": 2735 + }, + { + "epoch": 0.16623124126617655, + "grad_norm": 0.3112938702106476, + "learning_rate": 9.353781736442146e-05, + "loss": 1.0774, + "step": 2736 + }, + { + "epoch": 0.1662919982988031, + "grad_norm": 0.27897047996520996, + "learning_rate": 9.353310949292251e-05, + "loss": 1.132, + "step": 2737 + }, + { + "epoch": 0.16635275533142962, + "grad_norm": 0.26251503825187683, + "learning_rate": 9.3528400025709e-05, + "loss": 1.1084, + "step": 2738 + }, + { + "epoch": 0.16641351236405613, + "grad_norm": 1.4709620475769043, + "learning_rate": 9.352368896295356e-05, + "loss": 1.1027, + "step": 2739 + }, + { + "epoch": 0.16647426939668267, + "grad_norm": 0.2612626850605011, + "learning_rate": 9.351897630482891e-05, + "loss": 1.2431, + "step": 2740 + }, + { + "epoch": 0.1665350264293092, + "grad_norm": 0.5787556171417236, + "learning_rate": 9.351426205150777e-05, + "loss": 1.1367, + "step": 2741 + }, + { + "epoch": 0.1665957834619357, + "grad_norm": 0.34013068675994873, + "learning_rate": 9.350954620316293e-05, + "loss": 1.1198, + "step": 2742 + }, + { + "epoch": 0.16665654049456224, + "grad_norm": 0.5343618392944336, + "learning_rate": 9.350482875996726e-05, + "loss": 1.2055, + "step": 2743 + }, + { + "epoch": 0.16671729752718878, + "grad_norm": 0.49730050563812256, + "learning_rate": 9.350010972209371e-05, + "loss": 1.2903, + "step": 2744 + }, + { + "epoch": 0.1667780545598153, + "grad_norm": 0.2530938982963562, + "learning_rate": 9.349538908971521e-05, + "loss": 1.1995, + "step": 2745 + }, + { + "epoch": 0.16683881159244182, + "grad_norm": 1.39519464969635, + "learning_rate": 9.349066686300483e-05, + "loss": 1.2048, + "step": 2746 + }, + { + "epoch": 0.16689956862506836, + "grad_norm": 0.37702691555023193, + "learning_rate": 9.348594304213562e-05, + "loss": 1.3088, + "step": 2747 + }, + { + "epoch": 0.1669603256576949, + "grad_norm": 0.2868204712867737, + "learning_rate": 9.348121762728079e-05, + "loss": 1.1414, + "step": 2748 + }, + { + "epoch": 0.1670210826903214, + "grad_norm": 0.40916576981544495, + "learning_rate": 9.347649061861352e-05, + "loss": 1.2217, + "step": 2749 + }, + { + "epoch": 0.16708183972294793, + "grad_norm": 0.2935818135738373, + "learning_rate": 9.347176201630707e-05, + "loss": 1.1218, + "step": 2750 + }, + { + "epoch": 0.16714259675557447, + "grad_norm": 0.21246002614498138, + "learning_rate": 9.346703182053479e-05, + "loss": 1.1828, + "step": 2751 + }, + { + "epoch": 0.16720335378820098, + "grad_norm": 0.2406328022480011, + "learning_rate": 9.346230003147005e-05, + "loss": 1.2039, + "step": 2752 + }, + { + "epoch": 0.1672641108208275, + "grad_norm": 0.3806989789009094, + "learning_rate": 9.345756664928631e-05, + "loss": 1.1006, + "step": 2753 + }, + { + "epoch": 0.16732486785345405, + "grad_norm": 0.15987083315849304, + "learning_rate": 9.345283167415706e-05, + "loss": 1.1102, + "step": 2754 + }, + { + "epoch": 0.16738562488608055, + "grad_norm": 0.16697901487350464, + "learning_rate": 9.344809510625586e-05, + "loss": 1.0585, + "step": 2755 + }, + { + "epoch": 0.1674463819187071, + "grad_norm": 0.3083542585372925, + "learning_rate": 9.344335694575634e-05, + "loss": 1.1051, + "step": 2756 + }, + { + "epoch": 0.16750713895133362, + "grad_norm": 0.26503217220306396, + "learning_rate": 9.343861719283217e-05, + "loss": 1.0784, + "step": 2757 + }, + { + "epoch": 0.16756789598396013, + "grad_norm": 0.16039438545703888, + "learning_rate": 9.34338758476571e-05, + "loss": 1.1258, + "step": 2758 + }, + { + "epoch": 0.16762865301658667, + "grad_norm": 0.20033229887485504, + "learning_rate": 9.342913291040491e-05, + "loss": 1.1395, + "step": 2759 + }, + { + "epoch": 0.1676894100492132, + "grad_norm": 0.23209625482559204, + "learning_rate": 9.342438838124945e-05, + "loss": 1.0981, + "step": 2760 + }, + { + "epoch": 0.16775016708183974, + "grad_norm": 0.6559396386146545, + "learning_rate": 9.341964226036466e-05, + "loss": 1.0904, + "step": 2761 + }, + { + "epoch": 0.16781092411446624, + "grad_norm": 0.26381775736808777, + "learning_rate": 9.341489454792447e-05, + "loss": 1.2064, + "step": 2762 + }, + { + "epoch": 0.16787168114709278, + "grad_norm": 0.2045595496892929, + "learning_rate": 9.341014524410295e-05, + "loss": 1.1783, + "step": 2763 + }, + { + "epoch": 0.1679324381797193, + "grad_norm": 0.2016468644142151, + "learning_rate": 9.340539434907414e-05, + "loss": 1.112, + "step": 2764 + }, + { + "epoch": 0.16799319521234582, + "grad_norm": 0.18968884646892548, + "learning_rate": 9.340064186301221e-05, + "loss": 1.0729, + "step": 2765 + }, + { + "epoch": 0.16805395224497235, + "grad_norm": 0.12279792875051498, + "learning_rate": 9.339588778609138e-05, + "loss": 1.0543, + "step": 2766 + }, + { + "epoch": 0.1681147092775989, + "grad_norm": 0.210651233792305, + "learning_rate": 9.33911321184859e-05, + "loss": 1.204, + "step": 2767 + }, + { + "epoch": 0.1681754663102254, + "grad_norm": 0.30051079392433167, + "learning_rate": 9.338637486037007e-05, + "loss": 1.2788, + "step": 2768 + }, + { + "epoch": 0.16823622334285193, + "grad_norm": 0.21153174340724945, + "learning_rate": 9.338161601191828e-05, + "loss": 1.1877, + "step": 2769 + }, + { + "epoch": 0.16829698037547847, + "grad_norm": 0.18878796696662903, + "learning_rate": 9.337685557330497e-05, + "loss": 1.0766, + "step": 2770 + }, + { + "epoch": 0.168357737408105, + "grad_norm": 0.1966601461172104, + "learning_rate": 9.337209354470467e-05, + "loss": 1.117, + "step": 2771 + }, + { + "epoch": 0.1684184944407315, + "grad_norm": 0.3076675236225128, + "learning_rate": 9.336732992629186e-05, + "loss": 1.1717, + "step": 2772 + }, + { + "epoch": 0.16847925147335804, + "grad_norm": 0.15472185611724854, + "learning_rate": 9.33625647182412e-05, + "loss": 1.0859, + "step": 2773 + }, + { + "epoch": 0.16854000850598458, + "grad_norm": 0.1461690366268158, + "learning_rate": 9.335779792072734e-05, + "loss": 1.0672, + "step": 2774 + }, + { + "epoch": 0.16860076553861109, + "grad_norm": 0.577778697013855, + "learning_rate": 9.335302953392501e-05, + "loss": 1.2297, + "step": 2775 + }, + { + "epoch": 0.16866152257123762, + "grad_norm": 2.478369951248169, + "learning_rate": 9.334825955800904e-05, + "loss": 1.0548, + "step": 2776 + }, + { + "epoch": 0.16872227960386416, + "grad_norm": 0.3680042624473572, + "learning_rate": 9.33434879931542e-05, + "loss": 1.2124, + "step": 2777 + }, + { + "epoch": 0.16878303663649066, + "grad_norm": 0.23315078020095825, + "learning_rate": 9.333871483953544e-05, + "loss": 1.1652, + "step": 2778 + }, + { + "epoch": 0.1688437936691172, + "grad_norm": 9.248869895935059, + "learning_rate": 9.333394009732772e-05, + "loss": 1.3086, + "step": 2779 + }, + { + "epoch": 0.16890455070174373, + "grad_norm": 0.1792900711297989, + "learning_rate": 9.332916376670603e-05, + "loss": 1.124, + "step": 2780 + }, + { + "epoch": 0.16896530773437024, + "grad_norm": 0.29281532764434814, + "learning_rate": 9.332438584784549e-05, + "loss": 1.1318, + "step": 2781 + }, + { + "epoch": 0.16902606476699678, + "grad_norm": 0.14941686391830444, + "learning_rate": 9.33196063409212e-05, + "loss": 1.0851, + "step": 2782 + }, + { + "epoch": 0.1690868217996233, + "grad_norm": 0.30175021290779114, + "learning_rate": 9.331482524610837e-05, + "loss": 1.0731, + "step": 2783 + }, + { + "epoch": 0.16914757883224985, + "grad_norm": 0.2641928493976593, + "learning_rate": 9.331004256358224e-05, + "loss": 1.0891, + "step": 2784 + }, + { + "epoch": 0.16920833586487635, + "grad_norm": 0.15149296820163727, + "learning_rate": 9.330525829351813e-05, + "loss": 1.0991, + "step": 2785 + }, + { + "epoch": 0.1692690928975029, + "grad_norm": 0.2485985904932022, + "learning_rate": 9.330047243609142e-05, + "loss": 1.2373, + "step": 2786 + }, + { + "epoch": 0.16932984993012942, + "grad_norm": 0.2151057869195938, + "learning_rate": 9.32956849914775e-05, + "loss": 1.1282, + "step": 2787 + }, + { + "epoch": 0.16939060696275593, + "grad_norm": 0.2067212611436844, + "learning_rate": 9.329089595985189e-05, + "loss": 1.2167, + "step": 2788 + }, + { + "epoch": 0.16945136399538246, + "grad_norm": 0.19152012467384338, + "learning_rate": 9.328610534139013e-05, + "loss": 1.1732, + "step": 2789 + }, + { + "epoch": 0.169512121028009, + "grad_norm": 0.26229506731033325, + "learning_rate": 9.32813131362678e-05, + "loss": 1.1492, + "step": 2790 + }, + { + "epoch": 0.1695728780606355, + "grad_norm": 0.15380549430847168, + "learning_rate": 9.327651934466057e-05, + "loss": 1.0588, + "step": 2791 + }, + { + "epoch": 0.16963363509326204, + "grad_norm": 0.8333106637001038, + "learning_rate": 9.327172396674417e-05, + "loss": 1.109, + "step": 2792 + }, + { + "epoch": 0.16969439212588858, + "grad_norm": 0.16392317414283752, + "learning_rate": 9.326692700269434e-05, + "loss": 1.0875, + "step": 2793 + }, + { + "epoch": 0.1697551491585151, + "grad_norm": 0.14444230496883392, + "learning_rate": 9.326212845268695e-05, + "loss": 1.0835, + "step": 2794 + }, + { + "epoch": 0.16981590619114162, + "grad_norm": 0.13465823233127594, + "learning_rate": 9.325732831689791e-05, + "loss": 1.0591, + "step": 2795 + }, + { + "epoch": 0.16987666322376815, + "grad_norm": 0.244447261095047, + "learning_rate": 9.325252659550309e-05, + "loss": 1.0747, + "step": 2796 + }, + { + "epoch": 0.1699374202563947, + "grad_norm": 0.2744373679161072, + "learning_rate": 9.324772328867858e-05, + "loss": 1.0442, + "step": 2797 + }, + { + "epoch": 0.1699981772890212, + "grad_norm": 0.9067546725273132, + "learning_rate": 9.32429183966004e-05, + "loss": 1.0712, + "step": 2798 + }, + { + "epoch": 0.17005893432164773, + "grad_norm": 0.1323540359735489, + "learning_rate": 9.323811191944469e-05, + "loss": 1.0656, + "step": 2799 + }, + { + "epoch": 0.17011969135427427, + "grad_norm": 0.1757102906703949, + "learning_rate": 9.323330385738762e-05, + "loss": 1.1109, + "step": 2800 + }, + { + "epoch": 0.17018044838690077, + "grad_norm": 0.4943138659000397, + "learning_rate": 9.322849421060546e-05, + "loss": 1.1901, + "step": 2801 + }, + { + "epoch": 0.1702412054195273, + "grad_norm": 8.577896118164062, + "learning_rate": 9.322368297927448e-05, + "loss": 1.1125, + "step": 2802 + }, + { + "epoch": 0.17030196245215384, + "grad_norm": 0.20444080233573914, + "learning_rate": 9.321887016357104e-05, + "loss": 1.1183, + "step": 2803 + }, + { + "epoch": 0.17036271948478035, + "grad_norm": 0.24041442573070526, + "learning_rate": 9.321405576367157e-05, + "loss": 1.3482, + "step": 2804 + }, + { + "epoch": 0.17042347651740689, + "grad_norm": 0.5964112281799316, + "learning_rate": 9.320923977975253e-05, + "loss": 1.2177, + "step": 2805 + }, + { + "epoch": 0.17048423355003342, + "grad_norm": 7.097474575042725, + "learning_rate": 9.320442221199044e-05, + "loss": 1.2228, + "step": 2806 + }, + { + "epoch": 0.17054499058265996, + "grad_norm": 0.2913483679294586, + "learning_rate": 9.319960306056192e-05, + "loss": 1.1019, + "step": 2807 + }, + { + "epoch": 0.17060574761528646, + "grad_norm": 0.2946627736091614, + "learning_rate": 9.319478232564359e-05, + "loss": 1.0749, + "step": 2808 + }, + { + "epoch": 0.170666504647913, + "grad_norm": 0.2581668198108673, + "learning_rate": 9.318996000741215e-05, + "loss": 1.2164, + "step": 2809 + }, + { + "epoch": 0.17072726168053953, + "grad_norm": 0.3176397681236267, + "learning_rate": 9.31851361060444e-05, + "loss": 1.1545, + "step": 2810 + }, + { + "epoch": 0.17078801871316604, + "grad_norm": 0.7890453338623047, + "learning_rate": 9.318031062171715e-05, + "loss": 1.304, + "step": 2811 + }, + { + "epoch": 0.17084877574579257, + "grad_norm": 0.3896808326244354, + "learning_rate": 9.317548355460725e-05, + "loss": 1.3385, + "step": 2812 + }, + { + "epoch": 0.1709095327784191, + "grad_norm": 0.20673814415931702, + "learning_rate": 9.317065490489165e-05, + "loss": 1.149, + "step": 2813 + }, + { + "epoch": 0.17097028981104562, + "grad_norm": 4.286653995513916, + "learning_rate": 9.316582467274735e-05, + "loss": 1.1895, + "step": 2814 + }, + { + "epoch": 0.17103104684367215, + "grad_norm": 0.24091972410678864, + "learning_rate": 9.316099285835141e-05, + "loss": 1.1407, + "step": 2815 + }, + { + "epoch": 0.1710918038762987, + "grad_norm": 0.23865258693695068, + "learning_rate": 9.315615946188093e-05, + "loss": 1.1443, + "step": 2816 + }, + { + "epoch": 0.1711525609089252, + "grad_norm": 0.25635668635368347, + "learning_rate": 9.315132448351309e-05, + "loss": 1.1026, + "step": 2817 + }, + { + "epoch": 0.17121331794155173, + "grad_norm": 0.2651605010032654, + "learning_rate": 9.314648792342511e-05, + "loss": 1.0955, + "step": 2818 + }, + { + "epoch": 0.17127407497417826, + "grad_norm": 0.3693333864212036, + "learning_rate": 9.314164978179427e-05, + "loss": 1.1378, + "step": 2819 + }, + { + "epoch": 0.1713348320068048, + "grad_norm": 3.0835492610931396, + "learning_rate": 9.313681005879791e-05, + "loss": 1.0815, + "step": 2820 + }, + { + "epoch": 0.1713955890394313, + "grad_norm": 0.3346802890300751, + "learning_rate": 9.313196875461345e-05, + "loss": 1.1228, + "step": 2821 + }, + { + "epoch": 0.17145634607205784, + "grad_norm": 53.88895797729492, + "learning_rate": 9.312712586941834e-05, + "loss": 1.1496, + "step": 2822 + }, + { + "epoch": 0.17151710310468438, + "grad_norm": 0.1832890808582306, + "learning_rate": 9.312228140339009e-05, + "loss": 1.0758, + "step": 2823 + }, + { + "epoch": 0.17157786013731088, + "grad_norm": 0.2312437742948532, + "learning_rate": 9.311743535670628e-05, + "loss": 1.1014, + "step": 2824 + }, + { + "epoch": 0.17163861716993742, + "grad_norm": 0.1523139476776123, + "learning_rate": 9.311258772954454e-05, + "loss": 1.0823, + "step": 2825 + }, + { + "epoch": 0.17169937420256395, + "grad_norm": 0.20451554656028748, + "learning_rate": 9.310773852208256e-05, + "loss": 1.1055, + "step": 2826 + }, + { + "epoch": 0.17176013123519046, + "grad_norm": 0.16789209842681885, + "learning_rate": 9.310288773449811e-05, + "loss": 1.1059, + "step": 2827 + }, + { + "epoch": 0.171820888267817, + "grad_norm": 0.21686002612113953, + "learning_rate": 9.309803536696897e-05, + "loss": 1.2283, + "step": 2828 + }, + { + "epoch": 0.17188164530044353, + "grad_norm": 0.18368317186832428, + "learning_rate": 9.3093181419673e-05, + "loss": 1.1682, + "step": 2829 + }, + { + "epoch": 0.17194240233307007, + "grad_norm": 0.19566959142684937, + "learning_rate": 9.308832589278815e-05, + "loss": 1.1125, + "step": 2830 + }, + { + "epoch": 0.17200315936569657, + "grad_norm": 0.17353063821792603, + "learning_rate": 9.308346878649238e-05, + "loss": 1.1333, + "step": 2831 + }, + { + "epoch": 0.1720639163983231, + "grad_norm": 0.1296449452638626, + "learning_rate": 9.307861010096372e-05, + "loss": 1.0655, + "step": 2832 + }, + { + "epoch": 0.17212467343094964, + "grad_norm": 0.17133085429668427, + "learning_rate": 9.30737498363803e-05, + "loss": 1.1367, + "step": 2833 + }, + { + "epoch": 0.17218543046357615, + "grad_norm": 0.16282299160957336, + "learning_rate": 9.306888799292027e-05, + "loss": 1.077, + "step": 2834 + }, + { + "epoch": 0.17224618749620269, + "grad_norm": 0.20644213259220123, + "learning_rate": 9.306402457076178e-05, + "loss": 1.148, + "step": 2835 + }, + { + "epoch": 0.17230694452882922, + "grad_norm": 0.14916478097438812, + "learning_rate": 9.305915957008317e-05, + "loss": 1.0827, + "step": 2836 + }, + { + "epoch": 0.17236770156145573, + "grad_norm": 0.2374756783246994, + "learning_rate": 9.305429299106275e-05, + "loss": 1.0482, + "step": 2837 + }, + { + "epoch": 0.17242845859408226, + "grad_norm": 0.1406780183315277, + "learning_rate": 9.304942483387889e-05, + "loss": 1.0751, + "step": 2838 + }, + { + "epoch": 0.1724892156267088, + "grad_norm": 0.26799002289772034, + "learning_rate": 9.304455509871004e-05, + "loss": 1.1306, + "step": 2839 + }, + { + "epoch": 0.1725499726593353, + "grad_norm": 0.43531063199043274, + "learning_rate": 9.303968378573471e-05, + "loss": 1.1339, + "step": 2840 + }, + { + "epoch": 0.17261072969196184, + "grad_norm": 0.24498429894447327, + "learning_rate": 9.303481089513144e-05, + "loss": 1.2976, + "step": 2841 + }, + { + "epoch": 0.17267148672458837, + "grad_norm": 0.3665919899940491, + "learning_rate": 9.302993642707887e-05, + "loss": 1.1735, + "step": 2842 + }, + { + "epoch": 0.1727322437572149, + "grad_norm": 1.2711642980575562, + "learning_rate": 9.302506038175565e-05, + "loss": 1.2278, + "step": 2843 + }, + { + "epoch": 0.17279300078984142, + "grad_norm": 0.21386763453483582, + "learning_rate": 9.302018275934053e-05, + "loss": 1.0788, + "step": 2844 + }, + { + "epoch": 0.17285375782246795, + "grad_norm": 0.2421104460954666, + "learning_rate": 9.30153035600123e-05, + "loss": 1.1543, + "step": 2845 + }, + { + "epoch": 0.1729145148550945, + "grad_norm": 0.21388423442840576, + "learning_rate": 9.30104227839498e-05, + "loss": 1.1734, + "step": 2846 + }, + { + "epoch": 0.172975271887721, + "grad_norm": 0.21080902218818665, + "learning_rate": 9.300554043133194e-05, + "loss": 1.0814, + "step": 2847 + }, + { + "epoch": 0.17303602892034753, + "grad_norm": 0.2629931569099426, + "learning_rate": 9.300065650233767e-05, + "loss": 1.0584, + "step": 2848 + }, + { + "epoch": 0.17309678595297406, + "grad_norm": 0.22488714754581451, + "learning_rate": 9.299577099714605e-05, + "loss": 1.258, + "step": 2849 + }, + { + "epoch": 0.17315754298560057, + "grad_norm": 0.28502562642097473, + "learning_rate": 9.299088391593611e-05, + "loss": 1.0994, + "step": 2850 + }, + { + "epoch": 0.1732183000182271, + "grad_norm": 0.4765513241291046, + "learning_rate": 9.298599525888703e-05, + "loss": 1.1479, + "step": 2851 + }, + { + "epoch": 0.17327905705085364, + "grad_norm": 0.20497465133666992, + "learning_rate": 9.298110502617796e-05, + "loss": 1.1424, + "step": 2852 + }, + { + "epoch": 0.17333981408348018, + "grad_norm": 0.34637463092803955, + "learning_rate": 9.29762132179882e-05, + "loss": 1.1249, + "step": 2853 + }, + { + "epoch": 0.17340057111610668, + "grad_norm": 0.8920005559921265, + "learning_rate": 9.297131983449702e-05, + "loss": 1.2085, + "step": 2854 + }, + { + "epoch": 0.17346132814873322, + "grad_norm": 0.24549148976802826, + "learning_rate": 9.296642487588381e-05, + "loss": 1.1641, + "step": 2855 + }, + { + "epoch": 0.17352208518135975, + "grad_norm": 0.23367923498153687, + "learning_rate": 9.296152834232801e-05, + "loss": 1.0847, + "step": 2856 + }, + { + "epoch": 0.17358284221398626, + "grad_norm": 0.39331212639808655, + "learning_rate": 9.295663023400907e-05, + "loss": 1.2004, + "step": 2857 + }, + { + "epoch": 0.1736435992466128, + "grad_norm": 0.3297134339809418, + "learning_rate": 9.295173055110653e-05, + "loss": 1.3704, + "step": 2858 + }, + { + "epoch": 0.17370435627923933, + "grad_norm": 0.2672659456729889, + "learning_rate": 9.294682929380003e-05, + "loss": 1.274, + "step": 2859 + }, + { + "epoch": 0.17376511331186584, + "grad_norm": 0.2202669084072113, + "learning_rate": 9.29419264622692e-05, + "loss": 1.0505, + "step": 2860 + }, + { + "epoch": 0.17382587034449237, + "grad_norm": 0.22581006586551666, + "learning_rate": 9.293702205669373e-05, + "loss": 1.2409, + "step": 2861 + }, + { + "epoch": 0.1738866273771189, + "grad_norm": 0.33781954646110535, + "learning_rate": 9.293211607725342e-05, + "loss": 1.2296, + "step": 2862 + }, + { + "epoch": 0.17394738440974541, + "grad_norm": 0.22288081049919128, + "learning_rate": 9.292720852412811e-05, + "loss": 1.1575, + "step": 2863 + }, + { + "epoch": 0.17400814144237195, + "grad_norm": 0.17195765674114227, + "learning_rate": 9.292229939749768e-05, + "loss": 1.1396, + "step": 2864 + }, + { + "epoch": 0.17406889847499848, + "grad_norm": 0.2843230068683624, + "learning_rate": 9.291738869754207e-05, + "loss": 1.1374, + "step": 2865 + }, + { + "epoch": 0.17412965550762502, + "grad_norm": 0.22741979360580444, + "learning_rate": 9.291247642444127e-05, + "loss": 1.1021, + "step": 2866 + }, + { + "epoch": 0.17419041254025153, + "grad_norm": 0.31253963708877563, + "learning_rate": 9.290756257837535e-05, + "loss": 1.2492, + "step": 2867 + }, + { + "epoch": 0.17425116957287806, + "grad_norm": 0.3234567642211914, + "learning_rate": 9.290264715952444e-05, + "loss": 1.1904, + "step": 2868 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 0.19920937716960907, + "learning_rate": 9.28977301680687e-05, + "loss": 1.1446, + "step": 2869 + }, + { + "epoch": 0.1743726836381311, + "grad_norm": 0.2732466161251068, + "learning_rate": 9.289281160418837e-05, + "loss": 1.3202, + "step": 2870 + }, + { + "epoch": 0.17443344067075764, + "grad_norm": 0.15174159407615662, + "learning_rate": 9.288789146806374e-05, + "loss": 1.0677, + "step": 2871 + }, + { + "epoch": 0.17449419770338417, + "grad_norm": 0.26488688588142395, + "learning_rate": 9.288296975987515e-05, + "loss": 1.3555, + "step": 2872 + }, + { + "epoch": 0.17455495473601068, + "grad_norm": 0.3000930845737457, + "learning_rate": 9.287804647980303e-05, + "loss": 1.2565, + "step": 2873 + }, + { + "epoch": 0.17461571176863722, + "grad_norm": 0.1789645552635193, + "learning_rate": 9.287312162802782e-05, + "loss": 1.118, + "step": 2874 + }, + { + "epoch": 0.17467646880126375, + "grad_norm": 0.22553816437721252, + "learning_rate": 9.286819520473004e-05, + "loss": 1.1472, + "step": 2875 + }, + { + "epoch": 0.1747372258338903, + "grad_norm": 0.5480632185935974, + "learning_rate": 9.286326721009029e-05, + "loss": 1.1658, + "step": 2876 + }, + { + "epoch": 0.1747979828665168, + "grad_norm": 0.2610057294368744, + "learning_rate": 9.28583376442892e-05, + "loss": 1.181, + "step": 2877 + }, + { + "epoch": 0.17485873989914333, + "grad_norm": 0.5123583674430847, + "learning_rate": 9.285340650750745e-05, + "loss": 1.2391, + "step": 2878 + }, + { + "epoch": 0.17491949693176986, + "grad_norm": 0.18001104891300201, + "learning_rate": 9.284847379992578e-05, + "loss": 1.0335, + "step": 2879 + }, + { + "epoch": 0.17498025396439637, + "grad_norm": 0.261791467666626, + "learning_rate": 9.284353952172506e-05, + "loss": 1.2783, + "step": 2880 + }, + { + "epoch": 0.1750410109970229, + "grad_norm": 0.3688179552555084, + "learning_rate": 9.28386036730861e-05, + "loss": 1.1358, + "step": 2881 + }, + { + "epoch": 0.17510176802964944, + "grad_norm": 0.34989133477211, + "learning_rate": 9.283366625418984e-05, + "loss": 1.1603, + "step": 2882 + }, + { + "epoch": 0.17516252506227595, + "grad_norm": 0.30162036418914795, + "learning_rate": 9.282872726521726e-05, + "loss": 1.1486, + "step": 2883 + }, + { + "epoch": 0.17522328209490248, + "grad_norm": 0.38653531670570374, + "learning_rate": 9.282378670634941e-05, + "loss": 1.0619, + "step": 2884 + }, + { + "epoch": 0.17528403912752902, + "grad_norm": 0.16464611887931824, + "learning_rate": 9.281884457776739e-05, + "loss": 1.1325, + "step": 2885 + }, + { + "epoch": 0.17534479616015552, + "grad_norm": 0.3459137976169586, + "learning_rate": 9.281390087965232e-05, + "loss": 1.1303, + "step": 2886 + }, + { + "epoch": 0.17540555319278206, + "grad_norm": 0.4004635214805603, + "learning_rate": 9.280895561218546e-05, + "loss": 1.1083, + "step": 2887 + }, + { + "epoch": 0.1754663102254086, + "grad_norm": 0.20689265429973602, + "learning_rate": 9.280400877554805e-05, + "loss": 1.0448, + "step": 2888 + }, + { + "epoch": 0.17552706725803513, + "grad_norm": 0.5363435745239258, + "learning_rate": 9.279906036992143e-05, + "loss": 1.0712, + "step": 2889 + }, + { + "epoch": 0.17558782429066164, + "grad_norm": 0.5169433355331421, + "learning_rate": 9.279411039548697e-05, + "loss": 1.1009, + "step": 2890 + }, + { + "epoch": 0.17564858132328817, + "grad_norm": 0.22366322576999664, + "learning_rate": 9.278915885242612e-05, + "loss": 1.1869, + "step": 2891 + }, + { + "epoch": 0.1757093383559147, + "grad_norm": 0.5242984294891357, + "learning_rate": 9.278420574092039e-05, + "loss": 1.1001, + "step": 2892 + }, + { + "epoch": 0.17577009538854121, + "grad_norm": 0.5860154032707214, + "learning_rate": 9.277925106115133e-05, + "loss": 1.056, + "step": 2893 + }, + { + "epoch": 0.17583085242116775, + "grad_norm": 0.24157953262329102, + "learning_rate": 9.277429481330054e-05, + "loss": 1.1536, + "step": 2894 + }, + { + "epoch": 0.17589160945379428, + "grad_norm": 2.524925708770752, + "learning_rate": 9.276933699754973e-05, + "loss": 1.1161, + "step": 2895 + }, + { + "epoch": 0.1759523664864208, + "grad_norm": 0.9460970163345337, + "learning_rate": 9.276437761408058e-05, + "loss": 1.1056, + "step": 2896 + }, + { + "epoch": 0.17601312351904733, + "grad_norm": 0.19609951972961426, + "learning_rate": 9.27594166630749e-05, + "loss": 1.0992, + "step": 2897 + }, + { + "epoch": 0.17607388055167386, + "grad_norm": 0.3549923300743103, + "learning_rate": 9.275445414471455e-05, + "loss": 1.2927, + "step": 2898 + }, + { + "epoch": 0.1761346375843004, + "grad_norm": 0.6184433102607727, + "learning_rate": 9.274949005918143e-05, + "loss": 1.1187, + "step": 2899 + }, + { + "epoch": 0.1761953946169269, + "grad_norm": 0.21333909034729004, + "learning_rate": 9.274452440665748e-05, + "loss": 1.0563, + "step": 2900 + }, + { + "epoch": 0.17625615164955344, + "grad_norm": 0.48458486795425415, + "learning_rate": 9.273955718732472e-05, + "loss": 1.1032, + "step": 2901 + }, + { + "epoch": 0.17631690868217997, + "grad_norm": 0.3009887635707855, + "learning_rate": 9.273458840136523e-05, + "loss": 1.1468, + "step": 2902 + }, + { + "epoch": 0.17637766571480648, + "grad_norm": 0.207598015666008, + "learning_rate": 9.272961804896114e-05, + "loss": 1.1485, + "step": 2903 + }, + { + "epoch": 0.17643842274743302, + "grad_norm": 0.5576958060264587, + "learning_rate": 9.272464613029466e-05, + "loss": 1.1241, + "step": 2904 + }, + { + "epoch": 0.17649917978005955, + "grad_norm": 0.36373335123062134, + "learning_rate": 9.271967264554799e-05, + "loss": 1.1535, + "step": 2905 + }, + { + "epoch": 0.17655993681268606, + "grad_norm": 0.3215940296649933, + "learning_rate": 9.271469759490348e-05, + "loss": 1.1741, + "step": 2906 + }, + { + "epoch": 0.1766206938453126, + "grad_norm": 0.22963079810142517, + "learning_rate": 9.270972097854346e-05, + "loss": 1.0935, + "step": 2907 + }, + { + "epoch": 0.17668145087793913, + "grad_norm": 0.37165361642837524, + "learning_rate": 9.270474279665037e-05, + "loss": 1.2967, + "step": 2908 + }, + { + "epoch": 0.17674220791056564, + "grad_norm": 0.31660640239715576, + "learning_rate": 9.269976304940669e-05, + "loss": 1.0709, + "step": 2909 + }, + { + "epoch": 0.17680296494319217, + "grad_norm": 0.18994460999965668, + "learning_rate": 9.269478173699492e-05, + "loss": 1.0734, + "step": 2910 + }, + { + "epoch": 0.1768637219758187, + "grad_norm": 0.3182249963283539, + "learning_rate": 9.268979885959767e-05, + "loss": 1.1146, + "step": 2911 + }, + { + "epoch": 0.17692447900844524, + "grad_norm": 0.2843031585216522, + "learning_rate": 9.26848144173976e-05, + "loss": 1.2331, + "step": 2912 + }, + { + "epoch": 0.17698523604107175, + "grad_norm": 0.2277602106332779, + "learning_rate": 9.267982841057743e-05, + "loss": 1.2026, + "step": 2913 + }, + { + "epoch": 0.17704599307369828, + "grad_norm": 3.008589029312134, + "learning_rate": 9.267484083931987e-05, + "loss": 1.1579, + "step": 2914 + }, + { + "epoch": 0.17710675010632482, + "grad_norm": 0.21989019215106964, + "learning_rate": 9.266985170380777e-05, + "loss": 1.1289, + "step": 2915 + }, + { + "epoch": 0.17716750713895132, + "grad_norm": 3.9978926181793213, + "learning_rate": 9.266486100422402e-05, + "loss": 1.0819, + "step": 2916 + }, + { + "epoch": 0.17722826417157786, + "grad_norm": 0.18277336657047272, + "learning_rate": 9.265986874075155e-05, + "loss": 1.1572, + "step": 2917 + }, + { + "epoch": 0.1772890212042044, + "grad_norm": 0.19926071166992188, + "learning_rate": 9.265487491357334e-05, + "loss": 1.2134, + "step": 2918 + }, + { + "epoch": 0.1773497782368309, + "grad_norm": 0.19073501229286194, + "learning_rate": 9.264987952287243e-05, + "loss": 1.0866, + "step": 2919 + }, + { + "epoch": 0.17741053526945744, + "grad_norm": 0.1900995820760727, + "learning_rate": 9.264488256883194e-05, + "loss": 1.0677, + "step": 2920 + }, + { + "epoch": 0.17747129230208397, + "grad_norm": 0.23190531134605408, + "learning_rate": 9.263988405163506e-05, + "loss": 1.3003, + "step": 2921 + }, + { + "epoch": 0.17753204933471048, + "grad_norm": 0.3566952049732208, + "learning_rate": 9.263488397146498e-05, + "loss": 1.1074, + "step": 2922 + }, + { + "epoch": 0.177592806367337, + "grad_norm": 0.4597981572151184, + "learning_rate": 9.262988232850497e-05, + "loss": 1.1799, + "step": 2923 + }, + { + "epoch": 0.17765356339996355, + "grad_norm": 0.20072588324546814, + "learning_rate": 9.262487912293841e-05, + "loss": 1.1389, + "step": 2924 + }, + { + "epoch": 0.17771432043259008, + "grad_norm": 0.28654614090919495, + "learning_rate": 9.261987435494866e-05, + "loss": 1.219, + "step": 2925 + }, + { + "epoch": 0.1777750774652166, + "grad_norm": 5.362672328948975, + "learning_rate": 9.261486802471916e-05, + "loss": 1.1247, + "step": 2926 + }, + { + "epoch": 0.17783583449784313, + "grad_norm": 0.2221587747335434, + "learning_rate": 9.260986013243345e-05, + "loss": 1.1279, + "step": 2927 + }, + { + "epoch": 0.17789659153046966, + "grad_norm": 0.8303794860839844, + "learning_rate": 9.260485067827508e-05, + "loss": 1.0504, + "step": 2928 + }, + { + "epoch": 0.17795734856309617, + "grad_norm": 0.22825317084789276, + "learning_rate": 9.259983966242767e-05, + "loss": 1.1048, + "step": 2929 + }, + { + "epoch": 0.1780181055957227, + "grad_norm": 0.21876034140586853, + "learning_rate": 9.25948270850749e-05, + "loss": 1.196, + "step": 2930 + }, + { + "epoch": 0.17807886262834924, + "grad_norm": 0.18129688501358032, + "learning_rate": 9.258981294640051e-05, + "loss": 1.1931, + "step": 2931 + }, + { + "epoch": 0.17813961966097575, + "grad_norm": 0.2925190329551697, + "learning_rate": 9.25847972465883e-05, + "loss": 1.0909, + "step": 2932 + }, + { + "epoch": 0.17820037669360228, + "grad_norm": 0.19749584794044495, + "learning_rate": 9.25797799858221e-05, + "loss": 1.0849, + "step": 2933 + }, + { + "epoch": 0.17826113372622882, + "grad_norm": 0.2684710919857025, + "learning_rate": 9.257476116428586e-05, + "loss": 1.1571, + "step": 2934 + }, + { + "epoch": 0.17832189075885535, + "grad_norm": 0.19014763832092285, + "learning_rate": 9.256974078216349e-05, + "loss": 1.1072, + "step": 2935 + }, + { + "epoch": 0.17838264779148186, + "grad_norm": 0.3571806848049164, + "learning_rate": 9.256471883963906e-05, + "loss": 1.0365, + "step": 2936 + }, + { + "epoch": 0.1784434048241084, + "grad_norm": 11.446825981140137, + "learning_rate": 9.255969533689664e-05, + "loss": 1.0546, + "step": 2937 + }, + { + "epoch": 0.17850416185673493, + "grad_norm": 0.2527330219745636, + "learning_rate": 9.255467027412034e-05, + "loss": 1.1649, + "step": 2938 + }, + { + "epoch": 0.17856491888936143, + "grad_norm": 0.19866426289081573, + "learning_rate": 9.25496436514944e-05, + "loss": 1.1168, + "step": 2939 + }, + { + "epoch": 0.17862567592198797, + "grad_norm": 0.1877591460943222, + "learning_rate": 9.254461546920303e-05, + "loss": 1.0906, + "step": 2940 + }, + { + "epoch": 0.1786864329546145, + "grad_norm": 0.17339429259300232, + "learning_rate": 9.253958572743058e-05, + "loss": 1.1445, + "step": 2941 + }, + { + "epoch": 0.178747189987241, + "grad_norm": 3.7702085971832275, + "learning_rate": 9.253455442636137e-05, + "loss": 1.1286, + "step": 2942 + }, + { + "epoch": 0.17880794701986755, + "grad_norm": 2.1460444927215576, + "learning_rate": 9.252952156617985e-05, + "loss": 1.1913, + "step": 2943 + }, + { + "epoch": 0.17886870405249408, + "grad_norm": 0.31069785356521606, + "learning_rate": 9.25244871470705e-05, + "loss": 1.2596, + "step": 2944 + }, + { + "epoch": 0.1789294610851206, + "grad_norm": 0.34162893891334534, + "learning_rate": 9.251945116921786e-05, + "loss": 1.0846, + "step": 2945 + }, + { + "epoch": 0.17899021811774712, + "grad_norm": 0.2573460638523102, + "learning_rate": 9.25144136328065e-05, + "loss": 1.1143, + "step": 2946 + }, + { + "epoch": 0.17905097515037366, + "grad_norm": 0.24816922843456268, + "learning_rate": 9.250937453802109e-05, + "loss": 1.2445, + "step": 2947 + }, + { + "epoch": 0.1791117321830002, + "grad_norm": 0.7603325247764587, + "learning_rate": 9.250433388504635e-05, + "loss": 1.1113, + "step": 2948 + }, + { + "epoch": 0.1791724892156267, + "grad_norm": 0.28203895688056946, + "learning_rate": 9.249929167406703e-05, + "loss": 1.2478, + "step": 2949 + }, + { + "epoch": 0.17923324624825324, + "grad_norm": 0.5692694783210754, + "learning_rate": 9.249424790526797e-05, + "loss": 1.2256, + "step": 2950 + }, + { + "epoch": 0.17929400328087977, + "grad_norm": 1.2878317832946777, + "learning_rate": 9.248920257883404e-05, + "loss": 1.2078, + "step": 2951 + }, + { + "epoch": 0.17935476031350628, + "grad_norm": 0.30191266536712646, + "learning_rate": 9.248415569495016e-05, + "loss": 1.0396, + "step": 2952 + }, + { + "epoch": 0.1794155173461328, + "grad_norm": 0.2611783444881439, + "learning_rate": 9.247910725380136e-05, + "loss": 1.1523, + "step": 2953 + }, + { + "epoch": 0.17947627437875935, + "grad_norm": 1.5869423151016235, + "learning_rate": 9.247405725557266e-05, + "loss": 1.2259, + "step": 2954 + }, + { + "epoch": 0.17953703141138586, + "grad_norm": 0.35859984159469604, + "learning_rate": 9.246900570044917e-05, + "loss": 1.1144, + "step": 2955 + }, + { + "epoch": 0.1795977884440124, + "grad_norm": 0.26638662815093994, + "learning_rate": 9.246395258861609e-05, + "loss": 1.1497, + "step": 2956 + }, + { + "epoch": 0.17965854547663893, + "grad_norm": 0.15947876870632172, + "learning_rate": 9.245889792025862e-05, + "loss": 1.0957, + "step": 2957 + }, + { + "epoch": 0.17971930250926546, + "grad_norm": 0.2812144160270691, + "learning_rate": 9.245384169556202e-05, + "loss": 1.129, + "step": 2958 + }, + { + "epoch": 0.17978005954189197, + "grad_norm": 0.2027055025100708, + "learning_rate": 9.244878391471166e-05, + "loss": 1.2172, + "step": 2959 + }, + { + "epoch": 0.1798408165745185, + "grad_norm": 0.15256880223751068, + "learning_rate": 9.244372457789291e-05, + "loss": 1.1098, + "step": 2960 + }, + { + "epoch": 0.17990157360714504, + "grad_norm": 0.276745080947876, + "learning_rate": 9.243866368529124e-05, + "loss": 1.1624, + "step": 2961 + }, + { + "epoch": 0.17996233063977154, + "grad_norm": 0.3117915987968445, + "learning_rate": 9.243360123709215e-05, + "loss": 1.1335, + "step": 2962 + }, + { + "epoch": 0.18002308767239808, + "grad_norm": 0.15853925049304962, + "learning_rate": 9.242853723348119e-05, + "loss": 1.1877, + "step": 2963 + }, + { + "epoch": 0.18008384470502461, + "grad_norm": 0.24287500977516174, + "learning_rate": 9.242347167464402e-05, + "loss": 1.1927, + "step": 2964 + }, + { + "epoch": 0.18014460173765112, + "grad_norm": 0.19106827676296234, + "learning_rate": 9.241840456076628e-05, + "loss": 1.097, + "step": 2965 + }, + { + "epoch": 0.18020535877027766, + "grad_norm": 0.18172281980514526, + "learning_rate": 9.241333589203372e-05, + "loss": 1.2125, + "step": 2966 + }, + { + "epoch": 0.1802661158029042, + "grad_norm": 0.1866511106491089, + "learning_rate": 9.240826566863214e-05, + "loss": 1.1017, + "step": 2967 + }, + { + "epoch": 0.1803268728355307, + "grad_norm": 0.14384622871875763, + "learning_rate": 9.240319389074739e-05, + "loss": 1.0846, + "step": 2968 + }, + { + "epoch": 0.18038762986815723, + "grad_norm": 5.6967010498046875, + "learning_rate": 9.239812055856536e-05, + "loss": 1.1679, + "step": 2969 + }, + { + "epoch": 0.18044838690078377, + "grad_norm": 0.26043370366096497, + "learning_rate": 9.239304567227204e-05, + "loss": 1.0847, + "step": 2970 + }, + { + "epoch": 0.1805091439334103, + "grad_norm": 0.2230168581008911, + "learning_rate": 9.238796923205343e-05, + "loss": 1.1662, + "step": 2971 + }, + { + "epoch": 0.1805699009660368, + "grad_norm": 0.270250529050827, + "learning_rate": 9.238289123809561e-05, + "loss": 1.1246, + "step": 2972 + }, + { + "epoch": 0.18063065799866335, + "grad_norm": 0.7367274761199951, + "learning_rate": 9.237781169058473e-05, + "loss": 1.2119, + "step": 2973 + }, + { + "epoch": 0.18069141503128988, + "grad_norm": 0.31884264945983887, + "learning_rate": 9.237273058970694e-05, + "loss": 1.1147, + "step": 2974 + }, + { + "epoch": 0.1807521720639164, + "grad_norm": 0.1792020946741104, + "learning_rate": 9.236764793564857e-05, + "loss": 1.1491, + "step": 2975 + }, + { + "epoch": 0.18081292909654292, + "grad_norm": 0.8205214738845825, + "learning_rate": 9.236256372859583e-05, + "loss": 1.2785, + "step": 2976 + }, + { + "epoch": 0.18087368612916946, + "grad_norm": 0.3323436975479126, + "learning_rate": 9.235747796873514e-05, + "loss": 1.1044, + "step": 2977 + }, + { + "epoch": 0.18093444316179597, + "grad_norm": 0.22030521929264069, + "learning_rate": 9.235239065625292e-05, + "loss": 1.0525, + "step": 2978 + }, + { + "epoch": 0.1809952001944225, + "grad_norm": 0.2923031151294708, + "learning_rate": 9.234730179133564e-05, + "loss": 1.0687, + "step": 2979 + }, + { + "epoch": 0.18105595722704904, + "grad_norm": 0.16640354692935944, + "learning_rate": 9.234221137416981e-05, + "loss": 1.0725, + "step": 2980 + }, + { + "epoch": 0.18111671425967557, + "grad_norm": 0.408065527677536, + "learning_rate": 9.233711940494204e-05, + "loss": 1.1525, + "step": 2981 + }, + { + "epoch": 0.18117747129230208, + "grad_norm": 0.37556928396224976, + "learning_rate": 9.233202588383897e-05, + "loss": 1.2737, + "step": 2982 + }, + { + "epoch": 0.1812382283249286, + "grad_norm": 0.20571397244930267, + "learning_rate": 9.232693081104734e-05, + "loss": 1.1488, + "step": 2983 + }, + { + "epoch": 0.18129898535755515, + "grad_norm": 0.36000514030456543, + "learning_rate": 9.232183418675384e-05, + "loss": 1.1286, + "step": 2984 + }, + { + "epoch": 0.18135974239018166, + "grad_norm": 0.35860154032707214, + "learning_rate": 9.231673601114534e-05, + "loss": 1.0556, + "step": 2985 + }, + { + "epoch": 0.1814204994228082, + "grad_norm": 0.31243014335632324, + "learning_rate": 9.231163628440871e-05, + "loss": 1.221, + "step": 2986 + }, + { + "epoch": 0.18148125645543473, + "grad_norm": 0.22265656292438507, + "learning_rate": 9.230653500673086e-05, + "loss": 1.0512, + "step": 2987 + }, + { + "epoch": 0.18154201348806123, + "grad_norm": 0.44037526845932007, + "learning_rate": 9.230143217829881e-05, + "loss": 1.2424, + "step": 2988 + }, + { + "epoch": 0.18160277052068777, + "grad_norm": 0.18836897611618042, + "learning_rate": 9.229632779929957e-05, + "loss": 1.0907, + "step": 2989 + }, + { + "epoch": 0.1816635275533143, + "grad_norm": 0.680586040019989, + "learning_rate": 9.229122186992026e-05, + "loss": 1.0429, + "step": 2990 + }, + { + "epoch": 0.1817242845859408, + "grad_norm": 0.22910404205322266, + "learning_rate": 9.228611439034804e-05, + "loss": 1.3046, + "step": 2991 + }, + { + "epoch": 0.18178504161856734, + "grad_norm": 0.18243393301963806, + "learning_rate": 9.228100536077013e-05, + "loss": 1.1163, + "step": 2992 + }, + { + "epoch": 0.18184579865119388, + "grad_norm": 0.20854873955249786, + "learning_rate": 9.22758947813738e-05, + "loss": 1.121, + "step": 2993 + }, + { + "epoch": 0.18190655568382041, + "grad_norm": 17.038894653320312, + "learning_rate": 9.227078265234637e-05, + "loss": 1.1108, + "step": 2994 + }, + { + "epoch": 0.18196731271644692, + "grad_norm": 0.23351851105690002, + "learning_rate": 9.226566897387522e-05, + "loss": 1.1237, + "step": 2995 + }, + { + "epoch": 0.18202806974907346, + "grad_norm": 0.18451769649982452, + "learning_rate": 9.226055374614781e-05, + "loss": 1.1559, + "step": 2996 + }, + { + "epoch": 0.1820888267817, + "grad_norm": 0.248361274600029, + "learning_rate": 9.225543696935164e-05, + "loss": 1.0858, + "step": 2997 + }, + { + "epoch": 0.1821495838143265, + "grad_norm": 0.18257442116737366, + "learning_rate": 9.225031864367425e-05, + "loss": 1.1095, + "step": 2998 + }, + { + "epoch": 0.18221034084695303, + "grad_norm": 0.29653942584991455, + "learning_rate": 9.224519876930327e-05, + "loss": 1.2073, + "step": 2999 + }, + { + "epoch": 0.18227109787957957, + "grad_norm": 0.1599666327238083, + "learning_rate": 9.224007734642637e-05, + "loss": 1.0709, + "step": 3000 + }, + { + "epoch": 0.18233185491220608, + "grad_norm": 0.3649294078350067, + "learning_rate": 9.223495437523127e-05, + "loss": 1.1197, + "step": 3001 + }, + { + "epoch": 0.1823926119448326, + "grad_norm": 0.2119244933128357, + "learning_rate": 9.222982985590574e-05, + "loss": 1.2567, + "step": 3002 + }, + { + "epoch": 0.18245336897745915, + "grad_norm": 0.21541699767112732, + "learning_rate": 9.222470378863763e-05, + "loss": 1.1022, + "step": 3003 + }, + { + "epoch": 0.18251412601008568, + "grad_norm": 0.30862703919410706, + "learning_rate": 9.221957617361485e-05, + "loss": 1.1673, + "step": 3004 + }, + { + "epoch": 0.1825748830427122, + "grad_norm": 0.3197075426578522, + "learning_rate": 9.221444701102534e-05, + "loss": 1.1253, + "step": 3005 + }, + { + "epoch": 0.18263564007533872, + "grad_norm": 0.18545778095722198, + "learning_rate": 9.220931630105712e-05, + "loss": 1.0923, + "step": 3006 + }, + { + "epoch": 0.18269639710796526, + "grad_norm": 0.14189809560775757, + "learning_rate": 9.220418404389826e-05, + "loss": 1.0865, + "step": 3007 + }, + { + "epoch": 0.18275715414059177, + "grad_norm": 0.15887649357318878, + "learning_rate": 9.219905023973687e-05, + "loss": 1.0706, + "step": 3008 + }, + { + "epoch": 0.1828179111732183, + "grad_norm": 0.18498313426971436, + "learning_rate": 9.219391488876112e-05, + "loss": 1.1515, + "step": 3009 + }, + { + "epoch": 0.18287866820584484, + "grad_norm": 27.192461013793945, + "learning_rate": 9.218877799115928e-05, + "loss": 1.1109, + "step": 3010 + }, + { + "epoch": 0.18293942523847134, + "grad_norm": 0.2581998109817505, + "learning_rate": 9.218363954711964e-05, + "loss": 1.1937, + "step": 3011 + }, + { + "epoch": 0.18300018227109788, + "grad_norm": 0.26588180661201477, + "learning_rate": 9.217849955683051e-05, + "loss": 1.261, + "step": 3012 + }, + { + "epoch": 0.1830609393037244, + "grad_norm": 0.23390227556228638, + "learning_rate": 9.217335802048035e-05, + "loss": 1.1572, + "step": 3013 + }, + { + "epoch": 0.18312169633635092, + "grad_norm": 0.20528551936149597, + "learning_rate": 9.21682149382576e-05, + "loss": 1.1749, + "step": 3014 + }, + { + "epoch": 0.18318245336897745, + "grad_norm": 0.308460533618927, + "learning_rate": 9.216307031035077e-05, + "loss": 1.0873, + "step": 3015 + }, + { + "epoch": 0.183243210401604, + "grad_norm": 0.26810169219970703, + "learning_rate": 9.215792413694844e-05, + "loss": 1.2088, + "step": 3016 + }, + { + "epoch": 0.18330396743423052, + "grad_norm": 0.8332200050354004, + "learning_rate": 9.215277641823925e-05, + "loss": 1.1477, + "step": 3017 + }, + { + "epoch": 0.18336472446685703, + "grad_norm": 0.16305555403232574, + "learning_rate": 9.214762715441192e-05, + "loss": 1.0813, + "step": 3018 + }, + { + "epoch": 0.18342548149948357, + "grad_norm": 0.20657306909561157, + "learning_rate": 9.214247634565514e-05, + "loss": 1.1083, + "step": 3019 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 0.23661090433597565, + "learning_rate": 9.213732399215776e-05, + "loss": 1.2504, + "step": 3020 + }, + { + "epoch": 0.1835469955647366, + "grad_norm": 0.22513361275196075, + "learning_rate": 9.213217009410862e-05, + "loss": 1.0823, + "step": 3021 + }, + { + "epoch": 0.18360775259736314, + "grad_norm": 0.20578782260417938, + "learning_rate": 9.212701465169664e-05, + "loss": 1.1396, + "step": 3022 + }, + { + "epoch": 0.18366850962998968, + "grad_norm": 2.9302711486816406, + "learning_rate": 9.21218576651108e-05, + "loss": 1.1487, + "step": 3023 + }, + { + "epoch": 0.1837292666626162, + "grad_norm": 0.19550593197345734, + "learning_rate": 9.211669913454013e-05, + "loss": 1.0833, + "step": 3024 + }, + { + "epoch": 0.18379002369524272, + "grad_norm": 0.1986549347639084, + "learning_rate": 9.21115390601737e-05, + "loss": 1.1655, + "step": 3025 + }, + { + "epoch": 0.18385078072786926, + "grad_norm": 0.24554631114006042, + "learning_rate": 9.210637744220067e-05, + "loss": 1.0549, + "step": 3026 + }, + { + "epoch": 0.18391153776049576, + "grad_norm": 0.29472488164901733, + "learning_rate": 9.210121428081022e-05, + "loss": 1.2964, + "step": 3027 + }, + { + "epoch": 0.1839722947931223, + "grad_norm": 0.26621246337890625, + "learning_rate": 9.209604957619164e-05, + "loss": 1.0757, + "step": 3028 + }, + { + "epoch": 0.18403305182574883, + "grad_norm": 1.509060263633728, + "learning_rate": 9.209088332853424e-05, + "loss": 1.1352, + "step": 3029 + }, + { + "epoch": 0.18409380885837537, + "grad_norm": 0.15099680423736572, + "learning_rate": 9.208571553802734e-05, + "loss": 1.1149, + "step": 3030 + }, + { + "epoch": 0.18415456589100188, + "grad_norm": 0.3969057500362396, + "learning_rate": 9.208054620486043e-05, + "loss": 1.1671, + "step": 3031 + }, + { + "epoch": 0.1842153229236284, + "grad_norm": 28.691776275634766, + "learning_rate": 9.207537532922296e-05, + "loss": 1.2, + "step": 3032 + }, + { + "epoch": 0.18427607995625495, + "grad_norm": 0.2952224910259247, + "learning_rate": 9.207020291130448e-05, + "loss": 1.2385, + "step": 3033 + }, + { + "epoch": 0.18433683698888145, + "grad_norm": 0.16883589327335358, + "learning_rate": 9.206502895129456e-05, + "loss": 1.1068, + "step": 3034 + }, + { + "epoch": 0.184397594021508, + "grad_norm": 0.19729213416576385, + "learning_rate": 9.205985344938288e-05, + "loss": 1.1054, + "step": 3035 + }, + { + "epoch": 0.18445835105413452, + "grad_norm": 0.435029536485672, + "learning_rate": 9.205467640575916e-05, + "loss": 1.1658, + "step": 3036 + }, + { + "epoch": 0.18451910808676103, + "grad_norm": 0.15445058047771454, + "learning_rate": 9.204949782061311e-05, + "loss": 1.038, + "step": 3037 + }, + { + "epoch": 0.18457986511938757, + "grad_norm": 0.19000063836574554, + "learning_rate": 9.204431769413462e-05, + "loss": 1.2583, + "step": 3038 + }, + { + "epoch": 0.1846406221520141, + "grad_norm": 0.16063889861106873, + "learning_rate": 9.203913602651354e-05, + "loss": 1.1036, + "step": 3039 + }, + { + "epoch": 0.18470137918464063, + "grad_norm": 0.47722092270851135, + "learning_rate": 9.203395281793979e-05, + "loss": 1.0754, + "step": 3040 + }, + { + "epoch": 0.18476213621726714, + "grad_norm": 0.22657538950443268, + "learning_rate": 9.202876806860338e-05, + "loss": 1.0867, + "step": 3041 + }, + { + "epoch": 0.18482289324989368, + "grad_norm": 0.2091595083475113, + "learning_rate": 9.202358177869435e-05, + "loss": 1.2261, + "step": 3042 + }, + { + "epoch": 0.1848836502825202, + "grad_norm": 0.1922287940979004, + "learning_rate": 9.20183939484028e-05, + "loss": 1.179, + "step": 3043 + }, + { + "epoch": 0.18494440731514672, + "grad_norm": 0.2977611720561981, + "learning_rate": 9.201320457791891e-05, + "loss": 1.0848, + "step": 3044 + }, + { + "epoch": 0.18500516434777325, + "grad_norm": 0.14819122850894928, + "learning_rate": 9.200801366743289e-05, + "loss": 1.1249, + "step": 3045 + }, + { + "epoch": 0.1850659213803998, + "grad_norm": 0.20349188148975372, + "learning_rate": 9.200282121713499e-05, + "loss": 1.1914, + "step": 3046 + }, + { + "epoch": 0.1851266784130263, + "grad_norm": 0.25512316823005676, + "learning_rate": 9.199762722721557e-05, + "loss": 1.0801, + "step": 3047 + }, + { + "epoch": 0.18518743544565283, + "grad_norm": 0.3465666174888611, + "learning_rate": 9.1992431697865e-05, + "loss": 1.3186, + "step": 3048 + }, + { + "epoch": 0.18524819247827937, + "grad_norm": 0.202579066157341, + "learning_rate": 9.198723462927375e-05, + "loss": 1.0727, + "step": 3049 + }, + { + "epoch": 0.18530894951090587, + "grad_norm": 0.27540209889411926, + "learning_rate": 9.198203602163227e-05, + "loss": 1.0662, + "step": 3050 + }, + { + "epoch": 0.1853697065435324, + "grad_norm": 0.21132218837738037, + "learning_rate": 9.197683587513116e-05, + "loss": 1.1831, + "step": 3051 + }, + { + "epoch": 0.18543046357615894, + "grad_norm": 0.1563459038734436, + "learning_rate": 9.197163418996101e-05, + "loss": 1.0974, + "step": 3052 + }, + { + "epoch": 0.18549122060878548, + "grad_norm": 0.4012366235256195, + "learning_rate": 9.19664309663125e-05, + "loss": 1.0923, + "step": 3053 + }, + { + "epoch": 0.18555197764141199, + "grad_norm": 0.14499591290950775, + "learning_rate": 9.196122620437635e-05, + "loss": 1.0746, + "step": 3054 + }, + { + "epoch": 0.18561273467403852, + "grad_norm": 0.21168065071105957, + "learning_rate": 9.195601990434335e-05, + "loss": 1.2122, + "step": 3055 + }, + { + "epoch": 0.18567349170666506, + "grad_norm": 0.2861902415752411, + "learning_rate": 9.195081206640433e-05, + "loss": 1.1907, + "step": 3056 + }, + { + "epoch": 0.18573424873929156, + "grad_norm": 0.35591644048690796, + "learning_rate": 9.194560269075016e-05, + "loss": 1.0346, + "step": 3057 + }, + { + "epoch": 0.1857950057719181, + "grad_norm": 0.29218196868896484, + "learning_rate": 9.194039177757184e-05, + "loss": 1.0847, + "step": 3058 + }, + { + "epoch": 0.18585576280454463, + "grad_norm": 0.4533981680870056, + "learning_rate": 9.193517932706036e-05, + "loss": 1.101, + "step": 3059 + }, + { + "epoch": 0.18591651983717114, + "grad_norm": 0.20430108904838562, + "learning_rate": 9.192996533940675e-05, + "loss": 1.1202, + "step": 3060 + }, + { + "epoch": 0.18597727686979768, + "grad_norm": 0.3417561650276184, + "learning_rate": 9.192474981480216e-05, + "loss": 1.1267, + "step": 3061 + }, + { + "epoch": 0.1860380339024242, + "grad_norm": 0.22390389442443848, + "learning_rate": 9.191953275343779e-05, + "loss": 1.3037, + "step": 3062 + }, + { + "epoch": 0.18609879093505075, + "grad_norm": 0.18954911828041077, + "learning_rate": 9.191431415550482e-05, + "loss": 1.0919, + "step": 3063 + }, + { + "epoch": 0.18615954796767725, + "grad_norm": 0.15950433909893036, + "learning_rate": 9.190909402119456e-05, + "loss": 1.1088, + "step": 3064 + }, + { + "epoch": 0.1862203050003038, + "grad_norm": 0.28802913427352905, + "learning_rate": 9.190387235069837e-05, + "loss": 1.1666, + "step": 3065 + }, + { + "epoch": 0.18628106203293032, + "grad_norm": 0.31233733892440796, + "learning_rate": 9.189864914420763e-05, + "loss": 1.0677, + "step": 3066 + }, + { + "epoch": 0.18634181906555683, + "grad_norm": 0.1936078518629074, + "learning_rate": 9.189342440191382e-05, + "loss": 1.1203, + "step": 3067 + }, + { + "epoch": 0.18640257609818336, + "grad_norm": 0.20946606993675232, + "learning_rate": 9.188819812400842e-05, + "loss": 1.1818, + "step": 3068 + }, + { + "epoch": 0.1864633331308099, + "grad_norm": 0.4461238980293274, + "learning_rate": 9.188297031068304e-05, + "loss": 1.0657, + "step": 3069 + }, + { + "epoch": 0.1865240901634364, + "grad_norm": 0.17963141202926636, + "learning_rate": 9.187774096212926e-05, + "loss": 1.1849, + "step": 3070 + }, + { + "epoch": 0.18658484719606294, + "grad_norm": 0.3228558897972107, + "learning_rate": 9.187251007853883e-05, + "loss": 1.2921, + "step": 3071 + }, + { + "epoch": 0.18664560422868948, + "grad_norm": 0.17975732684135437, + "learning_rate": 9.186727766010343e-05, + "loss": 1.1752, + "step": 3072 + }, + { + "epoch": 0.18670636126131598, + "grad_norm": 0.228794127702713, + "learning_rate": 9.186204370701486e-05, + "loss": 1.1413, + "step": 3073 + }, + { + "epoch": 0.18676711829394252, + "grad_norm": 1.022456407546997, + "learning_rate": 9.185680821946501e-05, + "loss": 1.3289, + "step": 3074 + }, + { + "epoch": 0.18682787532656905, + "grad_norm": 0.15229488909244537, + "learning_rate": 9.185157119764575e-05, + "loss": 1.1194, + "step": 3075 + }, + { + "epoch": 0.1868886323591956, + "grad_norm": 0.33429181575775146, + "learning_rate": 9.184633264174907e-05, + "loss": 1.2345, + "step": 3076 + }, + { + "epoch": 0.1869493893918221, + "grad_norm": 0.31576576828956604, + "learning_rate": 9.184109255196696e-05, + "loss": 1.1097, + "step": 3077 + }, + { + "epoch": 0.18701014642444863, + "grad_norm": 0.5714771747589111, + "learning_rate": 9.183585092849153e-05, + "loss": 1.1388, + "step": 3078 + }, + { + "epoch": 0.18707090345707517, + "grad_norm": 0.32460519671440125, + "learning_rate": 9.183060777151489e-05, + "loss": 1.0879, + "step": 3079 + }, + { + "epoch": 0.18713166048970167, + "grad_norm": 0.2623949944972992, + "learning_rate": 9.182536308122925e-05, + "loss": 1.0947, + "step": 3080 + }, + { + "epoch": 0.1871924175223282, + "grad_norm": 0.1441493034362793, + "learning_rate": 9.182011685782683e-05, + "loss": 1.0689, + "step": 3081 + }, + { + "epoch": 0.18725317455495474, + "grad_norm": 0.32306981086730957, + "learning_rate": 9.181486910149994e-05, + "loss": 1.0787, + "step": 3082 + }, + { + "epoch": 0.18731393158758125, + "grad_norm": 0.19291339814662933, + "learning_rate": 9.180961981244094e-05, + "loss": 1.1086, + "step": 3083 + }, + { + "epoch": 0.18737468862020779, + "grad_norm": 0.19064386188983917, + "learning_rate": 9.180436899084225e-05, + "loss": 1.171, + "step": 3084 + }, + { + "epoch": 0.18743544565283432, + "grad_norm": 0.2563016712665558, + "learning_rate": 9.179911663689633e-05, + "loss": 1.1988, + "step": 3085 + }, + { + "epoch": 0.18749620268546086, + "grad_norm": 0.2070801556110382, + "learning_rate": 9.179386275079571e-05, + "loss": 1.2355, + "step": 3086 + }, + { + "epoch": 0.18755695971808736, + "grad_norm": 0.18820656836032867, + "learning_rate": 9.178860733273298e-05, + "loss": 1.1071, + "step": 3087 + }, + { + "epoch": 0.1876177167507139, + "grad_norm": 0.20887188613414764, + "learning_rate": 9.178335038290075e-05, + "loss": 1.107, + "step": 3088 + }, + { + "epoch": 0.18767847378334043, + "grad_norm": 0.18751654028892517, + "learning_rate": 9.177809190149176e-05, + "loss": 1.1717, + "step": 3089 + }, + { + "epoch": 0.18773923081596694, + "grad_norm": 0.19147314131259918, + "learning_rate": 9.177283188869873e-05, + "loss": 1.0788, + "step": 3090 + }, + { + "epoch": 0.18779998784859347, + "grad_norm": 0.3001182973384857, + "learning_rate": 9.176757034471447e-05, + "loss": 1.1573, + "step": 3091 + }, + { + "epoch": 0.18786074488122, + "grad_norm": 0.1793118417263031, + "learning_rate": 9.176230726973185e-05, + "loss": 1.1376, + "step": 3092 + }, + { + "epoch": 0.18792150191384652, + "grad_norm": 0.2062307745218277, + "learning_rate": 9.175704266394378e-05, + "loss": 1.1352, + "step": 3093 + }, + { + "epoch": 0.18798225894647305, + "grad_norm": 0.19472725689411163, + "learning_rate": 9.175177652754323e-05, + "loss": 1.0824, + "step": 3094 + }, + { + "epoch": 0.1880430159790996, + "grad_norm": 0.1955190896987915, + "learning_rate": 9.174650886072325e-05, + "loss": 1.165, + "step": 3095 + }, + { + "epoch": 0.1881037730117261, + "grad_norm": 0.1639154702425003, + "learning_rate": 9.174123966367691e-05, + "loss": 1.2194, + "step": 3096 + }, + { + "epoch": 0.18816453004435263, + "grad_norm": 0.3118985593318939, + "learning_rate": 9.173596893659738e-05, + "loss": 1.2413, + "step": 3097 + }, + { + "epoch": 0.18822528707697916, + "grad_norm": 0.16718478500843048, + "learning_rate": 9.173069667967785e-05, + "loss": 1.1035, + "step": 3098 + }, + { + "epoch": 0.1882860441096057, + "grad_norm": 0.2691071629524231, + "learning_rate": 9.172542289311153e-05, + "loss": 1.0747, + "step": 3099 + }, + { + "epoch": 0.1883468011422322, + "grad_norm": 0.2464882880449295, + "learning_rate": 9.172014757709179e-05, + "loss": 1.1528, + "step": 3100 + }, + { + "epoch": 0.18840755817485874, + "grad_norm": 0.472747802734375, + "learning_rate": 9.171487073181198e-05, + "loss": 1.1142, + "step": 3101 + }, + { + "epoch": 0.18846831520748528, + "grad_norm": 0.2910997271537781, + "learning_rate": 9.17095923574655e-05, + "loss": 1.0531, + "step": 3102 + }, + { + "epoch": 0.18852907224011178, + "grad_norm": 0.21663829684257507, + "learning_rate": 9.170431245424587e-05, + "loss": 1.0939, + "step": 3103 + }, + { + "epoch": 0.18858982927273832, + "grad_norm": 0.17455355823040009, + "learning_rate": 9.169903102234659e-05, + "loss": 1.1838, + "step": 3104 + }, + { + "epoch": 0.18865058630536485, + "grad_norm": 0.2848408818244934, + "learning_rate": 9.169374806196127e-05, + "loss": 1.1643, + "step": 3105 + }, + { + "epoch": 0.18871134333799136, + "grad_norm": 0.21605317294597626, + "learning_rate": 9.168846357328357e-05, + "loss": 1.1257, + "step": 3106 + }, + { + "epoch": 0.1887721003706179, + "grad_norm": 0.3147536516189575, + "learning_rate": 9.168317755650717e-05, + "loss": 1.3585, + "step": 3107 + }, + { + "epoch": 0.18883285740324443, + "grad_norm": 0.37438318133354187, + "learning_rate": 9.167789001182584e-05, + "loss": 1.1881, + "step": 3108 + }, + { + "epoch": 0.18889361443587097, + "grad_norm": 0.26369765400886536, + "learning_rate": 9.16726009394334e-05, + "loss": 1.0964, + "step": 3109 + }, + { + "epoch": 0.18895437146849747, + "grad_norm": 0.15647491812705994, + "learning_rate": 9.166731033952372e-05, + "loss": 1.1093, + "step": 3110 + }, + { + "epoch": 0.189015128501124, + "grad_norm": 0.15289811789989471, + "learning_rate": 9.166201821229072e-05, + "loss": 1.0353, + "step": 3111 + }, + { + "epoch": 0.18907588553375054, + "grad_norm": 0.2987431287765503, + "learning_rate": 9.165672455792838e-05, + "loss": 1.0674, + "step": 3112 + }, + { + "epoch": 0.18913664256637705, + "grad_norm": 0.21291735768318176, + "learning_rate": 9.165142937663076e-05, + "loss": 1.118, + "step": 3113 + }, + { + "epoch": 0.18919739959900359, + "grad_norm": 0.3091643452644348, + "learning_rate": 9.164613266859193e-05, + "loss": 1.2288, + "step": 3114 + }, + { + "epoch": 0.18925815663163012, + "grad_norm": 0.22075144946575165, + "learning_rate": 9.164083443400609e-05, + "loss": 1.053, + "step": 3115 + }, + { + "epoch": 0.18931891366425663, + "grad_norm": 0.3806934356689453, + "learning_rate": 9.163553467306738e-05, + "loss": 1.191, + "step": 3116 + }, + { + "epoch": 0.18937967069688316, + "grad_norm": 0.37007635831832886, + "learning_rate": 9.163023338597011e-05, + "loss": 1.1147, + "step": 3117 + }, + { + "epoch": 0.1894404277295097, + "grad_norm": 0.1943754106760025, + "learning_rate": 9.16249305729086e-05, + "loss": 1.1602, + "step": 3118 + }, + { + "epoch": 0.1895011847621362, + "grad_norm": 0.16483408212661743, + "learning_rate": 9.16196262340772e-05, + "loss": 1.0826, + "step": 3119 + }, + { + "epoch": 0.18956194179476274, + "grad_norm": 0.2626393437385559, + "learning_rate": 9.161432036967036e-05, + "loss": 1.0773, + "step": 3120 + }, + { + "epoch": 0.18962269882738927, + "grad_norm": 0.29525965452194214, + "learning_rate": 9.160901297988256e-05, + "loss": 1.1494, + "step": 3121 + }, + { + "epoch": 0.1896834558600158, + "grad_norm": 0.22029602527618408, + "learning_rate": 9.160370406490834e-05, + "loss": 1.111, + "step": 3122 + }, + { + "epoch": 0.18974421289264232, + "grad_norm": 0.21798256039619446, + "learning_rate": 9.159839362494232e-05, + "loss": 1.196, + "step": 3123 + }, + { + "epoch": 0.18980496992526885, + "grad_norm": 2.241982936859131, + "learning_rate": 9.159308166017915e-05, + "loss": 1.1281, + "step": 3124 + }, + { + "epoch": 0.1898657269578954, + "grad_norm": 0.2661694288253784, + "learning_rate": 9.158776817081351e-05, + "loss": 1.1396, + "step": 3125 + }, + { + "epoch": 0.1899264839905219, + "grad_norm": 0.17097768187522888, + "learning_rate": 9.158245315704019e-05, + "loss": 1.0994, + "step": 3126 + }, + { + "epoch": 0.18998724102314843, + "grad_norm": 0.16301696002483368, + "learning_rate": 9.157713661905402e-05, + "loss": 1.0794, + "step": 3127 + }, + { + "epoch": 0.19004799805577496, + "grad_norm": 0.24085086584091187, + "learning_rate": 9.157181855704988e-05, + "loss": 1.1211, + "step": 3128 + }, + { + "epoch": 0.19010875508840147, + "grad_norm": 0.18063335120677948, + "learning_rate": 9.15664989712227e-05, + "loss": 1.1099, + "step": 3129 + }, + { + "epoch": 0.190169512121028, + "grad_norm": 0.18649516999721527, + "learning_rate": 9.156117786176743e-05, + "loss": 1.1835, + "step": 3130 + }, + { + "epoch": 0.19023026915365454, + "grad_norm": 0.20546311140060425, + "learning_rate": 9.155585522887917e-05, + "loss": 1.1672, + "step": 3131 + }, + { + "epoch": 0.19029102618628108, + "grad_norm": 0.19135670363903046, + "learning_rate": 9.155053107275301e-05, + "loss": 1.0869, + "step": 3132 + }, + { + "epoch": 0.19035178321890758, + "grad_norm": 0.1522771567106247, + "learning_rate": 9.15452053935841e-05, + "loss": 1.0961, + "step": 3133 + }, + { + "epoch": 0.19041254025153412, + "grad_norm": 0.1502159982919693, + "learning_rate": 9.153987819156766e-05, + "loss": 1.0879, + "step": 3134 + }, + { + "epoch": 0.19047329728416065, + "grad_norm": 0.21839258074760437, + "learning_rate": 9.153454946689895e-05, + "loss": 1.0473, + "step": 3135 + }, + { + "epoch": 0.19053405431678716, + "grad_norm": 0.16287802159786224, + "learning_rate": 9.152921921977329e-05, + "loss": 1.1494, + "step": 3136 + }, + { + "epoch": 0.1905948113494137, + "grad_norm": 0.20858292281627655, + "learning_rate": 9.15238874503861e-05, + "loss": 1.129, + "step": 3137 + }, + { + "epoch": 0.19065556838204023, + "grad_norm": 0.3644711375236511, + "learning_rate": 9.151855415893276e-05, + "loss": 1.0882, + "step": 3138 + }, + { + "epoch": 0.19071632541466674, + "grad_norm": 0.2145756036043167, + "learning_rate": 9.15132193456088e-05, + "loss": 1.0757, + "step": 3139 + }, + { + "epoch": 0.19077708244729327, + "grad_norm": 0.17800205945968628, + "learning_rate": 9.150788301060977e-05, + "loss": 1.1829, + "step": 3140 + }, + { + "epoch": 0.1908378394799198, + "grad_norm": 0.1862039417028427, + "learning_rate": 9.150254515413126e-05, + "loss": 1.1993, + "step": 3141 + }, + { + "epoch": 0.19089859651254631, + "grad_norm": 0.1509667932987213, + "learning_rate": 9.149720577636893e-05, + "loss": 1.1101, + "step": 3142 + }, + { + "epoch": 0.19095935354517285, + "grad_norm": 0.15213097631931305, + "learning_rate": 9.14918648775185e-05, + "loss": 1.1364, + "step": 3143 + }, + { + "epoch": 0.19102011057779938, + "grad_norm": 0.20119433104991913, + "learning_rate": 9.148652245777576e-05, + "loss": 1.0713, + "step": 3144 + }, + { + "epoch": 0.19108086761042592, + "grad_norm": 0.5513084530830383, + "learning_rate": 9.14811785173365e-05, + "loss": 1.339, + "step": 3145 + }, + { + "epoch": 0.19114162464305243, + "grad_norm": 0.15572676062583923, + "learning_rate": 9.147583305639663e-05, + "loss": 1.1066, + "step": 3146 + }, + { + "epoch": 0.19120238167567896, + "grad_norm": 0.21261997520923615, + "learning_rate": 9.14704860751521e-05, + "loss": 1.0429, + "step": 3147 + }, + { + "epoch": 0.1912631387083055, + "grad_norm": 0.3659953474998474, + "learning_rate": 9.146513757379886e-05, + "loss": 1.1101, + "step": 3148 + }, + { + "epoch": 0.191323895740932, + "grad_norm": 0.2075212597846985, + "learning_rate": 9.145978755253299e-05, + "loss": 1.2015, + "step": 3149 + }, + { + "epoch": 0.19138465277355854, + "grad_norm": 1.9673792123794556, + "learning_rate": 9.14544360115506e-05, + "loss": 1.0626, + "step": 3150 + }, + { + "epoch": 0.19144540980618507, + "grad_norm": 0.2159821093082428, + "learning_rate": 9.144908295104786e-05, + "loss": 1.1379, + "step": 3151 + }, + { + "epoch": 0.19150616683881158, + "grad_norm": 0.2122391015291214, + "learning_rate": 9.144372837122095e-05, + "loss": 1.0993, + "step": 3152 + }, + { + "epoch": 0.19156692387143812, + "grad_norm": 0.2374678999185562, + "learning_rate": 9.143837227226617e-05, + "loss": 1.2314, + "step": 3153 + }, + { + "epoch": 0.19162768090406465, + "grad_norm": 0.16333802044391632, + "learning_rate": 9.143301465437986e-05, + "loss": 1.0771, + "step": 3154 + }, + { + "epoch": 0.19168843793669116, + "grad_norm": 0.31628501415252686, + "learning_rate": 9.142765551775838e-05, + "loss": 1.1685, + "step": 3155 + }, + { + "epoch": 0.1917491949693177, + "grad_norm": 0.20512613654136658, + "learning_rate": 9.142229486259817e-05, + "loss": 1.2663, + "step": 3156 + }, + { + "epoch": 0.19180995200194423, + "grad_norm": 0.3468573987483978, + "learning_rate": 9.141693268909575e-05, + "loss": 1.0842, + "step": 3157 + }, + { + "epoch": 0.19187070903457076, + "grad_norm": 1.7431859970092773, + "learning_rate": 9.141156899744764e-05, + "loss": 1.1773, + "step": 3158 + }, + { + "epoch": 0.19193146606719727, + "grad_norm": 0.31735095381736755, + "learning_rate": 9.140620378785047e-05, + "loss": 1.1777, + "step": 3159 + }, + { + "epoch": 0.1919922230998238, + "grad_norm": 0.19960574805736542, + "learning_rate": 9.14008370605009e-05, + "loss": 1.1418, + "step": 3160 + }, + { + "epoch": 0.19205298013245034, + "grad_norm": 0.14055337011814117, + "learning_rate": 9.139546881559564e-05, + "loss": 1.0975, + "step": 3161 + }, + { + "epoch": 0.19211373716507685, + "grad_norm": 0.13902729749679565, + "learning_rate": 9.139009905333147e-05, + "loss": 1.049, + "step": 3162 + }, + { + "epoch": 0.19217449419770338, + "grad_norm": 0.19508013129234314, + "learning_rate": 9.138472777390523e-05, + "loss": 1.1961, + "step": 3163 + }, + { + "epoch": 0.19223525123032992, + "grad_norm": 0.2648068964481354, + "learning_rate": 9.137935497751378e-05, + "loss": 1.1785, + "step": 3164 + }, + { + "epoch": 0.19229600826295642, + "grad_norm": 1.4964137077331543, + "learning_rate": 9.137398066435409e-05, + "loss": 1.1217, + "step": 3165 + }, + { + "epoch": 0.19235676529558296, + "grad_norm": 0.1764449030160904, + "learning_rate": 9.136860483462312e-05, + "loss": 1.0715, + "step": 3166 + }, + { + "epoch": 0.1924175223282095, + "grad_norm": 0.13910436630249023, + "learning_rate": 9.136322748851796e-05, + "loss": 1.0677, + "step": 3167 + }, + { + "epoch": 0.19247827936083603, + "grad_norm": 0.23355336487293243, + "learning_rate": 9.135784862623569e-05, + "loss": 1.0591, + "step": 3168 + }, + { + "epoch": 0.19253903639346254, + "grad_norm": 0.16057777404785156, + "learning_rate": 9.13524682479735e-05, + "loss": 1.1288, + "step": 3169 + }, + { + "epoch": 0.19259979342608907, + "grad_norm": 0.19113057851791382, + "learning_rate": 9.134708635392858e-05, + "loss": 1.1947, + "step": 3170 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 0.2551184296607971, + "learning_rate": 9.134170294429822e-05, + "loss": 1.1336, + "step": 3171 + }, + { + "epoch": 0.19272130749134211, + "grad_norm": 0.2899314761161804, + "learning_rate": 9.133631801927975e-05, + "loss": 1.1371, + "step": 3172 + }, + { + "epoch": 0.19278206452396865, + "grad_norm": 0.20496691763401031, + "learning_rate": 9.133093157907055e-05, + "loss": 1.17, + "step": 3173 + }, + { + "epoch": 0.19284282155659518, + "grad_norm": 0.20662060379981995, + "learning_rate": 9.132554362386807e-05, + "loss": 1.2085, + "step": 3174 + }, + { + "epoch": 0.1929035785892217, + "grad_norm": 0.18490777909755707, + "learning_rate": 9.132015415386979e-05, + "loss": 1.047, + "step": 3175 + }, + { + "epoch": 0.19296433562184823, + "grad_norm": 0.25164735317230225, + "learning_rate": 9.131476316927327e-05, + "loss": 1.2809, + "step": 3176 + }, + { + "epoch": 0.19302509265447476, + "grad_norm": 0.16715030372142792, + "learning_rate": 9.130937067027615e-05, + "loss": 1.1962, + "step": 3177 + }, + { + "epoch": 0.19308584968710127, + "grad_norm": 0.46896255016326904, + "learning_rate": 9.130397665707603e-05, + "loss": 1.1489, + "step": 3178 + }, + { + "epoch": 0.1931466067197278, + "grad_norm": 0.1492065042257309, + "learning_rate": 9.129858112987066e-05, + "loss": 1.0597, + "step": 3179 + }, + { + "epoch": 0.19320736375235434, + "grad_norm": 0.24866871535778046, + "learning_rate": 9.129318408885784e-05, + "loss": 1.0984, + "step": 3180 + }, + { + "epoch": 0.19326812078498087, + "grad_norm": 0.2036585658788681, + "learning_rate": 9.128778553423536e-05, + "loss": 1.1276, + "step": 3181 + }, + { + "epoch": 0.19332887781760738, + "grad_norm": 0.3559352457523346, + "learning_rate": 9.128238546620114e-05, + "loss": 1.188, + "step": 3182 + }, + { + "epoch": 0.19338963485023392, + "grad_norm": 0.22386854887008667, + "learning_rate": 9.127698388495307e-05, + "loss": 1.0944, + "step": 3183 + }, + { + "epoch": 0.19345039188286045, + "grad_norm": 0.17664191126823425, + "learning_rate": 9.12715807906892e-05, + "loss": 1.1793, + "step": 3184 + }, + { + "epoch": 0.19351114891548696, + "grad_norm": 0.27878689765930176, + "learning_rate": 9.126617618360756e-05, + "loss": 1.2249, + "step": 3185 + }, + { + "epoch": 0.1935719059481135, + "grad_norm": 0.1589423269033432, + "learning_rate": 9.126077006390623e-05, + "loss": 1.0777, + "step": 3186 + }, + { + "epoch": 0.19363266298074003, + "grad_norm": 0.1803324818611145, + "learning_rate": 9.125536243178342e-05, + "loss": 1.1675, + "step": 3187 + }, + { + "epoch": 0.19369342001336654, + "grad_norm": 0.22235427796840668, + "learning_rate": 9.124995328743732e-05, + "loss": 1.2067, + "step": 3188 + }, + { + "epoch": 0.19375417704599307, + "grad_norm": 0.15766115486621857, + "learning_rate": 9.124454263106622e-05, + "loss": 1.1238, + "step": 3189 + }, + { + "epoch": 0.1938149340786196, + "grad_norm": 0.2703237235546112, + "learning_rate": 9.123913046286841e-05, + "loss": 1.2034, + "step": 3190 + }, + { + "epoch": 0.19387569111124614, + "grad_norm": 0.13763552904129028, + "learning_rate": 9.123371678304232e-05, + "loss": 1.1076, + "step": 3191 + }, + { + "epoch": 0.19393644814387265, + "grad_norm": 0.1967121958732605, + "learning_rate": 9.122830159178636e-05, + "loss": 1.1278, + "step": 3192 + }, + { + "epoch": 0.19399720517649918, + "grad_norm": 0.23657777905464172, + "learning_rate": 9.122288488929906e-05, + "loss": 1.0981, + "step": 3193 + }, + { + "epoch": 0.19405796220912572, + "grad_norm": 1.0357747077941895, + "learning_rate": 9.121746667577893e-05, + "loss": 1.0994, + "step": 3194 + }, + { + "epoch": 0.19411871924175222, + "grad_norm": 0.15961074829101562, + "learning_rate": 9.12120469514246e-05, + "loss": 1.1181, + "step": 3195 + }, + { + "epoch": 0.19417947627437876, + "grad_norm": 0.23469798266887665, + "learning_rate": 9.12066257164347e-05, + "loss": 1.1791, + "step": 3196 + }, + { + "epoch": 0.1942402333070053, + "grad_norm": 0.3526883125305176, + "learning_rate": 9.120120297100799e-05, + "loss": 1.163, + "step": 3197 + }, + { + "epoch": 0.1943009903396318, + "grad_norm": 1.5761868953704834, + "learning_rate": 9.119577871534322e-05, + "loss": 1.0717, + "step": 3198 + }, + { + "epoch": 0.19436174737225834, + "grad_norm": 0.41292497515678406, + "learning_rate": 9.119035294963921e-05, + "loss": 1.13, + "step": 3199 + }, + { + "epoch": 0.19442250440488487, + "grad_norm": 0.2322758138179779, + "learning_rate": 9.118492567409486e-05, + "loss": 1.1806, + "step": 3200 + }, + { + "epoch": 0.19448326143751138, + "grad_norm": 0.30504292249679565, + "learning_rate": 9.117949688890909e-05, + "loss": 1.2634, + "step": 3201 + }, + { + "epoch": 0.1945440184701379, + "grad_norm": 0.2333499789237976, + "learning_rate": 9.11740665942809e-05, + "loss": 1.0816, + "step": 3202 + }, + { + "epoch": 0.19460477550276445, + "grad_norm": 0.2496776282787323, + "learning_rate": 9.116863479040936e-05, + "loss": 1.0963, + "step": 3203 + }, + { + "epoch": 0.19466553253539098, + "grad_norm": 0.1488950103521347, + "learning_rate": 9.116320147749354e-05, + "loss": 1.1188, + "step": 3204 + }, + { + "epoch": 0.1947262895680175, + "grad_norm": 0.1364702731370926, + "learning_rate": 9.11577666557326e-05, + "loss": 1.0807, + "step": 3205 + }, + { + "epoch": 0.19478704660064403, + "grad_norm": 0.2220604419708252, + "learning_rate": 9.115233032532578e-05, + "loss": 1.0772, + "step": 3206 + }, + { + "epoch": 0.19484780363327056, + "grad_norm": 0.1982111632823944, + "learning_rate": 9.114689248647235e-05, + "loss": 1.1418, + "step": 3207 + }, + { + "epoch": 0.19490856066589707, + "grad_norm": 0.16218845546245575, + "learning_rate": 9.114145313937161e-05, + "loss": 1.0943, + "step": 3208 + }, + { + "epoch": 0.1949693176985236, + "grad_norm": 0.19595873355865479, + "learning_rate": 9.113601228422294e-05, + "loss": 1.2117, + "step": 3209 + }, + { + "epoch": 0.19503007473115014, + "grad_norm": 0.20046143233776093, + "learning_rate": 9.11305699212258e-05, + "loss": 1.1831, + "step": 3210 + }, + { + "epoch": 0.19509083176377665, + "grad_norm": 0.19938664138317108, + "learning_rate": 9.112512605057968e-05, + "loss": 1.1029, + "step": 3211 + }, + { + "epoch": 0.19515158879640318, + "grad_norm": 0.29923155903816223, + "learning_rate": 9.111968067248411e-05, + "loss": 1.2037, + "step": 3212 + }, + { + "epoch": 0.19521234582902972, + "grad_norm": 0.12634147703647614, + "learning_rate": 9.111423378713868e-05, + "loss": 1.0746, + "step": 3213 + }, + { + "epoch": 0.19527310286165625, + "grad_norm": 0.14267168939113617, + "learning_rate": 9.110878539474308e-05, + "loss": 1.1066, + "step": 3214 + }, + { + "epoch": 0.19533385989428276, + "grad_norm": 0.18030843138694763, + "learning_rate": 9.1103335495497e-05, + "loss": 1.1082, + "step": 3215 + }, + { + "epoch": 0.1953946169269093, + "grad_norm": 0.12984958291053772, + "learning_rate": 9.10978840896002e-05, + "loss": 1.0789, + "step": 3216 + }, + { + "epoch": 0.19545537395953583, + "grad_norm": 0.4353140592575073, + "learning_rate": 9.109243117725254e-05, + "loss": 1.1851, + "step": 3217 + }, + { + "epoch": 0.19551613099216233, + "grad_norm": 0.2476755976676941, + "learning_rate": 9.108697675865385e-05, + "loss": 1.3506, + "step": 3218 + }, + { + "epoch": 0.19557688802478887, + "grad_norm": 0.19675281643867493, + "learning_rate": 9.10815208340041e-05, + "loss": 1.1011, + "step": 3219 + }, + { + "epoch": 0.1956376450574154, + "grad_norm": 0.1628730148077011, + "learning_rate": 9.107606340350325e-05, + "loss": 1.152, + "step": 3220 + }, + { + "epoch": 0.1956984020900419, + "grad_norm": 0.3081006705760956, + "learning_rate": 9.107060446735138e-05, + "loss": 1.0779, + "step": 3221 + }, + { + "epoch": 0.19575915912266845, + "grad_norm": 0.15976488590240479, + "learning_rate": 9.106514402574854e-05, + "loss": 1.0751, + "step": 3222 + }, + { + "epoch": 0.19581991615529498, + "grad_norm": 0.16874565184116364, + "learning_rate": 9.105968207889492e-05, + "loss": 1.0956, + "step": 3223 + }, + { + "epoch": 0.1958806731879215, + "grad_norm": 0.17898161709308624, + "learning_rate": 9.105421862699071e-05, + "loss": 1.1343, + "step": 3224 + }, + { + "epoch": 0.19594143022054802, + "grad_norm": 0.16883991658687592, + "learning_rate": 9.104875367023619e-05, + "loss": 1.1453, + "step": 3225 + }, + { + "epoch": 0.19600218725317456, + "grad_norm": 0.3435705006122589, + "learning_rate": 9.104328720883166e-05, + "loss": 1.1495, + "step": 3226 + }, + { + "epoch": 0.1960629442858011, + "grad_norm": 0.16361819207668304, + "learning_rate": 9.103781924297751e-05, + "loss": 1.1403, + "step": 3227 + }, + { + "epoch": 0.1961237013184276, + "grad_norm": 0.25067466497421265, + "learning_rate": 9.103234977287416e-05, + "loss": 1.1475, + "step": 3228 + }, + { + "epoch": 0.19618445835105414, + "grad_norm": 1.0418776273727417, + "learning_rate": 9.10268787987221e-05, + "loss": 1.1273, + "step": 3229 + }, + { + "epoch": 0.19624521538368067, + "grad_norm": 0.17021557688713074, + "learning_rate": 9.102140632072187e-05, + "loss": 1.0359, + "step": 3230 + }, + { + "epoch": 0.19630597241630718, + "grad_norm": 0.24031531810760498, + "learning_rate": 9.101593233907405e-05, + "loss": 1.092, + "step": 3231 + }, + { + "epoch": 0.1963667294489337, + "grad_norm": 5.0789055824279785, + "learning_rate": 9.101045685397931e-05, + "loss": 1.0906, + "step": 3232 + }, + { + "epoch": 0.19642748648156025, + "grad_norm": 0.26952677965164185, + "learning_rate": 9.100497986563833e-05, + "loss": 1.1626, + "step": 3233 + }, + { + "epoch": 0.19648824351418676, + "grad_norm": 0.2004709094762802, + "learning_rate": 9.099950137425193e-05, + "loss": 1.3217, + "step": 3234 + }, + { + "epoch": 0.1965490005468133, + "grad_norm": 0.17830704152584076, + "learning_rate": 9.099402138002084e-05, + "loss": 1.0553, + "step": 3235 + }, + { + "epoch": 0.19660975757943983, + "grad_norm": 0.2199990600347519, + "learning_rate": 9.098853988314599e-05, + "loss": 1.0785, + "step": 3236 + }, + { + "epoch": 0.19667051461206636, + "grad_norm": 0.20517028868198395, + "learning_rate": 9.098305688382829e-05, + "loss": 1.1831, + "step": 3237 + }, + { + "epoch": 0.19673127164469287, + "grad_norm": 0.30997705459594727, + "learning_rate": 9.097757238226872e-05, + "loss": 1.1353, + "step": 3238 + }, + { + "epoch": 0.1967920286773194, + "grad_norm": 0.19933241605758667, + "learning_rate": 9.097208637866829e-05, + "loss": 1.1661, + "step": 3239 + }, + { + "epoch": 0.19685278570994594, + "grad_norm": 0.16712698340415955, + "learning_rate": 9.096659887322813e-05, + "loss": 1.0909, + "step": 3240 + }, + { + "epoch": 0.19691354274257244, + "grad_norm": 2.340427875518799, + "learning_rate": 9.096110986614938e-05, + "loss": 1.0552, + "step": 3241 + }, + { + "epoch": 0.19697429977519898, + "grad_norm": 42.843692779541016, + "learning_rate": 9.095561935763321e-05, + "loss": 1.1362, + "step": 3242 + }, + { + "epoch": 0.19703505680782551, + "grad_norm": 0.3771587312221527, + "learning_rate": 9.095012734788093e-05, + "loss": 1.1772, + "step": 3243 + }, + { + "epoch": 0.19709581384045202, + "grad_norm": 0.3147752285003662, + "learning_rate": 9.094463383709379e-05, + "loss": 1.1349, + "step": 3244 + }, + { + "epoch": 0.19715657087307856, + "grad_norm": 0.31446409225463867, + "learning_rate": 9.093913882547318e-05, + "loss": 1.0958, + "step": 3245 + }, + { + "epoch": 0.1972173279057051, + "grad_norm": 0.3560512661933899, + "learning_rate": 9.093364231322052e-05, + "loss": 1.0741, + "step": 3246 + }, + { + "epoch": 0.1972780849383316, + "grad_norm": 0.16011124849319458, + "learning_rate": 9.09281443005373e-05, + "loss": 1.0644, + "step": 3247 + }, + { + "epoch": 0.19733884197095813, + "grad_norm": 0.21780221164226532, + "learning_rate": 9.092264478762505e-05, + "loss": 1.0863, + "step": 3248 + }, + { + "epoch": 0.19739959900358467, + "grad_norm": 0.2751162052154541, + "learning_rate": 9.091714377468532e-05, + "loss": 1.1093, + "step": 3249 + }, + { + "epoch": 0.1974603560362112, + "grad_norm": 0.17367391288280487, + "learning_rate": 9.09116412619198e-05, + "loss": 1.115, + "step": 3250 + }, + { + "epoch": 0.1975211130688377, + "grad_norm": 1.4182554483413696, + "learning_rate": 9.090613724953014e-05, + "loss": 1.256, + "step": 3251 + }, + { + "epoch": 0.19758187010146425, + "grad_norm": 0.3456888794898987, + "learning_rate": 9.090063173771813e-05, + "loss": 1.1735, + "step": 3252 + }, + { + "epoch": 0.19764262713409078, + "grad_norm": 0.21287770569324493, + "learning_rate": 9.089512472668554e-05, + "loss": 1.064, + "step": 3253 + }, + { + "epoch": 0.1977033841667173, + "grad_norm": 0.49738410115242004, + "learning_rate": 9.088961621663425e-05, + "loss": 1.1764, + "step": 3254 + }, + { + "epoch": 0.19776414119934382, + "grad_norm": 0.19693298637866974, + "learning_rate": 9.08841062077662e-05, + "loss": 1.2348, + "step": 3255 + }, + { + "epoch": 0.19782489823197036, + "grad_norm": 0.1314329206943512, + "learning_rate": 9.08785947002833e-05, + "loss": 1.1, + "step": 3256 + }, + { + "epoch": 0.19788565526459687, + "grad_norm": 0.23774775862693787, + "learning_rate": 9.087308169438765e-05, + "loss": 1.1191, + "step": 3257 + }, + { + "epoch": 0.1979464122972234, + "grad_norm": 0.25573259592056274, + "learning_rate": 9.086756719028124e-05, + "loss": 1.1807, + "step": 3258 + }, + { + "epoch": 0.19800716932984994, + "grad_norm": 0.5008425116539001, + "learning_rate": 9.086205118816628e-05, + "loss": 1.2152, + "step": 3259 + }, + { + "epoch": 0.19806792636247644, + "grad_norm": 0.19056855142116547, + "learning_rate": 9.085653368824494e-05, + "loss": 1.1101, + "step": 3260 + }, + { + "epoch": 0.19812868339510298, + "grad_norm": 1.883497714996338, + "learning_rate": 9.085101469071945e-05, + "loss": 1.0974, + "step": 3261 + }, + { + "epoch": 0.1981894404277295, + "grad_norm": 0.18575043976306915, + "learning_rate": 9.084549419579213e-05, + "loss": 1.075, + "step": 3262 + }, + { + "epoch": 0.19825019746035605, + "grad_norm": 0.1883113831281662, + "learning_rate": 9.083997220366533e-05, + "loss": 1.1047, + "step": 3263 + }, + { + "epoch": 0.19831095449298256, + "grad_norm": 0.1844778209924698, + "learning_rate": 9.083444871454144e-05, + "loss": 1.1149, + "step": 3264 + }, + { + "epoch": 0.1983717115256091, + "grad_norm": 0.16545872390270233, + "learning_rate": 9.082892372862295e-05, + "loss": 1.1239, + "step": 3265 + }, + { + "epoch": 0.19843246855823563, + "grad_norm": 0.21504627168178558, + "learning_rate": 9.082339724611236e-05, + "loss": 1.0891, + "step": 3266 + }, + { + "epoch": 0.19849322559086213, + "grad_norm": 0.2592552602291107, + "learning_rate": 9.081786926721226e-05, + "loss": 1.0709, + "step": 3267 + }, + { + "epoch": 0.19855398262348867, + "grad_norm": 0.3487328290939331, + "learning_rate": 9.081233979212526e-05, + "loss": 1.1722, + "step": 3268 + }, + { + "epoch": 0.1986147396561152, + "grad_norm": 0.28977230191230774, + "learning_rate": 9.080680882105406e-05, + "loss": 1.1047, + "step": 3269 + }, + { + "epoch": 0.1986754966887417, + "grad_norm": 0.4293149411678314, + "learning_rate": 9.080127635420142e-05, + "loss": 1.1267, + "step": 3270 + }, + { + "epoch": 0.19873625372136824, + "grad_norm": 0.18525612354278564, + "learning_rate": 9.079574239177008e-05, + "loss": 1.1272, + "step": 3271 + }, + { + "epoch": 0.19879701075399478, + "grad_norm": 0.25222426652908325, + "learning_rate": 9.079020693396292e-05, + "loss": 1.134, + "step": 3272 + }, + { + "epoch": 0.19885776778662131, + "grad_norm": 0.2567189931869507, + "learning_rate": 9.078466998098285e-05, + "loss": 1.1383, + "step": 3273 + }, + { + "epoch": 0.19891852481924782, + "grad_norm": 0.560010552406311, + "learning_rate": 9.077913153303282e-05, + "loss": 1.2302, + "step": 3274 + }, + { + "epoch": 0.19897928185187436, + "grad_norm": 0.2820112109184265, + "learning_rate": 9.077359159031582e-05, + "loss": 1.1325, + "step": 3275 + }, + { + "epoch": 0.1990400388845009, + "grad_norm": 0.15497444570064545, + "learning_rate": 9.076805015303496e-05, + "loss": 1.0117, + "step": 3276 + }, + { + "epoch": 0.1991007959171274, + "grad_norm": 0.21076422929763794, + "learning_rate": 9.076250722139333e-05, + "loss": 1.1599, + "step": 3277 + }, + { + "epoch": 0.19916155294975393, + "grad_norm": 0.2797802984714508, + "learning_rate": 9.075696279559414e-05, + "loss": 1.2243, + "step": 3278 + }, + { + "epoch": 0.19922230998238047, + "grad_norm": 0.2437891662120819, + "learning_rate": 9.075141687584057e-05, + "loss": 1.1717, + "step": 3279 + }, + { + "epoch": 0.19928306701500698, + "grad_norm": 0.7717027068138123, + "learning_rate": 9.074586946233595e-05, + "loss": 1.1687, + "step": 3280 + }, + { + "epoch": 0.1993438240476335, + "grad_norm": 0.2633930742740631, + "learning_rate": 9.074032055528361e-05, + "loss": 1.1992, + "step": 3281 + }, + { + "epoch": 0.19940458108026005, + "grad_norm": 0.2726195454597473, + "learning_rate": 9.073477015488696e-05, + "loss": 1.1327, + "step": 3282 + }, + { + "epoch": 0.19946533811288655, + "grad_norm": 0.1584884077310562, + "learning_rate": 9.072921826134942e-05, + "loss": 1.1177, + "step": 3283 + }, + { + "epoch": 0.1995260951455131, + "grad_norm": 0.25239813327789307, + "learning_rate": 9.072366487487451e-05, + "loss": 1.0695, + "step": 3284 + }, + { + "epoch": 0.19958685217813962, + "grad_norm": 0.18151557445526123, + "learning_rate": 9.07181099956658e-05, + "loss": 1.1048, + "step": 3285 + }, + { + "epoch": 0.19964760921076616, + "grad_norm": 0.526943027973175, + "learning_rate": 9.071255362392688e-05, + "loss": 1.114, + "step": 3286 + }, + { + "epoch": 0.19970836624339267, + "grad_norm": 0.28988730907440186, + "learning_rate": 9.070699575986146e-05, + "loss": 1.1565, + "step": 3287 + }, + { + "epoch": 0.1997691232760192, + "grad_norm": 0.19109733402729034, + "learning_rate": 9.070143640367323e-05, + "loss": 1.1704, + "step": 3288 + }, + { + "epoch": 0.19982988030864574, + "grad_norm": 0.7349984049797058, + "learning_rate": 9.069587555556598e-05, + "loss": 1.2812, + "step": 3289 + }, + { + "epoch": 0.19989063734127224, + "grad_norm": 0.22742979228496552, + "learning_rate": 9.069031321574354e-05, + "loss": 1.1273, + "step": 3290 + }, + { + "epoch": 0.19995139437389878, + "grad_norm": 0.22230419516563416, + "learning_rate": 9.068474938440983e-05, + "loss": 1.108, + "step": 3291 + }, + { + "epoch": 0.2000121514065253, + "grad_norm": 0.3051619529724121, + "learning_rate": 9.067918406176875e-05, + "loss": 1.1005, + "step": 3292 + }, + { + "epoch": 0.20007290843915182, + "grad_norm": 0.1708410084247589, + "learning_rate": 9.06736172480243e-05, + "loss": 1.1002, + "step": 3293 + }, + { + "epoch": 0.20013366547177835, + "grad_norm": 0.1780860424041748, + "learning_rate": 9.066804894338056e-05, + "loss": 1.0511, + "step": 3294 + }, + { + "epoch": 0.2001944225044049, + "grad_norm": 0.18345783650875092, + "learning_rate": 9.066247914804164e-05, + "loss": 1.0988, + "step": 3295 + }, + { + "epoch": 0.20025517953703142, + "grad_norm": 0.2013816386461258, + "learning_rate": 9.065690786221168e-05, + "loss": 1.2174, + "step": 3296 + }, + { + "epoch": 0.20031593656965793, + "grad_norm": 0.17284561693668365, + "learning_rate": 9.06513350860949e-05, + "loss": 1.0817, + "step": 3297 + }, + { + "epoch": 0.20037669360228447, + "grad_norm": 0.14359714090824127, + "learning_rate": 9.064576081989557e-05, + "loss": 1.0861, + "step": 3298 + }, + { + "epoch": 0.200437450634911, + "grad_norm": 0.7734746932983398, + "learning_rate": 9.064018506381802e-05, + "loss": 1.1187, + "step": 3299 + }, + { + "epoch": 0.2004982076675375, + "grad_norm": 0.15058156847953796, + "learning_rate": 9.063460781806663e-05, + "loss": 1.1217, + "step": 3300 + }, + { + "epoch": 0.20055896470016404, + "grad_norm": 0.2474808543920517, + "learning_rate": 9.062902908284585e-05, + "loss": 1.1728, + "step": 3301 + }, + { + "epoch": 0.20061972173279058, + "grad_norm": 0.14751644432544708, + "learning_rate": 9.062344885836014e-05, + "loss": 1.1342, + "step": 3302 + }, + { + "epoch": 0.2006804787654171, + "grad_norm": 0.2249361276626587, + "learning_rate": 9.061786714481406e-05, + "loss": 1.1267, + "step": 3303 + }, + { + "epoch": 0.20074123579804362, + "grad_norm": 0.23503537476062775, + "learning_rate": 9.061228394241222e-05, + "loss": 1.1269, + "step": 3304 + }, + { + "epoch": 0.20080199283067016, + "grad_norm": 0.24838240444660187, + "learning_rate": 9.060669925135925e-05, + "loss": 1.0512, + "step": 3305 + }, + { + "epoch": 0.20086274986329666, + "grad_norm": 0.17550477385520935, + "learning_rate": 9.060111307185988e-05, + "loss": 1.0657, + "step": 3306 + }, + { + "epoch": 0.2009235068959232, + "grad_norm": 0.2066640853881836, + "learning_rate": 9.059552540411886e-05, + "loss": 1.1623, + "step": 3307 + }, + { + "epoch": 0.20098426392854973, + "grad_norm": 0.16343830525875092, + "learning_rate": 9.0589936248341e-05, + "loss": 1.1079, + "step": 3308 + }, + { + "epoch": 0.20104502096117627, + "grad_norm": 0.33369284868240356, + "learning_rate": 9.058434560473119e-05, + "loss": 1.0754, + "step": 3309 + }, + { + "epoch": 0.20110577799380278, + "grad_norm": 0.4190661311149597, + "learning_rate": 9.057875347349434e-05, + "loss": 1.125, + "step": 3310 + }, + { + "epoch": 0.2011665350264293, + "grad_norm": 1.5997530221939087, + "learning_rate": 9.057315985483545e-05, + "loss": 1.1197, + "step": 3311 + }, + { + "epoch": 0.20122729205905585, + "grad_norm": 0.1344272345304489, + "learning_rate": 9.056756474895954e-05, + "loss": 1.0992, + "step": 3312 + }, + { + "epoch": 0.20128804909168235, + "grad_norm": 0.14829017221927643, + "learning_rate": 9.05619681560717e-05, + "loss": 1.0901, + "step": 3313 + }, + { + "epoch": 0.2013488061243089, + "grad_norm": 0.20510458946228027, + "learning_rate": 9.055637007637707e-05, + "loss": 1.1626, + "step": 3314 + }, + { + "epoch": 0.20140956315693542, + "grad_norm": 0.16008488833904266, + "learning_rate": 9.055077051008087e-05, + "loss": 1.0767, + "step": 3315 + }, + { + "epoch": 0.20147032018956193, + "grad_norm": 0.23861490190029144, + "learning_rate": 9.054516945738833e-05, + "loss": 1.1874, + "step": 3316 + }, + { + "epoch": 0.20153107722218846, + "grad_norm": 0.16540761291980743, + "learning_rate": 9.053956691850476e-05, + "loss": 1.0869, + "step": 3317 + }, + { + "epoch": 0.201591834254815, + "grad_norm": 0.3213448226451874, + "learning_rate": 9.053396289363553e-05, + "loss": 1.4313, + "step": 3318 + }, + { + "epoch": 0.20165259128744153, + "grad_norm": 0.22906824946403503, + "learning_rate": 9.052835738298606e-05, + "loss": 1.1131, + "step": 3319 + }, + { + "epoch": 0.20171334832006804, + "grad_norm": 0.19157810509204865, + "learning_rate": 9.052275038676183e-05, + "loss": 1.057, + "step": 3320 + }, + { + "epoch": 0.20177410535269458, + "grad_norm": 0.31121575832366943, + "learning_rate": 9.051714190516834e-05, + "loss": 1.2415, + "step": 3321 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 0.4245741367340088, + "learning_rate": 9.051153193841118e-05, + "loss": 1.318, + "step": 3322 + }, + { + "epoch": 0.20189561941794762, + "grad_norm": 0.30429720878601074, + "learning_rate": 9.050592048669597e-05, + "loss": 1.0931, + "step": 3323 + }, + { + "epoch": 0.20195637645057415, + "grad_norm": 0.16724011301994324, + "learning_rate": 9.050030755022844e-05, + "loss": 1.103, + "step": 3324 + }, + { + "epoch": 0.2020171334832007, + "grad_norm": 0.16439297795295715, + "learning_rate": 9.04946931292143e-05, + "loss": 1.085, + "step": 3325 + }, + { + "epoch": 0.2020778905158272, + "grad_norm": 0.24683915078639984, + "learning_rate": 9.048907722385935e-05, + "loss": 1.2018, + "step": 3326 + }, + { + "epoch": 0.20213864754845373, + "grad_norm": 0.16863733530044556, + "learning_rate": 9.048345983436946e-05, + "loss": 1.1387, + "step": 3327 + }, + { + "epoch": 0.20219940458108027, + "grad_norm": 0.9390217065811157, + "learning_rate": 9.047784096095052e-05, + "loss": 1.1491, + "step": 3328 + }, + { + "epoch": 0.20226016161370677, + "grad_norm": 0.2639472484588623, + "learning_rate": 9.047222060380849e-05, + "loss": 1.0758, + "step": 3329 + }, + { + "epoch": 0.2023209186463333, + "grad_norm": 0.16340938210487366, + "learning_rate": 9.046659876314938e-05, + "loss": 1.0856, + "step": 3330 + }, + { + "epoch": 0.20238167567895984, + "grad_norm": 0.2741747498512268, + "learning_rate": 9.046097543917928e-05, + "loss": 1.1686, + "step": 3331 + }, + { + "epoch": 0.20244243271158638, + "grad_norm": 0.22821323573589325, + "learning_rate": 9.045535063210432e-05, + "loss": 1.1587, + "step": 3332 + }, + { + "epoch": 0.20250318974421289, + "grad_norm": 4.2999491691589355, + "learning_rate": 9.044972434213064e-05, + "loss": 1.1362, + "step": 3333 + }, + { + "epoch": 0.20256394677683942, + "grad_norm": 0.3759039044380188, + "learning_rate": 9.04440965694645e-05, + "loss": 1.2306, + "step": 3334 + }, + { + "epoch": 0.20262470380946596, + "grad_norm": 1.0817079544067383, + "learning_rate": 9.04384673143122e-05, + "loss": 1.1433, + "step": 3335 + }, + { + "epoch": 0.20268546084209246, + "grad_norm": 0.2255803495645523, + "learning_rate": 9.043283657688005e-05, + "loss": 1.0925, + "step": 3336 + }, + { + "epoch": 0.202746217874719, + "grad_norm": 0.2970569431781769, + "learning_rate": 9.042720435737447e-05, + "loss": 1.156, + "step": 3337 + }, + { + "epoch": 0.20280697490734553, + "grad_norm": 0.1731918901205063, + "learning_rate": 9.042157065600188e-05, + "loss": 1.0979, + "step": 3338 + }, + { + "epoch": 0.20286773193997204, + "grad_norm": 0.19859391450881958, + "learning_rate": 9.041593547296883e-05, + "loss": 1.0874, + "step": 3339 + }, + { + "epoch": 0.20292848897259858, + "grad_norm": 0.21392711997032166, + "learning_rate": 9.041029880848183e-05, + "loss": 1.2161, + "step": 3340 + }, + { + "epoch": 0.2029892460052251, + "grad_norm": 0.1796914041042328, + "learning_rate": 9.040466066274753e-05, + "loss": 1.1418, + "step": 3341 + }, + { + "epoch": 0.20305000303785165, + "grad_norm": 0.2005152553319931, + "learning_rate": 9.039902103597258e-05, + "loss": 1.1152, + "step": 3342 + }, + { + "epoch": 0.20311076007047815, + "grad_norm": 0.18333251774311066, + "learning_rate": 9.03933799283637e-05, + "loss": 1.1705, + "step": 3343 + }, + { + "epoch": 0.2031715171031047, + "grad_norm": 0.5630664229393005, + "learning_rate": 9.038773734012768e-05, + "loss": 1.0914, + "step": 3344 + }, + { + "epoch": 0.20323227413573122, + "grad_norm": 0.35749906301498413, + "learning_rate": 9.038209327147133e-05, + "loss": 1.2042, + "step": 3345 + }, + { + "epoch": 0.20329303116835773, + "grad_norm": 0.18422846496105194, + "learning_rate": 9.037644772260155e-05, + "loss": 1.108, + "step": 3346 + }, + { + "epoch": 0.20335378820098426, + "grad_norm": 0.179338276386261, + "learning_rate": 9.037080069372527e-05, + "loss": 1.1074, + "step": 3347 + }, + { + "epoch": 0.2034145452336108, + "grad_norm": 0.22576642036437988, + "learning_rate": 9.03651521850495e-05, + "loss": 1.134, + "step": 3348 + }, + { + "epoch": 0.2034753022662373, + "grad_norm": 0.1988999992609024, + "learning_rate": 9.035950219678126e-05, + "loss": 1.1069, + "step": 3349 + }, + { + "epoch": 0.20353605929886384, + "grad_norm": 0.44660013914108276, + "learning_rate": 9.035385072912768e-05, + "loss": 1.264, + "step": 3350 + }, + { + "epoch": 0.20359681633149038, + "grad_norm": 0.31285372376441956, + "learning_rate": 9.034819778229588e-05, + "loss": 1.2506, + "step": 3351 + }, + { + "epoch": 0.20365757336411688, + "grad_norm": 0.16578587889671326, + "learning_rate": 9.034254335649311e-05, + "loss": 1.0862, + "step": 3352 + }, + { + "epoch": 0.20371833039674342, + "grad_norm": 0.14541122317314148, + "learning_rate": 9.033688745192661e-05, + "loss": 1.0819, + "step": 3353 + }, + { + "epoch": 0.20377908742936995, + "grad_norm": 0.17592382431030273, + "learning_rate": 9.03312300688037e-05, + "loss": 1.0943, + "step": 3354 + }, + { + "epoch": 0.2038398444619965, + "grad_norm": 0.2473861128091812, + "learning_rate": 9.032557120733176e-05, + "loss": 1.1806, + "step": 3355 + }, + { + "epoch": 0.203900601494623, + "grad_norm": 0.22891579568386078, + "learning_rate": 9.031991086771823e-05, + "loss": 1.1701, + "step": 3356 + }, + { + "epoch": 0.20396135852724953, + "grad_norm": 0.1464105099439621, + "learning_rate": 9.031424905017054e-05, + "loss": 1.0917, + "step": 3357 + }, + { + "epoch": 0.20402211555987607, + "grad_norm": 0.18087196350097656, + "learning_rate": 9.030858575489628e-05, + "loss": 1.1303, + "step": 3358 + }, + { + "epoch": 0.20408287259250257, + "grad_norm": 0.14956440031528473, + "learning_rate": 9.030292098210302e-05, + "loss": 1.0915, + "step": 3359 + }, + { + "epoch": 0.2041436296251291, + "grad_norm": 0.2575131356716156, + "learning_rate": 9.029725473199839e-05, + "loss": 1.1665, + "step": 3360 + }, + { + "epoch": 0.20420438665775564, + "grad_norm": 0.17843805253505707, + "learning_rate": 9.02915870047901e-05, + "loss": 1.0882, + "step": 3361 + }, + { + "epoch": 0.20426514369038215, + "grad_norm": 0.20931629836559296, + "learning_rate": 9.02859178006859e-05, + "loss": 1.1664, + "step": 3362 + }, + { + "epoch": 0.20432590072300869, + "grad_norm": 0.21923457086086273, + "learning_rate": 9.02802471198936e-05, + "loss": 1.0871, + "step": 3363 + }, + { + "epoch": 0.20438665775563522, + "grad_norm": 0.4388730823993683, + "learning_rate": 9.027457496262104e-05, + "loss": 1.1021, + "step": 3364 + }, + { + "epoch": 0.20444741478826173, + "grad_norm": 0.23835933208465576, + "learning_rate": 9.026890132907618e-05, + "loss": 1.1708, + "step": 3365 + }, + { + "epoch": 0.20450817182088826, + "grad_norm": 0.2694539427757263, + "learning_rate": 9.026322621946692e-05, + "loss": 1.0946, + "step": 3366 + }, + { + "epoch": 0.2045689288535148, + "grad_norm": 1.1554162502288818, + "learning_rate": 9.025754963400134e-05, + "loss": 1.1365, + "step": 3367 + }, + { + "epoch": 0.20462968588614133, + "grad_norm": 0.1602601557970047, + "learning_rate": 9.02518715728875e-05, + "loss": 1.11, + "step": 3368 + }, + { + "epoch": 0.20469044291876784, + "grad_norm": 0.2515504062175751, + "learning_rate": 9.024619203633353e-05, + "loss": 1.1114, + "step": 3369 + }, + { + "epoch": 0.20475119995139437, + "grad_norm": 0.2663159966468811, + "learning_rate": 9.02405110245476e-05, + "loss": 1.144, + "step": 3370 + }, + { + "epoch": 0.2048119569840209, + "grad_norm": 0.1740141659975052, + "learning_rate": 9.023482853773797e-05, + "loss": 1.0782, + "step": 3371 + }, + { + "epoch": 0.20487271401664742, + "grad_norm": 1.3628073930740356, + "learning_rate": 9.022914457611293e-05, + "loss": 1.0954, + "step": 3372 + }, + { + "epoch": 0.20493347104927395, + "grad_norm": 1.5277669429779053, + "learning_rate": 9.02234591398808e-05, + "loss": 1.1113, + "step": 3373 + }, + { + "epoch": 0.2049942280819005, + "grad_norm": 0.22233106195926666, + "learning_rate": 9.021777222925001e-05, + "loss": 1.1034, + "step": 3374 + }, + { + "epoch": 0.205054985114527, + "grad_norm": 0.18932431936264038, + "learning_rate": 9.021208384442901e-05, + "loss": 1.1093, + "step": 3375 + }, + { + "epoch": 0.20511574214715353, + "grad_norm": 0.28633913397789, + "learning_rate": 9.020639398562629e-05, + "loss": 1.138, + "step": 3376 + }, + { + "epoch": 0.20517649917978006, + "grad_norm": 0.2889212965965271, + "learning_rate": 9.020070265305043e-05, + "loss": 1.0839, + "step": 3377 + }, + { + "epoch": 0.2052372562124066, + "grad_norm": 0.2777705192565918, + "learning_rate": 9.019500984691004e-05, + "loss": 1.3459, + "step": 3378 + }, + { + "epoch": 0.2052980132450331, + "grad_norm": 0.26010599732398987, + "learning_rate": 9.018931556741381e-05, + "loss": 1.1585, + "step": 3379 + }, + { + "epoch": 0.20535877027765964, + "grad_norm": 0.2716197669506073, + "learning_rate": 9.018361981477045e-05, + "loss": 1.0921, + "step": 3380 + }, + { + "epoch": 0.20541952731028618, + "grad_norm": 0.14544996619224548, + "learning_rate": 9.017792258918873e-05, + "loss": 1.1272, + "step": 3381 + }, + { + "epoch": 0.20548028434291268, + "grad_norm": 0.1978648602962494, + "learning_rate": 9.01722238908775e-05, + "loss": 1.1856, + "step": 3382 + }, + { + "epoch": 0.20554104137553922, + "grad_norm": 0.18912170827388763, + "learning_rate": 9.016652372004562e-05, + "loss": 1.0779, + "step": 3383 + }, + { + "epoch": 0.20560179840816575, + "grad_norm": 0.20028752088546753, + "learning_rate": 9.016082207690207e-05, + "loss": 1.118, + "step": 3384 + }, + { + "epoch": 0.20566255544079226, + "grad_norm": 2.3095862865448, + "learning_rate": 9.01551189616558e-05, + "loss": 1.1394, + "step": 3385 + }, + { + "epoch": 0.2057233124734188, + "grad_norm": 0.21716797351837158, + "learning_rate": 9.01494143745159e-05, + "loss": 1.1877, + "step": 3386 + }, + { + "epoch": 0.20578406950604533, + "grad_norm": 0.15557648241519928, + "learning_rate": 9.014370831569144e-05, + "loss": 1.1201, + "step": 3387 + }, + { + "epoch": 0.20584482653867184, + "grad_norm": 0.1555710881948471, + "learning_rate": 9.01380007853916e-05, + "loss": 1.0736, + "step": 3388 + }, + { + "epoch": 0.20590558357129837, + "grad_norm": 0.18824075162410736, + "learning_rate": 9.01322917838256e-05, + "loss": 1.0712, + "step": 3389 + }, + { + "epoch": 0.2059663406039249, + "grad_norm": 0.26053574681282043, + "learning_rate": 9.012658131120266e-05, + "loss": 1.0993, + "step": 3390 + }, + { + "epoch": 0.20602709763655144, + "grad_norm": 0.210182785987854, + "learning_rate": 9.012086936773214e-05, + "loss": 1.0759, + "step": 3391 + }, + { + "epoch": 0.20608785466917795, + "grad_norm": 0.17331752181053162, + "learning_rate": 9.011515595362342e-05, + "loss": 1.0844, + "step": 3392 + }, + { + "epoch": 0.20614861170180449, + "grad_norm": 0.14416228234767914, + "learning_rate": 9.010944106908586e-05, + "loss": 1.0983, + "step": 3393 + }, + { + "epoch": 0.20620936873443102, + "grad_norm": 0.3085445463657379, + "learning_rate": 9.010372471432902e-05, + "loss": 1.0516, + "step": 3394 + }, + { + "epoch": 0.20627012576705753, + "grad_norm": 0.1775248795747757, + "learning_rate": 9.009800688956237e-05, + "loss": 1.1444, + "step": 3395 + }, + { + "epoch": 0.20633088279968406, + "grad_norm": 0.24787196516990662, + "learning_rate": 9.009228759499555e-05, + "loss": 1.0822, + "step": 3396 + }, + { + "epoch": 0.2063916398323106, + "grad_norm": 0.4156123399734497, + "learning_rate": 9.008656683083816e-05, + "loss": 1.0482, + "step": 3397 + }, + { + "epoch": 0.2064523968649371, + "grad_norm": 0.3085361123085022, + "learning_rate": 9.008084459729993e-05, + "loss": 1.1109, + "step": 3398 + }, + { + "epoch": 0.20651315389756364, + "grad_norm": 0.22662469744682312, + "learning_rate": 9.007512089459058e-05, + "loss": 1.1664, + "step": 3399 + }, + { + "epoch": 0.20657391093019017, + "grad_norm": 0.24231384694576263, + "learning_rate": 9.006939572291994e-05, + "loss": 1.0466, + "step": 3400 + }, + { + "epoch": 0.2066346679628167, + "grad_norm": 0.1650387942790985, + "learning_rate": 9.006366908249784e-05, + "loss": 1.1078, + "step": 3401 + }, + { + "epoch": 0.20669542499544322, + "grad_norm": 0.3309670388698578, + "learning_rate": 9.005794097353422e-05, + "loss": 1.0796, + "step": 3402 + }, + { + "epoch": 0.20675618202806975, + "grad_norm": 0.2354893833398819, + "learning_rate": 9.005221139623903e-05, + "loss": 1.1447, + "step": 3403 + }, + { + "epoch": 0.2068169390606963, + "grad_norm": 0.25864624977111816, + "learning_rate": 9.004648035082228e-05, + "loss": 1.1428, + "step": 3404 + }, + { + "epoch": 0.2068776960933228, + "grad_norm": 0.2769456207752228, + "learning_rate": 9.004074783749405e-05, + "loss": 1.2156, + "step": 3405 + }, + { + "epoch": 0.20693845312594933, + "grad_norm": 0.3436751365661621, + "learning_rate": 9.003501385646449e-05, + "loss": 1.1811, + "step": 3406 + }, + { + "epoch": 0.20699921015857586, + "grad_norm": 0.18414154648780823, + "learning_rate": 9.002927840794374e-05, + "loss": 1.0763, + "step": 3407 + }, + { + "epoch": 0.20705996719120237, + "grad_norm": 0.2883095443248749, + "learning_rate": 9.002354149214205e-05, + "loss": 1.1309, + "step": 3408 + }, + { + "epoch": 0.2071207242238289, + "grad_norm": 0.8748995661735535, + "learning_rate": 9.001780310926972e-05, + "loss": 1.1212, + "step": 3409 + }, + { + "epoch": 0.20718148125645544, + "grad_norm": 0.16728021204471588, + "learning_rate": 9.001206325953709e-05, + "loss": 1.0571, + "step": 3410 + }, + { + "epoch": 0.20724223828908195, + "grad_norm": 0.16911764442920685, + "learning_rate": 9.000632194315453e-05, + "loss": 1.1535, + "step": 3411 + }, + { + "epoch": 0.20730299532170848, + "grad_norm": 0.23415647447109222, + "learning_rate": 9.00005791603325e-05, + "loss": 1.1268, + "step": 3412 + }, + { + "epoch": 0.20736375235433502, + "grad_norm": 0.1691472828388214, + "learning_rate": 8.999483491128153e-05, + "loss": 1.0652, + "step": 3413 + }, + { + "epoch": 0.20742450938696155, + "grad_norm": 0.1758832484483719, + "learning_rate": 8.998908919621214e-05, + "loss": 1.1133, + "step": 3414 + }, + { + "epoch": 0.20748526641958806, + "grad_norm": 0.301696240901947, + "learning_rate": 8.998334201533495e-05, + "loss": 1.1943, + "step": 3415 + }, + { + "epoch": 0.2075460234522146, + "grad_norm": 0.34288668632507324, + "learning_rate": 8.997759336886066e-05, + "loss": 1.188, + "step": 3416 + }, + { + "epoch": 0.20760678048484113, + "grad_norm": 0.6763338446617126, + "learning_rate": 8.997184325699994e-05, + "loss": 1.0859, + "step": 3417 + }, + { + "epoch": 0.20766753751746764, + "grad_norm": 0.39041468501091003, + "learning_rate": 8.996609167996357e-05, + "loss": 1.1999, + "step": 3418 + }, + { + "epoch": 0.20772829455009417, + "grad_norm": 0.14516863226890564, + "learning_rate": 8.99603386379624e-05, + "loss": 1.0651, + "step": 3419 + }, + { + "epoch": 0.2077890515827207, + "grad_norm": 12.497024536132812, + "learning_rate": 8.995458413120727e-05, + "loss": 1.1849, + "step": 3420 + }, + { + "epoch": 0.20784980861534721, + "grad_norm": 0.1711440533399582, + "learning_rate": 8.994882815990917e-05, + "loss": 1.1478, + "step": 3421 + }, + { + "epoch": 0.20791056564797375, + "grad_norm": 0.19265122711658478, + "learning_rate": 8.994307072427902e-05, + "loss": 1.0803, + "step": 3422 + }, + { + "epoch": 0.20797132268060028, + "grad_norm": 0.22108204662799835, + "learning_rate": 8.993731182452791e-05, + "loss": 1.3449, + "step": 3423 + }, + { + "epoch": 0.20803207971322682, + "grad_norm": 0.1360526978969574, + "learning_rate": 8.993155146086691e-05, + "loss": 1.0814, + "step": 3424 + }, + { + "epoch": 0.20809283674585333, + "grad_norm": 0.18838582932949066, + "learning_rate": 8.992578963350717e-05, + "loss": 1.156, + "step": 3425 + }, + { + "epoch": 0.20815359377847986, + "grad_norm": 0.4641799330711365, + "learning_rate": 8.99200263426599e-05, + "loss": 1.167, + "step": 3426 + }, + { + "epoch": 0.2082143508111064, + "grad_norm": 0.1729332059621811, + "learning_rate": 8.991426158853635e-05, + "loss": 1.0738, + "step": 3427 + }, + { + "epoch": 0.2082751078437329, + "grad_norm": 0.1480157971382141, + "learning_rate": 8.990849537134782e-05, + "loss": 1.1109, + "step": 3428 + }, + { + "epoch": 0.20833586487635944, + "grad_norm": 0.20335061848163605, + "learning_rate": 8.990272769130567e-05, + "loss": 1.2222, + "step": 3429 + }, + { + "epoch": 0.20839662190898597, + "grad_norm": 0.18276669085025787, + "learning_rate": 8.989695854862132e-05, + "loss": 1.0947, + "step": 3430 + }, + { + "epoch": 0.20845737894161248, + "grad_norm": 3.883676290512085, + "learning_rate": 8.989118794350625e-05, + "loss": 1.2349, + "step": 3431 + }, + { + "epoch": 0.20851813597423902, + "grad_norm": 0.2777820825576782, + "learning_rate": 8.9885415876172e-05, + "loss": 1.1044, + "step": 3432 + }, + { + "epoch": 0.20857889300686555, + "grad_norm": 0.1472485512495041, + "learning_rate": 8.987964234683009e-05, + "loss": 1.0502, + "step": 3433 + }, + { + "epoch": 0.20863965003949206, + "grad_norm": 0.16455063223838806, + "learning_rate": 8.98738673556922e-05, + "loss": 1.102, + "step": 3434 + }, + { + "epoch": 0.2087004070721186, + "grad_norm": 0.2353052943944931, + "learning_rate": 8.986809090296998e-05, + "loss": 1.1555, + "step": 3435 + }, + { + "epoch": 0.20876116410474513, + "grad_norm": 0.14562679827213287, + "learning_rate": 8.98623129888752e-05, + "loss": 1.0827, + "step": 3436 + }, + { + "epoch": 0.20882192113737166, + "grad_norm": 0.2758413851261139, + "learning_rate": 8.985653361361961e-05, + "loss": 1.0943, + "step": 3437 + }, + { + "epoch": 0.20888267816999817, + "grad_norm": 0.14613857865333557, + "learning_rate": 8.98507527774151e-05, + "loss": 1.279, + "step": 3438 + }, + { + "epoch": 0.2089434352026247, + "grad_norm": 0.11982880532741547, + "learning_rate": 8.98449704804735e-05, + "loss": 1.0714, + "step": 3439 + }, + { + "epoch": 0.20900419223525124, + "grad_norm": 0.9307172894477844, + "learning_rate": 8.983918672300687e-05, + "loss": 1.0974, + "step": 3440 + }, + { + "epoch": 0.20906494926787775, + "grad_norm": 0.23882558941841125, + "learning_rate": 8.98334015052271e-05, + "loss": 1.2063, + "step": 3441 + }, + { + "epoch": 0.20912570630050428, + "grad_norm": 0.14072297513484955, + "learning_rate": 8.982761482734632e-05, + "loss": 1.0889, + "step": 3442 + }, + { + "epoch": 0.20918646333313082, + "grad_norm": 0.15262438356876373, + "learning_rate": 8.982182668957661e-05, + "loss": 1.082, + "step": 3443 + }, + { + "epoch": 0.20924722036575732, + "grad_norm": 0.2728678584098816, + "learning_rate": 8.981603709213014e-05, + "loss": 1.1356, + "step": 3444 + }, + { + "epoch": 0.20930797739838386, + "grad_norm": 0.2701585292816162, + "learning_rate": 8.981024603521914e-05, + "loss": 1.2097, + "step": 3445 + }, + { + "epoch": 0.2093687344310104, + "grad_norm": 0.142396479845047, + "learning_rate": 8.980445351905588e-05, + "loss": 1.071, + "step": 3446 + }, + { + "epoch": 0.20942949146363693, + "grad_norm": 0.13167715072631836, + "learning_rate": 8.979865954385269e-05, + "loss": 1.2422, + "step": 3447 + }, + { + "epoch": 0.20949024849626344, + "grad_norm": 0.3174823224544525, + "learning_rate": 8.979286410982194e-05, + "loss": 1.1203, + "step": 3448 + }, + { + "epoch": 0.20955100552888997, + "grad_norm": 0.20211239159107208, + "learning_rate": 8.978706721717605e-05, + "loss": 1.1399, + "step": 3449 + }, + { + "epoch": 0.2096117625615165, + "grad_norm": 0.16633735597133636, + "learning_rate": 8.978126886612754e-05, + "loss": 1.1647, + "step": 3450 + }, + { + "epoch": 0.20967251959414301, + "grad_norm": 0.24114300310611725, + "learning_rate": 8.977546905688891e-05, + "loss": 1.085, + "step": 3451 + }, + { + "epoch": 0.20973327662676955, + "grad_norm": 0.3945278823375702, + "learning_rate": 8.976966778967279e-05, + "loss": 1.0882, + "step": 3452 + }, + { + "epoch": 0.20979403365939608, + "grad_norm": 0.33712682127952576, + "learning_rate": 8.976386506469178e-05, + "loss": 1.2284, + "step": 3453 + }, + { + "epoch": 0.2098547906920226, + "grad_norm": 0.2718818187713623, + "learning_rate": 8.975806088215864e-05, + "loss": 1.1495, + "step": 3454 + }, + { + "epoch": 0.20991554772464913, + "grad_norm": 0.15609952807426453, + "learning_rate": 8.975225524228609e-05, + "loss": 1.1036, + "step": 3455 + }, + { + "epoch": 0.20997630475727566, + "grad_norm": 0.1752764880657196, + "learning_rate": 8.974644814528694e-05, + "loss": 1.1146, + "step": 3456 + }, + { + "epoch": 0.21003706178990217, + "grad_norm": 0.21618975698947906, + "learning_rate": 8.974063959137405e-05, + "loss": 1.065, + "step": 3457 + }, + { + "epoch": 0.2100978188225287, + "grad_norm": 0.20082253217697144, + "learning_rate": 8.973482958076032e-05, + "loss": 1.0912, + "step": 3458 + }, + { + "epoch": 0.21015857585515524, + "grad_norm": 0.13558422029018402, + "learning_rate": 8.972901811365875e-05, + "loss": 1.083, + "step": 3459 + }, + { + "epoch": 0.21021933288778177, + "grad_norm": 17.55464744567871, + "learning_rate": 8.972320519028232e-05, + "loss": 1.1911, + "step": 3460 + }, + { + "epoch": 0.21028008992040828, + "grad_norm": 0.1939544528722763, + "learning_rate": 8.971739081084414e-05, + "loss": 1.0666, + "step": 3461 + }, + { + "epoch": 0.21034084695303482, + "grad_norm": 0.2793985605239868, + "learning_rate": 8.971157497555733e-05, + "loss": 1.2245, + "step": 3462 + }, + { + "epoch": 0.21040160398566135, + "grad_norm": 0.6017389893531799, + "learning_rate": 8.970575768463505e-05, + "loss": 1.278, + "step": 3463 + }, + { + "epoch": 0.21046236101828786, + "grad_norm": 12.897541999816895, + "learning_rate": 8.969993893829054e-05, + "loss": 1.1187, + "step": 3464 + }, + { + "epoch": 0.2105231180509144, + "grad_norm": 0.2867054343223572, + "learning_rate": 8.969411873673712e-05, + "loss": 1.1324, + "step": 3465 + }, + { + "epoch": 0.21058387508354093, + "grad_norm": 0.1979219615459442, + "learning_rate": 8.968829708018809e-05, + "loss": 1.1689, + "step": 3466 + }, + { + "epoch": 0.21064463211616744, + "grad_norm": 0.3768339157104492, + "learning_rate": 8.968247396885685e-05, + "loss": 1.0789, + "step": 3467 + }, + { + "epoch": 0.21070538914879397, + "grad_norm": 0.17232266068458557, + "learning_rate": 8.967664940295685e-05, + "loss": 1.1094, + "step": 3468 + }, + { + "epoch": 0.2107661461814205, + "grad_norm": 0.17694714665412903, + "learning_rate": 8.967082338270163e-05, + "loss": 1.1441, + "step": 3469 + }, + { + "epoch": 0.210826903214047, + "grad_norm": 0.35034406185150146, + "learning_rate": 8.966499590830468e-05, + "loss": 1.1235, + "step": 3470 + }, + { + "epoch": 0.21088766024667355, + "grad_norm": 0.40152639150619507, + "learning_rate": 8.965916697997966e-05, + "loss": 1.1417, + "step": 3471 + }, + { + "epoch": 0.21094841727930008, + "grad_norm": 0.17416897416114807, + "learning_rate": 8.96533365979402e-05, + "loss": 1.0615, + "step": 3472 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 0.3140276372432709, + "learning_rate": 8.964750476240003e-05, + "loss": 1.1857, + "step": 3473 + }, + { + "epoch": 0.21106993134455312, + "grad_norm": 0.30841904878616333, + "learning_rate": 8.964167147357289e-05, + "loss": 1.0743, + "step": 3474 + }, + { + "epoch": 0.21113068837717966, + "grad_norm": 0.1996213048696518, + "learning_rate": 8.963583673167263e-05, + "loss": 1.1743, + "step": 3475 + }, + { + "epoch": 0.2111914454098062, + "grad_norm": 0.2031165212392807, + "learning_rate": 8.96300005369131e-05, + "loss": 1.1261, + "step": 3476 + }, + { + "epoch": 0.2112522024424327, + "grad_norm": 0.8546181321144104, + "learning_rate": 8.962416288950827e-05, + "loss": 1.3121, + "step": 3477 + }, + { + "epoch": 0.21131295947505924, + "grad_norm": 0.22097034752368927, + "learning_rate": 8.961832378967207e-05, + "loss": 1.0704, + "step": 3478 + }, + { + "epoch": 0.21137371650768577, + "grad_norm": 0.1509115993976593, + "learning_rate": 8.961248323761855e-05, + "loss": 1.0723, + "step": 3479 + }, + { + "epoch": 0.21143447354031228, + "grad_norm": 0.24100449681282043, + "learning_rate": 8.960664123356182e-05, + "loss": 1.1231, + "step": 3480 + }, + { + "epoch": 0.2114952305729388, + "grad_norm": 0.18507619202136993, + "learning_rate": 8.960079777771598e-05, + "loss": 1.1152, + "step": 3481 + }, + { + "epoch": 0.21155598760556535, + "grad_norm": 0.4750421345233917, + "learning_rate": 8.959495287029525e-05, + "loss": 1.0421, + "step": 3482 + }, + { + "epoch": 0.21161674463819188, + "grad_norm": 0.18387670814990997, + "learning_rate": 8.958910651151387e-05, + "loss": 1.0501, + "step": 3483 + }, + { + "epoch": 0.2116775016708184, + "grad_norm": 6.725606441497803, + "learning_rate": 8.958325870158613e-05, + "loss": 1.0825, + "step": 3484 + }, + { + "epoch": 0.21173825870344493, + "grad_norm": 0.3028864562511444, + "learning_rate": 8.95774094407264e-05, + "loss": 1.0771, + "step": 3485 + }, + { + "epoch": 0.21179901573607146, + "grad_norm": 0.15914765000343323, + "learning_rate": 8.957155872914907e-05, + "loss": 1.0584, + "step": 3486 + }, + { + "epoch": 0.21185977276869797, + "grad_norm": 0.2881225645542145, + "learning_rate": 8.956570656706862e-05, + "loss": 1.1153, + "step": 3487 + }, + { + "epoch": 0.2119205298013245, + "grad_norm": 0.3317762315273285, + "learning_rate": 8.955985295469953e-05, + "loss": 1.0941, + "step": 3488 + }, + { + "epoch": 0.21198128683395104, + "grad_norm": 0.33383673429489136, + "learning_rate": 8.95539978922564e-05, + "loss": 1.0717, + "step": 3489 + }, + { + "epoch": 0.21204204386657755, + "grad_norm": 0.16560310125350952, + "learning_rate": 8.954814137995382e-05, + "loss": 1.1082, + "step": 3490 + }, + { + "epoch": 0.21210280089920408, + "grad_norm": 0.2758764624595642, + "learning_rate": 8.954228341800648e-05, + "loss": 1.069, + "step": 3491 + }, + { + "epoch": 0.21216355793183062, + "grad_norm": 0.3274565041065216, + "learning_rate": 8.953642400662909e-05, + "loss": 1.0943, + "step": 3492 + }, + { + "epoch": 0.21222431496445712, + "grad_norm": 0.2620927691459656, + "learning_rate": 8.953056314603644e-05, + "loss": 1.1549, + "step": 3493 + }, + { + "epoch": 0.21228507199708366, + "grad_norm": 0.14102204144001007, + "learning_rate": 8.952470083644336e-05, + "loss": 1.1133, + "step": 3494 + }, + { + "epoch": 0.2123458290297102, + "grad_norm": 0.16233918070793152, + "learning_rate": 8.951883707806472e-05, + "loss": 1.0934, + "step": 3495 + }, + { + "epoch": 0.21240658606233673, + "grad_norm": 0.22172996401786804, + "learning_rate": 8.951297187111547e-05, + "loss": 1.0896, + "step": 3496 + }, + { + "epoch": 0.21246734309496323, + "grad_norm": 2.096376657485962, + "learning_rate": 8.95071052158106e-05, + "loss": 1.1297, + "step": 3497 + }, + { + "epoch": 0.21252810012758977, + "grad_norm": 0.23521722853183746, + "learning_rate": 8.950123711236517e-05, + "loss": 1.0986, + "step": 3498 + }, + { + "epoch": 0.2125888571602163, + "grad_norm": 0.206913024187088, + "learning_rate": 8.949536756099424e-05, + "loss": 1.1535, + "step": 3499 + }, + { + "epoch": 0.2126496141928428, + "grad_norm": 0.5676190257072449, + "learning_rate": 8.948949656191299e-05, + "loss": 1.1095, + "step": 3500 + }, + { + "epoch": 0.21271037122546935, + "grad_norm": 0.23115776479244232, + "learning_rate": 8.94836241153366e-05, + "loss": 1.0747, + "step": 3501 + }, + { + "epoch": 0.21277112825809588, + "grad_norm": 0.14194437861442566, + "learning_rate": 8.947775022148033e-05, + "loss": 1.0772, + "step": 3502 + }, + { + "epoch": 0.2128318852907224, + "grad_norm": 0.2450140416622162, + "learning_rate": 8.94718748805595e-05, + "loss": 1.2935, + "step": 3503 + }, + { + "epoch": 0.21289264232334892, + "grad_norm": 0.2724868357181549, + "learning_rate": 8.946599809278949e-05, + "loss": 1.1514, + "step": 3504 + }, + { + "epoch": 0.21295339935597546, + "grad_norm": 0.23426179587841034, + "learning_rate": 8.946011985838565e-05, + "loss": 1.3099, + "step": 3505 + }, + { + "epoch": 0.213014156388602, + "grad_norm": 0.3953523337841034, + "learning_rate": 8.945424017756352e-05, + "loss": 1.1211, + "step": 3506 + }, + { + "epoch": 0.2130749134212285, + "grad_norm": 0.19598069787025452, + "learning_rate": 8.944835905053858e-05, + "loss": 1.0589, + "step": 3507 + }, + { + "epoch": 0.21313567045385504, + "grad_norm": 0.19311000406742096, + "learning_rate": 8.944247647752642e-05, + "loss": 1.1569, + "step": 3508 + }, + { + "epoch": 0.21319642748648157, + "grad_norm": 2.1531529426574707, + "learning_rate": 8.943659245874266e-05, + "loss": 1.2014, + "step": 3509 + }, + { + "epoch": 0.21325718451910808, + "grad_norm": 0.3676398694515228, + "learning_rate": 8.943070699440295e-05, + "loss": 1.3526, + "step": 3510 + }, + { + "epoch": 0.2133179415517346, + "grad_norm": 0.354640394449234, + "learning_rate": 8.942482008472309e-05, + "loss": 1.1498, + "step": 3511 + }, + { + "epoch": 0.21337869858436115, + "grad_norm": 0.37891027331352234, + "learning_rate": 8.941893172991882e-05, + "loss": 1.073, + "step": 3512 + }, + { + "epoch": 0.21343945561698766, + "grad_norm": 0.418806254863739, + "learning_rate": 8.941304193020599e-05, + "loss": 1.327, + "step": 3513 + }, + { + "epoch": 0.2135002126496142, + "grad_norm": 2.414013624191284, + "learning_rate": 8.940715068580047e-05, + "loss": 1.2974, + "step": 3514 + }, + { + "epoch": 0.21356096968224073, + "grad_norm": 0.2315111607313156, + "learning_rate": 8.940125799691826e-05, + "loss": 1.0818, + "step": 3515 + }, + { + "epoch": 0.21362172671486723, + "grad_norm": 0.25556647777557373, + "learning_rate": 8.93953638637753e-05, + "loss": 1.0616, + "step": 3516 + }, + { + "epoch": 0.21368248374749377, + "grad_norm": 0.17034971714019775, + "learning_rate": 8.938946828658766e-05, + "loss": 1.0675, + "step": 3517 + }, + { + "epoch": 0.2137432407801203, + "grad_norm": 0.3051464557647705, + "learning_rate": 8.938357126557144e-05, + "loss": 1.265, + "step": 3518 + }, + { + "epoch": 0.21380399781274684, + "grad_norm": 0.29688090085983276, + "learning_rate": 8.937767280094281e-05, + "loss": 1.081, + "step": 3519 + }, + { + "epoch": 0.21386475484537334, + "grad_norm": 0.1980140060186386, + "learning_rate": 8.937177289291798e-05, + "loss": 1.0786, + "step": 3520 + }, + { + "epoch": 0.21392551187799988, + "grad_norm": 0.19571496546268463, + "learning_rate": 8.93658715417132e-05, + "loss": 1.0961, + "step": 3521 + }, + { + "epoch": 0.21398626891062641, + "grad_norm": 0.21203705668449402, + "learning_rate": 8.935996874754478e-05, + "loss": 1.2127, + "step": 3522 + }, + { + "epoch": 0.21404702594325292, + "grad_norm": 0.3411729037761688, + "learning_rate": 8.935406451062911e-05, + "loss": 1.2697, + "step": 3523 + }, + { + "epoch": 0.21410778297587946, + "grad_norm": 0.17787259817123413, + "learning_rate": 8.93481588311826e-05, + "loss": 1.103, + "step": 3524 + }, + { + "epoch": 0.214168540008506, + "grad_norm": 0.14897394180297852, + "learning_rate": 8.934225170942172e-05, + "loss": 1.079, + "step": 3525 + }, + { + "epoch": 0.2142292970411325, + "grad_norm": 0.19840256869792938, + "learning_rate": 8.933634314556299e-05, + "loss": 1.1951, + "step": 3526 + }, + { + "epoch": 0.21429005407375903, + "grad_norm": 0.25057125091552734, + "learning_rate": 8.9330433139823e-05, + "loss": 1.0791, + "step": 3527 + }, + { + "epoch": 0.21435081110638557, + "grad_norm": 0.21097703278064728, + "learning_rate": 8.932452169241838e-05, + "loss": 1.1514, + "step": 3528 + }, + { + "epoch": 0.2144115681390121, + "grad_norm": 0.42289677262306213, + "learning_rate": 8.931860880356583e-05, + "loss": 1.1388, + "step": 3529 + }, + { + "epoch": 0.2144723251716386, + "grad_norm": 0.18627946078777313, + "learning_rate": 8.931269447348206e-05, + "loss": 1.1116, + "step": 3530 + }, + { + "epoch": 0.21453308220426515, + "grad_norm": 0.2027835249900818, + "learning_rate": 8.930677870238388e-05, + "loss": 1.126, + "step": 3531 + }, + { + "epoch": 0.21459383923689168, + "grad_norm": 0.5285890102386475, + "learning_rate": 8.930086149048813e-05, + "loss": 1.0493, + "step": 3532 + }, + { + "epoch": 0.2146545962695182, + "grad_norm": 0.22224541008472443, + "learning_rate": 8.92949428380117e-05, + "loss": 1.2482, + "step": 3533 + }, + { + "epoch": 0.21471535330214472, + "grad_norm": 0.22938747704029083, + "learning_rate": 8.928902274517154e-05, + "loss": 1.138, + "step": 3534 + }, + { + "epoch": 0.21477611033477126, + "grad_norm": 0.241376131772995, + "learning_rate": 8.928310121218469e-05, + "loss": 1.1248, + "step": 3535 + }, + { + "epoch": 0.21483686736739777, + "grad_norm": 0.21722251176834106, + "learning_rate": 8.927717823926813e-05, + "loss": 1.139, + "step": 3536 + }, + { + "epoch": 0.2148976244000243, + "grad_norm": 0.6935981512069702, + "learning_rate": 8.927125382663903e-05, + "loss": 1.1276, + "step": 3537 + }, + { + "epoch": 0.21495838143265084, + "grad_norm": 0.16614855825901031, + "learning_rate": 8.926532797451452e-05, + "loss": 1.1664, + "step": 3538 + }, + { + "epoch": 0.21501913846527734, + "grad_norm": 0.21620385348796844, + "learning_rate": 8.925940068311183e-05, + "loss": 1.2537, + "step": 3539 + }, + { + "epoch": 0.21507989549790388, + "grad_norm": 0.21182186901569366, + "learning_rate": 8.925347195264823e-05, + "loss": 1.0652, + "step": 3540 + }, + { + "epoch": 0.2151406525305304, + "grad_norm": 0.15097814798355103, + "learning_rate": 8.924754178334102e-05, + "loss": 1.0744, + "step": 3541 + }, + { + "epoch": 0.21520140956315695, + "grad_norm": 0.16458383202552795, + "learning_rate": 8.924161017540758e-05, + "loss": 1.093, + "step": 3542 + }, + { + "epoch": 0.21526216659578346, + "grad_norm": 0.5575851798057556, + "learning_rate": 8.923567712906534e-05, + "loss": 1.1397, + "step": 3543 + }, + { + "epoch": 0.21532292362841, + "grad_norm": 0.20459336042404175, + "learning_rate": 8.922974264453175e-05, + "loss": 1.1298, + "step": 3544 + }, + { + "epoch": 0.21538368066103653, + "grad_norm": 0.20945602655410767, + "learning_rate": 8.922380672202437e-05, + "loss": 1.2046, + "step": 3545 + }, + { + "epoch": 0.21544443769366303, + "grad_norm": 0.24499641358852386, + "learning_rate": 8.921786936176077e-05, + "loss": 1.1787, + "step": 3546 + }, + { + "epoch": 0.21550519472628957, + "grad_norm": 0.2540135085582733, + "learning_rate": 8.92119305639586e-05, + "loss": 1.1748, + "step": 3547 + }, + { + "epoch": 0.2155659517589161, + "grad_norm": 0.20312140882015228, + "learning_rate": 8.920599032883554e-05, + "loss": 1.0987, + "step": 3548 + }, + { + "epoch": 0.2156267087915426, + "grad_norm": 0.1747300773859024, + "learning_rate": 8.92000486566093e-05, + "loss": 1.0825, + "step": 3549 + }, + { + "epoch": 0.21568746582416914, + "grad_norm": 0.1898849755525589, + "learning_rate": 8.919410554749772e-05, + "loss": 1.1206, + "step": 3550 + }, + { + "epoch": 0.21574822285679568, + "grad_norm": 0.21884649991989136, + "learning_rate": 8.918816100171862e-05, + "loss": 1.1316, + "step": 3551 + }, + { + "epoch": 0.21580897988942221, + "grad_norm": 0.15853926539421082, + "learning_rate": 8.91822150194899e-05, + "loss": 1.1037, + "step": 3552 + }, + { + "epoch": 0.21586973692204872, + "grad_norm": 0.29001468420028687, + "learning_rate": 8.917626760102951e-05, + "loss": 1.1606, + "step": 3553 + }, + { + "epoch": 0.21593049395467526, + "grad_norm": 0.1479199230670929, + "learning_rate": 8.917031874655546e-05, + "loss": 1.0943, + "step": 3554 + }, + { + "epoch": 0.2159912509873018, + "grad_norm": 0.16203229129314423, + "learning_rate": 8.916436845628582e-05, + "loss": 1.1757, + "step": 3555 + }, + { + "epoch": 0.2160520080199283, + "grad_norm": 0.2718237340450287, + "learning_rate": 8.915841673043868e-05, + "loss": 1.1535, + "step": 3556 + }, + { + "epoch": 0.21611276505255483, + "grad_norm": 0.24396225810050964, + "learning_rate": 8.91524635692322e-05, + "loss": 1.1568, + "step": 3557 + }, + { + "epoch": 0.21617352208518137, + "grad_norm": 0.2197808027267456, + "learning_rate": 8.91465089728846e-05, + "loss": 1.14, + "step": 3558 + }, + { + "epoch": 0.21623427911780788, + "grad_norm": 0.35085004568099976, + "learning_rate": 8.914055294161414e-05, + "loss": 1.1216, + "step": 3559 + }, + { + "epoch": 0.2162950361504344, + "grad_norm": 0.2695600986480713, + "learning_rate": 8.913459547563917e-05, + "loss": 1.2923, + "step": 3560 + }, + { + "epoch": 0.21635579318306095, + "grad_norm": 0.2137691229581833, + "learning_rate": 8.9128636575178e-05, + "loss": 1.2772, + "step": 3561 + }, + { + "epoch": 0.21641655021568745, + "grad_norm": 0.2682209312915802, + "learning_rate": 8.912267624044913e-05, + "loss": 1.0875, + "step": 3562 + }, + { + "epoch": 0.216477307248314, + "grad_norm": 2.038889169692993, + "learning_rate": 8.911671447167099e-05, + "loss": 1.2319, + "step": 3563 + }, + { + "epoch": 0.21653806428094052, + "grad_norm": 0.19789953529834747, + "learning_rate": 8.91107512690621e-05, + "loss": 1.0811, + "step": 3564 + }, + { + "epoch": 0.21659882131356706, + "grad_norm": 0.3618752360343933, + "learning_rate": 8.910478663284108e-05, + "loss": 1.1574, + "step": 3565 + }, + { + "epoch": 0.21665957834619357, + "grad_norm": 0.20511986315250397, + "learning_rate": 8.909882056322653e-05, + "loss": 1.1594, + "step": 3566 + }, + { + "epoch": 0.2167203353788201, + "grad_norm": 0.4069550931453705, + "learning_rate": 8.909285306043717e-05, + "loss": 1.1093, + "step": 3567 + }, + { + "epoch": 0.21678109241144664, + "grad_norm": 1.0260331630706787, + "learning_rate": 8.908688412469173e-05, + "loss": 1.0382, + "step": 3568 + }, + { + "epoch": 0.21684184944407314, + "grad_norm": 0.17617836594581604, + "learning_rate": 8.9080913756209e-05, + "loss": 1.1001, + "step": 3569 + }, + { + "epoch": 0.21690260647669968, + "grad_norm": 0.2915533781051636, + "learning_rate": 8.90749419552078e-05, + "loss": 1.0576, + "step": 3570 + }, + { + "epoch": 0.2169633635093262, + "grad_norm": 0.1760842353105545, + "learning_rate": 8.906896872190706e-05, + "loss": 1.1451, + "step": 3571 + }, + { + "epoch": 0.21702412054195272, + "grad_norm": 0.2882809042930603, + "learning_rate": 8.906299405652573e-05, + "loss": 1.1624, + "step": 3572 + }, + { + "epoch": 0.21708487757457925, + "grad_norm": 0.21022208034992218, + "learning_rate": 8.90570179592828e-05, + "loss": 1.0747, + "step": 3573 + }, + { + "epoch": 0.2171456346072058, + "grad_norm": 0.1654147058725357, + "learning_rate": 8.905104043039731e-05, + "loss": 1.0847, + "step": 3574 + }, + { + "epoch": 0.2172063916398323, + "grad_norm": 0.17736396193504333, + "learning_rate": 8.90450614700884e-05, + "loss": 1.0945, + "step": 3575 + }, + { + "epoch": 0.21726714867245883, + "grad_norm": 0.1518404185771942, + "learning_rate": 8.903908107857521e-05, + "loss": 1.1247, + "step": 3576 + }, + { + "epoch": 0.21732790570508537, + "grad_norm": 0.2741888761520386, + "learning_rate": 8.903309925607696e-05, + "loss": 1.088, + "step": 3577 + }, + { + "epoch": 0.2173886627377119, + "grad_norm": 0.7856914401054382, + "learning_rate": 8.90271160028129e-05, + "loss": 1.3599, + "step": 3578 + }, + { + "epoch": 0.2174494197703384, + "grad_norm": 0.5083143711090088, + "learning_rate": 8.902113131900238e-05, + "loss": 1.137, + "step": 3579 + }, + { + "epoch": 0.21751017680296494, + "grad_norm": 0.37476813793182373, + "learning_rate": 8.901514520486473e-05, + "loss": 1.0453, + "step": 3580 + }, + { + "epoch": 0.21757093383559148, + "grad_norm": 0.31943994760513306, + "learning_rate": 8.90091576606194e-05, + "loss": 1.2407, + "step": 3581 + }, + { + "epoch": 0.217631690868218, + "grad_norm": 0.26216375827789307, + "learning_rate": 8.900316868648586e-05, + "loss": 1.1617, + "step": 3582 + }, + { + "epoch": 0.21769244790084452, + "grad_norm": 0.2337801158428192, + "learning_rate": 8.899717828268361e-05, + "loss": 1.0981, + "step": 3583 + }, + { + "epoch": 0.21775320493347106, + "grad_norm": 0.24762462079524994, + "learning_rate": 8.899118644943227e-05, + "loss": 1.1134, + "step": 3584 + }, + { + "epoch": 0.21781396196609756, + "grad_norm": 0.35765013098716736, + "learning_rate": 8.898519318695146e-05, + "loss": 1.0458, + "step": 3585 + }, + { + "epoch": 0.2178747189987241, + "grad_norm": 0.1970292031764984, + "learning_rate": 8.897919849546086e-05, + "loss": 1.0927, + "step": 3586 + }, + { + "epoch": 0.21793547603135063, + "grad_norm": 0.31617265939712524, + "learning_rate": 8.897320237518019e-05, + "loss": 1.0529, + "step": 3587 + }, + { + "epoch": 0.21799623306397717, + "grad_norm": 0.15339192748069763, + "learning_rate": 8.896720482632925e-05, + "loss": 1.0535, + "step": 3588 + }, + { + "epoch": 0.21805699009660368, + "grad_norm": 0.19203001260757446, + "learning_rate": 8.896120584912791e-05, + "loss": 1.0831, + "step": 3589 + }, + { + "epoch": 0.2181177471292302, + "grad_norm": 0.2581885755062103, + "learning_rate": 8.895520544379601e-05, + "loss": 1.1794, + "step": 3590 + }, + { + "epoch": 0.21817850416185675, + "grad_norm": 0.38130053877830505, + "learning_rate": 8.894920361055354e-05, + "loss": 1.112, + "step": 3591 + }, + { + "epoch": 0.21823926119448325, + "grad_norm": 0.18617768585681915, + "learning_rate": 8.894320034962047e-05, + "loss": 1.2227, + "step": 3592 + }, + { + "epoch": 0.2183000182271098, + "grad_norm": 0.37853309512138367, + "learning_rate": 8.893719566121688e-05, + "loss": 1.1876, + "step": 3593 + }, + { + "epoch": 0.21836077525973632, + "grad_norm": 0.3068560063838959, + "learning_rate": 8.893118954556286e-05, + "loss": 1.0832, + "step": 3594 + }, + { + "epoch": 0.21842153229236283, + "grad_norm": 0.1947592943906784, + "learning_rate": 8.892518200287854e-05, + "loss": 1.073, + "step": 3595 + }, + { + "epoch": 0.21848228932498936, + "grad_norm": 0.3098341226577759, + "learning_rate": 8.891917303338416e-05, + "loss": 1.0463, + "step": 3596 + }, + { + "epoch": 0.2185430463576159, + "grad_norm": 0.16980788111686707, + "learning_rate": 8.891316263729997e-05, + "loss": 1.1057, + "step": 3597 + }, + { + "epoch": 0.2186038033902424, + "grad_norm": 0.22771577537059784, + "learning_rate": 8.890715081484628e-05, + "loss": 1.1578, + "step": 3598 + }, + { + "epoch": 0.21866456042286894, + "grad_norm": 0.46696770191192627, + "learning_rate": 8.890113756624347e-05, + "loss": 1.1605, + "step": 3599 + }, + { + "epoch": 0.21872531745549548, + "grad_norm": 0.27803370356559753, + "learning_rate": 8.889512289171194e-05, + "loss": 1.1542, + "step": 3600 + }, + { + "epoch": 0.218786074488122, + "grad_norm": 0.21509814262390137, + "learning_rate": 8.888910679147215e-05, + "loss": 1.0754, + "step": 3601 + }, + { + "epoch": 0.21884683152074852, + "grad_norm": 0.181820347905159, + "learning_rate": 8.888308926574463e-05, + "loss": 1.2084, + "step": 3602 + }, + { + "epoch": 0.21890758855337505, + "grad_norm": 0.22318626940250397, + "learning_rate": 8.887707031474996e-05, + "loss": 1.1644, + "step": 3603 + }, + { + "epoch": 0.2189683455860016, + "grad_norm": 0.25211507081985474, + "learning_rate": 8.887104993870878e-05, + "loss": 1.1419, + "step": 3604 + }, + { + "epoch": 0.2190291026186281, + "grad_norm": 0.3897862732410431, + "learning_rate": 8.886502813784173e-05, + "loss": 1.1857, + "step": 3605 + }, + { + "epoch": 0.21908985965125463, + "grad_norm": 0.17802387475967407, + "learning_rate": 8.885900491236957e-05, + "loss": 1.1071, + "step": 3606 + }, + { + "epoch": 0.21915061668388117, + "grad_norm": 0.42088744044303894, + "learning_rate": 8.885298026251307e-05, + "loss": 1.1424, + "step": 3607 + }, + { + "epoch": 0.21921137371650767, + "grad_norm": 0.3917873203754425, + "learning_rate": 8.884695418849307e-05, + "loss": 1.0925, + "step": 3608 + }, + { + "epoch": 0.2192721307491342, + "grad_norm": 0.23588791489601135, + "learning_rate": 8.884092669053046e-05, + "loss": 1.1482, + "step": 3609 + }, + { + "epoch": 0.21933288778176074, + "grad_norm": 0.22042417526245117, + "learning_rate": 8.883489776884617e-05, + "loss": 1.1998, + "step": 3610 + }, + { + "epoch": 0.21939364481438728, + "grad_norm": 0.15223854780197144, + "learning_rate": 8.88288674236612e-05, + "loss": 1.1156, + "step": 3611 + }, + { + "epoch": 0.21945440184701379, + "grad_norm": 0.1689809411764145, + "learning_rate": 8.882283565519658e-05, + "loss": 1.0694, + "step": 3612 + }, + { + "epoch": 0.21951515887964032, + "grad_norm": 0.17674720287322998, + "learning_rate": 8.881680246367343e-05, + "loss": 1.1402, + "step": 3613 + }, + { + "epoch": 0.21957591591226686, + "grad_norm": 3.3900296688079834, + "learning_rate": 8.881076784931287e-05, + "loss": 1.0622, + "step": 3614 + }, + { + "epoch": 0.21963667294489336, + "grad_norm": 0.17992424964904785, + "learning_rate": 8.880473181233611e-05, + "loss": 1.0617, + "step": 3615 + }, + { + "epoch": 0.2196974299775199, + "grad_norm": 0.2613798677921295, + "learning_rate": 8.879869435296442e-05, + "loss": 1.1863, + "step": 3616 + }, + { + "epoch": 0.21975818701014643, + "grad_norm": 0.3601166009902954, + "learning_rate": 8.879265547141909e-05, + "loss": 1.1004, + "step": 3617 + }, + { + "epoch": 0.21981894404277294, + "grad_norm": 0.1415201872587204, + "learning_rate": 8.878661516792144e-05, + "loss": 1.0756, + "step": 3618 + }, + { + "epoch": 0.21987970107539948, + "grad_norm": 1.9572412967681885, + "learning_rate": 8.878057344269295e-05, + "loss": 1.1808, + "step": 3619 + }, + { + "epoch": 0.219940458108026, + "grad_norm": 0.326018363237381, + "learning_rate": 8.877453029595504e-05, + "loss": 1.0405, + "step": 3620 + }, + { + "epoch": 0.22000121514065252, + "grad_norm": 0.22868525981903076, + "learning_rate": 8.876848572792922e-05, + "loss": 1.0882, + "step": 3621 + }, + { + "epoch": 0.22006197217327905, + "grad_norm": 0.2570403814315796, + "learning_rate": 8.876243973883706e-05, + "loss": 1.277, + "step": 3622 + }, + { + "epoch": 0.2201227292059056, + "grad_norm": 0.16984650492668152, + "learning_rate": 8.875639232890018e-05, + "loss": 1.1458, + "step": 3623 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 0.16814802587032318, + "learning_rate": 8.875034349834023e-05, + "loss": 1.0737, + "step": 3624 + }, + { + "epoch": 0.22024424327115863, + "grad_norm": 0.19076281785964966, + "learning_rate": 8.874429324737898e-05, + "loss": 1.1215, + "step": 3625 + }, + { + "epoch": 0.22030500030378516, + "grad_norm": 0.1807945966720581, + "learning_rate": 8.873824157623816e-05, + "loss": 1.2199, + "step": 3626 + }, + { + "epoch": 0.2203657573364117, + "grad_norm": 0.36208876967430115, + "learning_rate": 8.87321884851396e-05, + "loss": 1.2129, + "step": 3627 + }, + { + "epoch": 0.2204265143690382, + "grad_norm": 0.2234572470188141, + "learning_rate": 8.872613397430519e-05, + "loss": 1.1678, + "step": 3628 + }, + { + "epoch": 0.22048727140166474, + "grad_norm": 0.38514724373817444, + "learning_rate": 8.872007804395685e-05, + "loss": 1.1732, + "step": 3629 + }, + { + "epoch": 0.22054802843429128, + "grad_norm": 0.31021639704704285, + "learning_rate": 8.871402069431659e-05, + "loss": 1.3114, + "step": 3630 + }, + { + "epoch": 0.22060878546691778, + "grad_norm": 0.2765832543373108, + "learning_rate": 8.870796192560639e-05, + "loss": 1.0864, + "step": 3631 + }, + { + "epoch": 0.22066954249954432, + "grad_norm": 0.13771745562553406, + "learning_rate": 8.870190173804837e-05, + "loss": 1.0653, + "step": 3632 + }, + { + "epoch": 0.22073029953217085, + "grad_norm": 0.14678744971752167, + "learning_rate": 8.869584013186465e-05, + "loss": 1.0717, + "step": 3633 + }, + { + "epoch": 0.2207910565647974, + "grad_norm": 0.28426042199134827, + "learning_rate": 8.868977710727744e-05, + "loss": 1.2044, + "step": 3634 + }, + { + "epoch": 0.2208518135974239, + "grad_norm": 0.18499046564102173, + "learning_rate": 8.8683712664509e-05, + "loss": 1.1538, + "step": 3635 + }, + { + "epoch": 0.22091257063005043, + "grad_norm": 0.16694340109825134, + "learning_rate": 8.867764680378155e-05, + "loss": 1.0826, + "step": 3636 + }, + { + "epoch": 0.22097332766267697, + "grad_norm": 0.3892214000225067, + "learning_rate": 8.867157952531751e-05, + "loss": 1.3207, + "step": 3637 + }, + { + "epoch": 0.22103408469530347, + "grad_norm": 0.1352841705083847, + "learning_rate": 8.866551082933923e-05, + "loss": 1.0926, + "step": 3638 + }, + { + "epoch": 0.22109484172793, + "grad_norm": 0.27010589838027954, + "learning_rate": 8.865944071606918e-05, + "loss": 1.1157, + "step": 3639 + }, + { + "epoch": 0.22115559876055654, + "grad_norm": 0.20642493665218353, + "learning_rate": 8.865336918572986e-05, + "loss": 1.1863, + "step": 3640 + }, + { + "epoch": 0.22121635579318305, + "grad_norm": 0.17077583074569702, + "learning_rate": 8.864729623854382e-05, + "loss": 1.117, + "step": 3641 + }, + { + "epoch": 0.22127711282580959, + "grad_norm": 0.1897292584180832, + "learning_rate": 8.864122187473367e-05, + "loss": 1.1191, + "step": 3642 + }, + { + "epoch": 0.22133786985843612, + "grad_norm": 0.17921572923660278, + "learning_rate": 8.863514609452205e-05, + "loss": 1.1178, + "step": 3643 + }, + { + "epoch": 0.22139862689106263, + "grad_norm": 0.15557968616485596, + "learning_rate": 8.86290688981317e-05, + "loss": 1.1214, + "step": 3644 + }, + { + "epoch": 0.22145938392368916, + "grad_norm": 0.21869108080863953, + "learning_rate": 8.862299028578536e-05, + "loss": 1.0722, + "step": 3645 + }, + { + "epoch": 0.2215201409563157, + "grad_norm": 1.952772855758667, + "learning_rate": 8.861691025770583e-05, + "loss": 1.1345, + "step": 3646 + }, + { + "epoch": 0.22158089798894223, + "grad_norm": 0.1759057492017746, + "learning_rate": 8.861082881411599e-05, + "loss": 1.1673, + "step": 3647 + }, + { + "epoch": 0.22164165502156874, + "grad_norm": 0.15730935335159302, + "learning_rate": 8.860474595523878e-05, + "loss": 1.1144, + "step": 3648 + }, + { + "epoch": 0.22170241205419527, + "grad_norm": 0.1722322702407837, + "learning_rate": 8.859866168129714e-05, + "loss": 1.0729, + "step": 3649 + }, + { + "epoch": 0.2217631690868218, + "grad_norm": 0.3860315680503845, + "learning_rate": 8.859257599251407e-05, + "loss": 1.1058, + "step": 3650 + }, + { + "epoch": 0.22182392611944832, + "grad_norm": 2.915440320968628, + "learning_rate": 8.85864888891127e-05, + "loss": 1.1387, + "step": 3651 + }, + { + "epoch": 0.22188468315207485, + "grad_norm": 0.2498331218957901, + "learning_rate": 8.858040037131609e-05, + "loss": 1.0924, + "step": 3652 + }, + { + "epoch": 0.2219454401847014, + "grad_norm": 0.16406171023845673, + "learning_rate": 8.857431043934747e-05, + "loss": 1.0686, + "step": 3653 + }, + { + "epoch": 0.2220061972173279, + "grad_norm": 0.20061764121055603, + "learning_rate": 8.856821909343004e-05, + "loss": 1.1362, + "step": 3654 + }, + { + "epoch": 0.22206695424995443, + "grad_norm": 0.17562153935432434, + "learning_rate": 8.856212633378707e-05, + "loss": 1.0865, + "step": 3655 + }, + { + "epoch": 0.22212771128258096, + "grad_norm": 0.1663135141134262, + "learning_rate": 8.855603216064192e-05, + "loss": 1.1391, + "step": 3656 + }, + { + "epoch": 0.2221884683152075, + "grad_norm": 0.5653345584869385, + "learning_rate": 8.854993657421796e-05, + "loss": 1.5107, + "step": 3657 + }, + { + "epoch": 0.222249225347834, + "grad_norm": 0.3727596402168274, + "learning_rate": 8.854383957473863e-05, + "loss": 1.1561, + "step": 3658 + }, + { + "epoch": 0.22230998238046054, + "grad_norm": 0.18117567896842957, + "learning_rate": 8.85377411624274e-05, + "loss": 1.106, + "step": 3659 + }, + { + "epoch": 0.22237073941308708, + "grad_norm": 0.23197732865810394, + "learning_rate": 8.85316413375078e-05, + "loss": 1.1812, + "step": 3660 + }, + { + "epoch": 0.22243149644571358, + "grad_norm": 0.1865433156490326, + "learning_rate": 8.852554010020346e-05, + "loss": 1.0467, + "step": 3661 + }, + { + "epoch": 0.22249225347834012, + "grad_norm": 0.24907264113426208, + "learning_rate": 8.8519437450738e-05, + "loss": 1.0755, + "step": 3662 + }, + { + "epoch": 0.22255301051096665, + "grad_norm": 0.44304803013801575, + "learning_rate": 8.85133333893351e-05, + "loss": 1.0524, + "step": 3663 + }, + { + "epoch": 0.22261376754359316, + "grad_norm": 0.24000073969364166, + "learning_rate": 8.850722791621853e-05, + "loss": 1.3545, + "step": 3664 + }, + { + "epoch": 0.2226745245762197, + "grad_norm": 0.18592676520347595, + "learning_rate": 8.850112103161207e-05, + "loss": 1.1556, + "step": 3665 + }, + { + "epoch": 0.22273528160884623, + "grad_norm": 0.2611141502857208, + "learning_rate": 8.849501273573959e-05, + "loss": 1.2765, + "step": 3666 + }, + { + "epoch": 0.22279603864147274, + "grad_norm": 0.2191876918077469, + "learning_rate": 8.848890302882497e-05, + "loss": 1.0758, + "step": 3667 + }, + { + "epoch": 0.22285679567409927, + "grad_norm": 0.524749755859375, + "learning_rate": 8.848279191109217e-05, + "loss": 1.2897, + "step": 3668 + }, + { + "epoch": 0.2229175527067258, + "grad_norm": 0.20052844285964966, + "learning_rate": 8.84766793827652e-05, + "loss": 1.1826, + "step": 3669 + }, + { + "epoch": 0.22297830973935234, + "grad_norm": 0.3286508321762085, + "learning_rate": 8.847056544406811e-05, + "loss": 1.1109, + "step": 3670 + }, + { + "epoch": 0.22303906677197885, + "grad_norm": 0.13770145177841187, + "learning_rate": 8.8464450095225e-05, + "loss": 1.0742, + "step": 3671 + }, + { + "epoch": 0.22309982380460538, + "grad_norm": 0.19726188480854034, + "learning_rate": 8.845833333646005e-05, + "loss": 1.0962, + "step": 3672 + }, + { + "epoch": 0.22316058083723192, + "grad_norm": 0.21102310717105865, + "learning_rate": 8.845221516799745e-05, + "loss": 1.074, + "step": 3673 + }, + { + "epoch": 0.22322133786985843, + "grad_norm": 0.19460909068584442, + "learning_rate": 8.844609559006145e-05, + "loss": 1.1209, + "step": 3674 + }, + { + "epoch": 0.22328209490248496, + "grad_norm": 0.1998390257358551, + "learning_rate": 8.843997460287641e-05, + "loss": 1.0577, + "step": 3675 + }, + { + "epoch": 0.2233428519351115, + "grad_norm": 0.19281549751758575, + "learning_rate": 8.843385220666667e-05, + "loss": 1.0754, + "step": 3676 + }, + { + "epoch": 0.223403608967738, + "grad_norm": 0.18167705833911896, + "learning_rate": 8.842772840165663e-05, + "loss": 1.0898, + "step": 3677 + }, + { + "epoch": 0.22346436600036454, + "grad_norm": 0.1706853061914444, + "learning_rate": 8.84216031880708e-05, + "loss": 1.116, + "step": 3678 + }, + { + "epoch": 0.22352512303299107, + "grad_norm": 0.19648264348506927, + "learning_rate": 8.841547656613367e-05, + "loss": 1.1903, + "step": 3679 + }, + { + "epoch": 0.22358588006561758, + "grad_norm": 0.19444352388381958, + "learning_rate": 8.840934853606981e-05, + "loss": 1.0587, + "step": 3680 + }, + { + "epoch": 0.22364663709824412, + "grad_norm": 0.2148924469947815, + "learning_rate": 8.840321909810386e-05, + "loss": 1.1402, + "step": 3681 + }, + { + "epoch": 0.22370739413087065, + "grad_norm": 0.35384291410446167, + "learning_rate": 8.839708825246049e-05, + "loss": 1.1711, + "step": 3682 + }, + { + "epoch": 0.2237681511634972, + "grad_norm": 0.30873680114746094, + "learning_rate": 8.839095599936444e-05, + "loss": 1.3454, + "step": 3683 + }, + { + "epoch": 0.2238289081961237, + "grad_norm": 0.3413650691509247, + "learning_rate": 8.838482233904046e-05, + "loss": 1.0719, + "step": 3684 + }, + { + "epoch": 0.22388966522875023, + "grad_norm": 0.4800724685192108, + "learning_rate": 8.837868727171342e-05, + "loss": 1.1946, + "step": 3685 + }, + { + "epoch": 0.22395042226137676, + "grad_norm": 0.1908242404460907, + "learning_rate": 8.837255079760815e-05, + "loss": 1.1637, + "step": 3686 + }, + { + "epoch": 0.22401117929400327, + "grad_norm": 0.4864121973514557, + "learning_rate": 8.836641291694964e-05, + "loss": 1.1185, + "step": 3687 + }, + { + "epoch": 0.2240719363266298, + "grad_norm": 0.9131503701210022, + "learning_rate": 8.836027362996285e-05, + "loss": 1.1372, + "step": 3688 + }, + { + "epoch": 0.22413269335925634, + "grad_norm": 0.197661355137825, + "learning_rate": 8.83541329368728e-05, + "loss": 1.0941, + "step": 3689 + }, + { + "epoch": 0.22419345039188285, + "grad_norm": 0.20966699719429016, + "learning_rate": 8.834799083790459e-05, + "loss": 1.0455, + "step": 3690 + }, + { + "epoch": 0.22425420742450938, + "grad_norm": 0.21055859327316284, + "learning_rate": 8.834184733328337e-05, + "loss": 1.1266, + "step": 3691 + }, + { + "epoch": 0.22431496445713592, + "grad_norm": 0.4950893819332123, + "learning_rate": 8.833570242323432e-05, + "loss": 1.0672, + "step": 3692 + }, + { + "epoch": 0.22437572148976245, + "grad_norm": 0.28170105814933777, + "learning_rate": 8.83295561079827e-05, + "loss": 1.226, + "step": 3693 + }, + { + "epoch": 0.22443647852238896, + "grad_norm": 0.3324810266494751, + "learning_rate": 8.832340838775378e-05, + "loss": 1.1796, + "step": 3694 + }, + { + "epoch": 0.2244972355550155, + "grad_norm": 0.1564098298549652, + "learning_rate": 8.831725926277293e-05, + "loss": 1.1227, + "step": 3695 + }, + { + "epoch": 0.22455799258764203, + "grad_norm": 0.18331372737884521, + "learning_rate": 8.831110873326552e-05, + "loss": 1.1592, + "step": 3696 + }, + { + "epoch": 0.22461874962026854, + "grad_norm": 0.22116729617118835, + "learning_rate": 8.830495679945703e-05, + "loss": 1.1262, + "step": 3697 + }, + { + "epoch": 0.22467950665289507, + "grad_norm": 0.16599038243293762, + "learning_rate": 8.829880346157294e-05, + "loss": 1.1148, + "step": 3698 + }, + { + "epoch": 0.2247402636855216, + "grad_norm": 0.13323058187961578, + "learning_rate": 8.82926487198388e-05, + "loss": 1.0943, + "step": 3699 + }, + { + "epoch": 0.22480102071814811, + "grad_norm": 0.2711448669433594, + "learning_rate": 8.82864925744802e-05, + "loss": 1.0608, + "step": 3700 + }, + { + "epoch": 0.22486177775077465, + "grad_norm": 0.19824278354644775, + "learning_rate": 8.828033502572285e-05, + "loss": 1.0967, + "step": 3701 + }, + { + "epoch": 0.22492253478340118, + "grad_norm": 0.24330858886241913, + "learning_rate": 8.82741760737924e-05, + "loss": 1.1607, + "step": 3702 + }, + { + "epoch": 0.2249832918160277, + "grad_norm": 0.18452030420303345, + "learning_rate": 8.826801571891463e-05, + "loss": 1.0607, + "step": 3703 + }, + { + "epoch": 0.22504404884865423, + "grad_norm": 0.2032107263803482, + "learning_rate": 8.826185396131536e-05, + "loss": 1.098, + "step": 3704 + }, + { + "epoch": 0.22510480588128076, + "grad_norm": 0.12496750056743622, + "learning_rate": 8.825569080122043e-05, + "loss": 1.0273, + "step": 3705 + }, + { + "epoch": 0.2251655629139073, + "grad_norm": 0.19655679166316986, + "learning_rate": 8.824952623885576e-05, + "loss": 1.0811, + "step": 3706 + }, + { + "epoch": 0.2252263199465338, + "grad_norm": 0.4549234211444855, + "learning_rate": 8.82433602744473e-05, + "loss": 1.0751, + "step": 3707 + }, + { + "epoch": 0.22528707697916034, + "grad_norm": 0.18192775547504425, + "learning_rate": 8.823719290822108e-05, + "loss": 1.1389, + "step": 3708 + }, + { + "epoch": 0.22534783401178687, + "grad_norm": 0.18826892971992493, + "learning_rate": 8.823102414040317e-05, + "loss": 1.1737, + "step": 3709 + }, + { + "epoch": 0.22540859104441338, + "grad_norm": 0.16363559663295746, + "learning_rate": 8.822485397121968e-05, + "loss": 1.124, + "step": 3710 + }, + { + "epoch": 0.22546934807703992, + "grad_norm": 0.18880683183670044, + "learning_rate": 8.821868240089676e-05, + "loss": 1.1377, + "step": 3711 + }, + { + "epoch": 0.22553010510966645, + "grad_norm": 0.17491993308067322, + "learning_rate": 8.821250942966069e-05, + "loss": 1.1446, + "step": 3712 + }, + { + "epoch": 0.22559086214229296, + "grad_norm": 0.18036296963691711, + "learning_rate": 8.820633505773766e-05, + "loss": 1.0953, + "step": 3713 + }, + { + "epoch": 0.2256516191749195, + "grad_norm": 0.19422069191932678, + "learning_rate": 8.820015928535406e-05, + "loss": 1.1226, + "step": 3714 + }, + { + "epoch": 0.22571237620754603, + "grad_norm": 0.1913684606552124, + "learning_rate": 8.819398211273623e-05, + "loss": 1.098, + "step": 3715 + }, + { + "epoch": 0.22577313324017256, + "grad_norm": 0.18528629839420319, + "learning_rate": 8.81878035401106e-05, + "loss": 1.1686, + "step": 3716 + }, + { + "epoch": 0.22583389027279907, + "grad_norm": 0.13923168182373047, + "learning_rate": 8.818162356770366e-05, + "loss": 1.0793, + "step": 3717 + }, + { + "epoch": 0.2258946473054256, + "grad_norm": 0.3453364074230194, + "learning_rate": 8.81754421957419e-05, + "loss": 1.1074, + "step": 3718 + }, + { + "epoch": 0.22595540433805214, + "grad_norm": 0.3160702586174011, + "learning_rate": 8.816925942445196e-05, + "loss": 1.0595, + "step": 3719 + }, + { + "epoch": 0.22601616137067865, + "grad_norm": 0.47938573360443115, + "learning_rate": 8.816307525406042e-05, + "loss": 1.2385, + "step": 3720 + }, + { + "epoch": 0.22607691840330518, + "grad_norm": 0.9228609800338745, + "learning_rate": 8.815688968479399e-05, + "loss": 1.1115, + "step": 3721 + }, + { + "epoch": 0.22613767543593172, + "grad_norm": 0.32599395513534546, + "learning_rate": 8.815070271687937e-05, + "loss": 1.1696, + "step": 3722 + }, + { + "epoch": 0.22619843246855822, + "grad_norm": 0.26398760080337524, + "learning_rate": 8.814451435054339e-05, + "loss": 1.112, + "step": 3723 + }, + { + "epoch": 0.22625918950118476, + "grad_norm": 0.2827211618423462, + "learning_rate": 8.813832458601285e-05, + "loss": 1.1057, + "step": 3724 + }, + { + "epoch": 0.2263199465338113, + "grad_norm": 0.9766664505004883, + "learning_rate": 8.813213342351466e-05, + "loss": 1.2191, + "step": 3725 + }, + { + "epoch": 0.2263807035664378, + "grad_norm": 0.17697346210479736, + "learning_rate": 8.812594086327574e-05, + "loss": 1.1614, + "step": 3726 + }, + { + "epoch": 0.22644146059906434, + "grad_norm": 0.35698121786117554, + "learning_rate": 8.811974690552311e-05, + "loss": 1.1805, + "step": 3727 + }, + { + "epoch": 0.22650221763169087, + "grad_norm": 0.2610389292240143, + "learning_rate": 8.811355155048377e-05, + "loss": 1.1218, + "step": 3728 + }, + { + "epoch": 0.2265629746643174, + "grad_norm": 0.13635700941085815, + "learning_rate": 8.810735479838483e-05, + "loss": 1.0754, + "step": 3729 + }, + { + "epoch": 0.22662373169694391, + "grad_norm": 0.29382216930389404, + "learning_rate": 8.810115664945343e-05, + "loss": 1.1801, + "step": 3730 + }, + { + "epoch": 0.22668448872957045, + "grad_norm": 0.3335505723953247, + "learning_rate": 8.809495710391678e-05, + "loss": 1.2342, + "step": 3731 + }, + { + "epoch": 0.22674524576219698, + "grad_norm": 0.4739501476287842, + "learning_rate": 8.808875616200211e-05, + "loss": 1.0965, + "step": 3732 + }, + { + "epoch": 0.2268060027948235, + "grad_norm": 0.5466656684875488, + "learning_rate": 8.808255382393672e-05, + "loss": 1.1186, + "step": 3733 + }, + { + "epoch": 0.22686675982745003, + "grad_norm": 0.2992178201675415, + "learning_rate": 8.807635008994796e-05, + "loss": 1.1411, + "step": 3734 + }, + { + "epoch": 0.22692751686007656, + "grad_norm": 0.20904937386512756, + "learning_rate": 8.807014496026322e-05, + "loss": 1.1973, + "step": 3735 + }, + { + "epoch": 0.22698827389270307, + "grad_norm": 0.2990994155406952, + "learning_rate": 8.806393843510996e-05, + "loss": 1.1496, + "step": 3736 + }, + { + "epoch": 0.2270490309253296, + "grad_norm": 0.28559526801109314, + "learning_rate": 8.805773051471568e-05, + "loss": 1.071, + "step": 3737 + }, + { + "epoch": 0.22710978795795614, + "grad_norm": 0.1629854291677475, + "learning_rate": 8.805152119930796e-05, + "loss": 1.1273, + "step": 3738 + }, + { + "epoch": 0.22717054499058267, + "grad_norm": 1.2423242330551147, + "learning_rate": 8.804531048911433e-05, + "loss": 1.1299, + "step": 3739 + }, + { + "epoch": 0.22723130202320918, + "grad_norm": 0.39676475524902344, + "learning_rate": 8.803909838436251e-05, + "loss": 1.1738, + "step": 3740 + }, + { + "epoch": 0.22729205905583572, + "grad_norm": 0.3079032599925995, + "learning_rate": 8.803288488528019e-05, + "loss": 1.0961, + "step": 3741 + }, + { + "epoch": 0.22735281608846225, + "grad_norm": 0.1665235161781311, + "learning_rate": 8.802666999209512e-05, + "loss": 1.0557, + "step": 3742 + }, + { + "epoch": 0.22741357312108876, + "grad_norm": 0.15917164087295532, + "learning_rate": 8.802045370503512e-05, + "loss": 1.0823, + "step": 3743 + }, + { + "epoch": 0.2274743301537153, + "grad_norm": 0.7063826322555542, + "learning_rate": 8.801423602432801e-05, + "loss": 1.1571, + "step": 3744 + }, + { + "epoch": 0.22753508718634183, + "grad_norm": 0.24075105786323547, + "learning_rate": 8.800801695020176e-05, + "loss": 1.1807, + "step": 3745 + }, + { + "epoch": 0.22759584421896834, + "grad_norm": 27.179767608642578, + "learning_rate": 8.800179648288429e-05, + "loss": 1.0843, + "step": 3746 + }, + { + "epoch": 0.22765660125159487, + "grad_norm": 0.18558628857135773, + "learning_rate": 8.799557462260361e-05, + "loss": 1.0655, + "step": 3747 + }, + { + "epoch": 0.2277173582842214, + "grad_norm": 0.24082210659980774, + "learning_rate": 8.798935136958782e-05, + "loss": 1.0474, + "step": 3748 + }, + { + "epoch": 0.2277781153168479, + "grad_norm": 0.21904778480529785, + "learning_rate": 8.798312672406501e-05, + "loss": 1.1789, + "step": 3749 + }, + { + "epoch": 0.22783887234947445, + "grad_norm": 0.38997456431388855, + "learning_rate": 8.797690068626334e-05, + "loss": 1.3074, + "step": 3750 + }, + { + "epoch": 0.22789962938210098, + "grad_norm": 0.2737247049808502, + "learning_rate": 8.797067325641104e-05, + "loss": 1.1582, + "step": 3751 + }, + { + "epoch": 0.22796038641472752, + "grad_norm": 0.3012070059776306, + "learning_rate": 8.796444443473637e-05, + "loss": 1.1549, + "step": 3752 + }, + { + "epoch": 0.22802114344735402, + "grad_norm": 0.16187265515327454, + "learning_rate": 8.795821422146766e-05, + "loss": 1.042, + "step": 3753 + }, + { + "epoch": 0.22808190047998056, + "grad_norm": 0.23431642353534698, + "learning_rate": 8.795198261683326e-05, + "loss": 1.2678, + "step": 3754 + }, + { + "epoch": 0.2281426575126071, + "grad_norm": 0.2818852961063385, + "learning_rate": 8.794574962106161e-05, + "loss": 1.1846, + "step": 3755 + }, + { + "epoch": 0.2282034145452336, + "grad_norm": 0.161614328622818, + "learning_rate": 8.793951523438116e-05, + "loss": 1.103, + "step": 3756 + }, + { + "epoch": 0.22826417157786014, + "grad_norm": 0.2946518361568451, + "learning_rate": 8.793327945702046e-05, + "loss": 1.0485, + "step": 3757 + }, + { + "epoch": 0.22832492861048667, + "grad_norm": 0.22220878303050995, + "learning_rate": 8.792704228920805e-05, + "loss": 1.0509, + "step": 3758 + }, + { + "epoch": 0.22838568564311318, + "grad_norm": 0.18830649554729462, + "learning_rate": 8.79208037311726e-05, + "loss": 1.12, + "step": 3759 + }, + { + "epoch": 0.2284464426757397, + "grad_norm": 0.18957538902759552, + "learning_rate": 8.791456378314273e-05, + "loss": 1.1573, + "step": 3760 + }, + { + "epoch": 0.22850719970836625, + "grad_norm": 0.3290649652481079, + "learning_rate": 8.790832244534721e-05, + "loss": 1.1844, + "step": 3761 + }, + { + "epoch": 0.22856795674099278, + "grad_norm": 0.34733831882476807, + "learning_rate": 8.790207971801481e-05, + "loss": 1.2929, + "step": 3762 + }, + { + "epoch": 0.2286287137736193, + "grad_norm": 0.33905866742134094, + "learning_rate": 8.789583560137435e-05, + "loss": 1.1612, + "step": 3763 + }, + { + "epoch": 0.22868947080624583, + "grad_norm": 0.3364482522010803, + "learning_rate": 8.78895900956547e-05, + "loss": 1.0785, + "step": 3764 + }, + { + "epoch": 0.22875022783887236, + "grad_norm": 0.23005352914333344, + "learning_rate": 8.78833432010848e-05, + "loss": 1.1679, + "step": 3765 + }, + { + "epoch": 0.22881098487149887, + "grad_norm": 0.25035086274147034, + "learning_rate": 8.787709491789364e-05, + "loss": 1.0954, + "step": 3766 + }, + { + "epoch": 0.2288717419041254, + "grad_norm": 0.26574721932411194, + "learning_rate": 8.787084524631025e-05, + "loss": 1.1232, + "step": 3767 + }, + { + "epoch": 0.22893249893675194, + "grad_norm": 0.4117295742034912, + "learning_rate": 8.786459418656369e-05, + "loss": 1.172, + "step": 3768 + }, + { + "epoch": 0.22899325596937845, + "grad_norm": 0.23607565462589264, + "learning_rate": 8.785834173888315e-05, + "loss": 1.1597, + "step": 3769 + }, + { + "epoch": 0.22905401300200498, + "grad_norm": 0.2233234941959381, + "learning_rate": 8.785208790349773e-05, + "loss": 1.1454, + "step": 3770 + }, + { + "epoch": 0.22911477003463152, + "grad_norm": 0.20566169917583466, + "learning_rate": 8.784583268063673e-05, + "loss": 1.151, + "step": 3771 + }, + { + "epoch": 0.22917552706725802, + "grad_norm": 0.1566246598958969, + "learning_rate": 8.783957607052941e-05, + "loss": 1.0916, + "step": 3772 + }, + { + "epoch": 0.22923628409988456, + "grad_norm": 0.30101585388183594, + "learning_rate": 8.783331807340514e-05, + "loss": 1.2645, + "step": 3773 + }, + { + "epoch": 0.2292970411325111, + "grad_norm": 0.5011516213417053, + "learning_rate": 8.782705868949326e-05, + "loss": 1.2457, + "step": 3774 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 0.1741447001695633, + "learning_rate": 8.782079791902322e-05, + "loss": 1.0693, + "step": 3775 + }, + { + "epoch": 0.22941855519776413, + "grad_norm": 0.41780203580856323, + "learning_rate": 8.781453576222455e-05, + "loss": 1.0987, + "step": 3776 + }, + { + "epoch": 0.22947931223039067, + "grad_norm": 0.23424886167049408, + "learning_rate": 8.780827221932675e-05, + "loss": 1.1255, + "step": 3777 + }, + { + "epoch": 0.2295400692630172, + "grad_norm": 0.20799852907657623, + "learning_rate": 8.780200729055943e-05, + "loss": 1.1814, + "step": 3778 + }, + { + "epoch": 0.2296008262956437, + "grad_norm": 0.26352182030677795, + "learning_rate": 8.77957409761522e-05, + "loss": 1.1503, + "step": 3779 + }, + { + "epoch": 0.22966158332827025, + "grad_norm": 0.19761155545711517, + "learning_rate": 8.778947327633481e-05, + "loss": 1.2301, + "step": 3780 + }, + { + "epoch": 0.22972234036089678, + "grad_norm": 0.4628106355667114, + "learning_rate": 8.778320419133697e-05, + "loss": 1.1296, + "step": 3781 + }, + { + "epoch": 0.2297830973935233, + "grad_norm": 0.19939939677715302, + "learning_rate": 8.777693372138846e-05, + "loss": 1.1382, + "step": 3782 + }, + { + "epoch": 0.22984385442614982, + "grad_norm": 0.16180074214935303, + "learning_rate": 8.777066186671915e-05, + "loss": 1.0956, + "step": 3783 + }, + { + "epoch": 0.22990461145877636, + "grad_norm": 0.19431500136852264, + "learning_rate": 8.776438862755893e-05, + "loss": 1.1338, + "step": 3784 + }, + { + "epoch": 0.22996536849140287, + "grad_norm": 0.28697606921195984, + "learning_rate": 8.775811400413774e-05, + "loss": 1.1386, + "step": 3785 + }, + { + "epoch": 0.2300261255240294, + "grad_norm": 0.196644127368927, + "learning_rate": 8.775183799668559e-05, + "loss": 1.1499, + "step": 3786 + }, + { + "epoch": 0.23008688255665594, + "grad_norm": 0.26397594809532166, + "learning_rate": 8.774556060543253e-05, + "loss": 1.1589, + "step": 3787 + }, + { + "epoch": 0.23014763958928247, + "grad_norm": 0.20761661231517792, + "learning_rate": 8.773928183060862e-05, + "loss": 1.1331, + "step": 3788 + }, + { + "epoch": 0.23020839662190898, + "grad_norm": 0.19499783217906952, + "learning_rate": 8.773300167244407e-05, + "loss": 1.0759, + "step": 3789 + }, + { + "epoch": 0.2302691536545355, + "grad_norm": 0.24904966354370117, + "learning_rate": 8.772672013116903e-05, + "loss": 1.0893, + "step": 3790 + }, + { + "epoch": 0.23032991068716205, + "grad_norm": 0.2817010283470154, + "learning_rate": 8.772043720701378e-05, + "loss": 1.0732, + "step": 3791 + }, + { + "epoch": 0.23039066771978856, + "grad_norm": 0.2399549037218094, + "learning_rate": 8.771415290020862e-05, + "loss": 1.1777, + "step": 3792 + }, + { + "epoch": 0.2304514247524151, + "grad_norm": 0.4989120066165924, + "learning_rate": 8.770786721098387e-05, + "loss": 1.2487, + "step": 3793 + }, + { + "epoch": 0.23051218178504163, + "grad_norm": 0.21616044640541077, + "learning_rate": 8.770158013956999e-05, + "loss": 1.0881, + "step": 3794 + }, + { + "epoch": 0.23057293881766813, + "grad_norm": 0.19007226824760437, + "learning_rate": 8.769529168619739e-05, + "loss": 1.0648, + "step": 3795 + }, + { + "epoch": 0.23063369585029467, + "grad_norm": 0.25410953164100647, + "learning_rate": 8.768900185109658e-05, + "loss": 1.1141, + "step": 3796 + }, + { + "epoch": 0.2306944528829212, + "grad_norm": 0.28295817971229553, + "learning_rate": 8.768271063449812e-05, + "loss": 1.0692, + "step": 3797 + }, + { + "epoch": 0.23075520991554774, + "grad_norm": 0.2932504415512085, + "learning_rate": 8.767641803663262e-05, + "loss": 1.1745, + "step": 3798 + }, + { + "epoch": 0.23081596694817424, + "grad_norm": 2.4574594497680664, + "learning_rate": 8.767012405773074e-05, + "loss": 1.2215, + "step": 3799 + }, + { + "epoch": 0.23087672398080078, + "grad_norm": 0.2336098849773407, + "learning_rate": 8.766382869802318e-05, + "loss": 1.3222, + "step": 3800 + }, + { + "epoch": 0.23093748101342731, + "grad_norm": 0.522935152053833, + "learning_rate": 8.76575319577407e-05, + "loss": 1.3722, + "step": 3801 + }, + { + "epoch": 0.23099823804605382, + "grad_norm": 0.17931655049324036, + "learning_rate": 8.76512338371141e-05, + "loss": 1.0627, + "step": 3802 + }, + { + "epoch": 0.23105899507868036, + "grad_norm": 0.24202129244804382, + "learning_rate": 8.764493433637426e-05, + "loss": 1.216, + "step": 3803 + }, + { + "epoch": 0.2311197521113069, + "grad_norm": 0.1920606642961502, + "learning_rate": 8.763863345575208e-05, + "loss": 1.2506, + "step": 3804 + }, + { + "epoch": 0.2311805091439334, + "grad_norm": 0.4114784300327301, + "learning_rate": 8.763233119547849e-05, + "loss": 1.0882, + "step": 3805 + }, + { + "epoch": 0.23124126617655993, + "grad_norm": 0.21371228992938995, + "learning_rate": 8.762602755578454e-05, + "loss": 1.1809, + "step": 3806 + }, + { + "epoch": 0.23130202320918647, + "grad_norm": 0.31484973430633545, + "learning_rate": 8.761972253690127e-05, + "loss": 1.0815, + "step": 3807 + }, + { + "epoch": 0.23136278024181298, + "grad_norm": 0.24163700640201569, + "learning_rate": 8.76134161390598e-05, + "loss": 1.1532, + "step": 3808 + }, + { + "epoch": 0.2314235372744395, + "grad_norm": 0.1594315767288208, + "learning_rate": 8.76071083624913e-05, + "loss": 1.1058, + "step": 3809 + }, + { + "epoch": 0.23148429430706605, + "grad_norm": 0.2613028883934021, + "learning_rate": 8.760079920742698e-05, + "loss": 1.1137, + "step": 3810 + }, + { + "epoch": 0.23154505133969258, + "grad_norm": 0.2661849856376648, + "learning_rate": 8.759448867409807e-05, + "loss": 1.029, + "step": 3811 + }, + { + "epoch": 0.2316058083723191, + "grad_norm": 0.2608145475387573, + "learning_rate": 8.758817676273593e-05, + "loss": 1.177, + "step": 3812 + }, + { + "epoch": 0.23166656540494562, + "grad_norm": 0.31724369525909424, + "learning_rate": 8.758186347357191e-05, + "loss": 1.2088, + "step": 3813 + }, + { + "epoch": 0.23172732243757216, + "grad_norm": 0.21606537699699402, + "learning_rate": 8.75755488068374e-05, + "loss": 1.1477, + "step": 3814 + }, + { + "epoch": 0.23178807947019867, + "grad_norm": 20.253334045410156, + "learning_rate": 8.75692327627639e-05, + "loss": 1.0873, + "step": 3815 + }, + { + "epoch": 0.2318488365028252, + "grad_norm": 0.1502893716096878, + "learning_rate": 8.756291534158292e-05, + "loss": 1.0762, + "step": 3816 + }, + { + "epoch": 0.23190959353545174, + "grad_norm": 0.14848195016384125, + "learning_rate": 8.755659654352599e-05, + "loss": 1.0858, + "step": 3817 + }, + { + "epoch": 0.23197035056807824, + "grad_norm": 0.15895536541938782, + "learning_rate": 8.755027636882478e-05, + "loss": 1.0642, + "step": 3818 + }, + { + "epoch": 0.23203110760070478, + "grad_norm": 0.1896950900554657, + "learning_rate": 8.754395481771091e-05, + "loss": 1.1298, + "step": 3819 + }, + { + "epoch": 0.2320918646333313, + "grad_norm": 0.18929994106292725, + "learning_rate": 8.753763189041614e-05, + "loss": 1.0555, + "step": 3820 + }, + { + "epoch": 0.23215262166595785, + "grad_norm": 0.21950852870941162, + "learning_rate": 8.75313075871722e-05, + "loss": 1.1665, + "step": 3821 + }, + { + "epoch": 0.23221337869858436, + "grad_norm": 0.16273915767669678, + "learning_rate": 8.752498190821095e-05, + "loss": 1.0839, + "step": 3822 + }, + { + "epoch": 0.2322741357312109, + "grad_norm": 0.15747126936912537, + "learning_rate": 8.75186548537642e-05, + "loss": 1.0848, + "step": 3823 + }, + { + "epoch": 0.23233489276383743, + "grad_norm": 0.23414598405361176, + "learning_rate": 8.751232642406392e-05, + "loss": 1.1249, + "step": 3824 + }, + { + "epoch": 0.23239564979646393, + "grad_norm": 4.0957560539245605, + "learning_rate": 8.750599661934206e-05, + "loss": 1.1789, + "step": 3825 + }, + { + "epoch": 0.23245640682909047, + "grad_norm": 0.3437539339065552, + "learning_rate": 8.749966543983066e-05, + "loss": 1.1209, + "step": 3826 + }, + { + "epoch": 0.232517163861717, + "grad_norm": 0.3358255624771118, + "learning_rate": 8.749333288576177e-05, + "loss": 1.2534, + "step": 3827 + }, + { + "epoch": 0.2325779208943435, + "grad_norm": 0.18156570196151733, + "learning_rate": 8.748699895736752e-05, + "loss": 1.067, + "step": 3828 + }, + { + "epoch": 0.23263867792697004, + "grad_norm": 0.26023340225219727, + "learning_rate": 8.748066365488006e-05, + "loss": 1.1019, + "step": 3829 + }, + { + "epoch": 0.23269943495959658, + "grad_norm": 0.19864824414253235, + "learning_rate": 8.747432697853164e-05, + "loss": 1.0978, + "step": 3830 + }, + { + "epoch": 0.2327601919922231, + "grad_norm": 0.1605607122182846, + "learning_rate": 8.74679889285545e-05, + "loss": 1.1128, + "step": 3831 + }, + { + "epoch": 0.23282094902484962, + "grad_norm": 0.2532506287097931, + "learning_rate": 8.746164950518101e-05, + "loss": 1.2199, + "step": 3832 + }, + { + "epoch": 0.23288170605747616, + "grad_norm": 0.15219800174236298, + "learning_rate": 8.745530870864351e-05, + "loss": 1.0595, + "step": 3833 + }, + { + "epoch": 0.2329424630901027, + "grad_norm": 0.22808650135993958, + "learning_rate": 8.744896653917443e-05, + "loss": 1.0397, + "step": 3834 + }, + { + "epoch": 0.2330032201227292, + "grad_norm": 1.2698999643325806, + "learning_rate": 8.744262299700623e-05, + "loss": 1.0765, + "step": 3835 + }, + { + "epoch": 0.23306397715535573, + "grad_norm": 0.134122833609581, + "learning_rate": 8.743627808237146e-05, + "loss": 1.249, + "step": 3836 + }, + { + "epoch": 0.23312473418798227, + "grad_norm": 0.22514206171035767, + "learning_rate": 8.742993179550267e-05, + "loss": 1.1934, + "step": 3837 + }, + { + "epoch": 0.23318549122060878, + "grad_norm": 0.18784482777118683, + "learning_rate": 8.74235841366325e-05, + "loss": 1.2562, + "step": 3838 + }, + { + "epoch": 0.2332462482532353, + "grad_norm": 0.24985751509666443, + "learning_rate": 8.741723510599363e-05, + "loss": 1.0411, + "step": 3839 + }, + { + "epoch": 0.23330700528586185, + "grad_norm": 0.17648372054100037, + "learning_rate": 8.741088470381877e-05, + "loss": 1.0666, + "step": 3840 + }, + { + "epoch": 0.23336776231848835, + "grad_norm": 0.2940845787525177, + "learning_rate": 8.740453293034069e-05, + "loss": 1.0762, + "step": 3841 + }, + { + "epoch": 0.2334285193511149, + "grad_norm": 0.13416394591331482, + "learning_rate": 8.739817978579223e-05, + "loss": 1.0697, + "step": 3842 + }, + { + "epoch": 0.23348927638374142, + "grad_norm": 0.1555926352739334, + "learning_rate": 8.739182527040626e-05, + "loss": 1.1161, + "step": 3843 + }, + { + "epoch": 0.23355003341636796, + "grad_norm": 0.23895837366580963, + "learning_rate": 8.738546938441571e-05, + "loss": 1.2879, + "step": 3844 + }, + { + "epoch": 0.23361079044899447, + "grad_norm": 0.19918054342269897, + "learning_rate": 8.737911212805355e-05, + "loss": 1.173, + "step": 3845 + }, + { + "epoch": 0.233671547481621, + "grad_norm": 0.30027905106544495, + "learning_rate": 8.737275350155281e-05, + "loss": 1.2347, + "step": 3846 + }, + { + "epoch": 0.23373230451424754, + "grad_norm": 0.25957414507865906, + "learning_rate": 8.736639350514658e-05, + "loss": 1.1053, + "step": 3847 + }, + { + "epoch": 0.23379306154687404, + "grad_norm": 0.24853013455867767, + "learning_rate": 8.736003213906795e-05, + "loss": 1.0713, + "step": 3848 + }, + { + "epoch": 0.23385381857950058, + "grad_norm": 0.15455150604248047, + "learning_rate": 8.735366940355014e-05, + "loss": 1.0926, + "step": 3849 + }, + { + "epoch": 0.2339145756121271, + "grad_norm": 0.5605828166007996, + "learning_rate": 8.734730529882634e-05, + "loss": 1.1265, + "step": 3850 + }, + { + "epoch": 0.23397533264475362, + "grad_norm": 0.1810554563999176, + "learning_rate": 8.734093982512986e-05, + "loss": 1.2079, + "step": 3851 + }, + { + "epoch": 0.23403608967738015, + "grad_norm": 0.21177291870117188, + "learning_rate": 8.733457298269401e-05, + "loss": 1.1391, + "step": 3852 + }, + { + "epoch": 0.2340968467100067, + "grad_norm": 0.16143867373466492, + "learning_rate": 8.732820477175217e-05, + "loss": 1.1609, + "step": 3853 + }, + { + "epoch": 0.2341576037426332, + "grad_norm": 0.1898384839296341, + "learning_rate": 8.732183519253776e-05, + "loss": 1.1843, + "step": 3854 + }, + { + "epoch": 0.23421836077525973, + "grad_norm": 0.17061768472194672, + "learning_rate": 8.731546424528428e-05, + "loss": 1.1317, + "step": 3855 + }, + { + "epoch": 0.23427911780788627, + "grad_norm": 0.17413118481636047, + "learning_rate": 8.730909193022523e-05, + "loss": 1.0467, + "step": 3856 + }, + { + "epoch": 0.2343398748405128, + "grad_norm": 0.39212051033973694, + "learning_rate": 8.730271824759421e-05, + "loss": 1.1719, + "step": 3857 + }, + { + "epoch": 0.2344006318731393, + "grad_norm": 0.21302811801433563, + "learning_rate": 8.729634319762486e-05, + "loss": 1.2754, + "step": 3858 + }, + { + "epoch": 0.23446138890576584, + "grad_norm": 23.89767837524414, + "learning_rate": 8.728996678055081e-05, + "loss": 1.0702, + "step": 3859 + }, + { + "epoch": 0.23452214593839238, + "grad_norm": 0.1730816662311554, + "learning_rate": 8.728358899660585e-05, + "loss": 1.0778, + "step": 3860 + }, + { + "epoch": 0.2345829029710189, + "grad_norm": 7.371491432189941, + "learning_rate": 8.727720984602371e-05, + "loss": 1.0689, + "step": 3861 + }, + { + "epoch": 0.23464366000364542, + "grad_norm": 0.1834903359413147, + "learning_rate": 8.727082932903823e-05, + "loss": 1.0628, + "step": 3862 + }, + { + "epoch": 0.23470441703627196, + "grad_norm": 0.1326923370361328, + "learning_rate": 8.72644474458833e-05, + "loss": 1.0816, + "step": 3863 + }, + { + "epoch": 0.23476517406889846, + "grad_norm": 0.19308006763458252, + "learning_rate": 8.725806419679287e-05, + "loss": 1.1026, + "step": 3864 + }, + { + "epoch": 0.234825931101525, + "grad_norm": 2.243105173110962, + "learning_rate": 8.725167958200088e-05, + "loss": 1.0872, + "step": 3865 + }, + { + "epoch": 0.23488668813415153, + "grad_norm": 0.3164255619049072, + "learning_rate": 8.724529360174137e-05, + "loss": 1.1596, + "step": 3866 + }, + { + "epoch": 0.23494744516677807, + "grad_norm": 0.4158472716808319, + "learning_rate": 8.723890625624844e-05, + "loss": 1.2297, + "step": 3867 + }, + { + "epoch": 0.23500820219940458, + "grad_norm": 0.2862829566001892, + "learning_rate": 8.723251754575619e-05, + "loss": 1.2261, + "step": 3868 + }, + { + "epoch": 0.2350689592320311, + "grad_norm": 0.14094950258731842, + "learning_rate": 8.72261274704988e-05, + "loss": 1.0538, + "step": 3869 + }, + { + "epoch": 0.23512971626465765, + "grad_norm": 0.2651413679122925, + "learning_rate": 8.721973603071053e-05, + "loss": 1.0878, + "step": 3870 + }, + { + "epoch": 0.23519047329728415, + "grad_norm": 0.19096867740154266, + "learning_rate": 8.721334322662564e-05, + "loss": 1.2233, + "step": 3871 + }, + { + "epoch": 0.2352512303299107, + "grad_norm": 0.21207264065742493, + "learning_rate": 8.720694905847844e-05, + "loss": 1.127, + "step": 3872 + }, + { + "epoch": 0.23531198736253722, + "grad_norm": 0.16888320446014404, + "learning_rate": 8.720055352650337e-05, + "loss": 1.1311, + "step": 3873 + }, + { + "epoch": 0.23537274439516373, + "grad_norm": 0.25098758935928345, + "learning_rate": 8.719415663093479e-05, + "loss": 1.1042, + "step": 3874 + }, + { + "epoch": 0.23543350142779026, + "grad_norm": 0.18126602470874786, + "learning_rate": 8.71877583720072e-05, + "loss": 1.133, + "step": 3875 + }, + { + "epoch": 0.2354942584604168, + "grad_norm": 0.19230689108371735, + "learning_rate": 8.718135874995516e-05, + "loss": 1.1127, + "step": 3876 + }, + { + "epoch": 0.2355550154930433, + "grad_norm": 0.15672238171100616, + "learning_rate": 8.717495776501323e-05, + "loss": 1.0651, + "step": 3877 + }, + { + "epoch": 0.23561577252566984, + "grad_norm": 0.16875606775283813, + "learning_rate": 8.716855541741603e-05, + "loss": 1.0939, + "step": 3878 + }, + { + "epoch": 0.23567652955829638, + "grad_norm": 0.2219727784395218, + "learning_rate": 8.716215170739825e-05, + "loss": 1.1563, + "step": 3879 + }, + { + "epoch": 0.2357372865909229, + "grad_norm": 0.33217132091522217, + "learning_rate": 8.715574663519462e-05, + "loss": 1.2994, + "step": 3880 + }, + { + "epoch": 0.23579804362354942, + "grad_norm": 0.21564796566963196, + "learning_rate": 8.71493402010399e-05, + "loss": 1.244, + "step": 3881 + }, + { + "epoch": 0.23585880065617595, + "grad_norm": 0.8253178596496582, + "learning_rate": 8.714293240516894e-05, + "loss": 1.1492, + "step": 3882 + }, + { + "epoch": 0.2359195576888025, + "grad_norm": 0.18198446929454803, + "learning_rate": 8.713652324781662e-05, + "loss": 1.0882, + "step": 3883 + }, + { + "epoch": 0.235980314721429, + "grad_norm": 0.20700703561306, + "learning_rate": 8.713011272921785e-05, + "loss": 1.2974, + "step": 3884 + }, + { + "epoch": 0.23604107175405553, + "grad_norm": 0.12848831713199615, + "learning_rate": 8.712370084960764e-05, + "loss": 1.1016, + "step": 3885 + }, + { + "epoch": 0.23610182878668207, + "grad_norm": 0.16396000981330872, + "learning_rate": 8.711728760922097e-05, + "loss": 1.0422, + "step": 3886 + }, + { + "epoch": 0.23616258581930857, + "grad_norm": 0.16817036271095276, + "learning_rate": 8.711087300829297e-05, + "loss": 1.1538, + "step": 3887 + }, + { + "epoch": 0.2362233428519351, + "grad_norm": 0.19173792004585266, + "learning_rate": 8.710445704705874e-05, + "loss": 1.0392, + "step": 3888 + }, + { + "epoch": 0.23628409988456164, + "grad_norm": 0.14542429149150848, + "learning_rate": 8.709803972575346e-05, + "loss": 1.1001, + "step": 3889 + }, + { + "epoch": 0.23634485691718815, + "grad_norm": 0.21353450417518616, + "learning_rate": 8.709162104461238e-05, + "loss": 1.0637, + "step": 3890 + }, + { + "epoch": 0.23640561394981469, + "grad_norm": 0.1834249198436737, + "learning_rate": 8.708520100387074e-05, + "loss": 1.1117, + "step": 3891 + }, + { + "epoch": 0.23646637098244122, + "grad_norm": 0.15393726527690887, + "learning_rate": 8.707877960376392e-05, + "loss": 1.0971, + "step": 3892 + }, + { + "epoch": 0.23652712801506776, + "grad_norm": 0.5606473088264465, + "learning_rate": 8.707235684452723e-05, + "loss": 1.0863, + "step": 3893 + }, + { + "epoch": 0.23658788504769426, + "grad_norm": 0.13707295060157776, + "learning_rate": 8.706593272639615e-05, + "loss": 1.0518, + "step": 3894 + }, + { + "epoch": 0.2366486420803208, + "grad_norm": 0.1525871753692627, + "learning_rate": 8.705950724960614e-05, + "loss": 1.0698, + "step": 3895 + }, + { + "epoch": 0.23670939911294733, + "grad_norm": 0.2221229523420334, + "learning_rate": 8.705308041439272e-05, + "loss": 1.1103, + "step": 3896 + }, + { + "epoch": 0.23677015614557384, + "grad_norm": 0.1364176869392395, + "learning_rate": 8.70466522209915e-05, + "loss": 1.0736, + "step": 3897 + }, + { + "epoch": 0.23683091317820038, + "grad_norm": 0.18070849776268005, + "learning_rate": 8.704022266963807e-05, + "loss": 1.2127, + "step": 3898 + }, + { + "epoch": 0.2368916702108269, + "grad_norm": 0.2248815894126892, + "learning_rate": 8.703379176056811e-05, + "loss": 1.1609, + "step": 3899 + }, + { + "epoch": 0.23695242724345342, + "grad_norm": 0.5836701393127441, + "learning_rate": 8.702735949401736e-05, + "loss": 1.1479, + "step": 3900 + }, + { + "epoch": 0.23701318427607995, + "grad_norm": 0.13758723437786102, + "learning_rate": 8.702092587022159e-05, + "loss": 1.0971, + "step": 3901 + }, + { + "epoch": 0.2370739413087065, + "grad_norm": 0.2805210053920746, + "learning_rate": 8.701449088941662e-05, + "loss": 1.0365, + "step": 3902 + }, + { + "epoch": 0.23713469834133302, + "grad_norm": 0.16959865391254425, + "learning_rate": 8.700805455183835e-05, + "loss": 1.155, + "step": 3903 + }, + { + "epoch": 0.23719545537395953, + "grad_norm": 0.14467813074588776, + "learning_rate": 8.700161685772267e-05, + "loss": 1.0939, + "step": 3904 + }, + { + "epoch": 0.23725621240658606, + "grad_norm": 0.156314417719841, + "learning_rate": 8.699517780730557e-05, + "loss": 1.1059, + "step": 3905 + }, + { + "epoch": 0.2373169694392126, + "grad_norm": 0.29469722509384155, + "learning_rate": 8.698873740082309e-05, + "loss": 1.1633, + "step": 3906 + }, + { + "epoch": 0.2373777264718391, + "grad_norm": 13.932208061218262, + "learning_rate": 8.698229563851129e-05, + "loss": 1.1156, + "step": 3907 + }, + { + "epoch": 0.23743848350446564, + "grad_norm": 0.1899506002664566, + "learning_rate": 8.697585252060629e-05, + "loss": 1.117, + "step": 3908 + }, + { + "epoch": 0.23749924053709218, + "grad_norm": 0.16111107170581818, + "learning_rate": 8.696940804734426e-05, + "loss": 1.0811, + "step": 3909 + }, + { + "epoch": 0.23755999756971868, + "grad_norm": 0.1723010540008545, + "learning_rate": 8.696296221896144e-05, + "loss": 1.0966, + "step": 3910 + }, + { + "epoch": 0.23762075460234522, + "grad_norm": 0.18821151554584503, + "learning_rate": 8.695651503569409e-05, + "loss": 1.1135, + "step": 3911 + }, + { + "epoch": 0.23768151163497175, + "grad_norm": 0.1950039565563202, + "learning_rate": 8.695006649777854e-05, + "loss": 1.1129, + "step": 3912 + }, + { + "epoch": 0.23774226866759826, + "grad_norm": 0.14388521015644073, + "learning_rate": 8.694361660545116e-05, + "loss": 1.128, + "step": 3913 + }, + { + "epoch": 0.2378030257002248, + "grad_norm": 0.2002483457326889, + "learning_rate": 8.693716535894837e-05, + "loss": 1.0713, + "step": 3914 + }, + { + "epoch": 0.23786378273285133, + "grad_norm": 0.36832526326179504, + "learning_rate": 8.693071275850666e-05, + "loss": 1.1553, + "step": 3915 + }, + { + "epoch": 0.23792453976547787, + "grad_norm": 2.9972379207611084, + "learning_rate": 8.692425880436253e-05, + "loss": 1.1144, + "step": 3916 + }, + { + "epoch": 0.23798529679810437, + "grad_norm": 0.17029139399528503, + "learning_rate": 8.691780349675255e-05, + "loss": 1.0681, + "step": 3917 + }, + { + "epoch": 0.2380460538307309, + "grad_norm": 0.6380923390388489, + "learning_rate": 8.691134683591334e-05, + "loss": 1.1038, + "step": 3918 + }, + { + "epoch": 0.23810681086335744, + "grad_norm": 0.19650183618068695, + "learning_rate": 8.690488882208159e-05, + "loss": 1.1262, + "step": 3919 + }, + { + "epoch": 0.23816756789598395, + "grad_norm": 0.19674339890480042, + "learning_rate": 8.6898429455494e-05, + "loss": 1.1344, + "step": 3920 + }, + { + "epoch": 0.23822832492861049, + "grad_norm": 0.1724308282136917, + "learning_rate": 8.689196873638734e-05, + "loss": 1.114, + "step": 3921 + }, + { + "epoch": 0.23828908196123702, + "grad_norm": 0.15377965569496155, + "learning_rate": 8.688550666499844e-05, + "loss": 1.0968, + "step": 3922 + }, + { + "epoch": 0.23834983899386353, + "grad_norm": 0.2071285992860794, + "learning_rate": 8.687904324156417e-05, + "loss": 1.1281, + "step": 3923 + }, + { + "epoch": 0.23841059602649006, + "grad_norm": 0.15540525317192078, + "learning_rate": 8.687257846632142e-05, + "loss": 1.0709, + "step": 3924 + }, + { + "epoch": 0.2384713530591166, + "grad_norm": 0.2519376277923584, + "learning_rate": 8.686611233950721e-05, + "loss": 1.1394, + "step": 3925 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 0.16704869270324707, + "learning_rate": 8.685964486135852e-05, + "loss": 1.0773, + "step": 3926 + }, + { + "epoch": 0.23859286712436964, + "grad_norm": 0.175711989402771, + "learning_rate": 8.68531760321124e-05, + "loss": 1.0482, + "step": 3927 + }, + { + "epoch": 0.23865362415699617, + "grad_norm": 0.21318276226520538, + "learning_rate": 8.684670585200601e-05, + "loss": 1.1116, + "step": 3928 + }, + { + "epoch": 0.2387143811896227, + "grad_norm": 0.15422223508358002, + "learning_rate": 8.684023432127648e-05, + "loss": 1.1867, + "step": 3929 + }, + { + "epoch": 0.23877513822224922, + "grad_norm": 0.19644825160503387, + "learning_rate": 8.683376144016106e-05, + "loss": 1.0807, + "step": 3930 + }, + { + "epoch": 0.23883589525487575, + "grad_norm": 0.14864318072795868, + "learning_rate": 8.682728720889697e-05, + "loss": 1.104, + "step": 3931 + }, + { + "epoch": 0.2388966522875023, + "grad_norm": 0.1880117654800415, + "learning_rate": 8.682081162772156e-05, + "loss": 1.0858, + "step": 3932 + }, + { + "epoch": 0.2389574093201288, + "grad_norm": 0.1523965448141098, + "learning_rate": 8.681433469687219e-05, + "loss": 1.1366, + "step": 3933 + }, + { + "epoch": 0.23901816635275533, + "grad_norm": 0.8888483643531799, + "learning_rate": 8.680785641658626e-05, + "loss": 1.111, + "step": 3934 + }, + { + "epoch": 0.23907892338538186, + "grad_norm": 0.23758070170879364, + "learning_rate": 8.680137678710125e-05, + "loss": 1.0727, + "step": 3935 + }, + { + "epoch": 0.23913968041800837, + "grad_norm": 0.19169849157333374, + "learning_rate": 8.679489580865463e-05, + "loss": 1.1219, + "step": 3936 + }, + { + "epoch": 0.2392004374506349, + "grad_norm": 0.20867425203323364, + "learning_rate": 8.6788413481484e-05, + "loss": 1.1069, + "step": 3937 + }, + { + "epoch": 0.23926119448326144, + "grad_norm": 0.1340804100036621, + "learning_rate": 8.678192980582697e-05, + "loss": 1.099, + "step": 3938 + }, + { + "epoch": 0.23932195151588798, + "grad_norm": 0.3417584002017975, + "learning_rate": 8.677544478192121e-05, + "loss": 1.1643, + "step": 3939 + }, + { + "epoch": 0.23938270854851448, + "grad_norm": 0.16733908653259277, + "learning_rate": 8.676895841000441e-05, + "loss": 1.0706, + "step": 3940 + }, + { + "epoch": 0.23944346558114102, + "grad_norm": 0.13786454498767853, + "learning_rate": 8.676247069031431e-05, + "loss": 1.0377, + "step": 3941 + }, + { + "epoch": 0.23950422261376755, + "grad_norm": 0.22302298247814178, + "learning_rate": 8.675598162308875e-05, + "loss": 1.2489, + "step": 3942 + }, + { + "epoch": 0.23956497964639406, + "grad_norm": 0.19295349717140198, + "learning_rate": 8.67494912085656e-05, + "loss": 1.1547, + "step": 3943 + }, + { + "epoch": 0.2396257366790206, + "grad_norm": 0.3624386489391327, + "learning_rate": 8.674299944698271e-05, + "loss": 1.0695, + "step": 3944 + }, + { + "epoch": 0.23968649371164713, + "grad_norm": 0.15758883953094482, + "learning_rate": 8.67365063385781e-05, + "loss": 1.1164, + "step": 3945 + }, + { + "epoch": 0.23974725074427364, + "grad_norm": 0.14956587553024292, + "learning_rate": 8.673001188358974e-05, + "loss": 1.0498, + "step": 3946 + }, + { + "epoch": 0.23980800777690017, + "grad_norm": 0.1457321047782898, + "learning_rate": 8.672351608225568e-05, + "loss": 1.1066, + "step": 3947 + }, + { + "epoch": 0.2398687648095267, + "grad_norm": 0.724364161491394, + "learning_rate": 8.671701893481405e-05, + "loss": 1.1812, + "step": 3948 + }, + { + "epoch": 0.23992952184215324, + "grad_norm": 0.2393360584974289, + "learning_rate": 8.6710520441503e-05, + "loss": 1.1985, + "step": 3949 + }, + { + "epoch": 0.23999027887477975, + "grad_norm": 0.23643238842487335, + "learning_rate": 8.670402060256073e-05, + "loss": 1.1031, + "step": 3950 + }, + { + "epoch": 0.24005103590740628, + "grad_norm": 0.17582935094833374, + "learning_rate": 8.669751941822549e-05, + "loss": 1.0795, + "step": 3951 + }, + { + "epoch": 0.24011179294003282, + "grad_norm": 0.15575046837329865, + "learning_rate": 8.669101688873558e-05, + "loss": 1.0899, + "step": 3952 + }, + { + "epoch": 0.24017254997265933, + "grad_norm": 0.12399573624134064, + "learning_rate": 8.668451301432934e-05, + "loss": 1.0854, + "step": 3953 + }, + { + "epoch": 0.24023330700528586, + "grad_norm": 0.15120410919189453, + "learning_rate": 8.66780077952452e-05, + "loss": 1.113, + "step": 3954 + }, + { + "epoch": 0.2402940640379124, + "grad_norm": 3.1159369945526123, + "learning_rate": 8.667150123172158e-05, + "loss": 1.0955, + "step": 3955 + }, + { + "epoch": 0.2403548210705389, + "grad_norm": 0.31155282258987427, + "learning_rate": 8.6664993323997e-05, + "loss": 1.2229, + "step": 3956 + }, + { + "epoch": 0.24041557810316544, + "grad_norm": 0.20258402824401855, + "learning_rate": 8.665848407231001e-05, + "loss": 1.0941, + "step": 3957 + }, + { + "epoch": 0.24047633513579197, + "grad_norm": 0.2539091408252716, + "learning_rate": 8.66519734768992e-05, + "loss": 1.1218, + "step": 3958 + }, + { + "epoch": 0.24053709216841848, + "grad_norm": 0.15777987241744995, + "learning_rate": 8.664546153800322e-05, + "loss": 1.1047, + "step": 3959 + }, + { + "epoch": 0.24059784920104502, + "grad_norm": 0.15768924355506897, + "learning_rate": 8.663894825586073e-05, + "loss": 1.1152, + "step": 3960 + }, + { + "epoch": 0.24065860623367155, + "grad_norm": 0.15072941780090332, + "learning_rate": 8.663243363071056e-05, + "loss": 1.1198, + "step": 3961 + }, + { + "epoch": 0.2407193632662981, + "grad_norm": 0.18585556745529175, + "learning_rate": 8.662591766279142e-05, + "loss": 1.2109, + "step": 3962 + }, + { + "epoch": 0.2407801202989246, + "grad_norm": 0.14197374880313873, + "learning_rate": 8.66194003523422e-05, + "loss": 1.0695, + "step": 3963 + }, + { + "epoch": 0.24084087733155113, + "grad_norm": 0.15779241919517517, + "learning_rate": 8.661288169960177e-05, + "loss": 1.1455, + "step": 3964 + }, + { + "epoch": 0.24090163436417766, + "grad_norm": 0.6295207142829895, + "learning_rate": 8.660636170480912e-05, + "loss": 1.1348, + "step": 3965 + }, + { + "epoch": 0.24096239139680417, + "grad_norm": 0.12053091078996658, + "learning_rate": 8.659984036820317e-05, + "loss": 1.0692, + "step": 3966 + }, + { + "epoch": 0.2410231484294307, + "grad_norm": 0.1854286789894104, + "learning_rate": 8.659331769002301e-05, + "loss": 1.2086, + "step": 3967 + }, + { + "epoch": 0.24108390546205724, + "grad_norm": 0.5717991590499878, + "learning_rate": 8.65867936705077e-05, + "loss": 1.2266, + "step": 3968 + }, + { + "epoch": 0.24114466249468375, + "grad_norm": 0.19806058704853058, + "learning_rate": 8.658026830989642e-05, + "loss": 1.1149, + "step": 3969 + }, + { + "epoch": 0.24120541952731028, + "grad_norm": 0.24884095788002014, + "learning_rate": 8.657374160842833e-05, + "loss": 1.0465, + "step": 3970 + }, + { + "epoch": 0.24126617655993682, + "grad_norm": 0.19447007775306702, + "learning_rate": 8.656721356634267e-05, + "loss": 1.1684, + "step": 3971 + }, + { + "epoch": 0.24132693359256335, + "grad_norm": 0.28897160291671753, + "learning_rate": 8.656068418387873e-05, + "loss": 1.105, + "step": 3972 + }, + { + "epoch": 0.24138769062518986, + "grad_norm": 0.14791648089885712, + "learning_rate": 8.655415346127582e-05, + "loss": 1.1586, + "step": 3973 + }, + { + "epoch": 0.2414484476578164, + "grad_norm": 0.2585868537425995, + "learning_rate": 8.654762139877338e-05, + "loss": 1.0975, + "step": 3974 + }, + { + "epoch": 0.24150920469044293, + "grad_norm": 0.19113487005233765, + "learning_rate": 8.654108799661081e-05, + "loss": 1.0899, + "step": 3975 + }, + { + "epoch": 0.24156996172306944, + "grad_norm": 0.3172217905521393, + "learning_rate": 8.653455325502757e-05, + "loss": 1.0752, + "step": 3976 + }, + { + "epoch": 0.24163071875569597, + "grad_norm": 2.1464030742645264, + "learning_rate": 8.652801717426324e-05, + "loss": 1.1344, + "step": 3977 + }, + { + "epoch": 0.2416914757883225, + "grad_norm": 0.27316781878471375, + "learning_rate": 8.652147975455738e-05, + "loss": 1.2896, + "step": 3978 + }, + { + "epoch": 0.24175223282094901, + "grad_norm": 0.30954229831695557, + "learning_rate": 8.651494099614963e-05, + "loss": 1.1448, + "step": 3979 + }, + { + "epoch": 0.24181298985357555, + "grad_norm": 0.14505885541439056, + "learning_rate": 8.650840089927964e-05, + "loss": 1.067, + "step": 3980 + }, + { + "epoch": 0.24187374688620208, + "grad_norm": 0.20532795786857605, + "learning_rate": 8.650185946418716e-05, + "loss": 1.1173, + "step": 3981 + }, + { + "epoch": 0.2419345039188286, + "grad_norm": 0.43968433141708374, + "learning_rate": 8.649531669111198e-05, + "loss": 1.054, + "step": 3982 + }, + { + "epoch": 0.24199526095145513, + "grad_norm": 0.16732043027877808, + "learning_rate": 8.648877258029389e-05, + "loss": 1.0866, + "step": 3983 + }, + { + "epoch": 0.24205601798408166, + "grad_norm": 0.3106529116630554, + "learning_rate": 8.64822271319728e-05, + "loss": 1.161, + "step": 3984 + }, + { + "epoch": 0.2421167750167082, + "grad_norm": 0.2534385621547699, + "learning_rate": 8.647568034638862e-05, + "loss": 1.1392, + "step": 3985 + }, + { + "epoch": 0.2421775320493347, + "grad_norm": 0.1997934877872467, + "learning_rate": 8.646913222378134e-05, + "loss": 1.0776, + "step": 3986 + }, + { + "epoch": 0.24223828908196124, + "grad_norm": 0.24092431366443634, + "learning_rate": 8.646258276439096e-05, + "loss": 1.0539, + "step": 3987 + }, + { + "epoch": 0.24229904611458777, + "grad_norm": 0.3611429035663605, + "learning_rate": 8.645603196845756e-05, + "loss": 1.0804, + "step": 3988 + }, + { + "epoch": 0.24235980314721428, + "grad_norm": 0.18255358934402466, + "learning_rate": 8.644947983622125e-05, + "loss": 1.0668, + "step": 3989 + }, + { + "epoch": 0.24242056017984082, + "grad_norm": 0.1695895493030548, + "learning_rate": 8.644292636792221e-05, + "loss": 1.0798, + "step": 3990 + }, + { + "epoch": 0.24248131721246735, + "grad_norm": 0.13713617622852325, + "learning_rate": 8.64363715638007e-05, + "loss": 1.0878, + "step": 3991 + }, + { + "epoch": 0.24254207424509386, + "grad_norm": 0.1993829607963562, + "learning_rate": 8.64298154240969e-05, + "loss": 1.1505, + "step": 3992 + }, + { + "epoch": 0.2426028312777204, + "grad_norm": 0.2562691271305084, + "learning_rate": 8.64232579490512e-05, + "loss": 1.1111, + "step": 3993 + }, + { + "epoch": 0.24266358831034693, + "grad_norm": 2.1048951148986816, + "learning_rate": 8.641669913890394e-05, + "loss": 1.1707, + "step": 3994 + }, + { + "epoch": 0.24272434534297344, + "grad_norm": 0.21180875599384308, + "learning_rate": 8.64101389938955e-05, + "loss": 1.0976, + "step": 3995 + }, + { + "epoch": 0.24278510237559997, + "grad_norm": 0.2694220244884491, + "learning_rate": 8.640357751426642e-05, + "loss": 1.1383, + "step": 3996 + }, + { + "epoch": 0.2428458594082265, + "grad_norm": 6.604887962341309, + "learning_rate": 8.639701470025714e-05, + "loss": 1.1034, + "step": 3997 + }, + { + "epoch": 0.24290661644085304, + "grad_norm": 0.5258581042289734, + "learning_rate": 8.639045055210827e-05, + "loss": 1.1047, + "step": 3998 + }, + { + "epoch": 0.24296737347347955, + "grad_norm": 0.18104997277259827, + "learning_rate": 8.638388507006039e-05, + "loss": 1.1544, + "step": 3999 + }, + { + "epoch": 0.24302813050610608, + "grad_norm": 0.21292586624622345, + "learning_rate": 8.637731825435418e-05, + "loss": 1.1266, + "step": 4000 + }, + { + "epoch": 0.24308888753873262, + "grad_norm": 0.3438781499862671, + "learning_rate": 8.637075010523032e-05, + "loss": 1.1126, + "step": 4001 + }, + { + "epoch": 0.24314964457135912, + "grad_norm": 0.14674068987369537, + "learning_rate": 8.636418062292962e-05, + "loss": 1.1072, + "step": 4002 + }, + { + "epoch": 0.24321040160398566, + "grad_norm": 0.38550257682800293, + "learning_rate": 8.635760980769281e-05, + "loss": 1.1461, + "step": 4003 + }, + { + "epoch": 0.2432711586366122, + "grad_norm": 0.16236717998981476, + "learning_rate": 8.635103765976081e-05, + "loss": 1.0789, + "step": 4004 + }, + { + "epoch": 0.2433319156692387, + "grad_norm": 0.5983213186264038, + "learning_rate": 8.634446417937449e-05, + "loss": 1.1615, + "step": 4005 + }, + { + "epoch": 0.24339267270186524, + "grad_norm": 0.3105098605155945, + "learning_rate": 8.63378893667748e-05, + "loss": 1.1373, + "step": 4006 + }, + { + "epoch": 0.24345342973449177, + "grad_norm": 0.18556448817253113, + "learning_rate": 8.633131322220276e-05, + "loss": 1.0749, + "step": 4007 + }, + { + "epoch": 0.2435141867671183, + "grad_norm": 0.26110541820526123, + "learning_rate": 8.632473574589941e-05, + "loss": 1.0877, + "step": 4008 + }, + { + "epoch": 0.24357494379974481, + "grad_norm": 0.46966123580932617, + "learning_rate": 8.631815693810586e-05, + "loss": 1.1334, + "step": 4009 + }, + { + "epoch": 0.24363570083237135, + "grad_norm": 0.23228362202644348, + "learning_rate": 8.631157679906323e-05, + "loss": 1.1264, + "step": 4010 + }, + { + "epoch": 0.24369645786499788, + "grad_norm": 0.2246306836605072, + "learning_rate": 8.630499532901275e-05, + "loss": 1.157, + "step": 4011 + }, + { + "epoch": 0.2437572148976244, + "grad_norm": 0.24068474769592285, + "learning_rate": 8.629841252819564e-05, + "loss": 1.1571, + "step": 4012 + }, + { + "epoch": 0.24381797193025093, + "grad_norm": 0.2753877639770508, + "learning_rate": 8.629182839685321e-05, + "loss": 1.1175, + "step": 4013 + }, + { + "epoch": 0.24387872896287746, + "grad_norm": 0.19838157296180725, + "learning_rate": 8.628524293522679e-05, + "loss": 1.1656, + "step": 4014 + }, + { + "epoch": 0.24393948599550397, + "grad_norm": 0.24955883622169495, + "learning_rate": 8.627865614355777e-05, + "loss": 1.0274, + "step": 4015 + }, + { + "epoch": 0.2440002430281305, + "grad_norm": 0.1909952610731125, + "learning_rate": 8.62720680220876e-05, + "loss": 1.1304, + "step": 4016 + }, + { + "epoch": 0.24406100006075704, + "grad_norm": 0.1548011600971222, + "learning_rate": 8.626547857105777e-05, + "loss": 1.0564, + "step": 4017 + }, + { + "epoch": 0.24412175709338355, + "grad_norm": 0.6794400811195374, + "learning_rate": 8.62588877907098e-05, + "loss": 1.144, + "step": 4018 + }, + { + "epoch": 0.24418251412601008, + "grad_norm": 0.2033773809671402, + "learning_rate": 8.625229568128531e-05, + "loss": 1.2283, + "step": 4019 + }, + { + "epoch": 0.24424327115863662, + "grad_norm": 0.19142000377178192, + "learning_rate": 8.624570224302591e-05, + "loss": 1.0659, + "step": 4020 + }, + { + "epoch": 0.24430402819126315, + "grad_norm": 0.20253171026706696, + "learning_rate": 8.623910747617328e-05, + "loss": 1.0685, + "step": 4021 + }, + { + "epoch": 0.24436478522388966, + "grad_norm": 0.15959037840366364, + "learning_rate": 8.623251138096916e-05, + "loss": 1.1282, + "step": 4022 + }, + { + "epoch": 0.2444255422565162, + "grad_norm": 0.2258574664592743, + "learning_rate": 8.622591395765533e-05, + "loss": 1.1014, + "step": 4023 + }, + { + "epoch": 0.24448629928914273, + "grad_norm": 0.3244194984436035, + "learning_rate": 8.621931520647363e-05, + "loss": 1.1262, + "step": 4024 + }, + { + "epoch": 0.24454705632176924, + "grad_norm": 0.22501277923583984, + "learning_rate": 8.621271512766593e-05, + "loss": 1.2468, + "step": 4025 + }, + { + "epoch": 0.24460781335439577, + "grad_norm": 0.3266246020793915, + "learning_rate": 8.620611372147414e-05, + "loss": 1.0979, + "step": 4026 + }, + { + "epoch": 0.2446685703870223, + "grad_norm": 0.1458095908164978, + "learning_rate": 8.619951098814028e-05, + "loss": 1.0966, + "step": 4027 + }, + { + "epoch": 0.2447293274196488, + "grad_norm": 0.7091784477233887, + "learning_rate": 8.619290692790633e-05, + "loss": 1.1805, + "step": 4028 + }, + { + "epoch": 0.24479008445227535, + "grad_norm": 0.3023465573787689, + "learning_rate": 8.618630154101438e-05, + "loss": 1.1671, + "step": 4029 + }, + { + "epoch": 0.24485084148490188, + "grad_norm": 0.3327714204788208, + "learning_rate": 8.617969482770656e-05, + "loss": 1.0544, + "step": 4030 + }, + { + "epoch": 0.24491159851752842, + "grad_norm": 0.3605053424835205, + "learning_rate": 8.617308678822502e-05, + "loss": 1.1492, + "step": 4031 + }, + { + "epoch": 0.24497235555015492, + "grad_norm": 18.277729034423828, + "learning_rate": 8.6166477422812e-05, + "loss": 1.127, + "step": 4032 + }, + { + "epoch": 0.24503311258278146, + "grad_norm": 0.34724563360214233, + "learning_rate": 8.615986673170977e-05, + "loss": 1.0869, + "step": 4033 + }, + { + "epoch": 0.245093869615408, + "grad_norm": 0.23811745643615723, + "learning_rate": 8.615325471516063e-05, + "loss": 1.1345, + "step": 4034 + }, + { + "epoch": 0.2451546266480345, + "grad_norm": 0.2629840075969696, + "learning_rate": 8.614664137340694e-05, + "loss": 1.0795, + "step": 4035 + }, + { + "epoch": 0.24521538368066104, + "grad_norm": 0.5882347822189331, + "learning_rate": 8.614002670669114e-05, + "loss": 1.3391, + "step": 4036 + }, + { + "epoch": 0.24527614071328757, + "grad_norm": 0.17361603677272797, + "learning_rate": 8.613341071525565e-05, + "loss": 1.1735, + "step": 4037 + }, + { + "epoch": 0.24533689774591408, + "grad_norm": 0.17132027447223663, + "learning_rate": 8.612679339934302e-05, + "loss": 1.0885, + "step": 4038 + }, + { + "epoch": 0.2453976547785406, + "grad_norm": 0.31367743015289307, + "learning_rate": 8.61201747591958e-05, + "loss": 1.2087, + "step": 4039 + }, + { + "epoch": 0.24545841181116715, + "grad_norm": 0.30873334407806396, + "learning_rate": 8.611355479505658e-05, + "loss": 1.0547, + "step": 4040 + }, + { + "epoch": 0.24551916884379366, + "grad_norm": 0.2688838541507721, + "learning_rate": 8.610693350716806e-05, + "loss": 1.1403, + "step": 4041 + }, + { + "epoch": 0.2455799258764202, + "grad_norm": 0.23757126927375793, + "learning_rate": 8.610031089577289e-05, + "loss": 1.0852, + "step": 4042 + }, + { + "epoch": 0.24564068290904673, + "grad_norm": 0.16408149898052216, + "learning_rate": 8.609368696111383e-05, + "loss": 1.0152, + "step": 4043 + }, + { + "epoch": 0.24570143994167326, + "grad_norm": 0.14532369375228882, + "learning_rate": 8.608706170343371e-05, + "loss": 1.0695, + "step": 4044 + }, + { + "epoch": 0.24576219697429977, + "grad_norm": 0.2933334410190582, + "learning_rate": 8.608043512297537e-05, + "loss": 1.1332, + "step": 4045 + }, + { + "epoch": 0.2458229540069263, + "grad_norm": 0.2835923433303833, + "learning_rate": 8.607380721998171e-05, + "loss": 1.0596, + "step": 4046 + }, + { + "epoch": 0.24588371103955284, + "grad_norm": 0.16383066773414612, + "learning_rate": 8.606717799469568e-05, + "loss": 1.1277, + "step": 4047 + }, + { + "epoch": 0.24594446807217935, + "grad_norm": 0.32419484853744507, + "learning_rate": 8.606054744736026e-05, + "loss": 1.1274, + "step": 4048 + }, + { + "epoch": 0.24600522510480588, + "grad_norm": 0.24372728168964386, + "learning_rate": 8.605391557821849e-05, + "loss": 1.1013, + "step": 4049 + }, + { + "epoch": 0.24606598213743242, + "grad_norm": 0.4960094094276428, + "learning_rate": 8.604728238751349e-05, + "loss": 1.2317, + "step": 4050 + }, + { + "epoch": 0.24612673917005892, + "grad_norm": 0.17118974030017853, + "learning_rate": 8.604064787548839e-05, + "loss": 1.1164, + "step": 4051 + }, + { + "epoch": 0.24618749620268546, + "grad_norm": 0.20355871319770813, + "learning_rate": 8.603401204238637e-05, + "loss": 1.1536, + "step": 4052 + }, + { + "epoch": 0.246248253235312, + "grad_norm": 0.1677030324935913, + "learning_rate": 8.602737488845067e-05, + "loss": 1.1402, + "step": 4053 + }, + { + "epoch": 0.24630901026793853, + "grad_norm": 0.1634184718132019, + "learning_rate": 8.602073641392458e-05, + "loss": 1.0338, + "step": 4054 + }, + { + "epoch": 0.24636976730056503, + "grad_norm": 0.19681808352470398, + "learning_rate": 8.601409661905144e-05, + "loss": 1.1602, + "step": 4055 + }, + { + "epoch": 0.24643052433319157, + "grad_norm": 0.17885880172252655, + "learning_rate": 8.600745550407464e-05, + "loss": 1.1191, + "step": 4056 + }, + { + "epoch": 0.2464912813658181, + "grad_norm": 0.20643024146556854, + "learning_rate": 8.600081306923758e-05, + "loss": 1.1407, + "step": 4057 + }, + { + "epoch": 0.2465520383984446, + "grad_norm": 0.20853880047798157, + "learning_rate": 8.599416931478374e-05, + "loss": 1.2485, + "step": 4058 + }, + { + "epoch": 0.24661279543107115, + "grad_norm": 0.15868984162807465, + "learning_rate": 8.598752424095669e-05, + "loss": 1.0705, + "step": 4059 + }, + { + "epoch": 0.24667355246369768, + "grad_norm": 0.8027739524841309, + "learning_rate": 8.598087784799998e-05, + "loss": 1.2232, + "step": 4060 + }, + { + "epoch": 0.2467343094963242, + "grad_norm": 0.13661803305149078, + "learning_rate": 8.597423013615724e-05, + "loss": 1.0464, + "step": 4061 + }, + { + "epoch": 0.24679506652895072, + "grad_norm": 0.25538215041160583, + "learning_rate": 8.596758110567214e-05, + "loss": 1.164, + "step": 4062 + }, + { + "epoch": 0.24685582356157726, + "grad_norm": 0.2539287507534027, + "learning_rate": 8.59609307567884e-05, + "loss": 1.0747, + "step": 4063 + }, + { + "epoch": 0.24691658059420377, + "grad_norm": 0.7665238976478577, + "learning_rate": 8.595427908974979e-05, + "loss": 1.1589, + "step": 4064 + }, + { + "epoch": 0.2469773376268303, + "grad_norm": 0.19996581971645355, + "learning_rate": 8.594762610480012e-05, + "loss": 1.2087, + "step": 4065 + }, + { + "epoch": 0.24703809465945684, + "grad_norm": 5.7554497718811035, + "learning_rate": 8.594097180218329e-05, + "loss": 1.1282, + "step": 4066 + }, + { + "epoch": 0.24709885169208337, + "grad_norm": 0.2446056306362152, + "learning_rate": 8.593431618214318e-05, + "loss": 1.2048, + "step": 4067 + }, + { + "epoch": 0.24715960872470988, + "grad_norm": 0.1952815055847168, + "learning_rate": 8.592765924492377e-05, + "loss": 1.1804, + "step": 4068 + }, + { + "epoch": 0.2472203657573364, + "grad_norm": 0.20773836970329285, + "learning_rate": 8.592100099076905e-05, + "loss": 1.0441, + "step": 4069 + }, + { + "epoch": 0.24728112278996295, + "grad_norm": 0.4322708547115326, + "learning_rate": 8.59143414199231e-05, + "loss": 1.2189, + "step": 4070 + }, + { + "epoch": 0.24734187982258946, + "grad_norm": 0.18040430545806885, + "learning_rate": 8.590768053263005e-05, + "loss": 1.1488, + "step": 4071 + }, + { + "epoch": 0.247402636855216, + "grad_norm": 0.28984418511390686, + "learning_rate": 8.5901018329134e-05, + "loss": 1.1822, + "step": 4072 + }, + { + "epoch": 0.24746339388784253, + "grad_norm": 0.24308879673480988, + "learning_rate": 8.589435480967918e-05, + "loss": 1.2023, + "step": 4073 + }, + { + "epoch": 0.24752415092046903, + "grad_norm": 0.1683763712644577, + "learning_rate": 8.588768997450987e-05, + "loss": 1.2014, + "step": 4074 + }, + { + "epoch": 0.24758490795309557, + "grad_norm": 0.2185237556695938, + "learning_rate": 8.588102382387033e-05, + "loss": 1.1085, + "step": 4075 + }, + { + "epoch": 0.2476456649857221, + "grad_norm": 0.3711228668689728, + "learning_rate": 8.587435635800494e-05, + "loss": 1.1659, + "step": 4076 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 0.14320625364780426, + "learning_rate": 8.586768757715806e-05, + "loss": 1.0488, + "step": 4077 + }, + { + "epoch": 0.24776717905097514, + "grad_norm": 0.21578729152679443, + "learning_rate": 8.586101748157418e-05, + "loss": 1.0628, + "step": 4078 + }, + { + "epoch": 0.24782793608360168, + "grad_norm": 0.15633291006088257, + "learning_rate": 8.585434607149776e-05, + "loss": 1.0938, + "step": 4079 + }, + { + "epoch": 0.24788869311622821, + "grad_norm": 0.18531322479248047, + "learning_rate": 8.584767334717337e-05, + "loss": 1.0921, + "step": 4080 + }, + { + "epoch": 0.24794945014885472, + "grad_norm": 0.23115776479244232, + "learning_rate": 8.584099930884557e-05, + "loss": 1.135, + "step": 4081 + }, + { + "epoch": 0.24801020718148126, + "grad_norm": 0.14599020779132843, + "learning_rate": 8.583432395675901e-05, + "loss": 1.0658, + "step": 4082 + }, + { + "epoch": 0.2480709642141078, + "grad_norm": 0.14802511036396027, + "learning_rate": 8.582764729115838e-05, + "loss": 1.0776, + "step": 4083 + }, + { + "epoch": 0.2481317212467343, + "grad_norm": 0.20576101541519165, + "learning_rate": 8.582096931228842e-05, + "loss": 1.2338, + "step": 4084 + }, + { + "epoch": 0.24819247827936083, + "grad_norm": 0.238702192902565, + "learning_rate": 8.581429002039392e-05, + "loss": 1.2124, + "step": 4085 + }, + { + "epoch": 0.24825323531198737, + "grad_norm": 0.2632155418395996, + "learning_rate": 8.580760941571967e-05, + "loss": 1.1338, + "step": 4086 + }, + { + "epoch": 0.24831399234461388, + "grad_norm": 0.18265683948993683, + "learning_rate": 8.580092749851058e-05, + "loss": 1.1111, + "step": 4087 + }, + { + "epoch": 0.2483747493772404, + "grad_norm": 0.1798429787158966, + "learning_rate": 8.579424426901159e-05, + "loss": 1.1069, + "step": 4088 + }, + { + "epoch": 0.24843550640986695, + "grad_norm": 1.7813425064086914, + "learning_rate": 8.578755972746763e-05, + "loss": 1.244, + "step": 4089 + }, + { + "epoch": 0.24849626344249348, + "grad_norm": 0.19594089686870575, + "learning_rate": 8.578087387412377e-05, + "loss": 1.0355, + "step": 4090 + }, + { + "epoch": 0.24855702047512, + "grad_norm": 0.18351732194423676, + "learning_rate": 8.577418670922506e-05, + "loss": 1.1217, + "step": 4091 + }, + { + "epoch": 0.24861777750774652, + "grad_norm": 0.15555503964424133, + "learning_rate": 8.576749823301662e-05, + "loss": 1.1005, + "step": 4092 + }, + { + "epoch": 0.24867853454037306, + "grad_norm": 0.15773619711399078, + "learning_rate": 8.576080844574362e-05, + "loss": 1.1117, + "step": 4093 + }, + { + "epoch": 0.24873929157299957, + "grad_norm": 0.24765901267528534, + "learning_rate": 8.575411734765128e-05, + "loss": 1.0443, + "step": 4094 + }, + { + "epoch": 0.2488000486056261, + "grad_norm": 0.16648131608963013, + "learning_rate": 8.574742493898484e-05, + "loss": 1.0965, + "step": 4095 + }, + { + "epoch": 0.24886080563825264, + "grad_norm": 0.14474792778491974, + "learning_rate": 8.574073121998964e-05, + "loss": 1.1223, + "step": 4096 + }, + { + "epoch": 0.24892156267087914, + "grad_norm": 0.1604004204273224, + "learning_rate": 8.573403619091103e-05, + "loss": 1.0575, + "step": 4097 + }, + { + "epoch": 0.24898231970350568, + "grad_norm": 0.2689473330974579, + "learning_rate": 8.572733985199442e-05, + "loss": 1.1346, + "step": 4098 + }, + { + "epoch": 0.2490430767361322, + "grad_norm": 0.16532671451568604, + "learning_rate": 8.572064220348525e-05, + "loss": 1.1929, + "step": 4099 + }, + { + "epoch": 0.24910383376875872, + "grad_norm": 0.12528428435325623, + "learning_rate": 8.571394324562905e-05, + "loss": 1.0594, + "step": 4100 + }, + { + "epoch": 0.24916459080138526, + "grad_norm": 0.316062331199646, + "learning_rate": 8.570724297867135e-05, + "loss": 1.0969, + "step": 4101 + }, + { + "epoch": 0.2492253478340118, + "grad_norm": 0.17280779778957367, + "learning_rate": 8.570054140285775e-05, + "loss": 1.1598, + "step": 4102 + }, + { + "epoch": 0.24928610486663833, + "grad_norm": 0.23015287518501282, + "learning_rate": 8.56938385184339e-05, + "loss": 1.2757, + "step": 4103 + }, + { + "epoch": 0.24934686189926483, + "grad_norm": 0.32703980803489685, + "learning_rate": 8.568713432564552e-05, + "loss": 1.103, + "step": 4104 + }, + { + "epoch": 0.24940761893189137, + "grad_norm": 0.24935807287693024, + "learning_rate": 8.56804288247383e-05, + "loss": 1.0897, + "step": 4105 + }, + { + "epoch": 0.2494683759645179, + "grad_norm": 0.24203459918498993, + "learning_rate": 8.56737220159581e-05, + "loss": 1.1474, + "step": 4106 + }, + { + "epoch": 0.2495291329971444, + "grad_norm": 0.1566619873046875, + "learning_rate": 8.56670138995507e-05, + "loss": 1.126, + "step": 4107 + }, + { + "epoch": 0.24958989002977094, + "grad_norm": 0.26541823148727417, + "learning_rate": 8.566030447576203e-05, + "loss": 1.1483, + "step": 4108 + }, + { + "epoch": 0.24965064706239748, + "grad_norm": 0.289373517036438, + "learning_rate": 8.565359374483799e-05, + "loss": 1.2045, + "step": 4109 + }, + { + "epoch": 0.249711404095024, + "grad_norm": 0.17153102159500122, + "learning_rate": 8.56468817070246e-05, + "loss": 1.1377, + "step": 4110 + }, + { + "epoch": 0.24977216112765052, + "grad_norm": 0.28802037239074707, + "learning_rate": 8.564016836256785e-05, + "loss": 1.2679, + "step": 4111 + }, + { + "epoch": 0.24983291816027706, + "grad_norm": 1.233233094215393, + "learning_rate": 8.563345371171385e-05, + "loss": 1.1001, + "step": 4112 + }, + { + "epoch": 0.2498936751929036, + "grad_norm": 0.38556793332099915, + "learning_rate": 8.56267377547087e-05, + "loss": 1.0648, + "step": 4113 + }, + { + "epoch": 0.2499544322255301, + "grad_norm": 0.19476401805877686, + "learning_rate": 8.562002049179862e-05, + "loss": 1.1667, + "step": 4114 + }, + { + "epoch": 0.25001518925815663, + "grad_norm": 0.19244134426116943, + "learning_rate": 8.561330192322978e-05, + "loss": 1.1302, + "step": 4115 + }, + { + "epoch": 0.25007594629078317, + "grad_norm": 0.4519463777542114, + "learning_rate": 8.560658204924849e-05, + "loss": 1.0912, + "step": 4116 + }, + { + "epoch": 0.2501367033234097, + "grad_norm": 0.24016918241977692, + "learning_rate": 8.559986087010104e-05, + "loss": 1.1388, + "step": 4117 + }, + { + "epoch": 0.25019746035603624, + "grad_norm": 0.3761630952358246, + "learning_rate": 8.559313838603381e-05, + "loss": 1.2035, + "step": 4118 + }, + { + "epoch": 0.2502582173886627, + "grad_norm": 0.3559887707233429, + "learning_rate": 8.558641459729321e-05, + "loss": 1.131, + "step": 4119 + }, + { + "epoch": 0.25031897442128925, + "grad_norm": 0.20240308344364166, + "learning_rate": 8.557968950412571e-05, + "loss": 1.0828, + "step": 4120 + }, + { + "epoch": 0.2503797314539158, + "grad_norm": 0.2807188332080841, + "learning_rate": 8.557296310677783e-05, + "loss": 1.1277, + "step": 4121 + }, + { + "epoch": 0.2504404884865423, + "grad_norm": 0.29699328541755676, + "learning_rate": 8.556623540549608e-05, + "loss": 1.039, + "step": 4122 + }, + { + "epoch": 0.25050124551916886, + "grad_norm": 0.22494173049926758, + "learning_rate": 8.555950640052713e-05, + "loss": 1.0894, + "step": 4123 + }, + { + "epoch": 0.2505620025517954, + "grad_norm": 0.3755808472633362, + "learning_rate": 8.555277609211757e-05, + "loss": 1.0686, + "step": 4124 + }, + { + "epoch": 0.2506227595844219, + "grad_norm": 0.28935179114341736, + "learning_rate": 8.554604448051413e-05, + "loss": 1.0538, + "step": 4125 + }, + { + "epoch": 0.2506835166170484, + "grad_norm": 0.32570749521255493, + "learning_rate": 8.553931156596358e-05, + "loss": 1.2517, + "step": 4126 + }, + { + "epoch": 0.25074427364967494, + "grad_norm": 0.41436418890953064, + "learning_rate": 8.553257734871267e-05, + "loss": 1.0494, + "step": 4127 + }, + { + "epoch": 0.2508050306823015, + "grad_norm": 0.23903585970401764, + "learning_rate": 8.55258418290083e-05, + "loss": 1.0858, + "step": 4128 + }, + { + "epoch": 0.250865787714928, + "grad_norm": 0.29696860909461975, + "learning_rate": 8.551910500709732e-05, + "loss": 1.1465, + "step": 4129 + }, + { + "epoch": 0.25092654474755455, + "grad_norm": 0.3306981027126312, + "learning_rate": 8.551236688322666e-05, + "loss": 1.2771, + "step": 4130 + }, + { + "epoch": 0.2509873017801811, + "grad_norm": 2.7766425609588623, + "learning_rate": 8.550562745764334e-05, + "loss": 1.0992, + "step": 4131 + }, + { + "epoch": 0.25104805881280756, + "grad_norm": 0.20471929013729095, + "learning_rate": 8.549888673059439e-05, + "loss": 1.0946, + "step": 4132 + }, + { + "epoch": 0.2511088158454341, + "grad_norm": 0.22637993097305298, + "learning_rate": 8.549214470232688e-05, + "loss": 1.1798, + "step": 4133 + }, + { + "epoch": 0.25116957287806063, + "grad_norm": 0.19550451636314392, + "learning_rate": 8.548540137308795e-05, + "loss": 1.1265, + "step": 4134 + }, + { + "epoch": 0.25123032991068717, + "grad_norm": 0.2085788995027542, + "learning_rate": 8.547865674312476e-05, + "loss": 1.1003, + "step": 4135 + }, + { + "epoch": 0.2512910869433137, + "grad_norm": 0.1719086915254593, + "learning_rate": 8.547191081268456e-05, + "loss": 1.1077, + "step": 4136 + }, + { + "epoch": 0.25135184397594024, + "grad_norm": 0.21202941238880157, + "learning_rate": 8.546516358201462e-05, + "loss": 1.109, + "step": 4137 + }, + { + "epoch": 0.2514126010085667, + "grad_norm": 0.3908483386039734, + "learning_rate": 8.545841505136224e-05, + "loss": 1.3424, + "step": 4138 + }, + { + "epoch": 0.25147335804119325, + "grad_norm": 0.192022442817688, + "learning_rate": 8.545166522097481e-05, + "loss": 1.1281, + "step": 4139 + }, + { + "epoch": 0.2515341150738198, + "grad_norm": 0.20203399658203125, + "learning_rate": 8.544491409109973e-05, + "loss": 1.1444, + "step": 4140 + }, + { + "epoch": 0.2515948721064463, + "grad_norm": 0.19474977254867554, + "learning_rate": 8.543816166198449e-05, + "loss": 1.1229, + "step": 4141 + }, + { + "epoch": 0.25165562913907286, + "grad_norm": 6.34874963760376, + "learning_rate": 8.543140793387657e-05, + "loss": 1.1531, + "step": 4142 + }, + { + "epoch": 0.2517163861716994, + "grad_norm": 1.0578254461288452, + "learning_rate": 8.542465290702353e-05, + "loss": 1.1249, + "step": 4143 + }, + { + "epoch": 0.2517771432043259, + "grad_norm": 0.3454793393611908, + "learning_rate": 8.541789658167301e-05, + "loss": 1.0144, + "step": 4144 + }, + { + "epoch": 0.2518379002369524, + "grad_norm": 0.244012251496315, + "learning_rate": 8.541113895807264e-05, + "loss": 1.1771, + "step": 4145 + }, + { + "epoch": 0.25189865726957894, + "grad_norm": 0.3643401265144348, + "learning_rate": 8.540438003647013e-05, + "loss": 1.0516, + "step": 4146 + }, + { + "epoch": 0.2519594143022055, + "grad_norm": 0.1746654510498047, + "learning_rate": 8.539761981711321e-05, + "loss": 1.0609, + "step": 4147 + }, + { + "epoch": 0.252020171334832, + "grad_norm": 0.17908631265163422, + "learning_rate": 8.539085830024971e-05, + "loss": 1.166, + "step": 4148 + }, + { + "epoch": 0.25208092836745855, + "grad_norm": 0.29916009306907654, + "learning_rate": 8.538409548612746e-05, + "loss": 1.2271, + "step": 4149 + }, + { + "epoch": 0.2521416854000851, + "grad_norm": 0.17699405550956726, + "learning_rate": 8.537733137499435e-05, + "loss": 1.1746, + "step": 4150 + }, + { + "epoch": 0.25220244243271156, + "grad_norm": 0.45046988129615784, + "learning_rate": 8.537056596709832e-05, + "loss": 1.1261, + "step": 4151 + }, + { + "epoch": 0.2522631994653381, + "grad_norm": 0.13706566393375397, + "learning_rate": 8.536379926268734e-05, + "loss": 1.0986, + "step": 4152 + }, + { + "epoch": 0.25232395649796463, + "grad_norm": 3.5723299980163574, + "learning_rate": 8.535703126200948e-05, + "loss": 1.105, + "step": 4153 + }, + { + "epoch": 0.25238471353059116, + "grad_norm": 0.255502849817276, + "learning_rate": 8.535026196531279e-05, + "loss": 1.137, + "step": 4154 + }, + { + "epoch": 0.2524454705632177, + "grad_norm": 0.5019230246543884, + "learning_rate": 8.534349137284543e-05, + "loss": 1.2151, + "step": 4155 + }, + { + "epoch": 0.25250622759584423, + "grad_norm": 0.15695320069789886, + "learning_rate": 8.533671948485555e-05, + "loss": 1.0984, + "step": 4156 + }, + { + "epoch": 0.25256698462847077, + "grad_norm": 0.23317253589630127, + "learning_rate": 8.53299463015914e-05, + "loss": 1.0763, + "step": 4157 + }, + { + "epoch": 0.25262774166109725, + "grad_norm": 0.1939711719751358, + "learning_rate": 8.532317182330123e-05, + "loss": 1.2722, + "step": 4158 + }, + { + "epoch": 0.2526884986937238, + "grad_norm": 0.2504059374332428, + "learning_rate": 8.531639605023339e-05, + "loss": 1.1656, + "step": 4159 + }, + { + "epoch": 0.2527492557263503, + "grad_norm": 0.1749381721019745, + "learning_rate": 8.53096189826362e-05, + "loss": 1.0923, + "step": 4160 + }, + { + "epoch": 0.25281001275897685, + "grad_norm": 0.21789798140525818, + "learning_rate": 8.53028406207581e-05, + "loss": 1.1331, + "step": 4161 + }, + { + "epoch": 0.2528707697916034, + "grad_norm": 0.17451786994934082, + "learning_rate": 8.529606096484757e-05, + "loss": 1.0753, + "step": 4162 + }, + { + "epoch": 0.2529315268242299, + "grad_norm": 0.2880437672138214, + "learning_rate": 8.528928001515309e-05, + "loss": 1.1936, + "step": 4163 + }, + { + "epoch": 0.2529922838568564, + "grad_norm": 0.15425370633602142, + "learning_rate": 8.528249777192324e-05, + "loss": 1.1206, + "step": 4164 + }, + { + "epoch": 0.25305304088948294, + "grad_norm": 0.14943325519561768, + "learning_rate": 8.527571423540662e-05, + "loss": 1.057, + "step": 4165 + }, + { + "epoch": 0.2531137979221095, + "grad_norm": 0.17268097400665283, + "learning_rate": 8.526892940585185e-05, + "loss": 1.2054, + "step": 4166 + }, + { + "epoch": 0.253174554954736, + "grad_norm": 3.0992114543914795, + "learning_rate": 8.526214328350767e-05, + "loss": 1.065, + "step": 4167 + }, + { + "epoch": 0.25323531198736254, + "grad_norm": 0.2546354830265045, + "learning_rate": 8.525535586862281e-05, + "loss": 1.1614, + "step": 4168 + }, + { + "epoch": 0.2532960690199891, + "grad_norm": 0.16679732501506805, + "learning_rate": 8.524856716144607e-05, + "loss": 1.0882, + "step": 4169 + }, + { + "epoch": 0.2533568260526156, + "grad_norm": 0.19214554131031036, + "learning_rate": 8.524177716222628e-05, + "loss": 1.1004, + "step": 4170 + }, + { + "epoch": 0.2534175830852421, + "grad_norm": 0.18400248885154724, + "learning_rate": 8.523498587121235e-05, + "loss": 1.0916, + "step": 4171 + }, + { + "epoch": 0.25347834011786863, + "grad_norm": 0.2818697392940521, + "learning_rate": 8.52281932886532e-05, + "loss": 1.2159, + "step": 4172 + }, + { + "epoch": 0.25353909715049516, + "grad_norm": 0.41420456767082214, + "learning_rate": 8.52213994147978e-05, + "loss": 1.1639, + "step": 4173 + }, + { + "epoch": 0.2535998541831217, + "grad_norm": 1.186070203781128, + "learning_rate": 8.52146042498952e-05, + "loss": 1.1032, + "step": 4174 + }, + { + "epoch": 0.25366061121574823, + "grad_norm": 0.34048500657081604, + "learning_rate": 8.520780779419448e-05, + "loss": 1.0767, + "step": 4175 + }, + { + "epoch": 0.25372136824837477, + "grad_norm": 0.21824349462985992, + "learning_rate": 8.520101004794476e-05, + "loss": 1.2362, + "step": 4176 + }, + { + "epoch": 0.2537821252810013, + "grad_norm": 0.31136640906333923, + "learning_rate": 8.519421101139522e-05, + "loss": 1.1252, + "step": 4177 + }, + { + "epoch": 0.2538428823136278, + "grad_norm": 0.17634399235248566, + "learning_rate": 8.518741068479506e-05, + "loss": 1.1509, + "step": 4178 + }, + { + "epoch": 0.2539036393462543, + "grad_norm": 0.1874326914548874, + "learning_rate": 8.518060906839356e-05, + "loss": 1.1848, + "step": 4179 + }, + { + "epoch": 0.25396439637888085, + "grad_norm": 0.2158823311328888, + "learning_rate": 8.517380616244005e-05, + "loss": 1.1532, + "step": 4180 + }, + { + "epoch": 0.2540251534115074, + "grad_norm": 0.21650990843772888, + "learning_rate": 8.516700196718385e-05, + "loss": 1.1425, + "step": 4181 + }, + { + "epoch": 0.2540859104441339, + "grad_norm": 0.27733394503593445, + "learning_rate": 8.516019648287441e-05, + "loss": 1.116, + "step": 4182 + }, + { + "epoch": 0.25414666747676046, + "grad_norm": 4.468677043914795, + "learning_rate": 8.515338970976117e-05, + "loss": 1.1007, + "step": 4183 + }, + { + "epoch": 0.25420742450938694, + "grad_norm": 0.22979959845542908, + "learning_rate": 8.514658164809364e-05, + "loss": 1.0845, + "step": 4184 + }, + { + "epoch": 0.25426818154201347, + "grad_norm": 0.22904656827449799, + "learning_rate": 8.513977229812135e-05, + "loss": 1.0828, + "step": 4185 + }, + { + "epoch": 0.25432893857464, + "grad_norm": 0.20850655436515808, + "learning_rate": 8.513296166009395e-05, + "loss": 1.0891, + "step": 4186 + }, + { + "epoch": 0.25438969560726654, + "grad_norm": 0.1685955971479416, + "learning_rate": 8.512614973426101e-05, + "loss": 1.1043, + "step": 4187 + }, + { + "epoch": 0.2544504526398931, + "grad_norm": 0.2577073276042938, + "learning_rate": 8.511933652087227e-05, + "loss": 1.099, + "step": 4188 + }, + { + "epoch": 0.2545112096725196, + "grad_norm": 0.7813034653663635, + "learning_rate": 8.511252202017747e-05, + "loss": 1.1036, + "step": 4189 + }, + { + "epoch": 0.25457196670514615, + "grad_norm": 0.18157364428043365, + "learning_rate": 8.51057062324264e-05, + "loss": 1.2152, + "step": 4190 + }, + { + "epoch": 0.2546327237377726, + "grad_norm": 0.23822636902332306, + "learning_rate": 8.509888915786886e-05, + "loss": 1.1606, + "step": 4191 + }, + { + "epoch": 0.25469348077039916, + "grad_norm": 0.24963602423667908, + "learning_rate": 8.509207079675478e-05, + "loss": 1.2555, + "step": 4192 + }, + { + "epoch": 0.2547542378030257, + "grad_norm": 0.16131234169006348, + "learning_rate": 8.508525114933404e-05, + "loss": 1.0895, + "step": 4193 + }, + { + "epoch": 0.25481499483565223, + "grad_norm": 0.16613486409187317, + "learning_rate": 8.507843021585665e-05, + "loss": 1.0926, + "step": 4194 + }, + { + "epoch": 0.25487575186827877, + "grad_norm": 0.24539700150489807, + "learning_rate": 8.50716079965726e-05, + "loss": 1.0889, + "step": 4195 + }, + { + "epoch": 0.2549365089009053, + "grad_norm": 0.16711640357971191, + "learning_rate": 8.5064784491732e-05, + "loss": 1.0738, + "step": 4196 + }, + { + "epoch": 0.2549972659335318, + "grad_norm": 0.20085102319717407, + "learning_rate": 8.505795970158493e-05, + "loss": 1.0931, + "step": 4197 + }, + { + "epoch": 0.2550580229661583, + "grad_norm": 0.2539522647857666, + "learning_rate": 8.505113362638158e-05, + "loss": 1.1715, + "step": 4198 + }, + { + "epoch": 0.25511877999878485, + "grad_norm": 0.39260950684547424, + "learning_rate": 8.504430626637215e-05, + "loss": 1.1744, + "step": 4199 + }, + { + "epoch": 0.2551795370314114, + "grad_norm": 0.27809226512908936, + "learning_rate": 8.503747762180691e-05, + "loss": 1.1154, + "step": 4200 + }, + { + "epoch": 0.2552402940640379, + "grad_norm": 0.21992912888526917, + "learning_rate": 8.503064769293614e-05, + "loss": 1.1283, + "step": 4201 + }, + { + "epoch": 0.25530105109666446, + "grad_norm": 0.26056337356567383, + "learning_rate": 8.50238164800102e-05, + "loss": 1.1155, + "step": 4202 + }, + { + "epoch": 0.255361808129291, + "grad_norm": 0.15038877725601196, + "learning_rate": 8.501698398327951e-05, + "loss": 1.0914, + "step": 4203 + }, + { + "epoch": 0.25542256516191747, + "grad_norm": 0.24054546654224396, + "learning_rate": 8.501015020299452e-05, + "loss": 1.3729, + "step": 4204 + }, + { + "epoch": 0.255483322194544, + "grad_norm": 0.17988722026348114, + "learning_rate": 8.500331513940569e-05, + "loss": 1.1602, + "step": 4205 + }, + { + "epoch": 0.25554407922717054, + "grad_norm": 0.12434462457895279, + "learning_rate": 8.499647879276359e-05, + "loss": 1.0818, + "step": 4206 + }, + { + "epoch": 0.2556048362597971, + "grad_norm": 0.18362249433994293, + "learning_rate": 8.498964116331876e-05, + "loss": 1.1345, + "step": 4207 + }, + { + "epoch": 0.2556655932924236, + "grad_norm": 0.22588826715946198, + "learning_rate": 8.49828022513219e-05, + "loss": 1.2381, + "step": 4208 + }, + { + "epoch": 0.25572635032505014, + "grad_norm": 0.1283007562160492, + "learning_rate": 8.497596205702366e-05, + "loss": 1.1278, + "step": 4209 + }, + { + "epoch": 0.2557871073576766, + "grad_norm": 0.9142860174179077, + "learning_rate": 8.496912058067476e-05, + "loss": 1.2001, + "step": 4210 + }, + { + "epoch": 0.25584786439030316, + "grad_norm": 0.17875459790229797, + "learning_rate": 8.496227782252599e-05, + "loss": 1.0761, + "step": 4211 + }, + { + "epoch": 0.2559086214229297, + "grad_norm": 0.12321389466524124, + "learning_rate": 8.495543378282818e-05, + "loss": 1.0366, + "step": 4212 + }, + { + "epoch": 0.25596937845555623, + "grad_norm": 0.1925307661294937, + "learning_rate": 8.494858846183217e-05, + "loss": 1.0533, + "step": 4213 + }, + { + "epoch": 0.25603013548818276, + "grad_norm": 0.868889331817627, + "learning_rate": 8.494174185978889e-05, + "loss": 1.1255, + "step": 4214 + }, + { + "epoch": 0.2560908925208093, + "grad_norm": 0.7886120080947876, + "learning_rate": 8.493489397694932e-05, + "loss": 1.1666, + "step": 4215 + }, + { + "epoch": 0.25615164955343583, + "grad_norm": 0.2437831610441208, + "learning_rate": 8.492804481356446e-05, + "loss": 1.2113, + "step": 4216 + }, + { + "epoch": 0.2562124065860623, + "grad_norm": 0.255470871925354, + "learning_rate": 8.492119436988538e-05, + "loss": 1.0551, + "step": 4217 + }, + { + "epoch": 0.25627316361868885, + "grad_norm": 0.240932896733284, + "learning_rate": 8.491434264616313e-05, + "loss": 1.1391, + "step": 4218 + }, + { + "epoch": 0.2563339206513154, + "grad_norm": 0.220208078622818, + "learning_rate": 8.490748964264894e-05, + "loss": 1.1684, + "step": 4219 + }, + { + "epoch": 0.2563946776839419, + "grad_norm": 0.21479731798171997, + "learning_rate": 8.490063535959396e-05, + "loss": 1.1756, + "step": 4220 + }, + { + "epoch": 0.25645543471656845, + "grad_norm": 0.38608306646347046, + "learning_rate": 8.489377979724944e-05, + "loss": 1.1017, + "step": 4221 + }, + { + "epoch": 0.256516191749195, + "grad_norm": 0.2426665723323822, + "learning_rate": 8.488692295586668e-05, + "loss": 1.0529, + "step": 4222 + }, + { + "epoch": 0.2565769487818215, + "grad_norm": 0.1743011623620987, + "learning_rate": 8.4880064835697e-05, + "loss": 1.0883, + "step": 4223 + }, + { + "epoch": 0.256637705814448, + "grad_norm": 0.8394467234611511, + "learning_rate": 8.487320543699183e-05, + "loss": 1.1242, + "step": 4224 + }, + { + "epoch": 0.25669846284707454, + "grad_norm": 0.21643461287021637, + "learning_rate": 8.486634476000255e-05, + "loss": 1.2507, + "step": 4225 + }, + { + "epoch": 0.2567592198797011, + "grad_norm": 0.14840495586395264, + "learning_rate": 8.485948280498067e-05, + "loss": 1.0298, + "step": 4226 + }, + { + "epoch": 0.2568199769123276, + "grad_norm": 0.20398449897766113, + "learning_rate": 8.485261957217771e-05, + "loss": 1.1437, + "step": 4227 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 0.19567550718784332, + "learning_rate": 8.484575506184525e-05, + "loss": 1.1123, + "step": 4228 + }, + { + "epoch": 0.2569414909775807, + "grad_norm": 0.27100253105163574, + "learning_rate": 8.483888927423489e-05, + "loss": 1.093, + "step": 4229 + }, + { + "epoch": 0.25700224801020716, + "grad_norm": 0.1475774496793747, + "learning_rate": 8.483202220959831e-05, + "loss": 1.0509, + "step": 4230 + }, + { + "epoch": 0.2570630050428337, + "grad_norm": 0.13975857198238373, + "learning_rate": 8.482515386818724e-05, + "loss": 1.0791, + "step": 4231 + }, + { + "epoch": 0.2571237620754602, + "grad_norm": 0.8387301564216614, + "learning_rate": 8.48182842502534e-05, + "loss": 1.1257, + "step": 4232 + }, + { + "epoch": 0.25718451910808676, + "grad_norm": 0.14440058171749115, + "learning_rate": 8.481141335604863e-05, + "loss": 1.0842, + "step": 4233 + }, + { + "epoch": 0.2572452761407133, + "grad_norm": 0.16440021991729736, + "learning_rate": 8.480454118582479e-05, + "loss": 1.0561, + "step": 4234 + }, + { + "epoch": 0.25730603317333983, + "grad_norm": 1.0560884475708008, + "learning_rate": 8.479766773983374e-05, + "loss": 1.0607, + "step": 4235 + }, + { + "epoch": 0.25736679020596637, + "grad_norm": 0.4002532958984375, + "learning_rate": 8.479079301832746e-05, + "loss": 1.1793, + "step": 4236 + }, + { + "epoch": 0.25742754723859285, + "grad_norm": 0.1388968974351883, + "learning_rate": 8.478391702155795e-05, + "loss": 1.1066, + "step": 4237 + }, + { + "epoch": 0.2574883042712194, + "grad_norm": 0.31794053316116333, + "learning_rate": 8.477703974977722e-05, + "loss": 1.1935, + "step": 4238 + }, + { + "epoch": 0.2575490613038459, + "grad_norm": 0.1266797035932541, + "learning_rate": 8.47701612032374e-05, + "loss": 1.0229, + "step": 4239 + }, + { + "epoch": 0.25760981833647245, + "grad_norm": 1.874619483947754, + "learning_rate": 8.476328138219058e-05, + "loss": 1.1506, + "step": 4240 + }, + { + "epoch": 0.257670575369099, + "grad_norm": 0.40473389625549316, + "learning_rate": 8.475640028688896e-05, + "loss": 1.2329, + "step": 4241 + }, + { + "epoch": 0.2577313324017255, + "grad_norm": 0.13560648262500763, + "learning_rate": 8.474951791758478e-05, + "loss": 1.0643, + "step": 4242 + }, + { + "epoch": 0.257792089434352, + "grad_norm": 0.16973525285720825, + "learning_rate": 8.474263427453031e-05, + "loss": 1.1239, + "step": 4243 + }, + { + "epoch": 0.25785284646697854, + "grad_norm": 0.24408873915672302, + "learning_rate": 8.473574935797786e-05, + "loss": 1.2423, + "step": 4244 + }, + { + "epoch": 0.25791360349960507, + "grad_norm": 0.2804044187068939, + "learning_rate": 8.472886316817979e-05, + "loss": 1.1492, + "step": 4245 + }, + { + "epoch": 0.2579743605322316, + "grad_norm": 0.2863284945487976, + "learning_rate": 8.472197570538853e-05, + "loss": 1.0617, + "step": 4246 + }, + { + "epoch": 0.25803511756485814, + "grad_norm": 0.17607618868350983, + "learning_rate": 8.471508696985655e-05, + "loss": 1.0959, + "step": 4247 + }, + { + "epoch": 0.2580958745974847, + "grad_norm": 0.23290187120437622, + "learning_rate": 8.470819696183634e-05, + "loss": 1.2404, + "step": 4248 + }, + { + "epoch": 0.2581566316301112, + "grad_norm": 0.4581441879272461, + "learning_rate": 8.470130568158047e-05, + "loss": 1.1571, + "step": 4249 + }, + { + "epoch": 0.2582173886627377, + "grad_norm": 0.16368238627910614, + "learning_rate": 8.469441312934153e-05, + "loss": 1.098, + "step": 4250 + }, + { + "epoch": 0.2582781456953642, + "grad_norm": 0.18559230864048004, + "learning_rate": 8.468751930537215e-05, + "loss": 1.0555, + "step": 4251 + }, + { + "epoch": 0.25833890272799076, + "grad_norm": 0.24541161954402924, + "learning_rate": 8.468062420992506e-05, + "loss": 1.1215, + "step": 4252 + }, + { + "epoch": 0.2583996597606173, + "grad_norm": 0.17333202064037323, + "learning_rate": 8.467372784325299e-05, + "loss": 1.1694, + "step": 4253 + }, + { + "epoch": 0.25846041679324383, + "grad_norm": 0.2998686730861664, + "learning_rate": 8.466683020560873e-05, + "loss": 1.1827, + "step": 4254 + }, + { + "epoch": 0.25852117382587037, + "grad_norm": 0.23007234930992126, + "learning_rate": 8.465993129724508e-05, + "loss": 1.1592, + "step": 4255 + }, + { + "epoch": 0.25858193085849684, + "grad_norm": 0.20353084802627563, + "learning_rate": 8.465303111841498e-05, + "loss": 1.2069, + "step": 4256 + }, + { + "epoch": 0.2586426878911234, + "grad_norm": 0.47833624482154846, + "learning_rate": 8.464612966937131e-05, + "loss": 1.2244, + "step": 4257 + }, + { + "epoch": 0.2587034449237499, + "grad_norm": 0.16635839641094208, + "learning_rate": 8.463922695036706e-05, + "loss": 1.0911, + "step": 4258 + }, + { + "epoch": 0.25876420195637645, + "grad_norm": 0.639962911605835, + "learning_rate": 8.463232296165526e-05, + "loss": 1.0513, + "step": 4259 + }, + { + "epoch": 0.258824958989003, + "grad_norm": 0.32631582021713257, + "learning_rate": 8.462541770348895e-05, + "loss": 1.2023, + "step": 4260 + }, + { + "epoch": 0.2588857160216295, + "grad_norm": 4.5949225425720215, + "learning_rate": 8.461851117612127e-05, + "loss": 1.0874, + "step": 4261 + }, + { + "epoch": 0.25894647305425605, + "grad_norm": 0.2057335525751114, + "learning_rate": 8.461160337980538e-05, + "loss": 1.1289, + "step": 4262 + }, + { + "epoch": 0.25900723008688253, + "grad_norm": 0.32290101051330566, + "learning_rate": 8.460469431479448e-05, + "loss": 1.125, + "step": 4263 + }, + { + "epoch": 0.25906798711950907, + "grad_norm": 0.16816361248493195, + "learning_rate": 8.45977839813418e-05, + "loss": 1.083, + "step": 4264 + }, + { + "epoch": 0.2591287441521356, + "grad_norm": 0.20611028373241425, + "learning_rate": 8.459087237970068e-05, + "loss": 1.1246, + "step": 4265 + }, + { + "epoch": 0.25918950118476214, + "grad_norm": 0.18841104209423065, + "learning_rate": 8.458395951012444e-05, + "loss": 1.1281, + "step": 4266 + }, + { + "epoch": 0.2592502582173887, + "grad_norm": 0.8322115540504456, + "learning_rate": 8.457704537286647e-05, + "loss": 1.1157, + "step": 4267 + }, + { + "epoch": 0.2593110152500152, + "grad_norm": 0.20937062799930573, + "learning_rate": 8.457012996818023e-05, + "loss": 1.1715, + "step": 4268 + }, + { + "epoch": 0.2593717722826417, + "grad_norm": 0.34700995683670044, + "learning_rate": 8.45632132963192e-05, + "loss": 1.1975, + "step": 4269 + }, + { + "epoch": 0.2594325293152682, + "grad_norm": 0.44231733679771423, + "learning_rate": 8.45562953575369e-05, + "loss": 1.0327, + "step": 4270 + }, + { + "epoch": 0.25949328634789476, + "grad_norm": 0.2505379319190979, + "learning_rate": 8.454937615208692e-05, + "loss": 1.1351, + "step": 4271 + }, + { + "epoch": 0.2595540433805213, + "grad_norm": 0.3006500005722046, + "learning_rate": 8.454245568022285e-05, + "loss": 1.0673, + "step": 4272 + }, + { + "epoch": 0.25961480041314783, + "grad_norm": 0.19378769397735596, + "learning_rate": 8.453553394219841e-05, + "loss": 1.1147, + "step": 4273 + }, + { + "epoch": 0.25967555744577436, + "grad_norm": 0.20149783790111542, + "learning_rate": 8.45286109382673e-05, + "loss": 1.1276, + "step": 4274 + }, + { + "epoch": 0.2597363144784009, + "grad_norm": 0.2344742715358734, + "learning_rate": 8.452168666868327e-05, + "loss": 1.096, + "step": 4275 + }, + { + "epoch": 0.2597970715110274, + "grad_norm": 0.23637834191322327, + "learning_rate": 8.451476113370016e-05, + "loss": 1.0784, + "step": 4276 + }, + { + "epoch": 0.2598578285436539, + "grad_norm": 0.4054419696331024, + "learning_rate": 8.450783433357182e-05, + "loss": 1.1277, + "step": 4277 + }, + { + "epoch": 0.25991858557628045, + "grad_norm": 0.18842460215091705, + "learning_rate": 8.45009062685521e-05, + "loss": 1.1831, + "step": 4278 + }, + { + "epoch": 0.259979342608907, + "grad_norm": 0.5069484710693359, + "learning_rate": 8.449397693889502e-05, + "loss": 1.1123, + "step": 4279 + }, + { + "epoch": 0.2600400996415335, + "grad_norm": 0.25176990032196045, + "learning_rate": 8.448704634485454e-05, + "loss": 1.1092, + "step": 4280 + }, + { + "epoch": 0.26010085667416005, + "grad_norm": 0.13377034664154053, + "learning_rate": 8.448011448668472e-05, + "loss": 1.0943, + "step": 4281 + }, + { + "epoch": 0.2601616137067866, + "grad_norm": 0.9178459048271179, + "learning_rate": 8.447318136463962e-05, + "loss": 1.1304, + "step": 4282 + }, + { + "epoch": 0.26022237073941307, + "grad_norm": 0.20416206121444702, + "learning_rate": 8.446624697897342e-05, + "loss": 1.077, + "step": 4283 + }, + { + "epoch": 0.2602831277720396, + "grad_norm": 0.1249048113822937, + "learning_rate": 8.445931132994025e-05, + "loss": 1.0849, + "step": 4284 + }, + { + "epoch": 0.26034388480466614, + "grad_norm": 1.0604158639907837, + "learning_rate": 8.445237441779437e-05, + "loss": 1.1111, + "step": 4285 + }, + { + "epoch": 0.26040464183729267, + "grad_norm": 0.20574629306793213, + "learning_rate": 8.444543624279006e-05, + "loss": 1.0639, + "step": 4286 + }, + { + "epoch": 0.2604653988699192, + "grad_norm": 0.27051597833633423, + "learning_rate": 8.443849680518161e-05, + "loss": 1.0105, + "step": 4287 + }, + { + "epoch": 0.26052615590254574, + "grad_norm": 0.16592645645141602, + "learning_rate": 8.44315561052234e-05, + "loss": 1.1072, + "step": 4288 + }, + { + "epoch": 0.2605869129351722, + "grad_norm": 0.22462166845798492, + "learning_rate": 8.442461414316986e-05, + "loss": 1.1015, + "step": 4289 + }, + { + "epoch": 0.26064766996779876, + "grad_norm": 0.3104848861694336, + "learning_rate": 8.441767091927543e-05, + "loss": 1.093, + "step": 4290 + }, + { + "epoch": 0.2607084270004253, + "grad_norm": 0.9585015177726746, + "learning_rate": 8.441072643379461e-05, + "loss": 1.1744, + "step": 4291 + }, + { + "epoch": 0.2607691840330518, + "grad_norm": 0.3315548896789551, + "learning_rate": 8.440378068698196e-05, + "loss": 1.1473, + "step": 4292 + }, + { + "epoch": 0.26082994106567836, + "grad_norm": 0.2847521901130676, + "learning_rate": 8.43968336790921e-05, + "loss": 1.1992, + "step": 4293 + }, + { + "epoch": 0.2608906980983049, + "grad_norm": 0.20233559608459473, + "learning_rate": 8.438988541037964e-05, + "loss": 1.2571, + "step": 4294 + }, + { + "epoch": 0.26095145513093143, + "grad_norm": 0.2187298834323883, + "learning_rate": 8.438293588109928e-05, + "loss": 1.3668, + "step": 4295 + }, + { + "epoch": 0.2610122121635579, + "grad_norm": 0.26863613724708557, + "learning_rate": 8.437598509150575e-05, + "loss": 1.0784, + "step": 4296 + }, + { + "epoch": 0.26107296919618445, + "grad_norm": 0.1813523918390274, + "learning_rate": 8.436903304185386e-05, + "loss": 1.0299, + "step": 4297 + }, + { + "epoch": 0.261133726228811, + "grad_norm": 0.18379293382167816, + "learning_rate": 8.436207973239842e-05, + "loss": 1.088, + "step": 4298 + }, + { + "epoch": 0.2611944832614375, + "grad_norm": 0.19964168965816498, + "learning_rate": 8.43551251633943e-05, + "loss": 1.0851, + "step": 4299 + }, + { + "epoch": 0.26125524029406405, + "grad_norm": 0.21964845061302185, + "learning_rate": 8.434816933509641e-05, + "loss": 1.1641, + "step": 4300 + }, + { + "epoch": 0.2613159973266906, + "grad_norm": 0.18196550011634827, + "learning_rate": 8.434121224775977e-05, + "loss": 1.0875, + "step": 4301 + }, + { + "epoch": 0.26137675435931707, + "grad_norm": 0.2147371768951416, + "learning_rate": 8.433425390163932e-05, + "loss": 1.1425, + "step": 4302 + }, + { + "epoch": 0.2614375113919436, + "grad_norm": 0.22804512083530426, + "learning_rate": 8.432729429699017e-05, + "loss": 1.2779, + "step": 4303 + }, + { + "epoch": 0.26149826842457013, + "grad_norm": 0.17448338866233826, + "learning_rate": 8.432033343406742e-05, + "loss": 1.1232, + "step": 4304 + }, + { + "epoch": 0.26155902545719667, + "grad_norm": 0.1834874004125595, + "learning_rate": 8.431337131312622e-05, + "loss": 1.0752, + "step": 4305 + }, + { + "epoch": 0.2616197824898232, + "grad_norm": 0.14007149636745453, + "learning_rate": 8.430640793442175e-05, + "loss": 1.0383, + "step": 4306 + }, + { + "epoch": 0.26168053952244974, + "grad_norm": 0.1365254521369934, + "learning_rate": 8.429944329820927e-05, + "loss": 1.0877, + "step": 4307 + }, + { + "epoch": 0.2617412965550763, + "grad_norm": 2.537677049636841, + "learning_rate": 8.429247740474406e-05, + "loss": 1.1223, + "step": 4308 + }, + { + "epoch": 0.26180205358770275, + "grad_norm": 0.18881793320178986, + "learning_rate": 8.428551025428146e-05, + "loss": 1.1266, + "step": 4309 + }, + { + "epoch": 0.2618628106203293, + "grad_norm": 0.1634659767150879, + "learning_rate": 8.427854184707686e-05, + "loss": 1.0816, + "step": 4310 + }, + { + "epoch": 0.2619235676529558, + "grad_norm": 0.1854061633348465, + "learning_rate": 8.427157218338568e-05, + "loss": 1.199, + "step": 4311 + }, + { + "epoch": 0.26198432468558236, + "grad_norm": 0.3584587872028351, + "learning_rate": 8.426460126346341e-05, + "loss": 1.2258, + "step": 4312 + }, + { + "epoch": 0.2620450817182089, + "grad_norm": 0.20669493079185486, + "learning_rate": 8.425762908756553e-05, + "loss": 1.0801, + "step": 4313 + }, + { + "epoch": 0.26210583875083543, + "grad_norm": 0.23939336836338043, + "learning_rate": 8.425065565594767e-05, + "loss": 1.0658, + "step": 4314 + }, + { + "epoch": 0.2621665957834619, + "grad_norm": 0.2513979375362396, + "learning_rate": 8.424368096886538e-05, + "loss": 1.1918, + "step": 4315 + }, + { + "epoch": 0.26222735281608844, + "grad_norm": 0.15272289514541626, + "learning_rate": 8.423670502657435e-05, + "loss": 1.0491, + "step": 4316 + }, + { + "epoch": 0.262288109848715, + "grad_norm": 0.20569448173046112, + "learning_rate": 8.422972782933029e-05, + "loss": 1.2418, + "step": 4317 + }, + { + "epoch": 0.2623488668813415, + "grad_norm": 0.17268654704093933, + "learning_rate": 8.422274937738893e-05, + "loss": 1.0728, + "step": 4318 + }, + { + "epoch": 0.26240962391396805, + "grad_norm": 0.12413746118545532, + "learning_rate": 8.421576967100607e-05, + "loss": 1.0836, + "step": 4319 + }, + { + "epoch": 0.2624703809465946, + "grad_norm": 0.19145981967449188, + "learning_rate": 8.420878871043757e-05, + "loss": 1.1253, + "step": 4320 + }, + { + "epoch": 0.2625311379792211, + "grad_norm": 0.1925724893808365, + "learning_rate": 8.420180649593929e-05, + "loss": 1.1103, + "step": 4321 + }, + { + "epoch": 0.2625918950118476, + "grad_norm": 0.2260303944349289, + "learning_rate": 8.419482302776721e-05, + "loss": 1.1659, + "step": 4322 + }, + { + "epoch": 0.26265265204447413, + "grad_norm": 0.1830778419971466, + "learning_rate": 8.418783830617727e-05, + "loss": 1.1641, + "step": 4323 + }, + { + "epoch": 0.26271340907710067, + "grad_norm": 0.1560674011707306, + "learning_rate": 8.418085233142551e-05, + "loss": 1.0825, + "step": 4324 + }, + { + "epoch": 0.2627741661097272, + "grad_norm": 0.18006204068660736, + "learning_rate": 8.4173865103768e-05, + "loss": 1.0678, + "step": 4325 + }, + { + "epoch": 0.26283492314235374, + "grad_norm": 0.35525408387184143, + "learning_rate": 8.416687662346085e-05, + "loss": 1.1184, + "step": 4326 + }, + { + "epoch": 0.2628956801749803, + "grad_norm": 0.1623912751674652, + "learning_rate": 8.415988689076025e-05, + "loss": 1.1097, + "step": 4327 + }, + { + "epoch": 0.2629564372076068, + "grad_norm": 0.11738666146993637, + "learning_rate": 8.415289590592237e-05, + "loss": 1.1073, + "step": 4328 + }, + { + "epoch": 0.2630171942402333, + "grad_norm": 0.24086801707744598, + "learning_rate": 8.41459036692035e-05, + "loss": 1.0626, + "step": 4329 + }, + { + "epoch": 0.2630779512728598, + "grad_norm": 0.22589251399040222, + "learning_rate": 8.413891018085993e-05, + "loss": 1.174, + "step": 4330 + }, + { + "epoch": 0.26313870830548636, + "grad_norm": 0.14131517708301544, + "learning_rate": 8.413191544114803e-05, + "loss": 1.0931, + "step": 4331 + }, + { + "epoch": 0.2631994653381129, + "grad_norm": 0.22869080305099487, + "learning_rate": 8.412491945032414e-05, + "loss": 1.0693, + "step": 4332 + }, + { + "epoch": 0.2632602223707394, + "grad_norm": 0.2333141565322876, + "learning_rate": 8.411792220864474e-05, + "loss": 1.0574, + "step": 4333 + }, + { + "epoch": 0.26332097940336596, + "grad_norm": 0.1286310851573944, + "learning_rate": 8.41109237163663e-05, + "loss": 1.104, + "step": 4334 + }, + { + "epoch": 0.26338173643599244, + "grad_norm": 0.19540579617023468, + "learning_rate": 8.410392397374538e-05, + "loss": 1.0828, + "step": 4335 + }, + { + "epoch": 0.263442493468619, + "grad_norm": 0.11774860322475433, + "learning_rate": 8.409692298103852e-05, + "loss": 1.0604, + "step": 4336 + }, + { + "epoch": 0.2635032505012455, + "grad_norm": 0.24366101622581482, + "learning_rate": 8.408992073850237e-05, + "loss": 1.1778, + "step": 4337 + }, + { + "epoch": 0.26356400753387205, + "grad_norm": 0.17126290500164032, + "learning_rate": 8.408291724639357e-05, + "loss": 1.1507, + "step": 4338 + }, + { + "epoch": 0.2636247645664986, + "grad_norm": 0.1923706829547882, + "learning_rate": 8.407591250496885e-05, + "loss": 1.0667, + "step": 4339 + }, + { + "epoch": 0.2636855215991251, + "grad_norm": 0.1331157386302948, + "learning_rate": 8.406890651448499e-05, + "loss": 1.0635, + "step": 4340 + }, + { + "epoch": 0.26374627863175165, + "grad_norm": 0.1537851095199585, + "learning_rate": 8.406189927519876e-05, + "loss": 1.178, + "step": 4341 + }, + { + "epoch": 0.26380703566437813, + "grad_norm": 0.17169001698493958, + "learning_rate": 8.405489078736704e-05, + "loss": 1.1051, + "step": 4342 + }, + { + "epoch": 0.26386779269700467, + "grad_norm": 0.1729595959186554, + "learning_rate": 8.40478810512467e-05, + "loss": 1.0372, + "step": 4343 + }, + { + "epoch": 0.2639285497296312, + "grad_norm": 0.28310850262641907, + "learning_rate": 8.404087006709471e-05, + "loss": 1.1145, + "step": 4344 + }, + { + "epoch": 0.26398930676225774, + "grad_norm": 0.23747852444648743, + "learning_rate": 8.403385783516804e-05, + "loss": 1.2195, + "step": 4345 + }, + { + "epoch": 0.26405006379488427, + "grad_norm": 0.8016255497932434, + "learning_rate": 8.402684435572374e-05, + "loss": 1.1465, + "step": 4346 + }, + { + "epoch": 0.2641108208275108, + "grad_norm": 0.134220153093338, + "learning_rate": 8.401982962901887e-05, + "loss": 1.0676, + "step": 4347 + }, + { + "epoch": 0.2641715778601373, + "grad_norm": 0.273553729057312, + "learning_rate": 8.401281365531058e-05, + "loss": 1.0634, + "step": 4348 + }, + { + "epoch": 0.2642323348927638, + "grad_norm": 2.3341445922851562, + "learning_rate": 8.400579643485602e-05, + "loss": 1.1043, + "step": 4349 + }, + { + "epoch": 0.26429309192539036, + "grad_norm": 0.21785476803779602, + "learning_rate": 8.399877796791244e-05, + "loss": 1.2355, + "step": 4350 + }, + { + "epoch": 0.2643538489580169, + "grad_norm": 0.1721855103969574, + "learning_rate": 8.399175825473706e-05, + "loss": 1.1752, + "step": 4351 + }, + { + "epoch": 0.2644146059906434, + "grad_norm": 0.3808407783508301, + "learning_rate": 8.398473729558723e-05, + "loss": 1.1659, + "step": 4352 + }, + { + "epoch": 0.26447536302326996, + "grad_norm": 0.20280729234218597, + "learning_rate": 8.397771509072028e-05, + "loss": 1.0915, + "step": 4353 + }, + { + "epoch": 0.2645361200558965, + "grad_norm": 0.20858359336853027, + "learning_rate": 8.39706916403936e-05, + "loss": 1.1146, + "step": 4354 + }, + { + "epoch": 0.264596877088523, + "grad_norm": 0.20127947628498077, + "learning_rate": 8.396366694486466e-05, + "loss": 1.1034, + "step": 4355 + }, + { + "epoch": 0.2646576341211495, + "grad_norm": 0.1488337367773056, + "learning_rate": 8.395664100439096e-05, + "loss": 1.0257, + "step": 4356 + }, + { + "epoch": 0.26471839115377604, + "grad_norm": 0.18887291848659515, + "learning_rate": 8.394961381922999e-05, + "loss": 1.1087, + "step": 4357 + }, + { + "epoch": 0.2647791481864026, + "grad_norm": 0.19009460508823395, + "learning_rate": 8.394258538963937e-05, + "loss": 1.1282, + "step": 4358 + }, + { + "epoch": 0.2648399052190291, + "grad_norm": 0.2440362423658371, + "learning_rate": 8.393555571587673e-05, + "loss": 1.0398, + "step": 4359 + }, + { + "epoch": 0.26490066225165565, + "grad_norm": 1.8505592346191406, + "learning_rate": 8.392852479819974e-05, + "loss": 1.1352, + "step": 4360 + }, + { + "epoch": 0.26496141928428213, + "grad_norm": 0.22668585181236267, + "learning_rate": 8.392149263686611e-05, + "loss": 1.0999, + "step": 4361 + }, + { + "epoch": 0.26502217631690866, + "grad_norm": 0.21716921031475067, + "learning_rate": 8.391445923213361e-05, + "loss": 1.0966, + "step": 4362 + }, + { + "epoch": 0.2650829333495352, + "grad_norm": 0.1901271641254425, + "learning_rate": 8.390742458426007e-05, + "loss": 1.2186, + "step": 4363 + }, + { + "epoch": 0.26514369038216173, + "grad_norm": 0.24133965373039246, + "learning_rate": 8.39003886935033e-05, + "loss": 1.1675, + "step": 4364 + }, + { + "epoch": 0.26520444741478827, + "grad_norm": 0.22141315042972565, + "learning_rate": 8.389335156012124e-05, + "loss": 1.1113, + "step": 4365 + }, + { + "epoch": 0.2652652044474148, + "grad_norm": 0.14368648827075958, + "learning_rate": 8.388631318437183e-05, + "loss": 1.0755, + "step": 4366 + }, + { + "epoch": 0.26532596148004134, + "grad_norm": 0.22720423340797424, + "learning_rate": 8.387927356651306e-05, + "loss": 1.1467, + "step": 4367 + }, + { + "epoch": 0.2653867185126678, + "grad_norm": 0.2000090777873993, + "learning_rate": 8.387223270680296e-05, + "loss": 1.0808, + "step": 4368 + }, + { + "epoch": 0.26544747554529435, + "grad_norm": 0.20596404373645782, + "learning_rate": 8.386519060549962e-05, + "loss": 1.1044, + "step": 4369 + }, + { + "epoch": 0.2655082325779209, + "grad_norm": 0.1902557611465454, + "learning_rate": 8.38581472628612e-05, + "loss": 1.1099, + "step": 4370 + }, + { + "epoch": 0.2655689896105474, + "grad_norm": 0.1719650775194168, + "learning_rate": 8.385110267914582e-05, + "loss": 1.1367, + "step": 4371 + }, + { + "epoch": 0.26562974664317396, + "grad_norm": 0.16829217970371246, + "learning_rate": 8.384405685461174e-05, + "loss": 1.1559, + "step": 4372 + }, + { + "epoch": 0.2656905036758005, + "grad_norm": 0.9026603102684021, + "learning_rate": 8.383700978951722e-05, + "loss": 1.153, + "step": 4373 + }, + { + "epoch": 0.265751260708427, + "grad_norm": 0.8896664977073669, + "learning_rate": 8.382996148412055e-05, + "loss": 1.0729, + "step": 4374 + }, + { + "epoch": 0.2658120177410535, + "grad_norm": 0.20693574845790863, + "learning_rate": 8.38229119386801e-05, + "loss": 1.1608, + "step": 4375 + }, + { + "epoch": 0.26587277477368004, + "grad_norm": 0.6755214333534241, + "learning_rate": 8.38158611534543e-05, + "loss": 1.1074, + "step": 4376 + }, + { + "epoch": 0.2659335318063066, + "grad_norm": 0.3992058038711548, + "learning_rate": 8.380880912870155e-05, + "loss": 1.0855, + "step": 4377 + }, + { + "epoch": 0.2659942888389331, + "grad_norm": 0.1554999202489853, + "learning_rate": 8.380175586468038e-05, + "loss": 1.0975, + "step": 4378 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 0.15274523198604584, + "learning_rate": 8.37947013616493e-05, + "loss": 1.0872, + "step": 4379 + }, + { + "epoch": 0.2661158029041862, + "grad_norm": 0.1353253722190857, + "learning_rate": 8.378764561986691e-05, + "loss": 0.9963, + "step": 4380 + }, + { + "epoch": 0.26617655993681266, + "grad_norm": 0.20015887916088104, + "learning_rate": 8.378058863959186e-05, + "loss": 1.1271, + "step": 4381 + }, + { + "epoch": 0.2662373169694392, + "grad_norm": 0.192811518907547, + "learning_rate": 8.377353042108277e-05, + "loss": 1.1003, + "step": 4382 + }, + { + "epoch": 0.26629807400206573, + "grad_norm": 0.5832599997520447, + "learning_rate": 8.376647096459843e-05, + "loss": 1.0796, + "step": 4383 + }, + { + "epoch": 0.26635883103469227, + "grad_norm": 0.1367943435907364, + "learning_rate": 8.375941027039755e-05, + "loss": 1.0546, + "step": 4384 + }, + { + "epoch": 0.2664195880673188, + "grad_norm": 0.42363548278808594, + "learning_rate": 8.375234833873898e-05, + "loss": 1.0708, + "step": 4385 + }, + { + "epoch": 0.26648034509994534, + "grad_norm": 4.4285173416137695, + "learning_rate": 8.374528516988154e-05, + "loss": 1.0485, + "step": 4386 + }, + { + "epoch": 0.26654110213257187, + "grad_norm": 0.1950882524251938, + "learning_rate": 8.373822076408415e-05, + "loss": 1.0633, + "step": 4387 + }, + { + "epoch": 0.26660185916519835, + "grad_norm": 0.15844829380512238, + "learning_rate": 8.373115512160576e-05, + "loss": 1.1573, + "step": 4388 + }, + { + "epoch": 0.2666626161978249, + "grad_norm": 0.16829128563404083, + "learning_rate": 8.372408824270534e-05, + "loss": 1.1345, + "step": 4389 + }, + { + "epoch": 0.2667233732304514, + "grad_norm": 0.5832083225250244, + "learning_rate": 8.371702012764198e-05, + "loss": 1.1448, + "step": 4390 + }, + { + "epoch": 0.26678413026307796, + "grad_norm": 0.16852720081806183, + "learning_rate": 8.370995077667471e-05, + "loss": 1.0671, + "step": 4391 + }, + { + "epoch": 0.2668448872957045, + "grad_norm": 0.1869085729122162, + "learning_rate": 8.370288019006269e-05, + "loss": 1.0775, + "step": 4392 + }, + { + "epoch": 0.266905644328331, + "grad_norm": 1.2228291034698486, + "learning_rate": 8.369580836806508e-05, + "loss": 1.0945, + "step": 4393 + }, + { + "epoch": 0.2669664013609575, + "grad_norm": 0.9789943099021912, + "learning_rate": 8.368873531094109e-05, + "loss": 1.2168, + "step": 4394 + }, + { + "epoch": 0.26702715839358404, + "grad_norm": 0.22951073944568634, + "learning_rate": 8.368166101894999e-05, + "loss": 1.1413, + "step": 4395 + }, + { + "epoch": 0.2670879154262106, + "grad_norm": 0.17168264091014862, + "learning_rate": 8.36745854923511e-05, + "loss": 1.1657, + "step": 4396 + }, + { + "epoch": 0.2671486724588371, + "grad_norm": 0.21820928156375885, + "learning_rate": 8.366750873140379e-05, + "loss": 1.1243, + "step": 4397 + }, + { + "epoch": 0.26720942949146365, + "grad_norm": 0.35053229331970215, + "learning_rate": 8.366043073636743e-05, + "loss": 1.1848, + "step": 4398 + }, + { + "epoch": 0.2672701865240902, + "grad_norm": 0.1923595815896988, + "learning_rate": 8.365335150750147e-05, + "loss": 1.1164, + "step": 4399 + }, + { + "epoch": 0.2673309435567167, + "grad_norm": 0.29366567730903625, + "learning_rate": 8.36462710450654e-05, + "loss": 1.0871, + "step": 4400 + }, + { + "epoch": 0.2673917005893432, + "grad_norm": 0.17893652617931366, + "learning_rate": 8.363918934931875e-05, + "loss": 1.1154, + "step": 4401 + }, + { + "epoch": 0.26745245762196973, + "grad_norm": 0.18954449892044067, + "learning_rate": 8.363210642052113e-05, + "loss": 1.0414, + "step": 4402 + }, + { + "epoch": 0.26751321465459627, + "grad_norm": 0.23722869157791138, + "learning_rate": 8.362502225893216e-05, + "loss": 1.0774, + "step": 4403 + }, + { + "epoch": 0.2675739716872228, + "grad_norm": 0.1975167691707611, + "learning_rate": 8.361793686481149e-05, + "loss": 1.2206, + "step": 4404 + }, + { + "epoch": 0.26763472871984934, + "grad_norm": 0.218455970287323, + "learning_rate": 8.361085023841884e-05, + "loss": 1.2528, + "step": 4405 + }, + { + "epoch": 0.26769548575247587, + "grad_norm": 0.27026262879371643, + "learning_rate": 8.360376238001397e-05, + "loss": 1.1444, + "step": 4406 + }, + { + "epoch": 0.26775624278510235, + "grad_norm": 0.16529852151870728, + "learning_rate": 8.359667328985672e-05, + "loss": 1.0959, + "step": 4407 + }, + { + "epoch": 0.2678169998177289, + "grad_norm": 1.2387454509735107, + "learning_rate": 8.358958296820689e-05, + "loss": 1.1062, + "step": 4408 + }, + { + "epoch": 0.2678777568503554, + "grad_norm": 0.33961236476898193, + "learning_rate": 8.358249141532442e-05, + "loss": 1.0911, + "step": 4409 + }, + { + "epoch": 0.26793851388298195, + "grad_norm": 0.16197733581066132, + "learning_rate": 8.357539863146922e-05, + "loss": 1.1403, + "step": 4410 + }, + { + "epoch": 0.2679992709156085, + "grad_norm": 0.47110915184020996, + "learning_rate": 8.356830461690131e-05, + "loss": 1.0413, + "step": 4411 + }, + { + "epoch": 0.268060027948235, + "grad_norm": 0.1606959104537964, + "learning_rate": 8.35612093718807e-05, + "loss": 1.0596, + "step": 4412 + }, + { + "epoch": 0.26812078498086156, + "grad_norm": 0.383693665266037, + "learning_rate": 8.355411289666749e-05, + "loss": 1.0932, + "step": 4413 + }, + { + "epoch": 0.26818154201348804, + "grad_norm": 0.16229099035263062, + "learning_rate": 8.354701519152175e-05, + "loss": 1.1067, + "step": 4414 + }, + { + "epoch": 0.2682422990461146, + "grad_norm": 0.1542864590883255, + "learning_rate": 8.35399162567037e-05, + "loss": 1.0703, + "step": 4415 + }, + { + "epoch": 0.2683030560787411, + "grad_norm": 0.5054668188095093, + "learning_rate": 8.353281609247355e-05, + "loss": 1.1897, + "step": 4416 + }, + { + "epoch": 0.26836381311136764, + "grad_norm": 0.2539095878601074, + "learning_rate": 8.352571469909152e-05, + "loss": 1.0821, + "step": 4417 + }, + { + "epoch": 0.2684245701439942, + "grad_norm": 0.23482497036457062, + "learning_rate": 8.351861207681795e-05, + "loss": 1.1568, + "step": 4418 + }, + { + "epoch": 0.2684853271766207, + "grad_norm": 0.21159839630126953, + "learning_rate": 8.351150822591318e-05, + "loss": 1.1703, + "step": 4419 + }, + { + "epoch": 0.2685460842092472, + "grad_norm": 0.17760232090950012, + "learning_rate": 8.350440314663759e-05, + "loss": 1.0928, + "step": 4420 + }, + { + "epoch": 0.26860684124187373, + "grad_norm": 0.15669623017311096, + "learning_rate": 8.349729683925163e-05, + "loss": 1.2026, + "step": 4421 + }, + { + "epoch": 0.26866759827450026, + "grad_norm": 0.20074933767318726, + "learning_rate": 8.349018930401577e-05, + "loss": 1.0896, + "step": 4422 + }, + { + "epoch": 0.2687283553071268, + "grad_norm": 0.22339986264705658, + "learning_rate": 8.348308054119058e-05, + "loss": 1.1512, + "step": 4423 + }, + { + "epoch": 0.26878911233975333, + "grad_norm": 0.21061965823173523, + "learning_rate": 8.347597055103658e-05, + "loss": 1.1121, + "step": 4424 + }, + { + "epoch": 0.26884986937237987, + "grad_norm": 0.3132881820201874, + "learning_rate": 8.346885933381439e-05, + "loss": 1.1502, + "step": 4425 + }, + { + "epoch": 0.2689106264050064, + "grad_norm": 0.3162994384765625, + "learning_rate": 8.346174688978471e-05, + "loss": 1.1553, + "step": 4426 + }, + { + "epoch": 0.2689713834376329, + "grad_norm": 0.29460978507995605, + "learning_rate": 8.345463321920823e-05, + "loss": 1.0553, + "step": 4427 + }, + { + "epoch": 0.2690321404702594, + "grad_norm": 0.285300612449646, + "learning_rate": 8.34475183223457e-05, + "loss": 1.0708, + "step": 4428 + }, + { + "epoch": 0.26909289750288595, + "grad_norm": 0.41496700048446655, + "learning_rate": 8.344040219945793e-05, + "loss": 1.1075, + "step": 4429 + }, + { + "epoch": 0.2691536545355125, + "grad_norm": 0.15282705426216125, + "learning_rate": 8.343328485080574e-05, + "loss": 1.0922, + "step": 4430 + }, + { + "epoch": 0.269214411568139, + "grad_norm": 0.2773056626319885, + "learning_rate": 8.342616627665003e-05, + "loss": 1.1538, + "step": 4431 + }, + { + "epoch": 0.26927516860076556, + "grad_norm": 0.36892804503440857, + "learning_rate": 8.341904647725174e-05, + "loss": 1.1762, + "step": 4432 + }, + { + "epoch": 0.2693359256333921, + "grad_norm": 0.1873452365398407, + "learning_rate": 8.341192545287185e-05, + "loss": 1.1327, + "step": 4433 + }, + { + "epoch": 0.26939668266601857, + "grad_norm": 0.2599208652973175, + "learning_rate": 8.340480320377136e-05, + "loss": 1.0998, + "step": 4434 + }, + { + "epoch": 0.2694574396986451, + "grad_norm": 0.1900174915790558, + "learning_rate": 8.339767973021135e-05, + "loss": 1.0994, + "step": 4435 + }, + { + "epoch": 0.26951819673127164, + "grad_norm": 0.3114914894104004, + "learning_rate": 8.339055503245294e-05, + "loss": 1.1037, + "step": 4436 + }, + { + "epoch": 0.2695789537638982, + "grad_norm": 0.21054331958293915, + "learning_rate": 8.338342911075729e-05, + "loss": 1.1417, + "step": 4437 + }, + { + "epoch": 0.2696397107965247, + "grad_norm": 0.2541572153568268, + "learning_rate": 8.33763019653856e-05, + "loss": 1.1443, + "step": 4438 + }, + { + "epoch": 0.26970046782915125, + "grad_norm": 0.3991122245788574, + "learning_rate": 8.336917359659909e-05, + "loss": 1.1564, + "step": 4439 + }, + { + "epoch": 0.2697612248617777, + "grad_norm": 0.19234372675418854, + "learning_rate": 8.336204400465908e-05, + "loss": 1.0291, + "step": 4440 + }, + { + "epoch": 0.26982198189440426, + "grad_norm": 0.5077095031738281, + "learning_rate": 8.335491318982689e-05, + "loss": 1.2786, + "step": 4441 + }, + { + "epoch": 0.2698827389270308, + "grad_norm": 0.28668439388275146, + "learning_rate": 8.334778115236392e-05, + "loss": 1.1063, + "step": 4442 + }, + { + "epoch": 0.26994349595965733, + "grad_norm": 0.2900209426879883, + "learning_rate": 8.334064789253157e-05, + "loss": 1.1019, + "step": 4443 + }, + { + "epoch": 0.27000425299228387, + "grad_norm": 0.18565015494823456, + "learning_rate": 8.333351341059134e-05, + "loss": 1.0789, + "step": 4444 + }, + { + "epoch": 0.2700650100249104, + "grad_norm": 0.2625967562198639, + "learning_rate": 8.332637770680474e-05, + "loss": 1.0648, + "step": 4445 + }, + { + "epoch": 0.27012576705753694, + "grad_norm": 0.26043277978897095, + "learning_rate": 8.331924078143329e-05, + "loss": 1.1473, + "step": 4446 + }, + { + "epoch": 0.2701865240901634, + "grad_norm": 0.1826217621564865, + "learning_rate": 8.331210263473867e-05, + "loss": 1.1316, + "step": 4447 + }, + { + "epoch": 0.27024728112278995, + "grad_norm": 0.223603293299675, + "learning_rate": 8.330496326698247e-05, + "loss": 1.0609, + "step": 4448 + }, + { + "epoch": 0.2703080381554165, + "grad_norm": 0.20060575008392334, + "learning_rate": 8.329782267842641e-05, + "loss": 1.1495, + "step": 4449 + }, + { + "epoch": 0.270368795188043, + "grad_norm": 0.2720164954662323, + "learning_rate": 8.329068086933221e-05, + "loss": 1.0722, + "step": 4450 + }, + { + "epoch": 0.27042955222066956, + "grad_norm": 0.14457330107688904, + "learning_rate": 8.328353783996168e-05, + "loss": 1.0962, + "step": 4451 + }, + { + "epoch": 0.2704903092532961, + "grad_norm": 0.27922821044921875, + "learning_rate": 8.327639359057663e-05, + "loss": 1.0457, + "step": 4452 + }, + { + "epoch": 0.27055106628592257, + "grad_norm": 0.23993876576423645, + "learning_rate": 8.326924812143894e-05, + "loss": 1.1135, + "step": 4453 + }, + { + "epoch": 0.2706118233185491, + "grad_norm": 0.11556341499090195, + "learning_rate": 8.326210143281055e-05, + "loss": 1.0577, + "step": 4454 + }, + { + "epoch": 0.27067258035117564, + "grad_norm": 0.13276229798793793, + "learning_rate": 8.325495352495339e-05, + "loss": 1.0455, + "step": 4455 + }, + { + "epoch": 0.2707333373838022, + "grad_norm": 0.35786738991737366, + "learning_rate": 8.324780439812946e-05, + "loss": 1.1242, + "step": 4456 + }, + { + "epoch": 0.2707940944164287, + "grad_norm": 0.16986536979675293, + "learning_rate": 8.324065405260085e-05, + "loss": 1.1377, + "step": 4457 + }, + { + "epoch": 0.27085485144905525, + "grad_norm": 0.36732205748558044, + "learning_rate": 8.323350248862964e-05, + "loss": 1.1018, + "step": 4458 + }, + { + "epoch": 0.2709156084816818, + "grad_norm": 0.20355327427387238, + "learning_rate": 8.322634970647797e-05, + "loss": 1.1698, + "step": 4459 + }, + { + "epoch": 0.27097636551430826, + "grad_norm": 0.4248415231704712, + "learning_rate": 8.321919570640803e-05, + "loss": 1.3169, + "step": 4460 + }, + { + "epoch": 0.2710371225469348, + "grad_norm": 0.2988456189632416, + "learning_rate": 8.321204048868206e-05, + "loss": 1.0989, + "step": 4461 + }, + { + "epoch": 0.27109787957956133, + "grad_norm": 0.241630420088768, + "learning_rate": 8.32048840535623e-05, + "loss": 1.111, + "step": 4462 + }, + { + "epoch": 0.27115863661218786, + "grad_norm": 0.2592075765132904, + "learning_rate": 8.319772640131113e-05, + "loss": 1.2772, + "step": 4463 + }, + { + "epoch": 0.2712193936448144, + "grad_norm": 0.23523442447185516, + "learning_rate": 8.319056753219086e-05, + "loss": 1.1463, + "step": 4464 + }, + { + "epoch": 0.27128015067744093, + "grad_norm": 0.15017908811569214, + "learning_rate": 8.318340744646393e-05, + "loss": 1.2114, + "step": 4465 + }, + { + "epoch": 0.2713409077100674, + "grad_norm": 0.4901198148727417, + "learning_rate": 8.317624614439278e-05, + "loss": 1.3439, + "step": 4466 + }, + { + "epoch": 0.27140166474269395, + "grad_norm": 0.1556641310453415, + "learning_rate": 8.316908362623991e-05, + "loss": 1.0707, + "step": 4467 + }, + { + "epoch": 0.2714624217753205, + "grad_norm": 0.2285921275615692, + "learning_rate": 8.31619198922679e-05, + "loss": 1.2014, + "step": 4468 + }, + { + "epoch": 0.271523178807947, + "grad_norm": 0.1972893625497818, + "learning_rate": 8.315475494273927e-05, + "loss": 1.1493, + "step": 4469 + }, + { + "epoch": 0.27158393584057355, + "grad_norm": 0.3628298044204712, + "learning_rate": 8.31475887779167e-05, + "loss": 1.1442, + "step": 4470 + }, + { + "epoch": 0.2716446928732001, + "grad_norm": 0.16546091437339783, + "learning_rate": 8.314042139806287e-05, + "loss": 1.0411, + "step": 4471 + }, + { + "epoch": 0.2717054499058266, + "grad_norm": 0.5137971043586731, + "learning_rate": 8.313325280344046e-05, + "loss": 1.0667, + "step": 4472 + }, + { + "epoch": 0.2717662069384531, + "grad_norm": 0.17597582936286926, + "learning_rate": 8.312608299431229e-05, + "loss": 1.0752, + "step": 4473 + }, + { + "epoch": 0.27182696397107964, + "grad_norm": 0.227156400680542, + "learning_rate": 8.311891197094113e-05, + "loss": 1.1466, + "step": 4474 + }, + { + "epoch": 0.2718877210037062, + "grad_norm": 0.18647707998752594, + "learning_rate": 8.311173973358985e-05, + "loss": 1.0874, + "step": 4475 + }, + { + "epoch": 0.2719484780363327, + "grad_norm": 0.14235243201255798, + "learning_rate": 8.310456628252136e-05, + "loss": 1.0651, + "step": 4476 + }, + { + "epoch": 0.27200923506895924, + "grad_norm": 0.3674888014793396, + "learning_rate": 8.309739161799859e-05, + "loss": 1.0871, + "step": 4477 + }, + { + "epoch": 0.2720699921015858, + "grad_norm": 0.25865834951400757, + "learning_rate": 8.309021574028453e-05, + "loss": 1.1204, + "step": 4478 + }, + { + "epoch": 0.27213074913421226, + "grad_norm": 0.27623218297958374, + "learning_rate": 8.30830386496422e-05, + "loss": 1.0639, + "step": 4479 + }, + { + "epoch": 0.2721915061668388, + "grad_norm": 0.19091758131980896, + "learning_rate": 8.307586034633472e-05, + "loss": 1.2216, + "step": 4480 + }, + { + "epoch": 0.2722522631994653, + "grad_norm": 0.2135348618030548, + "learning_rate": 8.306868083062518e-05, + "loss": 1.0923, + "step": 4481 + }, + { + "epoch": 0.27231302023209186, + "grad_norm": 0.34670794010162354, + "learning_rate": 8.306150010277675e-05, + "loss": 1.1129, + "step": 4482 + }, + { + "epoch": 0.2723737772647184, + "grad_norm": 0.14597612619400024, + "learning_rate": 8.305431816305262e-05, + "loss": 1.107, + "step": 4483 + }, + { + "epoch": 0.27243453429734493, + "grad_norm": 0.25984859466552734, + "learning_rate": 8.30471350117161e-05, + "loss": 1.0409, + "step": 4484 + }, + { + "epoch": 0.27249529132997147, + "grad_norm": 0.1626632660627365, + "learning_rate": 8.303995064903045e-05, + "loss": 1.0621, + "step": 4485 + }, + { + "epoch": 0.27255604836259795, + "grad_norm": 0.20829637348651886, + "learning_rate": 8.303276507525902e-05, + "loss": 1.0655, + "step": 4486 + }, + { + "epoch": 0.2726168053952245, + "grad_norm": 0.31719571352005005, + "learning_rate": 8.30255782906652e-05, + "loss": 1.1134, + "step": 4487 + }, + { + "epoch": 0.272677562427851, + "grad_norm": 0.14328913390636444, + "learning_rate": 8.301839029551243e-05, + "loss": 1.0976, + "step": 4488 + }, + { + "epoch": 0.27273831946047755, + "grad_norm": 0.422942578792572, + "learning_rate": 8.301120109006418e-05, + "loss": 1.1305, + "step": 4489 + }, + { + "epoch": 0.2727990764931041, + "grad_norm": 0.12200777977705002, + "learning_rate": 8.300401067458396e-05, + "loss": 1.0784, + "step": 4490 + }, + { + "epoch": 0.2728598335257306, + "grad_norm": 0.2534925043582916, + "learning_rate": 8.299681904933536e-05, + "loss": 1.1613, + "step": 4491 + }, + { + "epoch": 0.27292059055835716, + "grad_norm": 0.1515333652496338, + "learning_rate": 8.298962621458197e-05, + "loss": 1.0507, + "step": 4492 + }, + { + "epoch": 0.27298134759098364, + "grad_norm": 0.2900865375995636, + "learning_rate": 8.298243217058747e-05, + "loss": 1.0689, + "step": 4493 + }, + { + "epoch": 0.27304210462361017, + "grad_norm": 0.17336629331111908, + "learning_rate": 8.297523691761554e-05, + "loss": 1.0488, + "step": 4494 + }, + { + "epoch": 0.2731028616562367, + "grad_norm": 0.18693529069423676, + "learning_rate": 8.296804045592992e-05, + "loss": 1.0673, + "step": 4495 + }, + { + "epoch": 0.27316361868886324, + "grad_norm": 0.2472897171974182, + "learning_rate": 8.296084278579442e-05, + "loss": 1.2907, + "step": 4496 + }, + { + "epoch": 0.2732243757214898, + "grad_norm": 0.1313284933567047, + "learning_rate": 8.295364390747284e-05, + "loss": 1.0733, + "step": 4497 + }, + { + "epoch": 0.2732851327541163, + "grad_norm": 0.15705497562885284, + "learning_rate": 8.294644382122906e-05, + "loss": 1.2581, + "step": 4498 + }, + { + "epoch": 0.2733458897867428, + "grad_norm": 0.1942000836133957, + "learning_rate": 8.293924252732704e-05, + "loss": 1.1025, + "step": 4499 + }, + { + "epoch": 0.2734066468193693, + "grad_norm": 0.2068057507276535, + "learning_rate": 8.29320400260307e-05, + "loss": 1.2565, + "step": 4500 + }, + { + "epoch": 0.27346740385199586, + "grad_norm": 0.17843692004680634, + "learning_rate": 8.292483631760408e-05, + "loss": 1.2376, + "step": 4501 + }, + { + "epoch": 0.2735281608846224, + "grad_norm": 0.16642257571220398, + "learning_rate": 8.291763140231122e-05, + "loss": 1.1336, + "step": 4502 + }, + { + "epoch": 0.27358891791724893, + "grad_norm": 0.179488867521286, + "learning_rate": 8.291042528041622e-05, + "loss": 1.1094, + "step": 4503 + }, + { + "epoch": 0.27364967494987547, + "grad_norm": 0.22454987466335297, + "learning_rate": 8.290321795218321e-05, + "loss": 1.0633, + "step": 4504 + }, + { + "epoch": 0.273710431982502, + "grad_norm": 0.26553380489349365, + "learning_rate": 8.289600941787639e-05, + "loss": 1.081, + "step": 4505 + }, + { + "epoch": 0.2737711890151285, + "grad_norm": 0.16264106333255768, + "learning_rate": 8.288879967775997e-05, + "loss": 1.0821, + "step": 4506 + }, + { + "epoch": 0.273831946047755, + "grad_norm": 0.41247719526290894, + "learning_rate": 8.288158873209826e-05, + "loss": 1.1986, + "step": 4507 + }, + { + "epoch": 0.27389270308038155, + "grad_norm": 0.24013249576091766, + "learning_rate": 8.287437658115556e-05, + "loss": 1.1142, + "step": 4508 + }, + { + "epoch": 0.2739534601130081, + "grad_norm": 0.4005536437034607, + "learning_rate": 8.286716322519624e-05, + "loss": 1.0716, + "step": 4509 + }, + { + "epoch": 0.2740142171456346, + "grad_norm": 0.34069719910621643, + "learning_rate": 8.285994866448468e-05, + "loss": 1.0756, + "step": 4510 + }, + { + "epoch": 0.27407497417826115, + "grad_norm": 0.2507462203502655, + "learning_rate": 8.285273289928535e-05, + "loss": 1.1925, + "step": 4511 + }, + { + "epoch": 0.27413573121088763, + "grad_norm": 0.2653174102306366, + "learning_rate": 8.284551592986277e-05, + "loss": 1.1208, + "step": 4512 + }, + { + "epoch": 0.27419648824351417, + "grad_norm": 0.17350877821445465, + "learning_rate": 8.283829775648143e-05, + "loss": 1.1445, + "step": 4513 + }, + { + "epoch": 0.2742572452761407, + "grad_norm": 3.2536861896514893, + "learning_rate": 8.283107837940593e-05, + "loss": 1.1013, + "step": 4514 + }, + { + "epoch": 0.27431800230876724, + "grad_norm": 0.6003816723823547, + "learning_rate": 8.282385779890092e-05, + "loss": 1.1052, + "step": 4515 + }, + { + "epoch": 0.2743787593413938, + "grad_norm": 1.152153491973877, + "learning_rate": 8.281663601523107e-05, + "loss": 1.2417, + "step": 4516 + }, + { + "epoch": 0.2744395163740203, + "grad_norm": 0.39114993810653687, + "learning_rate": 8.280941302866108e-05, + "loss": 1.117, + "step": 4517 + }, + { + "epoch": 0.27450027340664684, + "grad_norm": 0.26109611988067627, + "learning_rate": 8.28021888394557e-05, + "loss": 1.1302, + "step": 4518 + }, + { + "epoch": 0.2745610304392733, + "grad_norm": 0.4179966151714325, + "learning_rate": 8.279496344787975e-05, + "loss": 1.0478, + "step": 4519 + }, + { + "epoch": 0.27462178747189986, + "grad_norm": 0.19528590142726898, + "learning_rate": 8.278773685419808e-05, + "loss": 1.1558, + "step": 4520 + }, + { + "epoch": 0.2746825445045264, + "grad_norm": 0.326621413230896, + "learning_rate": 8.278050905867559e-05, + "loss": 1.1047, + "step": 4521 + }, + { + "epoch": 0.27474330153715293, + "grad_norm": 0.2561730444431305, + "learning_rate": 8.277328006157718e-05, + "loss": 1.1224, + "step": 4522 + }, + { + "epoch": 0.27480405856977946, + "grad_norm": 0.29456570744514465, + "learning_rate": 8.276604986316785e-05, + "loss": 1.0875, + "step": 4523 + }, + { + "epoch": 0.274864815602406, + "grad_norm": 0.19292910397052765, + "learning_rate": 8.275881846371265e-05, + "loss": 1.0527, + "step": 4524 + }, + { + "epoch": 0.2749255726350325, + "grad_norm": 0.2293853908777237, + "learning_rate": 8.275158586347662e-05, + "loss": 1.0718, + "step": 4525 + }, + { + "epoch": 0.274986329667659, + "grad_norm": 0.2319045066833496, + "learning_rate": 8.274435206272489e-05, + "loss": 1.0412, + "step": 4526 + }, + { + "epoch": 0.27504708670028555, + "grad_norm": 0.19103401899337769, + "learning_rate": 8.27371170617226e-05, + "loss": 1.0694, + "step": 4527 + }, + { + "epoch": 0.2751078437329121, + "grad_norm": 0.2501087784767151, + "learning_rate": 8.272988086073494e-05, + "loss": 1.1286, + "step": 4528 + }, + { + "epoch": 0.2751686007655386, + "grad_norm": 0.32802215218544006, + "learning_rate": 8.272264346002719e-05, + "loss": 1.2307, + "step": 4529 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 0.14455841481685638, + "learning_rate": 8.271540485986461e-05, + "loss": 1.1745, + "step": 4530 + }, + { + "epoch": 0.2752901148307917, + "grad_norm": 0.24390700459480286, + "learning_rate": 8.270816506051253e-05, + "loss": 1.1297, + "step": 4531 + }, + { + "epoch": 0.27535087186341817, + "grad_norm": 0.3448156416416168, + "learning_rate": 8.270092406223635e-05, + "loss": 1.112, + "step": 4532 + }, + { + "epoch": 0.2754116288960447, + "grad_norm": 0.15369431674480438, + "learning_rate": 8.269368186530149e-05, + "loss": 1.1169, + "step": 4533 + }, + { + "epoch": 0.27547238592867124, + "grad_norm": 0.18233084678649902, + "learning_rate": 8.26864384699734e-05, + "loss": 1.1651, + "step": 4534 + }, + { + "epoch": 0.2755331429612978, + "grad_norm": 0.17538852989673615, + "learning_rate": 8.267919387651757e-05, + "loss": 1.1014, + "step": 4535 + }, + { + "epoch": 0.2755938999939243, + "grad_norm": 0.1329255998134613, + "learning_rate": 8.267194808519958e-05, + "loss": 1.0677, + "step": 4536 + }, + { + "epoch": 0.27565465702655084, + "grad_norm": 0.4503363072872162, + "learning_rate": 8.266470109628501e-05, + "loss": 1.0506, + "step": 4537 + }, + { + "epoch": 0.2757154140591774, + "grad_norm": 0.1340859830379486, + "learning_rate": 8.265745291003953e-05, + "loss": 1.042, + "step": 4538 + }, + { + "epoch": 0.27577617109180386, + "grad_norm": 0.13064821064472198, + "learning_rate": 8.265020352672877e-05, + "loss": 1.0345, + "step": 4539 + }, + { + "epoch": 0.2758369281244304, + "grad_norm": 0.15410307049751282, + "learning_rate": 8.26429529466185e-05, + "loss": 1.091, + "step": 4540 + }, + { + "epoch": 0.2758976851570569, + "grad_norm": 0.13411352038383484, + "learning_rate": 8.26357011699745e-05, + "loss": 1.0297, + "step": 4541 + }, + { + "epoch": 0.27595844218968346, + "grad_norm": 0.1289222687482834, + "learning_rate": 8.262844819706255e-05, + "loss": 1.101, + "step": 4542 + }, + { + "epoch": 0.27601919922231, + "grad_norm": 0.16975997388362885, + "learning_rate": 8.262119402814852e-05, + "loss": 1.0901, + "step": 4543 + }, + { + "epoch": 0.27607995625493653, + "grad_norm": 0.740832507610321, + "learning_rate": 8.261393866349832e-05, + "loss": 1.2342, + "step": 4544 + }, + { + "epoch": 0.276140713287563, + "grad_norm": 0.16148574650287628, + "learning_rate": 8.260668210337789e-05, + "loss": 1.1655, + "step": 4545 + }, + { + "epoch": 0.27620147032018955, + "grad_norm": 0.13259842991828918, + "learning_rate": 8.259942434805324e-05, + "loss": 1.0706, + "step": 4546 + }, + { + "epoch": 0.2762622273528161, + "grad_norm": 0.34875038266181946, + "learning_rate": 8.259216539779038e-05, + "loss": 1.0904, + "step": 4547 + }, + { + "epoch": 0.2763229843854426, + "grad_norm": 0.16057588160037994, + "learning_rate": 8.25849052528554e-05, + "loss": 1.0704, + "step": 4548 + }, + { + "epoch": 0.27638374141806915, + "grad_norm": 0.17092905938625336, + "learning_rate": 8.25776439135144e-05, + "loss": 1.0683, + "step": 4549 + }, + { + "epoch": 0.2764444984506957, + "grad_norm": 0.21016310155391693, + "learning_rate": 8.25703813800336e-05, + "loss": 1.1103, + "step": 4550 + }, + { + "epoch": 0.2765052554833222, + "grad_norm": 0.22097277641296387, + "learning_rate": 8.256311765267916e-05, + "loss": 1.1482, + "step": 4551 + }, + { + "epoch": 0.2765660125159487, + "grad_norm": 0.160888209939003, + "learning_rate": 8.255585273171735e-05, + "loss": 1.0122, + "step": 4552 + }, + { + "epoch": 0.27662676954857524, + "grad_norm": 0.2927054464817047, + "learning_rate": 8.254858661741444e-05, + "loss": 1.1903, + "step": 4553 + }, + { + "epoch": 0.27668752658120177, + "grad_norm": 0.15477386116981506, + "learning_rate": 8.254131931003683e-05, + "loss": 1.0771, + "step": 4554 + }, + { + "epoch": 0.2767482836138283, + "grad_norm": 0.1971157342195511, + "learning_rate": 8.253405080985087e-05, + "loss": 1.1034, + "step": 4555 + }, + { + "epoch": 0.27680904064645484, + "grad_norm": 0.6341344714164734, + "learning_rate": 8.252678111712296e-05, + "loss": 1.3019, + "step": 4556 + }, + { + "epoch": 0.2768697976790814, + "grad_norm": 0.2178129106760025, + "learning_rate": 8.251951023211962e-05, + "loss": 1.1374, + "step": 4557 + }, + { + "epoch": 0.27693055471170785, + "grad_norm": 0.1752653867006302, + "learning_rate": 8.251223815510734e-05, + "loss": 1.1242, + "step": 4558 + }, + { + "epoch": 0.2769913117443344, + "grad_norm": 0.2596944570541382, + "learning_rate": 8.250496488635268e-05, + "loss": 1.1055, + "step": 4559 + }, + { + "epoch": 0.2770520687769609, + "grad_norm": 0.14579010009765625, + "learning_rate": 8.249769042612227e-05, + "loss": 1.0866, + "step": 4560 + }, + { + "epoch": 0.27711282580958746, + "grad_norm": 0.25169453024864197, + "learning_rate": 8.249041477468273e-05, + "loss": 1.1961, + "step": 4561 + }, + { + "epoch": 0.277173582842214, + "grad_norm": 6.168807506561279, + "learning_rate": 8.248313793230074e-05, + "loss": 1.0726, + "step": 4562 + }, + { + "epoch": 0.27723433987484053, + "grad_norm": 1.6058295965194702, + "learning_rate": 8.247585989924306e-05, + "loss": 1.3326, + "step": 4563 + }, + { + "epoch": 0.27729509690746706, + "grad_norm": 0.16555973887443542, + "learning_rate": 8.246858067577648e-05, + "loss": 1.117, + "step": 4564 + }, + { + "epoch": 0.27735585394009354, + "grad_norm": 0.1868455559015274, + "learning_rate": 8.246130026216776e-05, + "loss": 1.1559, + "step": 4565 + }, + { + "epoch": 0.2774166109727201, + "grad_norm": 0.3097016513347626, + "learning_rate": 8.245401865868383e-05, + "loss": 1.1407, + "step": 4566 + }, + { + "epoch": 0.2774773680053466, + "grad_norm": 0.16543899476528168, + "learning_rate": 8.244673586559157e-05, + "loss": 1.1081, + "step": 4567 + }, + { + "epoch": 0.27753812503797315, + "grad_norm": 0.1566673070192337, + "learning_rate": 8.243945188315791e-05, + "loss": 1.0997, + "step": 4568 + }, + { + "epoch": 0.2775988820705997, + "grad_norm": 0.2237454652786255, + "learning_rate": 8.243216671164991e-05, + "loss": 1.1184, + "step": 4569 + }, + { + "epoch": 0.2776596391032262, + "grad_norm": 0.19540287554264069, + "learning_rate": 8.242488035133454e-05, + "loss": 1.1303, + "step": 4570 + }, + { + "epoch": 0.2777203961358527, + "grad_norm": 0.16383953392505646, + "learning_rate": 8.241759280247893e-05, + "loss": 1.0531, + "step": 4571 + }, + { + "epoch": 0.27778115316847923, + "grad_norm": 0.14741382002830505, + "learning_rate": 8.241030406535017e-05, + "loss": 1.0493, + "step": 4572 + }, + { + "epoch": 0.27784191020110577, + "grad_norm": 0.20527935028076172, + "learning_rate": 8.240301414021548e-05, + "loss": 1.0885, + "step": 4573 + }, + { + "epoch": 0.2779026672337323, + "grad_norm": 0.3679577112197876, + "learning_rate": 8.239572302734202e-05, + "loss": 1.6023, + "step": 4574 + }, + { + "epoch": 0.27796342426635884, + "grad_norm": 0.19015346467494965, + "learning_rate": 8.238843072699708e-05, + "loss": 1.0688, + "step": 4575 + }, + { + "epoch": 0.2780241812989854, + "grad_norm": 0.31583908200263977, + "learning_rate": 8.238113723944795e-05, + "loss": 1.1412, + "step": 4576 + }, + { + "epoch": 0.2780849383316119, + "grad_norm": 0.18067505955696106, + "learning_rate": 8.237384256496197e-05, + "loss": 1.0639, + "step": 4577 + }, + { + "epoch": 0.2781456953642384, + "grad_norm": 0.27275148034095764, + "learning_rate": 8.236654670380655e-05, + "loss": 1.1374, + "step": 4578 + }, + { + "epoch": 0.2782064523968649, + "grad_norm": 0.3129608929157257, + "learning_rate": 8.235924965624909e-05, + "loss": 1.0456, + "step": 4579 + }, + { + "epoch": 0.27826720942949146, + "grad_norm": 0.3833840489387512, + "learning_rate": 8.235195142255707e-05, + "loss": 1.0886, + "step": 4580 + }, + { + "epoch": 0.278327966462118, + "grad_norm": 0.20252899825572968, + "learning_rate": 8.234465200299803e-05, + "loss": 1.2008, + "step": 4581 + }, + { + "epoch": 0.2783887234947445, + "grad_norm": 0.23847143352031708, + "learning_rate": 8.233735139783952e-05, + "loss": 1.1262, + "step": 4582 + }, + { + "epoch": 0.27844948052737106, + "grad_norm": 0.16970612108707428, + "learning_rate": 8.233004960734913e-05, + "loss": 1.1091, + "step": 4583 + }, + { + "epoch": 0.27851023755999754, + "grad_norm": 0.21134312450885773, + "learning_rate": 8.232274663179453e-05, + "loss": 1.1954, + "step": 4584 + }, + { + "epoch": 0.2785709945926241, + "grad_norm": 0.16985607147216797, + "learning_rate": 8.231544247144341e-05, + "loss": 1.0927, + "step": 4585 + }, + { + "epoch": 0.2786317516252506, + "grad_norm": 0.13938719034194946, + "learning_rate": 8.23081371265635e-05, + "loss": 1.0526, + "step": 4586 + }, + { + "epoch": 0.27869250865787715, + "grad_norm": 0.19115585088729858, + "learning_rate": 8.230083059742257e-05, + "loss": 1.1016, + "step": 4587 + }, + { + "epoch": 0.2787532656905037, + "grad_norm": 0.14090359210968018, + "learning_rate": 8.229352288428846e-05, + "loss": 1.035, + "step": 4588 + }, + { + "epoch": 0.2788140227231302, + "grad_norm": 0.2454357147216797, + "learning_rate": 8.228621398742902e-05, + "loss": 1.1749, + "step": 4589 + }, + { + "epoch": 0.27887477975575675, + "grad_norm": 0.21398983895778656, + "learning_rate": 8.227890390711216e-05, + "loss": 1.1485, + "step": 4590 + }, + { + "epoch": 0.27893553678838323, + "grad_norm": 0.23139554262161255, + "learning_rate": 8.227159264360586e-05, + "loss": 1.0546, + "step": 4591 + }, + { + "epoch": 0.27899629382100977, + "grad_norm": 0.16300325095653534, + "learning_rate": 8.226428019717807e-05, + "loss": 1.1082, + "step": 4592 + }, + { + "epoch": 0.2790570508536363, + "grad_norm": 1.25884211063385, + "learning_rate": 8.225696656809688e-05, + "loss": 1.0853, + "step": 4593 + }, + { + "epoch": 0.27911780788626284, + "grad_norm": 0.18098796904087067, + "learning_rate": 8.224965175663032e-05, + "loss": 1.1053, + "step": 4594 + }, + { + "epoch": 0.27917856491888937, + "grad_norm": 0.35015469789505005, + "learning_rate": 8.224233576304655e-05, + "loss": 1.0849, + "step": 4595 + }, + { + "epoch": 0.2792393219515159, + "grad_norm": 0.14025036990642548, + "learning_rate": 8.223501858761372e-05, + "loss": 1.0583, + "step": 4596 + }, + { + "epoch": 0.27930007898414244, + "grad_norm": 0.15127043426036835, + "learning_rate": 8.222770023060006e-05, + "loss": 1.0861, + "step": 4597 + }, + { + "epoch": 0.2793608360167689, + "grad_norm": 0.17315421998500824, + "learning_rate": 8.222038069227383e-05, + "loss": 1.1027, + "step": 4598 + }, + { + "epoch": 0.27942159304939546, + "grad_norm": 0.29059749841690063, + "learning_rate": 8.221305997290329e-05, + "loss": 1.0508, + "step": 4599 + }, + { + "epoch": 0.279482350082022, + "grad_norm": 0.20000159740447998, + "learning_rate": 8.220573807275685e-05, + "loss": 1.1479, + "step": 4600 + }, + { + "epoch": 0.2795431071146485, + "grad_norm": 0.14069730043411255, + "learning_rate": 8.219841499210283e-05, + "loss": 1.0821, + "step": 4601 + }, + { + "epoch": 0.27960386414727506, + "grad_norm": 0.22600381076335907, + "learning_rate": 8.219109073120967e-05, + "loss": 1.0802, + "step": 4602 + }, + { + "epoch": 0.2796646211799016, + "grad_norm": 0.47676920890808105, + "learning_rate": 8.218376529034589e-05, + "loss": 1.1481, + "step": 4603 + }, + { + "epoch": 0.2797253782125281, + "grad_norm": 0.178662970662117, + "learning_rate": 8.217643866977995e-05, + "loss": 1.1206, + "step": 4604 + }, + { + "epoch": 0.2797861352451546, + "grad_norm": 0.5735136270523071, + "learning_rate": 8.216911086978046e-05, + "loss": 1.1023, + "step": 4605 + }, + { + "epoch": 0.27984689227778115, + "grad_norm": 0.18864288926124573, + "learning_rate": 8.216178189061595e-05, + "loss": 1.219, + "step": 4606 + }, + { + "epoch": 0.2799076493104077, + "grad_norm": 0.22152753174304962, + "learning_rate": 8.215445173255514e-05, + "loss": 1.1029, + "step": 4607 + }, + { + "epoch": 0.2799684063430342, + "grad_norm": 0.28739672899246216, + "learning_rate": 8.214712039586667e-05, + "loss": 1.1797, + "step": 4608 + }, + { + "epoch": 0.28002916337566075, + "grad_norm": 0.26197290420532227, + "learning_rate": 8.21397878808193e-05, + "loss": 1.2171, + "step": 4609 + }, + { + "epoch": 0.2800899204082873, + "grad_norm": 0.4948214292526245, + "learning_rate": 8.213245418768179e-05, + "loss": 1.2254, + "step": 4610 + }, + { + "epoch": 0.28015067744091376, + "grad_norm": 0.260137140750885, + "learning_rate": 8.212511931672296e-05, + "loss": 1.073, + "step": 4611 + }, + { + "epoch": 0.2802114344735403, + "grad_norm": 0.19344522058963776, + "learning_rate": 8.211778326821166e-05, + "loss": 1.2006, + "step": 4612 + }, + { + "epoch": 0.28027219150616683, + "grad_norm": 0.23450690507888794, + "learning_rate": 8.211044604241681e-05, + "loss": 1.1887, + "step": 4613 + }, + { + "epoch": 0.28033294853879337, + "grad_norm": 0.24577519297599792, + "learning_rate": 8.210310763960737e-05, + "loss": 1.1533, + "step": 4614 + }, + { + "epoch": 0.2803937055714199, + "grad_norm": 0.3236617147922516, + "learning_rate": 8.20957680600523e-05, + "loss": 1.1662, + "step": 4615 + }, + { + "epoch": 0.28045446260404644, + "grad_norm": 0.29707232117652893, + "learning_rate": 8.208842730402065e-05, + "loss": 1.184, + "step": 4616 + }, + { + "epoch": 0.2805152196366729, + "grad_norm": 0.24683883786201477, + "learning_rate": 8.208108537178149e-05, + "loss": 1.1164, + "step": 4617 + }, + { + "epoch": 0.28057597666929945, + "grad_norm": 0.1817791759967804, + "learning_rate": 8.207374226360396e-05, + "loss": 1.1824, + "step": 4618 + }, + { + "epoch": 0.280636733701926, + "grad_norm": 0.22569699585437775, + "learning_rate": 8.206639797975719e-05, + "loss": 1.0921, + "step": 4619 + }, + { + "epoch": 0.2806974907345525, + "grad_norm": 0.6512724757194519, + "learning_rate": 8.205905252051041e-05, + "loss": 1.0864, + "step": 4620 + }, + { + "epoch": 0.28075824776717906, + "grad_norm": 0.3657459318637848, + "learning_rate": 8.205170588613286e-05, + "loss": 1.1494, + "step": 4621 + }, + { + "epoch": 0.2808190047998056, + "grad_norm": 0.1689261794090271, + "learning_rate": 8.204435807689385e-05, + "loss": 1.0679, + "step": 4622 + }, + { + "epoch": 0.28087976183243213, + "grad_norm": 0.18063753843307495, + "learning_rate": 8.203700909306269e-05, + "loss": 1.0988, + "step": 4623 + }, + { + "epoch": 0.2809405188650586, + "grad_norm": 1.233905553817749, + "learning_rate": 8.202965893490878e-05, + "loss": 1.0576, + "step": 4624 + }, + { + "epoch": 0.28100127589768514, + "grad_norm": 0.8604289889335632, + "learning_rate": 8.202230760270151e-05, + "loss": 1.0996, + "step": 4625 + }, + { + "epoch": 0.2810620329303117, + "grad_norm": 0.28963378071784973, + "learning_rate": 8.201495509671037e-05, + "loss": 1.0629, + "step": 4626 + }, + { + "epoch": 0.2811227899629382, + "grad_norm": 0.2162565141916275, + "learning_rate": 8.200760141720486e-05, + "loss": 1.2243, + "step": 4627 + }, + { + "epoch": 0.28118354699556475, + "grad_norm": 0.30542391538619995, + "learning_rate": 8.200024656445455e-05, + "loss": 1.0582, + "step": 4628 + }, + { + "epoch": 0.2812443040281913, + "grad_norm": 0.32875868678092957, + "learning_rate": 8.1992890538729e-05, + "loss": 1.1191, + "step": 4629 + }, + { + "epoch": 0.28130506106081776, + "grad_norm": 0.47611168026924133, + "learning_rate": 8.198553334029786e-05, + "loss": 1.0689, + "step": 4630 + }, + { + "epoch": 0.2813658180934443, + "grad_norm": 0.38822394609451294, + "learning_rate": 8.197817496943084e-05, + "loss": 1.0782, + "step": 4631 + }, + { + "epoch": 0.28142657512607083, + "grad_norm": 0.2146041989326477, + "learning_rate": 8.197081542639762e-05, + "loss": 1.0429, + "step": 4632 + }, + { + "epoch": 0.28148733215869737, + "grad_norm": 0.2552999258041382, + "learning_rate": 8.196345471146797e-05, + "loss": 1.1339, + "step": 4633 + }, + { + "epoch": 0.2815480891913239, + "grad_norm": 0.2688206136226654, + "learning_rate": 8.195609282491172e-05, + "loss": 1.0717, + "step": 4634 + }, + { + "epoch": 0.28160884622395044, + "grad_norm": 0.5100186467170715, + "learning_rate": 8.194872976699871e-05, + "loss": 1.1656, + "step": 4635 + }, + { + "epoch": 0.281669603256577, + "grad_norm": 0.18688523769378662, + "learning_rate": 8.194136553799883e-05, + "loss": 1.0769, + "step": 4636 + }, + { + "epoch": 0.28173036028920345, + "grad_norm": 7.792951583862305, + "learning_rate": 8.193400013818202e-05, + "loss": 1.206, + "step": 4637 + }, + { + "epoch": 0.28179111732183, + "grad_norm": 0.29357993602752686, + "learning_rate": 8.192663356781827e-05, + "loss": 1.2129, + "step": 4638 + }, + { + "epoch": 0.2818518743544565, + "grad_norm": 0.26707616448402405, + "learning_rate": 8.19192658271776e-05, + "loss": 1.1004, + "step": 4639 + }, + { + "epoch": 0.28191263138708306, + "grad_norm": 0.18493105471134186, + "learning_rate": 8.191189691653006e-05, + "loss": 1.0836, + "step": 4640 + }, + { + "epoch": 0.2819733884197096, + "grad_norm": 0.2140301764011383, + "learning_rate": 8.190452683614577e-05, + "loss": 1.1302, + "step": 4641 + }, + { + "epoch": 0.2820341454523361, + "grad_norm": 0.21024712920188904, + "learning_rate": 8.18971555862949e-05, + "loss": 1.0932, + "step": 4642 + }, + { + "epoch": 0.28209490248496266, + "grad_norm": 0.2261425405740738, + "learning_rate": 8.188978316724761e-05, + "loss": 1.0946, + "step": 4643 + }, + { + "epoch": 0.28215565951758914, + "grad_norm": 0.18732121586799622, + "learning_rate": 8.188240957927416e-05, + "loss": 1.1945, + "step": 4644 + }, + { + "epoch": 0.2822164165502157, + "grad_norm": 0.2324189692735672, + "learning_rate": 8.187503482264482e-05, + "loss": 1.3628, + "step": 4645 + }, + { + "epoch": 0.2822771735828422, + "grad_norm": 0.1965351551771164, + "learning_rate": 8.18676588976299e-05, + "loss": 1.1673, + "step": 4646 + }, + { + "epoch": 0.28233793061546875, + "grad_norm": 0.17932644486427307, + "learning_rate": 8.186028180449981e-05, + "loss": 1.0744, + "step": 4647 + }, + { + "epoch": 0.2823986876480953, + "grad_norm": 0.4422328472137451, + "learning_rate": 8.185290354352493e-05, + "loss": 1.2956, + "step": 4648 + }, + { + "epoch": 0.2824594446807218, + "grad_norm": 0.24107696115970612, + "learning_rate": 8.18455241149757e-05, + "loss": 1.1298, + "step": 4649 + }, + { + "epoch": 0.2825202017133483, + "grad_norm": 0.22162525355815887, + "learning_rate": 8.183814351912265e-05, + "loss": 1.1265, + "step": 4650 + }, + { + "epoch": 0.28258095874597483, + "grad_norm": 0.39656251668930054, + "learning_rate": 8.183076175623628e-05, + "loss": 1.2503, + "step": 4651 + }, + { + "epoch": 0.28264171577860137, + "grad_norm": 0.2623554766178131, + "learning_rate": 8.182337882658717e-05, + "loss": 1.1311, + "step": 4652 + }, + { + "epoch": 0.2827024728112279, + "grad_norm": 0.18301978707313538, + "learning_rate": 8.181599473044598e-05, + "loss": 1.2532, + "step": 4653 + }, + { + "epoch": 0.28276322984385444, + "grad_norm": 1.0720328092575073, + "learning_rate": 8.180860946808336e-05, + "loss": 1.5338, + "step": 4654 + }, + { + "epoch": 0.28282398687648097, + "grad_norm": 0.38734033703804016, + "learning_rate": 8.180122303976999e-05, + "loss": 1.1591, + "step": 4655 + }, + { + "epoch": 0.2828847439091075, + "grad_norm": 0.5955361723899841, + "learning_rate": 8.179383544577664e-05, + "loss": 1.2536, + "step": 4656 + }, + { + "epoch": 0.282945500941734, + "grad_norm": 0.19823609292507172, + "learning_rate": 8.178644668637412e-05, + "loss": 1.0633, + "step": 4657 + }, + { + "epoch": 0.2830062579743605, + "grad_norm": 0.4541545510292053, + "learning_rate": 8.177905676183325e-05, + "loss": 1.1511, + "step": 4658 + }, + { + "epoch": 0.28306701500698705, + "grad_norm": 0.2795678675174713, + "learning_rate": 8.177166567242491e-05, + "loss": 1.1396, + "step": 4659 + }, + { + "epoch": 0.2831277720396136, + "grad_norm": 0.2806314527988434, + "learning_rate": 8.176427341842001e-05, + "loss": 1.1468, + "step": 4660 + }, + { + "epoch": 0.2831885290722401, + "grad_norm": 0.3850073218345642, + "learning_rate": 8.175688000008956e-05, + "loss": 1.1084, + "step": 4661 + }, + { + "epoch": 0.28324928610486666, + "grad_norm": 0.23455089330673218, + "learning_rate": 8.17494854177045e-05, + "loss": 1.1676, + "step": 4662 + }, + { + "epoch": 0.28331004313749314, + "grad_norm": 0.28819727897644043, + "learning_rate": 8.174208967153593e-05, + "loss": 1.2905, + "step": 4663 + }, + { + "epoch": 0.2833708001701197, + "grad_norm": 0.28520601987838745, + "learning_rate": 8.173469276185495e-05, + "loss": 1.1198, + "step": 4664 + }, + { + "epoch": 0.2834315572027462, + "grad_norm": 0.16577650606632233, + "learning_rate": 8.172729468893264e-05, + "loss": 1.1215, + "step": 4665 + }, + { + "epoch": 0.28349231423537274, + "grad_norm": 0.35518643260002136, + "learning_rate": 8.171989545304022e-05, + "loss": 1.3198, + "step": 4666 + }, + { + "epoch": 0.2835530712679993, + "grad_norm": 0.19905826449394226, + "learning_rate": 8.171249505444891e-05, + "loss": 1.1921, + "step": 4667 + }, + { + "epoch": 0.2836138283006258, + "grad_norm": 0.1744869202375412, + "learning_rate": 8.170509349342996e-05, + "loss": 1.081, + "step": 4668 + }, + { + "epoch": 0.28367458533325235, + "grad_norm": 0.26516008377075195, + "learning_rate": 8.169769077025466e-05, + "loss": 1.0577, + "step": 4669 + }, + { + "epoch": 0.28373534236587883, + "grad_norm": 0.17361098527908325, + "learning_rate": 8.16902868851944e-05, + "loss": 1.1093, + "step": 4670 + }, + { + "epoch": 0.28379609939850536, + "grad_norm": 0.29466310143470764, + "learning_rate": 8.168288183852055e-05, + "loss": 1.0544, + "step": 4671 + }, + { + "epoch": 0.2838568564311319, + "grad_norm": 0.2904646098613739, + "learning_rate": 8.167547563050452e-05, + "loss": 1.2464, + "step": 4672 + }, + { + "epoch": 0.28391761346375843, + "grad_norm": 0.3171013295650482, + "learning_rate": 8.166806826141784e-05, + "loss": 1.0834, + "step": 4673 + }, + { + "epoch": 0.28397837049638497, + "grad_norm": 0.20559333264827728, + "learning_rate": 8.166065973153199e-05, + "loss": 1.1319, + "step": 4674 + }, + { + "epoch": 0.2840391275290115, + "grad_norm": 1.1493704319000244, + "learning_rate": 8.165325004111852e-05, + "loss": 1.1405, + "step": 4675 + }, + { + "epoch": 0.284099884561638, + "grad_norm": 0.477880597114563, + "learning_rate": 8.164583919044906e-05, + "loss": 1.075, + "step": 4676 + }, + { + "epoch": 0.2841606415942645, + "grad_norm": 0.16275012493133545, + "learning_rate": 8.163842717979525e-05, + "loss": 1.0794, + "step": 4677 + }, + { + "epoch": 0.28422139862689105, + "grad_norm": 0.21456238627433777, + "learning_rate": 8.163101400942877e-05, + "loss": 1.1095, + "step": 4678 + }, + { + "epoch": 0.2842821556595176, + "grad_norm": 0.17840400338172913, + "learning_rate": 8.162359967962134e-05, + "loss": 1.1096, + "step": 4679 + }, + { + "epoch": 0.2843429126921441, + "grad_norm": 0.20673146843910217, + "learning_rate": 8.161618419064478e-05, + "loss": 1.1351, + "step": 4680 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 0.185395747423172, + "learning_rate": 8.160876754277087e-05, + "loss": 1.0876, + "step": 4681 + }, + { + "epoch": 0.2844644267573972, + "grad_norm": 0.2899877727031708, + "learning_rate": 8.160134973627147e-05, + "loss": 1.1328, + "step": 4682 + }, + { + "epoch": 0.2845251837900237, + "grad_norm": 0.13526462018489838, + "learning_rate": 8.159393077141849e-05, + "loss": 1.0842, + "step": 4683 + }, + { + "epoch": 0.2845859408226502, + "grad_norm": 0.19481588900089264, + "learning_rate": 8.158651064848387e-05, + "loss": 1.249, + "step": 4684 + }, + { + "epoch": 0.28464669785527674, + "grad_norm": 0.3543314039707184, + "learning_rate": 8.157908936773961e-05, + "loss": 1.0935, + "step": 4685 + }, + { + "epoch": 0.2847074548879033, + "grad_norm": 0.28960728645324707, + "learning_rate": 8.15716669294577e-05, + "loss": 1.1669, + "step": 4686 + }, + { + "epoch": 0.2847682119205298, + "grad_norm": 0.22876624763011932, + "learning_rate": 8.156424333391025e-05, + "loss": 1.1913, + "step": 4687 + }, + { + "epoch": 0.28482896895315635, + "grad_norm": 0.40697211027145386, + "learning_rate": 8.155681858136937e-05, + "loss": 1.0249, + "step": 4688 + }, + { + "epoch": 0.2848897259857828, + "grad_norm": 0.22622931003570557, + "learning_rate": 8.154939267210719e-05, + "loss": 1.0453, + "step": 4689 + }, + { + "epoch": 0.28495048301840936, + "grad_norm": 0.1916540116071701, + "learning_rate": 8.154196560639592e-05, + "loss": 1.1253, + "step": 4690 + }, + { + "epoch": 0.2850112400510359, + "grad_norm": 2.1343138217926025, + "learning_rate": 8.153453738450782e-05, + "loss": 1.0774, + "step": 4691 + }, + { + "epoch": 0.28507199708366243, + "grad_norm": 0.1893221139907837, + "learning_rate": 8.152710800671515e-05, + "loss": 1.1148, + "step": 4692 + }, + { + "epoch": 0.28513275411628897, + "grad_norm": 0.18746380507946014, + "learning_rate": 8.151967747329024e-05, + "loss": 1.1496, + "step": 4693 + }, + { + "epoch": 0.2851935111489155, + "grad_norm": 0.35583555698394775, + "learning_rate": 8.151224578450545e-05, + "loss": 1.1697, + "step": 4694 + }, + { + "epoch": 0.28525426818154204, + "grad_norm": 0.27671295404434204, + "learning_rate": 8.150481294063321e-05, + "loss": 1.0999, + "step": 4695 + }, + { + "epoch": 0.2853150252141685, + "grad_norm": 0.2466990202665329, + "learning_rate": 8.149737894194596e-05, + "loss": 1.0794, + "step": 4696 + }, + { + "epoch": 0.28537578224679505, + "grad_norm": 0.213019460439682, + "learning_rate": 8.14899437887162e-05, + "loss": 1.1749, + "step": 4697 + }, + { + "epoch": 0.2854365392794216, + "grad_norm": 0.26015639305114746, + "learning_rate": 8.148250748121646e-05, + "loss": 1.1433, + "step": 4698 + }, + { + "epoch": 0.2854972963120481, + "grad_norm": 0.2658534049987793, + "learning_rate": 8.147507001971933e-05, + "loss": 1.2109, + "step": 4699 + }, + { + "epoch": 0.28555805334467466, + "grad_norm": 0.35617485642433167, + "learning_rate": 8.14676314044974e-05, + "loss": 1.0989, + "step": 4700 + }, + { + "epoch": 0.2856188103773012, + "grad_norm": 0.18339228630065918, + "learning_rate": 8.146019163582339e-05, + "loss": 1.1722, + "step": 4701 + }, + { + "epoch": 0.2856795674099277, + "grad_norm": 0.24170809984207153, + "learning_rate": 8.145275071396996e-05, + "loss": 1.1733, + "step": 4702 + }, + { + "epoch": 0.2857403244425542, + "grad_norm": 0.19103199243545532, + "learning_rate": 8.144530863920987e-05, + "loss": 1.2067, + "step": 4703 + }, + { + "epoch": 0.28580108147518074, + "grad_norm": 0.20725975930690765, + "learning_rate": 8.143786541181591e-05, + "loss": 1.0453, + "step": 4704 + }, + { + "epoch": 0.2858618385078073, + "grad_norm": 0.17264272272586823, + "learning_rate": 8.143042103206094e-05, + "loss": 1.0208, + "step": 4705 + }, + { + "epoch": 0.2859225955404338, + "grad_norm": 0.1455267369747162, + "learning_rate": 8.142297550021777e-05, + "loss": 1.0559, + "step": 4706 + }, + { + "epoch": 0.28598335257306035, + "grad_norm": 0.24420267343521118, + "learning_rate": 8.14155288165594e-05, + "loss": 1.0617, + "step": 4707 + }, + { + "epoch": 0.2860441096056869, + "grad_norm": 0.23790127038955688, + "learning_rate": 8.140808098135872e-05, + "loss": 1.0679, + "step": 4708 + }, + { + "epoch": 0.28610486663831336, + "grad_norm": 0.16236484050750732, + "learning_rate": 8.140063199488878e-05, + "loss": 1.0838, + "step": 4709 + }, + { + "epoch": 0.2861656236709399, + "grad_norm": 1.5107526779174805, + "learning_rate": 8.139318185742259e-05, + "loss": 1.198, + "step": 4710 + }, + { + "epoch": 0.28622638070356643, + "grad_norm": 0.29380860924720764, + "learning_rate": 8.138573056923328e-05, + "loss": 1.1327, + "step": 4711 + }, + { + "epoch": 0.28628713773619296, + "grad_norm": 2.3719253540039062, + "learning_rate": 8.137827813059392e-05, + "loss": 1.1012, + "step": 4712 + }, + { + "epoch": 0.2863478947688195, + "grad_norm": 0.33579131960868835, + "learning_rate": 8.137082454177771e-05, + "loss": 1.2244, + "step": 4713 + }, + { + "epoch": 0.28640865180144603, + "grad_norm": 0.19710390269756317, + "learning_rate": 8.136336980305787e-05, + "loss": 1.057, + "step": 4714 + }, + { + "epoch": 0.28646940883407257, + "grad_norm": 0.2228107899427414, + "learning_rate": 8.135591391470766e-05, + "loss": 1.2096, + "step": 4715 + }, + { + "epoch": 0.28653016586669905, + "grad_norm": 0.19869717955589294, + "learning_rate": 8.134845687700035e-05, + "loss": 1.1458, + "step": 4716 + }, + { + "epoch": 0.2865909228993256, + "grad_norm": 0.19826026260852814, + "learning_rate": 8.13409986902093e-05, + "loss": 1.0752, + "step": 4717 + }, + { + "epoch": 0.2866516799319521, + "grad_norm": 0.24265635013580322, + "learning_rate": 8.133353935460788e-05, + "loss": 1.1447, + "step": 4718 + }, + { + "epoch": 0.28671243696457865, + "grad_norm": 0.295247346162796, + "learning_rate": 8.132607887046952e-05, + "loss": 1.1352, + "step": 4719 + }, + { + "epoch": 0.2867731939972052, + "grad_norm": 0.563098669052124, + "learning_rate": 8.131861723806769e-05, + "loss": 1.2249, + "step": 4720 + }, + { + "epoch": 0.2868339510298317, + "grad_norm": 0.22320379316806793, + "learning_rate": 8.131115445767588e-05, + "loss": 1.0678, + "step": 4721 + }, + { + "epoch": 0.2868947080624582, + "grad_norm": 0.23385919630527496, + "learning_rate": 8.130369052956765e-05, + "loss": 1.0941, + "step": 4722 + }, + { + "epoch": 0.28695546509508474, + "grad_norm": 0.22694692015647888, + "learning_rate": 8.129622545401661e-05, + "loss": 1.062, + "step": 4723 + }, + { + "epoch": 0.2870162221277113, + "grad_norm": 0.205569326877594, + "learning_rate": 8.128875923129635e-05, + "loss": 1.0572, + "step": 4724 + }, + { + "epoch": 0.2870769791603378, + "grad_norm": 0.7134333252906799, + "learning_rate": 8.12812918616806e-05, + "loss": 1.0946, + "step": 4725 + }, + { + "epoch": 0.28713773619296434, + "grad_norm": 0.1782463788986206, + "learning_rate": 8.127382334544302e-05, + "loss": 1.0619, + "step": 4726 + }, + { + "epoch": 0.2871984932255909, + "grad_norm": 0.2327854186296463, + "learning_rate": 8.126635368285742e-05, + "loss": 1.2124, + "step": 4727 + }, + { + "epoch": 0.2872592502582174, + "grad_norm": 0.18850277364253998, + "learning_rate": 8.125888287419759e-05, + "loss": 1.1462, + "step": 4728 + }, + { + "epoch": 0.2873200072908439, + "grad_norm": 0.20325693488121033, + "learning_rate": 8.125141091973736e-05, + "loss": 1.24, + "step": 4729 + }, + { + "epoch": 0.28738076432347043, + "grad_norm": 0.23426564037799835, + "learning_rate": 8.124393781975062e-05, + "loss": 1.3633, + "step": 4730 + }, + { + "epoch": 0.28744152135609696, + "grad_norm": 0.17329590022563934, + "learning_rate": 8.123646357451129e-05, + "loss": 1.0398, + "step": 4731 + }, + { + "epoch": 0.2875022783887235, + "grad_norm": 0.2696012258529663, + "learning_rate": 8.122898818429336e-05, + "loss": 1.281, + "step": 4732 + }, + { + "epoch": 0.28756303542135003, + "grad_norm": 0.333278626203537, + "learning_rate": 8.122151164937083e-05, + "loss": 1.2321, + "step": 4733 + }, + { + "epoch": 0.28762379245397657, + "grad_norm": 0.22068017721176147, + "learning_rate": 8.121403397001774e-05, + "loss": 1.0782, + "step": 4734 + }, + { + "epoch": 0.28768454948660305, + "grad_norm": 0.19823525846004486, + "learning_rate": 8.120655514650824e-05, + "loss": 1.0738, + "step": 4735 + }, + { + "epoch": 0.2877453065192296, + "grad_norm": 0.3010410964488983, + "learning_rate": 8.11990751791164e-05, + "loss": 1.1292, + "step": 4736 + }, + { + "epoch": 0.2878060635518561, + "grad_norm": 0.1889469176530838, + "learning_rate": 8.119159406811643e-05, + "loss": 1.0807, + "step": 4737 + }, + { + "epoch": 0.28786682058448265, + "grad_norm": 0.21690398454666138, + "learning_rate": 8.118411181378256e-05, + "loss": 1.103, + "step": 4738 + }, + { + "epoch": 0.2879275776171092, + "grad_norm": 0.5330593585968018, + "learning_rate": 8.117662841638903e-05, + "loss": 1.344, + "step": 4739 + }, + { + "epoch": 0.2879883346497357, + "grad_norm": 0.19033487141132355, + "learning_rate": 8.116914387621018e-05, + "loss": 1.1315, + "step": 4740 + }, + { + "epoch": 0.28804909168236226, + "grad_norm": 0.293365478515625, + "learning_rate": 8.116165819352031e-05, + "loss": 1.1059, + "step": 4741 + }, + { + "epoch": 0.28810984871498874, + "grad_norm": 0.19638319313526154, + "learning_rate": 8.115417136859385e-05, + "loss": 1.1785, + "step": 4742 + }, + { + "epoch": 0.28817060574761527, + "grad_norm": 0.2521298825740814, + "learning_rate": 8.114668340170522e-05, + "loss": 1.085, + "step": 4743 + }, + { + "epoch": 0.2882313627802418, + "grad_norm": 0.1999048888683319, + "learning_rate": 8.113919429312888e-05, + "loss": 1.1324, + "step": 4744 + }, + { + "epoch": 0.28829211981286834, + "grad_norm": 0.49342015385627747, + "learning_rate": 8.113170404313937e-05, + "loss": 1.0628, + "step": 4745 + }, + { + "epoch": 0.2883528768454949, + "grad_norm": 0.1688714176416397, + "learning_rate": 8.112421265201121e-05, + "loss": 1.1498, + "step": 4746 + }, + { + "epoch": 0.2884136338781214, + "grad_norm": 0.6101064682006836, + "learning_rate": 8.111672012001904e-05, + "loss": 1.243, + "step": 4747 + }, + { + "epoch": 0.28847439091074795, + "grad_norm": 1.2826509475708008, + "learning_rate": 8.110922644743746e-05, + "loss": 1.1068, + "step": 4748 + }, + { + "epoch": 0.2885351479433744, + "grad_norm": 0.22259414196014404, + "learning_rate": 8.110173163454118e-05, + "loss": 1.1576, + "step": 4749 + }, + { + "epoch": 0.28859590497600096, + "grad_norm": 0.18059051036834717, + "learning_rate": 8.109423568160491e-05, + "loss": 1.0945, + "step": 4750 + }, + { + "epoch": 0.2886566620086275, + "grad_norm": 0.16577550768852234, + "learning_rate": 8.108673858890342e-05, + "loss": 1.1079, + "step": 4751 + }, + { + "epoch": 0.28871741904125403, + "grad_norm": 0.5138909220695496, + "learning_rate": 8.107924035671153e-05, + "loss": 1.1859, + "step": 4752 + }, + { + "epoch": 0.28877817607388057, + "grad_norm": 0.18410277366638184, + "learning_rate": 8.107174098530406e-05, + "loss": 1.1216, + "step": 4753 + }, + { + "epoch": 0.2888389331065071, + "grad_norm": 0.2318829894065857, + "learning_rate": 8.106424047495593e-05, + "loss": 1.1795, + "step": 4754 + }, + { + "epoch": 0.2888996901391336, + "grad_norm": 0.14606933295726776, + "learning_rate": 8.105673882594207e-05, + "loss": 1.0248, + "step": 4755 + }, + { + "epoch": 0.2889604471717601, + "grad_norm": 0.14447882771492004, + "learning_rate": 8.104923603853743e-05, + "loss": 1.0765, + "step": 4756 + }, + { + "epoch": 0.28902120420438665, + "grad_norm": 0.1729961633682251, + "learning_rate": 8.104173211301705e-05, + "loss": 1.0789, + "step": 4757 + }, + { + "epoch": 0.2890819612370132, + "grad_norm": 0.1393560916185379, + "learning_rate": 8.103422704965597e-05, + "loss": 1.0379, + "step": 4758 + }, + { + "epoch": 0.2891427182696397, + "grad_norm": 0.2100219875574112, + "learning_rate": 8.10267208487293e-05, + "loss": 1.3307, + "step": 4759 + }, + { + "epoch": 0.28920347530226626, + "grad_norm": 0.3587530553340912, + "learning_rate": 8.10192135105122e-05, + "loss": 1.3124, + "step": 4760 + }, + { + "epoch": 0.2892642323348928, + "grad_norm": 0.19232696294784546, + "learning_rate": 8.10117050352798e-05, + "loss": 1.0873, + "step": 4761 + }, + { + "epoch": 0.28932498936751927, + "grad_norm": 0.16133850812911987, + "learning_rate": 8.100419542330738e-05, + "loss": 1.0677, + "step": 4762 + }, + { + "epoch": 0.2893857464001458, + "grad_norm": 0.12450205534696579, + "learning_rate": 8.099668467487018e-05, + "loss": 1.0577, + "step": 4763 + }, + { + "epoch": 0.28944650343277234, + "grad_norm": 0.9260332584381104, + "learning_rate": 8.09891727902435e-05, + "loss": 1.1521, + "step": 4764 + }, + { + "epoch": 0.2895072604653989, + "grad_norm": 0.1659279763698578, + "learning_rate": 8.098165976970273e-05, + "loss": 1.0509, + "step": 4765 + }, + { + "epoch": 0.2895680174980254, + "grad_norm": 0.18438123166561127, + "learning_rate": 8.097414561352321e-05, + "loss": 1.0343, + "step": 4766 + }, + { + "epoch": 0.28962877453065194, + "grad_norm": 0.14608722925186157, + "learning_rate": 8.09666303219804e-05, + "loss": 1.115, + "step": 4767 + }, + { + "epoch": 0.2896895315632784, + "grad_norm": 0.14206336438655853, + "learning_rate": 8.095911389534977e-05, + "loss": 1.0897, + "step": 4768 + }, + { + "epoch": 0.28975028859590496, + "grad_norm": 0.12960398197174072, + "learning_rate": 8.095159633390683e-05, + "loss": 1.0864, + "step": 4769 + }, + { + "epoch": 0.2898110456285315, + "grad_norm": 0.1357732117176056, + "learning_rate": 8.094407763792714e-05, + "loss": 1.0866, + "step": 4770 + }, + { + "epoch": 0.28987180266115803, + "grad_norm": 0.16963078081607819, + "learning_rate": 8.093655780768632e-05, + "loss": 1.1873, + "step": 4771 + }, + { + "epoch": 0.28993255969378456, + "grad_norm": 0.17405863106250763, + "learning_rate": 8.092903684345999e-05, + "loss": 1.0914, + "step": 4772 + }, + { + "epoch": 0.2899933167264111, + "grad_norm": 0.20222699642181396, + "learning_rate": 8.092151474552381e-05, + "loss": 1.0646, + "step": 4773 + }, + { + "epoch": 0.29005407375903763, + "grad_norm": 0.19266146421432495, + "learning_rate": 8.091399151415355e-05, + "loss": 1.0609, + "step": 4774 + }, + { + "epoch": 0.2901148307916641, + "grad_norm": 0.145595520734787, + "learning_rate": 8.090646714962497e-05, + "loss": 1.0504, + "step": 4775 + }, + { + "epoch": 0.29017558782429065, + "grad_norm": 0.17000088095664978, + "learning_rate": 8.089894165221386e-05, + "loss": 1.0952, + "step": 4776 + }, + { + "epoch": 0.2902363448569172, + "grad_norm": 0.17131289839744568, + "learning_rate": 8.089141502219603e-05, + "loss": 1.0528, + "step": 4777 + }, + { + "epoch": 0.2902971018895437, + "grad_norm": 0.1466439813375473, + "learning_rate": 8.088388725984745e-05, + "loss": 1.0521, + "step": 4778 + }, + { + "epoch": 0.29035785892217025, + "grad_norm": 0.16580042243003845, + "learning_rate": 8.0876358365444e-05, + "loss": 1.1028, + "step": 4779 + }, + { + "epoch": 0.2904186159547968, + "grad_norm": 0.24699246883392334, + "learning_rate": 8.086882833926168e-05, + "loss": 1.1469, + "step": 4780 + }, + { + "epoch": 0.29047937298742327, + "grad_norm": 0.18186621367931366, + "learning_rate": 8.086129718157648e-05, + "loss": 1.1523, + "step": 4781 + }, + { + "epoch": 0.2905401300200498, + "grad_norm": 0.15710754692554474, + "learning_rate": 8.085376489266447e-05, + "loss": 1.0283, + "step": 4782 + }, + { + "epoch": 0.29060088705267634, + "grad_norm": 0.2273917943239212, + "learning_rate": 8.084623147280173e-05, + "loss": 1.1856, + "step": 4783 + }, + { + "epoch": 0.2906616440853029, + "grad_norm": 0.21383728086948395, + "learning_rate": 8.083869692226442e-05, + "loss": 1.1517, + "step": 4784 + }, + { + "epoch": 0.2907224011179294, + "grad_norm": 0.22727559506893158, + "learning_rate": 8.083116124132873e-05, + "loss": 1.1662, + "step": 4785 + }, + { + "epoch": 0.29078315815055594, + "grad_norm": 0.18458192050457, + "learning_rate": 8.082362443027085e-05, + "loss": 1.1141, + "step": 4786 + }, + { + "epoch": 0.2908439151831825, + "grad_norm": 2.501112699508667, + "learning_rate": 8.081608648936707e-05, + "loss": 1.2255, + "step": 4787 + }, + { + "epoch": 0.29090467221580896, + "grad_norm": 0.2777290642261505, + "learning_rate": 8.080854741889367e-05, + "loss": 1.114, + "step": 4788 + }, + { + "epoch": 0.2909654292484355, + "grad_norm": 0.13057801127433777, + "learning_rate": 8.080100721912702e-05, + "loss": 1.0318, + "step": 4789 + }, + { + "epoch": 0.291026186281062, + "grad_norm": 0.14138005673885345, + "learning_rate": 8.079346589034347e-05, + "loss": 1.0856, + "step": 4790 + }, + { + "epoch": 0.29108694331368856, + "grad_norm": 0.628367006778717, + "learning_rate": 8.07859234328195e-05, + "loss": 1.0691, + "step": 4791 + }, + { + "epoch": 0.2911477003463151, + "grad_norm": 0.1583787351846695, + "learning_rate": 8.077837984683155e-05, + "loss": 1.061, + "step": 4792 + }, + { + "epoch": 0.29120845737894163, + "grad_norm": 0.1693117320537567, + "learning_rate": 8.077083513265613e-05, + "loss": 1.1425, + "step": 4793 + }, + { + "epoch": 0.2912692144115681, + "grad_norm": 0.19208115339279175, + "learning_rate": 8.076328929056982e-05, + "loss": 1.1139, + "step": 4794 + }, + { + "epoch": 0.29132997144419465, + "grad_norm": 0.14601236581802368, + "learning_rate": 8.075574232084918e-05, + "loss": 1.0764, + "step": 4795 + }, + { + "epoch": 0.2913907284768212, + "grad_norm": 0.1709860861301422, + "learning_rate": 8.074819422377086e-05, + "loss": 1.1293, + "step": 4796 + }, + { + "epoch": 0.2914514855094477, + "grad_norm": 0.16490888595581055, + "learning_rate": 8.074064499961153e-05, + "loss": 1.0842, + "step": 4797 + }, + { + "epoch": 0.29151224254207425, + "grad_norm": 0.29872190952301025, + "learning_rate": 8.073309464864794e-05, + "loss": 1.2023, + "step": 4798 + }, + { + "epoch": 0.2915729995747008, + "grad_norm": 0.21467891335487366, + "learning_rate": 8.072554317115681e-05, + "loss": 1.0675, + "step": 4799 + }, + { + "epoch": 0.2916337566073273, + "grad_norm": 0.3493388295173645, + "learning_rate": 8.071799056741495e-05, + "loss": 1.1085, + "step": 4800 + }, + { + "epoch": 0.2916945136399538, + "grad_norm": 0.14783447980880737, + "learning_rate": 8.071043683769922e-05, + "loss": 1.1098, + "step": 4801 + }, + { + "epoch": 0.29175527067258034, + "grad_norm": 0.1892547458410263, + "learning_rate": 8.070288198228648e-05, + "loss": 1.1149, + "step": 4802 + }, + { + "epoch": 0.29181602770520687, + "grad_norm": 0.1804693639278412, + "learning_rate": 8.069532600145368e-05, + "loss": 1.2042, + "step": 4803 + }, + { + "epoch": 0.2918767847378334, + "grad_norm": 0.17891071736812592, + "learning_rate": 8.068776889547777e-05, + "loss": 1.1429, + "step": 4804 + }, + { + "epoch": 0.29193754177045994, + "grad_norm": 0.1375022828578949, + "learning_rate": 8.068021066463575e-05, + "loss": 1.0472, + "step": 4805 + }, + { + "epoch": 0.2919982988030865, + "grad_norm": 0.14984142780303955, + "learning_rate": 8.067265130920468e-05, + "loss": 1.0789, + "step": 4806 + }, + { + "epoch": 0.292059055835713, + "grad_norm": 0.2128085345029831, + "learning_rate": 8.066509082946166e-05, + "loss": 1.1318, + "step": 4807 + }, + { + "epoch": 0.2921198128683395, + "grad_norm": 0.15035560727119446, + "learning_rate": 8.06575292256838e-05, + "loss": 1.0993, + "step": 4808 + }, + { + "epoch": 0.292180569900966, + "grad_norm": 0.17657846212387085, + "learning_rate": 8.064996649814827e-05, + "loss": 1.1598, + "step": 4809 + }, + { + "epoch": 0.29224132693359256, + "grad_norm": 0.16894984245300293, + "learning_rate": 8.06424026471323e-05, + "loss": 1.0638, + "step": 4810 + }, + { + "epoch": 0.2923020839662191, + "grad_norm": 0.4684833288192749, + "learning_rate": 8.063483767291313e-05, + "loss": 1.1031, + "step": 4811 + }, + { + "epoch": 0.29236284099884563, + "grad_norm": 0.2029121220111847, + "learning_rate": 8.062727157576808e-05, + "loss": 1.0987, + "step": 4812 + }, + { + "epoch": 0.29242359803147217, + "grad_norm": 0.1746753752231598, + "learning_rate": 8.061970435597447e-05, + "loss": 1.105, + "step": 4813 + }, + { + "epoch": 0.29248435506409864, + "grad_norm": 0.16286218166351318, + "learning_rate": 8.061213601380966e-05, + "loss": 1.0918, + "step": 4814 + }, + { + "epoch": 0.2925451120967252, + "grad_norm": 0.12893499433994293, + "learning_rate": 8.06045665495511e-05, + "loss": 1.0189, + "step": 4815 + }, + { + "epoch": 0.2926058691293517, + "grad_norm": 0.19703014194965363, + "learning_rate": 8.059699596347625e-05, + "loss": 1.1905, + "step": 4816 + }, + { + "epoch": 0.29266662616197825, + "grad_norm": 0.16384656727313995, + "learning_rate": 8.058942425586258e-05, + "loss": 1.0969, + "step": 4817 + }, + { + "epoch": 0.2927273831946048, + "grad_norm": 0.18788295984268188, + "learning_rate": 8.058185142698765e-05, + "loss": 1.0691, + "step": 4818 + }, + { + "epoch": 0.2927881402272313, + "grad_norm": 0.1627199947834015, + "learning_rate": 8.057427747712905e-05, + "loss": 1.0965, + "step": 4819 + }, + { + "epoch": 0.29284889725985785, + "grad_norm": 0.24151472747325897, + "learning_rate": 8.05667024065644e-05, + "loss": 1.0922, + "step": 4820 + }, + { + "epoch": 0.29290965429248433, + "grad_norm": 0.16679595410823822, + "learning_rate": 8.055912621557136e-05, + "loss": 1.1382, + "step": 4821 + }, + { + "epoch": 0.29297041132511087, + "grad_norm": 0.32155275344848633, + "learning_rate": 8.055154890442766e-05, + "loss": 1.0255, + "step": 4822 + }, + { + "epoch": 0.2930311683577374, + "grad_norm": 0.11976300179958344, + "learning_rate": 8.0543970473411e-05, + "loss": 1.0184, + "step": 4823 + }, + { + "epoch": 0.29309192539036394, + "grad_norm": 0.17672741413116455, + "learning_rate": 8.053639092279922e-05, + "loss": 1.1088, + "step": 4824 + }, + { + "epoch": 0.2931526824229905, + "grad_norm": 0.27308014035224915, + "learning_rate": 8.052881025287014e-05, + "loss": 1.0774, + "step": 4825 + }, + { + "epoch": 0.293213439455617, + "grad_norm": 0.12782545387744904, + "learning_rate": 8.052122846390159e-05, + "loss": 1.0579, + "step": 4826 + }, + { + "epoch": 0.2932741964882435, + "grad_norm": 1.653658390045166, + "learning_rate": 8.051364555617151e-05, + "loss": 1.1211, + "step": 4827 + }, + { + "epoch": 0.29333495352087, + "grad_norm": 0.17118819057941437, + "learning_rate": 8.050606152995788e-05, + "loss": 1.1686, + "step": 4828 + }, + { + "epoch": 0.29339571055349656, + "grad_norm": 0.12530478835105896, + "learning_rate": 8.049847638553864e-05, + "loss": 1.0382, + "step": 4829 + }, + { + "epoch": 0.2934564675861231, + "grad_norm": 0.30110740661621094, + "learning_rate": 8.049089012319188e-05, + "loss": 1.1891, + "step": 4830 + }, + { + "epoch": 0.29351722461874963, + "grad_norm": 0.21907773613929749, + "learning_rate": 8.048330274319562e-05, + "loss": 1.1105, + "step": 4831 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 0.1301761120557785, + "learning_rate": 8.047571424582803e-05, + "loss": 1.0864, + "step": 4832 + }, + { + "epoch": 0.2936387386840027, + "grad_norm": 0.2238079011440277, + "learning_rate": 8.046812463136724e-05, + "loss": 1.051, + "step": 4833 + }, + { + "epoch": 0.2936994957166292, + "grad_norm": 0.34450873732566833, + "learning_rate": 8.046053390009144e-05, + "loss": 1.1267, + "step": 4834 + }, + { + "epoch": 0.2937602527492557, + "grad_norm": 0.14757469296455383, + "learning_rate": 8.045294205227889e-05, + "loss": 1.0877, + "step": 4835 + }, + { + "epoch": 0.29382100978188225, + "grad_norm": 0.13920970261096954, + "learning_rate": 8.044534908820786e-05, + "loss": 1.0596, + "step": 4836 + }, + { + "epoch": 0.2938817668145088, + "grad_norm": 0.28844472765922546, + "learning_rate": 8.043775500815668e-05, + "loss": 1.2527, + "step": 4837 + }, + { + "epoch": 0.2939425238471353, + "grad_norm": 0.18626797199249268, + "learning_rate": 8.04301598124037e-05, + "loss": 1.1656, + "step": 4838 + }, + { + "epoch": 0.29400328087976185, + "grad_norm": 0.21400314569473267, + "learning_rate": 8.042256350122734e-05, + "loss": 1.1937, + "step": 4839 + }, + { + "epoch": 0.29406403791238833, + "grad_norm": 0.22018057107925415, + "learning_rate": 8.041496607490602e-05, + "loss": 1.1395, + "step": 4840 + }, + { + "epoch": 0.29412479494501487, + "grad_norm": 0.17825329303741455, + "learning_rate": 8.040736753371825e-05, + "loss": 1.1149, + "step": 4841 + }, + { + "epoch": 0.2941855519776414, + "grad_norm": 0.1618984341621399, + "learning_rate": 8.039976787794255e-05, + "loss": 1.0572, + "step": 4842 + }, + { + "epoch": 0.29424630901026794, + "grad_norm": 0.15700997412204742, + "learning_rate": 8.039216710785746e-05, + "loss": 1.0678, + "step": 4843 + }, + { + "epoch": 0.29430706604289447, + "grad_norm": 0.3226756751537323, + "learning_rate": 8.038456522374163e-05, + "loss": 1.3166, + "step": 4844 + }, + { + "epoch": 0.294367823075521, + "grad_norm": 0.11979656666517258, + "learning_rate": 8.037696222587368e-05, + "loss": 1.0468, + "step": 4845 + }, + { + "epoch": 0.29442858010814754, + "grad_norm": 0.12319060415029526, + "learning_rate": 8.036935811453231e-05, + "loss": 1.0336, + "step": 4846 + }, + { + "epoch": 0.294489337140774, + "grad_norm": 0.17491278052330017, + "learning_rate": 8.036175288999625e-05, + "loss": 1.1986, + "step": 4847 + }, + { + "epoch": 0.29455009417340056, + "grad_norm": 0.25195613503456116, + "learning_rate": 8.035414655254427e-05, + "loss": 1.231, + "step": 4848 + }, + { + "epoch": 0.2946108512060271, + "grad_norm": 0.1673683524131775, + "learning_rate": 8.034653910245517e-05, + "loss": 1.0813, + "step": 4849 + }, + { + "epoch": 0.2946716082386536, + "grad_norm": 0.20356975495815277, + "learning_rate": 8.033893054000783e-05, + "loss": 1.1635, + "step": 4850 + }, + { + "epoch": 0.29473236527128016, + "grad_norm": 0.6214486956596375, + "learning_rate": 8.033132086548112e-05, + "loss": 1.1949, + "step": 4851 + }, + { + "epoch": 0.2947931223039067, + "grad_norm": 0.20783737301826477, + "learning_rate": 8.032371007915397e-05, + "loss": 1.1779, + "step": 4852 + }, + { + "epoch": 0.29485387933653323, + "grad_norm": 0.15741893649101257, + "learning_rate": 8.031609818130538e-05, + "loss": 1.0474, + "step": 4853 + }, + { + "epoch": 0.2949146363691597, + "grad_norm": 0.2226976752281189, + "learning_rate": 8.030848517221434e-05, + "loss": 1.0723, + "step": 4854 + }, + { + "epoch": 0.29497539340178625, + "grad_norm": 0.46578341722488403, + "learning_rate": 8.030087105215993e-05, + "loss": 1.0787, + "step": 4855 + }, + { + "epoch": 0.2950361504344128, + "grad_norm": 0.16051632165908813, + "learning_rate": 8.029325582142121e-05, + "loss": 1.1115, + "step": 4856 + }, + { + "epoch": 0.2950969074670393, + "grad_norm": 0.28879329562187195, + "learning_rate": 8.028563948027737e-05, + "loss": 1.1211, + "step": 4857 + }, + { + "epoch": 0.29515766449966585, + "grad_norm": 0.24414896965026855, + "learning_rate": 8.027802202900756e-05, + "loss": 1.0423, + "step": 4858 + }, + { + "epoch": 0.2952184215322924, + "grad_norm": 0.15892767906188965, + "learning_rate": 8.027040346789099e-05, + "loss": 1.1272, + "step": 4859 + }, + { + "epoch": 0.29527917856491886, + "grad_norm": 0.1327177882194519, + "learning_rate": 8.026278379720693e-05, + "loss": 1.0385, + "step": 4860 + }, + { + "epoch": 0.2953399355975454, + "grad_norm": 0.14075778424739838, + "learning_rate": 8.025516301723468e-05, + "loss": 1.0746, + "step": 4861 + }, + { + "epoch": 0.29540069263017193, + "grad_norm": 0.11402823776006699, + "learning_rate": 8.02475411282536e-05, + "loss": 1.0574, + "step": 4862 + }, + { + "epoch": 0.29546144966279847, + "grad_norm": 0.18731290102005005, + "learning_rate": 8.023991813054302e-05, + "loss": 1.0603, + "step": 4863 + }, + { + "epoch": 0.295522206695425, + "grad_norm": 0.20494194328784943, + "learning_rate": 8.023229402438241e-05, + "loss": 1.1572, + "step": 4864 + }, + { + "epoch": 0.29558296372805154, + "grad_norm": 0.23749224841594696, + "learning_rate": 8.022466881005121e-05, + "loss": 1.0468, + "step": 4865 + }, + { + "epoch": 0.2956437207606781, + "grad_norm": 0.17095661163330078, + "learning_rate": 8.021704248782895e-05, + "loss": 1.1394, + "step": 4866 + }, + { + "epoch": 0.29570447779330455, + "grad_norm": 0.5161401629447937, + "learning_rate": 8.020941505799515e-05, + "loss": 1.2939, + "step": 4867 + }, + { + "epoch": 0.2957652348259311, + "grad_norm": 0.1483251452445984, + "learning_rate": 8.02017865208294e-05, + "loss": 1.0967, + "step": 4868 + }, + { + "epoch": 0.2958259918585576, + "grad_norm": 0.31485968828201294, + "learning_rate": 8.019415687661134e-05, + "loss": 1.0865, + "step": 4869 + }, + { + "epoch": 0.29588674889118416, + "grad_norm": 0.5022217631340027, + "learning_rate": 8.01865261256206e-05, + "loss": 1.1151, + "step": 4870 + }, + { + "epoch": 0.2959475059238107, + "grad_norm": 0.2569407522678375, + "learning_rate": 8.017889426813692e-05, + "loss": 1.1146, + "step": 4871 + }, + { + "epoch": 0.29600826295643723, + "grad_norm": 0.32475942373275757, + "learning_rate": 8.017126130444004e-05, + "loss": 1.1545, + "step": 4872 + }, + { + "epoch": 0.2960690199890637, + "grad_norm": 0.32882651686668396, + "learning_rate": 8.016362723480974e-05, + "loss": 1.0458, + "step": 4873 + }, + { + "epoch": 0.29612977702169024, + "grad_norm": 0.28669607639312744, + "learning_rate": 8.015599205952586e-05, + "loss": 1.108, + "step": 4874 + }, + { + "epoch": 0.2961905340543168, + "grad_norm": 0.26534077525138855, + "learning_rate": 8.014835577886826e-05, + "loss": 1.0918, + "step": 4875 + }, + { + "epoch": 0.2962512910869433, + "grad_norm": 0.19056156277656555, + "learning_rate": 8.014071839311686e-05, + "loss": 1.1559, + "step": 4876 + }, + { + "epoch": 0.29631204811956985, + "grad_norm": 0.1659012883901596, + "learning_rate": 8.013307990255159e-05, + "loss": 1.0685, + "step": 4877 + }, + { + "epoch": 0.2963728051521964, + "grad_norm": 0.5224549174308777, + "learning_rate": 8.012544030745244e-05, + "loss": 1.1839, + "step": 4878 + }, + { + "epoch": 0.2964335621848229, + "grad_norm": 0.2468225061893463, + "learning_rate": 8.011779960809947e-05, + "loss": 1.1048, + "step": 4879 + }, + { + "epoch": 0.2964943192174494, + "grad_norm": 0.14499223232269287, + "learning_rate": 8.011015780477274e-05, + "loss": 1.092, + "step": 4880 + }, + { + "epoch": 0.29655507625007593, + "grad_norm": 0.1538555920124054, + "learning_rate": 8.010251489775234e-05, + "loss": 1.0888, + "step": 4881 + }, + { + "epoch": 0.29661583328270247, + "grad_norm": 0.22946538031101227, + "learning_rate": 8.009487088731845e-05, + "loss": 1.1833, + "step": 4882 + }, + { + "epoch": 0.296676590315329, + "grad_norm": 0.17426064610481262, + "learning_rate": 8.008722577375122e-05, + "loss": 1.1255, + "step": 4883 + }, + { + "epoch": 0.29673734734795554, + "grad_norm": 0.22603633999824524, + "learning_rate": 8.007957955733094e-05, + "loss": 1.1739, + "step": 4884 + }, + { + "epoch": 0.2967981043805821, + "grad_norm": 0.3607141077518463, + "learning_rate": 8.007193223833786e-05, + "loss": 1.1017, + "step": 4885 + }, + { + "epoch": 0.29685886141320855, + "grad_norm": 0.2515600621700287, + "learning_rate": 8.006428381705228e-05, + "loss": 1.1138, + "step": 4886 + }, + { + "epoch": 0.2969196184458351, + "grad_norm": 0.11034715920686722, + "learning_rate": 8.005663429375457e-05, + "loss": 1.0618, + "step": 4887 + }, + { + "epoch": 0.2969803754784616, + "grad_norm": 0.24948768317699432, + "learning_rate": 8.004898366872512e-05, + "loss": 1.0942, + "step": 4888 + }, + { + "epoch": 0.29704113251108816, + "grad_norm": 0.20381641387939453, + "learning_rate": 8.004133194224436e-05, + "loss": 1.0782, + "step": 4889 + }, + { + "epoch": 0.2971018895437147, + "grad_norm": 0.20852461457252502, + "learning_rate": 8.003367911459279e-05, + "loss": 1.135, + "step": 4890 + }, + { + "epoch": 0.2971626465763412, + "grad_norm": 2.300133228302002, + "learning_rate": 8.002602518605089e-05, + "loss": 1.0878, + "step": 4891 + }, + { + "epoch": 0.29722340360896776, + "grad_norm": 0.22455871105194092, + "learning_rate": 8.001837015689924e-05, + "loss": 1.1789, + "step": 4892 + }, + { + "epoch": 0.29728416064159424, + "grad_norm": 0.15111899375915527, + "learning_rate": 8.001071402741842e-05, + "loss": 1.085, + "step": 4893 + }, + { + "epoch": 0.2973449176742208, + "grad_norm": 0.21998457610607147, + "learning_rate": 8.000305679788908e-05, + "loss": 1.0692, + "step": 4894 + }, + { + "epoch": 0.2974056747068473, + "grad_norm": 0.34050309658050537, + "learning_rate": 7.99953984685919e-05, + "loss": 1.1033, + "step": 4895 + }, + { + "epoch": 0.29746643173947385, + "grad_norm": 0.24851812422275543, + "learning_rate": 7.998773903980759e-05, + "loss": 1.2472, + "step": 4896 + }, + { + "epoch": 0.2975271887721004, + "grad_norm": 0.1403234750032425, + "learning_rate": 7.99800785118169e-05, + "loss": 1.0835, + "step": 4897 + }, + { + "epoch": 0.2975879458047269, + "grad_norm": 0.14398226141929626, + "learning_rate": 7.997241688490062e-05, + "loss": 1.0664, + "step": 4898 + }, + { + "epoch": 0.2976487028373534, + "grad_norm": 0.2252487987279892, + "learning_rate": 7.996475415933964e-05, + "loss": 1.1443, + "step": 4899 + }, + { + "epoch": 0.29770945986997993, + "grad_norm": 0.15963275730609894, + "learning_rate": 7.99570903354148e-05, + "loss": 1.0689, + "step": 4900 + }, + { + "epoch": 0.29777021690260647, + "grad_norm": 0.14332592487335205, + "learning_rate": 7.9949425413407e-05, + "loss": 1.1201, + "step": 4901 + }, + { + "epoch": 0.297830973935233, + "grad_norm": 0.25667598843574524, + "learning_rate": 7.994175939359725e-05, + "loss": 1.2122, + "step": 4902 + }, + { + "epoch": 0.29789173096785954, + "grad_norm": 0.24005277454853058, + "learning_rate": 7.993409227626651e-05, + "loss": 1.1038, + "step": 4903 + }, + { + "epoch": 0.29795248800048607, + "grad_norm": 0.45693233609199524, + "learning_rate": 7.992642406169584e-05, + "loss": 1.061, + "step": 4904 + }, + { + "epoch": 0.2980132450331126, + "grad_norm": 0.2885357737541199, + "learning_rate": 7.991875475016629e-05, + "loss": 1.0943, + "step": 4905 + }, + { + "epoch": 0.2980740020657391, + "grad_norm": 0.15787114202976227, + "learning_rate": 7.991108434195902e-05, + "loss": 1.0836, + "step": 4906 + }, + { + "epoch": 0.2981347590983656, + "grad_norm": 0.25007563829421997, + "learning_rate": 7.990341283735516e-05, + "loss": 1.1384, + "step": 4907 + }, + { + "epoch": 0.29819551613099216, + "grad_norm": 0.15230529010295868, + "learning_rate": 7.989574023663592e-05, + "loss": 1.0359, + "step": 4908 + }, + { + "epoch": 0.2982562731636187, + "grad_norm": 0.49063366651535034, + "learning_rate": 7.988806654008256e-05, + "loss": 1.0633, + "step": 4909 + }, + { + "epoch": 0.2983170301962452, + "grad_norm": 0.24013467133045197, + "learning_rate": 7.988039174797633e-05, + "loss": 1.1554, + "step": 4910 + }, + { + "epoch": 0.29837778722887176, + "grad_norm": 0.42186757922172546, + "learning_rate": 7.987271586059857e-05, + "loss": 1.1717, + "step": 4911 + }, + { + "epoch": 0.2984385442614983, + "grad_norm": 0.29849252104759216, + "learning_rate": 7.986503887823064e-05, + "loss": 1.0789, + "step": 4912 + }, + { + "epoch": 0.2984993012941248, + "grad_norm": 0.19035378098487854, + "learning_rate": 7.985736080115393e-05, + "loss": 1.1628, + "step": 4913 + }, + { + "epoch": 0.2985600583267513, + "grad_norm": 0.13357847929000854, + "learning_rate": 7.98496816296499e-05, + "loss": 1.0374, + "step": 4914 + }, + { + "epoch": 0.29862081535937784, + "grad_norm": 0.25546133518218994, + "learning_rate": 7.984200136399999e-05, + "loss": 1.0679, + "step": 4915 + }, + { + "epoch": 0.2986815723920044, + "grad_norm": 0.21756219863891602, + "learning_rate": 7.983432000448578e-05, + "loss": 1.0823, + "step": 4916 + }, + { + "epoch": 0.2987423294246309, + "grad_norm": 0.17571885883808136, + "learning_rate": 7.982663755138879e-05, + "loss": 1.0591, + "step": 4917 + }, + { + "epoch": 0.29880308645725745, + "grad_norm": 0.2558269500732422, + "learning_rate": 7.981895400499065e-05, + "loss": 1.0669, + "step": 4918 + }, + { + "epoch": 0.29886384348988393, + "grad_norm": 0.4805607199668884, + "learning_rate": 7.981126936557298e-05, + "loss": 1.2421, + "step": 4919 + }, + { + "epoch": 0.29892460052251046, + "grad_norm": 0.7044974565505981, + "learning_rate": 7.980358363341747e-05, + "loss": 1.1244, + "step": 4920 + }, + { + "epoch": 0.298985357555137, + "grad_norm": 0.3205346465110779, + "learning_rate": 7.979589680880584e-05, + "loss": 1.1731, + "step": 4921 + }, + { + "epoch": 0.29904611458776353, + "grad_norm": 0.18769443035125732, + "learning_rate": 7.978820889201985e-05, + "loss": 1.0607, + "step": 4922 + }, + { + "epoch": 0.29910687162039007, + "grad_norm": 0.18973058462142944, + "learning_rate": 7.97805198833413e-05, + "loss": 1.1534, + "step": 4923 + }, + { + "epoch": 0.2991676286530166, + "grad_norm": 0.26898670196533203, + "learning_rate": 7.977282978305205e-05, + "loss": 1.1502, + "step": 4924 + }, + { + "epoch": 0.29922838568564314, + "grad_norm": 0.13413062691688538, + "learning_rate": 7.976513859143395e-05, + "loss": 1.0372, + "step": 4925 + }, + { + "epoch": 0.2992891427182696, + "grad_norm": 0.22134798765182495, + "learning_rate": 7.975744630876896e-05, + "loss": 1.0807, + "step": 4926 + }, + { + "epoch": 0.29934989975089615, + "grad_norm": 0.760003387928009, + "learning_rate": 7.974975293533902e-05, + "loss": 1.098, + "step": 4927 + }, + { + "epoch": 0.2994106567835227, + "grad_norm": 0.1347123384475708, + "learning_rate": 7.974205847142613e-05, + "loss": 1.1281, + "step": 4928 + }, + { + "epoch": 0.2994714138161492, + "grad_norm": 0.16674065589904785, + "learning_rate": 7.973436291731234e-05, + "loss": 1.0827, + "step": 4929 + }, + { + "epoch": 0.29953217084877576, + "grad_norm": 0.1286884993314743, + "learning_rate": 7.972666627327974e-05, + "loss": 1.052, + "step": 4930 + }, + { + "epoch": 0.2995929278814023, + "grad_norm": 0.172612726688385, + "learning_rate": 7.971896853961042e-05, + "loss": 1.1248, + "step": 4931 + }, + { + "epoch": 0.2996536849140288, + "grad_norm": 0.5562714338302612, + "learning_rate": 7.971126971658659e-05, + "loss": 1.3952, + "step": 4932 + }, + { + "epoch": 0.2997144419466553, + "grad_norm": 0.25090518593788147, + "learning_rate": 7.970356980449041e-05, + "loss": 1.055, + "step": 4933 + }, + { + "epoch": 0.29977519897928184, + "grad_norm": 0.1664423793554306, + "learning_rate": 7.969586880360413e-05, + "loss": 1.0859, + "step": 4934 + }, + { + "epoch": 0.2998359560119084, + "grad_norm": 0.3369622230529785, + "learning_rate": 7.968816671421005e-05, + "loss": 1.231, + "step": 4935 + }, + { + "epoch": 0.2998967130445349, + "grad_norm": 0.20732329785823822, + "learning_rate": 7.968046353659048e-05, + "loss": 1.2489, + "step": 4936 + }, + { + "epoch": 0.29995747007716145, + "grad_norm": 0.1416826993227005, + "learning_rate": 7.967275927102777e-05, + "loss": 1.0499, + "step": 4937 + }, + { + "epoch": 0.300018227109788, + "grad_norm": 0.1614678055047989, + "learning_rate": 7.966505391780436e-05, + "loss": 1.1482, + "step": 4938 + }, + { + "epoch": 0.30007898414241446, + "grad_norm": 0.20388440787792206, + "learning_rate": 7.965734747720264e-05, + "loss": 1.098, + "step": 4939 + }, + { + "epoch": 0.300139741175041, + "grad_norm": 0.15187425911426544, + "learning_rate": 7.964963994950513e-05, + "loss": 1.0801, + "step": 4940 + }, + { + "epoch": 0.30020049820766753, + "grad_norm": 0.14990165829658508, + "learning_rate": 7.964193133499432e-05, + "loss": 1.0409, + "step": 4941 + }, + { + "epoch": 0.30026125524029407, + "grad_norm": 0.24640433490276337, + "learning_rate": 7.963422163395281e-05, + "loss": 1.1354, + "step": 4942 + }, + { + "epoch": 0.3003220122729206, + "grad_norm": 0.16749872267246246, + "learning_rate": 7.962651084666316e-05, + "loss": 1.0518, + "step": 4943 + }, + { + "epoch": 0.30038276930554714, + "grad_norm": 0.11995523422956467, + "learning_rate": 7.961879897340803e-05, + "loss": 1.0149, + "step": 4944 + }, + { + "epoch": 0.3004435263381736, + "grad_norm": 0.22328971326351166, + "learning_rate": 7.96110860144701e-05, + "loss": 1.1297, + "step": 4945 + }, + { + "epoch": 0.30050428337080015, + "grad_norm": 0.2527675926685333, + "learning_rate": 7.96033719701321e-05, + "loss": 0.9945, + "step": 4946 + }, + { + "epoch": 0.3005650404034267, + "grad_norm": 0.43052467703819275, + "learning_rate": 7.959565684067675e-05, + "loss": 1.2694, + "step": 4947 + }, + { + "epoch": 0.3006257974360532, + "grad_norm": 0.28732234239578247, + "learning_rate": 7.95879406263869e-05, + "loss": 1.0808, + "step": 4948 + }, + { + "epoch": 0.30068655446867976, + "grad_norm": 3.940929889678955, + "learning_rate": 7.958022332754537e-05, + "loss": 1.0754, + "step": 4949 + }, + { + "epoch": 0.3007473115013063, + "grad_norm": 0.33323633670806885, + "learning_rate": 7.957250494443504e-05, + "loss": 1.1545, + "step": 4950 + }, + { + "epoch": 0.3008080685339328, + "grad_norm": 0.15152540802955627, + "learning_rate": 7.956478547733881e-05, + "loss": 1.0185, + "step": 4951 + }, + { + "epoch": 0.3008688255665593, + "grad_norm": 0.1421220600605011, + "learning_rate": 7.955706492653966e-05, + "loss": 1.1018, + "step": 4952 + }, + { + "epoch": 0.30092958259918584, + "grad_norm": 0.39070144295692444, + "learning_rate": 7.954934329232058e-05, + "loss": 1.0973, + "step": 4953 + }, + { + "epoch": 0.3009903396318124, + "grad_norm": 0.5201708078384399, + "learning_rate": 7.954162057496462e-05, + "loss": 1.1615, + "step": 4954 + }, + { + "epoch": 0.3010510966644389, + "grad_norm": 0.1853858232498169, + "learning_rate": 7.953389677475482e-05, + "loss": 1.1546, + "step": 4955 + }, + { + "epoch": 0.30111185369706545, + "grad_norm": 0.13055375218391418, + "learning_rate": 7.952617189197435e-05, + "loss": 1.0911, + "step": 4956 + }, + { + "epoch": 0.301172610729692, + "grad_norm": 0.16457238793373108, + "learning_rate": 7.951844592690634e-05, + "loss": 1.0622, + "step": 4957 + }, + { + "epoch": 0.3012333677623185, + "grad_norm": 0.19498306512832642, + "learning_rate": 7.951071887983397e-05, + "loss": 1.1219, + "step": 4958 + }, + { + "epoch": 0.301294124794945, + "grad_norm": 0.1648094356060028, + "learning_rate": 7.950299075104052e-05, + "loss": 1.075, + "step": 4959 + }, + { + "epoch": 0.30135488182757153, + "grad_norm": 0.16813687980175018, + "learning_rate": 7.949526154080922e-05, + "loss": 1.1705, + "step": 4960 + }, + { + "epoch": 0.30141563886019807, + "grad_norm": 0.15543539822101593, + "learning_rate": 7.948753124942343e-05, + "loss": 1.0736, + "step": 4961 + }, + { + "epoch": 0.3014763958928246, + "grad_norm": 0.2634231448173523, + "learning_rate": 7.947979987716647e-05, + "loss": 1.1821, + "step": 4962 + }, + { + "epoch": 0.30153715292545114, + "grad_norm": 0.1685730218887329, + "learning_rate": 7.947206742432174e-05, + "loss": 1.163, + "step": 4963 + }, + { + "epoch": 0.30159790995807767, + "grad_norm": 0.21360357105731964, + "learning_rate": 7.946433389117268e-05, + "loss": 1.0712, + "step": 4964 + }, + { + "epoch": 0.30165866699070415, + "grad_norm": 0.17472833395004272, + "learning_rate": 7.945659927800277e-05, + "loss": 1.0628, + "step": 4965 + }, + { + "epoch": 0.3017194240233307, + "grad_norm": 0.23866689205169678, + "learning_rate": 7.944886358509554e-05, + "loss": 1.0556, + "step": 4966 + }, + { + "epoch": 0.3017801810559572, + "grad_norm": 0.20201341807842255, + "learning_rate": 7.94411268127345e-05, + "loss": 1.036, + "step": 4967 + }, + { + "epoch": 0.30184093808858375, + "grad_norm": 0.21411119401454926, + "learning_rate": 7.943338896120328e-05, + "loss": 1.2082, + "step": 4968 + }, + { + "epoch": 0.3019016951212103, + "grad_norm": 0.19133217632770538, + "learning_rate": 7.942565003078548e-05, + "loss": 1.1021, + "step": 4969 + }, + { + "epoch": 0.3019624521538368, + "grad_norm": 0.5134078860282898, + "learning_rate": 7.94179100217648e-05, + "loss": 1.1174, + "step": 4970 + }, + { + "epoch": 0.30202320918646336, + "grad_norm": 0.26805537939071655, + "learning_rate": 7.941016893442495e-05, + "loss": 1.1782, + "step": 4971 + }, + { + "epoch": 0.30208396621908984, + "grad_norm": 0.23270122706890106, + "learning_rate": 7.940242676904965e-05, + "loss": 1.3901, + "step": 4972 + }, + { + "epoch": 0.3021447232517164, + "grad_norm": 0.3333161771297455, + "learning_rate": 7.939468352592274e-05, + "loss": 1.1211, + "step": 4973 + }, + { + "epoch": 0.3022054802843429, + "grad_norm": 0.17558227479457855, + "learning_rate": 7.938693920532801e-05, + "loss": 1.078, + "step": 4974 + }, + { + "epoch": 0.30226623731696944, + "grad_norm": 0.499388188123703, + "learning_rate": 7.937919380754934e-05, + "loss": 1.0234, + "step": 4975 + }, + { + "epoch": 0.302326994349596, + "grad_norm": 7.581668376922607, + "learning_rate": 7.937144733287065e-05, + "loss": 1.0464, + "step": 4976 + }, + { + "epoch": 0.3023877513822225, + "grad_norm": 0.23260857164859772, + "learning_rate": 7.936369978157589e-05, + "loss": 1.1839, + "step": 4977 + }, + { + "epoch": 0.302448508414849, + "grad_norm": 0.15449270606040955, + "learning_rate": 7.935595115394902e-05, + "loss": 1.0658, + "step": 4978 + }, + { + "epoch": 0.30250926544747553, + "grad_norm": 0.18905365467071533, + "learning_rate": 7.93482014502741e-05, + "loss": 1.05, + "step": 4979 + }, + { + "epoch": 0.30257002248010206, + "grad_norm": 0.13975360989570618, + "learning_rate": 7.934045067083517e-05, + "loss": 1.0636, + "step": 4980 + }, + { + "epoch": 0.3026307795127286, + "grad_norm": 0.13250093162059784, + "learning_rate": 7.933269881591635e-05, + "loss": 1.0972, + "step": 4981 + }, + { + "epoch": 0.30269153654535513, + "grad_norm": 0.15861931443214417, + "learning_rate": 7.932494588580179e-05, + "loss": 1.0982, + "step": 4982 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 0.24535703659057617, + "learning_rate": 7.931719188077567e-05, + "loss": 1.1169, + "step": 4983 + }, + { + "epoch": 0.3028130506106082, + "grad_norm": 0.19138747453689575, + "learning_rate": 7.930943680112222e-05, + "loss": 1.1768, + "step": 4984 + }, + { + "epoch": 0.3028738076432347, + "grad_norm": 0.39421167969703674, + "learning_rate": 7.930168064712569e-05, + "loss": 1.0769, + "step": 4985 + }, + { + "epoch": 0.3029345646758612, + "grad_norm": 0.23053671419620514, + "learning_rate": 7.92939234190704e-05, + "loss": 1.1584, + "step": 4986 + }, + { + "epoch": 0.30299532170848775, + "grad_norm": 0.6366480588912964, + "learning_rate": 7.928616511724068e-05, + "loss": 1.1671, + "step": 4987 + }, + { + "epoch": 0.3030560787411143, + "grad_norm": 0.3631538152694702, + "learning_rate": 7.927840574192092e-05, + "loss": 1.1417, + "step": 4988 + }, + { + "epoch": 0.3031168357737408, + "grad_norm": 0.27150318026542664, + "learning_rate": 7.927064529339555e-05, + "loss": 1.136, + "step": 4989 + }, + { + "epoch": 0.30317759280636736, + "grad_norm": 0.4584693908691406, + "learning_rate": 7.926288377194899e-05, + "loss": 1.2183, + "step": 4990 + }, + { + "epoch": 0.30323834983899384, + "grad_norm": 0.20481142401695251, + "learning_rate": 7.925512117786578e-05, + "loss": 1.1193, + "step": 4991 + }, + { + "epoch": 0.30329910687162037, + "grad_norm": 0.35838496685028076, + "learning_rate": 7.924735751143045e-05, + "loss": 1.2194, + "step": 4992 + }, + { + "epoch": 0.3033598639042469, + "grad_norm": 0.36893385648727417, + "learning_rate": 7.923959277292759e-05, + "loss": 1.186, + "step": 4993 + }, + { + "epoch": 0.30342062093687344, + "grad_norm": 0.15746700763702393, + "learning_rate": 7.923182696264177e-05, + "loss": 1.0833, + "step": 4994 + }, + { + "epoch": 0.3034813779695, + "grad_norm": 0.3159339725971222, + "learning_rate": 7.922406008085771e-05, + "loss": 1.2208, + "step": 4995 + }, + { + "epoch": 0.3035421350021265, + "grad_norm": 0.2524588704109192, + "learning_rate": 7.921629212786008e-05, + "loss": 1.1383, + "step": 4996 + }, + { + "epoch": 0.30360289203475305, + "grad_norm": 0.4001350402832031, + "learning_rate": 7.92085231039336e-05, + "loss": 1.1228, + "step": 4997 + }, + { + "epoch": 0.3036636490673795, + "grad_norm": 0.39483341574668884, + "learning_rate": 7.920075300936308e-05, + "loss": 1.1851, + "step": 4998 + }, + { + "epoch": 0.30372440610000606, + "grad_norm": 0.5460163950920105, + "learning_rate": 7.919298184443331e-05, + "loss": 1.0453, + "step": 4999 + }, + { + "epoch": 0.3037851631326326, + "grad_norm": 0.2757039964199066, + "learning_rate": 7.918520960942913e-05, + "loss": 1.2287, + "step": 5000 + }, + { + "epoch": 0.30384592016525913, + "grad_norm": 0.21803759038448334, + "learning_rate": 7.917743630463546e-05, + "loss": 1.129, + "step": 5001 + }, + { + "epoch": 0.30390667719788567, + "grad_norm": 0.21053986251354218, + "learning_rate": 7.916966193033723e-05, + "loss": 1.1218, + "step": 5002 + }, + { + "epoch": 0.3039674342305122, + "grad_norm": 0.23183192312717438, + "learning_rate": 7.916188648681939e-05, + "loss": 1.1931, + "step": 5003 + }, + { + "epoch": 0.3040281912631387, + "grad_norm": 0.14881351590156555, + "learning_rate": 7.915410997436697e-05, + "loss": 1.0757, + "step": 5004 + }, + { + "epoch": 0.3040889482957652, + "grad_norm": 0.17829740047454834, + "learning_rate": 7.914633239326502e-05, + "loss": 1.0884, + "step": 5005 + }, + { + "epoch": 0.30414970532839175, + "grad_norm": 0.938035249710083, + "learning_rate": 7.913855374379862e-05, + "loss": 1.2527, + "step": 5006 + }, + { + "epoch": 0.3042104623610183, + "grad_norm": 0.1620355099439621, + "learning_rate": 7.913077402625289e-05, + "loss": 1.0918, + "step": 5007 + }, + { + "epoch": 0.3042712193936448, + "grad_norm": 0.1912277787923813, + "learning_rate": 7.912299324091301e-05, + "loss": 1.1789, + "step": 5008 + }, + { + "epoch": 0.30433197642627136, + "grad_norm": 0.15307871997356415, + "learning_rate": 7.911521138806419e-05, + "loss": 1.0679, + "step": 5009 + }, + { + "epoch": 0.3043927334588979, + "grad_norm": 0.22374896705150604, + "learning_rate": 7.910742846799169e-05, + "loss": 1.0754, + "step": 5010 + }, + { + "epoch": 0.30445349049152437, + "grad_norm": 0.16303293406963348, + "learning_rate": 7.909964448098073e-05, + "loss": 1.0597, + "step": 5011 + }, + { + "epoch": 0.3045142475241509, + "grad_norm": 0.39754754304885864, + "learning_rate": 7.90918594273167e-05, + "loss": 1.3041, + "step": 5012 + }, + { + "epoch": 0.30457500455677744, + "grad_norm": 0.15414276719093323, + "learning_rate": 7.908407330728493e-05, + "loss": 1.0549, + "step": 5013 + }, + { + "epoch": 0.304635761589404, + "grad_norm": 1.6768600940704346, + "learning_rate": 7.907628612117084e-05, + "loss": 1.218, + "step": 5014 + }, + { + "epoch": 0.3046965186220305, + "grad_norm": 0.20556695759296417, + "learning_rate": 7.906849786925987e-05, + "loss": 1.1323, + "step": 5015 + }, + { + "epoch": 0.30475727565465704, + "grad_norm": 0.8451787829399109, + "learning_rate": 7.906070855183747e-05, + "loss": 1.1674, + "step": 5016 + }, + { + "epoch": 0.3048180326872836, + "grad_norm": 0.5405980944633484, + "learning_rate": 7.90529181691892e-05, + "loss": 1.2248, + "step": 5017 + }, + { + "epoch": 0.30487878971991006, + "grad_norm": 0.1711481809616089, + "learning_rate": 7.904512672160058e-05, + "loss": 1.1353, + "step": 5018 + }, + { + "epoch": 0.3049395467525366, + "grad_norm": 0.19523300230503082, + "learning_rate": 7.903733420935723e-05, + "loss": 1.0708, + "step": 5019 + }, + { + "epoch": 0.30500030378516313, + "grad_norm": 1.2843191623687744, + "learning_rate": 7.90295406327448e-05, + "loss": 1.0794, + "step": 5020 + }, + { + "epoch": 0.30506106081778966, + "grad_norm": 0.22761689126491547, + "learning_rate": 7.902174599204892e-05, + "loss": 1.1177, + "step": 5021 + }, + { + "epoch": 0.3051218178504162, + "grad_norm": 1.1813548803329468, + "learning_rate": 7.901395028755536e-05, + "loss": 1.3213, + "step": 5022 + }, + { + "epoch": 0.30518257488304273, + "grad_norm": 0.4325633943080902, + "learning_rate": 7.900615351954982e-05, + "loss": 1.1314, + "step": 5023 + }, + { + "epoch": 0.3052433319156692, + "grad_norm": 0.6260473728179932, + "learning_rate": 7.899835568831813e-05, + "loss": 1.0374, + "step": 5024 + }, + { + "epoch": 0.30530408894829575, + "grad_norm": 0.18695132434368134, + "learning_rate": 7.89905567941461e-05, + "loss": 1.113, + "step": 5025 + }, + { + "epoch": 0.3053648459809223, + "grad_norm": 0.3728717863559723, + "learning_rate": 7.898275683731962e-05, + "loss": 1.1198, + "step": 5026 + }, + { + "epoch": 0.3054256030135488, + "grad_norm": 0.44424402713775635, + "learning_rate": 7.897495581812457e-05, + "loss": 1.0337, + "step": 5027 + }, + { + "epoch": 0.30548636004617535, + "grad_norm": 0.1744932383298874, + "learning_rate": 7.89671537368469e-05, + "loss": 1.1343, + "step": 5028 + }, + { + "epoch": 0.3055471170788019, + "grad_norm": 0.7512226104736328, + "learning_rate": 7.895935059377264e-05, + "loss": 1.2605, + "step": 5029 + }, + { + "epoch": 0.3056078741114284, + "grad_norm": 0.309328556060791, + "learning_rate": 7.895154638918778e-05, + "loss": 1.1397, + "step": 5030 + }, + { + "epoch": 0.3056686311440549, + "grad_norm": 0.20712530612945557, + "learning_rate": 7.894374112337839e-05, + "loss": 1.1313, + "step": 5031 + }, + { + "epoch": 0.30572938817668144, + "grad_norm": 0.36675825715065, + "learning_rate": 7.893593479663057e-05, + "loss": 1.0413, + "step": 5032 + }, + { + "epoch": 0.305790145209308, + "grad_norm": 0.5124492645263672, + "learning_rate": 7.892812740923045e-05, + "loss": 1.0542, + "step": 5033 + }, + { + "epoch": 0.3058509022419345, + "grad_norm": 0.2213868945837021, + "learning_rate": 7.892031896146425e-05, + "loss": 1.0665, + "step": 5034 + }, + { + "epoch": 0.30591165927456104, + "grad_norm": 0.36965662240982056, + "learning_rate": 7.891250945361817e-05, + "loss": 1.0811, + "step": 5035 + }, + { + "epoch": 0.3059724163071876, + "grad_norm": 0.29335683584213257, + "learning_rate": 7.890469888597847e-05, + "loss": 1.037, + "step": 5036 + }, + { + "epoch": 0.30603317333981406, + "grad_norm": 0.21715201437473297, + "learning_rate": 7.889688725883143e-05, + "loss": 1.0774, + "step": 5037 + }, + { + "epoch": 0.3060939303724406, + "grad_norm": 0.16601771116256714, + "learning_rate": 7.88890745724634e-05, + "loss": 1.0936, + "step": 5038 + }, + { + "epoch": 0.3061546874050671, + "grad_norm": 0.1503555029630661, + "learning_rate": 7.888126082716077e-05, + "loss": 1.0898, + "step": 5039 + }, + { + "epoch": 0.30621544443769366, + "grad_norm": 0.13763649761676788, + "learning_rate": 7.887344602320994e-05, + "loss": 1.089, + "step": 5040 + }, + { + "epoch": 0.3062762014703202, + "grad_norm": 0.2543410658836365, + "learning_rate": 7.886563016089737e-05, + "loss": 1.1757, + "step": 5041 + }, + { + "epoch": 0.30633695850294673, + "grad_norm": 0.37459924817085266, + "learning_rate": 7.885781324050953e-05, + "loss": 1.1547, + "step": 5042 + }, + { + "epoch": 0.30639771553557327, + "grad_norm": 0.16601353883743286, + "learning_rate": 7.884999526233297e-05, + "loss": 1.0775, + "step": 5043 + }, + { + "epoch": 0.30645847256819975, + "grad_norm": 0.18259654939174652, + "learning_rate": 7.884217622665426e-05, + "loss": 1.0922, + "step": 5044 + }, + { + "epoch": 0.3065192296008263, + "grad_norm": 0.26253360509872437, + "learning_rate": 7.883435613376003e-05, + "loss": 1.1165, + "step": 5045 + }, + { + "epoch": 0.3065799866334528, + "grad_norm": 0.7012448906898499, + "learning_rate": 7.882653498393688e-05, + "loss": 1.1857, + "step": 5046 + }, + { + "epoch": 0.30664074366607935, + "grad_norm": 0.306253045797348, + "learning_rate": 7.88187127774715e-05, + "loss": 1.1296, + "step": 5047 + }, + { + "epoch": 0.3067015006987059, + "grad_norm": 0.19283844530582428, + "learning_rate": 7.881088951465066e-05, + "loss": 1.0603, + "step": 5048 + }, + { + "epoch": 0.3067622577313324, + "grad_norm": 0.1580544412136078, + "learning_rate": 7.880306519576109e-05, + "loss": 1.1252, + "step": 5049 + }, + { + "epoch": 0.3068230147639589, + "grad_norm": 0.16325531899929047, + "learning_rate": 7.87952398210896e-05, + "loss": 1.0204, + "step": 5050 + }, + { + "epoch": 0.30688377179658544, + "grad_norm": 0.17274509370326996, + "learning_rate": 7.878741339092302e-05, + "loss": 1.0666, + "step": 5051 + }, + { + "epoch": 0.30694452882921197, + "grad_norm": 0.2777055501937866, + "learning_rate": 7.877958590554824e-05, + "loss": 1.2514, + "step": 5052 + }, + { + "epoch": 0.3070052858618385, + "grad_norm": 0.15812616050243378, + "learning_rate": 7.877175736525217e-05, + "loss": 1.0664, + "step": 5053 + }, + { + "epoch": 0.30706604289446504, + "grad_norm": 0.6769807934761047, + "learning_rate": 7.87639277703218e-05, + "loss": 1.3595, + "step": 5054 + }, + { + "epoch": 0.3071267999270916, + "grad_norm": 0.19761823117733002, + "learning_rate": 7.875609712104407e-05, + "loss": 1.0974, + "step": 5055 + }, + { + "epoch": 0.3071875569597181, + "grad_norm": 0.1579507738351822, + "learning_rate": 7.874826541770604e-05, + "loss": 1.0592, + "step": 5056 + }, + { + "epoch": 0.3072483139923446, + "grad_norm": 0.15911029279232025, + "learning_rate": 7.874043266059479e-05, + "loss": 1.1017, + "step": 5057 + }, + { + "epoch": 0.3073090710249711, + "grad_norm": 0.12055608630180359, + "learning_rate": 7.873259884999744e-05, + "loss": 1.0086, + "step": 5058 + }, + { + "epoch": 0.30736982805759766, + "grad_norm": 0.1480165421962738, + "learning_rate": 7.87247639862011e-05, + "loss": 1.021, + "step": 5059 + }, + { + "epoch": 0.3074305850902242, + "grad_norm": 0.12465279549360275, + "learning_rate": 7.871692806949301e-05, + "loss": 1.057, + "step": 5060 + }, + { + "epoch": 0.30749134212285073, + "grad_norm": 0.1836155503988266, + "learning_rate": 7.870909110016036e-05, + "loss": 1.0774, + "step": 5061 + }, + { + "epoch": 0.30755209915547727, + "grad_norm": 0.13721024990081787, + "learning_rate": 7.870125307849042e-05, + "loss": 1.0415, + "step": 5062 + }, + { + "epoch": 0.3076128561881038, + "grad_norm": 0.1414865106344223, + "learning_rate": 7.86934140047705e-05, + "loss": 1.0651, + "step": 5063 + }, + { + "epoch": 0.3076736132207303, + "grad_norm": 0.2621670961380005, + "learning_rate": 7.868557387928796e-05, + "loss": 1.1895, + "step": 5064 + }, + { + "epoch": 0.3077343702533568, + "grad_norm": 0.6092076301574707, + "learning_rate": 7.867773270233014e-05, + "loss": 1.0737, + "step": 5065 + }, + { + "epoch": 0.30779512728598335, + "grad_norm": 0.3646189570426941, + "learning_rate": 7.866989047418452e-05, + "loss": 1.2532, + "step": 5066 + }, + { + "epoch": 0.3078558843186099, + "grad_norm": 0.18351449072360992, + "learning_rate": 7.86620471951385e-05, + "loss": 1.0589, + "step": 5067 + }, + { + "epoch": 0.3079166413512364, + "grad_norm": 0.23390978574752808, + "learning_rate": 7.865420286547959e-05, + "loss": 1.0713, + "step": 5068 + }, + { + "epoch": 0.30797739838386295, + "grad_norm": 0.15962259471416473, + "learning_rate": 7.864635748549534e-05, + "loss": 1.0818, + "step": 5069 + }, + { + "epoch": 0.30803815541648943, + "grad_norm": 0.17449022829532623, + "learning_rate": 7.863851105547333e-05, + "loss": 1.0783, + "step": 5070 + }, + { + "epoch": 0.30809891244911597, + "grad_norm": 0.13715773820877075, + "learning_rate": 7.863066357570115e-05, + "loss": 1.0409, + "step": 5071 + }, + { + "epoch": 0.3081596694817425, + "grad_norm": 0.20138972997665405, + "learning_rate": 7.862281504646647e-05, + "loss": 1.0236, + "step": 5072 + }, + { + "epoch": 0.30822042651436904, + "grad_norm": 0.8320518732070923, + "learning_rate": 7.861496546805697e-05, + "loss": 1.1709, + "step": 5073 + }, + { + "epoch": 0.3082811835469956, + "grad_norm": 0.2535605728626251, + "learning_rate": 7.860711484076038e-05, + "loss": 1.1913, + "step": 5074 + }, + { + "epoch": 0.3083419405796221, + "grad_norm": 0.2105361521244049, + "learning_rate": 7.859926316486448e-05, + "loss": 1.1223, + "step": 5075 + }, + { + "epoch": 0.30840269761224864, + "grad_norm": 0.26760080456733704, + "learning_rate": 7.859141044065704e-05, + "loss": 1.1247, + "step": 5076 + }, + { + "epoch": 0.3084634546448751, + "grad_norm": 0.14098025858402252, + "learning_rate": 7.858355666842594e-05, + "loss": 1.0657, + "step": 5077 + }, + { + "epoch": 0.30852421167750166, + "grad_norm": 0.25043994188308716, + "learning_rate": 7.857570184845903e-05, + "loss": 1.1209, + "step": 5078 + }, + { + "epoch": 0.3085849687101282, + "grad_norm": 0.23554249107837677, + "learning_rate": 7.856784598104426e-05, + "loss": 1.2225, + "step": 5079 + }, + { + "epoch": 0.30864572574275473, + "grad_norm": 0.19218409061431885, + "learning_rate": 7.855998906646956e-05, + "loss": 1.1355, + "step": 5080 + }, + { + "epoch": 0.30870648277538126, + "grad_norm": 0.20556126534938812, + "learning_rate": 7.855213110502293e-05, + "loss": 1.1302, + "step": 5081 + }, + { + "epoch": 0.3087672398080078, + "grad_norm": 0.16717664897441864, + "learning_rate": 7.854427209699244e-05, + "loss": 1.1274, + "step": 5082 + }, + { + "epoch": 0.3088279968406343, + "grad_norm": 0.1822149008512497, + "learning_rate": 7.853641204266614e-05, + "loss": 1.1169, + "step": 5083 + }, + { + "epoch": 0.3088887538732608, + "grad_norm": 0.1569221466779709, + "learning_rate": 7.852855094233211e-05, + "loss": 1.1279, + "step": 5084 + }, + { + "epoch": 0.30894951090588735, + "grad_norm": 0.25345590710639954, + "learning_rate": 7.852068879627855e-05, + "loss": 1.1864, + "step": 5085 + }, + { + "epoch": 0.3090102679385139, + "grad_norm": 0.16206392645835876, + "learning_rate": 7.851282560479363e-05, + "loss": 1.0941, + "step": 5086 + }, + { + "epoch": 0.3090710249711404, + "grad_norm": 0.17163822054862976, + "learning_rate": 7.850496136816558e-05, + "loss": 1.0964, + "step": 5087 + }, + { + "epoch": 0.30913178200376695, + "grad_norm": 0.2909422814846039, + "learning_rate": 7.849709608668265e-05, + "loss": 1.2, + "step": 5088 + }, + { + "epoch": 0.3091925390363935, + "grad_norm": 0.47417882084846497, + "learning_rate": 7.848922976063317e-05, + "loss": 1.0497, + "step": 5089 + }, + { + "epoch": 0.30925329606901997, + "grad_norm": 0.18791179358959198, + "learning_rate": 7.848136239030546e-05, + "loss": 1.1051, + "step": 5090 + }, + { + "epoch": 0.3093140531016465, + "grad_norm": 0.20847441256046295, + "learning_rate": 7.84734939759879e-05, + "loss": 1.1509, + "step": 5091 + }, + { + "epoch": 0.30937481013427304, + "grad_norm": 0.25191542506217957, + "learning_rate": 7.846562451796892e-05, + "loss": 1.0694, + "step": 5092 + }, + { + "epoch": 0.30943556716689957, + "grad_norm": 0.32054251432418823, + "learning_rate": 7.845775401653696e-05, + "loss": 1.0594, + "step": 5093 + }, + { + "epoch": 0.3094963241995261, + "grad_norm": 0.20492412149906158, + "learning_rate": 7.844988247198055e-05, + "loss": 1.2071, + "step": 5094 + }, + { + "epoch": 0.30955708123215264, + "grad_norm": 0.20347364246845245, + "learning_rate": 7.844200988458818e-05, + "loss": 1.1415, + "step": 5095 + }, + { + "epoch": 0.3096178382647791, + "grad_norm": 0.2791954576969147, + "learning_rate": 7.843413625464843e-05, + "loss": 1.0529, + "step": 5096 + }, + { + "epoch": 0.30967859529740566, + "grad_norm": 0.15032117068767548, + "learning_rate": 7.842626158244994e-05, + "loss": 1.076, + "step": 5097 + }, + { + "epoch": 0.3097393523300322, + "grad_norm": 0.5204444527626038, + "learning_rate": 7.841838586828134e-05, + "loss": 1.0288, + "step": 5098 + }, + { + "epoch": 0.3098001093626587, + "grad_norm": 0.26252487301826477, + "learning_rate": 7.841050911243129e-05, + "loss": 1.1046, + "step": 5099 + }, + { + "epoch": 0.30986086639528526, + "grad_norm": 0.21715553104877472, + "learning_rate": 7.840263131518856e-05, + "loss": 1.1699, + "step": 5100 + }, + { + "epoch": 0.3099216234279118, + "grad_norm": 0.400338351726532, + "learning_rate": 7.839475247684187e-05, + "loss": 1.0411, + "step": 5101 + }, + { + "epoch": 0.30998238046053833, + "grad_norm": 0.16862495243549347, + "learning_rate": 7.838687259768004e-05, + "loss": 1.1446, + "step": 5102 + }, + { + "epoch": 0.3100431374931648, + "grad_norm": 0.188529372215271, + "learning_rate": 7.837899167799194e-05, + "loss": 1.0879, + "step": 5103 + }, + { + "epoch": 0.31010389452579135, + "grad_norm": 0.20136164128780365, + "learning_rate": 7.837110971806638e-05, + "loss": 1.1205, + "step": 5104 + }, + { + "epoch": 0.3101646515584179, + "grad_norm": 0.27363207936286926, + "learning_rate": 7.836322671819232e-05, + "loss": 1.111, + "step": 5105 + }, + { + "epoch": 0.3102254085910444, + "grad_norm": 0.16804730892181396, + "learning_rate": 7.83553426786587e-05, + "loss": 1.1572, + "step": 5106 + }, + { + "epoch": 0.31028616562367095, + "grad_norm": 0.26315054297447205, + "learning_rate": 7.834745759975453e-05, + "loss": 1.0769, + "step": 5107 + }, + { + "epoch": 0.3103469226562975, + "grad_norm": 0.23740170896053314, + "learning_rate": 7.83395714817688e-05, + "loss": 1.1112, + "step": 5108 + }, + { + "epoch": 0.31040767968892397, + "grad_norm": 0.17278525233268738, + "learning_rate": 7.83316843249906e-05, + "loss": 1.1308, + "step": 5109 + }, + { + "epoch": 0.3104684367215505, + "grad_norm": 0.3111562728881836, + "learning_rate": 7.832379612970905e-05, + "loss": 1.1484, + "step": 5110 + }, + { + "epoch": 0.31052919375417704, + "grad_norm": 0.1663617044687271, + "learning_rate": 7.831590689621326e-05, + "loss": 1.0541, + "step": 5111 + }, + { + "epoch": 0.31058995078680357, + "grad_norm": 0.22036807239055634, + "learning_rate": 7.830801662479244e-05, + "loss": 1.2443, + "step": 5112 + }, + { + "epoch": 0.3106507078194301, + "grad_norm": 0.1357625424861908, + "learning_rate": 7.830012531573581e-05, + "loss": 1.0797, + "step": 5113 + }, + { + "epoch": 0.31071146485205664, + "grad_norm": 0.19941739737987518, + "learning_rate": 7.829223296933259e-05, + "loss": 1.1688, + "step": 5114 + }, + { + "epoch": 0.3107722218846832, + "grad_norm": 0.17946669459342957, + "learning_rate": 7.828433958587213e-05, + "loss": 1.1471, + "step": 5115 + }, + { + "epoch": 0.31083297891730965, + "grad_norm": 0.527148962020874, + "learning_rate": 7.827644516564371e-05, + "loss": 1.1591, + "step": 5116 + }, + { + "epoch": 0.3108937359499362, + "grad_norm": 0.17529942095279694, + "learning_rate": 7.826854970893674e-05, + "loss": 1.2252, + "step": 5117 + }, + { + "epoch": 0.3109544929825627, + "grad_norm": 0.16480781137943268, + "learning_rate": 7.826065321604061e-05, + "loss": 1.0701, + "step": 5118 + }, + { + "epoch": 0.31101525001518926, + "grad_norm": 0.15994659066200256, + "learning_rate": 7.825275568724478e-05, + "loss": 1.1142, + "step": 5119 + }, + { + "epoch": 0.3110760070478158, + "grad_norm": 0.15284310281276703, + "learning_rate": 7.824485712283873e-05, + "loss": 1.1072, + "step": 5120 + }, + { + "epoch": 0.31113676408044233, + "grad_norm": 0.17039936780929565, + "learning_rate": 7.823695752311198e-05, + "loss": 1.0696, + "step": 5121 + }, + { + "epoch": 0.31119752111306886, + "grad_norm": 0.1231837347149849, + "learning_rate": 7.82290568883541e-05, + "loss": 1.0322, + "step": 5122 + }, + { + "epoch": 0.31125827814569534, + "grad_norm": 0.1808963119983673, + "learning_rate": 7.822115521885466e-05, + "loss": 1.1309, + "step": 5123 + }, + { + "epoch": 0.3113190351783219, + "grad_norm": 0.1607855260372162, + "learning_rate": 7.821325251490335e-05, + "loss": 1.0626, + "step": 5124 + }, + { + "epoch": 0.3113797922109484, + "grad_norm": 0.17335842549800873, + "learning_rate": 7.82053487767898e-05, + "loss": 1.0651, + "step": 5125 + }, + { + "epoch": 0.31144054924357495, + "grad_norm": 0.2586134374141693, + "learning_rate": 7.819744400480373e-05, + "loss": 1.0913, + "step": 5126 + }, + { + "epoch": 0.3115013062762015, + "grad_norm": 0.2554481029510498, + "learning_rate": 7.818953819923491e-05, + "loss": 1.2143, + "step": 5127 + }, + { + "epoch": 0.311562063308828, + "grad_norm": 0.1633116602897644, + "learning_rate": 7.818163136037314e-05, + "loss": 1.1459, + "step": 5128 + }, + { + "epoch": 0.3116228203414545, + "grad_norm": 0.18200641870498657, + "learning_rate": 7.817372348850818e-05, + "loss": 1.0786, + "step": 5129 + }, + { + "epoch": 0.31168357737408103, + "grad_norm": 0.2773749530315399, + "learning_rate": 7.816581458392998e-05, + "loss": 1.1695, + "step": 5130 + }, + { + "epoch": 0.31174433440670757, + "grad_norm": 0.3059140741825104, + "learning_rate": 7.815790464692838e-05, + "loss": 1.1431, + "step": 5131 + }, + { + "epoch": 0.3118050914393341, + "grad_norm": 0.1893763691186905, + "learning_rate": 7.814999367779336e-05, + "loss": 1.0852, + "step": 5132 + }, + { + "epoch": 0.31186584847196064, + "grad_norm": 0.34205004572868347, + "learning_rate": 7.814208167681486e-05, + "loss": 1.0825, + "step": 5133 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 0.20849581062793732, + "learning_rate": 7.813416864428294e-05, + "loss": 1.0395, + "step": 5134 + }, + { + "epoch": 0.3119873625372137, + "grad_norm": 0.2009948343038559, + "learning_rate": 7.81262545804876e-05, + "loss": 1.1489, + "step": 5135 + }, + { + "epoch": 0.3120481195698402, + "grad_norm": 0.2520940899848938, + "learning_rate": 7.811833948571898e-05, + "loss": 1.2503, + "step": 5136 + }, + { + "epoch": 0.3121088766024667, + "grad_norm": 0.8176343441009521, + "learning_rate": 7.81104233602672e-05, + "loss": 1.1063, + "step": 5137 + }, + { + "epoch": 0.31216963363509326, + "grad_norm": 0.20283618569374084, + "learning_rate": 7.810250620442239e-05, + "loss": 1.1712, + "step": 5138 + }, + { + "epoch": 0.3122303906677198, + "grad_norm": 0.31284841895103455, + "learning_rate": 7.80945880184748e-05, + "loss": 1.085, + "step": 5139 + }, + { + "epoch": 0.3122911477003463, + "grad_norm": 0.31324079632759094, + "learning_rate": 7.808666880271466e-05, + "loss": 1.193, + "step": 5140 + }, + { + "epoch": 0.31235190473297286, + "grad_norm": 0.1671028733253479, + "learning_rate": 7.807874855743223e-05, + "loss": 1.0407, + "step": 5141 + }, + { + "epoch": 0.31241266176559934, + "grad_norm": 0.25555145740509033, + "learning_rate": 7.807082728291786e-05, + "loss": 1.0618, + "step": 5142 + }, + { + "epoch": 0.3124734187982259, + "grad_norm": 0.5085744261741638, + "learning_rate": 7.806290497946189e-05, + "loss": 1.3183, + "step": 5143 + }, + { + "epoch": 0.3125341758308524, + "grad_norm": 0.13113556802272797, + "learning_rate": 7.805498164735469e-05, + "loss": 1.1089, + "step": 5144 + }, + { + "epoch": 0.31259493286347895, + "grad_norm": 0.2609107196331024, + "learning_rate": 7.804705728688675e-05, + "loss": 1.1589, + "step": 5145 + }, + { + "epoch": 0.3126556898961055, + "grad_norm": 0.33784806728363037, + "learning_rate": 7.803913189834849e-05, + "loss": 1.3375, + "step": 5146 + }, + { + "epoch": 0.312716446928732, + "grad_norm": 0.16786982119083405, + "learning_rate": 7.803120548203042e-05, + "loss": 1.0685, + "step": 5147 + }, + { + "epoch": 0.31277720396135855, + "grad_norm": 0.19983232021331787, + "learning_rate": 7.80232780382231e-05, + "loss": 1.1291, + "step": 5148 + }, + { + "epoch": 0.31283796099398503, + "grad_norm": 0.1912095844745636, + "learning_rate": 7.80153495672171e-05, + "loss": 1.0716, + "step": 5149 + }, + { + "epoch": 0.31289871802661157, + "grad_norm": 0.1497095674276352, + "learning_rate": 7.800742006930306e-05, + "loss": 1.0912, + "step": 5150 + }, + { + "epoch": 0.3129594750592381, + "grad_norm": 0.15939418971538544, + "learning_rate": 7.799948954477163e-05, + "loss": 1.07, + "step": 5151 + }, + { + "epoch": 0.31302023209186464, + "grad_norm": 0.19541843235492706, + "learning_rate": 7.799155799391349e-05, + "loss": 1.014, + "step": 5152 + }, + { + "epoch": 0.31308098912449117, + "grad_norm": 0.20815326273441315, + "learning_rate": 7.798362541701938e-05, + "loss": 1.1034, + "step": 5153 + }, + { + "epoch": 0.3131417461571177, + "grad_norm": 0.2816903591156006, + "learning_rate": 7.797569181438007e-05, + "loss": 1.0955, + "step": 5154 + }, + { + "epoch": 0.3132025031897442, + "grad_norm": 0.16486649215221405, + "learning_rate": 7.796775718628636e-05, + "loss": 1.1044, + "step": 5155 + }, + { + "epoch": 0.3132632602223707, + "grad_norm": 0.22881430387496948, + "learning_rate": 7.795982153302911e-05, + "loss": 1.1176, + "step": 5156 + }, + { + "epoch": 0.31332401725499726, + "grad_norm": 0.18074339628219604, + "learning_rate": 7.79518848548992e-05, + "loss": 1.09, + "step": 5157 + }, + { + "epoch": 0.3133847742876238, + "grad_norm": 0.18104323744773865, + "learning_rate": 7.794394715218755e-05, + "loss": 1.139, + "step": 5158 + }, + { + "epoch": 0.3134455313202503, + "grad_norm": 0.17354963719844818, + "learning_rate": 7.793600842518512e-05, + "loss": 1.142, + "step": 5159 + }, + { + "epoch": 0.31350628835287686, + "grad_norm": 3.4281907081604004, + "learning_rate": 7.79280686741829e-05, + "loss": 1.1442, + "step": 5160 + }, + { + "epoch": 0.3135670453855034, + "grad_norm": 0.2361103743314743, + "learning_rate": 7.792012789947192e-05, + "loss": 1.119, + "step": 5161 + }, + { + "epoch": 0.3136278024181299, + "grad_norm": 0.20149557292461395, + "learning_rate": 7.791218610134323e-05, + "loss": 1.1081, + "step": 5162 + }, + { + "epoch": 0.3136885594507564, + "grad_norm": 0.13651804625988007, + "learning_rate": 7.7904243280088e-05, + "loss": 1.0802, + "step": 5163 + }, + { + "epoch": 0.31374931648338295, + "grad_norm": 0.18828308582305908, + "learning_rate": 7.789629943599732e-05, + "loss": 1.1238, + "step": 5164 + }, + { + "epoch": 0.3138100735160095, + "grad_norm": 0.25163477659225464, + "learning_rate": 7.78883545693624e-05, + "loss": 1.2072, + "step": 5165 + }, + { + "epoch": 0.313870830548636, + "grad_norm": 0.17760860919952393, + "learning_rate": 7.788040868047445e-05, + "loss": 1.0718, + "step": 5166 + }, + { + "epoch": 0.31393158758126255, + "grad_norm": 0.16761569678783417, + "learning_rate": 7.787246176962474e-05, + "loss": 1.1063, + "step": 5167 + }, + { + "epoch": 0.3139923446138891, + "grad_norm": 0.27144646644592285, + "learning_rate": 7.786451383710455e-05, + "loss": 1.1528, + "step": 5168 + }, + { + "epoch": 0.31405310164651556, + "grad_norm": 0.15995943546295166, + "learning_rate": 7.78565648832052e-05, + "loss": 1.1231, + "step": 5169 + }, + { + "epoch": 0.3141138586791421, + "grad_norm": 0.15376867353916168, + "learning_rate": 7.784861490821811e-05, + "loss": 1.0839, + "step": 5170 + }, + { + "epoch": 0.31417461571176863, + "grad_norm": 0.18771442770957947, + "learning_rate": 7.784066391243465e-05, + "loss": 1.1233, + "step": 5171 + }, + { + "epoch": 0.31423537274439517, + "grad_norm": 1.352339267730713, + "learning_rate": 7.783271189614626e-05, + "loss": 1.0791, + "step": 5172 + }, + { + "epoch": 0.3142961297770217, + "grad_norm": 0.12809929251670837, + "learning_rate": 7.782475885964444e-05, + "loss": 1.0845, + "step": 5173 + }, + { + "epoch": 0.31435688680964824, + "grad_norm": 0.1668839007616043, + "learning_rate": 7.78168048032207e-05, + "loss": 1.1146, + "step": 5174 + }, + { + "epoch": 0.3144176438422747, + "grad_norm": 0.42424067854881287, + "learning_rate": 7.780884972716662e-05, + "loss": 1.0524, + "step": 5175 + }, + { + "epoch": 0.31447840087490125, + "grad_norm": 0.14323019981384277, + "learning_rate": 7.780089363177376e-05, + "loss": 1.0056, + "step": 5176 + }, + { + "epoch": 0.3145391579075278, + "grad_norm": 0.19844859838485718, + "learning_rate": 7.779293651733379e-05, + "loss": 1.1222, + "step": 5177 + }, + { + "epoch": 0.3145999149401543, + "grad_norm": 0.26833683252334595, + "learning_rate": 7.778497838413834e-05, + "loss": 1.1296, + "step": 5178 + }, + { + "epoch": 0.31466067197278086, + "grad_norm": 0.1764657348394394, + "learning_rate": 7.777701923247916e-05, + "loss": 1.0837, + "step": 5179 + }, + { + "epoch": 0.3147214290054074, + "grad_norm": 0.24348610639572144, + "learning_rate": 7.776905906264795e-05, + "loss": 1.1094, + "step": 5180 + }, + { + "epoch": 0.31478218603803393, + "grad_norm": 0.404840886592865, + "learning_rate": 7.776109787493651e-05, + "loss": 1.1243, + "step": 5181 + }, + { + "epoch": 0.3148429430706604, + "grad_norm": 0.19809971749782562, + "learning_rate": 7.775313566963667e-05, + "loss": 1.104, + "step": 5182 + }, + { + "epoch": 0.31490370010328694, + "grad_norm": 0.1251288503408432, + "learning_rate": 7.77451724470403e-05, + "loss": 1.0254, + "step": 5183 + }, + { + "epoch": 0.3149644571359135, + "grad_norm": 0.18405430018901825, + "learning_rate": 7.773720820743922e-05, + "loss": 1.109, + "step": 5184 + }, + { + "epoch": 0.31502521416854, + "grad_norm": 0.6925203204154968, + "learning_rate": 7.772924295112545e-05, + "loss": 1.0947, + "step": 5185 + }, + { + "epoch": 0.31508597120116655, + "grad_norm": 0.26931998133659363, + "learning_rate": 7.772127667839092e-05, + "loss": 1.074, + "step": 5186 + }, + { + "epoch": 0.3151467282337931, + "grad_norm": 0.14279203116893768, + "learning_rate": 7.771330938952761e-05, + "loss": 1.0851, + "step": 5187 + }, + { + "epoch": 0.31520748526641956, + "grad_norm": 0.26213353872299194, + "learning_rate": 7.770534108482761e-05, + "loss": 1.0812, + "step": 5188 + }, + { + "epoch": 0.3152682422990461, + "grad_norm": 0.4701729714870453, + "learning_rate": 7.769737176458296e-05, + "loss": 1.1508, + "step": 5189 + }, + { + "epoch": 0.31532899933167263, + "grad_norm": 0.4827955961227417, + "learning_rate": 7.768940142908581e-05, + "loss": 1.0391, + "step": 5190 + }, + { + "epoch": 0.31538975636429917, + "grad_norm": 0.1750054508447647, + "learning_rate": 7.768143007862829e-05, + "loss": 1.0941, + "step": 5191 + }, + { + "epoch": 0.3154505133969257, + "grad_norm": 0.19122281670570374, + "learning_rate": 7.767345771350261e-05, + "loss": 1.1639, + "step": 5192 + }, + { + "epoch": 0.31551127042955224, + "grad_norm": 6.284471035003662, + "learning_rate": 7.766548433400096e-05, + "loss": 1.1548, + "step": 5193 + }, + { + "epoch": 0.3155720274621788, + "grad_norm": 0.14756184816360474, + "learning_rate": 7.765750994041566e-05, + "loss": 1.0822, + "step": 5194 + }, + { + "epoch": 0.31563278449480525, + "grad_norm": 0.16128914058208466, + "learning_rate": 7.764953453303898e-05, + "loss": 1.1433, + "step": 5195 + }, + { + "epoch": 0.3156935415274318, + "grad_norm": 0.13844360411167145, + "learning_rate": 7.764155811216327e-05, + "loss": 1.0279, + "step": 5196 + }, + { + "epoch": 0.3157542985600583, + "grad_norm": 0.43880733847618103, + "learning_rate": 7.76335806780809e-05, + "loss": 1.2821, + "step": 5197 + }, + { + "epoch": 0.31581505559268486, + "grad_norm": 0.2243174910545349, + "learning_rate": 7.76256022310843e-05, + "loss": 1.1521, + "step": 5198 + }, + { + "epoch": 0.3158758126253114, + "grad_norm": 0.27900493144989014, + "learning_rate": 7.761762277146589e-05, + "loss": 1.1816, + "step": 5199 + }, + { + "epoch": 0.3159365696579379, + "grad_norm": 0.24200977385044098, + "learning_rate": 7.76096422995182e-05, + "loss": 1.1435, + "step": 5200 + }, + { + "epoch": 0.3159973266905644, + "grad_norm": 8.08497142791748, + "learning_rate": 7.760166081553373e-05, + "loss": 1.1547, + "step": 5201 + }, + { + "epoch": 0.31605808372319094, + "grad_norm": 0.17424650490283966, + "learning_rate": 7.759367831980504e-05, + "loss": 1.0744, + "step": 5202 + }, + { + "epoch": 0.3161188407558175, + "grad_norm": 0.2214883267879486, + "learning_rate": 7.758569481262475e-05, + "loss": 1.1992, + "step": 5203 + }, + { + "epoch": 0.316179597788444, + "grad_norm": 0.4513641893863678, + "learning_rate": 7.757771029428547e-05, + "loss": 1.1952, + "step": 5204 + }, + { + "epoch": 0.31624035482107055, + "grad_norm": 0.24038437008857727, + "learning_rate": 7.756972476507989e-05, + "loss": 1.1015, + "step": 5205 + }, + { + "epoch": 0.3163011118536971, + "grad_norm": 0.1677442342042923, + "learning_rate": 7.756173822530071e-05, + "loss": 1.1296, + "step": 5206 + }, + { + "epoch": 0.3163618688863236, + "grad_norm": 0.14486092329025269, + "learning_rate": 7.755375067524071e-05, + "loss": 1.0801, + "step": 5207 + }, + { + "epoch": 0.3164226259189501, + "grad_norm": 0.17801958322525024, + "learning_rate": 7.754576211519264e-05, + "loss": 1.1745, + "step": 5208 + }, + { + "epoch": 0.31648338295157663, + "grad_norm": 0.18977457284927368, + "learning_rate": 7.753777254544934e-05, + "loss": 1.127, + "step": 5209 + }, + { + "epoch": 0.31654413998420317, + "grad_norm": 0.17817537486553192, + "learning_rate": 7.752978196630368e-05, + "loss": 1.1358, + "step": 5210 + }, + { + "epoch": 0.3166048970168297, + "grad_norm": 0.18756113946437836, + "learning_rate": 7.752179037804851e-05, + "loss": 1.0939, + "step": 5211 + }, + { + "epoch": 0.31666565404945624, + "grad_norm": 0.14955630898475647, + "learning_rate": 7.75137977809768e-05, + "loss": 1.0801, + "step": 5212 + }, + { + "epoch": 0.31672641108208277, + "grad_norm": 5.002853870391846, + "learning_rate": 7.750580417538153e-05, + "loss": 1.1666, + "step": 5213 + }, + { + "epoch": 0.31678716811470925, + "grad_norm": 0.21667887270450592, + "learning_rate": 7.749780956155568e-05, + "loss": 1.0649, + "step": 5214 + }, + { + "epoch": 0.3168479251473358, + "grad_norm": 0.4345817267894745, + "learning_rate": 7.74898139397923e-05, + "loss": 1.0416, + "step": 5215 + }, + { + "epoch": 0.3169086821799623, + "grad_norm": 0.3247835338115692, + "learning_rate": 7.748181731038449e-05, + "loss": 1.1617, + "step": 5216 + }, + { + "epoch": 0.31696943921258885, + "grad_norm": 0.18982625007629395, + "learning_rate": 7.747381967362536e-05, + "loss": 1.0961, + "step": 5217 + }, + { + "epoch": 0.3170301962452154, + "grad_norm": 37.097442626953125, + "learning_rate": 7.746582102980804e-05, + "loss": 1.1801, + "step": 5218 + }, + { + "epoch": 0.3170909532778419, + "grad_norm": 0.8889961242675781, + "learning_rate": 7.745782137922574e-05, + "loss": 1.1257, + "step": 5219 + }, + { + "epoch": 0.31715171031046846, + "grad_norm": 0.2838846743106842, + "learning_rate": 7.744982072217171e-05, + "loss": 1.1328, + "step": 5220 + }, + { + "epoch": 0.31721246734309494, + "grad_norm": 0.16100822389125824, + "learning_rate": 7.74418190589392e-05, + "loss": 1.0843, + "step": 5221 + }, + { + "epoch": 0.3172732243757215, + "grad_norm": 0.3542579710483551, + "learning_rate": 7.743381638982148e-05, + "loss": 1.0543, + "step": 5222 + }, + { + "epoch": 0.317333981408348, + "grad_norm": 0.13324719667434692, + "learning_rate": 7.742581271511192e-05, + "loss": 1.0876, + "step": 5223 + }, + { + "epoch": 0.31739473844097454, + "grad_norm": 0.1612965613603592, + "learning_rate": 7.741780803510391e-05, + "loss": 1.1505, + "step": 5224 + }, + { + "epoch": 0.3174554954736011, + "grad_norm": 0.1852015107870102, + "learning_rate": 7.740980235009084e-05, + "loss": 1.1412, + "step": 5225 + }, + { + "epoch": 0.3175162525062276, + "grad_norm": 0.15180452167987823, + "learning_rate": 7.740179566036617e-05, + "loss": 1.0735, + "step": 5226 + }, + { + "epoch": 0.31757700953885415, + "grad_norm": 0.14446836709976196, + "learning_rate": 7.739378796622338e-05, + "loss": 1.0564, + "step": 5227 + }, + { + "epoch": 0.31763776657148063, + "grad_norm": 1.1163607835769653, + "learning_rate": 7.738577926795601e-05, + "loss": 1.0983, + "step": 5228 + }, + { + "epoch": 0.31769852360410716, + "grad_norm": 2.0002286434173584, + "learning_rate": 7.737776956585759e-05, + "loss": 1.1263, + "step": 5229 + }, + { + "epoch": 0.3177592806367337, + "grad_norm": 0.14760617911815643, + "learning_rate": 7.736975886022173e-05, + "loss": 1.02, + "step": 5230 + }, + { + "epoch": 0.31782003766936023, + "grad_norm": 0.20231235027313232, + "learning_rate": 7.736174715134208e-05, + "loss": 1.0735, + "step": 5231 + }, + { + "epoch": 0.31788079470198677, + "grad_norm": 0.12804539501667023, + "learning_rate": 7.735373443951229e-05, + "loss": 1.0808, + "step": 5232 + }, + { + "epoch": 0.3179415517346133, + "grad_norm": 0.22206659615039825, + "learning_rate": 7.734572072502609e-05, + "loss": 1.1571, + "step": 5233 + }, + { + "epoch": 0.3180023087672398, + "grad_norm": 0.2367912232875824, + "learning_rate": 7.733770600817721e-05, + "loss": 1.0904, + "step": 5234 + }, + { + "epoch": 0.3180630657998663, + "grad_norm": 0.5960977077484131, + "learning_rate": 7.732969028925943e-05, + "loss": 1.0733, + "step": 5235 + }, + { + "epoch": 0.31812382283249285, + "grad_norm": 0.219915509223938, + "learning_rate": 7.732167356856655e-05, + "loss": 1.2214, + "step": 5236 + }, + { + "epoch": 0.3181845798651194, + "grad_norm": 0.43676573038101196, + "learning_rate": 7.731365584639248e-05, + "loss": 1.1628, + "step": 5237 + }, + { + "epoch": 0.3182453368977459, + "grad_norm": 0.45794686675071716, + "learning_rate": 7.730563712303104e-05, + "loss": 1.1118, + "step": 5238 + }, + { + "epoch": 0.31830609393037246, + "grad_norm": 0.13144971430301666, + "learning_rate": 7.729761739877622e-05, + "loss": 1.0708, + "step": 5239 + }, + { + "epoch": 0.318366850962999, + "grad_norm": 0.23507849872112274, + "learning_rate": 7.728959667392195e-05, + "loss": 1.1135, + "step": 5240 + }, + { + "epoch": 0.3184276079956255, + "grad_norm": 0.2360433042049408, + "learning_rate": 7.728157494876223e-05, + "loss": 1.0795, + "step": 5241 + }, + { + "epoch": 0.318488365028252, + "grad_norm": 0.2307671159505844, + "learning_rate": 7.727355222359112e-05, + "loss": 1.1673, + "step": 5242 + }, + { + "epoch": 0.31854912206087854, + "grad_norm": 0.20978204905986786, + "learning_rate": 7.726552849870267e-05, + "loss": 1.1977, + "step": 5243 + }, + { + "epoch": 0.3186098790935051, + "grad_norm": 0.22503332793712616, + "learning_rate": 7.7257503774391e-05, + "loss": 1.2425, + "step": 5244 + }, + { + "epoch": 0.3186706361261316, + "grad_norm": 0.15387806296348572, + "learning_rate": 7.724947805095027e-05, + "loss": 1.0151, + "step": 5245 + }, + { + "epoch": 0.31873139315875815, + "grad_norm": 0.13361290097236633, + "learning_rate": 7.724145132867466e-05, + "loss": 1.067, + "step": 5246 + }, + { + "epoch": 0.3187921501913846, + "grad_norm": 0.13001136481761932, + "learning_rate": 7.723342360785837e-05, + "loss": 1.0508, + "step": 5247 + }, + { + "epoch": 0.31885290722401116, + "grad_norm": 0.22706355154514313, + "learning_rate": 7.722539488879569e-05, + "loss": 1.2687, + "step": 5248 + }, + { + "epoch": 0.3189136642566377, + "grad_norm": 0.16564755141735077, + "learning_rate": 7.721736517178089e-05, + "loss": 1.0756, + "step": 5249 + }, + { + "epoch": 0.31897442128926423, + "grad_norm": 0.2275359034538269, + "learning_rate": 7.72093344571083e-05, + "loss": 1.0813, + "step": 5250 + }, + { + "epoch": 0.31903517832189077, + "grad_norm": 0.12234020233154297, + "learning_rate": 7.72013027450723e-05, + "loss": 1.0399, + "step": 5251 + }, + { + "epoch": 0.3190959353545173, + "grad_norm": 0.19603148102760315, + "learning_rate": 7.71932700359673e-05, + "loss": 1.1804, + "step": 5252 + }, + { + "epoch": 0.31915669238714384, + "grad_norm": 0.19225366413593292, + "learning_rate": 7.718523633008772e-05, + "loss": 1.1531, + "step": 5253 + }, + { + "epoch": 0.3192174494197703, + "grad_norm": 0.1957905888557434, + "learning_rate": 7.717720162772806e-05, + "loss": 1.1132, + "step": 5254 + }, + { + "epoch": 0.31927820645239685, + "grad_norm": 0.4529777765274048, + "learning_rate": 7.716916592918281e-05, + "loss": 1.2141, + "step": 5255 + }, + { + "epoch": 0.3193389634850234, + "grad_norm": 0.2204008251428604, + "learning_rate": 7.716112923474654e-05, + "loss": 1.1488, + "step": 5256 + }, + { + "epoch": 0.3193997205176499, + "grad_norm": 0.24447494745254517, + "learning_rate": 7.715309154471382e-05, + "loss": 1.256, + "step": 5257 + }, + { + "epoch": 0.31946047755027646, + "grad_norm": 0.16351109743118286, + "learning_rate": 7.714505285937928e-05, + "loss": 1.0119, + "step": 5258 + }, + { + "epoch": 0.319521234582903, + "grad_norm": 0.27826523780822754, + "learning_rate": 7.713701317903759e-05, + "loss": 1.1552, + "step": 5259 + }, + { + "epoch": 0.31958199161552947, + "grad_norm": 0.4236544668674469, + "learning_rate": 7.712897250398343e-05, + "loss": 1.0764, + "step": 5260 + }, + { + "epoch": 0.319642748648156, + "grad_norm": 0.19925330579280853, + "learning_rate": 7.712093083451153e-05, + "loss": 1.1677, + "step": 5261 + }, + { + "epoch": 0.31970350568078254, + "grad_norm": 0.1551828682422638, + "learning_rate": 7.711288817091669e-05, + "loss": 1.0962, + "step": 5262 + }, + { + "epoch": 0.3197642627134091, + "grad_norm": 0.24796655774116516, + "learning_rate": 7.710484451349367e-05, + "loss": 1.1001, + "step": 5263 + }, + { + "epoch": 0.3198250197460356, + "grad_norm": 0.17019613087177277, + "learning_rate": 7.709679986253735e-05, + "loss": 1.0776, + "step": 5264 + }, + { + "epoch": 0.31988577677866215, + "grad_norm": 0.1914503425359726, + "learning_rate": 7.708875421834258e-05, + "loss": 1.069, + "step": 5265 + }, + { + "epoch": 0.3199465338112887, + "grad_norm": 0.287005215883255, + "learning_rate": 7.70807075812043e-05, + "loss": 1.2286, + "step": 5266 + }, + { + "epoch": 0.32000729084391516, + "grad_norm": 0.4092259407043457, + "learning_rate": 7.707265995141744e-05, + "loss": 1.1158, + "step": 5267 + }, + { + "epoch": 0.3200680478765417, + "grad_norm": 0.21153295040130615, + "learning_rate": 7.7064611329277e-05, + "loss": 1.1045, + "step": 5268 + }, + { + "epoch": 0.32012880490916823, + "grad_norm": 0.1494741588830948, + "learning_rate": 7.705656171507802e-05, + "loss": 1.0573, + "step": 5269 + }, + { + "epoch": 0.32018956194179476, + "grad_norm": 0.26257410645484924, + "learning_rate": 7.70485111091155e-05, + "loss": 1.1391, + "step": 5270 + }, + { + "epoch": 0.3202503189744213, + "grad_norm": 0.17116105556488037, + "learning_rate": 7.704045951168461e-05, + "loss": 1.0949, + "step": 5271 + }, + { + "epoch": 0.32031107600704783, + "grad_norm": 0.17433473467826843, + "learning_rate": 7.703240692308044e-05, + "loss": 1.1713, + "step": 5272 + }, + { + "epoch": 0.32037183303967437, + "grad_norm": 0.162774920463562, + "learning_rate": 7.702435334359817e-05, + "loss": 1.1501, + "step": 5273 + }, + { + "epoch": 0.32043259007230085, + "grad_norm": 0.1266329288482666, + "learning_rate": 7.7016298773533e-05, + "loss": 1.0956, + "step": 5274 + }, + { + "epoch": 0.3204933471049274, + "grad_norm": 0.19025348126888275, + "learning_rate": 7.700824321318019e-05, + "loss": 1.0888, + "step": 5275 + }, + { + "epoch": 0.3205541041375539, + "grad_norm": 0.18125171959400177, + "learning_rate": 7.7000186662835e-05, + "loss": 1.1075, + "step": 5276 + }, + { + "epoch": 0.32061486117018045, + "grad_norm": 0.11788712441921234, + "learning_rate": 7.699212912279274e-05, + "loss": 1.0304, + "step": 5277 + }, + { + "epoch": 0.320675618202807, + "grad_norm": 0.4160124957561493, + "learning_rate": 7.698407059334877e-05, + "loss": 1.2273, + "step": 5278 + }, + { + "epoch": 0.3207363752354335, + "grad_norm": 1.816224217414856, + "learning_rate": 7.697601107479846e-05, + "loss": 1.1682, + "step": 5279 + }, + { + "epoch": 0.32079713226806, + "grad_norm": 0.17960700392723083, + "learning_rate": 7.696795056743727e-05, + "loss": 1.155, + "step": 5280 + }, + { + "epoch": 0.32085788930068654, + "grad_norm": 0.22117215394973755, + "learning_rate": 7.695988907156066e-05, + "loss": 1.2285, + "step": 5281 + }, + { + "epoch": 0.3209186463333131, + "grad_norm": 0.22642168402671814, + "learning_rate": 7.695182658746407e-05, + "loss": 1.1356, + "step": 5282 + }, + { + "epoch": 0.3209794033659396, + "grad_norm": 2.557623863220215, + "learning_rate": 7.694376311544307e-05, + "loss": 1.1825, + "step": 5283 + }, + { + "epoch": 0.32104016039856614, + "grad_norm": 0.19348691403865814, + "learning_rate": 7.693569865579325e-05, + "loss": 1.1193, + "step": 5284 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 0.1772070974111557, + "learning_rate": 7.692763320881015e-05, + "loss": 1.2338, + "step": 5285 + }, + { + "epoch": 0.3211616744638192, + "grad_norm": 3.7062413692474365, + "learning_rate": 7.691956677478948e-05, + "loss": 1.2231, + "step": 5286 + }, + { + "epoch": 0.3212224314964457, + "grad_norm": 0.14694485068321228, + "learning_rate": 7.691149935402689e-05, + "loss": 1.0527, + "step": 5287 + }, + { + "epoch": 0.32128318852907223, + "grad_norm": 0.35769063234329224, + "learning_rate": 7.690343094681807e-05, + "loss": 1.1529, + "step": 5288 + }, + { + "epoch": 0.32134394556169876, + "grad_norm": 0.19548434019088745, + "learning_rate": 7.68953615534588e-05, + "loss": 0.9972, + "step": 5289 + }, + { + "epoch": 0.3214047025943253, + "grad_norm": 0.2896392345428467, + "learning_rate": 7.688729117424484e-05, + "loss": 1.0352, + "step": 5290 + }, + { + "epoch": 0.32146545962695183, + "grad_norm": 0.1843344122171402, + "learning_rate": 7.687921980947206e-05, + "loss": 1.063, + "step": 5291 + }, + { + "epoch": 0.32152621665957837, + "grad_norm": 0.23084795475006104, + "learning_rate": 7.687114745943624e-05, + "loss": 1.2774, + "step": 5292 + }, + { + "epoch": 0.32158697369220485, + "grad_norm": 0.24386334419250488, + "learning_rate": 7.686307412443334e-05, + "loss": 1.1572, + "step": 5293 + }, + { + "epoch": 0.3216477307248314, + "grad_norm": 0.18706533312797546, + "learning_rate": 7.685499980475927e-05, + "loss": 1.1078, + "step": 5294 + }, + { + "epoch": 0.3217084877574579, + "grad_norm": 0.16719487309455872, + "learning_rate": 7.684692450070996e-05, + "loss": 1.0529, + "step": 5295 + }, + { + "epoch": 0.32176924479008445, + "grad_norm": 0.1591726392507553, + "learning_rate": 7.683884821258147e-05, + "loss": 1.0388, + "step": 5296 + }, + { + "epoch": 0.321830001822711, + "grad_norm": 0.22185593843460083, + "learning_rate": 7.68307709406698e-05, + "loss": 1.0579, + "step": 5297 + }, + { + "epoch": 0.3218907588553375, + "grad_norm": 0.14457620680332184, + "learning_rate": 7.682269268527104e-05, + "loss": 1.056, + "step": 5298 + }, + { + "epoch": 0.32195151588796406, + "grad_norm": 0.22602593898773193, + "learning_rate": 7.681461344668131e-05, + "loss": 1.1887, + "step": 5299 + }, + { + "epoch": 0.32201227292059054, + "grad_norm": 0.1864214390516281, + "learning_rate": 7.680653322519671e-05, + "loss": 1.048, + "step": 5300 + }, + { + "epoch": 0.32207302995321707, + "grad_norm": 0.19677239656448364, + "learning_rate": 7.679845202111347e-05, + "loss": 1.1757, + "step": 5301 + }, + { + "epoch": 0.3221337869858436, + "grad_norm": 0.3017561435699463, + "learning_rate": 7.67903698347278e-05, + "loss": 1.1122, + "step": 5302 + }, + { + "epoch": 0.32219454401847014, + "grad_norm": 0.12074954807758331, + "learning_rate": 7.678228666633592e-05, + "loss": 1.0468, + "step": 5303 + }, + { + "epoch": 0.3222553010510967, + "grad_norm": 0.17102929949760437, + "learning_rate": 7.677420251623418e-05, + "loss": 1.1208, + "step": 5304 + }, + { + "epoch": 0.3223160580837232, + "grad_norm": 0.3344148099422455, + "learning_rate": 7.676611738471882e-05, + "loss": 1.0861, + "step": 5305 + }, + { + "epoch": 0.3223768151163497, + "grad_norm": 0.30935409665107727, + "learning_rate": 7.675803127208629e-05, + "loss": 1.0522, + "step": 5306 + }, + { + "epoch": 0.3224375721489762, + "grad_norm": 0.12636761367321014, + "learning_rate": 7.674994417863295e-05, + "loss": 1.0914, + "step": 5307 + }, + { + "epoch": 0.32249832918160276, + "grad_norm": 0.20348799228668213, + "learning_rate": 7.674185610465523e-05, + "loss": 1.1769, + "step": 5308 + }, + { + "epoch": 0.3225590862142293, + "grad_norm": 0.21296432614326477, + "learning_rate": 7.673376705044962e-05, + "loss": 1.0786, + "step": 5309 + }, + { + "epoch": 0.32261984324685583, + "grad_norm": 1.714321494102478, + "learning_rate": 7.67256770163126e-05, + "loss": 1.0796, + "step": 5310 + }, + { + "epoch": 0.32268060027948237, + "grad_norm": 0.27067720890045166, + "learning_rate": 7.671758600254071e-05, + "loss": 1.1471, + "step": 5311 + }, + { + "epoch": 0.3227413573121089, + "grad_norm": 0.14980441331863403, + "learning_rate": 7.670949400943054e-05, + "loss": 1.1106, + "step": 5312 + }, + { + "epoch": 0.3228021143447354, + "grad_norm": 0.2398141771554947, + "learning_rate": 7.670140103727871e-05, + "loss": 1.2155, + "step": 5313 + }, + { + "epoch": 0.3228628713773619, + "grad_norm": 0.3515942096710205, + "learning_rate": 7.66933070863819e-05, + "loss": 1.0888, + "step": 5314 + }, + { + "epoch": 0.32292362840998845, + "grad_norm": 0.1338648647069931, + "learning_rate": 7.668521215703672e-05, + "loss": 1.0556, + "step": 5315 + }, + { + "epoch": 0.322984385442615, + "grad_norm": 0.40135011076927185, + "learning_rate": 7.667711624953993e-05, + "loss": 1.0619, + "step": 5316 + }, + { + "epoch": 0.3230451424752415, + "grad_norm": 0.2750815749168396, + "learning_rate": 7.66690193641883e-05, + "loss": 1.035, + "step": 5317 + }, + { + "epoch": 0.32310589950786806, + "grad_norm": 0.1642073690891266, + "learning_rate": 7.66609215012786e-05, + "loss": 1.1097, + "step": 5318 + }, + { + "epoch": 0.32316665654049453, + "grad_norm": 0.16084831953048706, + "learning_rate": 7.665282266110768e-05, + "loss": 1.074, + "step": 5319 + }, + { + "epoch": 0.32322741357312107, + "grad_norm": 0.25044238567352295, + "learning_rate": 7.664472284397239e-05, + "loss": 1.1129, + "step": 5320 + }, + { + "epoch": 0.3232881706057476, + "grad_norm": 0.1886386126279831, + "learning_rate": 7.663662205016964e-05, + "loss": 1.1071, + "step": 5321 + }, + { + "epoch": 0.32334892763837414, + "grad_norm": 0.2613964378833771, + "learning_rate": 7.662852027999636e-05, + "loss": 1.2313, + "step": 5322 + }, + { + "epoch": 0.3234096846710007, + "grad_norm": 0.3531823456287384, + "learning_rate": 7.662041753374949e-05, + "loss": 1.0519, + "step": 5323 + }, + { + "epoch": 0.3234704417036272, + "grad_norm": 0.1669054925441742, + "learning_rate": 7.661231381172611e-05, + "loss": 1.0758, + "step": 5324 + }, + { + "epoch": 0.32353119873625374, + "grad_norm": 0.1835552304983139, + "learning_rate": 7.660420911422321e-05, + "loss": 1.2275, + "step": 5325 + }, + { + "epoch": 0.3235919557688802, + "grad_norm": 0.1822025626897812, + "learning_rate": 7.659610344153787e-05, + "loss": 1.2218, + "step": 5326 + }, + { + "epoch": 0.32365271280150676, + "grad_norm": 1.3519262075424194, + "learning_rate": 7.658799679396724e-05, + "loss": 1.0978, + "step": 5327 + }, + { + "epoch": 0.3237134698341333, + "grad_norm": 0.19118796288967133, + "learning_rate": 7.657988917180843e-05, + "loss": 1.1404, + "step": 5328 + }, + { + "epoch": 0.32377422686675983, + "grad_norm": 25.419490814208984, + "learning_rate": 7.657178057535866e-05, + "loss": 1.3757, + "step": 5329 + }, + { + "epoch": 0.32383498389938636, + "grad_norm": 0.20100799202919006, + "learning_rate": 7.656367100491512e-05, + "loss": 1.0809, + "step": 5330 + }, + { + "epoch": 0.3238957409320129, + "grad_norm": 0.3121403753757477, + "learning_rate": 7.65555604607751e-05, + "loss": 1.1042, + "step": 5331 + }, + { + "epoch": 0.32395649796463943, + "grad_norm": 0.16733300685882568, + "learning_rate": 7.654744894323586e-05, + "loss": 1.1085, + "step": 5332 + }, + { + "epoch": 0.3240172549972659, + "grad_norm": 0.1692768931388855, + "learning_rate": 7.653933645259475e-05, + "loss": 1.1277, + "step": 5333 + }, + { + "epoch": 0.32407801202989245, + "grad_norm": 0.3412485122680664, + "learning_rate": 7.653122298914915e-05, + "loss": 1.0951, + "step": 5334 + }, + { + "epoch": 0.324138769062519, + "grad_norm": 0.15294376015663147, + "learning_rate": 7.652310855319643e-05, + "loss": 1.1336, + "step": 5335 + }, + { + "epoch": 0.3241995260951455, + "grad_norm": 0.20286858081817627, + "learning_rate": 7.651499314503403e-05, + "loss": 1.0978, + "step": 5336 + }, + { + "epoch": 0.32426028312777205, + "grad_norm": 0.14353255927562714, + "learning_rate": 7.650687676495945e-05, + "loss": 1.0742, + "step": 5337 + }, + { + "epoch": 0.3243210401603986, + "grad_norm": 0.1851993054151535, + "learning_rate": 7.649875941327018e-05, + "loss": 1.0608, + "step": 5338 + }, + { + "epoch": 0.32438179719302507, + "grad_norm": 0.15749365091323853, + "learning_rate": 7.649064109026374e-05, + "loss": 1.1293, + "step": 5339 + }, + { + "epoch": 0.3244425542256516, + "grad_norm": 0.14843803644180298, + "learning_rate": 7.648252179623775e-05, + "loss": 1.124, + "step": 5340 + }, + { + "epoch": 0.32450331125827814, + "grad_norm": 0.15867199003696442, + "learning_rate": 7.647440153148979e-05, + "loss": 1.125, + "step": 5341 + }, + { + "epoch": 0.3245640682909047, + "grad_norm": 0.19077855348587036, + "learning_rate": 7.646628029631754e-05, + "loss": 1.1323, + "step": 5342 + }, + { + "epoch": 0.3246248253235312, + "grad_norm": 0.2224782556295395, + "learning_rate": 7.645815809101866e-05, + "loss": 1.2911, + "step": 5343 + }, + { + "epoch": 0.32468558235615774, + "grad_norm": 0.13767606019973755, + "learning_rate": 7.645003491589089e-05, + "loss": 1.0544, + "step": 5344 + }, + { + "epoch": 0.3247463393887843, + "grad_norm": 0.9019991755485535, + "learning_rate": 7.644191077123197e-05, + "loss": 1.202, + "step": 5345 + }, + { + "epoch": 0.32480709642141076, + "grad_norm": 0.8234071135520935, + "learning_rate": 7.64337856573397e-05, + "loss": 1.1843, + "step": 5346 + }, + { + "epoch": 0.3248678534540373, + "grad_norm": 0.14335016906261444, + "learning_rate": 7.64256595745119e-05, + "loss": 1.0796, + "step": 5347 + }, + { + "epoch": 0.3249286104866638, + "grad_norm": 0.19248712062835693, + "learning_rate": 7.641753252304646e-05, + "loss": 1.116, + "step": 5348 + }, + { + "epoch": 0.32498936751929036, + "grad_norm": 0.2020735740661621, + "learning_rate": 7.640940450324124e-05, + "loss": 1.1848, + "step": 5349 + }, + { + "epoch": 0.3250501245519169, + "grad_norm": 0.14655005931854248, + "learning_rate": 7.640127551539419e-05, + "loss": 1.1154, + "step": 5350 + }, + { + "epoch": 0.32511088158454343, + "grad_norm": 0.2234189659357071, + "learning_rate": 7.63931455598033e-05, + "loss": 1.1787, + "step": 5351 + }, + { + "epoch": 0.3251716386171699, + "grad_norm": 0.37811279296875, + "learning_rate": 7.638501463676655e-05, + "loss": 1.0986, + "step": 5352 + }, + { + "epoch": 0.32523239564979645, + "grad_norm": 0.15809832513332367, + "learning_rate": 7.6376882746582e-05, + "loss": 1.0922, + "step": 5353 + }, + { + "epoch": 0.325293152682423, + "grad_norm": 0.16970260441303253, + "learning_rate": 7.63687498895477e-05, + "loss": 1.104, + "step": 5354 + }, + { + "epoch": 0.3253539097150495, + "grad_norm": 0.17243817448616028, + "learning_rate": 7.636061606596178e-05, + "loss": 1.067, + "step": 5355 + }, + { + "epoch": 0.32541466674767605, + "grad_norm": 0.15289516746997833, + "learning_rate": 7.635248127612238e-05, + "loss": 1.0662, + "step": 5356 + }, + { + "epoch": 0.3254754237803026, + "grad_norm": 0.17811495065689087, + "learning_rate": 7.634434552032768e-05, + "loss": 1.0967, + "step": 5357 + }, + { + "epoch": 0.3255361808129291, + "grad_norm": 0.14455220103263855, + "learning_rate": 7.63362087988759e-05, + "loss": 1.0499, + "step": 5358 + }, + { + "epoch": 0.3255969378455556, + "grad_norm": 0.1708948016166687, + "learning_rate": 7.632807111206531e-05, + "loss": 1.1196, + "step": 5359 + }, + { + "epoch": 0.32565769487818214, + "grad_norm": 0.16589440405368805, + "learning_rate": 7.631993246019418e-05, + "loss": 1.0705, + "step": 5360 + }, + { + "epoch": 0.32571845191080867, + "grad_norm": 0.12406129390001297, + "learning_rate": 7.631179284356085e-05, + "loss": 1.08, + "step": 5361 + }, + { + "epoch": 0.3257792089434352, + "grad_norm": 0.19620484113693237, + "learning_rate": 7.630365226246365e-05, + "loss": 1.0555, + "step": 5362 + }, + { + "epoch": 0.32583996597606174, + "grad_norm": 0.24526768922805786, + "learning_rate": 7.6295510717201e-05, + "loss": 1.1506, + "step": 5363 + }, + { + "epoch": 0.3259007230086883, + "grad_norm": 0.38127610087394714, + "learning_rate": 7.628736820807132e-05, + "loss": 1.1142, + "step": 5364 + }, + { + "epoch": 0.32596148004131476, + "grad_norm": 0.19456841051578522, + "learning_rate": 7.627922473537307e-05, + "loss": 1.1398, + "step": 5365 + }, + { + "epoch": 0.3260222370739413, + "grad_norm": 0.8136919736862183, + "learning_rate": 7.627108029940477e-05, + "loss": 1.0773, + "step": 5366 + }, + { + "epoch": 0.3260829941065678, + "grad_norm": 0.17009076476097107, + "learning_rate": 7.626293490046493e-05, + "loss": 1.172, + "step": 5367 + }, + { + "epoch": 0.32614375113919436, + "grad_norm": 0.24989072978496552, + "learning_rate": 7.625478853885215e-05, + "loss": 1.0367, + "step": 5368 + }, + { + "epoch": 0.3262045081718209, + "grad_norm": 0.22486184537410736, + "learning_rate": 7.6246641214865e-05, + "loss": 1.0291, + "step": 5369 + }, + { + "epoch": 0.32626526520444743, + "grad_norm": 0.25649169087409973, + "learning_rate": 7.623849292880216e-05, + "loss": 1.1169, + "step": 5370 + }, + { + "epoch": 0.32632602223707396, + "grad_norm": 0.22672656178474426, + "learning_rate": 7.623034368096229e-05, + "loss": 1.004, + "step": 5371 + }, + { + "epoch": 0.32638677926970044, + "grad_norm": 0.13147930800914764, + "learning_rate": 7.62221934716441e-05, + "loss": 1.0593, + "step": 5372 + }, + { + "epoch": 0.326447536302327, + "grad_norm": 0.2682172954082489, + "learning_rate": 7.621404230114632e-05, + "loss": 1.0627, + "step": 5373 + }, + { + "epoch": 0.3265082933349535, + "grad_norm": 0.15371541678905487, + "learning_rate": 7.62058901697678e-05, + "loss": 1.0906, + "step": 5374 + }, + { + "epoch": 0.32656905036758005, + "grad_norm": 6.770089149475098, + "learning_rate": 7.619773707780727e-05, + "loss": 1.0783, + "step": 5375 + }, + { + "epoch": 0.3266298074002066, + "grad_norm": 0.222050741314888, + "learning_rate": 7.618958302556363e-05, + "loss": 1.0888, + "step": 5376 + }, + { + "epoch": 0.3266905644328331, + "grad_norm": 0.18792663514614105, + "learning_rate": 7.618142801333576e-05, + "loss": 1.1686, + "step": 5377 + }, + { + "epoch": 0.32675132146545965, + "grad_norm": 0.47172272205352783, + "learning_rate": 7.61732720414226e-05, + "loss": 1.2057, + "step": 5378 + }, + { + "epoch": 0.32681207849808613, + "grad_norm": 0.3117204010486603, + "learning_rate": 7.616511511012308e-05, + "loss": 1.0713, + "step": 5379 + }, + { + "epoch": 0.32687283553071267, + "grad_norm": 0.1713021695613861, + "learning_rate": 7.615695721973621e-05, + "loss": 1.1123, + "step": 5380 + }, + { + "epoch": 0.3269335925633392, + "grad_norm": 0.13214349746704102, + "learning_rate": 7.614879837056103e-05, + "loss": 1.0323, + "step": 5381 + }, + { + "epoch": 0.32699434959596574, + "grad_norm": 0.17013302445411682, + "learning_rate": 7.614063856289658e-05, + "loss": 1.0746, + "step": 5382 + }, + { + "epoch": 0.3270551066285923, + "grad_norm": 0.1985173225402832, + "learning_rate": 7.613247779704197e-05, + "loss": 1.0644, + "step": 5383 + }, + { + "epoch": 0.3271158636612188, + "grad_norm": 0.21263551712036133, + "learning_rate": 7.612431607329635e-05, + "loss": 1.0573, + "step": 5384 + }, + { + "epoch": 0.3271766206938453, + "grad_norm": 0.6282607316970825, + "learning_rate": 7.611615339195886e-05, + "loss": 1.0843, + "step": 5385 + }, + { + "epoch": 0.3272373777264718, + "grad_norm": 0.33215171098709106, + "learning_rate": 7.610798975332871e-05, + "loss": 1.1194, + "step": 5386 + }, + { + "epoch": 0.32729813475909836, + "grad_norm": 0.2752797603607178, + "learning_rate": 7.609982515770516e-05, + "loss": 1.1742, + "step": 5387 + }, + { + "epoch": 0.3273588917917249, + "grad_norm": 0.21759207546710968, + "learning_rate": 7.609165960538748e-05, + "loss": 1.1583, + "step": 5388 + }, + { + "epoch": 0.32741964882435143, + "grad_norm": 0.30800655484199524, + "learning_rate": 7.608349309667498e-05, + "loss": 1.2168, + "step": 5389 + }, + { + "epoch": 0.32748040585697796, + "grad_norm": 0.17498065531253815, + "learning_rate": 7.607532563186696e-05, + "loss": 1.1263, + "step": 5390 + }, + { + "epoch": 0.3275411628896045, + "grad_norm": 0.15697449445724487, + "learning_rate": 7.606715721126288e-05, + "loss": 1.086, + "step": 5391 + }, + { + "epoch": 0.327601919922231, + "grad_norm": 0.16416527330875397, + "learning_rate": 7.60589878351621e-05, + "loss": 1.0964, + "step": 5392 + }, + { + "epoch": 0.3276626769548575, + "grad_norm": 0.21482136845588684, + "learning_rate": 7.605081750386407e-05, + "loss": 1.1328, + "step": 5393 + }, + { + "epoch": 0.32772343398748405, + "grad_norm": 0.27260974049568176, + "learning_rate": 7.604264621766833e-05, + "loss": 1.1123, + "step": 5394 + }, + { + "epoch": 0.3277841910201106, + "grad_norm": 0.23360304534435272, + "learning_rate": 7.603447397687432e-05, + "loss": 1.0687, + "step": 5395 + }, + { + "epoch": 0.3278449480527371, + "grad_norm": 0.13089804351329803, + "learning_rate": 7.602630078178164e-05, + "loss": 1.0495, + "step": 5396 + }, + { + "epoch": 0.32790570508536365, + "grad_norm": 0.13218490779399872, + "learning_rate": 7.601812663268988e-05, + "loss": 1.0475, + "step": 5397 + }, + { + "epoch": 0.32796646211799013, + "grad_norm": 0.375641793012619, + "learning_rate": 7.600995152989865e-05, + "loss": 1.0799, + "step": 5398 + }, + { + "epoch": 0.32802721915061667, + "grad_norm": 0.18387757241725922, + "learning_rate": 7.600177547370763e-05, + "loss": 1.1275, + "step": 5399 + }, + { + "epoch": 0.3280879761832432, + "grad_norm": 0.1416109949350357, + "learning_rate": 7.599359846441651e-05, + "loss": 1.0789, + "step": 5400 + }, + { + "epoch": 0.32814873321586974, + "grad_norm": 0.20161789655685425, + "learning_rate": 7.5985420502325e-05, + "loss": 1.0724, + "step": 5401 + }, + { + "epoch": 0.32820949024849627, + "grad_norm": 0.13243705034255981, + "learning_rate": 7.597724158773289e-05, + "loss": 1.0758, + "step": 5402 + }, + { + "epoch": 0.3282702472811228, + "grad_norm": 0.24550196528434753, + "learning_rate": 7.596906172093995e-05, + "loss": 1.0983, + "step": 5403 + }, + { + "epoch": 0.32833100431374934, + "grad_norm": 0.2173246145248413, + "learning_rate": 7.596088090224604e-05, + "loss": 1.1317, + "step": 5404 + }, + { + "epoch": 0.3283917613463758, + "grad_norm": 0.23761998116970062, + "learning_rate": 7.595269913195104e-05, + "loss": 1.1346, + "step": 5405 + }, + { + "epoch": 0.32845251837900236, + "grad_norm": 0.42454764246940613, + "learning_rate": 7.594451641035482e-05, + "loss": 1.0699, + "step": 5406 + }, + { + "epoch": 0.3285132754116289, + "grad_norm": 0.159896120429039, + "learning_rate": 7.593633273775734e-05, + "loss": 1.0873, + "step": 5407 + }, + { + "epoch": 0.3285740324442554, + "grad_norm": 0.13794861733913422, + "learning_rate": 7.592814811445856e-05, + "loss": 1.1247, + "step": 5408 + }, + { + "epoch": 0.32863478947688196, + "grad_norm": 0.20439006388187408, + "learning_rate": 7.591996254075851e-05, + "loss": 1.0699, + "step": 5409 + }, + { + "epoch": 0.3286955465095085, + "grad_norm": 0.1942063421010971, + "learning_rate": 7.591177601695722e-05, + "loss": 1.0832, + "step": 5410 + }, + { + "epoch": 0.328756303542135, + "grad_norm": 0.19093358516693115, + "learning_rate": 7.59035885433548e-05, + "loss": 1.0546, + "step": 5411 + }, + { + "epoch": 0.3288170605747615, + "grad_norm": 0.14206360280513763, + "learning_rate": 7.58954001202513e-05, + "loss": 1.0647, + "step": 5412 + }, + { + "epoch": 0.32887781760738805, + "grad_norm": 0.1793767511844635, + "learning_rate": 7.588721074794691e-05, + "loss": 1.0647, + "step": 5413 + }, + { + "epoch": 0.3289385746400146, + "grad_norm": 0.2375302016735077, + "learning_rate": 7.58790204267418e-05, + "loss": 1.128, + "step": 5414 + }, + { + "epoch": 0.3289993316726411, + "grad_norm": 0.17666825652122498, + "learning_rate": 7.58708291569362e-05, + "loss": 1.1092, + "step": 5415 + }, + { + "epoch": 0.32906008870526765, + "grad_norm": 0.27784040570259094, + "learning_rate": 7.586263693883036e-05, + "loss": 1.0649, + "step": 5416 + }, + { + "epoch": 0.3291208457378942, + "grad_norm": 0.18404880166053772, + "learning_rate": 7.585444377272456e-05, + "loss": 1.1707, + "step": 5417 + }, + { + "epoch": 0.32918160277052066, + "grad_norm": 0.42867541313171387, + "learning_rate": 7.584624965891913e-05, + "loss": 1.1083, + "step": 5418 + }, + { + "epoch": 0.3292423598031472, + "grad_norm": 0.26918578147888184, + "learning_rate": 7.583805459771443e-05, + "loss": 1.1006, + "step": 5419 + }, + { + "epoch": 0.32930311683577373, + "grad_norm": 0.12470528483390808, + "learning_rate": 7.582985858941083e-05, + "loss": 1.0458, + "step": 5420 + }, + { + "epoch": 0.32936387386840027, + "grad_norm": 0.21023140847682953, + "learning_rate": 7.582166163430877e-05, + "loss": 1.0358, + "step": 5421 + }, + { + "epoch": 0.3294246309010268, + "grad_norm": 0.1549159735441208, + "learning_rate": 7.581346373270873e-05, + "loss": 1.1289, + "step": 5422 + }, + { + "epoch": 0.32948538793365334, + "grad_norm": 0.1667061597108841, + "learning_rate": 7.580526488491116e-05, + "loss": 1.0213, + "step": 5423 + }, + { + "epoch": 0.3295461449662798, + "grad_norm": 0.49645593762397766, + "learning_rate": 7.579706509121663e-05, + "loss": 1.3674, + "step": 5424 + }, + { + "epoch": 0.32960690199890635, + "grad_norm": 0.16318374872207642, + "learning_rate": 7.57888643519257e-05, + "loss": 1.0536, + "step": 5425 + }, + { + "epoch": 0.3296676590315329, + "grad_norm": 0.15818971395492554, + "learning_rate": 7.578066266733897e-05, + "loss": 1.0969, + "step": 5426 + }, + { + "epoch": 0.3297284160641594, + "grad_norm": 0.2040669322013855, + "learning_rate": 7.577246003775704e-05, + "loss": 1.0815, + "step": 5427 + }, + { + "epoch": 0.32978917309678596, + "grad_norm": 1.1522833108901978, + "learning_rate": 7.576425646348062e-05, + "loss": 1.043, + "step": 5428 + }, + { + "epoch": 0.3298499301294125, + "grad_norm": 0.1402571201324463, + "learning_rate": 7.575605194481039e-05, + "loss": 1.0392, + "step": 5429 + }, + { + "epoch": 0.32991068716203903, + "grad_norm": 0.23267050087451935, + "learning_rate": 7.574784648204711e-05, + "loss": 1.2631, + "step": 5430 + }, + { + "epoch": 0.3299714441946655, + "grad_norm": 0.16812367737293243, + "learning_rate": 7.573964007549155e-05, + "loss": 1.1362, + "step": 5431 + }, + { + "epoch": 0.33003220122729204, + "grad_norm": 5.5949835777282715, + "learning_rate": 7.573143272544449e-05, + "loss": 1.0358, + "step": 5432 + }, + { + "epoch": 0.3300929582599186, + "grad_norm": 0.31999778747558594, + "learning_rate": 7.572322443220678e-05, + "loss": 1.0605, + "step": 5433 + }, + { + "epoch": 0.3301537152925451, + "grad_norm": 0.14096160233020782, + "learning_rate": 7.571501519607933e-05, + "loss": 1.0424, + "step": 5434 + }, + { + "epoch": 0.33021447232517165, + "grad_norm": 0.23999495804309845, + "learning_rate": 7.570680501736303e-05, + "loss": 1.0938, + "step": 5435 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 0.2573355436325073, + "learning_rate": 7.56985938963588e-05, + "loss": 1.129, + "step": 5436 + }, + { + "epoch": 0.3303359863904247, + "grad_norm": 0.3784105181694031, + "learning_rate": 7.569038183336765e-05, + "loss": 1.2554, + "step": 5437 + }, + { + "epoch": 0.3303967434230512, + "grad_norm": 0.2289574295282364, + "learning_rate": 7.56821688286906e-05, + "loss": 1.1632, + "step": 5438 + }, + { + "epoch": 0.33045750045567773, + "grad_norm": 0.25765928626060486, + "learning_rate": 7.567395488262868e-05, + "loss": 1.1846, + "step": 5439 + }, + { + "epoch": 0.33051825748830427, + "grad_norm": 0.13408100605010986, + "learning_rate": 7.566573999548297e-05, + "loss": 1.0368, + "step": 5440 + }, + { + "epoch": 0.3305790145209308, + "grad_norm": 0.13287340104579926, + "learning_rate": 7.565752416755461e-05, + "loss": 1.0991, + "step": 5441 + }, + { + "epoch": 0.33063977155355734, + "grad_norm": 0.18916484713554382, + "learning_rate": 7.564930739914473e-05, + "loss": 1.127, + "step": 5442 + }, + { + "epoch": 0.3307005285861839, + "grad_norm": 0.23863853514194489, + "learning_rate": 7.564108969055455e-05, + "loss": 1.1935, + "step": 5443 + }, + { + "epoch": 0.33076128561881035, + "grad_norm": 0.2178945392370224, + "learning_rate": 7.563287104208526e-05, + "loss": 1.1345, + "step": 5444 + }, + { + "epoch": 0.3308220426514369, + "grad_norm": 0.3364085257053375, + "learning_rate": 7.562465145403813e-05, + "loss": 1.1097, + "step": 5445 + }, + { + "epoch": 0.3308827996840634, + "grad_norm": 0.19464822113513947, + "learning_rate": 7.561643092671444e-05, + "loss": 1.0984, + "step": 5446 + }, + { + "epoch": 0.33094355671668996, + "grad_norm": 0.20432879030704498, + "learning_rate": 7.560820946041554e-05, + "loss": 1.0489, + "step": 5447 + }, + { + "epoch": 0.3310043137493165, + "grad_norm": 0.3350868225097656, + "learning_rate": 7.559998705544277e-05, + "loss": 1.0911, + "step": 5448 + }, + { + "epoch": 0.331065070781943, + "grad_norm": 0.13863138854503632, + "learning_rate": 7.559176371209751e-05, + "loss": 1.0284, + "step": 5449 + }, + { + "epoch": 0.33112582781456956, + "grad_norm": 0.37578779458999634, + "learning_rate": 7.558353943068122e-05, + "loss": 1.1807, + "step": 5450 + }, + { + "epoch": 0.33118658484719604, + "grad_norm": 0.2311204969882965, + "learning_rate": 7.557531421149534e-05, + "loss": 1.1274, + "step": 5451 + }, + { + "epoch": 0.3312473418798226, + "grad_norm": 0.2685815393924713, + "learning_rate": 7.556708805484138e-05, + "loss": 1.0295, + "step": 5452 + }, + { + "epoch": 0.3313080989124491, + "grad_norm": 0.2088678628206253, + "learning_rate": 7.555886096102084e-05, + "loss": 1.1626, + "step": 5453 + }, + { + "epoch": 0.33136885594507565, + "grad_norm": 5.203686714172363, + "learning_rate": 7.555063293033533e-05, + "loss": 1.1302, + "step": 5454 + }, + { + "epoch": 0.3314296129777022, + "grad_norm": 0.2517106533050537, + "learning_rate": 7.554240396308643e-05, + "loss": 1.0603, + "step": 5455 + }, + { + "epoch": 0.3314903700103287, + "grad_norm": 0.19543345272541046, + "learning_rate": 7.553417405957579e-05, + "loss": 1.2118, + "step": 5456 + }, + { + "epoch": 0.3315511270429552, + "grad_norm": 0.1629648655653, + "learning_rate": 7.552594322010504e-05, + "loss": 1.1508, + "step": 5457 + }, + { + "epoch": 0.33161188407558173, + "grad_norm": 0.21248915791511536, + "learning_rate": 7.551771144497592e-05, + "loss": 1.0847, + "step": 5458 + }, + { + "epoch": 0.33167264110820827, + "grad_norm": 0.20575113594532013, + "learning_rate": 7.550947873449016e-05, + "loss": 1.1877, + "step": 5459 + }, + { + "epoch": 0.3317333981408348, + "grad_norm": 0.14594343304634094, + "learning_rate": 7.550124508894951e-05, + "loss": 1.0824, + "step": 5460 + }, + { + "epoch": 0.33179415517346134, + "grad_norm": 0.19440650939941406, + "learning_rate": 7.54930105086558e-05, + "loss": 1.0894, + "step": 5461 + }, + { + "epoch": 0.33185491220608787, + "grad_norm": 0.16481439769268036, + "learning_rate": 7.548477499391087e-05, + "loss": 1.1211, + "step": 5462 + }, + { + "epoch": 0.3319156692387144, + "grad_norm": 0.24972113966941833, + "learning_rate": 7.547653854501657e-05, + "loss": 1.0991, + "step": 5463 + }, + { + "epoch": 0.3319764262713409, + "grad_norm": 0.18423761427402496, + "learning_rate": 7.546830116227484e-05, + "loss": 1.1164, + "step": 5464 + }, + { + "epoch": 0.3320371833039674, + "grad_norm": 0.2262442708015442, + "learning_rate": 7.54600628459876e-05, + "loss": 1.1068, + "step": 5465 + }, + { + "epoch": 0.33209794033659396, + "grad_norm": 0.17784802615642548, + "learning_rate": 7.545182359645684e-05, + "loss": 1.113, + "step": 5466 + }, + { + "epoch": 0.3321586973692205, + "grad_norm": 0.2207585722208023, + "learning_rate": 7.544358341398457e-05, + "loss": 1.178, + "step": 5467 + }, + { + "epoch": 0.332219454401847, + "grad_norm": 0.19200880825519562, + "learning_rate": 7.543534229887281e-05, + "loss": 1.0855, + "step": 5468 + }, + { + "epoch": 0.33228021143447356, + "grad_norm": 0.1824488341808319, + "learning_rate": 7.54271002514237e-05, + "loss": 1.3626, + "step": 5469 + }, + { + "epoch": 0.33234096846710004, + "grad_norm": 0.22011467814445496, + "learning_rate": 7.541885727193928e-05, + "loss": 1.1189, + "step": 5470 + }, + { + "epoch": 0.3324017254997266, + "grad_norm": 0.33231163024902344, + "learning_rate": 7.541061336072175e-05, + "loss": 1.0944, + "step": 5471 + }, + { + "epoch": 0.3324624825323531, + "grad_norm": 0.18799728155136108, + "learning_rate": 7.540236851807327e-05, + "loss": 1.0891, + "step": 5472 + }, + { + "epoch": 0.33252323956497964, + "grad_norm": 0.5314054489135742, + "learning_rate": 7.539412274429605e-05, + "loss": 1.0746, + "step": 5473 + }, + { + "epoch": 0.3325839965976062, + "grad_norm": 0.13672508299350739, + "learning_rate": 7.538587603969238e-05, + "loss": 1.0563, + "step": 5474 + }, + { + "epoch": 0.3326447536302327, + "grad_norm": 0.30675119161605835, + "learning_rate": 7.53776284045645e-05, + "loss": 1.0091, + "step": 5475 + }, + { + "epoch": 0.33270551066285925, + "grad_norm": 0.3291470408439636, + "learning_rate": 7.536937983921474e-05, + "loss": 1.1303, + "step": 5476 + }, + { + "epoch": 0.33276626769548573, + "grad_norm": 0.1538793444633484, + "learning_rate": 7.536113034394547e-05, + "loss": 1.0574, + "step": 5477 + }, + { + "epoch": 0.33282702472811226, + "grad_norm": 0.2535040080547333, + "learning_rate": 7.535287991905906e-05, + "loss": 1.0811, + "step": 5478 + }, + { + "epoch": 0.3328877817607388, + "grad_norm": 0.17257514595985413, + "learning_rate": 7.534462856485793e-05, + "loss": 1.1203, + "step": 5479 + }, + { + "epoch": 0.33294853879336533, + "grad_norm": 0.22970615327358246, + "learning_rate": 7.533637628164455e-05, + "loss": 1.1531, + "step": 5480 + }, + { + "epoch": 0.33300929582599187, + "grad_norm": 0.4710615873336792, + "learning_rate": 7.532812306972139e-05, + "loss": 1.0932, + "step": 5481 + }, + { + "epoch": 0.3330700528586184, + "grad_norm": 0.17988619208335876, + "learning_rate": 7.531986892939101e-05, + "loss": 1.0801, + "step": 5482 + }, + { + "epoch": 0.33313080989124494, + "grad_norm": 0.19155313074588776, + "learning_rate": 7.531161386095591e-05, + "loss": 1.1033, + "step": 5483 + }, + { + "epoch": 0.3331915669238714, + "grad_norm": 0.17571823298931122, + "learning_rate": 7.530335786471871e-05, + "loss": 1.2097, + "step": 5484 + }, + { + "epoch": 0.33325232395649795, + "grad_norm": 3.874110221862793, + "learning_rate": 7.529510094098207e-05, + "loss": 1.0823, + "step": 5485 + }, + { + "epoch": 0.3333130809891245, + "grad_norm": 1.1709591150283813, + "learning_rate": 7.528684309004859e-05, + "loss": 1.2051, + "step": 5486 + }, + { + "epoch": 0.333373838021751, + "grad_norm": 0.22638088464736938, + "learning_rate": 7.527858431222098e-05, + "loss": 1.1015, + "step": 5487 + }, + { + "epoch": 0.33343459505437756, + "grad_norm": 0.15556208789348602, + "learning_rate": 7.527032460780201e-05, + "loss": 1.085, + "step": 5488 + }, + { + "epoch": 0.3334953520870041, + "grad_norm": 0.18091131746768951, + "learning_rate": 7.526206397709437e-05, + "loss": 1.1757, + "step": 5489 + }, + { + "epoch": 0.3335561091196306, + "grad_norm": 0.1496092528104782, + "learning_rate": 7.525380242040092e-05, + "loss": 1.0332, + "step": 5490 + }, + { + "epoch": 0.3336168661522571, + "grad_norm": 0.15536093711853027, + "learning_rate": 7.524553993802443e-05, + "loss": 1.1971, + "step": 5491 + }, + { + "epoch": 0.33367762318488364, + "grad_norm": 0.25183531641960144, + "learning_rate": 7.523727653026783e-05, + "loss": 1.1268, + "step": 5492 + }, + { + "epoch": 0.3337383802175102, + "grad_norm": 0.15332481265068054, + "learning_rate": 7.522901219743396e-05, + "loss": 1.1097, + "step": 5493 + }, + { + "epoch": 0.3337991372501367, + "grad_norm": 0.16587772965431213, + "learning_rate": 7.522074693982577e-05, + "loss": 1.1238, + "step": 5494 + }, + { + "epoch": 0.33385989428276325, + "grad_norm": 0.21817606687545776, + "learning_rate": 7.521248075774623e-05, + "loss": 1.0806, + "step": 5495 + }, + { + "epoch": 0.3339206513153898, + "grad_norm": 0.371785044670105, + "learning_rate": 7.520421365149834e-05, + "loss": 1.1514, + "step": 5496 + }, + { + "epoch": 0.33398140834801626, + "grad_norm": 0.13117551803588867, + "learning_rate": 7.519594562138511e-05, + "loss": 1.0224, + "step": 5497 + }, + { + "epoch": 0.3340421653806428, + "grad_norm": 0.13245713710784912, + "learning_rate": 7.518767666770965e-05, + "loss": 1.0559, + "step": 5498 + }, + { + "epoch": 0.33410292241326933, + "grad_norm": 0.1396120935678482, + "learning_rate": 7.5179406790775e-05, + "loss": 1.0532, + "step": 5499 + }, + { + "epoch": 0.33416367944589587, + "grad_norm": 0.15593959391117096, + "learning_rate": 7.517113599088434e-05, + "loss": 1.0197, + "step": 5500 + }, + { + "epoch": 0.3342244364785224, + "grad_norm": 0.11808733642101288, + "learning_rate": 7.516286426834084e-05, + "loss": 1.0396, + "step": 5501 + }, + { + "epoch": 0.33428519351114894, + "grad_norm": 0.21038290858268738, + "learning_rate": 7.515459162344767e-05, + "loss": 1.1596, + "step": 5502 + }, + { + "epoch": 0.3343459505437754, + "grad_norm": 0.1895390748977661, + "learning_rate": 7.51463180565081e-05, + "loss": 1.1275, + "step": 5503 + }, + { + "epoch": 0.33440670757640195, + "grad_norm": 0.2504735291004181, + "learning_rate": 7.513804356782536e-05, + "loss": 1.0489, + "step": 5504 + }, + { + "epoch": 0.3344674646090285, + "grad_norm": 2.3111376762390137, + "learning_rate": 7.512976815770279e-05, + "loss": 1.0479, + "step": 5505 + }, + { + "epoch": 0.334528221641655, + "grad_norm": 12.76346206665039, + "learning_rate": 7.512149182644371e-05, + "loss": 1.0611, + "step": 5506 + }, + { + "epoch": 0.33458897867428156, + "grad_norm": 0.2044251561164856, + "learning_rate": 7.511321457435145e-05, + "loss": 1.0564, + "step": 5507 + }, + { + "epoch": 0.3346497357069081, + "grad_norm": 0.22237356007099152, + "learning_rate": 7.51049364017295e-05, + "loss": 1.1788, + "step": 5508 + }, + { + "epoch": 0.3347104927395346, + "grad_norm": 0.28337711095809937, + "learning_rate": 7.509665730888125e-05, + "loss": 1.1912, + "step": 5509 + }, + { + "epoch": 0.3347712497721611, + "grad_norm": 0.20663613080978394, + "learning_rate": 7.508837729611016e-05, + "loss": 1.0964, + "step": 5510 + }, + { + "epoch": 0.33483200680478764, + "grad_norm": 0.23966573178768158, + "learning_rate": 7.508009636371976e-05, + "loss": 1.0412, + "step": 5511 + }, + { + "epoch": 0.3348927638374142, + "grad_norm": 0.25408032536506653, + "learning_rate": 7.507181451201357e-05, + "loss": 1.1001, + "step": 5512 + }, + { + "epoch": 0.3349535208700407, + "grad_norm": 1.6409603357315063, + "learning_rate": 7.506353174129518e-05, + "loss": 1.1754, + "step": 5513 + }, + { + "epoch": 0.33501427790266725, + "grad_norm": 0.13990652561187744, + "learning_rate": 7.505524805186821e-05, + "loss": 1.0497, + "step": 5514 + }, + { + "epoch": 0.3350750349352938, + "grad_norm": 0.20568786561489105, + "learning_rate": 7.504696344403626e-05, + "loss": 1.0818, + "step": 5515 + }, + { + "epoch": 0.33513579196792026, + "grad_norm": 0.17626319825649261, + "learning_rate": 7.503867791810302e-05, + "loss": 1.1946, + "step": 5516 + }, + { + "epoch": 0.3351965490005468, + "grad_norm": 0.478924959897995, + "learning_rate": 7.503039147437218e-05, + "loss": 1.1956, + "step": 5517 + }, + { + "epoch": 0.33525730603317333, + "grad_norm": 0.30180832743644714, + "learning_rate": 7.502210411314755e-05, + "loss": 1.0471, + "step": 5518 + }, + { + "epoch": 0.33531806306579987, + "grad_norm": 0.2921055257320404, + "learning_rate": 7.501381583473284e-05, + "loss": 1.0678, + "step": 5519 + }, + { + "epoch": 0.3353788200984264, + "grad_norm": 0.2026301473379135, + "learning_rate": 7.500552663943186e-05, + "loss": 1.1336, + "step": 5520 + }, + { + "epoch": 0.33543957713105294, + "grad_norm": 0.1825183480978012, + "learning_rate": 7.499723652754847e-05, + "loss": 1.1394, + "step": 5521 + }, + { + "epoch": 0.33550033416367947, + "grad_norm": 0.1588670313358307, + "learning_rate": 7.498894549938656e-05, + "loss": 1.0613, + "step": 5522 + }, + { + "epoch": 0.33556109119630595, + "grad_norm": 0.17803405225276947, + "learning_rate": 7.498065355525002e-05, + "loss": 1.0756, + "step": 5523 + }, + { + "epoch": 0.3356218482289325, + "grad_norm": 0.15975968539714813, + "learning_rate": 7.497236069544278e-05, + "loss": 1.1229, + "step": 5524 + }, + { + "epoch": 0.335682605261559, + "grad_norm": 0.21353621780872345, + "learning_rate": 7.496406692026884e-05, + "loss": 1.0469, + "step": 5525 + }, + { + "epoch": 0.33574336229418555, + "grad_norm": 0.2091209590435028, + "learning_rate": 7.49557722300322e-05, + "loss": 1.1089, + "step": 5526 + }, + { + "epoch": 0.3358041193268121, + "grad_norm": 0.35379528999328613, + "learning_rate": 7.494747662503691e-05, + "loss": 1.0785, + "step": 5527 + }, + { + "epoch": 0.3358648763594386, + "grad_norm": 0.16948559880256653, + "learning_rate": 7.493918010558704e-05, + "loss": 1.0616, + "step": 5528 + }, + { + "epoch": 0.3359256333920651, + "grad_norm": 0.13938705623149872, + "learning_rate": 7.49308826719867e-05, + "loss": 1.0721, + "step": 5529 + }, + { + "epoch": 0.33598639042469164, + "grad_norm": 0.1933734118938446, + "learning_rate": 7.492258432454002e-05, + "loss": 1.1133, + "step": 5530 + }, + { + "epoch": 0.3360471474573182, + "grad_norm": 0.15812847018241882, + "learning_rate": 7.491428506355122e-05, + "loss": 1.0783, + "step": 5531 + }, + { + "epoch": 0.3361079044899447, + "grad_norm": 0.14268743991851807, + "learning_rate": 7.490598488932449e-05, + "loss": 1.0934, + "step": 5532 + }, + { + "epoch": 0.33616866152257124, + "grad_norm": 0.21588250994682312, + "learning_rate": 7.489768380216405e-05, + "loss": 1.2146, + "step": 5533 + }, + { + "epoch": 0.3362294185551978, + "grad_norm": 0.11810000240802765, + "learning_rate": 7.48893818023742e-05, + "loss": 1.0105, + "step": 5534 + }, + { + "epoch": 0.3362901755878243, + "grad_norm": 0.13542866706848145, + "learning_rate": 7.488107889025926e-05, + "loss": 1.0928, + "step": 5535 + }, + { + "epoch": 0.3363509326204508, + "grad_norm": 0.3417040705680847, + "learning_rate": 7.487277506612356e-05, + "loss": 1.1799, + "step": 5536 + }, + { + "epoch": 0.33641168965307733, + "grad_norm": 0.25541195273399353, + "learning_rate": 7.486447033027148e-05, + "loss": 1.1153, + "step": 5537 + }, + { + "epoch": 0.33647244668570386, + "grad_norm": 0.27999597787857056, + "learning_rate": 7.485616468300743e-05, + "loss": 1.1655, + "step": 5538 + }, + { + "epoch": 0.3365332037183304, + "grad_norm": 0.3260159492492676, + "learning_rate": 7.484785812463587e-05, + "loss": 1.1765, + "step": 5539 + }, + { + "epoch": 0.33659396075095693, + "grad_norm": 0.1697273850440979, + "learning_rate": 7.483955065546126e-05, + "loss": 1.1354, + "step": 5540 + }, + { + "epoch": 0.33665471778358347, + "grad_norm": 0.2634475827217102, + "learning_rate": 7.483124227578811e-05, + "loss": 1.2169, + "step": 5541 + }, + { + "epoch": 0.33671547481621, + "grad_norm": 0.18889199197292328, + "learning_rate": 7.482293298592098e-05, + "loss": 1.0853, + "step": 5542 + }, + { + "epoch": 0.3367762318488365, + "grad_norm": 0.12441056221723557, + "learning_rate": 7.481462278616444e-05, + "loss": 1.0495, + "step": 5543 + }, + { + "epoch": 0.336836988881463, + "grad_norm": 0.18259049952030182, + "learning_rate": 7.480631167682312e-05, + "loss": 1.2154, + "step": 5544 + }, + { + "epoch": 0.33689774591408955, + "grad_norm": 0.17367684841156006, + "learning_rate": 7.479799965820164e-05, + "loss": 1.0803, + "step": 5545 + }, + { + "epoch": 0.3369585029467161, + "grad_norm": 0.20125249028205872, + "learning_rate": 7.478968673060469e-05, + "loss": 1.1825, + "step": 5546 + }, + { + "epoch": 0.3370192599793426, + "grad_norm": 0.1303924173116684, + "learning_rate": 7.478137289433699e-05, + "loss": 1.0362, + "step": 5547 + }, + { + "epoch": 0.33708001701196916, + "grad_norm": 0.16618074476718903, + "learning_rate": 7.477305814970327e-05, + "loss": 1.0664, + "step": 5548 + }, + { + "epoch": 0.33714077404459564, + "grad_norm": 0.2883400022983551, + "learning_rate": 7.476474249700831e-05, + "loss": 1.1497, + "step": 5549 + }, + { + "epoch": 0.33720153107722217, + "grad_norm": 0.2781234383583069, + "learning_rate": 7.475642593655694e-05, + "loss": 1.1587, + "step": 5550 + }, + { + "epoch": 0.3372622881098487, + "grad_norm": 0.2012518048286438, + "learning_rate": 7.474810846865396e-05, + "loss": 1.1081, + "step": 5551 + }, + { + "epoch": 0.33732304514247524, + "grad_norm": 0.1505262851715088, + "learning_rate": 7.473979009360433e-05, + "loss": 1.0729, + "step": 5552 + }, + { + "epoch": 0.3373838021751018, + "grad_norm": 0.17856548726558685, + "learning_rate": 7.473147081171287e-05, + "loss": 1.1449, + "step": 5553 + }, + { + "epoch": 0.3374445592077283, + "grad_norm": 0.21609683334827423, + "learning_rate": 7.472315062328459e-05, + "loss": 1.2114, + "step": 5554 + }, + { + "epoch": 0.33750531624035485, + "grad_norm": 0.14103642106056213, + "learning_rate": 7.471482952862444e-05, + "loss": 1.0796, + "step": 5555 + }, + { + "epoch": 0.3375660732729813, + "grad_norm": 0.13603602349758148, + "learning_rate": 7.470650752803743e-05, + "loss": 1.0575, + "step": 5556 + }, + { + "epoch": 0.33762683030560786, + "grad_norm": 0.30538976192474365, + "learning_rate": 7.46981846218286e-05, + "loss": 1.2169, + "step": 5557 + }, + { + "epoch": 0.3376875873382344, + "grad_norm": 1.272532343864441, + "learning_rate": 7.468986081030307e-05, + "loss": 1.0851, + "step": 5558 + }, + { + "epoch": 0.33774834437086093, + "grad_norm": 0.33729514479637146, + "learning_rate": 7.468153609376591e-05, + "loss": 1.0894, + "step": 5559 + }, + { + "epoch": 0.33780910140348747, + "grad_norm": 0.2312454879283905, + "learning_rate": 7.467321047252226e-05, + "loss": 1.1492, + "step": 5560 + }, + { + "epoch": 0.337869858436114, + "grad_norm": 0.16546347737312317, + "learning_rate": 7.466488394687731e-05, + "loss": 1.0618, + "step": 5561 + }, + { + "epoch": 0.3379306154687405, + "grad_norm": 0.5239700078964233, + "learning_rate": 7.465655651713627e-05, + "loss": 1.1053, + "step": 5562 + }, + { + "epoch": 0.337991372501367, + "grad_norm": 0.22329890727996826, + "learning_rate": 7.464822818360441e-05, + "loss": 1.0917, + "step": 5563 + }, + { + "epoch": 0.33805212953399355, + "grad_norm": 0.27938318252563477, + "learning_rate": 7.463989894658696e-05, + "loss": 1.2364, + "step": 5564 + }, + { + "epoch": 0.3381128865666201, + "grad_norm": 0.15419381856918335, + "learning_rate": 7.463156880638926e-05, + "loss": 1.0473, + "step": 5565 + }, + { + "epoch": 0.3381736435992466, + "grad_norm": 1.407528042793274, + "learning_rate": 7.462323776331664e-05, + "loss": 1.1546, + "step": 5566 + }, + { + "epoch": 0.33823440063187316, + "grad_norm": 0.31853002309799194, + "learning_rate": 7.461490581767448e-05, + "loss": 1.09, + "step": 5567 + }, + { + "epoch": 0.3382951576644997, + "grad_norm": 0.1421099603176117, + "learning_rate": 7.460657296976819e-05, + "loss": 1.057, + "step": 5568 + }, + { + "epoch": 0.33835591469712617, + "grad_norm": 0.1373155266046524, + "learning_rate": 7.459823921990321e-05, + "loss": 1.0674, + "step": 5569 + }, + { + "epoch": 0.3384166717297527, + "grad_norm": 0.15903204679489136, + "learning_rate": 7.458990456838501e-05, + "loss": 1.0691, + "step": 5570 + }, + { + "epoch": 0.33847742876237924, + "grad_norm": 0.14871279895305634, + "learning_rate": 7.458156901551912e-05, + "loss": 1.0473, + "step": 5571 + }, + { + "epoch": 0.3385381857950058, + "grad_norm": 0.33478689193725586, + "learning_rate": 7.457323256161106e-05, + "loss": 1.1369, + "step": 5572 + }, + { + "epoch": 0.3385989428276323, + "grad_norm": 0.1887255311012268, + "learning_rate": 7.456489520696642e-05, + "loss": 1.0479, + "step": 5573 + }, + { + "epoch": 0.33865969986025884, + "grad_norm": 0.2735128700733185, + "learning_rate": 7.455655695189077e-05, + "loss": 1.104, + "step": 5574 + }, + { + "epoch": 0.3387204568928853, + "grad_norm": 0.23236620426177979, + "learning_rate": 7.454821779668981e-05, + "loss": 1.0628, + "step": 5575 + }, + { + "epoch": 0.33878121392551186, + "grad_norm": 0.16397589445114136, + "learning_rate": 7.453987774166917e-05, + "loss": 1.1087, + "step": 5576 + }, + { + "epoch": 0.3388419709581384, + "grad_norm": 0.18313263356685638, + "learning_rate": 7.453153678713455e-05, + "loss": 1.1311, + "step": 5577 + }, + { + "epoch": 0.33890272799076493, + "grad_norm": 0.8418211340904236, + "learning_rate": 7.452319493339173e-05, + "loss": 1.2568, + "step": 5578 + }, + { + "epoch": 0.33896348502339146, + "grad_norm": 0.23657409846782684, + "learning_rate": 7.451485218074646e-05, + "loss": 1.083, + "step": 5579 + }, + { + "epoch": 0.339024242056018, + "grad_norm": 2.726797580718994, + "learning_rate": 7.450650852950452e-05, + "loss": 1.0701, + "step": 5580 + }, + { + "epoch": 0.33908499908864453, + "grad_norm": 0.23161150515079498, + "learning_rate": 7.44981639799718e-05, + "loss": 1.1538, + "step": 5581 + }, + { + "epoch": 0.339145756121271, + "grad_norm": 1.5758064985275269, + "learning_rate": 7.448981853245414e-05, + "loss": 1.309, + "step": 5582 + }, + { + "epoch": 0.33920651315389755, + "grad_norm": 0.3868856728076935, + "learning_rate": 7.448147218725744e-05, + "loss": 1.169, + "step": 5583 + }, + { + "epoch": 0.3392672701865241, + "grad_norm": 0.166421577334404, + "learning_rate": 7.447312494468764e-05, + "loss": 1.0814, + "step": 5584 + }, + { + "epoch": 0.3393280272191506, + "grad_norm": 1.2591252326965332, + "learning_rate": 7.446477680505073e-05, + "loss": 1.1229, + "step": 5585 + }, + { + "epoch": 0.33938878425177715, + "grad_norm": 0.18639180064201355, + "learning_rate": 7.445642776865269e-05, + "loss": 1.0656, + "step": 5586 + }, + { + "epoch": 0.3394495412844037, + "grad_norm": 0.22079963982105255, + "learning_rate": 7.444807783579955e-05, + "loss": 1.0502, + "step": 5587 + }, + { + "epoch": 0.3395102983170302, + "grad_norm": 0.1429029405117035, + "learning_rate": 7.44397270067974e-05, + "loss": 1.1084, + "step": 5588 + }, + { + "epoch": 0.3395710553496567, + "grad_norm": 0.2187453806400299, + "learning_rate": 7.443137528195234e-05, + "loss": 1.0849, + "step": 5589 + }, + { + "epoch": 0.33963181238228324, + "grad_norm": 0.2235606610774994, + "learning_rate": 7.442302266157048e-05, + "loss": 1.0794, + "step": 5590 + }, + { + "epoch": 0.3396925694149098, + "grad_norm": 0.4305715560913086, + "learning_rate": 7.441466914595803e-05, + "loss": 1.096, + "step": 5591 + }, + { + "epoch": 0.3397533264475363, + "grad_norm": 0.19101032614707947, + "learning_rate": 7.440631473542115e-05, + "loss": 1.203, + "step": 5592 + }, + { + "epoch": 0.33981408348016284, + "grad_norm": 0.17606954276561737, + "learning_rate": 7.439795943026608e-05, + "loss": 1.051, + "step": 5593 + }, + { + "epoch": 0.3398748405127894, + "grad_norm": 0.4550893306732178, + "learning_rate": 7.43896032307991e-05, + "loss": 1.1026, + "step": 5594 + }, + { + "epoch": 0.33993559754541586, + "grad_norm": 0.12969236075878143, + "learning_rate": 7.438124613732648e-05, + "loss": 1.0219, + "step": 5595 + }, + { + "epoch": 0.3399963545780424, + "grad_norm": 0.2768893837928772, + "learning_rate": 7.43728881501546e-05, + "loss": 1.1329, + "step": 5596 + }, + { + "epoch": 0.3400571116106689, + "grad_norm": 0.24179022014141083, + "learning_rate": 7.436452926958978e-05, + "loss": 1.1184, + "step": 5597 + }, + { + "epoch": 0.34011786864329546, + "grad_norm": 0.8925986886024475, + "learning_rate": 7.435616949593842e-05, + "loss": 1.0757, + "step": 5598 + }, + { + "epoch": 0.340178625675922, + "grad_norm": 0.13775081932544708, + "learning_rate": 7.434780882950698e-05, + "loss": 1.0459, + "step": 5599 + }, + { + "epoch": 0.34023938270854853, + "grad_norm": 0.28351345658302307, + "learning_rate": 7.433944727060187e-05, + "loss": 1.1884, + "step": 5600 + }, + { + "epoch": 0.34030013974117507, + "grad_norm": 0.7399532794952393, + "learning_rate": 7.433108481952965e-05, + "loss": 1.2486, + "step": 5601 + }, + { + "epoch": 0.34036089677380155, + "grad_norm": 0.16441327333450317, + "learning_rate": 7.432272147659678e-05, + "loss": 1.0957, + "step": 5602 + }, + { + "epoch": 0.3404216538064281, + "grad_norm": 0.163296177983284, + "learning_rate": 7.431435724210988e-05, + "loss": 1.1066, + "step": 5603 + }, + { + "epoch": 0.3404824108390546, + "grad_norm": 0.15724974870681763, + "learning_rate": 7.43059921163755e-05, + "loss": 1.0814, + "step": 5604 + }, + { + "epoch": 0.34054316787168115, + "grad_norm": 0.18417984247207642, + "learning_rate": 7.42976260997003e-05, + "loss": 1.0665, + "step": 5605 + }, + { + "epoch": 0.3406039249043077, + "grad_norm": 0.2278042435646057, + "learning_rate": 7.428925919239089e-05, + "loss": 1.0832, + "step": 5606 + }, + { + "epoch": 0.3406646819369342, + "grad_norm": 0.25080230832099915, + "learning_rate": 7.4280891394754e-05, + "loss": 1.1882, + "step": 5607 + }, + { + "epoch": 0.3407254389695607, + "grad_norm": 0.12096163630485535, + "learning_rate": 7.427252270709635e-05, + "loss": 1.0625, + "step": 5608 + }, + { + "epoch": 0.34078619600218724, + "grad_norm": 0.13884446024894714, + "learning_rate": 7.426415312972469e-05, + "loss": 1.0871, + "step": 5609 + }, + { + "epoch": 0.34084695303481377, + "grad_norm": 0.19591939449310303, + "learning_rate": 7.425578266294579e-05, + "loss": 1.1022, + "step": 5610 + }, + { + "epoch": 0.3409077100674403, + "grad_norm": 0.23972727358341217, + "learning_rate": 7.42474113070665e-05, + "loss": 1.1176, + "step": 5611 + }, + { + "epoch": 0.34096846710006684, + "grad_norm": 0.19567696750164032, + "learning_rate": 7.423903906239367e-05, + "loss": 1.0735, + "step": 5612 + }, + { + "epoch": 0.3410292241326934, + "grad_norm": 0.18039634823799133, + "learning_rate": 7.423066592923417e-05, + "loss": 1.0964, + "step": 5613 + }, + { + "epoch": 0.3410899811653199, + "grad_norm": 0.24264463782310486, + "learning_rate": 7.422229190789492e-05, + "loss": 1.0596, + "step": 5614 + }, + { + "epoch": 0.3411507381979464, + "grad_norm": 0.2057594507932663, + "learning_rate": 7.421391699868288e-05, + "loss": 1.0339, + "step": 5615 + }, + { + "epoch": 0.3412114952305729, + "grad_norm": 0.1615353524684906, + "learning_rate": 7.420554120190504e-05, + "loss": 1.0968, + "step": 5616 + }, + { + "epoch": 0.34127225226319946, + "grad_norm": 0.22651933133602142, + "learning_rate": 7.41971645178684e-05, + "loss": 1.0786, + "step": 5617 + }, + { + "epoch": 0.341333009295826, + "grad_norm": 0.19685249030590057, + "learning_rate": 7.418878694688002e-05, + "loss": 1.1282, + "step": 5618 + }, + { + "epoch": 0.34139376632845253, + "grad_norm": 0.1910761445760727, + "learning_rate": 7.418040848924698e-05, + "loss": 1.1786, + "step": 5619 + }, + { + "epoch": 0.34145452336107907, + "grad_norm": 0.20940546691417694, + "learning_rate": 7.41720291452764e-05, + "loss": 1.1485, + "step": 5620 + }, + { + "epoch": 0.34151528039370554, + "grad_norm": 0.13719259202480316, + "learning_rate": 7.416364891527541e-05, + "loss": 1.0778, + "step": 5621 + }, + { + "epoch": 0.3415760374263321, + "grad_norm": 0.13285192847251892, + "learning_rate": 7.415526779955121e-05, + "loss": 1.0748, + "step": 5622 + }, + { + "epoch": 0.3416367944589586, + "grad_norm": 3.7748587131500244, + "learning_rate": 7.414688579841098e-05, + "loss": 1.1041, + "step": 5623 + }, + { + "epoch": 0.34169755149158515, + "grad_norm": 0.14596666395664215, + "learning_rate": 7.4138502912162e-05, + "loss": 1.1143, + "step": 5624 + }, + { + "epoch": 0.3417583085242117, + "grad_norm": 0.15469378232955933, + "learning_rate": 7.413011914111152e-05, + "loss": 1.0945, + "step": 5625 + }, + { + "epoch": 0.3418190655568382, + "grad_norm": 0.808948814868927, + "learning_rate": 7.412173448556687e-05, + "loss": 1.2254, + "step": 5626 + }, + { + "epoch": 0.34187982258946475, + "grad_norm": 0.5553398132324219, + "learning_rate": 7.411334894583538e-05, + "loss": 1.138, + "step": 5627 + }, + { + "epoch": 0.34194057962209123, + "grad_norm": 0.146551713347435, + "learning_rate": 7.41049625222244e-05, + "loss": 1.0603, + "step": 5628 + }, + { + "epoch": 0.34200133665471777, + "grad_norm": 0.4100880026817322, + "learning_rate": 7.409657521504137e-05, + "loss": 1.1189, + "step": 5629 + }, + { + "epoch": 0.3420620936873443, + "grad_norm": 0.16061285138130188, + "learning_rate": 7.408818702459374e-05, + "loss": 1.0913, + "step": 5630 + }, + { + "epoch": 0.34212285071997084, + "grad_norm": 0.27554699778556824, + "learning_rate": 7.407979795118892e-05, + "loss": 1.2068, + "step": 5631 + }, + { + "epoch": 0.3421836077525974, + "grad_norm": 0.14766065776348114, + "learning_rate": 7.407140799513449e-05, + "loss": 1.0972, + "step": 5632 + }, + { + "epoch": 0.3422443647852239, + "grad_norm": 0.19677849113941193, + "learning_rate": 7.406301715673793e-05, + "loss": 1.0682, + "step": 5633 + }, + { + "epoch": 0.3423051218178504, + "grad_norm": 0.26291149854660034, + "learning_rate": 7.405462543630681e-05, + "loss": 1.159, + "step": 5634 + }, + { + "epoch": 0.3423658788504769, + "grad_norm": 0.18334637582302094, + "learning_rate": 7.404623283414876e-05, + "loss": 1.0782, + "step": 5635 + }, + { + "epoch": 0.34242663588310346, + "grad_norm": 0.16423004865646362, + "learning_rate": 7.40378393505714e-05, + "loss": 1.0463, + "step": 5636 + }, + { + "epoch": 0.34248739291573, + "grad_norm": 0.2510235607624054, + "learning_rate": 7.402944498588239e-05, + "loss": 1.1497, + "step": 5637 + }, + { + "epoch": 0.34254814994835653, + "grad_norm": 0.24117280542850494, + "learning_rate": 7.402104974038943e-05, + "loss": 1.173, + "step": 5638 + }, + { + "epoch": 0.34260890698098306, + "grad_norm": 0.16989333927631378, + "learning_rate": 7.401265361440026e-05, + "loss": 1.1391, + "step": 5639 + }, + { + "epoch": 0.3426696640136096, + "grad_norm": 0.2275887280702591, + "learning_rate": 7.40042566082226e-05, + "loss": 1.0677, + "step": 5640 + }, + { + "epoch": 0.3427304210462361, + "grad_norm": 0.17539876699447632, + "learning_rate": 7.39958587221643e-05, + "loss": 1.1916, + "step": 5641 + }, + { + "epoch": 0.3427911780788626, + "grad_norm": 3.434662342071533, + "learning_rate": 7.398745995653316e-05, + "loss": 1.0947, + "step": 5642 + }, + { + "epoch": 0.34285193511148915, + "grad_norm": 0.3420221507549286, + "learning_rate": 7.397906031163702e-05, + "loss": 1.1176, + "step": 5643 + }, + { + "epoch": 0.3429126921441157, + "grad_norm": 4.487304210662842, + "learning_rate": 7.397065978778379e-05, + "loss": 1.151, + "step": 5644 + }, + { + "epoch": 0.3429734491767422, + "grad_norm": 0.17696045339107513, + "learning_rate": 7.396225838528141e-05, + "loss": 1.0733, + "step": 5645 + }, + { + "epoch": 0.34303420620936875, + "grad_norm": 0.22626429796218872, + "learning_rate": 7.395385610443782e-05, + "loss": 1.1691, + "step": 5646 + }, + { + "epoch": 0.3430949632419953, + "grad_norm": 0.1350768655538559, + "learning_rate": 7.394545294556098e-05, + "loss": 1.0289, + "step": 5647 + }, + { + "epoch": 0.34315572027462177, + "grad_norm": 0.17733575403690338, + "learning_rate": 7.393704890895895e-05, + "loss": 1.0973, + "step": 5648 + }, + { + "epoch": 0.3432164773072483, + "grad_norm": 0.2509138584136963, + "learning_rate": 7.392864399493976e-05, + "loss": 1.2118, + "step": 5649 + }, + { + "epoch": 0.34327723433987484, + "grad_norm": 0.3420547544956207, + "learning_rate": 7.39202382038115e-05, + "loss": 1.2192, + "step": 5650 + }, + { + "epoch": 0.34333799137250137, + "grad_norm": 0.1549650877714157, + "learning_rate": 7.39118315358823e-05, + "loss": 1.1308, + "step": 5651 + }, + { + "epoch": 0.3433987484051279, + "grad_norm": 0.2076931744813919, + "learning_rate": 7.390342399146027e-05, + "loss": 1.1029, + "step": 5652 + }, + { + "epoch": 0.34345950543775444, + "grad_norm": 0.14477044343948364, + "learning_rate": 7.389501557085362e-05, + "loss": 1.0735, + "step": 5653 + }, + { + "epoch": 0.3435202624703809, + "grad_norm": 0.2162570059299469, + "learning_rate": 7.388660627437055e-05, + "loss": 1.1981, + "step": 5654 + }, + { + "epoch": 0.34358101950300746, + "grad_norm": 0.1991337090730667, + "learning_rate": 7.387819610231932e-05, + "loss": 1.1393, + "step": 5655 + }, + { + "epoch": 0.343641776535634, + "grad_norm": 0.34177350997924805, + "learning_rate": 7.386978505500822e-05, + "loss": 1.0783, + "step": 5656 + }, + { + "epoch": 0.3437025335682605, + "grad_norm": 0.16313883662223816, + "learning_rate": 7.386137313274548e-05, + "loss": 1.1165, + "step": 5657 + }, + { + "epoch": 0.34376329060088706, + "grad_norm": 0.21096296608448029, + "learning_rate": 7.385296033583953e-05, + "loss": 1.1561, + "step": 5658 + }, + { + "epoch": 0.3438240476335136, + "grad_norm": 0.33268654346466064, + "learning_rate": 7.38445466645987e-05, + "loss": 1.067, + "step": 5659 + }, + { + "epoch": 0.34388480466614013, + "grad_norm": 0.14179377257823944, + "learning_rate": 7.383613211933141e-05, + "loss": 1.0873, + "step": 5660 + }, + { + "epoch": 0.3439455616987666, + "grad_norm": 0.19833813607692719, + "learning_rate": 7.382771670034608e-05, + "loss": 1.1134, + "step": 5661 + }, + { + "epoch": 0.34400631873139315, + "grad_norm": 0.4791829288005829, + "learning_rate": 7.381930040795119e-05, + "loss": 1.2737, + "step": 5662 + }, + { + "epoch": 0.3440670757640197, + "grad_norm": 0.4864122271537781, + "learning_rate": 7.381088324245525e-05, + "loss": 1.0829, + "step": 5663 + }, + { + "epoch": 0.3441278327966462, + "grad_norm": 0.16897287964820862, + "learning_rate": 7.380246520416678e-05, + "loss": 1.0861, + "step": 5664 + }, + { + "epoch": 0.34418858982927275, + "grad_norm": 0.2143501490354538, + "learning_rate": 7.379404629339433e-05, + "loss": 1.0881, + "step": 5665 + }, + { + "epoch": 0.3442493468618993, + "grad_norm": 0.3365273177623749, + "learning_rate": 7.378562651044653e-05, + "loss": 1.1237, + "step": 5666 + }, + { + "epoch": 0.34431010389452577, + "grad_norm": 0.8202298879623413, + "learning_rate": 7.377720585563199e-05, + "loss": 1.1335, + "step": 5667 + }, + { + "epoch": 0.3443708609271523, + "grad_norm": 0.16889657080173492, + "learning_rate": 7.376878432925938e-05, + "loss": 1.1591, + "step": 5668 + }, + { + "epoch": 0.34443161795977884, + "grad_norm": 0.15688326954841614, + "learning_rate": 7.376036193163738e-05, + "loss": 1.0354, + "step": 5669 + }, + { + "epoch": 0.34449237499240537, + "grad_norm": 0.21872282028198242, + "learning_rate": 7.375193866307471e-05, + "loss": 1.1424, + "step": 5670 + }, + { + "epoch": 0.3445531320250319, + "grad_norm": 0.20044030249118805, + "learning_rate": 7.374351452388016e-05, + "loss": 1.1177, + "step": 5671 + }, + { + "epoch": 0.34461388905765844, + "grad_norm": 0.47033408284187317, + "learning_rate": 7.373508951436247e-05, + "loss": 1.2517, + "step": 5672 + }, + { + "epoch": 0.344674646090285, + "grad_norm": 1.573832631111145, + "learning_rate": 7.37266636348305e-05, + "loss": 1.0547, + "step": 5673 + }, + { + "epoch": 0.34473540312291145, + "grad_norm": 0.1649778038263321, + "learning_rate": 7.371823688559308e-05, + "loss": 1.0617, + "step": 5674 + }, + { + "epoch": 0.344796160155538, + "grad_norm": 0.30305325984954834, + "learning_rate": 7.37098092669591e-05, + "loss": 1.1152, + "step": 5675 + }, + { + "epoch": 0.3448569171881645, + "grad_norm": 0.19876450300216675, + "learning_rate": 7.37013807792375e-05, + "loss": 1.0866, + "step": 5676 + }, + { + "epoch": 0.34491767422079106, + "grad_norm": 0.27401667833328247, + "learning_rate": 7.36929514227372e-05, + "loss": 1.0587, + "step": 5677 + }, + { + "epoch": 0.3449784312534176, + "grad_norm": 0.1567569524049759, + "learning_rate": 7.368452119776716e-05, + "loss": 1.0622, + "step": 5678 + }, + { + "epoch": 0.34503918828604413, + "grad_norm": 0.202568918466568, + "learning_rate": 7.367609010463646e-05, + "loss": 1.0361, + "step": 5679 + }, + { + "epoch": 0.3450999453186706, + "grad_norm": 0.17404919862747192, + "learning_rate": 7.366765814365406e-05, + "loss": 1.0743, + "step": 5680 + }, + { + "epoch": 0.34516070235129714, + "grad_norm": 1.4974126815795898, + "learning_rate": 7.36592253151291e-05, + "loss": 1.1646, + "step": 5681 + }, + { + "epoch": 0.3452214593839237, + "grad_norm": 0.2991856634616852, + "learning_rate": 7.365079161937066e-05, + "loss": 1.1555, + "step": 5682 + }, + { + "epoch": 0.3452822164165502, + "grad_norm": 0.25850579142570496, + "learning_rate": 7.364235705668787e-05, + "loss": 1.0941, + "step": 5683 + }, + { + "epoch": 0.34534297344917675, + "grad_norm": 0.20604000985622406, + "learning_rate": 7.363392162738992e-05, + "loss": 1.1276, + "step": 5684 + }, + { + "epoch": 0.3454037304818033, + "grad_norm": 0.15421925485134125, + "learning_rate": 7.362548533178599e-05, + "loss": 1.0567, + "step": 5685 + }, + { + "epoch": 0.3454644875144298, + "grad_norm": 0.2226591855287552, + "learning_rate": 7.361704817018535e-05, + "loss": 1.218, + "step": 5686 + }, + { + "epoch": 0.3455252445470563, + "grad_norm": 0.2017289251089096, + "learning_rate": 7.360861014289724e-05, + "loss": 1.1461, + "step": 5687 + }, + { + "epoch": 0.34558600157968283, + "grad_norm": 0.1717287003993988, + "learning_rate": 7.360017125023096e-05, + "loss": 1.161, + "step": 5688 + }, + { + "epoch": 0.34564675861230937, + "grad_norm": 0.34899288415908813, + "learning_rate": 7.359173149249584e-05, + "loss": 1.2619, + "step": 5689 + }, + { + "epoch": 0.3457075156449359, + "grad_norm": 0.15625520050525665, + "learning_rate": 7.358329087000124e-05, + "loss": 1.0627, + "step": 5690 + }, + { + "epoch": 0.34576827267756244, + "grad_norm": 0.13026641309261322, + "learning_rate": 7.357484938305655e-05, + "loss": 1.0959, + "step": 5691 + }, + { + "epoch": 0.345829029710189, + "grad_norm": 0.6139512658119202, + "learning_rate": 7.356640703197119e-05, + "loss": 1.1515, + "step": 5692 + }, + { + "epoch": 0.3458897867428155, + "grad_norm": 0.16386525332927704, + "learning_rate": 7.355796381705465e-05, + "loss": 1.1346, + "step": 5693 + }, + { + "epoch": 0.345950543775442, + "grad_norm": 0.21870790421962738, + "learning_rate": 7.354951973861639e-05, + "loss": 1.1361, + "step": 5694 + }, + { + "epoch": 0.3460113008080685, + "grad_norm": 0.17906992137432098, + "learning_rate": 7.354107479696591e-05, + "loss": 1.1116, + "step": 5695 + }, + { + "epoch": 0.34607205784069506, + "grad_norm": 0.16407795250415802, + "learning_rate": 7.35326289924128e-05, + "loss": 1.0573, + "step": 5696 + }, + { + "epoch": 0.3461328148733216, + "grad_norm": 0.3035754859447479, + "learning_rate": 7.352418232526663e-05, + "loss": 1.1312, + "step": 5697 + }, + { + "epoch": 0.3461935719059481, + "grad_norm": 0.12715089321136475, + "learning_rate": 7.3515734795837e-05, + "loss": 1.0728, + "step": 5698 + }, + { + "epoch": 0.34625432893857466, + "grad_norm": 0.14146849513053894, + "learning_rate": 7.350728640443358e-05, + "loss": 1.0633, + "step": 5699 + }, + { + "epoch": 0.34631508597120114, + "grad_norm": 0.2877689301967621, + "learning_rate": 7.3498837151366e-05, + "loss": 1.0632, + "step": 5700 + }, + { + "epoch": 0.3463758430038277, + "grad_norm": 0.2097448706626892, + "learning_rate": 7.349038703694401e-05, + "loss": 1.0886, + "step": 5701 + }, + { + "epoch": 0.3464366000364542, + "grad_norm": 0.21265172958374023, + "learning_rate": 7.348193606147738e-05, + "loss": 1.1493, + "step": 5702 + }, + { + "epoch": 0.34649735706908075, + "grad_norm": 0.12911838293075562, + "learning_rate": 7.347348422527581e-05, + "loss": 1.0571, + "step": 5703 + }, + { + "epoch": 0.3465581141017073, + "grad_norm": 0.13353310525417328, + "learning_rate": 7.346503152864916e-05, + "loss": 1.0911, + "step": 5704 + }, + { + "epoch": 0.3466188711343338, + "grad_norm": 0.14378947019577026, + "learning_rate": 7.345657797190723e-05, + "loss": 1.0687, + "step": 5705 + }, + { + "epoch": 0.34667962816696035, + "grad_norm": 0.14005614817142487, + "learning_rate": 7.344812355535989e-05, + "loss": 1.108, + "step": 5706 + }, + { + "epoch": 0.34674038519958683, + "grad_norm": 1.9512763023376465, + "learning_rate": 7.343966827931707e-05, + "loss": 1.0834, + "step": 5707 + }, + { + "epoch": 0.34680114223221337, + "grad_norm": 0.29365137219429016, + "learning_rate": 7.343121214408867e-05, + "loss": 1.1683, + "step": 5708 + }, + { + "epoch": 0.3468618992648399, + "grad_norm": 0.6108081936836243, + "learning_rate": 7.342275514998466e-05, + "loss": 1.0346, + "step": 5709 + }, + { + "epoch": 0.34692265629746644, + "grad_norm": 0.2702333331108093, + "learning_rate": 7.341429729731503e-05, + "loss": 1.1872, + "step": 5710 + }, + { + "epoch": 0.34698341333009297, + "grad_norm": 0.2966519296169281, + "learning_rate": 7.34058385863898e-05, + "loss": 1.2099, + "step": 5711 + }, + { + "epoch": 0.3470441703627195, + "grad_norm": 1.0424479246139526, + "learning_rate": 7.339737901751905e-05, + "loss": 1.251, + "step": 5712 + }, + { + "epoch": 0.347104927395346, + "grad_norm": 0.5929242372512817, + "learning_rate": 7.338891859101285e-05, + "loss": 1.2657, + "step": 5713 + }, + { + "epoch": 0.3471656844279725, + "grad_norm": 0.17844748497009277, + "learning_rate": 7.338045730718129e-05, + "loss": 1.132, + "step": 5714 + }, + { + "epoch": 0.34722644146059906, + "grad_norm": 0.31952664256095886, + "learning_rate": 7.337199516633458e-05, + "loss": 1.1326, + "step": 5715 + }, + { + "epoch": 0.3472871984932256, + "grad_norm": 0.24436403810977936, + "learning_rate": 7.336353216878285e-05, + "loss": 1.0437, + "step": 5716 + }, + { + "epoch": 0.3473479555258521, + "grad_norm": 0.2095416635274887, + "learning_rate": 7.335506831483634e-05, + "loss": 1.2193, + "step": 5717 + }, + { + "epoch": 0.34740871255847866, + "grad_norm": 0.19626320898532867, + "learning_rate": 7.334660360480527e-05, + "loss": 1.0921, + "step": 5718 + }, + { + "epoch": 0.3474694695911052, + "grad_norm": 0.17072322964668274, + "learning_rate": 7.333813803899996e-05, + "loss": 1.1532, + "step": 5719 + }, + { + "epoch": 0.3475302266237317, + "grad_norm": 0.18080373108386993, + "learning_rate": 7.332967161773068e-05, + "loss": 1.1549, + "step": 5720 + }, + { + "epoch": 0.3475909836563582, + "grad_norm": 0.19815124571323395, + "learning_rate": 7.332120434130778e-05, + "loss": 1.1661, + "step": 5721 + }, + { + "epoch": 0.34765174068898475, + "grad_norm": 1.6336404085159302, + "learning_rate": 7.33127362100416e-05, + "loss": 1.1206, + "step": 5722 + }, + { + "epoch": 0.3477124977216113, + "grad_norm": 0.16710756719112396, + "learning_rate": 7.33042672242426e-05, + "loss": 1.128, + "step": 5723 + }, + { + "epoch": 0.3477732547542378, + "grad_norm": 0.14641173183918, + "learning_rate": 7.329579738422116e-05, + "loss": 1.0815, + "step": 5724 + }, + { + "epoch": 0.34783401178686435, + "grad_norm": 0.29195988178253174, + "learning_rate": 7.328732669028777e-05, + "loss": 1.1457, + "step": 5725 + }, + { + "epoch": 0.34789476881949083, + "grad_norm": 0.13369852304458618, + "learning_rate": 7.327885514275294e-05, + "loss": 1.0279, + "step": 5726 + }, + { + "epoch": 0.34795552585211736, + "grad_norm": 0.15933088958263397, + "learning_rate": 7.327038274192714e-05, + "loss": 1.0546, + "step": 5727 + }, + { + "epoch": 0.3480162828847439, + "grad_norm": 0.14478226006031036, + "learning_rate": 7.326190948812097e-05, + "loss": 1.0877, + "step": 5728 + }, + { + "epoch": 0.34807703991737043, + "grad_norm": 0.2454705834388733, + "learning_rate": 7.325343538164503e-05, + "loss": 1.0718, + "step": 5729 + }, + { + "epoch": 0.34813779694999697, + "grad_norm": 0.1678120344877243, + "learning_rate": 7.32449604228099e-05, + "loss": 1.0125, + "step": 5730 + }, + { + "epoch": 0.3481985539826235, + "grad_norm": 0.18025526404380798, + "learning_rate": 7.323648461192627e-05, + "loss": 1.1315, + "step": 5731 + }, + { + "epoch": 0.34825931101525004, + "grad_norm": 0.2066357582807541, + "learning_rate": 7.32280079493048e-05, + "loss": 1.1235, + "step": 5732 + }, + { + "epoch": 0.3483200680478765, + "grad_norm": 0.1464719921350479, + "learning_rate": 7.321953043525622e-05, + "loss": 1.0697, + "step": 5733 + }, + { + "epoch": 0.34838082508050305, + "grad_norm": 0.16893261671066284, + "learning_rate": 7.321105207009124e-05, + "loss": 1.1194, + "step": 5734 + }, + { + "epoch": 0.3484415821131296, + "grad_norm": 3.2623672485351562, + "learning_rate": 7.320257285412066e-05, + "loss": 1.1092, + "step": 5735 + }, + { + "epoch": 0.3485023391457561, + "grad_norm": 0.19582438468933105, + "learning_rate": 7.319409278765532e-05, + "loss": 1.2521, + "step": 5736 + }, + { + "epoch": 0.34856309617838266, + "grad_norm": 0.5753293037414551, + "learning_rate": 7.318561187100598e-05, + "loss": 1.1917, + "step": 5737 + }, + { + "epoch": 0.3486238532110092, + "grad_norm": 0.17827434837818146, + "learning_rate": 7.317713010448359e-05, + "loss": 1.0871, + "step": 5738 + }, + { + "epoch": 0.3486846102436357, + "grad_norm": 0.1499662697315216, + "learning_rate": 7.316864748839901e-05, + "loss": 1.0344, + "step": 5739 + }, + { + "epoch": 0.3487453672762622, + "grad_norm": 0.15069326758384705, + "learning_rate": 7.316016402306315e-05, + "loss": 1.0608, + "step": 5740 + }, + { + "epoch": 0.34880612430888874, + "grad_norm": 0.2449057549238205, + "learning_rate": 7.315167970878705e-05, + "loss": 1.0996, + "step": 5741 + }, + { + "epoch": 0.3488668813415153, + "grad_norm": 0.7468262910842896, + "learning_rate": 7.314319454588162e-05, + "loss": 1.2121, + "step": 5742 + }, + { + "epoch": 0.3489276383741418, + "grad_norm": 0.6195400953292847, + "learning_rate": 7.313470853465794e-05, + "loss": 1.1891, + "step": 5743 + }, + { + "epoch": 0.34898839540676835, + "grad_norm": 0.157234787940979, + "learning_rate": 7.312622167542702e-05, + "loss": 1.1093, + "step": 5744 + }, + { + "epoch": 0.3490491524393949, + "grad_norm": 3.5099685192108154, + "learning_rate": 7.311773396849999e-05, + "loss": 1.0526, + "step": 5745 + }, + { + "epoch": 0.34910990947202136, + "grad_norm": 0.22053498029708862, + "learning_rate": 7.310924541418796e-05, + "loss": 1.1293, + "step": 5746 + }, + { + "epoch": 0.3491706665046479, + "grad_norm": 0.8692840933799744, + "learning_rate": 7.310075601280206e-05, + "loss": 1.3386, + "step": 5747 + }, + { + "epoch": 0.34923142353727443, + "grad_norm": 0.23964044451713562, + "learning_rate": 7.309226576465349e-05, + "loss": 1.2406, + "step": 5748 + }, + { + "epoch": 0.34929218056990097, + "grad_norm": 0.6238002181053162, + "learning_rate": 7.308377467005347e-05, + "loss": 1.2017, + "step": 5749 + }, + { + "epoch": 0.3493529376025275, + "grad_norm": 0.24637416005134583, + "learning_rate": 7.307528272931319e-05, + "loss": 1.0706, + "step": 5750 + }, + { + "epoch": 0.34941369463515404, + "grad_norm": 0.21685929596424103, + "learning_rate": 7.306678994274398e-05, + "loss": 1.0615, + "step": 5751 + }, + { + "epoch": 0.3494744516677806, + "grad_norm": 0.1708822250366211, + "learning_rate": 7.305829631065714e-05, + "loss": 1.1175, + "step": 5752 + }, + { + "epoch": 0.34953520870040705, + "grad_norm": 0.23879919946193695, + "learning_rate": 7.304980183336397e-05, + "loss": 1.1185, + "step": 5753 + }, + { + "epoch": 0.3495959657330336, + "grad_norm": 0.18171828985214233, + "learning_rate": 7.304130651117588e-05, + "loss": 1.1051, + "step": 5754 + }, + { + "epoch": 0.3496567227656601, + "grad_norm": 0.2343205064535141, + "learning_rate": 7.303281034440421e-05, + "loss": 1.102, + "step": 5755 + }, + { + "epoch": 0.34971747979828666, + "grad_norm": 0.36213231086730957, + "learning_rate": 7.302431333336048e-05, + "loss": 1.0919, + "step": 5756 + }, + { + "epoch": 0.3497782368309132, + "grad_norm": 0.24571682512760162, + "learning_rate": 7.301581547835606e-05, + "loss": 1.189, + "step": 5757 + }, + { + "epoch": 0.3498389938635397, + "grad_norm": 0.1978072226047516, + "learning_rate": 7.300731677970246e-05, + "loss": 1.1187, + "step": 5758 + }, + { + "epoch": 0.3498997508961662, + "grad_norm": 0.53170245885849, + "learning_rate": 7.299881723771124e-05, + "loss": 1.208, + "step": 5759 + }, + { + "epoch": 0.34996050792879274, + "grad_norm": 0.2298247069120407, + "learning_rate": 7.299031685269392e-05, + "loss": 1.0478, + "step": 5760 + }, + { + "epoch": 0.3500212649614193, + "grad_norm": 0.6664319634437561, + "learning_rate": 7.298181562496209e-05, + "loss": 1.159, + "step": 5761 + }, + { + "epoch": 0.3500820219940458, + "grad_norm": 0.18363624811172485, + "learning_rate": 7.297331355482737e-05, + "loss": 1.0679, + "step": 5762 + }, + { + "epoch": 0.35014277902667235, + "grad_norm": 0.4725208878517151, + "learning_rate": 7.296481064260141e-05, + "loss": 1.0559, + "step": 5763 + }, + { + "epoch": 0.3502035360592989, + "grad_norm": 0.39487212896347046, + "learning_rate": 7.295630688859587e-05, + "loss": 1.0848, + "step": 5764 + }, + { + "epoch": 0.3502642930919254, + "grad_norm": 0.33195677399635315, + "learning_rate": 7.294780229312246e-05, + "loss": 1.2254, + "step": 5765 + }, + { + "epoch": 0.3503250501245519, + "grad_norm": 0.16192245483398438, + "learning_rate": 7.293929685649292e-05, + "loss": 1.0606, + "step": 5766 + }, + { + "epoch": 0.35038580715717843, + "grad_norm": 0.18568256497383118, + "learning_rate": 7.293079057901902e-05, + "loss": 1.1535, + "step": 5767 + }, + { + "epoch": 0.35044656418980497, + "grad_norm": 0.17248280346393585, + "learning_rate": 7.292228346101255e-05, + "loss": 1.0904, + "step": 5768 + }, + { + "epoch": 0.3505073212224315, + "grad_norm": 0.29796722531318665, + "learning_rate": 7.291377550278537e-05, + "loss": 1.128, + "step": 5769 + }, + { + "epoch": 0.35056807825505804, + "grad_norm": 0.32810768485069275, + "learning_rate": 7.290526670464933e-05, + "loss": 1.2998, + "step": 5770 + }, + { + "epoch": 0.35062883528768457, + "grad_norm": 0.9751858115196228, + "learning_rate": 7.289675706691628e-05, + "loss": 1.1057, + "step": 5771 + }, + { + "epoch": 0.35068959232031105, + "grad_norm": 0.321365624666214, + "learning_rate": 7.288824658989818e-05, + "loss": 1.0544, + "step": 5772 + }, + { + "epoch": 0.3507503493529376, + "grad_norm": 1.1922931671142578, + "learning_rate": 7.287973527390699e-05, + "loss": 1.1017, + "step": 5773 + }, + { + "epoch": 0.3508111063855641, + "grad_norm": 0.14850562810897827, + "learning_rate": 7.287122311925467e-05, + "loss": 1.1196, + "step": 5774 + }, + { + "epoch": 0.35087186341819065, + "grad_norm": 0.16037853062152863, + "learning_rate": 7.286271012625326e-05, + "loss": 1.0693, + "step": 5775 + }, + { + "epoch": 0.3509326204508172, + "grad_norm": 0.18525104224681854, + "learning_rate": 7.285419629521477e-05, + "loss": 1.0864, + "step": 5776 + }, + { + "epoch": 0.3509933774834437, + "grad_norm": 0.41641688346862793, + "learning_rate": 7.284568162645131e-05, + "loss": 1.0468, + "step": 5777 + }, + { + "epoch": 0.35105413451607026, + "grad_norm": 0.30591610074043274, + "learning_rate": 7.283716612027497e-05, + "loss": 1.2554, + "step": 5778 + }, + { + "epoch": 0.35111489154869674, + "grad_norm": 0.13217495381832123, + "learning_rate": 7.282864977699789e-05, + "loss": 1.0358, + "step": 5779 + }, + { + "epoch": 0.3511756485813233, + "grad_norm": 15.487896919250488, + "learning_rate": 7.282013259693225e-05, + "loss": 1.0538, + "step": 5780 + }, + { + "epoch": 0.3512364056139498, + "grad_norm": 0.15854030847549438, + "learning_rate": 7.281161458039022e-05, + "loss": 1.0876, + "step": 5781 + }, + { + "epoch": 0.35129716264657634, + "grad_norm": 0.1775718480348587, + "learning_rate": 7.280309572768407e-05, + "loss": 1.1245, + "step": 5782 + }, + { + "epoch": 0.3513579196792029, + "grad_norm": 0.653881311416626, + "learning_rate": 7.279457603912603e-05, + "loss": 1.2809, + "step": 5783 + }, + { + "epoch": 0.3514186767118294, + "grad_norm": 0.23063676059246063, + "learning_rate": 7.27860555150284e-05, + "loss": 1.0354, + "step": 5784 + }, + { + "epoch": 0.3514794337444559, + "grad_norm": 0.17329175770282745, + "learning_rate": 7.277753415570349e-05, + "loss": 1.1186, + "step": 5785 + }, + { + "epoch": 0.35154019077708243, + "grad_norm": 0.15079350769519806, + "learning_rate": 7.276901196146367e-05, + "loss": 1.1657, + "step": 5786 + }, + { + "epoch": 0.35160094780970896, + "grad_norm": 0.23822999000549316, + "learning_rate": 7.276048893262132e-05, + "loss": 1.0697, + "step": 5787 + }, + { + "epoch": 0.3516617048423355, + "grad_norm": 0.20611199736595154, + "learning_rate": 7.275196506948885e-05, + "loss": 1.0403, + "step": 5788 + }, + { + "epoch": 0.35172246187496203, + "grad_norm": 0.25846919417381287, + "learning_rate": 7.274344037237868e-05, + "loss": 1.259, + "step": 5789 + }, + { + "epoch": 0.35178321890758857, + "grad_norm": 0.23814713954925537, + "learning_rate": 7.273491484160334e-05, + "loss": 1.075, + "step": 5790 + }, + { + "epoch": 0.3518439759402151, + "grad_norm": 0.5330560207366943, + "learning_rate": 7.272638847747528e-05, + "loss": 1.055, + "step": 5791 + }, + { + "epoch": 0.3519047329728416, + "grad_norm": 0.4811760187149048, + "learning_rate": 7.271786128030706e-05, + "loss": 1.2015, + "step": 5792 + }, + { + "epoch": 0.3519654900054681, + "grad_norm": 0.13954052329063416, + "learning_rate": 7.270933325041125e-05, + "loss": 1.0401, + "step": 5793 + }, + { + "epoch": 0.35202624703809465, + "grad_norm": 0.13437391817569733, + "learning_rate": 7.270080438810043e-05, + "loss": 1.0669, + "step": 5794 + }, + { + "epoch": 0.3520870040707212, + "grad_norm": 0.12584757804870605, + "learning_rate": 7.269227469368725e-05, + "loss": 1.0786, + "step": 5795 + }, + { + "epoch": 0.3521477611033477, + "grad_norm": 0.16621801257133484, + "learning_rate": 7.268374416748434e-05, + "loss": 1.12, + "step": 5796 + }, + { + "epoch": 0.35220851813597426, + "grad_norm": 0.15626905858516693, + "learning_rate": 7.267521280980442e-05, + "loss": 1.1097, + "step": 5797 + }, + { + "epoch": 0.3522692751686008, + "grad_norm": 0.43506526947021484, + "learning_rate": 7.266668062096018e-05, + "loss": 1.3063, + "step": 5798 + }, + { + "epoch": 0.3523300322012273, + "grad_norm": 0.32517826557159424, + "learning_rate": 7.265814760126439e-05, + "loss": 1.1496, + "step": 5799 + }, + { + "epoch": 0.3523907892338538, + "grad_norm": 0.26677462458610535, + "learning_rate": 7.26496137510298e-05, + "loss": 1.0512, + "step": 5800 + }, + { + "epoch": 0.35245154626648034, + "grad_norm": 0.2610955536365509, + "learning_rate": 7.264107907056928e-05, + "loss": 1.1183, + "step": 5801 + }, + { + "epoch": 0.3525123032991069, + "grad_norm": 0.16532549262046814, + "learning_rate": 7.263254356019558e-05, + "loss": 1.071, + "step": 5802 + }, + { + "epoch": 0.3525730603317334, + "grad_norm": 0.18929168581962585, + "learning_rate": 7.262400722022165e-05, + "loss": 1.1551, + "step": 5803 + }, + { + "epoch": 0.35263381736435995, + "grad_norm": 0.19736342132091522, + "learning_rate": 7.261547005096036e-05, + "loss": 1.1133, + "step": 5804 + }, + { + "epoch": 0.3526945743969864, + "grad_norm": 0.270641565322876, + "learning_rate": 7.260693205272463e-05, + "loss": 1.1173, + "step": 5805 + }, + { + "epoch": 0.35275533142961296, + "grad_norm": 0.14757870137691498, + "learning_rate": 7.259839322582747e-05, + "loss": 1.0433, + "step": 5806 + }, + { + "epoch": 0.3528160884622395, + "grad_norm": 0.15412771701812744, + "learning_rate": 7.25898535705818e-05, + "loss": 1.0538, + "step": 5807 + }, + { + "epoch": 0.35287684549486603, + "grad_norm": 0.2050395905971527, + "learning_rate": 7.258131308730069e-05, + "loss": 1.13, + "step": 5808 + }, + { + "epoch": 0.35293760252749257, + "grad_norm": 0.18094982206821442, + "learning_rate": 7.25727717762972e-05, + "loss": 1.1525, + "step": 5809 + }, + { + "epoch": 0.3529983595601191, + "grad_norm": 0.1964893490076065, + "learning_rate": 7.256422963788438e-05, + "loss": 1.2286, + "step": 5810 + }, + { + "epoch": 0.35305911659274564, + "grad_norm": 0.21823422610759735, + "learning_rate": 7.255568667237537e-05, + "loss": 1.2311, + "step": 5811 + }, + { + "epoch": 0.3531198736253721, + "grad_norm": 0.14053837954998016, + "learning_rate": 7.25471428800833e-05, + "loss": 1.1162, + "step": 5812 + }, + { + "epoch": 0.35318063065799865, + "grad_norm": 0.17161621153354645, + "learning_rate": 7.253859826132137e-05, + "loss": 1.1776, + "step": 5813 + }, + { + "epoch": 0.3532413876906252, + "grad_norm": 0.34896978735923767, + "learning_rate": 7.253005281640275e-05, + "loss": 1.217, + "step": 5814 + }, + { + "epoch": 0.3533021447232517, + "grad_norm": 0.14858554303646088, + "learning_rate": 7.252150654564067e-05, + "loss": 1.0859, + "step": 5815 + }, + { + "epoch": 0.35336290175587826, + "grad_norm": 0.215206116437912, + "learning_rate": 7.251295944934843e-05, + "loss": 1.2089, + "step": 5816 + }, + { + "epoch": 0.3534236587885048, + "grad_norm": 1.9434363842010498, + "learning_rate": 7.250441152783931e-05, + "loss": 1.1805, + "step": 5817 + }, + { + "epoch": 0.35348441582113127, + "grad_norm": 0.1971471905708313, + "learning_rate": 7.249586278142664e-05, + "loss": 1.1045, + "step": 5818 + }, + { + "epoch": 0.3535451728537578, + "grad_norm": 0.25170570611953735, + "learning_rate": 7.248731321042377e-05, + "loss": 1.1289, + "step": 5819 + }, + { + "epoch": 0.35360592988638434, + "grad_norm": 0.20548677444458008, + "learning_rate": 7.247876281514408e-05, + "loss": 1.0986, + "step": 5820 + }, + { + "epoch": 0.3536666869190109, + "grad_norm": 0.37013623118400574, + "learning_rate": 7.2470211595901e-05, + "loss": 1.1252, + "step": 5821 + }, + { + "epoch": 0.3537274439516374, + "grad_norm": 0.16794206202030182, + "learning_rate": 7.246165955300796e-05, + "loss": 1.0588, + "step": 5822 + }, + { + "epoch": 0.35378820098426395, + "grad_norm": 0.13325662910938263, + "learning_rate": 7.245310668677848e-05, + "loss": 1.0545, + "step": 5823 + }, + { + "epoch": 0.3538489580168905, + "grad_norm": 0.18467111885547638, + "learning_rate": 7.2444552997526e-05, + "loss": 1.0946, + "step": 5824 + }, + { + "epoch": 0.35390971504951696, + "grad_norm": 0.18164540827274323, + "learning_rate": 7.24359984855641e-05, + "loss": 1.1457, + "step": 5825 + }, + { + "epoch": 0.3539704720821435, + "grad_norm": 0.9037814736366272, + "learning_rate": 7.242744315120634e-05, + "loss": 1.0995, + "step": 5826 + }, + { + "epoch": 0.35403122911477003, + "grad_norm": 0.3074457049369812, + "learning_rate": 7.241888699476632e-05, + "loss": 1.1664, + "step": 5827 + }, + { + "epoch": 0.35409198614739656, + "grad_norm": 0.24883440136909485, + "learning_rate": 7.241033001655765e-05, + "loss": 1.0164, + "step": 5828 + }, + { + "epoch": 0.3541527431800231, + "grad_norm": 0.1257573962211609, + "learning_rate": 7.240177221689401e-05, + "loss": 1.076, + "step": 5829 + }, + { + "epoch": 0.35421350021264963, + "grad_norm": 0.14039850234985352, + "learning_rate": 7.239321359608908e-05, + "loss": 1.0648, + "step": 5830 + }, + { + "epoch": 0.3542742572452761, + "grad_norm": 0.13982249796390533, + "learning_rate": 7.238465415445659e-05, + "loss": 1.0384, + "step": 5831 + }, + { + "epoch": 0.35433501427790265, + "grad_norm": 0.1574956178665161, + "learning_rate": 7.237609389231025e-05, + "loss": 1.1417, + "step": 5832 + }, + { + "epoch": 0.3543957713105292, + "grad_norm": 0.15349221229553223, + "learning_rate": 7.23675328099639e-05, + "loss": 1.1083, + "step": 5833 + }, + { + "epoch": 0.3544565283431557, + "grad_norm": 0.2074791043996811, + "learning_rate": 7.235897090773127e-05, + "loss": 1.0862, + "step": 5834 + }, + { + "epoch": 0.35451728537578225, + "grad_norm": 0.2703007757663727, + "learning_rate": 7.235040818592627e-05, + "loss": 1.2171, + "step": 5835 + }, + { + "epoch": 0.3545780424084088, + "grad_norm": 0.2722099721431732, + "learning_rate": 7.234184464486271e-05, + "loss": 1.0861, + "step": 5836 + }, + { + "epoch": 0.3546387994410353, + "grad_norm": 0.1791897863149643, + "learning_rate": 7.233328028485454e-05, + "loss": 1.0108, + "step": 5837 + }, + { + "epoch": 0.3546995564736618, + "grad_norm": 0.19273117184638977, + "learning_rate": 7.232471510621565e-05, + "loss": 1.0425, + "step": 5838 + }, + { + "epoch": 0.35476031350628834, + "grad_norm": 0.14207425713539124, + "learning_rate": 7.231614910926002e-05, + "loss": 1.0942, + "step": 5839 + }, + { + "epoch": 0.3548210705389149, + "grad_norm": 0.24160359799861908, + "learning_rate": 7.230758229430164e-05, + "loss": 1.4506, + "step": 5840 + }, + { + "epoch": 0.3548818275715414, + "grad_norm": 0.20541724562644958, + "learning_rate": 7.229901466165449e-05, + "loss": 1.1123, + "step": 5841 + }, + { + "epoch": 0.35494258460416794, + "grad_norm": 0.20765239000320435, + "learning_rate": 7.229044621163267e-05, + "loss": 1.0419, + "step": 5842 + }, + { + "epoch": 0.3550033416367945, + "grad_norm": 0.23890089988708496, + "learning_rate": 7.228187694455023e-05, + "loss": 1.1002, + "step": 5843 + }, + { + "epoch": 0.35506409866942096, + "grad_norm": 0.20605246722698212, + "learning_rate": 7.227330686072129e-05, + "loss": 1.0974, + "step": 5844 + }, + { + "epoch": 0.3551248557020475, + "grad_norm": 0.2058846801519394, + "learning_rate": 7.226473596045997e-05, + "loss": 1.0605, + "step": 5845 + }, + { + "epoch": 0.355185612734674, + "grad_norm": 0.14860503375530243, + "learning_rate": 7.225616424408045e-05, + "loss": 1.1214, + "step": 5846 + }, + { + "epoch": 0.35524636976730056, + "grad_norm": 0.271267831325531, + "learning_rate": 7.224759171189694e-05, + "loss": 1.1043, + "step": 5847 + }, + { + "epoch": 0.3553071267999271, + "grad_norm": 0.17829258739948273, + "learning_rate": 7.223901836422366e-05, + "loss": 1.085, + "step": 5848 + }, + { + "epoch": 0.35536788383255363, + "grad_norm": 0.2951408326625824, + "learning_rate": 7.223044420137484e-05, + "loss": 1.0586, + "step": 5849 + }, + { + "epoch": 0.35542864086518017, + "grad_norm": 0.2137591689825058, + "learning_rate": 7.22218692236648e-05, + "loss": 1.2399, + "step": 5850 + }, + { + "epoch": 0.35548939789780665, + "grad_norm": 0.23036998510360718, + "learning_rate": 7.221329343140784e-05, + "loss": 1.1197, + "step": 5851 + }, + { + "epoch": 0.3555501549304332, + "grad_norm": 0.15717172622680664, + "learning_rate": 7.220471682491834e-05, + "loss": 1.1348, + "step": 5852 + }, + { + "epoch": 0.3556109119630597, + "grad_norm": 0.13252952694892883, + "learning_rate": 7.219613940451064e-05, + "loss": 1.0578, + "step": 5853 + }, + { + "epoch": 0.35567166899568625, + "grad_norm": 0.1493271291255951, + "learning_rate": 7.218756117049915e-05, + "loss": 1.1174, + "step": 5854 + }, + { + "epoch": 0.3557324260283128, + "grad_norm": 0.19335941970348358, + "learning_rate": 7.217898212319833e-05, + "loss": 1.181, + "step": 5855 + }, + { + "epoch": 0.3557931830609393, + "grad_norm": 0.16902048885822296, + "learning_rate": 7.217040226292262e-05, + "loss": 1.1299, + "step": 5856 + }, + { + "epoch": 0.35585394009356586, + "grad_norm": 0.21546825766563416, + "learning_rate": 7.216182158998654e-05, + "loss": 1.1415, + "step": 5857 + }, + { + "epoch": 0.35591469712619234, + "grad_norm": 0.14659367501735687, + "learning_rate": 7.215324010470462e-05, + "loss": 1.0742, + "step": 5858 + }, + { + "epoch": 0.35597545415881887, + "grad_norm": 0.14131975173950195, + "learning_rate": 7.214465780739137e-05, + "loss": 1.0485, + "step": 5859 + }, + { + "epoch": 0.3560362111914454, + "grad_norm": 0.15080803632736206, + "learning_rate": 7.213607469836143e-05, + "loss": 1.1213, + "step": 5860 + }, + { + "epoch": 0.35609696822407194, + "grad_norm": 0.11239893734455109, + "learning_rate": 7.212749077792937e-05, + "loss": 1.0296, + "step": 5861 + }, + { + "epoch": 0.3561577252566985, + "grad_norm": 0.17335954308509827, + "learning_rate": 7.211890604640989e-05, + "loss": 1.1704, + "step": 5862 + }, + { + "epoch": 0.356218482289325, + "grad_norm": 0.1877175122499466, + "learning_rate": 7.211032050411763e-05, + "loss": 1.1369, + "step": 5863 + }, + { + "epoch": 0.3562792393219515, + "grad_norm": 0.46775996685028076, + "learning_rate": 7.210173415136727e-05, + "loss": 1.1046, + "step": 5864 + }, + { + "epoch": 0.356339996354578, + "grad_norm": 0.2042836993932724, + "learning_rate": 7.209314698847359e-05, + "loss": 1.2355, + "step": 5865 + }, + { + "epoch": 0.35640075338720456, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.208455901575133e-05, + "loss": 1.2335, + "step": 5866 + }, + { + "epoch": 0.3564615104198311, + "grad_norm": 0.2034236043691635, + "learning_rate": 7.20759702335153e-05, + "loss": 1.143, + "step": 5867 + }, + { + "epoch": 0.35652226745245763, + "grad_norm": 0.1796652227640152, + "learning_rate": 7.206738064208031e-05, + "loss": 1.0573, + "step": 5868 + }, + { + "epoch": 0.35658302448508417, + "grad_norm": 0.147388756275177, + "learning_rate": 7.205879024176121e-05, + "loss": 1.0968, + "step": 5869 + }, + { + "epoch": 0.3566437815177107, + "grad_norm": 0.15133580565452576, + "learning_rate": 7.205019903287291e-05, + "loss": 1.0879, + "step": 5870 + }, + { + "epoch": 0.3567045385503372, + "grad_norm": 0.19186252355575562, + "learning_rate": 7.204160701573028e-05, + "loss": 1.2921, + "step": 5871 + }, + { + "epoch": 0.3567652955829637, + "grad_norm": 0.18380627036094666, + "learning_rate": 7.203301419064828e-05, + "loss": 1.1138, + "step": 5872 + }, + { + "epoch": 0.35682605261559025, + "grad_norm": 0.16337788105010986, + "learning_rate": 7.202442055794192e-05, + "loss": 1.1027, + "step": 5873 + }, + { + "epoch": 0.3568868096482168, + "grad_norm": 0.12010836601257324, + "learning_rate": 7.201582611792614e-05, + "loss": 1.0696, + "step": 5874 + }, + { + "epoch": 0.3569475666808433, + "grad_norm": 0.20754846930503845, + "learning_rate": 7.2007230870916e-05, + "loss": 1.1449, + "step": 5875 + }, + { + "epoch": 0.35700832371346986, + "grad_norm": 0.426717609167099, + "learning_rate": 7.199863481722656e-05, + "loss": 1.2325, + "step": 5876 + }, + { + "epoch": 0.35706908074609633, + "grad_norm": 0.16668668389320374, + "learning_rate": 7.199003795717292e-05, + "loss": 1.0529, + "step": 5877 + }, + { + "epoch": 0.35712983777872287, + "grad_norm": 0.149159774184227, + "learning_rate": 7.198144029107018e-05, + "loss": 1.0929, + "step": 5878 + }, + { + "epoch": 0.3571905948113494, + "grad_norm": 0.3591291904449463, + "learning_rate": 7.19728418192335e-05, + "loss": 1.0927, + "step": 5879 + }, + { + "epoch": 0.35725135184397594, + "grad_norm": 0.17408284544944763, + "learning_rate": 7.196424254197804e-05, + "loss": 1.1429, + "step": 5880 + }, + { + "epoch": 0.3573121088766025, + "grad_norm": 0.815989077091217, + "learning_rate": 7.195564245961904e-05, + "loss": 1.1256, + "step": 5881 + }, + { + "epoch": 0.357372865909229, + "grad_norm": 22.102617263793945, + "learning_rate": 7.194704157247172e-05, + "loss": 1.0558, + "step": 5882 + }, + { + "epoch": 0.35743362294185554, + "grad_norm": 0.33766666054725647, + "learning_rate": 7.193843988085135e-05, + "loss": 1.1757, + "step": 5883 + }, + { + "epoch": 0.357494379974482, + "grad_norm": 1.3121384382247925, + "learning_rate": 7.192983738507322e-05, + "loss": 1.0954, + "step": 5884 + }, + { + "epoch": 0.35755513700710856, + "grad_norm": 0.1580880731344223, + "learning_rate": 7.192123408545266e-05, + "loss": 1.1042, + "step": 5885 + }, + { + "epoch": 0.3576158940397351, + "grad_norm": 2.1086742877960205, + "learning_rate": 7.191262998230504e-05, + "loss": 1.2362, + "step": 5886 + }, + { + "epoch": 0.35767665107236163, + "grad_norm": 0.23374570906162262, + "learning_rate": 7.190402507594571e-05, + "loss": 1.1557, + "step": 5887 + }, + { + "epoch": 0.35773740810498816, + "grad_norm": 0.17526526749134064, + "learning_rate": 7.189541936669013e-05, + "loss": 1.1567, + "step": 5888 + }, + { + "epoch": 0.3577981651376147, + "grad_norm": 0.18390122056007385, + "learning_rate": 7.188681285485371e-05, + "loss": 1.0668, + "step": 5889 + }, + { + "epoch": 0.3578589221702412, + "grad_norm": 0.2820643484592438, + "learning_rate": 7.187820554075193e-05, + "loss": 1.2079, + "step": 5890 + }, + { + "epoch": 0.3579196792028677, + "grad_norm": 0.17928802967071533, + "learning_rate": 7.186959742470031e-05, + "loss": 1.1336, + "step": 5891 + }, + { + "epoch": 0.35798043623549425, + "grad_norm": 0.22715944051742554, + "learning_rate": 7.186098850701435e-05, + "loss": 1.1369, + "step": 5892 + }, + { + "epoch": 0.3580411932681208, + "grad_norm": 0.22867751121520996, + "learning_rate": 7.185237878800966e-05, + "loss": 1.2372, + "step": 5893 + }, + { + "epoch": 0.3581019503007473, + "grad_norm": 0.26421797275543213, + "learning_rate": 7.184376826800178e-05, + "loss": 1.0644, + "step": 5894 + }, + { + "epoch": 0.35816270733337385, + "grad_norm": 0.40871885418891907, + "learning_rate": 7.183515694730635e-05, + "loss": 1.2616, + "step": 5895 + }, + { + "epoch": 0.3582234643660004, + "grad_norm": 0.174764946103096, + "learning_rate": 7.182654482623903e-05, + "loss": 1.1292, + "step": 5896 + }, + { + "epoch": 0.35828422139862687, + "grad_norm": 0.1459951102733612, + "learning_rate": 7.181793190511549e-05, + "loss": 1.0934, + "step": 5897 + }, + { + "epoch": 0.3583449784312534, + "grad_norm": 0.2506260275840759, + "learning_rate": 7.180931818425142e-05, + "loss": 1.2078, + "step": 5898 + }, + { + "epoch": 0.35840573546387994, + "grad_norm": 0.21106983721256256, + "learning_rate": 7.180070366396261e-05, + "loss": 1.158, + "step": 5899 + }, + { + "epoch": 0.3584664924965065, + "grad_norm": 0.2122010886669159, + "learning_rate": 7.179208834456477e-05, + "loss": 1.0334, + "step": 5900 + }, + { + "epoch": 0.358527249529133, + "grad_norm": 0.2114807516336441, + "learning_rate": 7.178347222637373e-05, + "loss": 1.0905, + "step": 5901 + }, + { + "epoch": 0.35858800656175954, + "grad_norm": 0.21153010427951813, + "learning_rate": 7.17748553097053e-05, + "loss": 1.0293, + "step": 5902 + }, + { + "epoch": 0.3586487635943861, + "grad_norm": 0.22813719511032104, + "learning_rate": 7.176623759487534e-05, + "loss": 1.0809, + "step": 5903 + }, + { + "epoch": 0.35870952062701256, + "grad_norm": 0.1508573740720749, + "learning_rate": 7.175761908219974e-05, + "loss": 1.0913, + "step": 5904 + }, + { + "epoch": 0.3587702776596391, + "grad_norm": 0.20379769802093506, + "learning_rate": 7.174899977199439e-05, + "loss": 1.2287, + "step": 5905 + }, + { + "epoch": 0.3588310346922656, + "grad_norm": 0.2024427056312561, + "learning_rate": 7.174037966457525e-05, + "loss": 1.1184, + "step": 5906 + }, + { + "epoch": 0.35889179172489216, + "grad_norm": 0.14341942965984344, + "learning_rate": 7.173175876025831e-05, + "loss": 1.0739, + "step": 5907 + }, + { + "epoch": 0.3589525487575187, + "grad_norm": 0.18194203078746796, + "learning_rate": 7.172313705935955e-05, + "loss": 1.0635, + "step": 5908 + }, + { + "epoch": 0.35901330579014523, + "grad_norm": 0.4800781309604645, + "learning_rate": 7.171451456219499e-05, + "loss": 1.0819, + "step": 5909 + }, + { + "epoch": 0.3590740628227717, + "grad_norm": 0.18123412132263184, + "learning_rate": 7.17058912690807e-05, + "loss": 1.126, + "step": 5910 + }, + { + "epoch": 0.35913481985539825, + "grad_norm": 0.14279356598854065, + "learning_rate": 7.169726718033278e-05, + "loss": 1.2625, + "step": 5911 + }, + { + "epoch": 0.3591955768880248, + "grad_norm": 0.22185450792312622, + "learning_rate": 7.168864229626734e-05, + "loss": 1.0829, + "step": 5912 + }, + { + "epoch": 0.3592563339206513, + "grad_norm": 0.13260206580162048, + "learning_rate": 7.168001661720051e-05, + "loss": 1.0462, + "step": 5913 + }, + { + "epoch": 0.35931709095327785, + "grad_norm": 0.18639041483402252, + "learning_rate": 7.16713901434485e-05, + "loss": 1.1881, + "step": 5914 + }, + { + "epoch": 0.3593778479859044, + "grad_norm": 0.24409301578998566, + "learning_rate": 7.166276287532748e-05, + "loss": 1.1298, + "step": 5915 + }, + { + "epoch": 0.3594386050185309, + "grad_norm": 0.17557953298091888, + "learning_rate": 7.16541348131537e-05, + "loss": 1.0667, + "step": 5916 + }, + { + "epoch": 0.3594993620511574, + "grad_norm": 0.2696946859359741, + "learning_rate": 7.164550595724344e-05, + "loss": 1.0989, + "step": 5917 + }, + { + "epoch": 0.35956011908378394, + "grad_norm": 0.18718624114990234, + "learning_rate": 7.163687630791294e-05, + "loss": 1.0972, + "step": 5918 + }, + { + "epoch": 0.35962087611641047, + "grad_norm": 0.14111928641796112, + "learning_rate": 7.162824586547858e-05, + "loss": 1.1031, + "step": 5919 + }, + { + "epoch": 0.359681633149037, + "grad_norm": 0.20912785828113556, + "learning_rate": 7.161961463025669e-05, + "loss": 1.1477, + "step": 5920 + }, + { + "epoch": 0.35974239018166354, + "grad_norm": 0.19308944046497345, + "learning_rate": 7.161098260256363e-05, + "loss": 1.1293, + "step": 5921 + }, + { + "epoch": 0.3598031472142901, + "grad_norm": 0.33384910225868225, + "learning_rate": 7.160234978271585e-05, + "loss": 1.1733, + "step": 5922 + }, + { + "epoch": 0.35986390424691655, + "grad_norm": 0.2403935194015503, + "learning_rate": 7.159371617102975e-05, + "loss": 1.0779, + "step": 5923 + }, + { + "epoch": 0.3599246612795431, + "grad_norm": 0.15773074328899384, + "learning_rate": 7.15850817678218e-05, + "loss": 1.1289, + "step": 5924 + }, + { + "epoch": 0.3599854183121696, + "grad_norm": 0.15671879053115845, + "learning_rate": 7.157644657340852e-05, + "loss": 1.0892, + "step": 5925 + }, + { + "epoch": 0.36004617534479616, + "grad_norm": 0.17868761718273163, + "learning_rate": 7.15678105881064e-05, + "loss": 1.1468, + "step": 5926 + }, + { + "epoch": 0.3601069323774227, + "grad_norm": 0.18014110624790192, + "learning_rate": 7.155917381223205e-05, + "loss": 1.135, + "step": 5927 + }, + { + "epoch": 0.36016768941004923, + "grad_norm": 3.4692015647888184, + "learning_rate": 7.1550536246102e-05, + "loss": 1.0914, + "step": 5928 + }, + { + "epoch": 0.36022844644267576, + "grad_norm": 0.19155573844909668, + "learning_rate": 7.154189789003286e-05, + "loss": 1.0702, + "step": 5929 + }, + { + "epoch": 0.36028920347530224, + "grad_norm": 0.1417994350194931, + "learning_rate": 7.153325874434131e-05, + "loss": 1.0433, + "step": 5930 + }, + { + "epoch": 0.3603499605079288, + "grad_norm": 0.2555970251560211, + "learning_rate": 7.152461880934399e-05, + "loss": 1.0917, + "step": 5931 + }, + { + "epoch": 0.3604107175405553, + "grad_norm": 0.16115422546863556, + "learning_rate": 7.151597808535761e-05, + "loss": 1.0477, + "step": 5932 + }, + { + "epoch": 0.36047147457318185, + "grad_norm": 0.2958734929561615, + "learning_rate": 7.150733657269889e-05, + "loss": 1.0699, + "step": 5933 + }, + { + "epoch": 0.3605322316058084, + "grad_norm": 0.18945914506912231, + "learning_rate": 7.14986942716846e-05, + "loss": 1.1249, + "step": 5934 + }, + { + "epoch": 0.3605929886384349, + "grad_norm": 0.3092036843299866, + "learning_rate": 7.14900511826315e-05, + "loss": 1.092, + "step": 5935 + }, + { + "epoch": 0.3606537456710614, + "grad_norm": 0.1366962194442749, + "learning_rate": 7.148140730585644e-05, + "loss": 1.0881, + "step": 5936 + }, + { + "epoch": 0.36071450270368793, + "grad_norm": 0.17601732909679413, + "learning_rate": 7.147276264167623e-05, + "loss": 1.1401, + "step": 5937 + }, + { + "epoch": 0.36077525973631447, + "grad_norm": 0.20743726193904877, + "learning_rate": 7.146411719040775e-05, + "loss": 1.0814, + "step": 5938 + }, + { + "epoch": 0.360836016768941, + "grad_norm": 0.1295817494392395, + "learning_rate": 7.145547095236791e-05, + "loss": 1.0473, + "step": 5939 + }, + { + "epoch": 0.36089677380156754, + "grad_norm": 0.1812259554862976, + "learning_rate": 7.144682392787365e-05, + "loss": 1.102, + "step": 5940 + }, + { + "epoch": 0.3609575308341941, + "grad_norm": 0.2486310601234436, + "learning_rate": 7.14381761172419e-05, + "loss": 1.2024, + "step": 5941 + }, + { + "epoch": 0.3610182878668206, + "grad_norm": 0.15354949235916138, + "learning_rate": 7.142952752078965e-05, + "loss": 1.0609, + "step": 5942 + }, + { + "epoch": 0.3610790448994471, + "grad_norm": 0.532869815826416, + "learning_rate": 7.142087813883395e-05, + "loss": 1.2216, + "step": 5943 + }, + { + "epoch": 0.3611398019320736, + "grad_norm": 0.1500968188047409, + "learning_rate": 7.14122279716918e-05, + "loss": 1.067, + "step": 5944 + }, + { + "epoch": 0.36120055896470016, + "grad_norm": 0.1807074397802353, + "learning_rate": 7.140357701968031e-05, + "loss": 1.1021, + "step": 5945 + }, + { + "epoch": 0.3612613159973267, + "grad_norm": 0.20345276594161987, + "learning_rate": 7.139492528311657e-05, + "loss": 1.1096, + "step": 5946 + }, + { + "epoch": 0.36132207302995323, + "grad_norm": 0.39479169249534607, + "learning_rate": 7.138627276231768e-05, + "loss": 1.3074, + "step": 5947 + }, + { + "epoch": 0.36138283006257976, + "grad_norm": 0.4613039195537567, + "learning_rate": 7.137761945760086e-05, + "loss": 1.1521, + "step": 5948 + }, + { + "epoch": 0.36144358709520624, + "grad_norm": 0.4169823229312897, + "learning_rate": 7.136896536928326e-05, + "loss": 1.1755, + "step": 5949 + }, + { + "epoch": 0.3615043441278328, + "grad_norm": 0.19304074347019196, + "learning_rate": 7.13603104976821e-05, + "loss": 1.102, + "step": 5950 + }, + { + "epoch": 0.3615651011604593, + "grad_norm": 0.3686462640762329, + "learning_rate": 7.135165484311462e-05, + "loss": 1.252, + "step": 5951 + }, + { + "epoch": 0.36162585819308585, + "grad_norm": 0.26206162571907043, + "learning_rate": 7.134299840589811e-05, + "loss": 1.0926, + "step": 5952 + }, + { + "epoch": 0.3616866152257124, + "grad_norm": 0.34802401065826416, + "learning_rate": 7.133434118634986e-05, + "loss": 1.0846, + "step": 5953 + }, + { + "epoch": 0.3617473722583389, + "grad_norm": 0.26187098026275635, + "learning_rate": 7.132568318478722e-05, + "loss": 1.2137, + "step": 5954 + }, + { + "epoch": 0.36180812929096545, + "grad_norm": 0.5490225553512573, + "learning_rate": 7.131702440152755e-05, + "loss": 1.0896, + "step": 5955 + }, + { + "epoch": 0.36186888632359193, + "grad_norm": 0.32293233275413513, + "learning_rate": 7.130836483688821e-05, + "loss": 1.0628, + "step": 5956 + }, + { + "epoch": 0.36192964335621847, + "grad_norm": 0.2808167636394501, + "learning_rate": 7.129970449118666e-05, + "loss": 1.0765, + "step": 5957 + }, + { + "epoch": 0.361990400388845, + "grad_norm": 0.24105970561504364, + "learning_rate": 7.129104336474031e-05, + "loss": 1.1343, + "step": 5958 + }, + { + "epoch": 0.36205115742147154, + "grad_norm": 0.30015817284584045, + "learning_rate": 7.128238145786665e-05, + "loss": 1.1138, + "step": 5959 + }, + { + "epoch": 0.36211191445409807, + "grad_norm": 0.17284353077411652, + "learning_rate": 7.12737187708832e-05, + "loss": 1.0238, + "step": 5960 + }, + { + "epoch": 0.3621726714867246, + "grad_norm": 0.18822729587554932, + "learning_rate": 7.126505530410746e-05, + "loss": 1.1787, + "step": 5961 + }, + { + "epoch": 0.36223342851935114, + "grad_norm": 0.2871028184890747, + "learning_rate": 7.125639105785699e-05, + "loss": 1.1398, + "step": 5962 + }, + { + "epoch": 0.3622941855519776, + "grad_norm": 0.21566082537174225, + "learning_rate": 7.124772603244943e-05, + "loss": 1.1501, + "step": 5963 + }, + { + "epoch": 0.36235494258460416, + "grad_norm": 0.16293512284755707, + "learning_rate": 7.123906022820238e-05, + "loss": 1.096, + "step": 5964 + }, + { + "epoch": 0.3624156996172307, + "grad_norm": 0.14977681636810303, + "learning_rate": 7.123039364543342e-05, + "loss": 1.0984, + "step": 5965 + }, + { + "epoch": 0.3624764566498572, + "grad_norm": 0.12604397535324097, + "learning_rate": 7.122172628446031e-05, + "loss": 0.9909, + "step": 5966 + }, + { + "epoch": 0.36253721368248376, + "grad_norm": 0.7872484922409058, + "learning_rate": 7.12130581456007e-05, + "loss": 1.196, + "step": 5967 + }, + { + "epoch": 0.3625979707151103, + "grad_norm": 0.19841915369033813, + "learning_rate": 7.120438922917236e-05, + "loss": 1.1695, + "step": 5968 + }, + { + "epoch": 0.3626587277477368, + "grad_norm": 0.20634128153324127, + "learning_rate": 7.119571953549305e-05, + "loss": 1.1998, + "step": 5969 + }, + { + "epoch": 0.3627194847803633, + "grad_norm": 0.12938696146011353, + "learning_rate": 7.118704906488052e-05, + "loss": 1.06, + "step": 5970 + }, + { + "epoch": 0.36278024181298985, + "grad_norm": 0.16447527706623077, + "learning_rate": 7.117837781765261e-05, + "loss": 1.1027, + "step": 5971 + }, + { + "epoch": 0.3628409988456164, + "grad_norm": 3.4606661796569824, + "learning_rate": 7.116970579412717e-05, + "loss": 1.1739, + "step": 5972 + }, + { + "epoch": 0.3629017558782429, + "grad_norm": 0.2508723735809326, + "learning_rate": 7.116103299462207e-05, + "loss": 1.1552, + "step": 5973 + }, + { + "epoch": 0.36296251291086945, + "grad_norm": 0.28259187936782837, + "learning_rate": 7.115235941945524e-05, + "loss": 1.1627, + "step": 5974 + }, + { + "epoch": 0.363023269943496, + "grad_norm": 0.20259036123752594, + "learning_rate": 7.114368506894456e-05, + "loss": 1.0997, + "step": 5975 + }, + { + "epoch": 0.36308402697612246, + "grad_norm": 0.20449164509773254, + "learning_rate": 7.113500994340801e-05, + "loss": 1.1848, + "step": 5976 + }, + { + "epoch": 0.363144784008749, + "grad_norm": 0.15127572417259216, + "learning_rate": 7.112633404316361e-05, + "loss": 1.0675, + "step": 5977 + }, + { + "epoch": 0.36320554104137553, + "grad_norm": 0.29695481061935425, + "learning_rate": 7.111765736852934e-05, + "loss": 1.192, + "step": 5978 + }, + { + "epoch": 0.36326629807400207, + "grad_norm": 1.2648652791976929, + "learning_rate": 7.110897991982326e-05, + "loss": 1.1737, + "step": 5979 + }, + { + "epoch": 0.3633270551066286, + "grad_norm": 0.31794071197509766, + "learning_rate": 7.110030169736344e-05, + "loss": 1.1979, + "step": 5980 + }, + { + "epoch": 0.36338781213925514, + "grad_norm": 0.21422503888607025, + "learning_rate": 7.109162270146797e-05, + "loss": 1.1712, + "step": 5981 + }, + { + "epoch": 0.3634485691718816, + "grad_norm": 0.33230382204055786, + "learning_rate": 7.108294293245501e-05, + "loss": 1.1507, + "step": 5982 + }, + { + "epoch": 0.36350932620450815, + "grad_norm": 0.22521817684173584, + "learning_rate": 7.107426239064269e-05, + "loss": 1.0411, + "step": 5983 + }, + { + "epoch": 0.3635700832371347, + "grad_norm": 0.22728776931762695, + "learning_rate": 7.106558107634923e-05, + "loss": 1.2453, + "step": 5984 + }, + { + "epoch": 0.3636308402697612, + "grad_norm": 0.13818351924419403, + "learning_rate": 7.10568989898928e-05, + "loss": 1.0527, + "step": 5985 + }, + { + "epoch": 0.36369159730238776, + "grad_norm": 0.25233694911003113, + "learning_rate": 7.104821613159167e-05, + "loss": 1.2351, + "step": 5986 + }, + { + "epoch": 0.3637523543350143, + "grad_norm": 0.19004298746585846, + "learning_rate": 7.103953250176413e-05, + "loss": 1.1024, + "step": 5987 + }, + { + "epoch": 0.36381311136764083, + "grad_norm": 0.37169086933135986, + "learning_rate": 7.103084810072843e-05, + "loss": 1.1874, + "step": 5988 + }, + { + "epoch": 0.3638738684002673, + "grad_norm": 0.15443316102027893, + "learning_rate": 7.102216292880295e-05, + "loss": 1.1087, + "step": 5989 + }, + { + "epoch": 0.36393462543289384, + "grad_norm": 0.1607917994260788, + "learning_rate": 7.101347698630601e-05, + "loss": 1.1258, + "step": 5990 + }, + { + "epoch": 0.3639953824655204, + "grad_norm": 0.1835775226354599, + "learning_rate": 7.100479027355603e-05, + "loss": 1.0674, + "step": 5991 + }, + { + "epoch": 0.3640561394981469, + "grad_norm": 0.5484330058097839, + "learning_rate": 7.099610279087138e-05, + "loss": 1.1876, + "step": 5992 + }, + { + "epoch": 0.36411689653077345, + "grad_norm": 0.4057673513889313, + "learning_rate": 7.098741453857053e-05, + "loss": 1.1698, + "step": 5993 + }, + { + "epoch": 0.3641776535634, + "grad_norm": 0.27871638536453247, + "learning_rate": 7.097872551697195e-05, + "loss": 1.051, + "step": 5994 + }, + { + "epoch": 0.36423841059602646, + "grad_norm": 0.16202746331691742, + "learning_rate": 7.097003572639413e-05, + "loss": 1.1082, + "step": 5995 + }, + { + "epoch": 0.364299167628653, + "grad_norm": 0.2087109088897705, + "learning_rate": 7.096134516715557e-05, + "loss": 1.0972, + "step": 5996 + }, + { + "epoch": 0.36435992466127953, + "grad_norm": 0.187360480427742, + "learning_rate": 7.09526538395749e-05, + "loss": 1.0991, + "step": 5997 + }, + { + "epoch": 0.36442068169390607, + "grad_norm": 0.13905178010463715, + "learning_rate": 7.094396174397061e-05, + "loss": 1.0998, + "step": 5998 + }, + { + "epoch": 0.3644814387265326, + "grad_norm": 0.17264042794704437, + "learning_rate": 7.093526888066137e-05, + "loss": 1.1434, + "step": 5999 + }, + { + "epoch": 0.36454219575915914, + "grad_norm": 0.2895245850086212, + "learning_rate": 7.09265752499658e-05, + "loss": 1.2232, + "step": 6000 + }, + { + "epoch": 0.3646029527917857, + "grad_norm": 0.14069925248622894, + "learning_rate": 7.091788085220259e-05, + "loss": 1.1153, + "step": 6001 + }, + { + "epoch": 0.36466370982441215, + "grad_norm": 0.15252617001533508, + "learning_rate": 7.09091856876904e-05, + "loss": 1.112, + "step": 6002 + }, + { + "epoch": 0.3647244668570387, + "grad_norm": 0.3880089521408081, + "learning_rate": 7.090048975674796e-05, + "loss": 1.2197, + "step": 6003 + }, + { + "epoch": 0.3647852238896652, + "grad_norm": 0.7139638066291809, + "learning_rate": 7.089179305969403e-05, + "loss": 1.0787, + "step": 6004 + }, + { + "epoch": 0.36484598092229176, + "grad_norm": 0.2053522914648056, + "learning_rate": 7.088309559684739e-05, + "loss": 1.0356, + "step": 6005 + }, + { + "epoch": 0.3649067379549183, + "grad_norm": 0.18879061937332153, + "learning_rate": 7.087439736852684e-05, + "loss": 1.0274, + "step": 6006 + }, + { + "epoch": 0.3649674949875448, + "grad_norm": 0.15123805403709412, + "learning_rate": 7.086569837505122e-05, + "loss": 1.1557, + "step": 6007 + }, + { + "epoch": 0.36502825202017136, + "grad_norm": 0.1728934943675995, + "learning_rate": 7.08569986167394e-05, + "loss": 1.1208, + "step": 6008 + }, + { + "epoch": 0.36508900905279784, + "grad_norm": 0.1747850626707077, + "learning_rate": 7.084829809391026e-05, + "loss": 1.1182, + "step": 6009 + }, + { + "epoch": 0.3651497660854244, + "grad_norm": 0.17299675941467285, + "learning_rate": 7.083959680688273e-05, + "loss": 1.0947, + "step": 6010 + }, + { + "epoch": 0.3652105231180509, + "grad_norm": 0.18715403974056244, + "learning_rate": 7.083089475597574e-05, + "loss": 1.089, + "step": 6011 + }, + { + "epoch": 0.36527128015067745, + "grad_norm": 0.23310427367687225, + "learning_rate": 7.082219194150828e-05, + "loss": 1.0448, + "step": 6012 + }, + { + "epoch": 0.365332037183304, + "grad_norm": 0.12462879717350006, + "learning_rate": 7.081348836379935e-05, + "loss": 1.0576, + "step": 6013 + }, + { + "epoch": 0.3653927942159305, + "grad_norm": 0.5681056380271912, + "learning_rate": 7.080478402316798e-05, + "loss": 1.0825, + "step": 6014 + }, + { + "epoch": 0.365453551248557, + "grad_norm": 4.0196919441223145, + "learning_rate": 7.079607891993323e-05, + "loss": 1.1142, + "step": 6015 + }, + { + "epoch": 0.36551430828118353, + "grad_norm": 0.27448785305023193, + "learning_rate": 7.07873730544142e-05, + "loss": 1.0766, + "step": 6016 + }, + { + "epoch": 0.36557506531381007, + "grad_norm": 0.1731339395046234, + "learning_rate": 7.077866642692998e-05, + "loss": 1.0986, + "step": 6017 + }, + { + "epoch": 0.3656358223464366, + "grad_norm": 0.22972512245178223, + "learning_rate": 7.076995903779972e-05, + "loss": 1.1725, + "step": 6018 + }, + { + "epoch": 0.36569657937906314, + "grad_norm": 4.182675838470459, + "learning_rate": 7.07612508873426e-05, + "loss": 1.0308, + "step": 6019 + }, + { + "epoch": 0.36575733641168967, + "grad_norm": 0.15066596865653992, + "learning_rate": 7.075254197587782e-05, + "loss": 1.0312, + "step": 6020 + }, + { + "epoch": 0.3658180934443162, + "grad_norm": 0.205636665225029, + "learning_rate": 7.074383230372461e-05, + "loss": 1.0671, + "step": 6021 + }, + { + "epoch": 0.3658788504769427, + "grad_norm": 0.1955956667661667, + "learning_rate": 7.073512187120221e-05, + "loss": 1.1173, + "step": 6022 + }, + { + "epoch": 0.3659396075095692, + "grad_norm": 0.25524693727493286, + "learning_rate": 7.072641067862991e-05, + "loss": 1.2482, + "step": 6023 + }, + { + "epoch": 0.36600036454219576, + "grad_norm": 0.2876506447792053, + "learning_rate": 7.071769872632701e-05, + "loss": 1.3079, + "step": 6024 + }, + { + "epoch": 0.3660611215748223, + "grad_norm": 0.20998498797416687, + "learning_rate": 7.070898601461287e-05, + "loss": 1.0703, + "step": 6025 + }, + { + "epoch": 0.3661218786074488, + "grad_norm": 0.16320553421974182, + "learning_rate": 7.070027254380684e-05, + "loss": 1.0442, + "step": 6026 + }, + { + "epoch": 0.36618263564007536, + "grad_norm": 0.2894010841846466, + "learning_rate": 7.069155831422832e-05, + "loss": 1.0691, + "step": 6027 + }, + { + "epoch": 0.36624339267270184, + "grad_norm": 0.2230319380760193, + "learning_rate": 7.068284332619673e-05, + "loss": 1.069, + "step": 6028 + }, + { + "epoch": 0.3663041497053284, + "grad_norm": 0.3279269337654114, + "learning_rate": 7.067412758003154e-05, + "loss": 1.1033, + "step": 6029 + }, + { + "epoch": 0.3663649067379549, + "grad_norm": 0.16777008771896362, + "learning_rate": 7.066541107605217e-05, + "loss": 1.1586, + "step": 6030 + }, + { + "epoch": 0.36642566377058144, + "grad_norm": 0.23910661041736603, + "learning_rate": 7.065669381457819e-05, + "loss": 1.127, + "step": 6031 + }, + { + "epoch": 0.366486420803208, + "grad_norm": 0.6004482507705688, + "learning_rate": 7.064797579592909e-05, + "loss": 1.0522, + "step": 6032 + }, + { + "epoch": 0.3665471778358345, + "grad_norm": 0.22792962193489075, + "learning_rate": 7.063925702042446e-05, + "loss": 1.0699, + "step": 6033 + }, + { + "epoch": 0.36660793486846105, + "grad_norm": 0.17332828044891357, + "learning_rate": 7.063053748838386e-05, + "loss": 1.1035, + "step": 6034 + }, + { + "epoch": 0.36666869190108753, + "grad_norm": 0.24174825847148895, + "learning_rate": 7.062181720012693e-05, + "loss": 1.1761, + "step": 6035 + }, + { + "epoch": 0.36672944893371406, + "grad_norm": 0.19885759055614471, + "learning_rate": 7.06130961559733e-05, + "loss": 1.0727, + "step": 6036 + }, + { + "epoch": 0.3667902059663406, + "grad_norm": 0.19328734278678894, + "learning_rate": 7.060437435624265e-05, + "loss": 1.0582, + "step": 6037 + }, + { + "epoch": 0.36685096299896713, + "grad_norm": 0.19828392565250397, + "learning_rate": 7.059565180125466e-05, + "loss": 1.1848, + "step": 6038 + }, + { + "epoch": 0.36691172003159367, + "grad_norm": 0.20452788472175598, + "learning_rate": 7.05869284913291e-05, + "loss": 1.0519, + "step": 6039 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 0.2678563892841339, + "learning_rate": 7.057820442678567e-05, + "loss": 1.2584, + "step": 6040 + }, + { + "epoch": 0.3670332340968467, + "grad_norm": 0.2401776760816574, + "learning_rate": 7.05694796079442e-05, + "loss": 1.0755, + "step": 6041 + }, + { + "epoch": 0.3670939911294732, + "grad_norm": 0.21597108244895935, + "learning_rate": 7.056075403512448e-05, + "loss": 1.2046, + "step": 6042 + }, + { + "epoch": 0.36715474816209975, + "grad_norm": 0.20854249596595764, + "learning_rate": 7.055202770864633e-05, + "loss": 1.0752, + "step": 6043 + }, + { + "epoch": 0.3672155051947263, + "grad_norm": 0.1955675333738327, + "learning_rate": 7.054330062882964e-05, + "loss": 1.0651, + "step": 6044 + }, + { + "epoch": 0.3672762622273528, + "grad_norm": 0.26309099793434143, + "learning_rate": 7.05345727959943e-05, + "loss": 1.271, + "step": 6045 + }, + { + "epoch": 0.36733701925997936, + "grad_norm": 0.19660969078540802, + "learning_rate": 7.05258442104602e-05, + "loss": 1.1057, + "step": 6046 + }, + { + "epoch": 0.3673977762926059, + "grad_norm": 0.2614695131778717, + "learning_rate": 7.051711487254734e-05, + "loss": 1.092, + "step": 6047 + }, + { + "epoch": 0.3674585333252324, + "grad_norm": 0.16886524856090546, + "learning_rate": 7.050838478257566e-05, + "loss": 1.0392, + "step": 6048 + }, + { + "epoch": 0.3675192903578589, + "grad_norm": 1.104804277420044, + "learning_rate": 7.049965394086518e-05, + "loss": 1.0514, + "step": 6049 + }, + { + "epoch": 0.36758004739048544, + "grad_norm": 0.18418674170970917, + "learning_rate": 7.04909223477359e-05, + "loss": 1.0766, + "step": 6050 + }, + { + "epoch": 0.367640804423112, + "grad_norm": 0.18491162359714508, + "learning_rate": 7.048219000350792e-05, + "loss": 1.1927, + "step": 6051 + }, + { + "epoch": 0.3677015614557385, + "grad_norm": 0.27615538239479065, + "learning_rate": 7.04734569085013e-05, + "loss": 1.0177, + "step": 6052 + }, + { + "epoch": 0.36776231848836505, + "grad_norm": 0.22296200692653656, + "learning_rate": 7.046472306303616e-05, + "loss": 1.1007, + "step": 6053 + }, + { + "epoch": 0.3678230755209915, + "grad_norm": 0.22059759497642517, + "learning_rate": 7.045598846743264e-05, + "loss": 1.2174, + "step": 6054 + }, + { + "epoch": 0.36788383255361806, + "grad_norm": 0.5893834829330444, + "learning_rate": 7.04472531220109e-05, + "loss": 1.3108, + "step": 6055 + }, + { + "epoch": 0.3679445895862446, + "grad_norm": 0.2247239053249359, + "learning_rate": 7.043851702709115e-05, + "loss": 1.141, + "step": 6056 + }, + { + "epoch": 0.36800534661887113, + "grad_norm": 0.3982189893722534, + "learning_rate": 7.042978018299361e-05, + "loss": 1.1064, + "step": 6057 + }, + { + "epoch": 0.36806610365149767, + "grad_norm": 0.2997436225414276, + "learning_rate": 7.042104259003853e-05, + "loss": 1.1572, + "step": 6058 + }, + { + "epoch": 0.3681268606841242, + "grad_norm": 0.20508989691734314, + "learning_rate": 7.041230424854619e-05, + "loss": 1.0877, + "step": 6059 + }, + { + "epoch": 0.36818761771675074, + "grad_norm": 0.1397322565317154, + "learning_rate": 7.04035651588369e-05, + "loss": 1.0579, + "step": 6060 + }, + { + "epoch": 0.3682483747493772, + "grad_norm": 0.1818924993276596, + "learning_rate": 7.039482532123096e-05, + "loss": 1.1172, + "step": 6061 + }, + { + "epoch": 0.36830913178200375, + "grad_norm": 0.5817762613296509, + "learning_rate": 7.038608473604877e-05, + "loss": 1.1744, + "step": 6062 + }, + { + "epoch": 0.3683698888146303, + "grad_norm": 1.063594937324524, + "learning_rate": 7.037734340361069e-05, + "loss": 1.1224, + "step": 6063 + }, + { + "epoch": 0.3684306458472568, + "grad_norm": 0.15738733112812042, + "learning_rate": 7.036860132423718e-05, + "loss": 1.3237, + "step": 6064 + }, + { + "epoch": 0.36849140287988336, + "grad_norm": 0.15839505195617676, + "learning_rate": 7.035985849824863e-05, + "loss": 1.0783, + "step": 6065 + }, + { + "epoch": 0.3685521599125099, + "grad_norm": 0.19259479641914368, + "learning_rate": 7.035111492596552e-05, + "loss": 1.0934, + "step": 6066 + }, + { + "epoch": 0.3686129169451364, + "grad_norm": 0.3592526316642761, + "learning_rate": 7.034237060770838e-05, + "loss": 1.1451, + "step": 6067 + }, + { + "epoch": 0.3686736739777629, + "grad_norm": 0.6591197848320007, + "learning_rate": 7.03336255437977e-05, + "loss": 1.0937, + "step": 6068 + }, + { + "epoch": 0.36873443101038944, + "grad_norm": 0.15160061419010162, + "learning_rate": 7.032487973455406e-05, + "loss": 1.1105, + "step": 6069 + }, + { + "epoch": 0.368795188043016, + "grad_norm": 0.20214974880218506, + "learning_rate": 7.031613318029802e-05, + "loss": 1.0484, + "step": 6070 + }, + { + "epoch": 0.3688559450756425, + "grad_norm": 0.24280185997486115, + "learning_rate": 7.030738588135018e-05, + "loss": 1.0632, + "step": 6071 + }, + { + "epoch": 0.36891670210826905, + "grad_norm": 0.15089461207389832, + "learning_rate": 7.02986378380312e-05, + "loss": 1.0494, + "step": 6072 + }, + { + "epoch": 0.3689774591408956, + "grad_norm": 0.20249488949775696, + "learning_rate": 7.028988905066171e-05, + "loss": 1.078, + "step": 6073 + }, + { + "epoch": 0.36903821617352206, + "grad_norm": 0.15266983211040497, + "learning_rate": 7.028113951956242e-05, + "loss": 1.0337, + "step": 6074 + }, + { + "epoch": 0.3690989732061486, + "grad_norm": 0.13306894898414612, + "learning_rate": 7.027238924505404e-05, + "loss": 1.0445, + "step": 6075 + }, + { + "epoch": 0.36915973023877513, + "grad_norm": 0.17093436419963837, + "learning_rate": 7.026363822745732e-05, + "loss": 1.1148, + "step": 6076 + }, + { + "epoch": 0.36922048727140167, + "grad_norm": 0.22219501435756683, + "learning_rate": 7.025488646709302e-05, + "loss": 1.1146, + "step": 6077 + }, + { + "epoch": 0.3692812443040282, + "grad_norm": 0.21313917636871338, + "learning_rate": 7.024613396428195e-05, + "loss": 1.1769, + "step": 6078 + }, + { + "epoch": 0.36934200133665473, + "grad_norm": 0.23730236291885376, + "learning_rate": 7.023738071934491e-05, + "loss": 1.0662, + "step": 6079 + }, + { + "epoch": 0.36940275836928127, + "grad_norm": 0.23709143698215485, + "learning_rate": 7.022862673260278e-05, + "loss": 1.0675, + "step": 6080 + }, + { + "epoch": 0.36946351540190775, + "grad_norm": 0.14758968353271484, + "learning_rate": 7.021987200437642e-05, + "loss": 1.0876, + "step": 6081 + }, + { + "epoch": 0.3695242724345343, + "grad_norm": 0.26542794704437256, + "learning_rate": 7.021111653498675e-05, + "loss": 1.0414, + "step": 6082 + }, + { + "epoch": 0.3695850294671608, + "grad_norm": 0.21081890165805817, + "learning_rate": 7.02023603247547e-05, + "loss": 1.0966, + "step": 6083 + }, + { + "epoch": 0.36964578649978735, + "grad_norm": 0.3085448443889618, + "learning_rate": 7.019360337400122e-05, + "loss": 1.3571, + "step": 6084 + }, + { + "epoch": 0.3697065435324139, + "grad_norm": 0.15007925033569336, + "learning_rate": 7.01848456830473e-05, + "loss": 1.0917, + "step": 6085 + }, + { + "epoch": 0.3697673005650404, + "grad_norm": 0.1844521462917328, + "learning_rate": 7.017608725221397e-05, + "loss": 1.138, + "step": 6086 + }, + { + "epoch": 0.3698280575976669, + "grad_norm": 0.12841415405273438, + "learning_rate": 7.016732808182224e-05, + "loss": 1.0582, + "step": 6087 + }, + { + "epoch": 0.36988881463029344, + "grad_norm": 0.3314495384693146, + "learning_rate": 7.015856817219324e-05, + "loss": 1.2464, + "step": 6088 + }, + { + "epoch": 0.36994957166292, + "grad_norm": 0.15241481363773346, + "learning_rate": 7.014980752364799e-05, + "loss": 1.0571, + "step": 6089 + }, + { + "epoch": 0.3700103286955465, + "grad_norm": 0.5650461316108704, + "learning_rate": 7.014104613650766e-05, + "loss": 1.1379, + "step": 6090 + }, + { + "epoch": 0.37007108572817304, + "grad_norm": 0.16181419789791107, + "learning_rate": 7.013228401109341e-05, + "loss": 1.042, + "step": 6091 + }, + { + "epoch": 0.3701318427607996, + "grad_norm": 0.12752079963684082, + "learning_rate": 7.012352114772638e-05, + "loss": 1.047, + "step": 6092 + }, + { + "epoch": 0.3701925997934261, + "grad_norm": 0.20489177107810974, + "learning_rate": 7.011475754672781e-05, + "loss": 1.2822, + "step": 6093 + }, + { + "epoch": 0.3702533568260526, + "grad_norm": 0.13086888194084167, + "learning_rate": 7.010599320841889e-05, + "loss": 1.0124, + "step": 6094 + }, + { + "epoch": 0.37031411385867913, + "grad_norm": 0.2657249867916107, + "learning_rate": 7.009722813312092e-05, + "loss": 1.1187, + "step": 6095 + }, + { + "epoch": 0.37037487089130566, + "grad_norm": 0.14157021045684814, + "learning_rate": 7.008846232115515e-05, + "loss": 1.1032, + "step": 6096 + }, + { + "epoch": 0.3704356279239322, + "grad_norm": 0.19336383044719696, + "learning_rate": 7.007969577284292e-05, + "loss": 1.1428, + "step": 6097 + }, + { + "epoch": 0.37049638495655873, + "grad_norm": 0.11828788369894028, + "learning_rate": 7.007092848850559e-05, + "loss": 1.0689, + "step": 6098 + }, + { + "epoch": 0.37055714198918527, + "grad_norm": 0.24823912978172302, + "learning_rate": 7.006216046846444e-05, + "loss": 1.0898, + "step": 6099 + }, + { + "epoch": 0.37061789902181175, + "grad_norm": 0.2931252121925354, + "learning_rate": 7.005339171304095e-05, + "loss": 1.0534, + "step": 6100 + }, + { + "epoch": 0.3706786560544383, + "grad_norm": 0.14766299724578857, + "learning_rate": 7.004462222255653e-05, + "loss": 1.0924, + "step": 6101 + }, + { + "epoch": 0.3707394130870648, + "grad_norm": 0.14781631529331207, + "learning_rate": 7.003585199733258e-05, + "loss": 1.0715, + "step": 6102 + }, + { + "epoch": 0.37080017011969135, + "grad_norm": 0.1661837100982666, + "learning_rate": 7.00270810376906e-05, + "loss": 1.0693, + "step": 6103 + }, + { + "epoch": 0.3708609271523179, + "grad_norm": 0.157626673579216, + "learning_rate": 7.00183093439521e-05, + "loss": 1.0911, + "step": 6104 + }, + { + "epoch": 0.3709216841849444, + "grad_norm": 0.1764076054096222, + "learning_rate": 7.000953691643861e-05, + "loss": 1.1437, + "step": 6105 + }, + { + "epoch": 0.37098244121757096, + "grad_norm": 0.18832449615001678, + "learning_rate": 7.000076375547168e-05, + "loss": 1.048, + "step": 6106 + }, + { + "epoch": 0.37104319825019744, + "grad_norm": 0.3083008825778961, + "learning_rate": 6.999198986137287e-05, + "loss": 1.1835, + "step": 6107 + }, + { + "epoch": 0.37110395528282397, + "grad_norm": 0.13549351692199707, + "learning_rate": 6.998321523446383e-05, + "loss": 1.05, + "step": 6108 + }, + { + "epoch": 0.3711647123154505, + "grad_norm": 0.3009432256221771, + "learning_rate": 6.997443987506616e-05, + "loss": 1.2666, + "step": 6109 + }, + { + "epoch": 0.37122546934807704, + "grad_norm": 0.16966791450977325, + "learning_rate": 6.996566378350152e-05, + "loss": 1.1162, + "step": 6110 + }, + { + "epoch": 0.3712862263807036, + "grad_norm": 0.1756017506122589, + "learning_rate": 6.995688696009164e-05, + "loss": 1.2071, + "step": 6111 + }, + { + "epoch": 0.3713469834133301, + "grad_norm": 2.539581060409546, + "learning_rate": 6.994810940515819e-05, + "loss": 1.1902, + "step": 6112 + }, + { + "epoch": 0.37140774044595665, + "grad_norm": 0.3390590250492096, + "learning_rate": 6.993933111902296e-05, + "loss": 1.2008, + "step": 6113 + }, + { + "epoch": 0.3714684974785831, + "grad_norm": 0.21464766561985016, + "learning_rate": 6.993055210200768e-05, + "loss": 1.085, + "step": 6114 + }, + { + "epoch": 0.37152925451120966, + "grad_norm": 0.470664381980896, + "learning_rate": 6.992177235443416e-05, + "loss": 1.2072, + "step": 6115 + }, + { + "epoch": 0.3715900115438362, + "grad_norm": 0.2351076900959015, + "learning_rate": 6.991299187662421e-05, + "loss": 1.1909, + "step": 6116 + }, + { + "epoch": 0.37165076857646273, + "grad_norm": 0.19528645277023315, + "learning_rate": 6.990421066889971e-05, + "loss": 1.0534, + "step": 6117 + }, + { + "epoch": 0.37171152560908927, + "grad_norm": 2.4312665462493896, + "learning_rate": 6.989542873158252e-05, + "loss": 1.1163, + "step": 6118 + }, + { + "epoch": 0.3717722826417158, + "grad_norm": 0.18483975529670715, + "learning_rate": 6.988664606499454e-05, + "loss": 1.1588, + "step": 6119 + }, + { + "epoch": 0.3718330396743423, + "grad_norm": 0.2241080403327942, + "learning_rate": 6.98778626694577e-05, + "loss": 1.0895, + "step": 6120 + }, + { + "epoch": 0.3718937967069688, + "grad_norm": 0.1853787750005722, + "learning_rate": 6.986907854529397e-05, + "loss": 1.0141, + "step": 6121 + }, + { + "epoch": 0.37195455373959535, + "grad_norm": 0.2961101233959198, + "learning_rate": 6.986029369282533e-05, + "loss": 1.1629, + "step": 6122 + }, + { + "epoch": 0.3720153107722219, + "grad_norm": 0.2024242877960205, + "learning_rate": 6.985150811237377e-05, + "loss": 1.0389, + "step": 6123 + }, + { + "epoch": 0.3720760678048484, + "grad_norm": 0.1818472295999527, + "learning_rate": 6.984272180426134e-05, + "loss": 1.0895, + "step": 6124 + }, + { + "epoch": 0.37213682483747496, + "grad_norm": 0.16849341988563538, + "learning_rate": 6.983393476881013e-05, + "loss": 1.0816, + "step": 6125 + }, + { + "epoch": 0.3721975818701015, + "grad_norm": 0.21810662746429443, + "learning_rate": 6.98251470063422e-05, + "loss": 1.0274, + "step": 6126 + }, + { + "epoch": 0.37225833890272797, + "grad_norm": 0.3145095109939575, + "learning_rate": 6.981635851717966e-05, + "loss": 1.2075, + "step": 6127 + }, + { + "epoch": 0.3723190959353545, + "grad_norm": 0.30857306718826294, + "learning_rate": 6.980756930164468e-05, + "loss": 1.1858, + "step": 6128 + }, + { + "epoch": 0.37237985296798104, + "grad_norm": 0.3091966509819031, + "learning_rate": 6.979877936005942e-05, + "loss": 1.1373, + "step": 6129 + }, + { + "epoch": 0.3724406100006076, + "grad_norm": 0.2408003807067871, + "learning_rate": 6.978998869274606e-05, + "loss": 1.0775, + "step": 6130 + }, + { + "epoch": 0.3725013670332341, + "grad_norm": 0.13595938682556152, + "learning_rate": 6.978119730002686e-05, + "loss": 1.0285, + "step": 6131 + }, + { + "epoch": 0.37256212406586064, + "grad_norm": 0.16466747224330902, + "learning_rate": 6.977240518222403e-05, + "loss": 1.0682, + "step": 6132 + }, + { + "epoch": 0.3726228810984871, + "grad_norm": 0.28831911087036133, + "learning_rate": 6.976361233965988e-05, + "loss": 1.1288, + "step": 6133 + }, + { + "epoch": 0.37268363813111366, + "grad_norm": 0.236700639128685, + "learning_rate": 6.97548187726567e-05, + "loss": 1.1594, + "step": 6134 + }, + { + "epoch": 0.3727443951637402, + "grad_norm": 0.12770244479179382, + "learning_rate": 6.974602448153683e-05, + "loss": 1.0815, + "step": 6135 + }, + { + "epoch": 0.37280515219636673, + "grad_norm": 0.1409512460231781, + "learning_rate": 6.973722946662258e-05, + "loss": 1.1057, + "step": 6136 + }, + { + "epoch": 0.37286590922899326, + "grad_norm": 0.1878613382577896, + "learning_rate": 6.972843372823638e-05, + "loss": 1.1584, + "step": 6137 + }, + { + "epoch": 0.3729266662616198, + "grad_norm": 0.20101282000541687, + "learning_rate": 6.971963726670065e-05, + "loss": 1.0868, + "step": 6138 + }, + { + "epoch": 0.37298742329424633, + "grad_norm": 0.17033053934574127, + "learning_rate": 6.97108400823378e-05, + "loss": 1.0962, + "step": 6139 + }, + { + "epoch": 0.3730481803268728, + "grad_norm": 0.18454821407794952, + "learning_rate": 6.970204217547027e-05, + "loss": 1.1581, + "step": 6140 + }, + { + "epoch": 0.37310893735949935, + "grad_norm": 0.22834104299545288, + "learning_rate": 6.96932435464206e-05, + "loss": 1.0749, + "step": 6141 + }, + { + "epoch": 0.3731696943921259, + "grad_norm": 0.15021520853042603, + "learning_rate": 6.968444419551128e-05, + "loss": 1.1433, + "step": 6142 + }, + { + "epoch": 0.3732304514247524, + "grad_norm": 0.2514025866985321, + "learning_rate": 6.967564412306485e-05, + "loss": 0.9998, + "step": 6143 + }, + { + "epoch": 0.37329120845737895, + "grad_norm": 0.18249507248401642, + "learning_rate": 6.966684332940388e-05, + "loss": 1.1168, + "step": 6144 + }, + { + "epoch": 0.3733519654900055, + "grad_norm": 0.20229239761829376, + "learning_rate": 6.965804181485097e-05, + "loss": 1.0933, + "step": 6145 + }, + { + "epoch": 0.37341272252263197, + "grad_norm": 7.055782318115234, + "learning_rate": 6.964923957972872e-05, + "loss": 1.0567, + "step": 6146 + }, + { + "epoch": 0.3734734795552585, + "grad_norm": 0.18048037588596344, + "learning_rate": 6.964043662435981e-05, + "loss": 1.1217, + "step": 6147 + }, + { + "epoch": 0.37353423658788504, + "grad_norm": 0.25209271907806396, + "learning_rate": 6.963163294906691e-05, + "loss": 1.0352, + "step": 6148 + }, + { + "epoch": 0.3735949936205116, + "grad_norm": 0.17877323925495148, + "learning_rate": 6.962282855417269e-05, + "loss": 1.1214, + "step": 6149 + }, + { + "epoch": 0.3736557506531381, + "grad_norm": 0.33786898851394653, + "learning_rate": 6.96140234399999e-05, + "loss": 1.0787, + "step": 6150 + }, + { + "epoch": 0.37371650768576464, + "grad_norm": 0.25326746702194214, + "learning_rate": 6.960521760687129e-05, + "loss": 1.0764, + "step": 6151 + }, + { + "epoch": 0.3737772647183912, + "grad_norm": 0.1220151036977768, + "learning_rate": 6.959641105510966e-05, + "loss": 1.0211, + "step": 6152 + }, + { + "epoch": 0.37383802175101766, + "grad_norm": 0.18178948760032654, + "learning_rate": 6.958760378503776e-05, + "loss": 1.0548, + "step": 6153 + }, + { + "epoch": 0.3738987787836442, + "grad_norm": 0.17142456769943237, + "learning_rate": 6.957879579697848e-05, + "loss": 1.1817, + "step": 6154 + }, + { + "epoch": 0.3739595358162707, + "grad_norm": 0.21144050359725952, + "learning_rate": 6.956998709125467e-05, + "loss": 1.1075, + "step": 6155 + }, + { + "epoch": 0.37402029284889726, + "grad_norm": 4.238260746002197, + "learning_rate": 6.956117766818915e-05, + "loss": 1.0603, + "step": 6156 + }, + { + "epoch": 0.3740810498815238, + "grad_norm": 0.3379259705543518, + "learning_rate": 6.955236752810493e-05, + "loss": 1.078, + "step": 6157 + }, + { + "epoch": 0.37414180691415033, + "grad_norm": 0.27843043208122253, + "learning_rate": 6.954355667132489e-05, + "loss": 1.2243, + "step": 6158 + }, + { + "epoch": 0.3742025639467768, + "grad_norm": 0.2400258481502533, + "learning_rate": 6.953474509817197e-05, + "loss": 1.1211, + "step": 6159 + }, + { + "epoch": 0.37426332097940335, + "grad_norm": 0.35528701543807983, + "learning_rate": 6.952593280896922e-05, + "loss": 1.0683, + "step": 6160 + }, + { + "epoch": 0.3743240780120299, + "grad_norm": 0.1909889131784439, + "learning_rate": 6.951711980403963e-05, + "loss": 1.0956, + "step": 6161 + }, + { + "epoch": 0.3743848350446564, + "grad_norm": 0.24124369025230408, + "learning_rate": 6.950830608370623e-05, + "loss": 1.1332, + "step": 6162 + }, + { + "epoch": 0.37444559207728295, + "grad_norm": 0.29295313358306885, + "learning_rate": 6.94994916482921e-05, + "loss": 1.2311, + "step": 6163 + }, + { + "epoch": 0.3745063491099095, + "grad_norm": 0.13519087433815002, + "learning_rate": 6.949067649812033e-05, + "loss": 1.0519, + "step": 6164 + }, + { + "epoch": 0.374567106142536, + "grad_norm": 0.45620399713516235, + "learning_rate": 6.948186063351407e-05, + "loss": 1.3742, + "step": 6165 + }, + { + "epoch": 0.3746278631751625, + "grad_norm": 0.1934356540441513, + "learning_rate": 6.947304405479644e-05, + "loss": 1.0893, + "step": 6166 + }, + { + "epoch": 0.37468862020778904, + "grad_norm": 0.32619509100914, + "learning_rate": 6.946422676229058e-05, + "loss": 1.231, + "step": 6167 + }, + { + "epoch": 0.37474937724041557, + "grad_norm": 3.7163872718811035, + "learning_rate": 6.945540875631977e-05, + "loss": 1.1135, + "step": 6168 + }, + { + "epoch": 0.3748101342730421, + "grad_norm": 0.19481046497821808, + "learning_rate": 6.944659003720716e-05, + "loss": 1.0907, + "step": 6169 + }, + { + "epoch": 0.37487089130566864, + "grad_norm": 0.18538278341293335, + "learning_rate": 6.943777060527605e-05, + "loss": 1.1023, + "step": 6170 + }, + { + "epoch": 0.3749316483382952, + "grad_norm": 0.1559235006570816, + "learning_rate": 6.94289504608497e-05, + "loss": 1.0195, + "step": 6171 + }, + { + "epoch": 0.3749924053709217, + "grad_norm": 0.17050620913505554, + "learning_rate": 6.94201296042514e-05, + "loss": 1.0486, + "step": 6172 + }, + { + "epoch": 0.3750531624035482, + "grad_norm": 0.17463983595371246, + "learning_rate": 6.941130803580451e-05, + "loss": 1.0774, + "step": 6173 + }, + { + "epoch": 0.3751139194361747, + "grad_norm": 0.16148193180561066, + "learning_rate": 6.940248575583236e-05, + "loss": 1.0274, + "step": 6174 + }, + { + "epoch": 0.37517467646880126, + "grad_norm": 0.14038436114788055, + "learning_rate": 6.939366276465836e-05, + "loss": 1.0671, + "step": 6175 + }, + { + "epoch": 0.3752354335014278, + "grad_norm": 0.17784066498279572, + "learning_rate": 6.938483906260589e-05, + "loss": 1.108, + "step": 6176 + }, + { + "epoch": 0.37529619053405433, + "grad_norm": 3.010763168334961, + "learning_rate": 6.93760146499984e-05, + "loss": 1.0644, + "step": 6177 + }, + { + "epoch": 0.37535694756668087, + "grad_norm": 0.15693561732769012, + "learning_rate": 6.936718952715936e-05, + "loss": 1.0948, + "step": 6178 + }, + { + "epoch": 0.37541770459930734, + "grad_norm": 0.21804840862751007, + "learning_rate": 6.935836369441223e-05, + "loss": 1.1942, + "step": 6179 + }, + { + "epoch": 0.3754784616319339, + "grad_norm": 0.295316219329834, + "learning_rate": 6.934953715208054e-05, + "loss": 1.2403, + "step": 6180 + }, + { + "epoch": 0.3755392186645604, + "grad_norm": 0.13129793107509613, + "learning_rate": 6.934070990048784e-05, + "loss": 1.0554, + "step": 6181 + }, + { + "epoch": 0.37559997569718695, + "grad_norm": 0.19522185623645782, + "learning_rate": 6.933188193995766e-05, + "loss": 1.0771, + "step": 6182 + }, + { + "epoch": 0.3756607327298135, + "grad_norm": 0.15450729429721832, + "learning_rate": 6.932305327081361e-05, + "loss": 1.1766, + "step": 6183 + }, + { + "epoch": 0.37572148976244, + "grad_norm": 0.29695799946784973, + "learning_rate": 6.931422389337932e-05, + "loss": 1.1088, + "step": 6184 + }, + { + "epoch": 0.37578224679506655, + "grad_norm": 0.14404602348804474, + "learning_rate": 6.93053938079784e-05, + "loss": 1.0977, + "step": 6185 + }, + { + "epoch": 0.37584300382769303, + "grad_norm": 0.13579857349395752, + "learning_rate": 6.929656301493455e-05, + "loss": 1.0769, + "step": 6186 + }, + { + "epoch": 0.37590376086031957, + "grad_norm": 0.14122948050498962, + "learning_rate": 6.928773151457147e-05, + "loss": 1.0465, + "step": 6187 + }, + { + "epoch": 0.3759645178929461, + "grad_norm": 0.8193957805633545, + "learning_rate": 6.927889930721286e-05, + "loss": 1.1652, + "step": 6188 + }, + { + "epoch": 0.37602527492557264, + "grad_norm": 0.16174200177192688, + "learning_rate": 6.927006639318245e-05, + "loss": 1.2036, + "step": 6189 + }, + { + "epoch": 0.3760860319581992, + "grad_norm": 0.16232994198799133, + "learning_rate": 6.926123277280403e-05, + "loss": 1.0831, + "step": 6190 + }, + { + "epoch": 0.3761467889908257, + "grad_norm": 0.12621040642261505, + "learning_rate": 6.925239844640141e-05, + "loss": 1.0879, + "step": 6191 + }, + { + "epoch": 0.3762075460234522, + "grad_norm": 0.17994782328605652, + "learning_rate": 6.92435634142984e-05, + "loss": 1.1079, + "step": 6192 + }, + { + "epoch": 0.3762683030560787, + "grad_norm": 0.15814127027988434, + "learning_rate": 6.923472767681886e-05, + "loss": 1.1201, + "step": 6193 + }, + { + "epoch": 0.37632906008870526, + "grad_norm": 0.17147469520568848, + "learning_rate": 6.922589123428665e-05, + "loss": 1.0973, + "step": 6194 + }, + { + "epoch": 0.3763898171213318, + "grad_norm": 0.17876200377941132, + "learning_rate": 6.92170540870257e-05, + "loss": 1.0564, + "step": 6195 + }, + { + "epoch": 0.37645057415395833, + "grad_norm": 0.2009342461824417, + "learning_rate": 6.920821623535989e-05, + "loss": 1.1544, + "step": 6196 + }, + { + "epoch": 0.37651133118658486, + "grad_norm": 0.1431100219488144, + "learning_rate": 6.919937767961322e-05, + "loss": 1.0564, + "step": 6197 + }, + { + "epoch": 0.3765720882192114, + "grad_norm": 0.18452730774879456, + "learning_rate": 6.919053842010964e-05, + "loss": 1.1519, + "step": 6198 + }, + { + "epoch": 0.3766328452518379, + "grad_norm": 0.31846579909324646, + "learning_rate": 6.918169845717318e-05, + "loss": 1.1067, + "step": 6199 + }, + { + "epoch": 0.3766936022844644, + "grad_norm": 0.1929236501455307, + "learning_rate": 6.917285779112784e-05, + "loss": 1.0778, + "step": 6200 + }, + { + "epoch": 0.37675435931709095, + "grad_norm": 0.4140916168689728, + "learning_rate": 6.916401642229769e-05, + "loss": 1.0852, + "step": 6201 + }, + { + "epoch": 0.3768151163497175, + "grad_norm": 0.17804791033267975, + "learning_rate": 6.915517435100683e-05, + "loss": 1.1071, + "step": 6202 + }, + { + "epoch": 0.376875873382344, + "grad_norm": 0.15515582263469696, + "learning_rate": 6.914633157757932e-05, + "loss": 1.1028, + "step": 6203 + }, + { + "epoch": 0.37693663041497055, + "grad_norm": 0.2886927127838135, + "learning_rate": 6.913748810233935e-05, + "loss": 1.0972, + "step": 6204 + }, + { + "epoch": 0.37699738744759703, + "grad_norm": 0.17595957219600677, + "learning_rate": 6.912864392561105e-05, + "loss": 1.1183, + "step": 6205 + }, + { + "epoch": 0.37705814448022357, + "grad_norm": 1.979278802871704, + "learning_rate": 6.911979904771862e-05, + "loss": 1.1344, + "step": 6206 + }, + { + "epoch": 0.3771189015128501, + "grad_norm": 0.22587543725967407, + "learning_rate": 6.911095346898624e-05, + "loss": 1.0265, + "step": 6207 + }, + { + "epoch": 0.37717965854547664, + "grad_norm": 0.22240546345710754, + "learning_rate": 6.910210718973818e-05, + "loss": 1.2302, + "step": 6208 + }, + { + "epoch": 0.37724041557810317, + "grad_norm": 0.14650443196296692, + "learning_rate": 6.90932602102987e-05, + "loss": 1.1186, + "step": 6209 + }, + { + "epoch": 0.3773011726107297, + "grad_norm": 0.1838058978319168, + "learning_rate": 6.908441253099205e-05, + "loss": 1.142, + "step": 6210 + }, + { + "epoch": 0.37736192964335624, + "grad_norm": 0.23207156360149384, + "learning_rate": 6.907556415214258e-05, + "loss": 1.0898, + "step": 6211 + }, + { + "epoch": 0.3774226866759827, + "grad_norm": 0.14599956572055817, + "learning_rate": 6.906671507407463e-05, + "loss": 1.0784, + "step": 6212 + }, + { + "epoch": 0.37748344370860926, + "grad_norm": 0.23296137154102325, + "learning_rate": 6.905786529711254e-05, + "loss": 1.0722, + "step": 6213 + }, + { + "epoch": 0.3775442007412358, + "grad_norm": 0.19169026613235474, + "learning_rate": 6.90490148215807e-05, + "loss": 1.2047, + "step": 6214 + }, + { + "epoch": 0.3776049577738623, + "grad_norm": 0.27794721722602844, + "learning_rate": 6.904016364780358e-05, + "loss": 1.0987, + "step": 6215 + }, + { + "epoch": 0.37766571480648886, + "grad_norm": 0.31110909581184387, + "learning_rate": 6.903131177610554e-05, + "loss": 1.0693, + "step": 6216 + }, + { + "epoch": 0.3777264718391154, + "grad_norm": 0.19506366550922394, + "learning_rate": 6.902245920681112e-05, + "loss": 1.0664, + "step": 6217 + }, + { + "epoch": 0.37778722887174193, + "grad_norm": 1.2798144817352295, + "learning_rate": 6.901360594024477e-05, + "loss": 1.1754, + "step": 6218 + }, + { + "epoch": 0.3778479859043684, + "grad_norm": 0.19944436848163605, + "learning_rate": 6.900475197673101e-05, + "loss": 1.09, + "step": 6219 + }, + { + "epoch": 0.37790874293699495, + "grad_norm": 0.1938268095254898, + "learning_rate": 6.89958973165944e-05, + "loss": 1.1191, + "step": 6220 + }, + { + "epoch": 0.3779694999696215, + "grad_norm": 0.1337699145078659, + "learning_rate": 6.898704196015949e-05, + "loss": 1.0946, + "step": 6221 + }, + { + "epoch": 0.378030257002248, + "grad_norm": 0.1706361025571823, + "learning_rate": 6.897818590775092e-05, + "loss": 1.1659, + "step": 6222 + }, + { + "epoch": 0.37809101403487455, + "grad_norm": 0.20918989181518555, + "learning_rate": 6.896932915969325e-05, + "loss": 1.1382, + "step": 6223 + }, + { + "epoch": 0.3781517710675011, + "grad_norm": 1.891122579574585, + "learning_rate": 6.896047171631114e-05, + "loss": 1.2807, + "step": 6224 + }, + { + "epoch": 0.37821252810012757, + "grad_norm": 0.20246365666389465, + "learning_rate": 6.895161357792931e-05, + "loss": 1.1135, + "step": 6225 + }, + { + "epoch": 0.3782732851327541, + "grad_norm": 0.16818103194236755, + "learning_rate": 6.89427547448724e-05, + "loss": 1.0715, + "step": 6226 + }, + { + "epoch": 0.37833404216538064, + "grad_norm": 0.19742870330810547, + "learning_rate": 6.893389521746515e-05, + "loss": 1.0905, + "step": 6227 + }, + { + "epoch": 0.37839479919800717, + "grad_norm": 0.14059999585151672, + "learning_rate": 6.892503499603233e-05, + "loss": 1.047, + "step": 6228 + }, + { + "epoch": 0.3784555562306337, + "grad_norm": 0.26940542459487915, + "learning_rate": 6.891617408089866e-05, + "loss": 1.1264, + "step": 6229 + }, + { + "epoch": 0.37851631326326024, + "grad_norm": 0.1259632408618927, + "learning_rate": 6.8907312472389e-05, + "loss": 1.0456, + "step": 6230 + }, + { + "epoch": 0.3785770702958868, + "grad_norm": 0.17143326997756958, + "learning_rate": 6.889845017082815e-05, + "loss": 1.1082, + "step": 6231 + }, + { + "epoch": 0.37863782732851325, + "grad_norm": 0.17943157255649567, + "learning_rate": 6.888958717654095e-05, + "loss": 1.0741, + "step": 6232 + }, + { + "epoch": 0.3786985843611398, + "grad_norm": 0.22239373624324799, + "learning_rate": 6.888072348985227e-05, + "loss": 1.0426, + "step": 6233 + }, + { + "epoch": 0.3787593413937663, + "grad_norm": 0.19490393996238708, + "learning_rate": 6.887185911108702e-05, + "loss": 1.098, + "step": 6234 + }, + { + "epoch": 0.37882009842639286, + "grad_norm": 0.15172097086906433, + "learning_rate": 6.886299404057015e-05, + "loss": 1.0932, + "step": 6235 + }, + { + "epoch": 0.3788808554590194, + "grad_norm": 0.2695910334587097, + "learning_rate": 6.885412827862657e-05, + "loss": 1.1831, + "step": 6236 + }, + { + "epoch": 0.37894161249164593, + "grad_norm": 0.17541678249835968, + "learning_rate": 6.884526182558126e-05, + "loss": 1.157, + "step": 6237 + }, + { + "epoch": 0.3790023695242724, + "grad_norm": 0.23607981204986572, + "learning_rate": 6.883639468175927e-05, + "loss": 1.0669, + "step": 6238 + }, + { + "epoch": 0.37906312655689894, + "grad_norm": 0.14976032078266144, + "learning_rate": 6.882752684748556e-05, + "loss": 1.1239, + "step": 6239 + }, + { + "epoch": 0.3791238835895255, + "grad_norm": 3.4635632038116455, + "learning_rate": 6.881865832308523e-05, + "loss": 1.1176, + "step": 6240 + }, + { + "epoch": 0.379184640622152, + "grad_norm": 0.20082558691501617, + "learning_rate": 6.880978910888334e-05, + "loss": 1.1138, + "step": 6241 + }, + { + "epoch": 0.37924539765477855, + "grad_norm": 0.43915224075317383, + "learning_rate": 6.8800919205205e-05, + "loss": 1.1007, + "step": 6242 + }, + { + "epoch": 0.3793061546874051, + "grad_norm": 0.17540255188941956, + "learning_rate": 6.879204861237532e-05, + "loss": 1.0648, + "step": 6243 + }, + { + "epoch": 0.3793669117200316, + "grad_norm": 0.3072577714920044, + "learning_rate": 6.878317733071947e-05, + "loss": 1.0377, + "step": 6244 + }, + { + "epoch": 0.3794276687526581, + "grad_norm": 0.13999001681804657, + "learning_rate": 6.877430536056263e-05, + "loss": 1.0418, + "step": 6245 + }, + { + "epoch": 0.37948842578528463, + "grad_norm": 0.263920396566391, + "learning_rate": 6.876543270222999e-05, + "loss": 1.1796, + "step": 6246 + }, + { + "epoch": 0.37954918281791117, + "grad_norm": 0.212153822183609, + "learning_rate": 6.875655935604678e-05, + "loss": 1.0772, + "step": 6247 + }, + { + "epoch": 0.3796099398505377, + "grad_norm": 2.7500085830688477, + "learning_rate": 6.874768532233828e-05, + "loss": 1.064, + "step": 6248 + }, + { + "epoch": 0.37967069688316424, + "grad_norm": 0.21294887363910675, + "learning_rate": 6.873881060142973e-05, + "loss": 1.1994, + "step": 6249 + }, + { + "epoch": 0.3797314539157908, + "grad_norm": 0.2289019525051117, + "learning_rate": 6.872993519364648e-05, + "loss": 1.0736, + "step": 6250 + }, + { + "epoch": 0.37979221094841725, + "grad_norm": 0.18465881049633026, + "learning_rate": 6.872105909931382e-05, + "loss": 1.0748, + "step": 6251 + }, + { + "epoch": 0.3798529679810438, + "grad_norm": 0.12557293474674225, + "learning_rate": 6.871218231875713e-05, + "loss": 1.0902, + "step": 6252 + }, + { + "epoch": 0.3799137250136703, + "grad_norm": 0.28713440895080566, + "learning_rate": 6.870330485230176e-05, + "loss": 1.1982, + "step": 6253 + }, + { + "epoch": 0.37997448204629686, + "grad_norm": 0.2273520678281784, + "learning_rate": 6.869442670027315e-05, + "loss": 1.0376, + "step": 6254 + }, + { + "epoch": 0.3800352390789234, + "grad_norm": 0.21783767640590668, + "learning_rate": 6.868554786299672e-05, + "loss": 1.1128, + "step": 6255 + }, + { + "epoch": 0.3800959961115499, + "grad_norm": 0.2760343551635742, + "learning_rate": 6.867666834079791e-05, + "loss": 1.04, + "step": 6256 + }, + { + "epoch": 0.38015675314417646, + "grad_norm": 0.3612003028392792, + "learning_rate": 6.86677881340022e-05, + "loss": 1.0864, + "step": 6257 + }, + { + "epoch": 0.38021751017680294, + "grad_norm": 0.5285005569458008, + "learning_rate": 6.865890724293515e-05, + "loss": 1.0548, + "step": 6258 + }, + { + "epoch": 0.3802782672094295, + "grad_norm": 0.22248300909996033, + "learning_rate": 6.865002566792223e-05, + "loss": 1.133, + "step": 6259 + }, + { + "epoch": 0.380339024242056, + "grad_norm": 0.1370825320482254, + "learning_rate": 6.864114340928898e-05, + "loss": 1.0439, + "step": 6260 + }, + { + "epoch": 0.38039978127468255, + "grad_norm": 0.42779216170310974, + "learning_rate": 6.863226046736105e-05, + "loss": 1.0953, + "step": 6261 + }, + { + "epoch": 0.3804605383073091, + "grad_norm": 0.16123263537883759, + "learning_rate": 6.862337684246401e-05, + "loss": 1.0761, + "step": 6262 + }, + { + "epoch": 0.3805212953399356, + "grad_norm": 0.15944361686706543, + "learning_rate": 6.861449253492348e-05, + "loss": 1.1203, + "step": 6263 + }, + { + "epoch": 0.38058205237256215, + "grad_norm": 0.13039962947368622, + "learning_rate": 6.860560754506513e-05, + "loss": 1.0614, + "step": 6264 + }, + { + "epoch": 0.38064280940518863, + "grad_norm": 0.32936879992485046, + "learning_rate": 6.859672187321464e-05, + "loss": 1.1507, + "step": 6265 + }, + { + "epoch": 0.38070356643781517, + "grad_norm": 0.17140968143939972, + "learning_rate": 6.858783551969773e-05, + "loss": 1.0751, + "step": 6266 + }, + { + "epoch": 0.3807643234704417, + "grad_norm": 0.19645406305789948, + "learning_rate": 6.85789484848401e-05, + "loss": 1.1237, + "step": 6267 + }, + { + "epoch": 0.38082508050306824, + "grad_norm": 0.14797905087471008, + "learning_rate": 6.857006076896751e-05, + "loss": 1.082, + "step": 6268 + }, + { + "epoch": 0.38088583753569477, + "grad_norm": 0.34342846274375916, + "learning_rate": 6.856117237240577e-05, + "loss": 1.2068, + "step": 6269 + }, + { + "epoch": 0.3809465945683213, + "grad_norm": 0.1380472481250763, + "learning_rate": 6.855228329548067e-05, + "loss": 1.0892, + "step": 6270 + }, + { + "epoch": 0.3810073516009478, + "grad_norm": 0.48161420226097107, + "learning_rate": 6.854339353851804e-05, + "loss": 1.2117, + "step": 6271 + }, + { + "epoch": 0.3810681086335743, + "grad_norm": 0.11848028004169464, + "learning_rate": 6.853450310184375e-05, + "loss": 1.0409, + "step": 6272 + }, + { + "epoch": 0.38112886566620086, + "grad_norm": 0.24575193226337433, + "learning_rate": 6.852561198578364e-05, + "loss": 1.0887, + "step": 6273 + }, + { + "epoch": 0.3811896226988274, + "grad_norm": 0.24208222329616547, + "learning_rate": 6.851672019066364e-05, + "loss": 1.2321, + "step": 6274 + }, + { + "epoch": 0.3812503797314539, + "grad_norm": 0.18586929142475128, + "learning_rate": 6.85078277168097e-05, + "loss": 1.0546, + "step": 6275 + }, + { + "epoch": 0.38131113676408046, + "grad_norm": 0.1334799975156784, + "learning_rate": 6.849893456454775e-05, + "loss": 1.0723, + "step": 6276 + }, + { + "epoch": 0.381371893796707, + "grad_norm": 0.15104424953460693, + "learning_rate": 6.849004073420377e-05, + "loss": 1.0679, + "step": 6277 + }, + { + "epoch": 0.3814326508293335, + "grad_norm": 0.24542753398418427, + "learning_rate": 6.848114622610377e-05, + "loss": 1.2248, + "step": 6278 + }, + { + "epoch": 0.38149340786196, + "grad_norm": 0.161895751953125, + "learning_rate": 6.847225104057382e-05, + "loss": 1.0902, + "step": 6279 + }, + { + "epoch": 0.38155416489458654, + "grad_norm": 0.19862933456897736, + "learning_rate": 6.84633551779399e-05, + "loss": 1.1896, + "step": 6280 + }, + { + "epoch": 0.3816149219272131, + "grad_norm": 0.21482138335704803, + "learning_rate": 6.845445863852814e-05, + "loss": 1.1489, + "step": 6281 + }, + { + "epoch": 0.3816756789598396, + "grad_norm": 0.1661219298839569, + "learning_rate": 6.844556142266464e-05, + "loss": 1.0921, + "step": 6282 + }, + { + "epoch": 0.38173643599246615, + "grad_norm": 0.12064141780138016, + "learning_rate": 6.843666353067549e-05, + "loss": 1.0664, + "step": 6283 + }, + { + "epoch": 0.38179719302509263, + "grad_norm": 2.6466097831726074, + "learning_rate": 6.84277649628869e-05, + "loss": 1.1072, + "step": 6284 + }, + { + "epoch": 0.38185795005771916, + "grad_norm": 0.13032706081867218, + "learning_rate": 6.841886571962501e-05, + "loss": 1.0948, + "step": 6285 + }, + { + "epoch": 0.3819187070903457, + "grad_norm": 0.12376551330089569, + "learning_rate": 6.840996580121606e-05, + "loss": 1.0807, + "step": 6286 + }, + { + "epoch": 0.38197946412297223, + "grad_norm": 0.43306705355644226, + "learning_rate": 6.840106520798624e-05, + "loss": 1.0655, + "step": 6287 + }, + { + "epoch": 0.38204022115559877, + "grad_norm": 0.458124577999115, + "learning_rate": 6.839216394026182e-05, + "loss": 1.0614, + "step": 6288 + }, + { + "epoch": 0.3821009781882253, + "grad_norm": 0.1400168240070343, + "learning_rate": 6.838326199836906e-05, + "loss": 1.093, + "step": 6289 + }, + { + "epoch": 0.38216173522085184, + "grad_norm": 0.3852221965789795, + "learning_rate": 6.83743593826343e-05, + "loss": 1.0747, + "step": 6290 + }, + { + "epoch": 0.3822224922534783, + "grad_norm": 0.17086315155029297, + "learning_rate": 6.836545609338383e-05, + "loss": 1.0572, + "step": 6291 + }, + { + "epoch": 0.38228324928610485, + "grad_norm": 0.31543099880218506, + "learning_rate": 6.835655213094403e-05, + "loss": 1.1819, + "step": 6292 + }, + { + "epoch": 0.3823440063187314, + "grad_norm": 0.1580762267112732, + "learning_rate": 6.834764749564125e-05, + "loss": 1.1262, + "step": 6293 + }, + { + "epoch": 0.3824047633513579, + "grad_norm": 0.15375079214572906, + "learning_rate": 6.833874218780192e-05, + "loss": 1.0844, + "step": 6294 + }, + { + "epoch": 0.38246552038398446, + "grad_norm": 0.1681925356388092, + "learning_rate": 6.832983620775244e-05, + "loss": 1.1153, + "step": 6295 + }, + { + "epoch": 0.382526277416611, + "grad_norm": 0.13680367171764374, + "learning_rate": 6.832092955581926e-05, + "loss": 1.0719, + "step": 6296 + }, + { + "epoch": 0.3825870344492375, + "grad_norm": 3.1433639526367188, + "learning_rate": 6.831202223232889e-05, + "loss": 1.0424, + "step": 6297 + }, + { + "epoch": 0.382647791481864, + "grad_norm": 0.22310005128383636, + "learning_rate": 6.830311423760779e-05, + "loss": 1.0496, + "step": 6298 + }, + { + "epoch": 0.38270854851449054, + "grad_norm": 0.13245375454425812, + "learning_rate": 6.829420557198251e-05, + "loss": 1.0397, + "step": 6299 + }, + { + "epoch": 0.3827693055471171, + "grad_norm": 0.3128804862499237, + "learning_rate": 6.828529623577957e-05, + "loss": 1.1724, + "step": 6300 + }, + { + "epoch": 0.3828300625797436, + "grad_norm": 0.15887482464313507, + "learning_rate": 6.827638622932558e-05, + "loss": 1.0794, + "step": 6301 + }, + { + "epoch": 0.38289081961237015, + "grad_norm": 0.1257995218038559, + "learning_rate": 6.826747555294711e-05, + "loss": 1.0546, + "step": 6302 + }, + { + "epoch": 0.3829515766449967, + "grad_norm": 0.23607231676578522, + "learning_rate": 6.825856420697081e-05, + "loss": 1.1403, + "step": 6303 + }, + { + "epoch": 0.38301233367762316, + "grad_norm": 0.14890529215335846, + "learning_rate": 6.824965219172328e-05, + "loss": 1.0259, + "step": 6304 + }, + { + "epoch": 0.3830730907102497, + "grad_norm": 0.5186467170715332, + "learning_rate": 6.824073950753124e-05, + "loss": 1.0646, + "step": 6305 + }, + { + "epoch": 0.38313384774287623, + "grad_norm": 0.1965055614709854, + "learning_rate": 6.823182615472137e-05, + "loss": 1.067, + "step": 6306 + }, + { + "epoch": 0.38319460477550277, + "grad_norm": 0.3759694993495941, + "learning_rate": 6.822291213362038e-05, + "loss": 1.1792, + "step": 6307 + }, + { + "epoch": 0.3832553618081293, + "grad_norm": 0.4658016264438629, + "learning_rate": 6.821399744455503e-05, + "loss": 1.0957, + "step": 6308 + }, + { + "epoch": 0.38331611884075584, + "grad_norm": 0.16722477972507477, + "learning_rate": 6.820508208785207e-05, + "loss": 1.0696, + "step": 6309 + }, + { + "epoch": 0.3833768758733823, + "grad_norm": 0.14305900037288666, + "learning_rate": 6.819616606383832e-05, + "loss": 1.0703, + "step": 6310 + }, + { + "epoch": 0.38343763290600885, + "grad_norm": 0.24095800518989563, + "learning_rate": 6.818724937284059e-05, + "loss": 1.0699, + "step": 6311 + }, + { + "epoch": 0.3834983899386354, + "grad_norm": 0.17182044684886932, + "learning_rate": 6.817833201518571e-05, + "loss": 1.0756, + "step": 6312 + }, + { + "epoch": 0.3835591469712619, + "grad_norm": 0.1402570754289627, + "learning_rate": 6.816941399120056e-05, + "loss": 1.0917, + "step": 6313 + }, + { + "epoch": 0.38361990400388846, + "grad_norm": 0.21883484721183777, + "learning_rate": 6.816049530121202e-05, + "loss": 1.1313, + "step": 6314 + }, + { + "epoch": 0.383680661036515, + "grad_norm": 0.4477614462375641, + "learning_rate": 6.815157594554702e-05, + "loss": 1.0917, + "step": 6315 + }, + { + "epoch": 0.3837414180691415, + "grad_norm": 0.1858288198709488, + "learning_rate": 6.81426559245325e-05, + "loss": 1.124, + "step": 6316 + }, + { + "epoch": 0.383802175101768, + "grad_norm": 0.22513240575790405, + "learning_rate": 6.81337352384954e-05, + "loss": 1.1854, + "step": 6317 + }, + { + "epoch": 0.38386293213439454, + "grad_norm": 0.2168305367231369, + "learning_rate": 6.812481388776275e-05, + "loss": 1.0872, + "step": 6318 + }, + { + "epoch": 0.3839236891670211, + "grad_norm": 0.17464782297611237, + "learning_rate": 6.811589187266155e-05, + "loss": 1.1634, + "step": 6319 + }, + { + "epoch": 0.3839844461996476, + "grad_norm": 0.14811238646507263, + "learning_rate": 6.81069691935188e-05, + "loss": 1.0339, + "step": 6320 + }, + { + "epoch": 0.38404520323227415, + "grad_norm": 0.22146408259868622, + "learning_rate": 6.809804585066162e-05, + "loss": 1.2309, + "step": 6321 + }, + { + "epoch": 0.3841059602649007, + "grad_norm": 0.12117231637239456, + "learning_rate": 6.808912184441707e-05, + "loss": 1.0786, + "step": 6322 + }, + { + "epoch": 0.3841667172975272, + "grad_norm": 0.21472381055355072, + "learning_rate": 6.808019717511226e-05, + "loss": 1.0601, + "step": 6323 + }, + { + "epoch": 0.3842274743301537, + "grad_norm": 0.12870797514915466, + "learning_rate": 6.807127184307433e-05, + "loss": 1.0582, + "step": 6324 + }, + { + "epoch": 0.38428823136278023, + "grad_norm": 0.18556331098079681, + "learning_rate": 6.806234584863043e-05, + "loss": 1.0549, + "step": 6325 + }, + { + "epoch": 0.38434898839540677, + "grad_norm": 0.1558101624250412, + "learning_rate": 6.805341919210775e-05, + "loss": 1.0464, + "step": 6326 + }, + { + "epoch": 0.3844097454280333, + "grad_norm": 0.1820998340845108, + "learning_rate": 6.80444918738335e-05, + "loss": 1.1543, + "step": 6327 + }, + { + "epoch": 0.38447050246065984, + "grad_norm": 0.15699411928653717, + "learning_rate": 6.803556389413491e-05, + "loss": 1.0545, + "step": 6328 + }, + { + "epoch": 0.38453125949328637, + "grad_norm": 0.17958933115005493, + "learning_rate": 6.802663525333926e-05, + "loss": 1.103, + "step": 6329 + }, + { + "epoch": 0.38459201652591285, + "grad_norm": 0.11888808757066727, + "learning_rate": 6.801770595177379e-05, + "loss": 1.0349, + "step": 6330 + }, + { + "epoch": 0.3846527735585394, + "grad_norm": 0.2576075494289398, + "learning_rate": 6.800877598976583e-05, + "loss": 1.182, + "step": 6331 + }, + { + "epoch": 0.3847135305911659, + "grad_norm": 0.228237584233284, + "learning_rate": 6.79998453676427e-05, + "loss": 1.1383, + "step": 6332 + }, + { + "epoch": 0.38477428762379245, + "grad_norm": 0.1752501130104065, + "learning_rate": 6.799091408573177e-05, + "loss": 1.1117, + "step": 6333 + }, + { + "epoch": 0.384835044656419, + "grad_norm": 0.18376250565052032, + "learning_rate": 6.79819821443604e-05, + "loss": 1.1039, + "step": 6334 + }, + { + "epoch": 0.3848958016890455, + "grad_norm": 0.15483279526233673, + "learning_rate": 6.7973049543856e-05, + "loss": 1.0891, + "step": 6335 + }, + { + "epoch": 0.38495655872167206, + "grad_norm": 0.26706215739250183, + "learning_rate": 6.7964116284546e-05, + "loss": 1.0494, + "step": 6336 + }, + { + "epoch": 0.38501731575429854, + "grad_norm": 0.20220930874347687, + "learning_rate": 6.795518236675784e-05, + "loss": 1.205, + "step": 6337 + }, + { + "epoch": 0.3850780727869251, + "grad_norm": 0.4418732523918152, + "learning_rate": 6.7946247790819e-05, + "loss": 1.0601, + "step": 6338 + }, + { + "epoch": 0.3851388298195516, + "grad_norm": 0.16609244048595428, + "learning_rate": 6.793731255705699e-05, + "loss": 1.0968, + "step": 6339 + }, + { + "epoch": 0.38519958685217814, + "grad_norm": 0.2317761927843094, + "learning_rate": 6.79283766657993e-05, + "loss": 1.0485, + "step": 6340 + }, + { + "epoch": 0.3852603438848047, + "grad_norm": 0.16373886168003082, + "learning_rate": 6.79194401173735e-05, + "loss": 1.0558, + "step": 6341 + }, + { + "epoch": 0.3853211009174312, + "grad_norm": 0.18133790791034698, + "learning_rate": 6.791050291210716e-05, + "loss": 1.0689, + "step": 6342 + }, + { + "epoch": 0.3853818579500577, + "grad_norm": 0.39687323570251465, + "learning_rate": 6.79015650503279e-05, + "loss": 1.1413, + "step": 6343 + }, + { + "epoch": 0.38544261498268423, + "grad_norm": 0.1871715635061264, + "learning_rate": 6.78926265323633e-05, + "loss": 1.0097, + "step": 6344 + }, + { + "epoch": 0.38550337201531076, + "grad_norm": 0.18519176542758942, + "learning_rate": 6.788368735854099e-05, + "loss": 1.0522, + "step": 6345 + }, + { + "epoch": 0.3855641290479373, + "grad_norm": 0.13306421041488647, + "learning_rate": 6.787474752918869e-05, + "loss": 1.0624, + "step": 6346 + }, + { + "epoch": 0.38562488608056383, + "grad_norm": 0.4891233444213867, + "learning_rate": 6.786580704463406e-05, + "loss": 1.237, + "step": 6347 + }, + { + "epoch": 0.38568564311319037, + "grad_norm": 0.1669224500656128, + "learning_rate": 6.785686590520481e-05, + "loss": 1.1104, + "step": 6348 + }, + { + "epoch": 0.3857464001458169, + "grad_norm": 0.19904066622257233, + "learning_rate": 6.78479241112287e-05, + "loss": 1.2674, + "step": 6349 + }, + { + "epoch": 0.3858071571784434, + "grad_norm": 0.28875163197517395, + "learning_rate": 6.783898166303346e-05, + "loss": 1.1573, + "step": 6350 + }, + { + "epoch": 0.3858679142110699, + "grad_norm": 1.2536693811416626, + "learning_rate": 6.783003856094692e-05, + "loss": 1.0715, + "step": 6351 + }, + { + "epoch": 0.38592867124369645, + "grad_norm": 3.2727980613708496, + "learning_rate": 6.782109480529689e-05, + "loss": 1.09, + "step": 6352 + }, + { + "epoch": 0.385989428276323, + "grad_norm": 0.23191440105438232, + "learning_rate": 6.781215039641114e-05, + "loss": 1.0536, + "step": 6353 + }, + { + "epoch": 0.3860501853089495, + "grad_norm": 0.16875076293945312, + "learning_rate": 6.78032053346176e-05, + "loss": 1.065, + "step": 6354 + }, + { + "epoch": 0.38611094234157606, + "grad_norm": 0.2235843539237976, + "learning_rate": 6.779425962024411e-05, + "loss": 1.2331, + "step": 6355 + }, + { + "epoch": 0.38617169937420254, + "grad_norm": 0.1509760618209839, + "learning_rate": 6.77853132536186e-05, + "loss": 1.0639, + "step": 6356 + }, + { + "epoch": 0.38623245640682907, + "grad_norm": 0.21987676620483398, + "learning_rate": 6.777636623506901e-05, + "loss": 1.0701, + "step": 6357 + }, + { + "epoch": 0.3862932134394556, + "grad_norm": 0.3164826035499573, + "learning_rate": 6.776741856492326e-05, + "loss": 1.1628, + "step": 6358 + }, + { + "epoch": 0.38635397047208214, + "grad_norm": 0.32382747530937195, + "learning_rate": 6.775847024350936e-05, + "loss": 1.1056, + "step": 6359 + }, + { + "epoch": 0.3864147275047087, + "grad_norm": 0.47914010286331177, + "learning_rate": 6.77495212711553e-05, + "loss": 1.1833, + "step": 6360 + }, + { + "epoch": 0.3864754845373352, + "grad_norm": 5.246538162231445, + "learning_rate": 6.77405716481891e-05, + "loss": 1.0786, + "step": 6361 + }, + { + "epoch": 0.38653624156996175, + "grad_norm": 0.452112078666687, + "learning_rate": 6.773162137493884e-05, + "loss": 1.0889, + "step": 6362 + }, + { + "epoch": 0.3865969986025882, + "grad_norm": 0.149632528424263, + "learning_rate": 6.772267045173254e-05, + "loss": 1.0516, + "step": 6363 + }, + { + "epoch": 0.38665775563521476, + "grad_norm": 0.16905269026756287, + "learning_rate": 6.771371887889835e-05, + "loss": 1.0975, + "step": 6364 + }, + { + "epoch": 0.3867185126678413, + "grad_norm": 0.16825300455093384, + "learning_rate": 6.770476665676435e-05, + "loss": 1.0974, + "step": 6365 + }, + { + "epoch": 0.38677926970046783, + "grad_norm": 0.18294303119182587, + "learning_rate": 6.769581378565873e-05, + "loss": 1.1711, + "step": 6366 + }, + { + "epoch": 0.38684002673309437, + "grad_norm": 0.14837855100631714, + "learning_rate": 6.768686026590963e-05, + "loss": 1.0737, + "step": 6367 + }, + { + "epoch": 0.3869007837657209, + "grad_norm": 0.23875705897808075, + "learning_rate": 6.767790609784523e-05, + "loss": 1.1615, + "step": 6368 + }, + { + "epoch": 0.38696154079834744, + "grad_norm": 3.9483509063720703, + "learning_rate": 6.766895128179378e-05, + "loss": 1.0591, + "step": 6369 + }, + { + "epoch": 0.3870222978309739, + "grad_norm": 0.16219587624073029, + "learning_rate": 6.765999581808351e-05, + "loss": 1.065, + "step": 6370 + }, + { + "epoch": 0.38708305486360045, + "grad_norm": 0.875992476940155, + "learning_rate": 6.765103970704266e-05, + "loss": 1.1725, + "step": 6371 + }, + { + "epoch": 0.387143811896227, + "grad_norm": 1.1835606098175049, + "learning_rate": 6.764208294899956e-05, + "loss": 1.068, + "step": 6372 + }, + { + "epoch": 0.3872045689288535, + "grad_norm": 0.43274930119514465, + "learning_rate": 6.76331255442825e-05, + "loss": 1.0657, + "step": 6373 + }, + { + "epoch": 0.38726532596148006, + "grad_norm": 0.13690155744552612, + "learning_rate": 6.762416749321978e-05, + "loss": 1.0966, + "step": 6374 + }, + { + "epoch": 0.3873260829941066, + "grad_norm": 0.16002944111824036, + "learning_rate": 6.761520879613982e-05, + "loss": 1.1208, + "step": 6375 + }, + { + "epoch": 0.38738684002673307, + "grad_norm": 0.2330576479434967, + "learning_rate": 6.760624945337097e-05, + "loss": 1.1549, + "step": 6376 + }, + { + "epoch": 0.3874475970593596, + "grad_norm": 0.12631060183048248, + "learning_rate": 6.759728946524165e-05, + "loss": 1.0978, + "step": 6377 + }, + { + "epoch": 0.38750835409198614, + "grad_norm": 0.1642490178346634, + "learning_rate": 6.758832883208029e-05, + "loss": 1.0364, + "step": 6378 + }, + { + "epoch": 0.3875691111246127, + "grad_norm": 0.18020714819431305, + "learning_rate": 6.757936755421532e-05, + "loss": 1.1049, + "step": 6379 + }, + { + "epoch": 0.3876298681572392, + "grad_norm": 0.14700233936309814, + "learning_rate": 6.757040563197525e-05, + "loss": 1.0716, + "step": 6380 + }, + { + "epoch": 0.38769062518986575, + "grad_norm": 0.26052945852279663, + "learning_rate": 6.756144306568856e-05, + "loss": 1.2431, + "step": 6381 + }, + { + "epoch": 0.3877513822224923, + "grad_norm": 0.20602093636989594, + "learning_rate": 6.755247985568379e-05, + "loss": 1.0884, + "step": 6382 + }, + { + "epoch": 0.38781213925511876, + "grad_norm": 0.16676735877990723, + "learning_rate": 6.754351600228945e-05, + "loss": 1.1352, + "step": 6383 + }, + { + "epoch": 0.3878728962877453, + "grad_norm": 0.24783259630203247, + "learning_rate": 6.753455150583415e-05, + "loss": 1.1362, + "step": 6384 + }, + { + "epoch": 0.38793365332037183, + "grad_norm": 0.17209668457508087, + "learning_rate": 6.752558636664648e-05, + "loss": 1.1415, + "step": 6385 + }, + { + "epoch": 0.38799441035299836, + "grad_norm": 28.459001541137695, + "learning_rate": 6.751662058505506e-05, + "loss": 1.0554, + "step": 6386 + }, + { + "epoch": 0.3880551673856249, + "grad_norm": 0.2686154246330261, + "learning_rate": 6.750765416138851e-05, + "loss": 1.2096, + "step": 6387 + }, + { + "epoch": 0.38811592441825143, + "grad_norm": 0.46046197414398193, + "learning_rate": 6.749868709597553e-05, + "loss": 1.0291, + "step": 6388 + }, + { + "epoch": 0.3881766814508779, + "grad_norm": 0.2041414976119995, + "learning_rate": 6.748971938914479e-05, + "loss": 1.1269, + "step": 6389 + }, + { + "epoch": 0.38823743848350445, + "grad_norm": 0.2749987244606018, + "learning_rate": 6.7480751041225e-05, + "loss": 1.156, + "step": 6390 + }, + { + "epoch": 0.388298195516131, + "grad_norm": 0.26438790559768677, + "learning_rate": 6.74717820525449e-05, + "loss": 1.2141, + "step": 6391 + }, + { + "epoch": 0.3883589525487575, + "grad_norm": 0.5964327454566956, + "learning_rate": 6.746281242343325e-05, + "loss": 1.1732, + "step": 6392 + }, + { + "epoch": 0.38841970958138405, + "grad_norm": 0.18892355263233185, + "learning_rate": 6.745384215421884e-05, + "loss": 1.0508, + "step": 6393 + }, + { + "epoch": 0.3884804666140106, + "grad_norm": 0.13409344851970673, + "learning_rate": 6.744487124523047e-05, + "loss": 1.0793, + "step": 6394 + }, + { + "epoch": 0.3885412236466371, + "grad_norm": 0.2899182140827179, + "learning_rate": 6.743589969679697e-05, + "loss": 1.1475, + "step": 6395 + }, + { + "epoch": 0.3886019806792636, + "grad_norm": 0.1661696881055832, + "learning_rate": 6.742692750924722e-05, + "loss": 1.0464, + "step": 6396 + }, + { + "epoch": 0.38866273771189014, + "grad_norm": 0.1817939281463623, + "learning_rate": 6.741795468291003e-05, + "loss": 1.1267, + "step": 6397 + }, + { + "epoch": 0.3887234947445167, + "grad_norm": 0.14239276945590973, + "learning_rate": 6.740898121811437e-05, + "loss": 1.0579, + "step": 6398 + }, + { + "epoch": 0.3887842517771432, + "grad_norm": 0.177519291639328, + "learning_rate": 6.740000711518915e-05, + "loss": 1.0958, + "step": 6399 + }, + { + "epoch": 0.38884500880976974, + "grad_norm": 0.22885555028915405, + "learning_rate": 6.739103237446328e-05, + "loss": 1.1554, + "step": 6400 + }, + { + "epoch": 0.3889057658423963, + "grad_norm": 0.19650842249393463, + "learning_rate": 6.738205699626578e-05, + "loss": 1.0324, + "step": 6401 + }, + { + "epoch": 0.38896652287502276, + "grad_norm": 0.21699388325214386, + "learning_rate": 6.737308098092561e-05, + "loss": 1.1588, + "step": 6402 + }, + { + "epoch": 0.3890272799076493, + "grad_norm": 0.21354782581329346, + "learning_rate": 6.736410432877179e-05, + "loss": 1.1649, + "step": 6403 + }, + { + "epoch": 0.3890880369402758, + "grad_norm": 0.2252529114484787, + "learning_rate": 6.735512704013339e-05, + "loss": 1.1875, + "step": 6404 + }, + { + "epoch": 0.38914879397290236, + "grad_norm": 0.18823717534542084, + "learning_rate": 6.734614911533944e-05, + "loss": 1.1775, + "step": 6405 + }, + { + "epoch": 0.3892095510055289, + "grad_norm": 0.12210594862699509, + "learning_rate": 6.733717055471904e-05, + "loss": 1.051, + "step": 6406 + }, + { + "epoch": 0.38927030803815543, + "grad_norm": 0.23055042326450348, + "learning_rate": 6.732819135860129e-05, + "loss": 1.1588, + "step": 6407 + }, + { + "epoch": 0.38933106507078197, + "grad_norm": 0.1257196068763733, + "learning_rate": 6.731921152731535e-05, + "loss": 1.0678, + "step": 6408 + }, + { + "epoch": 0.38939182210340845, + "grad_norm": 0.2573351562023163, + "learning_rate": 6.731023106119038e-05, + "loss": 1.1496, + "step": 6409 + }, + { + "epoch": 0.389452579136035, + "grad_norm": 0.15273867547512054, + "learning_rate": 6.730124996055552e-05, + "loss": 1.1913, + "step": 6410 + }, + { + "epoch": 0.3895133361686615, + "grad_norm": 0.17081992328166962, + "learning_rate": 6.729226822574e-05, + "loss": 1.1167, + "step": 6411 + }, + { + "epoch": 0.38957409320128805, + "grad_norm": 0.14605049788951874, + "learning_rate": 6.728328585707306e-05, + "loss": 1.0909, + "step": 6412 + }, + { + "epoch": 0.3896348502339146, + "grad_norm": 0.19868595898151398, + "learning_rate": 6.727430285488393e-05, + "loss": 1.0877, + "step": 6413 + }, + { + "epoch": 0.3896956072665411, + "grad_norm": 0.3197658360004425, + "learning_rate": 6.726531921950189e-05, + "loss": 1.1905, + "step": 6414 + }, + { + "epoch": 0.3897563642991676, + "grad_norm": 0.11486510932445526, + "learning_rate": 6.725633495125623e-05, + "loss": 1.0434, + "step": 6415 + }, + { + "epoch": 0.38981712133179414, + "grad_norm": 0.16571666300296783, + "learning_rate": 6.724735005047629e-05, + "loss": 1.0855, + "step": 6416 + }, + { + "epoch": 0.38987787836442067, + "grad_norm": 0.278359591960907, + "learning_rate": 6.72383645174914e-05, + "loss": 1.297, + "step": 6417 + }, + { + "epoch": 0.3899386353970472, + "grad_norm": 0.4895019233226776, + "learning_rate": 6.72293783526309e-05, + "loss": 1.0789, + "step": 6418 + }, + { + "epoch": 0.38999939242967374, + "grad_norm": 0.13772952556610107, + "learning_rate": 6.722039155622424e-05, + "loss": 1.1092, + "step": 6419 + }, + { + "epoch": 0.3900601494623003, + "grad_norm": 0.21690237522125244, + "learning_rate": 6.721140412860077e-05, + "loss": 1.0926, + "step": 6420 + }, + { + "epoch": 0.3901209064949268, + "grad_norm": 0.8746984601020813, + "learning_rate": 6.720241607008997e-05, + "loss": 1.0559, + "step": 6421 + }, + { + "epoch": 0.3901816635275533, + "grad_norm": 0.13369816541671753, + "learning_rate": 6.719342738102128e-05, + "loss": 1.0433, + "step": 6422 + }, + { + "epoch": 0.3902424205601798, + "grad_norm": 0.15946729481220245, + "learning_rate": 6.71844380617242e-05, + "loss": 1.1068, + "step": 6423 + }, + { + "epoch": 0.39030317759280636, + "grad_norm": 0.2904299795627594, + "learning_rate": 6.717544811252821e-05, + "loss": 1.1724, + "step": 6424 + }, + { + "epoch": 0.3903639346254329, + "grad_norm": 0.13413147628307343, + "learning_rate": 6.716645753376284e-05, + "loss": 1.0318, + "step": 6425 + }, + { + "epoch": 0.39042469165805943, + "grad_norm": 0.1666967272758484, + "learning_rate": 6.715746632575767e-05, + "loss": 1.0444, + "step": 6426 + }, + { + "epoch": 0.39048544869068597, + "grad_norm": 0.20521396398544312, + "learning_rate": 6.714847448884225e-05, + "loss": 1.0385, + "step": 6427 + }, + { + "epoch": 0.3905462057233125, + "grad_norm": 0.33713576197624207, + "learning_rate": 6.713948202334617e-05, + "loss": 1.1237, + "step": 6428 + }, + { + "epoch": 0.390606962755939, + "grad_norm": 0.23071159422397614, + "learning_rate": 6.713048892959908e-05, + "loss": 1.1186, + "step": 6429 + }, + { + "epoch": 0.3906677197885655, + "grad_norm": 6.779727935791016, + "learning_rate": 6.71214952079306e-05, + "loss": 1.1391, + "step": 6430 + }, + { + "epoch": 0.39072847682119205, + "grad_norm": 0.3514596223831177, + "learning_rate": 6.711250085867039e-05, + "loss": 1.1629, + "step": 6431 + }, + { + "epoch": 0.3907892338538186, + "grad_norm": 0.18650712072849274, + "learning_rate": 6.710350588214817e-05, + "loss": 1.1439, + "step": 6432 + }, + { + "epoch": 0.3908499908864451, + "grad_norm": 0.4680578410625458, + "learning_rate": 6.709451027869364e-05, + "loss": 1.0795, + "step": 6433 + }, + { + "epoch": 0.39091074791907165, + "grad_norm": 0.19026999175548553, + "learning_rate": 6.708551404863653e-05, + "loss": 1.0439, + "step": 6434 + }, + { + "epoch": 0.39097150495169813, + "grad_norm": 0.2945087254047394, + "learning_rate": 6.707651719230658e-05, + "loss": 1.0689, + "step": 6435 + }, + { + "epoch": 0.39103226198432467, + "grad_norm": 0.15697064995765686, + "learning_rate": 6.70675197100336e-05, + "loss": 1.0379, + "step": 6436 + }, + { + "epoch": 0.3910930190169512, + "grad_norm": 0.527032732963562, + "learning_rate": 6.70585216021474e-05, + "loss": 1.1346, + "step": 6437 + }, + { + "epoch": 0.39115377604957774, + "grad_norm": 1.2023156881332397, + "learning_rate": 6.704952286897778e-05, + "loss": 1.0391, + "step": 6438 + }, + { + "epoch": 0.3912145330822043, + "grad_norm": 0.15741731226444244, + "learning_rate": 6.704052351085462e-05, + "loss": 1.0807, + "step": 6439 + }, + { + "epoch": 0.3912752901148308, + "grad_norm": 0.15480320155620575, + "learning_rate": 6.703152352810775e-05, + "loss": 1.0479, + "step": 6440 + }, + { + "epoch": 0.39133604714745734, + "grad_norm": 0.16746895015239716, + "learning_rate": 6.70225229210671e-05, + "loss": 1.1609, + "step": 6441 + }, + { + "epoch": 0.3913968041800838, + "grad_norm": 0.12224670499563217, + "learning_rate": 6.70135216900626e-05, + "loss": 1.0185, + "step": 6442 + }, + { + "epoch": 0.39145756121271036, + "grad_norm": 0.17947405576705933, + "learning_rate": 6.700451983542415e-05, + "loss": 1.1291, + "step": 6443 + }, + { + "epoch": 0.3915183182453369, + "grad_norm": 0.1567447930574417, + "learning_rate": 6.699551735748176e-05, + "loss": 1.0741, + "step": 6444 + }, + { + "epoch": 0.39157907527796343, + "grad_norm": 0.2524197995662689, + "learning_rate": 6.698651425656539e-05, + "loss": 1.2205, + "step": 6445 + }, + { + "epoch": 0.39163983231058996, + "grad_norm": 0.20584231615066528, + "learning_rate": 6.697751053300505e-05, + "loss": 1.138, + "step": 6446 + }, + { + "epoch": 0.3917005893432165, + "grad_norm": 0.21095509827136993, + "learning_rate": 6.696850618713079e-05, + "loss": 1.1867, + "step": 6447 + }, + { + "epoch": 0.391761346375843, + "grad_norm": 0.1479436755180359, + "learning_rate": 6.695950121927265e-05, + "loss": 1.0653, + "step": 6448 + }, + { + "epoch": 0.3918221034084695, + "grad_norm": 0.6414020657539368, + "learning_rate": 6.695049562976072e-05, + "loss": 1.2842, + "step": 6449 + }, + { + "epoch": 0.39188286044109605, + "grad_norm": 0.15622633695602417, + "learning_rate": 6.694148941892509e-05, + "loss": 1.0771, + "step": 6450 + }, + { + "epoch": 0.3919436174737226, + "grad_norm": 0.34224677085876465, + "learning_rate": 6.693248258709588e-05, + "loss": 1.1831, + "step": 6451 + }, + { + "epoch": 0.3920043745063491, + "grad_norm": 0.49429529905319214, + "learning_rate": 6.692347513460326e-05, + "loss": 1.0159, + "step": 6452 + }, + { + "epoch": 0.39206513153897565, + "grad_norm": 0.15195602178573608, + "learning_rate": 6.69144670617774e-05, + "loss": 1.0711, + "step": 6453 + }, + { + "epoch": 0.3921258885716022, + "grad_norm": 0.22439368069171906, + "learning_rate": 6.690545836894845e-05, + "loss": 1.254, + "step": 6454 + }, + { + "epoch": 0.39218664560422867, + "grad_norm": 0.1413053423166275, + "learning_rate": 6.689644905644668e-05, + "loss": 1.0396, + "step": 6455 + }, + { + "epoch": 0.3922474026368552, + "grad_norm": 0.18424078822135925, + "learning_rate": 6.688743912460229e-05, + "loss": 1.0853, + "step": 6456 + }, + { + "epoch": 0.39230815966948174, + "grad_norm": 0.20230261981487274, + "learning_rate": 6.687842857374556e-05, + "loss": 1.1808, + "step": 6457 + }, + { + "epoch": 0.3923689167021083, + "grad_norm": 0.19519254565238953, + "learning_rate": 6.686941740420678e-05, + "loss": 1.1793, + "step": 6458 + }, + { + "epoch": 0.3924296737347348, + "grad_norm": 0.179273784160614, + "learning_rate": 6.686040561631623e-05, + "loss": 1.1164, + "step": 6459 + }, + { + "epoch": 0.39249043076736134, + "grad_norm": 0.1486324667930603, + "learning_rate": 6.685139321040426e-05, + "loss": 1.1227, + "step": 6460 + }, + { + "epoch": 0.3925511877999878, + "grad_norm": 0.1647394597530365, + "learning_rate": 6.684238018680121e-05, + "loss": 1.0897, + "step": 6461 + }, + { + "epoch": 0.39261194483261436, + "grad_norm": 0.1527169793844223, + "learning_rate": 6.683336654583745e-05, + "loss": 1.0465, + "step": 6462 + }, + { + "epoch": 0.3926727018652409, + "grad_norm": 0.13533085584640503, + "learning_rate": 6.682435228784341e-05, + "loss": 1.0504, + "step": 6463 + }, + { + "epoch": 0.3927334588978674, + "grad_norm": 0.1908823698759079, + "learning_rate": 6.681533741314945e-05, + "loss": 1.1417, + "step": 6464 + }, + { + "epoch": 0.39279421593049396, + "grad_norm": 0.20578359067440033, + "learning_rate": 6.680632192208609e-05, + "loss": 1.1132, + "step": 6465 + }, + { + "epoch": 0.3928549729631205, + "grad_norm": 0.2384302318096161, + "learning_rate": 6.679730581498372e-05, + "loss": 1.1656, + "step": 6466 + }, + { + "epoch": 0.39291572999574703, + "grad_norm": 0.16097606718540192, + "learning_rate": 6.678828909217286e-05, + "loss": 1.1049, + "step": 6467 + }, + { + "epoch": 0.3929764870283735, + "grad_norm": 0.13006088137626648, + "learning_rate": 6.677927175398403e-05, + "loss": 1.0118, + "step": 6468 + }, + { + "epoch": 0.39303724406100005, + "grad_norm": 0.29412567615509033, + "learning_rate": 6.677025380074775e-05, + "loss": 1.1508, + "step": 6469 + }, + { + "epoch": 0.3930980010936266, + "grad_norm": 0.42197516560554504, + "learning_rate": 6.676123523279457e-05, + "loss": 1.0784, + "step": 6470 + }, + { + "epoch": 0.3931587581262531, + "grad_norm": 0.32488706707954407, + "learning_rate": 6.675221605045507e-05, + "loss": 1.2118, + "step": 6471 + }, + { + "epoch": 0.39321951515887965, + "grad_norm": 0.15832357108592987, + "learning_rate": 6.674319625405985e-05, + "loss": 1.0735, + "step": 6472 + }, + { + "epoch": 0.3932802721915062, + "grad_norm": 0.17760059237480164, + "learning_rate": 6.673417584393955e-05, + "loss": 1.1496, + "step": 6473 + }, + { + "epoch": 0.3933410292241327, + "grad_norm": 0.18103596568107605, + "learning_rate": 6.67251548204248e-05, + "loss": 1.089, + "step": 6474 + }, + { + "epoch": 0.3934017862567592, + "grad_norm": 0.28530701994895935, + "learning_rate": 6.671613318384624e-05, + "loss": 1.1169, + "step": 6475 + }, + { + "epoch": 0.39346254328938574, + "grad_norm": 0.16695894300937653, + "learning_rate": 6.670711093453461e-05, + "loss": 1.068, + "step": 6476 + }, + { + "epoch": 0.39352330032201227, + "grad_norm": 0.14309263229370117, + "learning_rate": 6.669808807282058e-05, + "loss": 1.0534, + "step": 6477 + }, + { + "epoch": 0.3935840573546388, + "grad_norm": 0.21720516681671143, + "learning_rate": 6.668906459903493e-05, + "loss": 1.0586, + "step": 6478 + }, + { + "epoch": 0.39364481438726534, + "grad_norm": 0.3170429468154907, + "learning_rate": 6.668004051350839e-05, + "loss": 1.2139, + "step": 6479 + }, + { + "epoch": 0.3937055714198919, + "grad_norm": 0.24288886785507202, + "learning_rate": 6.667101581657172e-05, + "loss": 1.1619, + "step": 6480 + }, + { + "epoch": 0.39376632845251835, + "grad_norm": 0.21614795923233032, + "learning_rate": 6.666199050855575e-05, + "loss": 1.0565, + "step": 6481 + }, + { + "epoch": 0.3938270854851449, + "grad_norm": 0.4317033886909485, + "learning_rate": 6.665296458979131e-05, + "loss": 1.089, + "step": 6482 + }, + { + "epoch": 0.3938878425177714, + "grad_norm": 0.15566758811473846, + "learning_rate": 6.664393806060922e-05, + "loss": 1.0807, + "step": 6483 + }, + { + "epoch": 0.39394859955039796, + "grad_norm": 0.18097607791423798, + "learning_rate": 6.663491092134038e-05, + "loss": 1.0569, + "step": 6484 + }, + { + "epoch": 0.3940093565830245, + "grad_norm": 0.15195651352405548, + "learning_rate": 6.662588317231564e-05, + "loss": 1.141, + "step": 6485 + }, + { + "epoch": 0.39407011361565103, + "grad_norm": 0.40401092171669006, + "learning_rate": 6.661685481386596e-05, + "loss": 1.1663, + "step": 6486 + }, + { + "epoch": 0.39413087064827756, + "grad_norm": 0.17111338675022125, + "learning_rate": 6.660782584632223e-05, + "loss": 1.1666, + "step": 6487 + }, + { + "epoch": 0.39419162768090404, + "grad_norm": 0.24630577862262726, + "learning_rate": 6.659879627001545e-05, + "loss": 1.1749, + "step": 6488 + }, + { + "epoch": 0.3942523847135306, + "grad_norm": 0.13845933973789215, + "learning_rate": 6.658976608527658e-05, + "loss": 1.0276, + "step": 6489 + }, + { + "epoch": 0.3943131417461571, + "grad_norm": 0.18594589829444885, + "learning_rate": 6.658073529243664e-05, + "loss": 1.1567, + "step": 6490 + }, + { + "epoch": 0.39437389877878365, + "grad_norm": 0.14208967983722687, + "learning_rate": 6.657170389182663e-05, + "loss": 1.0712, + "step": 6491 + }, + { + "epoch": 0.3944346558114102, + "grad_norm": 0.3039787709712982, + "learning_rate": 6.656267188377762e-05, + "loss": 1.059, + "step": 6492 + }, + { + "epoch": 0.3944954128440367, + "grad_norm": 0.2600259780883789, + "learning_rate": 6.655363926862066e-05, + "loss": 1.1182, + "step": 6493 + }, + { + "epoch": 0.3945561698766632, + "grad_norm": 0.17212441563606262, + "learning_rate": 6.654460604668687e-05, + "loss": 1.1503, + "step": 6494 + }, + { + "epoch": 0.39461692690928973, + "grad_norm": 0.3530206084251404, + "learning_rate": 6.653557221830732e-05, + "loss": 1.223, + "step": 6495 + }, + { + "epoch": 0.39467768394191627, + "grad_norm": 0.18086989223957062, + "learning_rate": 6.652653778381319e-05, + "loss": 1.0511, + "step": 6496 + }, + { + "epoch": 0.3947384409745428, + "grad_norm": 0.14229127764701843, + "learning_rate": 6.651750274353563e-05, + "loss": 1.0806, + "step": 6497 + }, + { + "epoch": 0.39479919800716934, + "grad_norm": 2.082559108734131, + "learning_rate": 6.650846709780581e-05, + "loss": 1.0685, + "step": 6498 + }, + { + "epoch": 0.3948599550397959, + "grad_norm": 0.15213042497634888, + "learning_rate": 6.649943084695493e-05, + "loss": 1.0686, + "step": 6499 + }, + { + "epoch": 0.3949207120724224, + "grad_norm": 0.12228461354970932, + "learning_rate": 6.649039399131423e-05, + "loss": 1.0573, + "step": 6500 + }, + { + "epoch": 0.3949814691050489, + "grad_norm": 4.972786903381348, + "learning_rate": 6.648135653121494e-05, + "loss": 1.0946, + "step": 6501 + }, + { + "epoch": 0.3950422261376754, + "grad_norm": 0.17178978025913239, + "learning_rate": 6.647231846698835e-05, + "loss": 1.0982, + "step": 6502 + }, + { + "epoch": 0.39510298317030196, + "grad_norm": 0.6011859178543091, + "learning_rate": 6.646327979896574e-05, + "loss": 1.076, + "step": 6503 + }, + { + "epoch": 0.3951637402029285, + "grad_norm": 0.24459262192249298, + "learning_rate": 6.645424052747843e-05, + "loss": 1.0556, + "step": 6504 + }, + { + "epoch": 0.39522449723555503, + "grad_norm": 0.15008269250392914, + "learning_rate": 6.644520065285775e-05, + "loss": 1.0435, + "step": 6505 + }, + { + "epoch": 0.39528525426818156, + "grad_norm": 0.20823614299297333, + "learning_rate": 6.643616017543506e-05, + "loss": 1.0401, + "step": 6506 + }, + { + "epoch": 0.39534601130080804, + "grad_norm": 0.16204649209976196, + "learning_rate": 6.642711909554174e-05, + "loss": 1.058, + "step": 6507 + }, + { + "epoch": 0.3954067683334346, + "grad_norm": 0.32570192217826843, + "learning_rate": 6.64180774135092e-05, + "loss": 1.0567, + "step": 6508 + }, + { + "epoch": 0.3954675253660611, + "grad_norm": 0.14407196640968323, + "learning_rate": 6.640903512966885e-05, + "loss": 1.0811, + "step": 6509 + }, + { + "epoch": 0.39552828239868765, + "grad_norm": 0.16951946914196014, + "learning_rate": 6.639999224435217e-05, + "loss": 1.1062, + "step": 6510 + }, + { + "epoch": 0.3955890394313142, + "grad_norm": 5.580693244934082, + "learning_rate": 6.639094875789058e-05, + "loss": 1.1061, + "step": 6511 + }, + { + "epoch": 0.3956497964639407, + "grad_norm": 0.2052382528781891, + "learning_rate": 6.638190467061559e-05, + "loss": 1.2548, + "step": 6512 + }, + { + "epoch": 0.39571055349656725, + "grad_norm": 0.17212602496147156, + "learning_rate": 6.637285998285874e-05, + "loss": 1.0813, + "step": 6513 + }, + { + "epoch": 0.39577131052919373, + "grad_norm": 0.434367299079895, + "learning_rate": 6.636381469495152e-05, + "loss": 1.0953, + "step": 6514 + }, + { + "epoch": 0.39583206756182027, + "grad_norm": 0.12933877110481262, + "learning_rate": 6.635476880722553e-05, + "loss": 1.0243, + "step": 6515 + }, + { + "epoch": 0.3958928245944468, + "grad_norm": 0.14470084011554718, + "learning_rate": 6.634572232001232e-05, + "loss": 1.0763, + "step": 6516 + }, + { + "epoch": 0.39595358162707334, + "grad_norm": 0.17749479413032532, + "learning_rate": 6.633667523364349e-05, + "loss": 1.129, + "step": 6517 + }, + { + "epoch": 0.39601433865969987, + "grad_norm": 0.19330710172653198, + "learning_rate": 6.63276275484507e-05, + "loss": 1.0798, + "step": 6518 + }, + { + "epoch": 0.3960750956923264, + "grad_norm": 0.14301753044128418, + "learning_rate": 6.631857926476552e-05, + "loss": 1.0933, + "step": 6519 + }, + { + "epoch": 0.3961358527249529, + "grad_norm": 0.23434649407863617, + "learning_rate": 6.63095303829197e-05, + "loss": 1.1547, + "step": 6520 + }, + { + "epoch": 0.3961966097575794, + "grad_norm": 0.1665913462638855, + "learning_rate": 6.630048090324488e-05, + "loss": 1.1146, + "step": 6521 + }, + { + "epoch": 0.39625736679020596, + "grad_norm": 0.15681713819503784, + "learning_rate": 6.629143082607278e-05, + "loss": 1.0566, + "step": 6522 + }, + { + "epoch": 0.3963181238228325, + "grad_norm": 0.16899584233760834, + "learning_rate": 6.628238015173514e-05, + "loss": 1.1174, + "step": 6523 + }, + { + "epoch": 0.396378880855459, + "grad_norm": 0.30240699648857117, + "learning_rate": 6.627332888056369e-05, + "loss": 1.0942, + "step": 6524 + }, + { + "epoch": 0.39643963788808556, + "grad_norm": 0.14742599427700043, + "learning_rate": 6.626427701289025e-05, + "loss": 1.1393, + "step": 6525 + }, + { + "epoch": 0.3965003949207121, + "grad_norm": 0.17558152973651886, + "learning_rate": 6.625522454904657e-05, + "loss": 1.0943, + "step": 6526 + }, + { + "epoch": 0.3965611519533386, + "grad_norm": 0.17351263761520386, + "learning_rate": 6.62461714893645e-05, + "loss": 1.0987, + "step": 6527 + }, + { + "epoch": 0.3966219089859651, + "grad_norm": 0.14238056540489197, + "learning_rate": 6.623711783417587e-05, + "loss": 1.051, + "step": 6528 + }, + { + "epoch": 0.39668266601859165, + "grad_norm": 0.24085108935832977, + "learning_rate": 6.622806358381255e-05, + "loss": 1.1409, + "step": 6529 + }, + { + "epoch": 0.3967434230512182, + "grad_norm": 2.1114413738250732, + "learning_rate": 6.621900873860645e-05, + "loss": 1.1023, + "step": 6530 + }, + { + "epoch": 0.3968041800838447, + "grad_norm": 0.23471508920192719, + "learning_rate": 6.620995329888942e-05, + "loss": 1.0999, + "step": 6531 + }, + { + "epoch": 0.39686493711647125, + "grad_norm": 8.686838150024414, + "learning_rate": 6.620089726499341e-05, + "loss": 1.1093, + "step": 6532 + }, + { + "epoch": 0.3969256941490978, + "grad_norm": 0.24466344714164734, + "learning_rate": 6.61918406372504e-05, + "loss": 1.1733, + "step": 6533 + }, + { + "epoch": 0.39698645118172426, + "grad_norm": 0.150930255651474, + "learning_rate": 6.618278341599233e-05, + "loss": 1.0391, + "step": 6534 + }, + { + "epoch": 0.3970472082143508, + "grad_norm": 0.16410765051841736, + "learning_rate": 6.61737256015512e-05, + "loss": 1.0606, + "step": 6535 + }, + { + "epoch": 0.39710796524697733, + "grad_norm": 0.16845226287841797, + "learning_rate": 6.616466719425905e-05, + "loss": 1.0905, + "step": 6536 + }, + { + "epoch": 0.39716872227960387, + "grad_norm": 0.23544728755950928, + "learning_rate": 6.61556081944479e-05, + "loss": 1.1362, + "step": 6537 + }, + { + "epoch": 0.3972294793122304, + "grad_norm": 0.12614917755126953, + "learning_rate": 6.61465486024498e-05, + "loss": 1.0817, + "step": 6538 + }, + { + "epoch": 0.39729023634485694, + "grad_norm": 0.16425821185112, + "learning_rate": 6.613748841859683e-05, + "loss": 1.1852, + "step": 6539 + }, + { + "epoch": 0.3973509933774834, + "grad_norm": 0.12663373351097107, + "learning_rate": 6.612842764322113e-05, + "loss": 1.0299, + "step": 6540 + }, + { + "epoch": 0.39741175041010995, + "grad_norm": 0.25890493392944336, + "learning_rate": 6.611936627665477e-05, + "loss": 1.1362, + "step": 6541 + }, + { + "epoch": 0.3974725074427365, + "grad_norm": 0.13185185194015503, + "learning_rate": 6.611030431922993e-05, + "loss": 1.072, + "step": 6542 + }, + { + "epoch": 0.397533264475363, + "grad_norm": 0.11852778494358063, + "learning_rate": 6.610124177127879e-05, + "loss": 1.0077, + "step": 6543 + }, + { + "epoch": 0.39759402150798956, + "grad_norm": 0.17963242530822754, + "learning_rate": 6.609217863313349e-05, + "loss": 1.0591, + "step": 6544 + }, + { + "epoch": 0.3976547785406161, + "grad_norm": 0.12181134521961212, + "learning_rate": 6.608311490512629e-05, + "loss": 1.0382, + "step": 6545 + }, + { + "epoch": 0.39771553557324263, + "grad_norm": 0.18284626305103302, + "learning_rate": 6.60740505875894e-05, + "loss": 1.2584, + "step": 6546 + }, + { + "epoch": 0.3977762926058691, + "grad_norm": 0.13705766201019287, + "learning_rate": 6.606498568085508e-05, + "loss": 1.0503, + "step": 6547 + }, + { + "epoch": 0.39783704963849564, + "grad_norm": 0.18914468586444855, + "learning_rate": 6.60559201852556e-05, + "loss": 1.0995, + "step": 6548 + }, + { + "epoch": 0.3978978066711222, + "grad_norm": 0.35786524415016174, + "learning_rate": 6.604685410112327e-05, + "loss": 1.1095, + "step": 6549 + }, + { + "epoch": 0.3979585637037487, + "grad_norm": 0.16245368123054504, + "learning_rate": 6.60377874287904e-05, + "loss": 1.0953, + "step": 6550 + }, + { + "epoch": 0.39801932073637525, + "grad_norm": 0.27149254083633423, + "learning_rate": 6.602872016858933e-05, + "loss": 1.0592, + "step": 6551 + }, + { + "epoch": 0.3980800777690018, + "grad_norm": 0.3707484006881714, + "learning_rate": 6.601965232085241e-05, + "loss": 1.081, + "step": 6552 + }, + { + "epoch": 0.39814083480162826, + "grad_norm": 0.30184659361839294, + "learning_rate": 6.601058388591206e-05, + "loss": 1.1586, + "step": 6553 + }, + { + "epoch": 0.3982015918342548, + "grad_norm": 0.11948718875646591, + "learning_rate": 6.600151486410066e-05, + "loss": 1.0004, + "step": 6554 + }, + { + "epoch": 0.39826234886688133, + "grad_norm": 0.2058251053094864, + "learning_rate": 6.599244525575062e-05, + "loss": 1.0685, + "step": 6555 + }, + { + "epoch": 0.39832310589950787, + "grad_norm": 0.13420431315898895, + "learning_rate": 6.598337506119443e-05, + "loss": 1.1134, + "step": 6556 + }, + { + "epoch": 0.3983838629321344, + "grad_norm": 0.45619717240333557, + "learning_rate": 6.597430428076451e-05, + "loss": 1.1429, + "step": 6557 + }, + { + "epoch": 0.39844461996476094, + "grad_norm": 0.16612450778484344, + "learning_rate": 6.596523291479342e-05, + "loss": 1.1198, + "step": 6558 + }, + { + "epoch": 0.3985053769973875, + "grad_norm": 0.23507781326770782, + "learning_rate": 6.59561609636136e-05, + "loss": 1.2081, + "step": 6559 + }, + { + "epoch": 0.39856613403001395, + "grad_norm": 0.1710253208875656, + "learning_rate": 6.594708842755762e-05, + "loss": 1.1253, + "step": 6560 + }, + { + "epoch": 0.3986268910626405, + "grad_norm": 0.28977149724960327, + "learning_rate": 6.593801530695804e-05, + "loss": 1.1377, + "step": 6561 + }, + { + "epoch": 0.398687648095267, + "grad_norm": 0.16601623594760895, + "learning_rate": 6.592894160214743e-05, + "loss": 1.1475, + "step": 6562 + }, + { + "epoch": 0.39874840512789356, + "grad_norm": 0.3298317790031433, + "learning_rate": 6.591986731345839e-05, + "loss": 1.184, + "step": 6563 + }, + { + "epoch": 0.3988091621605201, + "grad_norm": 0.139494389295578, + "learning_rate": 6.591079244122352e-05, + "loss": 1.0064, + "step": 6564 + }, + { + "epoch": 0.3988699191931466, + "grad_norm": 0.2404806911945343, + "learning_rate": 6.590171698577547e-05, + "loss": 1.1143, + "step": 6565 + }, + { + "epoch": 0.3989306762257731, + "grad_norm": 0.3489436209201813, + "learning_rate": 6.589264094744693e-05, + "loss": 1.0959, + "step": 6566 + }, + { + "epoch": 0.39899143325839964, + "grad_norm": 0.13315507769584656, + "learning_rate": 6.588356432657057e-05, + "loss": 1.1232, + "step": 6567 + }, + { + "epoch": 0.3990521902910262, + "grad_norm": 8.759673118591309, + "learning_rate": 6.587448712347907e-05, + "loss": 1.0895, + "step": 6568 + }, + { + "epoch": 0.3991129473236527, + "grad_norm": 0.21881161630153656, + "learning_rate": 6.586540933850518e-05, + "loss": 1.11, + "step": 6569 + }, + { + "epoch": 0.39917370435627925, + "grad_norm": 0.2691590189933777, + "learning_rate": 6.585633097198165e-05, + "loss": 1.0763, + "step": 6570 + }, + { + "epoch": 0.3992344613889058, + "grad_norm": 0.20333918929100037, + "learning_rate": 6.584725202424125e-05, + "loss": 1.1471, + "step": 6571 + }, + { + "epoch": 0.3992952184215323, + "grad_norm": 0.2324604094028473, + "learning_rate": 6.583817249561675e-05, + "loss": 1.1277, + "step": 6572 + }, + { + "epoch": 0.3993559754541588, + "grad_norm": 0.2534354031085968, + "learning_rate": 6.582909238644098e-05, + "loss": 1.0799, + "step": 6573 + }, + { + "epoch": 0.39941673248678533, + "grad_norm": 0.23539762198925018, + "learning_rate": 6.582001169704676e-05, + "loss": 1.0613, + "step": 6574 + }, + { + "epoch": 0.39947748951941187, + "grad_norm": 0.14437204599380493, + "learning_rate": 6.581093042776699e-05, + "loss": 1.1095, + "step": 6575 + }, + { + "epoch": 0.3995382465520384, + "grad_norm": 0.17713621258735657, + "learning_rate": 6.580184857893446e-05, + "loss": 1.0469, + "step": 6576 + }, + { + "epoch": 0.39959900358466494, + "grad_norm": 0.1882336437702179, + "learning_rate": 6.579276615088214e-05, + "loss": 1.0508, + "step": 6577 + }, + { + "epoch": 0.39965976061729147, + "grad_norm": 0.14026860892772675, + "learning_rate": 6.578368314394292e-05, + "loss": 1.0861, + "step": 6578 + }, + { + "epoch": 0.399720517649918, + "grad_norm": 0.2194347381591797, + "learning_rate": 6.577459955844976e-05, + "loss": 1.1215, + "step": 6579 + }, + { + "epoch": 0.3997812746825445, + "grad_norm": 0.22031378746032715, + "learning_rate": 6.576551539473559e-05, + "loss": 1.0821, + "step": 6580 + }, + { + "epoch": 0.399842031715171, + "grad_norm": 0.1323220133781433, + "learning_rate": 6.575643065313339e-05, + "loss": 1.0822, + "step": 6581 + }, + { + "epoch": 0.39990278874779756, + "grad_norm": 0.17923913896083832, + "learning_rate": 6.574734533397621e-05, + "loss": 1.0931, + "step": 6582 + }, + { + "epoch": 0.3999635457804241, + "grad_norm": 0.2591210603713989, + "learning_rate": 6.573825943759702e-05, + "loss": 1.0764, + "step": 6583 + }, + { + "epoch": 0.4000243028130506, + "grad_norm": 0.24421407282352448, + "learning_rate": 6.57291729643289e-05, + "loss": 1.038, + "step": 6584 + }, + { + "epoch": 0.40008505984567716, + "grad_norm": 0.2938361167907715, + "learning_rate": 6.57200859145049e-05, + "loss": 1.1638, + "step": 6585 + }, + { + "epoch": 0.40014581687830364, + "grad_norm": 0.3407941162586212, + "learning_rate": 6.57109982884581e-05, + "loss": 1.1792, + "step": 6586 + }, + { + "epoch": 0.4002065739109302, + "grad_norm": 0.15962469577789307, + "learning_rate": 6.570191008652165e-05, + "loss": 1.0725, + "step": 6587 + }, + { + "epoch": 0.4002673309435567, + "grad_norm": 0.202643483877182, + "learning_rate": 6.569282130902862e-05, + "loss": 1.1958, + "step": 6588 + }, + { + "epoch": 0.40032808797618324, + "grad_norm": 0.24373379349708557, + "learning_rate": 6.568373195631221e-05, + "loss": 1.085, + "step": 6589 + }, + { + "epoch": 0.4003888450088098, + "grad_norm": 0.250884085893631, + "learning_rate": 6.567464202870559e-05, + "loss": 1.0494, + "step": 6590 + }, + { + "epoch": 0.4004496020414363, + "grad_norm": 0.14808085560798645, + "learning_rate": 6.566555152654189e-05, + "loss": 1.0627, + "step": 6591 + }, + { + "epoch": 0.40051035907406285, + "grad_norm": 0.1772744208574295, + "learning_rate": 6.56564604501544e-05, + "loss": 1.0672, + "step": 6592 + }, + { + "epoch": 0.40057111610668933, + "grad_norm": 0.15409553050994873, + "learning_rate": 6.564736879987632e-05, + "loss": 1.0838, + "step": 6593 + }, + { + "epoch": 0.40063187313931586, + "grad_norm": 0.22324049472808838, + "learning_rate": 6.563827657604092e-05, + "loss": 1.1507, + "step": 6594 + }, + { + "epoch": 0.4006926301719424, + "grad_norm": 0.15755148231983185, + "learning_rate": 6.562918377898146e-05, + "loss": 1.1539, + "step": 6595 + }, + { + "epoch": 0.40075338720456893, + "grad_norm": 0.14027974009513855, + "learning_rate": 6.562009040903124e-05, + "loss": 1.0186, + "step": 6596 + }, + { + "epoch": 0.40081414423719547, + "grad_norm": 0.20236752927303314, + "learning_rate": 6.561099646652361e-05, + "loss": 1.1087, + "step": 6597 + }, + { + "epoch": 0.400874901269822, + "grad_norm": 0.11700024455785751, + "learning_rate": 6.560190195179188e-05, + "loss": 1.0545, + "step": 6598 + }, + { + "epoch": 0.4009356583024485, + "grad_norm": 0.20795938372612, + "learning_rate": 6.55928068651694e-05, + "loss": 1.1491, + "step": 6599 + }, + { + "epoch": 0.400996415335075, + "grad_norm": 0.4421660602092743, + "learning_rate": 6.55837112069896e-05, + "loss": 1.0629, + "step": 6600 + }, + { + "epoch": 0.40105717236770155, + "grad_norm": 0.2056204080581665, + "learning_rate": 6.557461497758583e-05, + "loss": 1.0939, + "step": 6601 + }, + { + "epoch": 0.4011179294003281, + "grad_norm": 0.366139680147171, + "learning_rate": 6.556551817729153e-05, + "loss": 1.1591, + "step": 6602 + }, + { + "epoch": 0.4011786864329546, + "grad_norm": 0.14360135793685913, + "learning_rate": 6.555642080644018e-05, + "loss": 1.0735, + "step": 6603 + }, + { + "epoch": 0.40123944346558116, + "grad_norm": 0.1836598813533783, + "learning_rate": 6.55473228653652e-05, + "loss": 1.1618, + "step": 6604 + }, + { + "epoch": 0.4013002004982077, + "grad_norm": 0.26345452666282654, + "learning_rate": 6.55382243544001e-05, + "loss": 1.0369, + "step": 6605 + }, + { + "epoch": 0.4013609575308342, + "grad_norm": 0.15187640488147736, + "learning_rate": 6.552912527387838e-05, + "loss": 1.099, + "step": 6606 + }, + { + "epoch": 0.4014217145634607, + "grad_norm": 0.16972950100898743, + "learning_rate": 6.552002562413358e-05, + "loss": 1.0588, + "step": 6607 + }, + { + "epoch": 0.40148247159608724, + "grad_norm": 0.25443100929260254, + "learning_rate": 6.551092540549923e-05, + "loss": 1.0541, + "step": 6608 + }, + { + "epoch": 0.4015432286287138, + "grad_norm": 0.19845768809318542, + "learning_rate": 6.550182461830892e-05, + "loss": 1.0847, + "step": 6609 + }, + { + "epoch": 0.4016039856613403, + "grad_norm": 0.16475890576839447, + "learning_rate": 6.549272326289623e-05, + "loss": 1.1807, + "step": 6610 + }, + { + "epoch": 0.40166474269396685, + "grad_norm": 0.22839957475662231, + "learning_rate": 6.548362133959478e-05, + "loss": 1.1024, + "step": 6611 + }, + { + "epoch": 0.4017254997265933, + "grad_norm": 0.2388787716627121, + "learning_rate": 6.547451884873817e-05, + "loss": 1.1277, + "step": 6612 + }, + { + "epoch": 0.40178625675921986, + "grad_norm": 0.18775707483291626, + "learning_rate": 6.54654157906601e-05, + "loss": 1.0862, + "step": 6613 + }, + { + "epoch": 0.4018470137918464, + "grad_norm": 0.32869967818260193, + "learning_rate": 6.545631216569423e-05, + "loss": 1.1068, + "step": 6614 + }, + { + "epoch": 0.40190777082447293, + "grad_norm": 0.18017113208770752, + "learning_rate": 6.544720797417426e-05, + "loss": 1.0704, + "step": 6615 + }, + { + "epoch": 0.40196852785709947, + "grad_norm": 0.3624647855758667, + "learning_rate": 6.543810321643386e-05, + "loss": 1.0809, + "step": 6616 + }, + { + "epoch": 0.402029284889726, + "grad_norm": 0.1427358090877533, + "learning_rate": 6.542899789280684e-05, + "loss": 1.0692, + "step": 6617 + }, + { + "epoch": 0.40209004192235254, + "grad_norm": 0.20363053679466248, + "learning_rate": 6.541989200362689e-05, + "loss": 1.1052, + "step": 6618 + }, + { + "epoch": 0.402150798954979, + "grad_norm": 0.360845685005188, + "learning_rate": 6.541078554922784e-05, + "loss": 1.1304, + "step": 6619 + }, + { + "epoch": 0.40221155598760555, + "grad_norm": 0.13105599582195282, + "learning_rate": 6.540167852994346e-05, + "loss": 1.0792, + "step": 6620 + }, + { + "epoch": 0.4022723130202321, + "grad_norm": 0.14443010091781616, + "learning_rate": 6.539257094610757e-05, + "loss": 1.0527, + "step": 6621 + }, + { + "epoch": 0.4023330700528586, + "grad_norm": 0.16193968057632446, + "learning_rate": 6.538346279805401e-05, + "loss": 1.108, + "step": 6622 + }, + { + "epoch": 0.40239382708548516, + "grad_norm": 0.22881177067756653, + "learning_rate": 6.537435408611667e-05, + "loss": 1.1628, + "step": 6623 + }, + { + "epoch": 0.4024545841181117, + "grad_norm": 0.24028386175632477, + "learning_rate": 6.53652448106294e-05, + "loss": 1.0347, + "step": 6624 + }, + { + "epoch": 0.40251534115073817, + "grad_norm": 0.1537184864282608, + "learning_rate": 6.53561349719261e-05, + "loss": 1.1427, + "step": 6625 + }, + { + "epoch": 0.4025760981833647, + "grad_norm": 0.28651416301727295, + "learning_rate": 6.534702457034071e-05, + "loss": 1.1191, + "step": 6626 + }, + { + "epoch": 0.40263685521599124, + "grad_norm": 0.23378139734268188, + "learning_rate": 6.533791360620717e-05, + "loss": 1.0672, + "step": 6627 + }, + { + "epoch": 0.4026976122486178, + "grad_norm": 0.19313105940818787, + "learning_rate": 6.532880207985945e-05, + "loss": 1.1301, + "step": 6628 + }, + { + "epoch": 0.4027583692812443, + "grad_norm": 0.20975559949874878, + "learning_rate": 6.531968999163151e-05, + "loss": 1.0649, + "step": 6629 + }, + { + "epoch": 0.40281912631387085, + "grad_norm": 0.2829676568508148, + "learning_rate": 6.53105773418574e-05, + "loss": 1.2322, + "step": 6630 + }, + { + "epoch": 0.4028798833464974, + "grad_norm": 0.15994608402252197, + "learning_rate": 6.53014641308711e-05, + "loss": 1.0822, + "step": 6631 + }, + { + "epoch": 0.40294064037912386, + "grad_norm": 0.22853371500968933, + "learning_rate": 6.529235035900667e-05, + "loss": 1.0704, + "step": 6632 + }, + { + "epoch": 0.4030013974117504, + "grad_norm": 0.1920643150806427, + "learning_rate": 6.528323602659817e-05, + "loss": 1.1006, + "step": 6633 + }, + { + "epoch": 0.40306215444437693, + "grad_norm": 0.17940761148929596, + "learning_rate": 6.527412113397973e-05, + "loss": 1.142, + "step": 6634 + }, + { + "epoch": 0.40312291147700346, + "grad_norm": 0.13779808580875397, + "learning_rate": 6.52650056814854e-05, + "loss": 1.0438, + "step": 6635 + }, + { + "epoch": 0.40318366850963, + "grad_norm": 0.21858450770378113, + "learning_rate": 6.525588966944936e-05, + "loss": 1.0789, + "step": 6636 + }, + { + "epoch": 0.40324442554225653, + "grad_norm": 0.34694162011146545, + "learning_rate": 6.524677309820571e-05, + "loss": 1.0641, + "step": 6637 + }, + { + "epoch": 0.40330518257488307, + "grad_norm": 0.12775734066963196, + "learning_rate": 6.523765596808866e-05, + "loss": 1.0867, + "step": 6638 + }, + { + "epoch": 0.40336593960750955, + "grad_norm": 0.3817265033721924, + "learning_rate": 6.52285382794324e-05, + "loss": 1.0058, + "step": 6639 + }, + { + "epoch": 0.4034266966401361, + "grad_norm": 0.2208240032196045, + "learning_rate": 6.52194200325711e-05, + "loss": 1.0304, + "step": 6640 + }, + { + "epoch": 0.4034874536727626, + "grad_norm": 0.13443036377429962, + "learning_rate": 6.521030122783903e-05, + "loss": 1.0481, + "step": 6641 + }, + { + "epoch": 0.40354821070538915, + "grad_norm": 0.20626302063465118, + "learning_rate": 6.520118186557043e-05, + "loss": 1.0605, + "step": 6642 + }, + { + "epoch": 0.4036089677380157, + "grad_norm": 0.20362652838230133, + "learning_rate": 6.519206194609954e-05, + "loss": 1.0659, + "step": 6643 + }, + { + "epoch": 0.4036697247706422, + "grad_norm": 0.14869822561740875, + "learning_rate": 6.518294146976073e-05, + "loss": 1.0318, + "step": 6644 + }, + { + "epoch": 0.4037304818032687, + "grad_norm": 0.149115189909935, + "learning_rate": 6.517382043688823e-05, + "loss": 1.0876, + "step": 6645 + }, + { + "epoch": 0.40379123883589524, + "grad_norm": 0.1736573874950409, + "learning_rate": 6.516469884781644e-05, + "loss": 1.0794, + "step": 6646 + }, + { + "epoch": 0.4038519958685218, + "grad_norm": 0.2508244514465332, + "learning_rate": 6.515557670287966e-05, + "loss": 1.1283, + "step": 6647 + }, + { + "epoch": 0.4039127529011483, + "grad_norm": 0.20715117454528809, + "learning_rate": 6.514645400241229e-05, + "loss": 1.0799, + "step": 6648 + }, + { + "epoch": 0.40397350993377484, + "grad_norm": 0.6394730806350708, + "learning_rate": 6.513733074674872e-05, + "loss": 1.2992, + "step": 6649 + }, + { + "epoch": 0.4040342669664014, + "grad_norm": 0.21538077294826508, + "learning_rate": 6.512820693622337e-05, + "loss": 1.159, + "step": 6650 + }, + { + "epoch": 0.4040950239990279, + "grad_norm": 0.43514516949653625, + "learning_rate": 6.511908257117067e-05, + "loss": 1.1446, + "step": 6651 + }, + { + "epoch": 0.4041557810316544, + "grad_norm": 0.21478870511054993, + "learning_rate": 6.510995765192508e-05, + "loss": 1.0499, + "step": 6652 + }, + { + "epoch": 0.40421653806428093, + "grad_norm": 0.49485352635383606, + "learning_rate": 6.510083217882106e-05, + "loss": 1.211, + "step": 6653 + }, + { + "epoch": 0.40427729509690746, + "grad_norm": 0.17889845371246338, + "learning_rate": 6.509170615219313e-05, + "loss": 1.1264, + "step": 6654 + }, + { + "epoch": 0.404338052129534, + "grad_norm": 0.15409870445728302, + "learning_rate": 6.508257957237578e-05, + "loss": 1.2703, + "step": 6655 + }, + { + "epoch": 0.40439880916216053, + "grad_norm": 0.16098731756210327, + "learning_rate": 6.507345243970356e-05, + "loss": 1.0317, + "step": 6656 + }, + { + "epoch": 0.40445956619478707, + "grad_norm": 2.5118050575256348, + "learning_rate": 6.506432475451106e-05, + "loss": 1.0563, + "step": 6657 + }, + { + "epoch": 0.40452032322741355, + "grad_norm": 0.17978206276893616, + "learning_rate": 6.505519651713277e-05, + "loss": 1.0711, + "step": 6658 + }, + { + "epoch": 0.4045810802600401, + "grad_norm": 0.2846367657184601, + "learning_rate": 6.504606772790338e-05, + "loss": 1.1987, + "step": 6659 + }, + { + "epoch": 0.4046418372926666, + "grad_norm": 0.12307152152061462, + "learning_rate": 6.503693838715748e-05, + "loss": 1.0401, + "step": 6660 + }, + { + "epoch": 0.40470259432529315, + "grad_norm": 0.2261846661567688, + "learning_rate": 6.502780849522965e-05, + "loss": 1.1381, + "step": 6661 + }, + { + "epoch": 0.4047633513579197, + "grad_norm": 0.16556531190872192, + "learning_rate": 6.501867805245463e-05, + "loss": 1.0702, + "step": 6662 + }, + { + "epoch": 0.4048241083905462, + "grad_norm": 0.42099320888519287, + "learning_rate": 6.500954705916706e-05, + "loss": 1.1616, + "step": 6663 + }, + { + "epoch": 0.40488486542317276, + "grad_norm": 0.17864783108234406, + "learning_rate": 6.500041551570163e-05, + "loss": 1.1111, + "step": 6664 + }, + { + "epoch": 0.40494562245579924, + "grad_norm": 0.19972102344036102, + "learning_rate": 6.499128342239307e-05, + "loss": 1.2066, + "step": 6665 + }, + { + "epoch": 0.40500637948842577, + "grad_norm": 0.1650223284959793, + "learning_rate": 6.498215077957612e-05, + "loss": 1.1443, + "step": 6666 + }, + { + "epoch": 0.4050671365210523, + "grad_norm": 0.28008827567100525, + "learning_rate": 6.497301758758557e-05, + "loss": 1.2032, + "step": 6667 + }, + { + "epoch": 0.40512789355367884, + "grad_norm": 0.21093466877937317, + "learning_rate": 6.496388384675613e-05, + "loss": 1.1249, + "step": 6668 + }, + { + "epoch": 0.4051886505863054, + "grad_norm": 0.22220522165298462, + "learning_rate": 6.495474955742263e-05, + "loss": 1.1426, + "step": 6669 + }, + { + "epoch": 0.4052494076189319, + "grad_norm": 0.13069680333137512, + "learning_rate": 6.494561471991991e-05, + "loss": 1.0801, + "step": 6670 + }, + { + "epoch": 0.4053101646515584, + "grad_norm": 0.15735410153865814, + "learning_rate": 6.493647933458278e-05, + "loss": 1.0833, + "step": 6671 + }, + { + "epoch": 0.4053709216841849, + "grad_norm": 0.1487436592578888, + "learning_rate": 6.492734340174614e-05, + "loss": 1.0503, + "step": 6672 + }, + { + "epoch": 0.40543167871681146, + "grad_norm": 0.13360805809497833, + "learning_rate": 6.491820692174481e-05, + "loss": 1.013, + "step": 6673 + }, + { + "epoch": 0.405492435749438, + "grad_norm": 0.17869211733341217, + "learning_rate": 6.490906989491375e-05, + "loss": 1.06, + "step": 6674 + }, + { + "epoch": 0.40555319278206453, + "grad_norm": 0.1763959378004074, + "learning_rate": 6.489993232158783e-05, + "loss": 1.1478, + "step": 6675 + }, + { + "epoch": 0.40561394981469107, + "grad_norm": 0.2103027105331421, + "learning_rate": 6.489079420210203e-05, + "loss": 1.2345, + "step": 6676 + }, + { + "epoch": 0.4056747068473176, + "grad_norm": 0.20595324039459229, + "learning_rate": 6.488165553679128e-05, + "loss": 1.144, + "step": 6677 + }, + { + "epoch": 0.4057354638799441, + "grad_norm": 0.21653783321380615, + "learning_rate": 6.487251632599056e-05, + "loss": 1.1657, + "step": 6678 + }, + { + "epoch": 0.4057962209125706, + "grad_norm": 0.37403324246406555, + "learning_rate": 6.486337657003489e-05, + "loss": 1.1102, + "step": 6679 + }, + { + "epoch": 0.40585697794519715, + "grad_norm": 0.8291086554527283, + "learning_rate": 6.485423626925928e-05, + "loss": 1.1297, + "step": 6680 + }, + { + "epoch": 0.4059177349778237, + "grad_norm": 0.19228830933570862, + "learning_rate": 6.484509542399876e-05, + "loss": 1.0228, + "step": 6681 + }, + { + "epoch": 0.4059784920104502, + "grad_norm": 0.21620532870292664, + "learning_rate": 6.48359540345884e-05, + "loss": 1.0831, + "step": 6682 + }, + { + "epoch": 0.40603924904307676, + "grad_norm": 0.19082558155059814, + "learning_rate": 6.482681210136328e-05, + "loss": 1.2095, + "step": 6683 + }, + { + "epoch": 0.4061000060757033, + "grad_norm": 0.1522844433784485, + "learning_rate": 6.48176696246585e-05, + "loss": 1.1178, + "step": 6684 + }, + { + "epoch": 0.40616076310832977, + "grad_norm": 3.2175731658935547, + "learning_rate": 6.480852660480918e-05, + "loss": 1.1549, + "step": 6685 + }, + { + "epoch": 0.4062215201409563, + "grad_norm": 0.1618947833776474, + "learning_rate": 6.479938304215043e-05, + "loss": 1.0719, + "step": 6686 + }, + { + "epoch": 0.40628227717358284, + "grad_norm": 0.16067153215408325, + "learning_rate": 6.479023893701745e-05, + "loss": 1.0778, + "step": 6687 + }, + { + "epoch": 0.4063430342062094, + "grad_norm": 0.1437070518732071, + "learning_rate": 6.478109428974542e-05, + "loss": 1.1159, + "step": 6688 + }, + { + "epoch": 0.4064037912388359, + "grad_norm": 0.3020385503768921, + "learning_rate": 6.47719491006695e-05, + "loss": 0.999, + "step": 6689 + }, + { + "epoch": 0.40646454827146244, + "grad_norm": 0.15864762663841248, + "learning_rate": 6.476280337012494e-05, + "loss": 1.1149, + "step": 6690 + }, + { + "epoch": 0.4065253053040889, + "grad_norm": 1.2274796962738037, + "learning_rate": 6.475365709844697e-05, + "loss": 1.0734, + "step": 6691 + }, + { + "epoch": 0.40658606233671546, + "grad_norm": 0.17850178480148315, + "learning_rate": 6.474451028597083e-05, + "loss": 0.9927, + "step": 6692 + }, + { + "epoch": 0.406646819369342, + "grad_norm": 0.1862597018480301, + "learning_rate": 6.473536293303182e-05, + "loss": 1.0516, + "step": 6693 + }, + { + "epoch": 0.40670757640196853, + "grad_norm": 0.30372729897499084, + "learning_rate": 6.472621503996525e-05, + "loss": 1.1004, + "step": 6694 + }, + { + "epoch": 0.40676833343459506, + "grad_norm": 0.17860114574432373, + "learning_rate": 6.471706660710641e-05, + "loss": 1.1441, + "step": 6695 + }, + { + "epoch": 0.4068290904672216, + "grad_norm": 0.14600655436515808, + "learning_rate": 6.470791763479065e-05, + "loss": 1.0836, + "step": 6696 + }, + { + "epoch": 0.40688984749984813, + "grad_norm": 0.45386990904808044, + "learning_rate": 6.469876812335333e-05, + "loss": 1.1657, + "step": 6697 + }, + { + "epoch": 0.4069506045324746, + "grad_norm": 0.197322815656662, + "learning_rate": 6.468961807312982e-05, + "loss": 1.1366, + "step": 6698 + }, + { + "epoch": 0.40701136156510115, + "grad_norm": 0.1556454449892044, + "learning_rate": 6.468046748445552e-05, + "loss": 1.0545, + "step": 6699 + }, + { + "epoch": 0.4070721185977277, + "grad_norm": 0.12340997159481049, + "learning_rate": 6.467131635766584e-05, + "loss": 1.1206, + "step": 6700 + }, + { + "epoch": 0.4071328756303542, + "grad_norm": 0.2479964643716812, + "learning_rate": 6.466216469309624e-05, + "loss": 1.0821, + "step": 6701 + }, + { + "epoch": 0.40719363266298075, + "grad_norm": 0.15532584488391876, + "learning_rate": 6.465301249108214e-05, + "loss": 1.0491, + "step": 6702 + }, + { + "epoch": 0.4072543896956073, + "grad_norm": 0.21279531717300415, + "learning_rate": 6.464385975195905e-05, + "loss": 1.1038, + "step": 6703 + }, + { + "epoch": 0.40731514672823377, + "grad_norm": 0.19126805663108826, + "learning_rate": 6.463470647606244e-05, + "loss": 1.1087, + "step": 6704 + }, + { + "epoch": 0.4073759037608603, + "grad_norm": 0.22247013449668884, + "learning_rate": 6.462555266372784e-05, + "loss": 1.1314, + "step": 6705 + }, + { + "epoch": 0.40743666079348684, + "grad_norm": 0.22831468284130096, + "learning_rate": 6.461639831529078e-05, + "loss": 1.0675, + "step": 6706 + }, + { + "epoch": 0.4074974178261134, + "grad_norm": 0.12370583415031433, + "learning_rate": 6.460724343108681e-05, + "loss": 1.031, + "step": 6707 + }, + { + "epoch": 0.4075581748587399, + "grad_norm": 0.1619083732366562, + "learning_rate": 6.459808801145151e-05, + "loss": 1.096, + "step": 6708 + }, + { + "epoch": 0.40761893189136644, + "grad_norm": 0.19678902626037598, + "learning_rate": 6.458893205672047e-05, + "loss": 1.0405, + "step": 6709 + }, + { + "epoch": 0.407679688923993, + "grad_norm": 0.2028418779373169, + "learning_rate": 6.457977556722931e-05, + "loss": 1.0651, + "step": 6710 + }, + { + "epoch": 0.40774044595661946, + "grad_norm": 0.16231292486190796, + "learning_rate": 6.457061854331368e-05, + "loss": 1.0691, + "step": 6711 + }, + { + "epoch": 0.407801202989246, + "grad_norm": 0.8556142449378967, + "learning_rate": 6.456146098530918e-05, + "loss": 1.2452, + "step": 6712 + }, + { + "epoch": 0.4078619600218725, + "grad_norm": 0.15859106183052063, + "learning_rate": 6.455230289355151e-05, + "loss": 1.027, + "step": 6713 + }, + { + "epoch": 0.40792271705449906, + "grad_norm": 0.329671710729599, + "learning_rate": 6.454314426837639e-05, + "loss": 1.1167, + "step": 6714 + }, + { + "epoch": 0.4079834740871256, + "grad_norm": 0.3965393304824829, + "learning_rate": 6.453398511011948e-05, + "loss": 1.1326, + "step": 6715 + }, + { + "epoch": 0.40804423111975213, + "grad_norm": 0.15830573439598083, + "learning_rate": 6.452482541911656e-05, + "loss": 1.1237, + "step": 6716 + }, + { + "epoch": 0.4081049881523786, + "grad_norm": 0.1571277678012848, + "learning_rate": 6.451566519570336e-05, + "loss": 1.0272, + "step": 6717 + }, + { + "epoch": 0.40816574518500515, + "grad_norm": 0.1618984192609787, + "learning_rate": 6.450650444021561e-05, + "loss": 1.1627, + "step": 6718 + }, + { + "epoch": 0.4082265022176317, + "grad_norm": 0.19462212920188904, + "learning_rate": 6.449734315298917e-05, + "loss": 1.0825, + "step": 6719 + }, + { + "epoch": 0.4082872592502582, + "grad_norm": 0.24113252758979797, + "learning_rate": 6.448818133435978e-05, + "loss": 1.0641, + "step": 6720 + }, + { + "epoch": 0.40834801628288475, + "grad_norm": 0.15695524215698242, + "learning_rate": 6.447901898466335e-05, + "loss": 1.1029, + "step": 6721 + }, + { + "epoch": 0.4084087733155113, + "grad_norm": 0.2879861295223236, + "learning_rate": 6.446985610423565e-05, + "loss": 1.1403, + "step": 6722 + }, + { + "epoch": 0.4084695303481378, + "grad_norm": 0.15667253732681274, + "learning_rate": 6.446069269341257e-05, + "loss": 1.1269, + "step": 6723 + }, + { + "epoch": 0.4085302873807643, + "grad_norm": 1.1046885251998901, + "learning_rate": 6.445152875253002e-05, + "loss": 1.0691, + "step": 6724 + }, + { + "epoch": 0.40859104441339084, + "grad_norm": 0.2713603675365448, + "learning_rate": 6.44423642819239e-05, + "loss": 1.1142, + "step": 6725 + }, + { + "epoch": 0.40865180144601737, + "grad_norm": 0.22601619362831116, + "learning_rate": 6.443319928193011e-05, + "loss": 1.0528, + "step": 6726 + }, + { + "epoch": 0.4087125584786439, + "grad_norm": 0.16097596287727356, + "learning_rate": 6.442403375288462e-05, + "loss": 1.0478, + "step": 6727 + }, + { + "epoch": 0.40877331551127044, + "grad_norm": 0.30255916714668274, + "learning_rate": 6.441486769512336e-05, + "loss": 1.0845, + "step": 6728 + }, + { + "epoch": 0.408834072543897, + "grad_norm": 0.20107612013816833, + "learning_rate": 6.440570110898236e-05, + "loss": 1.1634, + "step": 6729 + }, + { + "epoch": 0.40889482957652346, + "grad_norm": 0.20882728695869446, + "learning_rate": 6.43965339947976e-05, + "loss": 1.2285, + "step": 6730 + }, + { + "epoch": 0.40895558660915, + "grad_norm": 0.2033892571926117, + "learning_rate": 6.43873663529051e-05, + "loss": 1.144, + "step": 6731 + }, + { + "epoch": 0.4090163436417765, + "grad_norm": 0.34708642959594727, + "learning_rate": 6.43781981836409e-05, + "loss": 1.1474, + "step": 6732 + }, + { + "epoch": 0.40907710067440306, + "grad_norm": 0.20012971758842468, + "learning_rate": 6.436902948734107e-05, + "loss": 1.0641, + "step": 6733 + }, + { + "epoch": 0.4091378577070296, + "grad_norm": 0.11697956919670105, + "learning_rate": 6.43598602643417e-05, + "loss": 1.0262, + "step": 6734 + }, + { + "epoch": 0.40919861473965613, + "grad_norm": 0.21744292974472046, + "learning_rate": 6.435069051497886e-05, + "loss": 1.136, + "step": 6735 + }, + { + "epoch": 0.40925937177228267, + "grad_norm": 0.3735305070877075, + "learning_rate": 6.434152023958869e-05, + "loss": 1.0168, + "step": 6736 + }, + { + "epoch": 0.40932012880490914, + "grad_norm": 0.2032550722360611, + "learning_rate": 6.433234943850733e-05, + "loss": 1.0121, + "step": 6737 + }, + { + "epoch": 0.4093808858375357, + "grad_norm": 18.607328414916992, + "learning_rate": 6.432317811207091e-05, + "loss": 1.1331, + "step": 6738 + }, + { + "epoch": 0.4094416428701622, + "grad_norm": 0.3275555372238159, + "learning_rate": 6.431400626061565e-05, + "loss": 1.144, + "step": 6739 + }, + { + "epoch": 0.40950239990278875, + "grad_norm": 0.2409260869026184, + "learning_rate": 6.430483388447772e-05, + "loss": 1.0839, + "step": 6740 + }, + { + "epoch": 0.4095631569354153, + "grad_norm": 0.1464911252260208, + "learning_rate": 6.429566098399334e-05, + "loss": 1.0603, + "step": 6741 + }, + { + "epoch": 0.4096239139680418, + "grad_norm": 0.2618921995162964, + "learning_rate": 6.428648755949873e-05, + "loss": 1.1141, + "step": 6742 + }, + { + "epoch": 0.40968467100066835, + "grad_norm": 0.35685738921165466, + "learning_rate": 6.427731361133016e-05, + "loss": 1.1225, + "step": 6743 + }, + { + "epoch": 0.40974542803329483, + "grad_norm": 0.2647813558578491, + "learning_rate": 6.426813913982392e-05, + "loss": 1.2056, + "step": 6744 + }, + { + "epoch": 0.40980618506592137, + "grad_norm": 0.15585041046142578, + "learning_rate": 6.425896414531627e-05, + "loss": 1.1543, + "step": 6745 + }, + { + "epoch": 0.4098669420985479, + "grad_norm": 0.19152897596359253, + "learning_rate": 6.424978862814352e-05, + "loss": 1.1875, + "step": 6746 + }, + { + "epoch": 0.40992769913117444, + "grad_norm": 0.1646856963634491, + "learning_rate": 6.424061258864203e-05, + "loss": 1.0461, + "step": 6747 + }, + { + "epoch": 0.409988456163801, + "grad_norm": 0.17298170924186707, + "learning_rate": 6.423143602714813e-05, + "loss": 1.0966, + "step": 6748 + }, + { + "epoch": 0.4100492131964275, + "grad_norm": 0.12542447447776794, + "learning_rate": 6.422225894399818e-05, + "loss": 1.0465, + "step": 6749 + }, + { + "epoch": 0.410109970229054, + "grad_norm": 0.1578681766986847, + "learning_rate": 6.421308133952858e-05, + "loss": 1.0543, + "step": 6750 + }, + { + "epoch": 0.4101707272616805, + "grad_norm": 0.30167078971862793, + "learning_rate": 6.420390321407575e-05, + "loss": 1.1853, + "step": 6751 + }, + { + "epoch": 0.41023148429430706, + "grad_norm": 0.24929563701152802, + "learning_rate": 6.41947245679761e-05, + "loss": 1.1201, + "step": 6752 + }, + { + "epoch": 0.4102922413269336, + "grad_norm": 0.24105627834796906, + "learning_rate": 6.418554540156607e-05, + "loss": 1.1356, + "step": 6753 + }, + { + "epoch": 0.41035299835956013, + "grad_norm": 0.15548865497112274, + "learning_rate": 6.417636571518213e-05, + "loss": 1.0628, + "step": 6754 + }, + { + "epoch": 0.41041375539218666, + "grad_norm": 0.17175735533237457, + "learning_rate": 6.416718550916078e-05, + "loss": 1.1799, + "step": 6755 + }, + { + "epoch": 0.4104745124248132, + "grad_norm": 0.24108925461769104, + "learning_rate": 6.415800478383849e-05, + "loss": 1.0849, + "step": 6756 + }, + { + "epoch": 0.4105352694574397, + "grad_norm": 2.870900869369507, + "learning_rate": 6.41488235395518e-05, + "loss": 1.1703, + "step": 6757 + }, + { + "epoch": 0.4105960264900662, + "grad_norm": 0.19000382721424103, + "learning_rate": 6.413964177663726e-05, + "loss": 1.0626, + "step": 6758 + }, + { + "epoch": 0.41065678352269275, + "grad_norm": 0.28872933983802795, + "learning_rate": 6.41304594954314e-05, + "loss": 1.2691, + "step": 6759 + }, + { + "epoch": 0.4107175405553193, + "grad_norm": 0.15360146760940552, + "learning_rate": 6.412127669627083e-05, + "loss": 1.0329, + "step": 6760 + }, + { + "epoch": 0.4107782975879458, + "grad_norm": 0.3661043345928192, + "learning_rate": 6.411209337949214e-05, + "loss": 1.1655, + "step": 6761 + }, + { + "epoch": 0.41083905462057235, + "grad_norm": 0.13706107437610626, + "learning_rate": 6.410290954543191e-05, + "loss": 1.0234, + "step": 6762 + }, + { + "epoch": 0.41089981165319883, + "grad_norm": 0.2353687286376953, + "learning_rate": 6.409372519442682e-05, + "loss": 1.2065, + "step": 6763 + }, + { + "epoch": 0.41096056868582537, + "grad_norm": 0.2996273338794708, + "learning_rate": 6.408454032681352e-05, + "loss": 1.0825, + "step": 6764 + }, + { + "epoch": 0.4110213257184519, + "grad_norm": 0.18953882157802582, + "learning_rate": 6.407535494292867e-05, + "loss": 1.1542, + "step": 6765 + }, + { + "epoch": 0.41108208275107844, + "grad_norm": 0.2067681849002838, + "learning_rate": 6.406616904310894e-05, + "loss": 1.1228, + "step": 6766 + }, + { + "epoch": 0.41114283978370497, + "grad_norm": 0.15057958662509918, + "learning_rate": 6.405698262769109e-05, + "loss": 1.0869, + "step": 6767 + }, + { + "epoch": 0.4112035968163315, + "grad_norm": 0.25824567675590515, + "learning_rate": 6.404779569701182e-05, + "loss": 1.0192, + "step": 6768 + }, + { + "epoch": 0.41126435384895804, + "grad_norm": 0.13443215191364288, + "learning_rate": 6.403860825140789e-05, + "loss": 1.0788, + "step": 6769 + }, + { + "epoch": 0.4113251108815845, + "grad_norm": 0.2732512950897217, + "learning_rate": 6.402942029121603e-05, + "loss": 1.2363, + "step": 6770 + }, + { + "epoch": 0.41138586791421106, + "grad_norm": 1.3949735164642334, + "learning_rate": 6.40202318167731e-05, + "loss": 1.1352, + "step": 6771 + }, + { + "epoch": 0.4114466249468376, + "grad_norm": 0.2742675840854645, + "learning_rate": 6.401104282841583e-05, + "loss": 1.1955, + "step": 6772 + }, + { + "epoch": 0.4115073819794641, + "grad_norm": 0.22874107956886292, + "learning_rate": 6.400185332648108e-05, + "loss": 1.15, + "step": 6773 + }, + { + "epoch": 0.41156813901209066, + "grad_norm": 0.2986707091331482, + "learning_rate": 6.399266331130571e-05, + "loss": 1.1729, + "step": 6774 + }, + { + "epoch": 0.4116288960447172, + "grad_norm": 0.21805629134178162, + "learning_rate": 6.398347278322654e-05, + "loss": 1.1117, + "step": 6775 + }, + { + "epoch": 0.4116896530773437, + "grad_norm": 0.17125198245048523, + "learning_rate": 6.397428174258047e-05, + "loss": 1.134, + "step": 6776 + }, + { + "epoch": 0.4117504101099702, + "grad_norm": 0.13878971338272095, + "learning_rate": 6.396509018970442e-05, + "loss": 1.053, + "step": 6777 + }, + { + "epoch": 0.41181116714259675, + "grad_norm": 0.2812134325504303, + "learning_rate": 6.395589812493528e-05, + "loss": 1.1214, + "step": 6778 + }, + { + "epoch": 0.4118719241752233, + "grad_norm": 0.12149051576852798, + "learning_rate": 6.394670554861002e-05, + "loss": 1.0445, + "step": 6779 + }, + { + "epoch": 0.4119326812078498, + "grad_norm": 22.036266326904297, + "learning_rate": 6.393751246106553e-05, + "loss": 1.1176, + "step": 6780 + }, + { + "epoch": 0.41199343824047635, + "grad_norm": 0.6228098273277283, + "learning_rate": 6.392831886263886e-05, + "loss": 1.0863, + "step": 6781 + }, + { + "epoch": 0.4120541952731029, + "grad_norm": 0.7305749654769897, + "learning_rate": 6.391912475366695e-05, + "loss": 1.0611, + "step": 6782 + }, + { + "epoch": 0.41211495230572937, + "grad_norm": 0.41448134183883667, + "learning_rate": 6.390993013448683e-05, + "loss": 1.1178, + "step": 6783 + }, + { + "epoch": 0.4121757093383559, + "grad_norm": 0.16076204180717468, + "learning_rate": 6.390073500543556e-05, + "loss": 1.1146, + "step": 6784 + }, + { + "epoch": 0.41223646637098244, + "grad_norm": 0.1853996217250824, + "learning_rate": 6.389153936685012e-05, + "loss": 1.0825, + "step": 6785 + }, + { + "epoch": 0.41229722340360897, + "grad_norm": 0.17406095564365387, + "learning_rate": 6.388234321906765e-05, + "loss": 1.0645, + "step": 6786 + }, + { + "epoch": 0.4123579804362355, + "grad_norm": 0.28208327293395996, + "learning_rate": 6.38731465624252e-05, + "loss": 1.1606, + "step": 6787 + }, + { + "epoch": 0.41241873746886204, + "grad_norm": 0.13500328361988068, + "learning_rate": 6.386394939725989e-05, + "loss": 1.0769, + "step": 6788 + }, + { + "epoch": 0.4124794945014886, + "grad_norm": 0.1835724264383316, + "learning_rate": 6.385475172390882e-05, + "loss": 1.0882, + "step": 6789 + }, + { + "epoch": 0.41254025153411505, + "grad_norm": 0.16545453667640686, + "learning_rate": 6.384555354270913e-05, + "loss": 1.0934, + "step": 6790 + }, + { + "epoch": 0.4126010085667416, + "grad_norm": 11.786956787109375, + "learning_rate": 6.383635485399804e-05, + "loss": 1.0899, + "step": 6791 + }, + { + "epoch": 0.4126617655993681, + "grad_norm": 0.17428408563137054, + "learning_rate": 6.382715565811267e-05, + "loss": 1.1009, + "step": 6792 + }, + { + "epoch": 0.41272252263199466, + "grad_norm": 0.2572706937789917, + "learning_rate": 6.381795595539023e-05, + "loss": 1.0628, + "step": 6793 + }, + { + "epoch": 0.4127832796646212, + "grad_norm": 0.1567520946264267, + "learning_rate": 6.380875574616795e-05, + "loss": 1.0409, + "step": 6794 + }, + { + "epoch": 0.41284403669724773, + "grad_norm": 0.21802347898483276, + "learning_rate": 6.379955503078306e-05, + "loss": 0.9951, + "step": 6795 + }, + { + "epoch": 0.4129047937298742, + "grad_norm": 0.16666068136692047, + "learning_rate": 6.379035380957281e-05, + "loss": 1.0834, + "step": 6796 + }, + { + "epoch": 0.41296555076250074, + "grad_norm": 0.15167683362960815, + "learning_rate": 6.378115208287447e-05, + "loss": 1.093, + "step": 6797 + }, + { + "epoch": 0.4130263077951273, + "grad_norm": 0.21332664787769318, + "learning_rate": 6.377194985102534e-05, + "loss": 1.0815, + "step": 6798 + }, + { + "epoch": 0.4130870648277538, + "grad_norm": 0.41679686307907104, + "learning_rate": 6.376274711436274e-05, + "loss": 1.3194, + "step": 6799 + }, + { + "epoch": 0.41314782186038035, + "grad_norm": 0.22373071312904358, + "learning_rate": 6.375354387322397e-05, + "loss": 1.2072, + "step": 6800 + }, + { + "epoch": 0.4132085788930069, + "grad_norm": 0.14026744663715363, + "learning_rate": 6.37443401279464e-05, + "loss": 1.0749, + "step": 6801 + }, + { + "epoch": 0.4132693359256334, + "grad_norm": 0.14397601783275604, + "learning_rate": 6.373513587886737e-05, + "loss": 1.0595, + "step": 6802 + }, + { + "epoch": 0.4133300929582599, + "grad_norm": 0.419059157371521, + "learning_rate": 6.37259311263243e-05, + "loss": 1.3038, + "step": 6803 + }, + { + "epoch": 0.41339084999088643, + "grad_norm": 6.196994304656982, + "learning_rate": 6.371672587065456e-05, + "loss": 1.1054, + "step": 6804 + }, + { + "epoch": 0.41345160702351297, + "grad_norm": 0.2972855865955353, + "learning_rate": 6.370752011219559e-05, + "loss": 1.0455, + "step": 6805 + }, + { + "epoch": 0.4135123640561395, + "grad_norm": 0.1494392305612564, + "learning_rate": 6.369831385128479e-05, + "loss": 1.0755, + "step": 6806 + }, + { + "epoch": 0.41357312108876604, + "grad_norm": 0.31748634576797485, + "learning_rate": 6.368910708825968e-05, + "loss": 1.1824, + "step": 6807 + }, + { + "epoch": 0.4136338781213926, + "grad_norm": 0.21726271510124207, + "learning_rate": 6.36798998234577e-05, + "loss": 1.0903, + "step": 6808 + }, + { + "epoch": 0.41369463515401905, + "grad_norm": 0.16084003448486328, + "learning_rate": 6.367069205721634e-05, + "loss": 1.0449, + "step": 6809 + }, + { + "epoch": 0.4137553921866456, + "grad_norm": 0.13093261420726776, + "learning_rate": 6.366148378987312e-05, + "loss": 1.0608, + "step": 6810 + }, + { + "epoch": 0.4138161492192721, + "grad_norm": 0.2830486297607422, + "learning_rate": 6.365227502176557e-05, + "loss": 1.1464, + "step": 6811 + }, + { + "epoch": 0.41387690625189866, + "grad_norm": 0.12262752652168274, + "learning_rate": 6.364306575323124e-05, + "loss": 1.0376, + "step": 6812 + }, + { + "epoch": 0.4139376632845252, + "grad_norm": 0.3101465106010437, + "learning_rate": 6.363385598460772e-05, + "loss": 1.076, + "step": 6813 + }, + { + "epoch": 0.4139984203171517, + "grad_norm": 0.17994682490825653, + "learning_rate": 6.362464571623254e-05, + "loss": 1.1801, + "step": 6814 + }, + { + "epoch": 0.41405917734977826, + "grad_norm": 0.2440444678068161, + "learning_rate": 6.361543494844337e-05, + "loss": 1.1403, + "step": 6815 + }, + { + "epoch": 0.41411993438240474, + "grad_norm": 0.1671774983406067, + "learning_rate": 6.360622368157776e-05, + "loss": 1.1513, + "step": 6816 + }, + { + "epoch": 0.4141806914150313, + "grad_norm": 0.4949195981025696, + "learning_rate": 6.359701191597342e-05, + "loss": 1.1362, + "step": 6817 + }, + { + "epoch": 0.4142414484476578, + "grad_norm": 0.1731252670288086, + "learning_rate": 6.358779965196799e-05, + "loss": 1.0316, + "step": 6818 + }, + { + "epoch": 0.41430220548028435, + "grad_norm": 0.18434561789035797, + "learning_rate": 6.35785868898991e-05, + "loss": 1.1337, + "step": 6819 + }, + { + "epoch": 0.4143629625129109, + "grad_norm": 0.15021756291389465, + "learning_rate": 6.356937363010452e-05, + "loss": 1.0661, + "step": 6820 + }, + { + "epoch": 0.4144237195455374, + "grad_norm": 0.1818803995847702, + "learning_rate": 6.35601598729219e-05, + "loss": 1.029, + "step": 6821 + }, + { + "epoch": 0.4144844765781639, + "grad_norm": 0.22433963418006897, + "learning_rate": 6.355094561868901e-05, + "loss": 1.1532, + "step": 6822 + }, + { + "epoch": 0.41454523361079043, + "grad_norm": 0.18789389729499817, + "learning_rate": 6.35417308677436e-05, + "loss": 1.055, + "step": 6823 + }, + { + "epoch": 0.41460599064341697, + "grad_norm": 0.218994140625, + "learning_rate": 6.353251562042342e-05, + "loss": 1.0583, + "step": 6824 + }, + { + "epoch": 0.4146667476760435, + "grad_norm": 0.21378548443317413, + "learning_rate": 6.352329987706625e-05, + "loss": 1.2208, + "step": 6825 + }, + { + "epoch": 0.41472750470867004, + "grad_norm": 0.12409744411706924, + "learning_rate": 6.351408363800993e-05, + "loss": 1.0516, + "step": 6826 + }, + { + "epoch": 0.41478826174129657, + "grad_norm": 0.3805442452430725, + "learning_rate": 6.350486690359223e-05, + "loss": 1.0773, + "step": 6827 + }, + { + "epoch": 0.4148490187739231, + "grad_norm": 0.23587088286876678, + "learning_rate": 6.349564967415105e-05, + "loss": 1.1544, + "step": 6828 + }, + { + "epoch": 0.4149097758065496, + "grad_norm": 0.1841599941253662, + "learning_rate": 6.348643195002422e-05, + "loss": 1.2298, + "step": 6829 + }, + { + "epoch": 0.4149705328391761, + "grad_norm": 0.1608675718307495, + "learning_rate": 6.34772137315496e-05, + "loss": 1.0988, + "step": 6830 + }, + { + "epoch": 0.41503128987180266, + "grad_norm": 0.13292677700519562, + "learning_rate": 6.346799501906513e-05, + "loss": 1.0107, + "step": 6831 + }, + { + "epoch": 0.4150920469044292, + "grad_norm": 0.18974928557872772, + "learning_rate": 6.34587758129087e-05, + "loss": 1.2204, + "step": 6832 + }, + { + "epoch": 0.4151528039370557, + "grad_norm": 0.19476690888404846, + "learning_rate": 6.344955611341822e-05, + "loss": 1.1685, + "step": 6833 + }, + { + "epoch": 0.41521356096968226, + "grad_norm": 0.14657852053642273, + "learning_rate": 6.344033592093169e-05, + "loss": 1.0884, + "step": 6834 + }, + { + "epoch": 0.41527431800230874, + "grad_norm": 0.21968382596969604, + "learning_rate": 6.343111523578703e-05, + "loss": 1.0708, + "step": 6835 + }, + { + "epoch": 0.4153350750349353, + "grad_norm": 0.14353160560131073, + "learning_rate": 6.342189405832225e-05, + "loss": 1.0546, + "step": 6836 + }, + { + "epoch": 0.4153958320675618, + "grad_norm": 0.15635627508163452, + "learning_rate": 6.341267238887534e-05, + "loss": 1.0967, + "step": 6837 + }, + { + "epoch": 0.41545658910018834, + "grad_norm": 0.1640235334634781, + "learning_rate": 6.340345022778435e-05, + "loss": 1.0507, + "step": 6838 + }, + { + "epoch": 0.4155173461328149, + "grad_norm": 0.20663829147815704, + "learning_rate": 6.339422757538727e-05, + "loss": 1.1422, + "step": 6839 + }, + { + "epoch": 0.4155781031654414, + "grad_norm": 0.15276584029197693, + "learning_rate": 6.33850044320222e-05, + "loss": 1.0889, + "step": 6840 + }, + { + "epoch": 0.41563886019806795, + "grad_norm": 0.2972036302089691, + "learning_rate": 6.337578079802724e-05, + "loss": 1.0401, + "step": 6841 + }, + { + "epoch": 0.41569961723069443, + "grad_norm": 0.2572774887084961, + "learning_rate": 6.33665566737404e-05, + "loss": 1.2376, + "step": 6842 + }, + { + "epoch": 0.41576037426332096, + "grad_norm": 0.18544059991836548, + "learning_rate": 6.335733205949988e-05, + "loss": 1.0772, + "step": 6843 + }, + { + "epoch": 0.4158211312959475, + "grad_norm": 0.14605402946472168, + "learning_rate": 6.334810695564375e-05, + "loss": 1.1364, + "step": 6844 + }, + { + "epoch": 0.41588188832857403, + "grad_norm": 0.12782181799411774, + "learning_rate": 6.333888136251018e-05, + "loss": 1.0563, + "step": 6845 + }, + { + "epoch": 0.41594264536120057, + "grad_norm": 0.22031638026237488, + "learning_rate": 6.332965528043735e-05, + "loss": 1.1313, + "step": 6846 + }, + { + "epoch": 0.4160034023938271, + "grad_norm": 0.11370088160037994, + "learning_rate": 6.33204287097634e-05, + "loss": 1.0812, + "step": 6847 + }, + { + "epoch": 0.41606415942645364, + "grad_norm": 0.14567701518535614, + "learning_rate": 6.331120165082659e-05, + "loss": 1.0732, + "step": 6848 + }, + { + "epoch": 0.4161249164590801, + "grad_norm": 0.26609399914741516, + "learning_rate": 6.33019741039651e-05, + "loss": 1.1244, + "step": 6849 + }, + { + "epoch": 0.41618567349170665, + "grad_norm": 0.2305385023355484, + "learning_rate": 6.329274606951718e-05, + "loss": 1.0898, + "step": 6850 + }, + { + "epoch": 0.4162464305243332, + "grad_norm": 0.1748591810464859, + "learning_rate": 6.32835175478211e-05, + "loss": 1.0617, + "step": 6851 + }, + { + "epoch": 0.4163071875569597, + "grad_norm": 0.12653739750385284, + "learning_rate": 6.327428853921508e-05, + "loss": 1.0546, + "step": 6852 + }, + { + "epoch": 0.41636794458958626, + "grad_norm": 0.5172829031944275, + "learning_rate": 6.326505904403745e-05, + "loss": 1.2402, + "step": 6853 + }, + { + "epoch": 0.4164287016222128, + "grad_norm": 0.16379953920841217, + "learning_rate": 6.325582906262655e-05, + "loss": 1.0003, + "step": 6854 + }, + { + "epoch": 0.4164894586548393, + "grad_norm": 0.23378968238830566, + "learning_rate": 6.324659859532065e-05, + "loss": 1.1527, + "step": 6855 + }, + { + "epoch": 0.4165502156874658, + "grad_norm": 0.28856486082077026, + "learning_rate": 6.323736764245813e-05, + "loss": 1.077, + "step": 6856 + }, + { + "epoch": 0.41661097272009234, + "grad_norm": 1.1506011486053467, + "learning_rate": 6.322813620437732e-05, + "loss": 1.1055, + "step": 6857 + }, + { + "epoch": 0.4166717297527189, + "grad_norm": 0.5873069763183594, + "learning_rate": 6.321890428141665e-05, + "loss": 1.1425, + "step": 6858 + }, + { + "epoch": 0.4167324867853454, + "grad_norm": 0.23544001579284668, + "learning_rate": 6.320967187391447e-05, + "loss": 1.1597, + "step": 6859 + }, + { + "epoch": 0.41679324381797195, + "grad_norm": 0.3136921525001526, + "learning_rate": 6.32004389822092e-05, + "loss": 1.2012, + "step": 6860 + }, + { + "epoch": 0.4168540008505985, + "grad_norm": 0.2939581871032715, + "learning_rate": 6.319120560663932e-05, + "loss": 1.1314, + "step": 6861 + }, + { + "epoch": 0.41691475788322496, + "grad_norm": 0.17378170788288116, + "learning_rate": 6.318197174754325e-05, + "loss": 1.105, + "step": 6862 + }, + { + "epoch": 0.4169755149158515, + "grad_norm": 0.25381678342819214, + "learning_rate": 6.317273740525942e-05, + "loss": 1.1491, + "step": 6863 + }, + { + "epoch": 0.41703627194847803, + "grad_norm": 0.15965205430984497, + "learning_rate": 6.316350258012638e-05, + "loss": 1.1048, + "step": 6864 + }, + { + "epoch": 0.41709702898110457, + "grad_norm": 0.1606505811214447, + "learning_rate": 6.31542672724826e-05, + "loss": 1.1133, + "step": 6865 + }, + { + "epoch": 0.4171577860137311, + "grad_norm": 0.208512544631958, + "learning_rate": 6.314503148266662e-05, + "loss": 1.1683, + "step": 6866 + }, + { + "epoch": 0.41721854304635764, + "grad_norm": 0.3423088788986206, + "learning_rate": 6.313579521101697e-05, + "loss": 1.1472, + "step": 6867 + }, + { + "epoch": 0.4172793000789841, + "grad_norm": 0.23983359336853027, + "learning_rate": 6.31265584578722e-05, + "loss": 1.0281, + "step": 6868 + }, + { + "epoch": 0.41734005711161065, + "grad_norm": 0.1681239902973175, + "learning_rate": 6.31173212235709e-05, + "loss": 1.0925, + "step": 6869 + }, + { + "epoch": 0.4174008141442372, + "grad_norm": 7.762028217315674, + "learning_rate": 6.310808350845164e-05, + "loss": 1.0839, + "step": 6870 + }, + { + "epoch": 0.4174615711768637, + "grad_norm": 0.1919071525335312, + "learning_rate": 6.309884531285307e-05, + "loss": 1.2627, + "step": 6871 + }, + { + "epoch": 0.41752232820949026, + "grad_norm": 0.21320335566997528, + "learning_rate": 6.308960663711376e-05, + "loss": 1.0737, + "step": 6872 + }, + { + "epoch": 0.4175830852421168, + "grad_norm": 0.19712430238723755, + "learning_rate": 6.30803674815724e-05, + "loss": 1.0324, + "step": 6873 + }, + { + "epoch": 0.4176438422747433, + "grad_norm": 0.21667392551898956, + "learning_rate": 6.307112784656766e-05, + "loss": 1.0627, + "step": 6874 + }, + { + "epoch": 0.4177045993073698, + "grad_norm": 0.17969444394111633, + "learning_rate": 6.306188773243819e-05, + "loss": 0.9994, + "step": 6875 + }, + { + "epoch": 0.41776535633999634, + "grad_norm": 0.23410826921463013, + "learning_rate": 6.305264713952269e-05, + "loss": 1.1836, + "step": 6876 + }, + { + "epoch": 0.4178261133726229, + "grad_norm": 0.20407861471176147, + "learning_rate": 6.30434060681599e-05, + "loss": 1.2992, + "step": 6877 + }, + { + "epoch": 0.4178868704052494, + "grad_norm": 0.5336781144142151, + "learning_rate": 6.303416451868853e-05, + "loss": 1.1372, + "step": 6878 + }, + { + "epoch": 0.41794762743787595, + "grad_norm": 0.14235033094882965, + "learning_rate": 6.302492249144736e-05, + "loss": 1.0444, + "step": 6879 + }, + { + "epoch": 0.4180083844705025, + "grad_norm": 27.98836326599121, + "learning_rate": 6.301567998677512e-05, + "loss": 1.1922, + "step": 6880 + }, + { + "epoch": 0.41806914150312896, + "grad_norm": 0.31793883442878723, + "learning_rate": 6.300643700501061e-05, + "loss": 1.2796, + "step": 6881 + }, + { + "epoch": 0.4181298985357555, + "grad_norm": 0.16187778115272522, + "learning_rate": 6.299719354649266e-05, + "loss": 1.0702, + "step": 6882 + }, + { + "epoch": 0.41819065556838203, + "grad_norm": 0.5476409196853638, + "learning_rate": 6.298794961156004e-05, + "loss": 1.319, + "step": 6883 + }, + { + "epoch": 0.41825141260100857, + "grad_norm": 0.2328275740146637, + "learning_rate": 6.297870520055163e-05, + "loss": 1.1273, + "step": 6884 + }, + { + "epoch": 0.4183121696336351, + "grad_norm": 0.12698839604854584, + "learning_rate": 6.296946031380628e-05, + "loss": 1.0523, + "step": 6885 + }, + { + "epoch": 0.41837292666626164, + "grad_norm": 0.14996226131916046, + "learning_rate": 6.296021495166284e-05, + "loss": 1.0498, + "step": 6886 + }, + { + "epoch": 0.41843368369888817, + "grad_norm": 0.20301690697669983, + "learning_rate": 6.295096911446023e-05, + "loss": 1.1121, + "step": 6887 + }, + { + "epoch": 0.41849444073151465, + "grad_norm": 0.19202876091003418, + "learning_rate": 6.294172280253734e-05, + "loss": 1.091, + "step": 6888 + }, + { + "epoch": 0.4185551977641412, + "grad_norm": 19.74171257019043, + "learning_rate": 6.29324760162331e-05, + "loss": 1.0966, + "step": 6889 + }, + { + "epoch": 0.4186159547967677, + "grad_norm": 0.2416202574968338, + "learning_rate": 6.292322875588645e-05, + "loss": 1.1811, + "step": 6890 + }, + { + "epoch": 0.41867671182939425, + "grad_norm": 0.18140235543251038, + "learning_rate": 6.291398102183634e-05, + "loss": 1.0655, + "step": 6891 + }, + { + "epoch": 0.4187374688620208, + "grad_norm": 0.18717539310455322, + "learning_rate": 6.290473281442177e-05, + "loss": 1.1467, + "step": 6892 + }, + { + "epoch": 0.4187982258946473, + "grad_norm": 0.1472216099500656, + "learning_rate": 6.289548413398172e-05, + "loss": 1.0387, + "step": 6893 + }, + { + "epoch": 0.41885898292727386, + "grad_norm": 0.18043498694896698, + "learning_rate": 6.288623498085522e-05, + "loss": 1.0747, + "step": 6894 + }, + { + "epoch": 0.41891973995990034, + "grad_norm": 0.15664604306221008, + "learning_rate": 6.287698535538129e-05, + "loss": 1.112, + "step": 6895 + }, + { + "epoch": 0.4189804969925269, + "grad_norm": 0.2661375403404236, + "learning_rate": 6.286773525789894e-05, + "loss": 1.0764, + "step": 6896 + }, + { + "epoch": 0.4190412540251534, + "grad_norm": 0.14460201561450958, + "learning_rate": 6.285848468874728e-05, + "loss": 1.0316, + "step": 6897 + }, + { + "epoch": 0.41910201105777994, + "grad_norm": 0.22918212413787842, + "learning_rate": 6.28492336482654e-05, + "loss": 1.1167, + "step": 6898 + }, + { + "epoch": 0.4191627680904065, + "grad_norm": 0.1595105230808258, + "learning_rate": 6.283998213679236e-05, + "loss": 1.065, + "step": 6899 + }, + { + "epoch": 0.419223525123033, + "grad_norm": 1.2316800355911255, + "learning_rate": 6.28307301546673e-05, + "loss": 1.0789, + "step": 6900 + }, + { + "epoch": 0.4192842821556595, + "grad_norm": 0.287421315908432, + "learning_rate": 6.282147770222932e-05, + "loss": 1.0824, + "step": 6901 + }, + { + "epoch": 0.41934503918828603, + "grad_norm": 0.1340104341506958, + "learning_rate": 6.281222477981762e-05, + "loss": 1.0551, + "step": 6902 + }, + { + "epoch": 0.41940579622091256, + "grad_norm": 0.35405653715133667, + "learning_rate": 6.280297138777135e-05, + "loss": 1.0831, + "step": 6903 + }, + { + "epoch": 0.4194665532535391, + "grad_norm": 0.13373813033103943, + "learning_rate": 6.279371752642968e-05, + "loss": 1.0476, + "step": 6904 + }, + { + "epoch": 0.41952731028616563, + "grad_norm": 0.14785555005073547, + "learning_rate": 6.278446319613183e-05, + "loss": 1.0999, + "step": 6905 + }, + { + "epoch": 0.41958806731879217, + "grad_norm": 0.11077269911766052, + "learning_rate": 6.2775208397217e-05, + "loss": 1.0273, + "step": 6906 + }, + { + "epoch": 0.4196488243514187, + "grad_norm": 0.11789675801992416, + "learning_rate": 6.276595313002442e-05, + "loss": 1.059, + "step": 6907 + }, + { + "epoch": 0.4197095813840452, + "grad_norm": 0.16572049260139465, + "learning_rate": 6.275669739489339e-05, + "loss": 1.1243, + "step": 6908 + }, + { + "epoch": 0.4197703384166717, + "grad_norm": 2.3128082752227783, + "learning_rate": 6.274744119216314e-05, + "loss": 1.0493, + "step": 6909 + }, + { + "epoch": 0.41983109544929825, + "grad_norm": 0.27944597601890564, + "learning_rate": 6.273818452217295e-05, + "loss": 1.0743, + "step": 6910 + }, + { + "epoch": 0.4198918524819248, + "grad_norm": 0.3292635977268219, + "learning_rate": 6.27289273852622e-05, + "loss": 1.1443, + "step": 6911 + }, + { + "epoch": 0.4199526095145513, + "grad_norm": 0.1531735509634018, + "learning_rate": 6.271966978177009e-05, + "loss": 1.061, + "step": 6912 + }, + { + "epoch": 0.42001336654717786, + "grad_norm": 0.1380385309457779, + "learning_rate": 6.271041171203606e-05, + "loss": 1.0429, + "step": 6913 + }, + { + "epoch": 0.42007412357980434, + "grad_norm": 0.18009182810783386, + "learning_rate": 6.270115317639943e-05, + "loss": 1.1294, + "step": 6914 + }, + { + "epoch": 0.42013488061243087, + "grad_norm": 5.402449607849121, + "learning_rate": 6.269189417519957e-05, + "loss": 1.1245, + "step": 6915 + }, + { + "epoch": 0.4201956376450574, + "grad_norm": 0.1931043416261673, + "learning_rate": 6.268263470877587e-05, + "loss": 1.2073, + "step": 6916 + }, + { + "epoch": 0.42025639467768394, + "grad_norm": 6.12284517288208, + "learning_rate": 6.267337477746776e-05, + "loss": 1.121, + "step": 6917 + }, + { + "epoch": 0.4203171517103105, + "grad_norm": 0.1379459798336029, + "learning_rate": 6.266411438161465e-05, + "loss": 1.0877, + "step": 6918 + }, + { + "epoch": 0.420377908742937, + "grad_norm": 0.18237358331680298, + "learning_rate": 6.265485352155595e-05, + "loss": 1.1493, + "step": 6919 + }, + { + "epoch": 0.42043866577556355, + "grad_norm": 0.1345060020685196, + "learning_rate": 6.264559219763116e-05, + "loss": 1.0716, + "step": 6920 + }, + { + "epoch": 0.42049942280819, + "grad_norm": 0.14291448891162872, + "learning_rate": 6.263633041017975e-05, + "loss": 1.1124, + "step": 6921 + }, + { + "epoch": 0.42056017984081656, + "grad_norm": 0.1362290382385254, + "learning_rate": 6.262706815954121e-05, + "loss": 1.0575, + "step": 6922 + }, + { + "epoch": 0.4206209368734431, + "grad_norm": 0.12679123878479004, + "learning_rate": 6.261780544605504e-05, + "loss": 1.0844, + "step": 6923 + }, + { + "epoch": 0.42068169390606963, + "grad_norm": 0.20512786507606506, + "learning_rate": 6.260854227006079e-05, + "loss": 1.123, + "step": 6924 + }, + { + "epoch": 0.42074245093869617, + "grad_norm": 0.21317705512046814, + "learning_rate": 6.259927863189796e-05, + "loss": 1.1117, + "step": 6925 + }, + { + "epoch": 0.4208032079713227, + "grad_norm": 0.15350694954395294, + "learning_rate": 6.259001453190615e-05, + "loss": 1.0852, + "step": 6926 + }, + { + "epoch": 0.4208639650039492, + "grad_norm": 0.18139959871768951, + "learning_rate": 6.258074997042493e-05, + "loss": 1.1731, + "step": 6927 + }, + { + "epoch": 0.4209247220365757, + "grad_norm": 0.19312353432178497, + "learning_rate": 6.257148494779387e-05, + "loss": 1.2615, + "step": 6928 + }, + { + "epoch": 0.42098547906920225, + "grad_norm": 0.2791098654270172, + "learning_rate": 6.256221946435262e-05, + "loss": 1.0902, + "step": 6929 + }, + { + "epoch": 0.4210462361018288, + "grad_norm": 0.1469917893409729, + "learning_rate": 6.255295352044075e-05, + "loss": 1.0227, + "step": 6930 + }, + { + "epoch": 0.4211069931344553, + "grad_norm": 0.1947174370288849, + "learning_rate": 6.254368711639797e-05, + "loss": 1.1611, + "step": 6931 + }, + { + "epoch": 0.42116775016708186, + "grad_norm": 0.19812403619289398, + "learning_rate": 6.253442025256392e-05, + "loss": 1.0874, + "step": 6932 + }, + { + "epoch": 0.4212285071997084, + "grad_norm": 0.29190686345100403, + "learning_rate": 6.252515292927827e-05, + "loss": 1.1326, + "step": 6933 + }, + { + "epoch": 0.42128926423233487, + "grad_norm": 0.40126460790634155, + "learning_rate": 6.25158851468807e-05, + "loss": 1.079, + "step": 6934 + }, + { + "epoch": 0.4213500212649614, + "grad_norm": 0.23086629807949066, + "learning_rate": 6.250661690571095e-05, + "loss": 1.0901, + "step": 6935 + }, + { + "epoch": 0.42141077829758794, + "grad_norm": 0.4902494251728058, + "learning_rate": 6.249734820610876e-05, + "loss": 1.0133, + "step": 6936 + }, + { + "epoch": 0.4214715353302145, + "grad_norm": 0.1629909873008728, + "learning_rate": 6.248807904841383e-05, + "loss": 1.0541, + "step": 6937 + }, + { + "epoch": 0.421532292362841, + "grad_norm": 0.43007004261016846, + "learning_rate": 6.247880943296596e-05, + "loss": 1.1121, + "step": 6938 + }, + { + "epoch": 0.42159304939546755, + "grad_norm": 0.1909950226545334, + "learning_rate": 6.24695393601049e-05, + "loss": 1.0356, + "step": 6939 + }, + { + "epoch": 0.421653806428094, + "grad_norm": 0.3682610094547272, + "learning_rate": 6.246026883017048e-05, + "loss": 1.1733, + "step": 6940 + }, + { + "epoch": 0.42171456346072056, + "grad_norm": 0.2998441457748413, + "learning_rate": 6.24509978435025e-05, + "loss": 1.0487, + "step": 6941 + }, + { + "epoch": 0.4217753204933471, + "grad_norm": 0.16706110537052155, + "learning_rate": 6.244172640044081e-05, + "loss": 1.1036, + "step": 6942 + }, + { + "epoch": 0.42183607752597363, + "grad_norm": 0.18088893592357635, + "learning_rate": 6.24324545013252e-05, + "loss": 1.0899, + "step": 6943 + }, + { + "epoch": 0.42189683455860016, + "grad_norm": 0.21891342103481293, + "learning_rate": 6.242318214649557e-05, + "loss": 1.0751, + "step": 6944 + }, + { + "epoch": 0.4219575915912267, + "grad_norm": 0.1899438351392746, + "learning_rate": 6.24139093362918e-05, + "loss": 1.0458, + "step": 6945 + }, + { + "epoch": 0.42201834862385323, + "grad_norm": 0.14089667797088623, + "learning_rate": 6.240463607105377e-05, + "loss": 1.0152, + "step": 6946 + }, + { + "epoch": 0.4220791056564797, + "grad_norm": 0.14117169380187988, + "learning_rate": 6.239536235112144e-05, + "loss": 1.0353, + "step": 6947 + }, + { + "epoch": 0.42213986268910625, + "grad_norm": 0.1743745058774948, + "learning_rate": 6.238608817683468e-05, + "loss": 1.0739, + "step": 6948 + }, + { + "epoch": 0.4222006197217328, + "grad_norm": 0.21024319529533386, + "learning_rate": 6.237681354853349e-05, + "loss": 1.1628, + "step": 6949 + }, + { + "epoch": 0.4222613767543593, + "grad_norm": 0.20903562009334564, + "learning_rate": 6.236753846655778e-05, + "loss": 1.1894, + "step": 6950 + }, + { + "epoch": 0.42232213378698585, + "grad_norm": 0.1602024883031845, + "learning_rate": 6.235826293124756e-05, + "loss": 1.034, + "step": 6951 + }, + { + "epoch": 0.4223828908196124, + "grad_norm": 0.19580984115600586, + "learning_rate": 6.234898694294284e-05, + "loss": 1.1636, + "step": 6952 + }, + { + "epoch": 0.4224436478522389, + "grad_norm": 0.13619299232959747, + "learning_rate": 6.233971050198361e-05, + "loss": 1.0798, + "step": 6953 + }, + { + "epoch": 0.4225044048848654, + "grad_norm": 0.14674031734466553, + "learning_rate": 6.23304336087099e-05, + "loss": 1.1592, + "step": 6954 + }, + { + "epoch": 0.42256516191749194, + "grad_norm": 0.169845312833786, + "learning_rate": 6.232115626346178e-05, + "loss": 1.1067, + "step": 6955 + }, + { + "epoch": 0.4226259189501185, + "grad_norm": 0.17236468195915222, + "learning_rate": 6.231187846657927e-05, + "loss": 1.0417, + "step": 6956 + }, + { + "epoch": 0.422686675982745, + "grad_norm": 0.16715261340141296, + "learning_rate": 6.23026002184025e-05, + "loss": 1.1337, + "step": 6957 + }, + { + "epoch": 0.42274743301537154, + "grad_norm": 0.42817223072052, + "learning_rate": 6.229332151927151e-05, + "loss": 1.373, + "step": 6958 + }, + { + "epoch": 0.4228081900479981, + "grad_norm": 0.1281808465719223, + "learning_rate": 6.228404236952647e-05, + "loss": 1.0387, + "step": 6959 + }, + { + "epoch": 0.42286894708062456, + "grad_norm": 0.13899502158164978, + "learning_rate": 6.227476276950748e-05, + "loss": 1.0434, + "step": 6960 + }, + { + "epoch": 0.4229297041132511, + "grad_norm": 0.18265396356582642, + "learning_rate": 6.226548271955466e-05, + "loss": 1.0292, + "step": 6961 + }, + { + "epoch": 0.4229904611458776, + "grad_norm": 0.161569744348526, + "learning_rate": 6.225620222000826e-05, + "loss": 1.1229, + "step": 6962 + }, + { + "epoch": 0.42305121817850416, + "grad_norm": 0.1807115226984024, + "learning_rate": 6.224692127120834e-05, + "loss": 1.0406, + "step": 6963 + }, + { + "epoch": 0.4231119752111307, + "grad_norm": 0.17286376655101776, + "learning_rate": 6.223763987349518e-05, + "loss": 1.1447, + "step": 6964 + }, + { + "epoch": 0.42317273224375723, + "grad_norm": 0.24327607452869415, + "learning_rate": 6.222835802720897e-05, + "loss": 1.048, + "step": 6965 + }, + { + "epoch": 0.42323348927638377, + "grad_norm": 0.20642633736133575, + "learning_rate": 6.22190757326899e-05, + "loss": 1.1765, + "step": 6966 + }, + { + "epoch": 0.42329424630901025, + "grad_norm": 2.7735562324523926, + "learning_rate": 6.220979299027829e-05, + "loss": 1.1822, + "step": 6967 + }, + { + "epoch": 0.4233550033416368, + "grad_norm": 0.152414932847023, + "learning_rate": 6.220050980031433e-05, + "loss": 1.141, + "step": 6968 + }, + { + "epoch": 0.4234157603742633, + "grad_norm": 0.2437283843755722, + "learning_rate": 6.219122616313832e-05, + "loss": 1.1929, + "step": 6969 + }, + { + "epoch": 0.42347651740688985, + "grad_norm": 0.21336692571640015, + "learning_rate": 6.218194207909058e-05, + "loss": 1.1662, + "step": 6970 + }, + { + "epoch": 0.4235372744395164, + "grad_norm": 0.20766429603099823, + "learning_rate": 6.21726575485114e-05, + "loss": 1.0903, + "step": 6971 + }, + { + "epoch": 0.4235980314721429, + "grad_norm": 0.1277417540550232, + "learning_rate": 6.216337257174109e-05, + "loss": 1.1028, + "step": 6972 + }, + { + "epoch": 0.4236587885047694, + "grad_norm": 0.18504175543785095, + "learning_rate": 6.215408714912001e-05, + "loss": 1.1471, + "step": 6973 + }, + { + "epoch": 0.42371954553739594, + "grad_norm": 0.1630711406469345, + "learning_rate": 6.214480128098851e-05, + "loss": 1.0445, + "step": 6974 + }, + { + "epoch": 0.42378030257002247, + "grad_norm": 0.1733415424823761, + "learning_rate": 6.2135514967687e-05, + "loss": 1.0522, + "step": 6975 + }, + { + "epoch": 0.423841059602649, + "grad_norm": 0.19768063724040985, + "learning_rate": 6.212622820955582e-05, + "loss": 1.1364, + "step": 6976 + }, + { + "epoch": 0.42390181663527554, + "grad_norm": 0.15273819863796234, + "learning_rate": 6.211694100693538e-05, + "loss": 1.1118, + "step": 6977 + }, + { + "epoch": 0.4239625736679021, + "grad_norm": 0.17157956957817078, + "learning_rate": 6.210765336016616e-05, + "loss": 1.0561, + "step": 6978 + }, + { + "epoch": 0.4240233307005286, + "grad_norm": 0.36483943462371826, + "learning_rate": 6.209836526958854e-05, + "loss": 1.1137, + "step": 6979 + }, + { + "epoch": 0.4240840877331551, + "grad_norm": 0.1682959347963333, + "learning_rate": 6.208907673554303e-05, + "loss": 1.0244, + "step": 6980 + }, + { + "epoch": 0.4241448447657816, + "grad_norm": 0.3107626438140869, + "learning_rate": 6.207978775837005e-05, + "loss": 1.1828, + "step": 6981 + }, + { + "epoch": 0.42420560179840816, + "grad_norm": 0.1407303810119629, + "learning_rate": 6.207049833841014e-05, + "loss": 1.1248, + "step": 6982 + }, + { + "epoch": 0.4242663588310347, + "grad_norm": 0.16782920062541962, + "learning_rate": 6.206120847600377e-05, + "loss": 1.0773, + "step": 6983 + }, + { + "epoch": 0.42432711586366123, + "grad_norm": 0.19861558079719543, + "learning_rate": 6.205191817149145e-05, + "loss": 1.1012, + "step": 6984 + }, + { + "epoch": 0.42438787289628777, + "grad_norm": 0.15361498296260834, + "learning_rate": 6.204262742521377e-05, + "loss": 1.0886, + "step": 6985 + }, + { + "epoch": 0.42444862992891425, + "grad_norm": 0.14355316758155823, + "learning_rate": 6.203333623751124e-05, + "loss": 1.0522, + "step": 6986 + }, + { + "epoch": 0.4245093869615408, + "grad_norm": 0.21123307943344116, + "learning_rate": 6.202404460872444e-05, + "loss": 1.1052, + "step": 6987 + }, + { + "epoch": 0.4245701439941673, + "grad_norm": 0.20525924861431122, + "learning_rate": 6.201475253919397e-05, + "loss": 1.0438, + "step": 6988 + }, + { + "epoch": 0.42463090102679385, + "grad_norm": 46.03693389892578, + "learning_rate": 6.200546002926043e-05, + "loss": 1.0986, + "step": 6989 + }, + { + "epoch": 0.4246916580594204, + "grad_norm": 0.19850599765777588, + "learning_rate": 6.199616707926441e-05, + "loss": 1.1396, + "step": 6990 + }, + { + "epoch": 0.4247524150920469, + "grad_norm": 0.1901119500398636, + "learning_rate": 6.198687368954659e-05, + "loss": 1.1318, + "step": 6991 + }, + { + "epoch": 0.42481317212467345, + "grad_norm": 0.1583910584449768, + "learning_rate": 6.197757986044759e-05, + "loss": 1.1032, + "step": 6992 + }, + { + "epoch": 0.42487392915729993, + "grad_norm": 0.15106023848056793, + "learning_rate": 6.196828559230809e-05, + "loss": 1.0996, + "step": 6993 + }, + { + "epoch": 0.42493468618992647, + "grad_norm": 0.2375694066286087, + "learning_rate": 6.195899088546875e-05, + "loss": 1.1119, + "step": 6994 + }, + { + "epoch": 0.424995443222553, + "grad_norm": 0.2967996895313263, + "learning_rate": 6.19496957402703e-05, + "loss": 1.0543, + "step": 6995 + }, + { + "epoch": 0.42505620025517954, + "grad_norm": 0.4244559705257416, + "learning_rate": 6.194040015705344e-05, + "loss": 1.2376, + "step": 6996 + }, + { + "epoch": 0.4251169572878061, + "grad_norm": 0.3686823844909668, + "learning_rate": 6.19311041361589e-05, + "loss": 1.0578, + "step": 6997 + }, + { + "epoch": 0.4251777143204326, + "grad_norm": 0.1373891681432724, + "learning_rate": 6.192180767792743e-05, + "loss": 1.0618, + "step": 6998 + }, + { + "epoch": 0.42523847135305914, + "grad_norm": 0.16797654330730438, + "learning_rate": 6.191251078269981e-05, + "loss": 1.0495, + "step": 6999 + }, + { + "epoch": 0.4252992283856856, + "grad_norm": 0.1643468737602234, + "learning_rate": 6.19032134508168e-05, + "loss": 1.0902, + "step": 7000 + }, + { + "epoch": 0.42535998541831216, + "grad_norm": 0.3246098458766937, + "learning_rate": 6.189391568261919e-05, + "loss": 1.1858, + "step": 7001 + }, + { + "epoch": 0.4254207424509387, + "grad_norm": 0.12454608082771301, + "learning_rate": 6.188461747844781e-05, + "loss": 1.0969, + "step": 7002 + }, + { + "epoch": 0.42548149948356523, + "grad_norm": 0.16250506043434143, + "learning_rate": 6.187531883864348e-05, + "loss": 1.1429, + "step": 7003 + }, + { + "epoch": 0.42554225651619176, + "grad_norm": 0.17062848806381226, + "learning_rate": 6.186601976354705e-05, + "loss": 1.1037, + "step": 7004 + }, + { + "epoch": 0.4256030135488183, + "grad_norm": 2.5766096115112305, + "learning_rate": 6.185672025349935e-05, + "loss": 1.1161, + "step": 7005 + }, + { + "epoch": 0.4256637705814448, + "grad_norm": 0.19814927875995636, + "learning_rate": 6.18474203088413e-05, + "loss": 1.2225, + "step": 7006 + }, + { + "epoch": 0.4257245276140713, + "grad_norm": 0.2760142385959625, + "learning_rate": 6.183811992991375e-05, + "loss": 1.0919, + "step": 7007 + }, + { + "epoch": 0.42578528464669785, + "grad_norm": 0.6109820008277893, + "learning_rate": 6.182881911705762e-05, + "loss": 1.2268, + "step": 7008 + }, + { + "epoch": 0.4258460416793244, + "grad_norm": 0.20365901291370392, + "learning_rate": 6.181951787061388e-05, + "loss": 1.1179, + "step": 7009 + }, + { + "epoch": 0.4259067987119509, + "grad_norm": 0.15513215959072113, + "learning_rate": 6.181021619092339e-05, + "loss": 1.1462, + "step": 7010 + }, + { + "epoch": 0.42596755574457745, + "grad_norm": 0.1652374416589737, + "learning_rate": 6.180091407832715e-05, + "loss": 1.0216, + "step": 7011 + }, + { + "epoch": 0.426028312777204, + "grad_norm": 0.13578563928604126, + "learning_rate": 6.179161153316614e-05, + "loss": 1.0311, + "step": 7012 + }, + { + "epoch": 0.42608906980983047, + "grad_norm": 0.20253820717334747, + "learning_rate": 6.17823085557813e-05, + "loss": 1.1829, + "step": 7013 + }, + { + "epoch": 0.426149826842457, + "grad_norm": 0.42810219526290894, + "learning_rate": 6.177300514651367e-05, + "loss": 1.262, + "step": 7014 + }, + { + "epoch": 0.42621058387508354, + "grad_norm": 0.18049682676792145, + "learning_rate": 6.176370130570428e-05, + "loss": 1.1112, + "step": 7015 + }, + { + "epoch": 0.4262713409077101, + "grad_norm": 0.26479053497314453, + "learning_rate": 6.175439703369414e-05, + "loss": 1.1471, + "step": 7016 + }, + { + "epoch": 0.4263320979403366, + "grad_norm": 0.13780274987220764, + "learning_rate": 6.17450923308243e-05, + "loss": 1.0598, + "step": 7017 + }, + { + "epoch": 0.42639285497296314, + "grad_norm": 0.2020138055086136, + "learning_rate": 6.173578719743581e-05, + "loss": 1.0978, + "step": 7018 + }, + { + "epoch": 0.4264536120055896, + "grad_norm": 0.17220433056354523, + "learning_rate": 6.17264816338698e-05, + "loss": 1.0873, + "step": 7019 + }, + { + "epoch": 0.42651436903821616, + "grad_norm": 0.26138007640838623, + "learning_rate": 6.171717564046735e-05, + "loss": 1.0651, + "step": 7020 + }, + { + "epoch": 0.4265751260708427, + "grad_norm": 0.13816316425800323, + "learning_rate": 6.170786921756951e-05, + "loss": 1.0139, + "step": 7021 + }, + { + "epoch": 0.4266358831034692, + "grad_norm": 0.32481956481933594, + "learning_rate": 6.169856236551751e-05, + "loss": 1.1706, + "step": 7022 + }, + { + "epoch": 0.42669664013609576, + "grad_norm": 0.14297835528850555, + "learning_rate": 6.168925508465241e-05, + "loss": 1.0174, + "step": 7023 + }, + { + "epoch": 0.4267573971687223, + "grad_norm": 0.15055494010448456, + "learning_rate": 6.167994737531543e-05, + "loss": 1.0698, + "step": 7024 + }, + { + "epoch": 0.42681815420134883, + "grad_norm": 0.24918074905872345, + "learning_rate": 6.167063923784771e-05, + "loss": 1.0262, + "step": 7025 + }, + { + "epoch": 0.4268789112339753, + "grad_norm": 0.19334658980369568, + "learning_rate": 6.166133067259047e-05, + "loss": 1.0675, + "step": 7026 + }, + { + "epoch": 0.42693966826660185, + "grad_norm": 0.22851526737213135, + "learning_rate": 6.165202167988488e-05, + "loss": 1.1112, + "step": 7027 + }, + { + "epoch": 0.4270004252992284, + "grad_norm": 13.368069648742676, + "learning_rate": 6.16427122600722e-05, + "loss": 1.1536, + "step": 7028 + }, + { + "epoch": 0.4270611823318549, + "grad_norm": 0.5032018423080444, + "learning_rate": 6.163340241349365e-05, + "loss": 1.2312, + "step": 7029 + }, + { + "epoch": 0.42712193936448145, + "grad_norm": 0.33650246262550354, + "learning_rate": 6.162409214049047e-05, + "loss": 1.067, + "step": 7030 + }, + { + "epoch": 0.427182696397108, + "grad_norm": 0.19822436571121216, + "learning_rate": 6.161478144140395e-05, + "loss": 1.1227, + "step": 7031 + }, + { + "epoch": 0.42724345342973447, + "grad_norm": 0.24417605996131897, + "learning_rate": 6.160547031657538e-05, + "loss": 1.1816, + "step": 7032 + }, + { + "epoch": 0.427304210462361, + "grad_norm": 0.20915569365024567, + "learning_rate": 6.159615876634603e-05, + "loss": 1.1855, + "step": 7033 + }, + { + "epoch": 0.42736496749498754, + "grad_norm": 0.16673614084720612, + "learning_rate": 6.158684679105727e-05, + "loss": 1.041, + "step": 7034 + }, + { + "epoch": 0.42742572452761407, + "grad_norm": 0.13468198478221893, + "learning_rate": 6.15775343910504e-05, + "loss": 1.0355, + "step": 7035 + }, + { + "epoch": 0.4274864815602406, + "grad_norm": 0.3226775527000427, + "learning_rate": 6.156822156666674e-05, + "loss": 1.1822, + "step": 7036 + }, + { + "epoch": 0.42754723859286714, + "grad_norm": 0.575821578502655, + "learning_rate": 6.155890831824769e-05, + "loss": 1.0471, + "step": 7037 + }, + { + "epoch": 0.4276079956254937, + "grad_norm": 0.2560516595840454, + "learning_rate": 6.154959464613464e-05, + "loss": 0.9903, + "step": 7038 + }, + { + "epoch": 0.42766875265812015, + "grad_norm": 0.13638198375701904, + "learning_rate": 6.154028055066896e-05, + "loss": 1.0836, + "step": 7039 + }, + { + "epoch": 0.4277295096907467, + "grad_norm": 0.19872820377349854, + "learning_rate": 6.153096603219207e-05, + "loss": 1.1425, + "step": 7040 + }, + { + "epoch": 0.4277902667233732, + "grad_norm": 0.18162383139133453, + "learning_rate": 6.152165109104536e-05, + "loss": 1.1514, + "step": 7041 + }, + { + "epoch": 0.42785102375599976, + "grad_norm": 0.1593349277973175, + "learning_rate": 6.151233572757034e-05, + "loss": 1.1199, + "step": 7042 + }, + { + "epoch": 0.4279117807886263, + "grad_norm": 0.23361849784851074, + "learning_rate": 6.150301994210841e-05, + "loss": 1.2252, + "step": 7043 + }, + { + "epoch": 0.42797253782125283, + "grad_norm": 0.13178184628486633, + "learning_rate": 6.149370373500105e-05, + "loss": 1.0471, + "step": 7044 + }, + { + "epoch": 0.4280332948538793, + "grad_norm": 0.2979399859905243, + "learning_rate": 6.14843871065898e-05, + "loss": 1.1882, + "step": 7045 + }, + { + "epoch": 0.42809405188650584, + "grad_norm": 0.20199747383594513, + "learning_rate": 6.147507005721607e-05, + "loss": 1.1594, + "step": 7046 + }, + { + "epoch": 0.4281548089191324, + "grad_norm": 0.21474188566207886, + "learning_rate": 6.146575258722146e-05, + "loss": 1.1894, + "step": 7047 + }, + { + "epoch": 0.4282155659517589, + "grad_norm": 0.17794427275657654, + "learning_rate": 6.145643469694747e-05, + "loss": 1.105, + "step": 7048 + }, + { + "epoch": 0.42827632298438545, + "grad_norm": 0.5070486068725586, + "learning_rate": 6.144711638673564e-05, + "loss": 1.1564, + "step": 7049 + }, + { + "epoch": 0.428337080017012, + "grad_norm": 0.17088961601257324, + "learning_rate": 6.143779765692756e-05, + "loss": 1.0613, + "step": 7050 + }, + { + "epoch": 0.4283978370496385, + "grad_norm": 0.1595560908317566, + "learning_rate": 6.142847850786477e-05, + "loss": 1.2335, + "step": 7051 + }, + { + "epoch": 0.428458594082265, + "grad_norm": 0.2286539524793625, + "learning_rate": 6.141915893988892e-05, + "loss": 1.1241, + "step": 7052 + }, + { + "epoch": 0.42851935111489153, + "grad_norm": 0.23533058166503906, + "learning_rate": 6.140983895334158e-05, + "loss": 1.1091, + "step": 7053 + }, + { + "epoch": 0.42858010814751807, + "grad_norm": 0.19321134686470032, + "learning_rate": 6.140051854856437e-05, + "loss": 1.2191, + "step": 7054 + }, + { + "epoch": 0.4286408651801446, + "grad_norm": 0.21221891045570374, + "learning_rate": 6.139119772589899e-05, + "loss": 1.1174, + "step": 7055 + }, + { + "epoch": 0.42870162221277114, + "grad_norm": 0.3473241329193115, + "learning_rate": 6.138187648568704e-05, + "loss": 1.2597, + "step": 7056 + }, + { + "epoch": 0.4287623792453977, + "grad_norm": 0.21167616546154022, + "learning_rate": 6.137255482827018e-05, + "loss": 1.1508, + "step": 7057 + }, + { + "epoch": 0.4288231362780242, + "grad_norm": 0.1301553100347519, + "learning_rate": 6.136323275399016e-05, + "loss": 1.0411, + "step": 7058 + }, + { + "epoch": 0.4288838933106507, + "grad_norm": 0.4767073690891266, + "learning_rate": 6.135391026318862e-05, + "loss": 1.3891, + "step": 7059 + }, + { + "epoch": 0.4289446503432772, + "grad_norm": 0.14260655641555786, + "learning_rate": 6.134458735620732e-05, + "loss": 1.0142, + "step": 7060 + }, + { + "epoch": 0.42900540737590376, + "grad_norm": 0.2777833044528961, + "learning_rate": 6.133526403338798e-05, + "loss": 1.0565, + "step": 7061 + }, + { + "epoch": 0.4290661644085303, + "grad_norm": 0.15962089598178864, + "learning_rate": 6.132594029507234e-05, + "loss": 1.0886, + "step": 7062 + }, + { + "epoch": 0.42912692144115683, + "grad_norm": 0.20868034660816193, + "learning_rate": 6.131661614160217e-05, + "loss": 1.1047, + "step": 7063 + }, + { + "epoch": 0.42918767847378336, + "grad_norm": 0.16495519876480103, + "learning_rate": 6.130729157331924e-05, + "loss": 1.151, + "step": 7064 + }, + { + "epoch": 0.42924843550640984, + "grad_norm": 0.17820709943771362, + "learning_rate": 6.129796659056535e-05, + "loss": 1.1386, + "step": 7065 + }, + { + "epoch": 0.4293091925390364, + "grad_norm": 0.1497243046760559, + "learning_rate": 6.128864119368234e-05, + "loss": 1.0805, + "step": 7066 + }, + { + "epoch": 0.4293699495716629, + "grad_norm": 0.3838287889957428, + "learning_rate": 6.127931538301198e-05, + "loss": 1.0966, + "step": 7067 + }, + { + "epoch": 0.42943070660428945, + "grad_norm": 0.16190454363822937, + "learning_rate": 6.126998915889612e-05, + "loss": 1.1048, + "step": 7068 + }, + { + "epoch": 0.429491463636916, + "grad_norm": 0.17607471346855164, + "learning_rate": 6.126066252167667e-05, + "loss": 1.0616, + "step": 7069 + }, + { + "epoch": 0.4295522206695425, + "grad_norm": 0.2324267476797104, + "learning_rate": 6.125133547169543e-05, + "loss": 1.2412, + "step": 7070 + }, + { + "epoch": 0.42961297770216905, + "grad_norm": 0.13477569818496704, + "learning_rate": 6.124200800929431e-05, + "loss": 1.0669, + "step": 7071 + }, + { + "epoch": 0.42967373473479553, + "grad_norm": 0.13471132516860962, + "learning_rate": 6.123268013481523e-05, + "loss": 1.0692, + "step": 7072 + }, + { + "epoch": 0.42973449176742207, + "grad_norm": 0.15257640182971954, + "learning_rate": 6.122335184860008e-05, + "loss": 1.091, + "step": 7073 + }, + { + "epoch": 0.4297952488000486, + "grad_norm": 0.15090148150920868, + "learning_rate": 6.121402315099081e-05, + "loss": 1.1209, + "step": 7074 + }, + { + "epoch": 0.42985600583267514, + "grad_norm": 0.11312448978424072, + "learning_rate": 6.120469404232932e-05, + "loss": 1.0342, + "step": 7075 + }, + { + "epoch": 0.42991676286530167, + "grad_norm": 0.17266380786895752, + "learning_rate": 6.119536452295765e-05, + "loss": 1.1304, + "step": 7076 + }, + { + "epoch": 0.4299775198979282, + "grad_norm": 0.2930562198162079, + "learning_rate": 6.118603459321772e-05, + "loss": 1.1513, + "step": 7077 + }, + { + "epoch": 0.4300382769305547, + "grad_norm": 0.10981795191764832, + "learning_rate": 6.11767042534515e-05, + "loss": 1.0226, + "step": 7078 + }, + { + "epoch": 0.4300990339631812, + "grad_norm": 0.13515910506248474, + "learning_rate": 6.116737350400105e-05, + "loss": 1.0393, + "step": 7079 + }, + { + "epoch": 0.43015979099580776, + "grad_norm": 0.1950734704732895, + "learning_rate": 6.115804234520834e-05, + "loss": 1.0281, + "step": 7080 + }, + { + "epoch": 0.4302205480284343, + "grad_norm": 0.1276920735836029, + "learning_rate": 6.114871077741546e-05, + "loss": 1.0492, + "step": 7081 + }, + { + "epoch": 0.4302813050610608, + "grad_norm": 0.17948336899280548, + "learning_rate": 6.113937880096442e-05, + "loss": 1.1086, + "step": 7082 + }, + { + "epoch": 0.43034206209368736, + "grad_norm": 0.11978979408740997, + "learning_rate": 6.11300464161973e-05, + "loss": 1.0521, + "step": 7083 + }, + { + "epoch": 0.4304028191263139, + "grad_norm": 0.20154602825641632, + "learning_rate": 6.112071362345617e-05, + "loss": 1.1547, + "step": 7084 + }, + { + "epoch": 0.4304635761589404, + "grad_norm": 0.11545953154563904, + "learning_rate": 6.111138042308314e-05, + "loss": 1.03, + "step": 7085 + }, + { + "epoch": 0.4305243331915669, + "grad_norm": 0.14451618492603302, + "learning_rate": 6.110204681542031e-05, + "loss": 1.0605, + "step": 7086 + }, + { + "epoch": 0.43058509022419345, + "grad_norm": 0.2848051190376282, + "learning_rate": 6.109271280080981e-05, + "loss": 1.2883, + "step": 7087 + }, + { + "epoch": 0.43064584725682, + "grad_norm": 0.17974348366260529, + "learning_rate": 6.108337837959377e-05, + "loss": 1.115, + "step": 7088 + }, + { + "epoch": 0.4307066042894465, + "grad_norm": 0.16068078577518463, + "learning_rate": 6.107404355211437e-05, + "loss": 1.0828, + "step": 7089 + }, + { + "epoch": 0.43076736132207305, + "grad_norm": 0.11544747650623322, + "learning_rate": 6.106470831871374e-05, + "loss": 1.0812, + "step": 7090 + }, + { + "epoch": 0.43082811835469953, + "grad_norm": 0.11844844371080399, + "learning_rate": 6.10553726797341e-05, + "loss": 1.0738, + "step": 7091 + }, + { + "epoch": 0.43088887538732606, + "grad_norm": 0.3195345103740692, + "learning_rate": 6.104603663551764e-05, + "loss": 1.1275, + "step": 7092 + }, + { + "epoch": 0.4309496324199526, + "grad_norm": 0.12494722753763199, + "learning_rate": 6.103670018640657e-05, + "loss": 1.0839, + "step": 7093 + }, + { + "epoch": 0.43101038945257913, + "grad_norm": 0.16662168502807617, + "learning_rate": 6.102736333274313e-05, + "loss": 1.1831, + "step": 7094 + }, + { + "epoch": 0.43107114648520567, + "grad_norm": 0.21858850121498108, + "learning_rate": 6.1018026074869536e-05, + "loss": 1.1509, + "step": 7095 + }, + { + "epoch": 0.4311319035178322, + "grad_norm": 0.17780764400959015, + "learning_rate": 6.100868841312808e-05, + "loss": 1.0821, + "step": 7096 + }, + { + "epoch": 0.43119266055045874, + "grad_norm": 0.231523796916008, + "learning_rate": 6.0999350347861016e-05, + "loss": 1.0561, + "step": 7097 + }, + { + "epoch": 0.4312534175830852, + "grad_norm": 0.1594240963459015, + "learning_rate": 6.099001187941063e-05, + "loss": 1.1148, + "step": 7098 + }, + { + "epoch": 0.43131417461571175, + "grad_norm": 0.2271859347820282, + "learning_rate": 6.098067300811927e-05, + "loss": 1.1758, + "step": 7099 + }, + { + "epoch": 0.4313749316483383, + "grad_norm": 0.1946006417274475, + "learning_rate": 6.09713337343292e-05, + "loss": 1.1624, + "step": 7100 + }, + { + "epoch": 0.4314356886809648, + "grad_norm": 0.12236620485782623, + "learning_rate": 6.096199405838274e-05, + "loss": 1.0467, + "step": 7101 + }, + { + "epoch": 0.43149644571359136, + "grad_norm": 0.18780270218849182, + "learning_rate": 6.095265398062231e-05, + "loss": 1.0469, + "step": 7102 + }, + { + "epoch": 0.4315572027462179, + "grad_norm": 0.2614232301712036, + "learning_rate": 6.094331350139021e-05, + "loss": 1.0238, + "step": 7103 + }, + { + "epoch": 0.43161795977884443, + "grad_norm": 0.14401251077651978, + "learning_rate": 6.093397262102884e-05, + "loss": 1.0763, + "step": 7104 + }, + { + "epoch": 0.4316787168114709, + "grad_norm": 0.33712613582611084, + "learning_rate": 6.09246313398806e-05, + "loss": 1.1354, + "step": 7105 + }, + { + "epoch": 0.43173947384409744, + "grad_norm": 0.40387997031211853, + "learning_rate": 6.0915289658287877e-05, + "loss": 1.0825, + "step": 7106 + }, + { + "epoch": 0.431800230876724, + "grad_norm": 0.13862301409244537, + "learning_rate": 6.090594757659311e-05, + "loss": 1.043, + "step": 7107 + }, + { + "epoch": 0.4318609879093505, + "grad_norm": 0.263348251581192, + "learning_rate": 6.089660509513872e-05, + "loss": 1.0176, + "step": 7108 + }, + { + "epoch": 0.43192174494197705, + "grad_norm": 0.3066016435623169, + "learning_rate": 6.0887262214267174e-05, + "loss": 1.0681, + "step": 7109 + }, + { + "epoch": 0.4319825019746036, + "grad_norm": 0.13966190814971924, + "learning_rate": 6.0877918934320924e-05, + "loss": 1.0875, + "step": 7110 + }, + { + "epoch": 0.43204325900723006, + "grad_norm": 0.27148106694221497, + "learning_rate": 6.086857525564244e-05, + "loss": 1.002, + "step": 7111 + }, + { + "epoch": 0.4321040160398566, + "grad_norm": 0.18690867722034454, + "learning_rate": 6.085923117857424e-05, + "loss": 1.0406, + "step": 7112 + }, + { + "epoch": 0.43216477307248313, + "grad_norm": 0.21091775596141815, + "learning_rate": 6.0849886703458826e-05, + "loss": 1.2286, + "step": 7113 + }, + { + "epoch": 0.43222553010510967, + "grad_norm": 0.2024928480386734, + "learning_rate": 6.08405418306387e-05, + "loss": 1.0651, + "step": 7114 + }, + { + "epoch": 0.4322862871377362, + "grad_norm": 0.20826666057109833, + "learning_rate": 6.083119656045643e-05, + "loss": 1.1842, + "step": 7115 + }, + { + "epoch": 0.43234704417036274, + "grad_norm": 0.22445645928382874, + "learning_rate": 6.082185089325455e-05, + "loss": 1.1479, + "step": 7116 + }, + { + "epoch": 0.4324078012029893, + "grad_norm": 0.20395037531852722, + "learning_rate": 6.0812504829375636e-05, + "loss": 1.103, + "step": 7117 + }, + { + "epoch": 0.43246855823561575, + "grad_norm": 0.2830505669116974, + "learning_rate": 6.080315836916228e-05, + "loss": 1.0743, + "step": 7118 + }, + { + "epoch": 0.4325293152682423, + "grad_norm": 1.192375659942627, + "learning_rate": 6.079381151295706e-05, + "loss": 1.0642, + "step": 7119 + }, + { + "epoch": 0.4325900723008688, + "grad_norm": 0.1309729963541031, + "learning_rate": 6.0784464261102584e-05, + "loss": 1.0456, + "step": 7120 + }, + { + "epoch": 0.43265082933349536, + "grad_norm": 0.24944883584976196, + "learning_rate": 6.07751166139415e-05, + "loss": 1.1335, + "step": 7121 + }, + { + "epoch": 0.4327115863661219, + "grad_norm": 0.13679952919483185, + "learning_rate": 6.076576857181642e-05, + "loss": 1.0541, + "step": 7122 + }, + { + "epoch": 0.4327723433987484, + "grad_norm": 0.17177702486515045, + "learning_rate": 6.0756420135070016e-05, + "loss": 1.0998, + "step": 7123 + }, + { + "epoch": 0.4328331004313749, + "grad_norm": 0.11775720864534378, + "learning_rate": 6.0747071304044943e-05, + "loss": 1.0794, + "step": 7124 + }, + { + "epoch": 0.43289385746400144, + "grad_norm": 0.16918975114822388, + "learning_rate": 6.07377220790839e-05, + "loss": 1.1037, + "step": 7125 + }, + { + "epoch": 0.432954614496628, + "grad_norm": 0.16255351901054382, + "learning_rate": 6.0728372460529594e-05, + "loss": 1.1624, + "step": 7126 + }, + { + "epoch": 0.4330153715292545, + "grad_norm": 0.1941896378993988, + "learning_rate": 6.07190224487247e-05, + "loss": 1.1485, + "step": 7127 + }, + { + "epoch": 0.43307612856188105, + "grad_norm": 0.22777590155601501, + "learning_rate": 6.070967204401199e-05, + "loss": 1.1124, + "step": 7128 + }, + { + "epoch": 0.4331368855945076, + "grad_norm": 0.4661976397037506, + "learning_rate": 6.070032124673416e-05, + "loss": 1.1488, + "step": 7129 + }, + { + "epoch": 0.4331976426271341, + "grad_norm": 0.12373591214418411, + "learning_rate": 6.0690970057234e-05, + "loss": 1.0287, + "step": 7130 + }, + { + "epoch": 0.4332583996597606, + "grad_norm": 0.15416377782821655, + "learning_rate": 6.068161847585426e-05, + "loss": 1.0552, + "step": 7131 + }, + { + "epoch": 0.43331915669238713, + "grad_norm": 0.13549821078777313, + "learning_rate": 6.067226650293774e-05, + "loss": 1.1038, + "step": 7132 + }, + { + "epoch": 0.43337991372501367, + "grad_norm": 0.921561062335968, + "learning_rate": 6.066291413882721e-05, + "loss": 1.0443, + "step": 7133 + }, + { + "epoch": 0.4334406707576402, + "grad_norm": 3.5042123794555664, + "learning_rate": 6.06535613838655e-05, + "loss": 1.1176, + "step": 7134 + }, + { + "epoch": 0.43350142779026674, + "grad_norm": 0.15883924067020416, + "learning_rate": 6.064420823839545e-05, + "loss": 1.0985, + "step": 7135 + }, + { + "epoch": 0.43356218482289327, + "grad_norm": 0.19465826451778412, + "learning_rate": 6.0634854702759904e-05, + "loss": 1.1411, + "step": 7136 + }, + { + "epoch": 0.43362294185551975, + "grad_norm": 0.1418951004743576, + "learning_rate": 6.062550077730167e-05, + "loss": 1.0655, + "step": 7137 + }, + { + "epoch": 0.4336836988881463, + "grad_norm": 0.15734487771987915, + "learning_rate": 6.0616146462363664e-05, + "loss": 1.046, + "step": 7138 + }, + { + "epoch": 0.4337444559207728, + "grad_norm": 0.2207980901002884, + "learning_rate": 6.060679175828877e-05, + "loss": 1.0857, + "step": 7139 + }, + { + "epoch": 0.43380521295339936, + "grad_norm": 0.16430488228797913, + "learning_rate": 6.059743666541986e-05, + "loss": 1.1034, + "step": 7140 + }, + { + "epoch": 0.4338659699860259, + "grad_norm": 0.19490373134613037, + "learning_rate": 6.0588081184099865e-05, + "loss": 1.1176, + "step": 7141 + }, + { + "epoch": 0.4339267270186524, + "grad_norm": 0.15159621834754944, + "learning_rate": 6.05787253146717e-05, + "loss": 1.147, + "step": 7142 + }, + { + "epoch": 0.43398748405127896, + "grad_norm": 0.14486467838287354, + "learning_rate": 6.056936905747834e-05, + "loss": 1.0935, + "step": 7143 + }, + { + "epoch": 0.43404824108390544, + "grad_norm": 0.15860125422477722, + "learning_rate": 6.05600124128627e-05, + "loss": 1.0545, + "step": 7144 + }, + { + "epoch": 0.434108998116532, + "grad_norm": 0.1415642946958542, + "learning_rate": 6.055065538116774e-05, + "loss": 1.1636, + "step": 7145 + }, + { + "epoch": 0.4341697551491585, + "grad_norm": 0.13268671929836273, + "learning_rate": 6.054129796273651e-05, + "loss": 1.0779, + "step": 7146 + }, + { + "epoch": 0.43423051218178504, + "grad_norm": 0.21452026069164276, + "learning_rate": 6.0531940157911924e-05, + "loss": 1.1252, + "step": 7147 + }, + { + "epoch": 0.4342912692144116, + "grad_norm": 68.19952392578125, + "learning_rate": 6.052258196703706e-05, + "loss": 1.0644, + "step": 7148 + }, + { + "epoch": 0.4343520262470381, + "grad_norm": 0.4241727888584137, + "learning_rate": 6.051322339045493e-05, + "loss": 1.2822, + "step": 7149 + }, + { + "epoch": 0.4344127832796646, + "grad_norm": 0.15386973321437836, + "learning_rate": 6.050386442850854e-05, + "loss": 1.0769, + "step": 7150 + }, + { + "epoch": 0.43447354031229113, + "grad_norm": 0.14582303166389465, + "learning_rate": 6.049450508154098e-05, + "loss": 1.0454, + "step": 7151 + }, + { + "epoch": 0.43453429734491766, + "grad_norm": 0.17524971067905426, + "learning_rate": 6.04851453498953e-05, + "loss": 1.1636, + "step": 7152 + }, + { + "epoch": 0.4345950543775442, + "grad_norm": 0.22474323213100433, + "learning_rate": 6.047578523391459e-05, + "loss": 1.1451, + "step": 7153 + }, + { + "epoch": 0.43465581141017073, + "grad_norm": 0.15816301107406616, + "learning_rate": 6.046642473394196e-05, + "loss": 1.1828, + "step": 7154 + }, + { + "epoch": 0.43471656844279727, + "grad_norm": 0.3579353094100952, + "learning_rate": 6.045706385032049e-05, + "loss": 1.2668, + "step": 7155 + }, + { + "epoch": 0.4347773254754238, + "grad_norm": 0.1534872055053711, + "learning_rate": 6.044770258339335e-05, + "loss": 1.0896, + "step": 7156 + }, + { + "epoch": 0.4348380825080503, + "grad_norm": 0.15883764624595642, + "learning_rate": 6.0438340933503644e-05, + "loss": 1.0624, + "step": 7157 + }, + { + "epoch": 0.4348988395406768, + "grad_norm": 0.11349524557590485, + "learning_rate": 6.0428978900994516e-05, + "loss": 1.0198, + "step": 7158 + }, + { + "epoch": 0.43495959657330335, + "grad_norm": 0.1767425686120987, + "learning_rate": 6.0419616486209174e-05, + "loss": 1.0026, + "step": 7159 + }, + { + "epoch": 0.4350203536059299, + "grad_norm": 0.13428759574890137, + "learning_rate": 6.041025368949076e-05, + "loss": 1.0105, + "step": 7160 + }, + { + "epoch": 0.4350811106385564, + "grad_norm": 0.15892168879508972, + "learning_rate": 6.0400890511182485e-05, + "loss": 1.1038, + "step": 7161 + }, + { + "epoch": 0.43514186767118296, + "grad_norm": 0.14943519234657288, + "learning_rate": 6.039152695162756e-05, + "loss": 1.0742, + "step": 7162 + }, + { + "epoch": 0.4352026247038095, + "grad_norm": 0.602253258228302, + "learning_rate": 6.038216301116921e-05, + "loss": 1.2142, + "step": 7163 + }, + { + "epoch": 0.435263381736436, + "grad_norm": 0.14757990837097168, + "learning_rate": 6.037279869015067e-05, + "loss": 1.0739, + "step": 7164 + }, + { + "epoch": 0.4353241387690625, + "grad_norm": 0.14695501327514648, + "learning_rate": 6.0363433988915176e-05, + "loss": 1.0311, + "step": 7165 + }, + { + "epoch": 0.43538489580168904, + "grad_norm": 1.7526659965515137, + "learning_rate": 6.0354068907806014e-05, + "loss": 1.0223, + "step": 7166 + }, + { + "epoch": 0.4354456528343156, + "grad_norm": 0.1997620165348053, + "learning_rate": 6.034470344716646e-05, + "loss": 1.0764, + "step": 7167 + }, + { + "epoch": 0.4355064098669421, + "grad_norm": 0.17104572057724, + "learning_rate": 6.033533760733978e-05, + "loss": 1.0475, + "step": 7168 + }, + { + "epoch": 0.43556716689956865, + "grad_norm": 0.19382449984550476, + "learning_rate": 6.0325971388669334e-05, + "loss": 1.1337, + "step": 7169 + }, + { + "epoch": 0.4356279239321951, + "grad_norm": 0.18365032970905304, + "learning_rate": 6.03166047914984e-05, + "loss": 1.0475, + "step": 7170 + }, + { + "epoch": 0.43568868096482166, + "grad_norm": 0.16334949433803558, + "learning_rate": 6.0307237816170304e-05, + "loss": 1.1123, + "step": 7171 + }, + { + "epoch": 0.4357494379974482, + "grad_norm": 0.5135619640350342, + "learning_rate": 6.029787046302843e-05, + "loss": 1.2474, + "step": 7172 + }, + { + "epoch": 0.43581019503007473, + "grad_norm": 0.13248133659362793, + "learning_rate": 6.028850273241612e-05, + "loss": 1.0637, + "step": 7173 + }, + { + "epoch": 0.43587095206270127, + "grad_norm": 2.4597551822662354, + "learning_rate": 6.027913462467676e-05, + "loss": 1.0462, + "step": 7174 + }, + { + "epoch": 0.4359317090953278, + "grad_norm": 0.2982158362865448, + "learning_rate": 6.026976614015373e-05, + "loss": 1.0594, + "step": 7175 + }, + { + "epoch": 0.43599246612795434, + "grad_norm": 0.29003480076789856, + "learning_rate": 6.0260397279190437e-05, + "loss": 1.1222, + "step": 7176 + }, + { + "epoch": 0.4360532231605808, + "grad_norm": 0.33089935779571533, + "learning_rate": 6.02510280421303e-05, + "loss": 1.2227, + "step": 7177 + }, + { + "epoch": 0.43611398019320735, + "grad_norm": 0.5047960877418518, + "learning_rate": 6.0241658429316747e-05, + "loss": 1.1323, + "step": 7178 + }, + { + "epoch": 0.4361747372258339, + "grad_norm": 0.1691085547208786, + "learning_rate": 6.0232288441093224e-05, + "loss": 1.0994, + "step": 7179 + }, + { + "epoch": 0.4362354942584604, + "grad_norm": 0.1709735244512558, + "learning_rate": 6.022291807780319e-05, + "loss": 1.0786, + "step": 7180 + }, + { + "epoch": 0.43629625129108696, + "grad_norm": 0.25987717509269714, + "learning_rate": 6.0213547339790097e-05, + "loss": 1.0558, + "step": 7181 + }, + { + "epoch": 0.4363570083237135, + "grad_norm": 0.14235125482082367, + "learning_rate": 6.020417622739748e-05, + "loss": 1.0835, + "step": 7182 + }, + { + "epoch": 0.43641776535633997, + "grad_norm": 0.9190377593040466, + "learning_rate": 6.01948047409688e-05, + "loss": 1.1797, + "step": 7183 + }, + { + "epoch": 0.4364785223889665, + "grad_norm": 0.33326396346092224, + "learning_rate": 6.018543288084759e-05, + "loss": 1.2175, + "step": 7184 + }, + { + "epoch": 0.43653927942159304, + "grad_norm": 0.20607006549835205, + "learning_rate": 6.0176060647377367e-05, + "loss": 1.0854, + "step": 7185 + }, + { + "epoch": 0.4366000364542196, + "grad_norm": 0.15370653569698334, + "learning_rate": 6.016668804090168e-05, + "loss": 1.08, + "step": 7186 + }, + { + "epoch": 0.4366607934868461, + "grad_norm": 0.3283064663410187, + "learning_rate": 6.0157315061764066e-05, + "loss": 1.2804, + "step": 7187 + }, + { + "epoch": 0.43672155051947265, + "grad_norm": 0.14727094769477844, + "learning_rate": 6.014794171030811e-05, + "loss": 1.0919, + "step": 7188 + }, + { + "epoch": 0.4367823075520992, + "grad_norm": 0.12322274595499039, + "learning_rate": 6.013856798687738e-05, + "loss": 1.0545, + "step": 7189 + }, + { + "epoch": 0.43684306458472566, + "grad_norm": 0.11874996870756149, + "learning_rate": 6.012919389181549e-05, + "loss": 1.0469, + "step": 7190 + }, + { + "epoch": 0.4369038216173522, + "grad_norm": 0.9322015047073364, + "learning_rate": 6.011981942546603e-05, + "loss": 1.1716, + "step": 7191 + }, + { + "epoch": 0.43696457864997873, + "grad_norm": 0.177250936627388, + "learning_rate": 6.0110444588172645e-05, + "loss": 1.0946, + "step": 7192 + }, + { + "epoch": 0.43702533568260526, + "grad_norm": 0.18527120351791382, + "learning_rate": 6.010106938027896e-05, + "loss": 1.0729, + "step": 7193 + }, + { + "epoch": 0.4370860927152318, + "grad_norm": 0.24348853528499603, + "learning_rate": 6.0091693802128614e-05, + "loss": 1.0784, + "step": 7194 + }, + { + "epoch": 0.43714684974785833, + "grad_norm": 0.1880393922328949, + "learning_rate": 6.0082317854065285e-05, + "loss": 1.1799, + "step": 7195 + }, + { + "epoch": 0.4372076067804848, + "grad_norm": 0.30407989025115967, + "learning_rate": 6.0072941536432644e-05, + "loss": 1.1176, + "step": 7196 + }, + { + "epoch": 0.43726836381311135, + "grad_norm": 0.1593843698501587, + "learning_rate": 6.0063564849574385e-05, + "loss": 1.1005, + "step": 7197 + }, + { + "epoch": 0.4373291208457379, + "grad_norm": 0.12760384380817413, + "learning_rate": 6.005418779383421e-05, + "loss": 1.0543, + "step": 7198 + }, + { + "epoch": 0.4373898778783644, + "grad_norm": 0.1750766932964325, + "learning_rate": 6.0044810369555846e-05, + "loss": 1.1043, + "step": 7199 + }, + { + "epoch": 0.43745063491099095, + "grad_norm": 0.12246454507112503, + "learning_rate": 6.003543257708302e-05, + "loss": 1.0177, + "step": 7200 + }, + { + "epoch": 0.4375113919436175, + "grad_norm": 0.14238403737545013, + "learning_rate": 6.002605441675946e-05, + "loss": 1.0319, + "step": 7201 + }, + { + "epoch": 0.437572148976244, + "grad_norm": 0.2159753292798996, + "learning_rate": 6.001667588892893e-05, + "loss": 1.1293, + "step": 7202 + }, + { + "epoch": 0.4376329060088705, + "grad_norm": 0.26794809103012085, + "learning_rate": 6.000729699393523e-05, + "loss": 1.0496, + "step": 7203 + }, + { + "epoch": 0.43769366304149704, + "grad_norm": 0.18779681622982025, + "learning_rate": 5.999791773212211e-05, + "loss": 1.1013, + "step": 7204 + }, + { + "epoch": 0.4377544200741236, + "grad_norm": 0.22233515977859497, + "learning_rate": 5.998853810383339e-05, + "loss": 0.9845, + "step": 7205 + }, + { + "epoch": 0.4378151771067501, + "grad_norm": 1.4402379989624023, + "learning_rate": 5.9979158109412894e-05, + "loss": 1.1515, + "step": 7206 + }, + { + "epoch": 0.43787593413937664, + "grad_norm": 0.2819216251373291, + "learning_rate": 5.996977774920439e-05, + "loss": 1.0863, + "step": 7207 + }, + { + "epoch": 0.4379366911720032, + "grad_norm": 0.1818607747554779, + "learning_rate": 5.996039702355177e-05, + "loss": 1.1223, + "step": 7208 + }, + { + "epoch": 0.4379974482046297, + "grad_norm": 0.14225588738918304, + "learning_rate": 5.995101593279888e-05, + "loss": 1.0412, + "step": 7209 + }, + { + "epoch": 0.4380582052372562, + "grad_norm": 0.1902163326740265, + "learning_rate": 5.9941634477289576e-05, + "loss": 1.0985, + "step": 7210 + }, + { + "epoch": 0.43811896226988273, + "grad_norm": 0.1476496011018753, + "learning_rate": 5.993225265736774e-05, + "loss": 1.1104, + "step": 7211 + }, + { + "epoch": 0.43817971930250926, + "grad_norm": 0.26813390851020813, + "learning_rate": 5.9922870473377244e-05, + "loss": 1.0151, + "step": 7212 + }, + { + "epoch": 0.4382404763351358, + "grad_norm": 0.15807177126407623, + "learning_rate": 5.991348792566204e-05, + "loss": 1.0277, + "step": 7213 + }, + { + "epoch": 0.43830123336776233, + "grad_norm": 0.3268468976020813, + "learning_rate": 5.9904105014566e-05, + "loss": 1.1809, + "step": 7214 + }, + { + "epoch": 0.43836199040038887, + "grad_norm": 0.37448757886886597, + "learning_rate": 5.989472174043308e-05, + "loss": 1.0742, + "step": 7215 + }, + { + "epoch": 0.43842274743301535, + "grad_norm": 0.23370249569416046, + "learning_rate": 5.988533810360724e-05, + "loss": 1.0567, + "step": 7216 + }, + { + "epoch": 0.4384835044656419, + "grad_norm": 0.5218729376792908, + "learning_rate": 5.9875954104432394e-05, + "loss": 1.1128, + "step": 7217 + }, + { + "epoch": 0.4385442614982684, + "grad_norm": 0.20440687239170074, + "learning_rate": 5.9866569743252555e-05, + "loss": 1.1062, + "step": 7218 + }, + { + "epoch": 0.43860501853089495, + "grad_norm": 0.21167246997356415, + "learning_rate": 5.98571850204117e-05, + "loss": 1.0505, + "step": 7219 + }, + { + "epoch": 0.4386657755635215, + "grad_norm": 0.22053633630275726, + "learning_rate": 5.984779993625381e-05, + "loss": 1.0248, + "step": 7220 + }, + { + "epoch": 0.438726532596148, + "grad_norm": 0.1635219156742096, + "learning_rate": 5.983841449112292e-05, + "loss": 1.0654, + "step": 7221 + }, + { + "epoch": 0.43878728962877456, + "grad_norm": 0.23663997650146484, + "learning_rate": 5.9829028685363044e-05, + "loss": 1.0579, + "step": 7222 + }, + { + "epoch": 0.43884804666140104, + "grad_norm": 0.1273733675479889, + "learning_rate": 5.981964251931822e-05, + "loss": 1.0797, + "step": 7223 + }, + { + "epoch": 0.43890880369402757, + "grad_norm": 0.2503994405269623, + "learning_rate": 5.98102559933325e-05, + "loss": 1.1365, + "step": 7224 + }, + { + "epoch": 0.4389695607266541, + "grad_norm": 0.24734438955783844, + "learning_rate": 5.980086910774994e-05, + "loss": 1.1173, + "step": 7225 + }, + { + "epoch": 0.43903031775928064, + "grad_norm": 0.12965643405914307, + "learning_rate": 5.9791481862914645e-05, + "loss": 1.0925, + "step": 7226 + }, + { + "epoch": 0.4390910747919072, + "grad_norm": 0.21391692757606506, + "learning_rate": 5.978209425917067e-05, + "loss": 1.2036, + "step": 7227 + }, + { + "epoch": 0.4391518318245337, + "grad_norm": 0.3929983079433441, + "learning_rate": 5.977270629686216e-05, + "loss": 1.0999, + "step": 7228 + }, + { + "epoch": 0.4392125888571602, + "grad_norm": 0.18287576735019684, + "learning_rate": 5.976331797633321e-05, + "loss": 1.0769, + "step": 7229 + }, + { + "epoch": 0.4392733458897867, + "grad_norm": 0.16073887050151825, + "learning_rate": 5.9753929297927945e-05, + "loss": 1.1016, + "step": 7230 + }, + { + "epoch": 0.43933410292241326, + "grad_norm": 0.20056360960006714, + "learning_rate": 5.974454026199051e-05, + "loss": 1.1041, + "step": 7231 + }, + { + "epoch": 0.4393948599550398, + "grad_norm": 0.30369260907173157, + "learning_rate": 5.9735150868865086e-05, + "loss": 1.1303, + "step": 7232 + }, + { + "epoch": 0.43945561698766633, + "grad_norm": 0.17466102540493011, + "learning_rate": 5.9725761118895816e-05, + "loss": 1.1051, + "step": 7233 + }, + { + "epoch": 0.43951637402029287, + "grad_norm": 0.1584279090166092, + "learning_rate": 5.9716371012426885e-05, + "loss": 1.085, + "step": 7234 + }, + { + "epoch": 0.4395771310529194, + "grad_norm": 0.25505560636520386, + "learning_rate": 5.9706980549802494e-05, + "loss": 1.2213, + "step": 7235 + }, + { + "epoch": 0.4396378880855459, + "grad_norm": 0.1809869259595871, + "learning_rate": 5.969758973136688e-05, + "loss": 1.1755, + "step": 7236 + }, + { + "epoch": 0.4396986451181724, + "grad_norm": 0.13349549472332, + "learning_rate": 5.968819855746423e-05, + "loss": 1.0584, + "step": 7237 + }, + { + "epoch": 0.43975940215079895, + "grad_norm": 0.2590322494506836, + "learning_rate": 5.9678807028438766e-05, + "loss": 1.1261, + "step": 7238 + }, + { + "epoch": 0.4398201591834255, + "grad_norm": 0.268155038356781, + "learning_rate": 5.9669415144634775e-05, + "loss": 1.0605, + "step": 7239 + }, + { + "epoch": 0.439880916216052, + "grad_norm": 0.14229999482631683, + "learning_rate": 5.966002290639651e-05, + "loss": 1.0538, + "step": 7240 + }, + { + "epoch": 0.43994167324867856, + "grad_norm": 0.17587120831012726, + "learning_rate": 5.9650630314068226e-05, + "loss": 1.0914, + "step": 7241 + }, + { + "epoch": 0.44000243028130503, + "grad_norm": 0.29001328349113464, + "learning_rate": 5.964123736799423e-05, + "loss": 1.0644, + "step": 7242 + }, + { + "epoch": 0.44006318731393157, + "grad_norm": 0.19917793571949005, + "learning_rate": 5.9631844068518813e-05, + "loss": 1.1887, + "step": 7243 + }, + { + "epoch": 0.4401239443465581, + "grad_norm": 0.20727324485778809, + "learning_rate": 5.962245041598629e-05, + "loss": 1.0929, + "step": 7244 + }, + { + "epoch": 0.44018470137918464, + "grad_norm": 0.22194448113441467, + "learning_rate": 5.961305641074099e-05, + "loss": 1.1479, + "step": 7245 + }, + { + "epoch": 0.4402454584118112, + "grad_norm": 0.17089296877384186, + "learning_rate": 5.960366205312725e-05, + "loss": 1.0884, + "step": 7246 + }, + { + "epoch": 0.4403062154444377, + "grad_norm": 0.18409684300422668, + "learning_rate": 5.959426734348942e-05, + "loss": 1.1724, + "step": 7247 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 0.27629512548446655, + "learning_rate": 5.9584872282171845e-05, + "loss": 1.2123, + "step": 7248 + }, + { + "epoch": 0.4404277295096907, + "grad_norm": 0.18911081552505493, + "learning_rate": 5.957547686951894e-05, + "loss": 1.0732, + "step": 7249 + }, + { + "epoch": 0.44048848654231726, + "grad_norm": 0.19058875739574432, + "learning_rate": 5.9566081105875093e-05, + "loss": 1.0903, + "step": 7250 + }, + { + "epoch": 0.4405492435749438, + "grad_norm": 0.15656369924545288, + "learning_rate": 5.955668499158468e-05, + "loss": 1.0886, + "step": 7251 + }, + { + "epoch": 0.44061000060757033, + "grad_norm": 0.17857365310192108, + "learning_rate": 5.954728852699213e-05, + "loss": 1.0921, + "step": 7252 + }, + { + "epoch": 0.44067075764019686, + "grad_norm": 0.16066035628318787, + "learning_rate": 5.9537891712441885e-05, + "loss": 1.1337, + "step": 7253 + }, + { + "epoch": 0.4407315146728234, + "grad_norm": 0.22143623232841492, + "learning_rate": 5.952849454827837e-05, + "loss": 1.3762, + "step": 7254 + }, + { + "epoch": 0.4407922717054499, + "grad_norm": 0.12572744488716125, + "learning_rate": 5.951909703484604e-05, + "loss": 1.0206, + "step": 7255 + }, + { + "epoch": 0.4408530287380764, + "grad_norm": 0.12908640503883362, + "learning_rate": 5.950969917248938e-05, + "loss": 1.0384, + "step": 7256 + }, + { + "epoch": 0.44091378577070295, + "grad_norm": 0.15913233160972595, + "learning_rate": 5.950030096155285e-05, + "loss": 1.1174, + "step": 7257 + }, + { + "epoch": 0.4409745428033295, + "grad_norm": 0.1637459397315979, + "learning_rate": 5.9490902402380944e-05, + "loss": 1.0997, + "step": 7258 + }, + { + "epoch": 0.441035299835956, + "grad_norm": 0.13719452917575836, + "learning_rate": 5.948150349531818e-05, + "loss": 1.05, + "step": 7259 + }, + { + "epoch": 0.44109605686858255, + "grad_norm": 0.14094413816928864, + "learning_rate": 5.947210424070908e-05, + "loss": 1.0559, + "step": 7260 + }, + { + "epoch": 0.4411568139012091, + "grad_norm": 0.11847657710313797, + "learning_rate": 5.946270463889816e-05, + "loss": 1.0521, + "step": 7261 + }, + { + "epoch": 0.44121757093383557, + "grad_norm": 0.14528438448905945, + "learning_rate": 5.945330469022997e-05, + "loss": 1.1106, + "step": 7262 + }, + { + "epoch": 0.4412783279664621, + "grad_norm": 0.14616583287715912, + "learning_rate": 5.94439043950491e-05, + "loss": 1.1743, + "step": 7263 + }, + { + "epoch": 0.44133908499908864, + "grad_norm": 0.3158397078514099, + "learning_rate": 5.943450375370006e-05, + "loss": 1.1681, + "step": 7264 + }, + { + "epoch": 0.4413998420317152, + "grad_norm": 0.22226400673389435, + "learning_rate": 5.942510276652746e-05, + "loss": 1.2765, + "step": 7265 + }, + { + "epoch": 0.4414605990643417, + "grad_norm": 0.4645533859729767, + "learning_rate": 5.94157014338759e-05, + "loss": 1.1982, + "step": 7266 + }, + { + "epoch": 0.44152135609696824, + "grad_norm": 0.20253103971481323, + "learning_rate": 5.940629975608999e-05, + "loss": 1.0946, + "step": 7267 + }, + { + "epoch": 0.4415821131295948, + "grad_norm": 0.3862606883049011, + "learning_rate": 5.939689773351434e-05, + "loss": 1.0629, + "step": 7268 + }, + { + "epoch": 0.44164287016222126, + "grad_norm": 0.1699996292591095, + "learning_rate": 5.938749536649357e-05, + "loss": 1.0495, + "step": 7269 + }, + { + "epoch": 0.4417036271948478, + "grad_norm": 0.16358983516693115, + "learning_rate": 5.937809265537239e-05, + "loss": 1.0575, + "step": 7270 + }, + { + "epoch": 0.4417643842274743, + "grad_norm": 0.3102813959121704, + "learning_rate": 5.936868960049538e-05, + "loss": 1.1185, + "step": 7271 + }, + { + "epoch": 0.44182514126010086, + "grad_norm": 0.20680458843708038, + "learning_rate": 5.935928620220723e-05, + "loss": 1.1294, + "step": 7272 + }, + { + "epoch": 0.4418858982927274, + "grad_norm": 0.4974614083766937, + "learning_rate": 5.934988246085266e-05, + "loss": 1.0614, + "step": 7273 + }, + { + "epoch": 0.44194665532535393, + "grad_norm": 0.1325724571943283, + "learning_rate": 5.934047837677632e-05, + "loss": 1.0715, + "step": 7274 + }, + { + "epoch": 0.4420074123579804, + "grad_norm": 0.2840862572193146, + "learning_rate": 5.933107395032295e-05, + "loss": 1.1105, + "step": 7275 + }, + { + "epoch": 0.44206816939060695, + "grad_norm": 0.1518881767988205, + "learning_rate": 5.932166918183726e-05, + "loss": 1.07, + "step": 7276 + }, + { + "epoch": 0.4421289264232335, + "grad_norm": 0.2648848295211792, + "learning_rate": 5.931226407166398e-05, + "loss": 1.1245, + "step": 7277 + }, + { + "epoch": 0.44218968345586, + "grad_norm": 0.1568232923746109, + "learning_rate": 5.930285862014787e-05, + "loss": 1.0539, + "step": 7278 + }, + { + "epoch": 0.44225044048848655, + "grad_norm": 0.159840390086174, + "learning_rate": 5.929345282763367e-05, + "loss": 1.0897, + "step": 7279 + }, + { + "epoch": 0.4423111975211131, + "grad_norm": 0.2896788418292999, + "learning_rate": 5.928404669446617e-05, + "loss": 1.0601, + "step": 7280 + }, + { + "epoch": 0.4423719545537396, + "grad_norm": 0.11926238238811493, + "learning_rate": 5.927464022099014e-05, + "loss": 1.0444, + "step": 7281 + }, + { + "epoch": 0.4424327115863661, + "grad_norm": 0.589303731918335, + "learning_rate": 5.926523340755037e-05, + "loss": 1.136, + "step": 7282 + }, + { + "epoch": 0.44249346861899264, + "grad_norm": 0.26594099402427673, + "learning_rate": 5.92558262544917e-05, + "loss": 1.0513, + "step": 7283 + }, + { + "epoch": 0.44255422565161917, + "grad_norm": 0.2291601002216339, + "learning_rate": 5.924641876215891e-05, + "loss": 1.0835, + "step": 7284 + }, + { + "epoch": 0.4426149826842457, + "grad_norm": 0.15537233650684357, + "learning_rate": 5.9237010930896856e-05, + "loss": 1.0609, + "step": 7285 + }, + { + "epoch": 0.44267573971687224, + "grad_norm": 0.2580200135707855, + "learning_rate": 5.922760276105038e-05, + "loss": 1.1255, + "step": 7286 + }, + { + "epoch": 0.4427364967494988, + "grad_norm": 0.26946792006492615, + "learning_rate": 5.9218194252964355e-05, + "loss": 1.097, + "step": 7287 + }, + { + "epoch": 0.44279725378212526, + "grad_norm": 0.25447866320610046, + "learning_rate": 5.9208785406983624e-05, + "loss": 1.0824, + "step": 7288 + }, + { + "epoch": 0.4428580108147518, + "grad_norm": 0.22378745675086975, + "learning_rate": 5.919937622345309e-05, + "loss": 1.1728, + "step": 7289 + }, + { + "epoch": 0.4429187678473783, + "grad_norm": 0.17891372740268707, + "learning_rate": 5.9189966702717636e-05, + "loss": 1.0486, + "step": 7290 + }, + { + "epoch": 0.44297952488000486, + "grad_norm": 0.1861187368631363, + "learning_rate": 5.918055684512218e-05, + "loss": 1.1243, + "step": 7291 + }, + { + "epoch": 0.4430402819126314, + "grad_norm": 0.20694777369499207, + "learning_rate": 5.917114665101163e-05, + "loss": 1.1523, + "step": 7292 + }, + { + "epoch": 0.44310103894525793, + "grad_norm": 0.13934335112571716, + "learning_rate": 5.916173612073095e-05, + "loss": 1.1038, + "step": 7293 + }, + { + "epoch": 0.44316179597788447, + "grad_norm": 0.14663487672805786, + "learning_rate": 5.915232525462505e-05, + "loss": 1.0764, + "step": 7294 + }, + { + "epoch": 0.44322255301051094, + "grad_norm": 0.45416945219039917, + "learning_rate": 5.914291405303889e-05, + "loss": 1.0441, + "step": 7295 + }, + { + "epoch": 0.4432833100431375, + "grad_norm": 0.2938636541366577, + "learning_rate": 5.913350251631745e-05, + "loss": 1.1389, + "step": 7296 + }, + { + "epoch": 0.443344067075764, + "grad_norm": 0.4333260953426361, + "learning_rate": 5.9124090644805706e-05, + "loss": 1.0706, + "step": 7297 + }, + { + "epoch": 0.44340482410839055, + "grad_norm": 0.15775185823440552, + "learning_rate": 5.911467843884867e-05, + "loss": 1.1443, + "step": 7298 + }, + { + "epoch": 0.4434655811410171, + "grad_norm": 0.1447594314813614, + "learning_rate": 5.910526589879133e-05, + "loss": 1.0979, + "step": 7299 + }, + { + "epoch": 0.4435263381736436, + "grad_norm": 0.14317794144153595, + "learning_rate": 5.9095853024978706e-05, + "loss": 1.0893, + "step": 7300 + }, + { + "epoch": 0.4435870952062701, + "grad_norm": 0.20676583051681519, + "learning_rate": 5.9086439817755833e-05, + "loss": 1.1735, + "step": 7301 + }, + { + "epoch": 0.44364785223889663, + "grad_norm": 0.14047355949878693, + "learning_rate": 5.907702627746775e-05, + "loss": 1.044, + "step": 7302 + }, + { + "epoch": 0.44370860927152317, + "grad_norm": 0.16433227062225342, + "learning_rate": 5.9067612404459516e-05, + "loss": 1.0795, + "step": 7303 + }, + { + "epoch": 0.4437693663041497, + "grad_norm": 0.17662374675273895, + "learning_rate": 5.90581981990762e-05, + "loss": 1.0974, + "step": 7304 + }, + { + "epoch": 0.44383012333677624, + "grad_norm": 0.42414590716362, + "learning_rate": 5.904878366166285e-05, + "loss": 1.1943, + "step": 7305 + }, + { + "epoch": 0.4438908803694028, + "grad_norm": 0.2576986849308014, + "learning_rate": 5.903936879256462e-05, + "loss": 1.1524, + "step": 7306 + }, + { + "epoch": 0.4439516374020293, + "grad_norm": 0.21064572036266327, + "learning_rate": 5.9029953592126575e-05, + "loss": 1.0453, + "step": 7307 + }, + { + "epoch": 0.4440123944346558, + "grad_norm": 0.1555577516555786, + "learning_rate": 5.902053806069382e-05, + "loss": 1.0352, + "step": 7308 + }, + { + "epoch": 0.4440731514672823, + "grad_norm": 0.1901993304491043, + "learning_rate": 5.9011122198611515e-05, + "loss": 1.1059, + "step": 7309 + }, + { + "epoch": 0.44413390849990886, + "grad_norm": 0.3174474537372589, + "learning_rate": 5.900170600622477e-05, + "loss": 1.1541, + "step": 7310 + }, + { + "epoch": 0.4441946655325354, + "grad_norm": 0.18357430398464203, + "learning_rate": 5.899228948387876e-05, + "loss": 1.1525, + "step": 7311 + }, + { + "epoch": 0.44425542256516193, + "grad_norm": 0.1494145542383194, + "learning_rate": 5.898287263191864e-05, + "loss": 1.0441, + "step": 7312 + }, + { + "epoch": 0.44431617959778846, + "grad_norm": 0.18338635563850403, + "learning_rate": 5.8973455450689586e-05, + "loss": 1.1381, + "step": 7313 + }, + { + "epoch": 0.444376936630415, + "grad_norm": 0.19906257092952728, + "learning_rate": 5.896403794053679e-05, + "loss": 1.1647, + "step": 7314 + }, + { + "epoch": 0.4444376936630415, + "grad_norm": 0.30824077129364014, + "learning_rate": 5.895462010180545e-05, + "loss": 1.106, + "step": 7315 + }, + { + "epoch": 0.444498450695668, + "grad_norm": 0.14091327786445618, + "learning_rate": 5.894520193484077e-05, + "loss": 1.0421, + "step": 7316 + }, + { + "epoch": 0.44455920772829455, + "grad_norm": 0.30853745341300964, + "learning_rate": 5.8935783439988004e-05, + "loss": 1.0531, + "step": 7317 + }, + { + "epoch": 0.4446199647609211, + "grad_norm": 0.15057380497455597, + "learning_rate": 5.892636461759234e-05, + "loss": 1.0464, + "step": 7318 + }, + { + "epoch": 0.4446807217935476, + "grad_norm": 0.14891479909420013, + "learning_rate": 5.891694546799909e-05, + "loss": 1.1184, + "step": 7319 + }, + { + "epoch": 0.44474147882617415, + "grad_norm": 0.28520748019218445, + "learning_rate": 5.8907525991553444e-05, + "loss": 1.2007, + "step": 7320 + }, + { + "epoch": 0.44480223585880063, + "grad_norm": 0.17570620775222778, + "learning_rate": 5.889810618860074e-05, + "loss": 1.0749, + "step": 7321 + }, + { + "epoch": 0.44486299289142717, + "grad_norm": 0.19225281476974487, + "learning_rate": 5.888868605948622e-05, + "loss": 1.039, + "step": 7322 + }, + { + "epoch": 0.4449237499240537, + "grad_norm": 0.2215781956911087, + "learning_rate": 5.88792656045552e-05, + "loss": 1.0872, + "step": 7323 + }, + { + "epoch": 0.44498450695668024, + "grad_norm": 1.9763462543487549, + "learning_rate": 5.8869844824152985e-05, + "loss": 1.1749, + "step": 7324 + }, + { + "epoch": 0.44504526398930677, + "grad_norm": 0.15236996114253998, + "learning_rate": 5.886042371862489e-05, + "loss": 0.9973, + "step": 7325 + }, + { + "epoch": 0.4451060210219333, + "grad_norm": 0.169319286942482, + "learning_rate": 5.8851002288316234e-05, + "loss": 1.098, + "step": 7326 + }, + { + "epoch": 0.44516677805455984, + "grad_norm": 0.9842188358306885, + "learning_rate": 5.884158053357241e-05, + "loss": 1.1303, + "step": 7327 + }, + { + "epoch": 0.4452275350871863, + "grad_norm": 0.17401878535747528, + "learning_rate": 5.883215845473872e-05, + "loss": 1.049, + "step": 7328 + }, + { + "epoch": 0.44528829211981286, + "grad_norm": 0.2303812950849533, + "learning_rate": 5.8822736052160553e-05, + "loss": 1.147, + "step": 7329 + }, + { + "epoch": 0.4453490491524394, + "grad_norm": 0.21957561373710632, + "learning_rate": 5.881331332618332e-05, + "loss": 1.1082, + "step": 7330 + }, + { + "epoch": 0.4454098061850659, + "grad_norm": 1.7288386821746826, + "learning_rate": 5.880389027715235e-05, + "loss": 1.3549, + "step": 7331 + }, + { + "epoch": 0.44547056321769246, + "grad_norm": 0.27854982018470764, + "learning_rate": 5.8794466905413095e-05, + "loss": 1.0973, + "step": 7332 + }, + { + "epoch": 0.445531320250319, + "grad_norm": 0.1918107122182846, + "learning_rate": 5.8785043211310954e-05, + "loss": 1.039, + "step": 7333 + }, + { + "epoch": 0.4455920772829455, + "grad_norm": 0.1668788641691208, + "learning_rate": 5.8775619195191355e-05, + "loss": 1.1035, + "step": 7334 + }, + { + "epoch": 0.445652834315572, + "grad_norm": 0.9830353260040283, + "learning_rate": 5.876619485739974e-05, + "loss": 1.1331, + "step": 7335 + }, + { + "epoch": 0.44571359134819855, + "grad_norm": 0.17448976635932922, + "learning_rate": 5.875677019828154e-05, + "loss": 1.0597, + "step": 7336 + }, + { + "epoch": 0.4457743483808251, + "grad_norm": 0.17785082757472992, + "learning_rate": 5.874734521818227e-05, + "loss": 1.0808, + "step": 7337 + }, + { + "epoch": 0.4458351054134516, + "grad_norm": 0.12727798521518707, + "learning_rate": 5.873791991744735e-05, + "loss": 1.0992, + "step": 7338 + }, + { + "epoch": 0.44589586244607815, + "grad_norm": 0.2568991482257843, + "learning_rate": 5.872849429642228e-05, + "loss": 0.9957, + "step": 7339 + }, + { + "epoch": 0.4459566194787047, + "grad_norm": 0.19845077395439148, + "learning_rate": 5.871906835545259e-05, + "loss": 1.0614, + "step": 7340 + }, + { + "epoch": 0.44601737651133117, + "grad_norm": 0.13526736199855804, + "learning_rate": 5.8709642094883736e-05, + "loss": 1.019, + "step": 7341 + }, + { + "epoch": 0.4460781335439577, + "grad_norm": 0.20291964709758759, + "learning_rate": 5.870021551506127e-05, + "loss": 1.0623, + "step": 7342 + }, + { + "epoch": 0.44613889057658423, + "grad_norm": 0.19378912448883057, + "learning_rate": 5.869078861633075e-05, + "loss": 1.184, + "step": 7343 + }, + { + "epoch": 0.44619964760921077, + "grad_norm": 0.1606374830007553, + "learning_rate": 5.868136139903766e-05, + "loss": 1.102, + "step": 7344 + }, + { + "epoch": 0.4462604046418373, + "grad_norm": 2.80609393119812, + "learning_rate": 5.8671933863527606e-05, + "loss": 1.0534, + "step": 7345 + }, + { + "epoch": 0.44632116167446384, + "grad_norm": 0.13264694809913635, + "learning_rate": 5.8662506010146146e-05, + "loss": 1.0595, + "step": 7346 + }, + { + "epoch": 0.4463819187070903, + "grad_norm": 0.5925062298774719, + "learning_rate": 5.865307783923885e-05, + "loss": 1.0506, + "step": 7347 + }, + { + "epoch": 0.44644267573971685, + "grad_norm": 0.14357206225395203, + "learning_rate": 5.86436493511513e-05, + "loss": 1.0716, + "step": 7348 + }, + { + "epoch": 0.4465034327723434, + "grad_norm": 0.2187967151403427, + "learning_rate": 5.8634220546229115e-05, + "loss": 1.1221, + "step": 7349 + }, + { + "epoch": 0.4465641898049699, + "grad_norm": 0.11231549084186554, + "learning_rate": 5.8624791424817926e-05, + "loss": 1.0283, + "step": 7350 + }, + { + "epoch": 0.44662494683759646, + "grad_norm": 0.2309916913509369, + "learning_rate": 5.8615361987263325e-05, + "loss": 1.0409, + "step": 7351 + }, + { + "epoch": 0.446685703870223, + "grad_norm": 0.11349146068096161, + "learning_rate": 5.860593223391096e-05, + "loss": 1.0145, + "step": 7352 + }, + { + "epoch": 0.44674646090284953, + "grad_norm": 0.1541716605424881, + "learning_rate": 5.8596502165106494e-05, + "loss": 1.0077, + "step": 7353 + }, + { + "epoch": 0.446807217935476, + "grad_norm": 0.1144527867436409, + "learning_rate": 5.858707178119558e-05, + "loss": 1.0385, + "step": 7354 + }, + { + "epoch": 0.44686797496810254, + "grad_norm": 0.17247967422008514, + "learning_rate": 5.857764108252389e-05, + "loss": 1.038, + "step": 7355 + }, + { + "epoch": 0.4469287320007291, + "grad_norm": 0.1535053551197052, + "learning_rate": 5.856821006943711e-05, + "loss": 1.0024, + "step": 7356 + }, + { + "epoch": 0.4469894890333556, + "grad_norm": 0.14443626999855042, + "learning_rate": 5.855877874228093e-05, + "loss": 1.0887, + "step": 7357 + }, + { + "epoch": 0.44705024606598215, + "grad_norm": 0.18524456024169922, + "learning_rate": 5.8549347101401066e-05, + "loss": 1.1141, + "step": 7358 + }, + { + "epoch": 0.4471110030986087, + "grad_norm": 0.1365940123796463, + "learning_rate": 5.853991514714322e-05, + "loss": 1.0602, + "step": 7359 + }, + { + "epoch": 0.44717176013123516, + "grad_norm": 0.13903789222240448, + "learning_rate": 5.8530482879853145e-05, + "loss": 0.9976, + "step": 7360 + }, + { + "epoch": 0.4472325171638617, + "grad_norm": 0.19113962352275848, + "learning_rate": 5.8521050299876566e-05, + "loss": 1.182, + "step": 7361 + }, + { + "epoch": 0.44729327419648823, + "grad_norm": 0.18511556088924408, + "learning_rate": 5.851161740755924e-05, + "loss": 1.0859, + "step": 7362 + }, + { + "epoch": 0.44735403122911477, + "grad_norm": 0.23406803607940674, + "learning_rate": 5.8502184203246924e-05, + "loss": 1.0925, + "step": 7363 + }, + { + "epoch": 0.4474147882617413, + "grad_norm": 0.19588711857795715, + "learning_rate": 5.849275068728543e-05, + "loss": 1.1043, + "step": 7364 + }, + { + "epoch": 0.44747554529436784, + "grad_norm": 0.2184978872537613, + "learning_rate": 5.848331686002049e-05, + "loss": 1.1085, + "step": 7365 + }, + { + "epoch": 0.4475363023269944, + "grad_norm": 0.19283030927181244, + "learning_rate": 5.847388272179795e-05, + "loss": 1.1215, + "step": 7366 + }, + { + "epoch": 0.44759705935962085, + "grad_norm": 0.14727739989757538, + "learning_rate": 5.846444827296359e-05, + "loss": 1.0655, + "step": 7367 + }, + { + "epoch": 0.4476578163922474, + "grad_norm": 0.22092007100582123, + "learning_rate": 5.845501351386324e-05, + "loss": 1.1196, + "step": 7368 + }, + { + "epoch": 0.4477185734248739, + "grad_norm": 0.2107936292886734, + "learning_rate": 5.844557844484274e-05, + "loss": 1.0024, + "step": 7369 + }, + { + "epoch": 0.44777933045750046, + "grad_norm": 0.1994905322790146, + "learning_rate": 5.8436143066247936e-05, + "loss": 1.0843, + "step": 7370 + }, + { + "epoch": 0.447840087490127, + "grad_norm": 0.11246003210544586, + "learning_rate": 5.8426707378424675e-05, + "loss": 1.0078, + "step": 7371 + }, + { + "epoch": 0.4479008445227535, + "grad_norm": 0.17775876820087433, + "learning_rate": 5.841727138171882e-05, + "loss": 1.0651, + "step": 7372 + }, + { + "epoch": 0.44796160155538006, + "grad_norm": 0.1904994547367096, + "learning_rate": 5.840783507647625e-05, + "loss": 1.1008, + "step": 7373 + }, + { + "epoch": 0.44802235858800654, + "grad_norm": 0.12441320717334747, + "learning_rate": 5.839839846304287e-05, + "loss": 1.0822, + "step": 7374 + }, + { + "epoch": 0.4480831156206331, + "grad_norm": 0.16783198714256287, + "learning_rate": 5.838896154176456e-05, + "loss": 1.0587, + "step": 7375 + }, + { + "epoch": 0.4481438726532596, + "grad_norm": 0.1952381432056427, + "learning_rate": 5.837952431298726e-05, + "loss": 1.0509, + "step": 7376 + }, + { + "epoch": 0.44820462968588615, + "grad_norm": 0.995356023311615, + "learning_rate": 5.8370086777056866e-05, + "loss": 1.0485, + "step": 7377 + }, + { + "epoch": 0.4482653867185127, + "grad_norm": 0.14168232679367065, + "learning_rate": 5.836064893431933e-05, + "loss": 1.0604, + "step": 7378 + }, + { + "epoch": 0.4483261437511392, + "grad_norm": 0.17297153174877167, + "learning_rate": 5.835121078512058e-05, + "loss": 1.1616, + "step": 7379 + }, + { + "epoch": 0.4483869007837657, + "grad_norm": 0.15718460083007812, + "learning_rate": 5.83417723298066e-05, + "loss": 1.13, + "step": 7380 + }, + { + "epoch": 0.44844765781639223, + "grad_norm": 0.12464726716279984, + "learning_rate": 5.833233356872333e-05, + "loss": 1.0326, + "step": 7381 + }, + { + "epoch": 0.44850841484901877, + "grad_norm": 0.19435523450374603, + "learning_rate": 5.832289450221677e-05, + "loss": 1.0759, + "step": 7382 + }, + { + "epoch": 0.4485691718816453, + "grad_norm": 0.1311478614807129, + "learning_rate": 5.831345513063289e-05, + "loss": 1.0834, + "step": 7383 + }, + { + "epoch": 0.44862992891427184, + "grad_norm": 0.2084561586380005, + "learning_rate": 5.8304015454317726e-05, + "loss": 1.0589, + "step": 7384 + }, + { + "epoch": 0.44869068594689837, + "grad_norm": 0.1819782555103302, + "learning_rate": 5.8294575473617254e-05, + "loss": 1.2061, + "step": 7385 + }, + { + "epoch": 0.4487514429795249, + "grad_norm": 0.1541939675807953, + "learning_rate": 5.8285135188877517e-05, + "loss": 1.0991, + "step": 7386 + }, + { + "epoch": 0.4488122000121514, + "grad_norm": 0.34547290205955505, + "learning_rate": 5.8275694600444565e-05, + "loss": 1.253, + "step": 7387 + }, + { + "epoch": 0.4488729570447779, + "grad_norm": 0.30192795395851135, + "learning_rate": 5.8266253708664396e-05, + "loss": 1.1729, + "step": 7388 + }, + { + "epoch": 0.44893371407740446, + "grad_norm": 0.1295192986726761, + "learning_rate": 5.825681251388313e-05, + "loss": 1.0909, + "step": 7389 + }, + { + "epoch": 0.448994471110031, + "grad_norm": 0.16431687772274017, + "learning_rate": 5.824737101644679e-05, + "loss": 1.1382, + "step": 7390 + }, + { + "epoch": 0.4490552281426575, + "grad_norm": 0.17481756210327148, + "learning_rate": 5.823792921670147e-05, + "loss": 1.1314, + "step": 7391 + }, + { + "epoch": 0.44911598517528406, + "grad_norm": 0.22288189828395844, + "learning_rate": 5.822848711499327e-05, + "loss": 1.0858, + "step": 7392 + }, + { + "epoch": 0.44917674220791054, + "grad_norm": 10.75539493560791, + "learning_rate": 5.8219044711668266e-05, + "loss": 1.0969, + "step": 7393 + }, + { + "epoch": 0.4492374992405371, + "grad_norm": 0.15337887406349182, + "learning_rate": 5.8209602007072615e-05, + "loss": 1.0685, + "step": 7394 + }, + { + "epoch": 0.4492982562731636, + "grad_norm": 0.1962725669145584, + "learning_rate": 5.8200159001552415e-05, + "loss": 1.0642, + "step": 7395 + }, + { + "epoch": 0.44935901330579014, + "grad_norm": 0.19740870594978333, + "learning_rate": 5.819071569545378e-05, + "loss": 1.0856, + "step": 7396 + }, + { + "epoch": 0.4494197703384167, + "grad_norm": 0.12731491029262543, + "learning_rate": 5.81812720891229e-05, + "loss": 1.0393, + "step": 7397 + }, + { + "epoch": 0.4494805273710432, + "grad_norm": 0.1905568540096283, + "learning_rate": 5.817182818290588e-05, + "loss": 1.0499, + "step": 7398 + }, + { + "epoch": 0.44954128440366975, + "grad_norm": 0.18223793804645538, + "learning_rate": 5.816238397714895e-05, + "loss": 1.0821, + "step": 7399 + }, + { + "epoch": 0.44960204143629623, + "grad_norm": 0.15613962709903717, + "learning_rate": 5.815293947219825e-05, + "loss": 1.0468, + "step": 7400 + }, + { + "epoch": 0.44966279846892276, + "grad_norm": 0.16405601799488068, + "learning_rate": 5.814349466839997e-05, + "loss": 1.1184, + "step": 7401 + }, + { + "epoch": 0.4497235555015493, + "grad_norm": 0.16086284816265106, + "learning_rate": 5.8134049566100325e-05, + "loss": 1.0652, + "step": 7402 + }, + { + "epoch": 0.44978431253417583, + "grad_norm": 0.1482958048582077, + "learning_rate": 5.812460416564551e-05, + "loss": 1.0386, + "step": 7403 + }, + { + "epoch": 0.44984506956680237, + "grad_norm": 0.16720061004161835, + "learning_rate": 5.811515846738177e-05, + "loss": 1.056, + "step": 7404 + }, + { + "epoch": 0.4499058265994289, + "grad_norm": 0.10713080316781998, + "learning_rate": 5.810571247165533e-05, + "loss": 1.0274, + "step": 7405 + }, + { + "epoch": 0.4499665836320554, + "grad_norm": 0.11764936149120331, + "learning_rate": 5.809626617881241e-05, + "loss": 0.9931, + "step": 7406 + }, + { + "epoch": 0.4500273406646819, + "grad_norm": 0.17225749790668488, + "learning_rate": 5.808681958919932e-05, + "loss": 1.1447, + "step": 7407 + }, + { + "epoch": 0.45008809769730845, + "grad_norm": 1.5785449743270874, + "learning_rate": 5.807737270316228e-05, + "loss": 1.0848, + "step": 7408 + }, + { + "epoch": 0.450148854729935, + "grad_norm": 0.11203330010175705, + "learning_rate": 5.806792552104757e-05, + "loss": 1.0138, + "step": 7409 + }, + { + "epoch": 0.4502096117625615, + "grad_norm": 0.12918095290660858, + "learning_rate": 5.8058478043201526e-05, + "loss": 1.0493, + "step": 7410 + }, + { + "epoch": 0.45027036879518806, + "grad_norm": 0.5991353988647461, + "learning_rate": 5.8049030269970374e-05, + "loss": 1.0833, + "step": 7411 + }, + { + "epoch": 0.4503311258278146, + "grad_norm": 0.39992600679397583, + "learning_rate": 5.803958220170047e-05, + "loss": 1.1555, + "step": 7412 + }, + { + "epoch": 0.4503918828604411, + "grad_norm": 0.2323329597711563, + "learning_rate": 5.8030133838738146e-05, + "loss": 1.126, + "step": 7413 + }, + { + "epoch": 0.4504526398930676, + "grad_norm": 0.1528501659631729, + "learning_rate": 5.802068518142969e-05, + "loss": 1.0292, + "step": 7414 + }, + { + "epoch": 0.45051339692569414, + "grad_norm": 1.09116530418396, + "learning_rate": 5.801123623012148e-05, + "loss": 1.056, + "step": 7415 + }, + { + "epoch": 0.4505741539583207, + "grad_norm": 0.1937636435031891, + "learning_rate": 5.8001786985159856e-05, + "loss": 1.0361, + "step": 7416 + }, + { + "epoch": 0.4506349109909472, + "grad_norm": 0.4029299318790436, + "learning_rate": 5.7992337446891186e-05, + "loss": 1.1597, + "step": 7417 + }, + { + "epoch": 0.45069566802357375, + "grad_norm": 0.47917649149894714, + "learning_rate": 5.798288761566184e-05, + "loss": 1.215, + "step": 7418 + }, + { + "epoch": 0.4507564250562003, + "grad_norm": 0.23846226930618286, + "learning_rate": 5.797343749181819e-05, + "loss": 1.1331, + "step": 7419 + }, + { + "epoch": 0.45081718208882676, + "grad_norm": 0.2682127356529236, + "learning_rate": 5.796398707570668e-05, + "loss": 1.0445, + "step": 7420 + }, + { + "epoch": 0.4508779391214533, + "grad_norm": 0.16246290504932404, + "learning_rate": 5.795453636767366e-05, + "loss": 1.0531, + "step": 7421 + }, + { + "epoch": 0.45093869615407983, + "grad_norm": 0.15183283388614655, + "learning_rate": 5.794508536806558e-05, + "loss": 1.0492, + "step": 7422 + }, + { + "epoch": 0.45099945318670637, + "grad_norm": 0.14682994782924652, + "learning_rate": 5.793563407722886e-05, + "loss": 1.0382, + "step": 7423 + }, + { + "epoch": 0.4510602102193329, + "grad_norm": 0.1822127252817154, + "learning_rate": 5.792618249550994e-05, + "loss": 1.1113, + "step": 7424 + }, + { + "epoch": 0.45112096725195944, + "grad_norm": 0.30428874492645264, + "learning_rate": 5.791673062325527e-05, + "loss": 1.1447, + "step": 7425 + }, + { + "epoch": 0.4511817242845859, + "grad_norm": 0.19708821177482605, + "learning_rate": 5.79072784608113e-05, + "loss": 1.1603, + "step": 7426 + }, + { + "epoch": 0.45124248131721245, + "grad_norm": 0.1499456912279129, + "learning_rate": 5.789782600852452e-05, + "loss": 1.0522, + "step": 7427 + }, + { + "epoch": 0.451303238349839, + "grad_norm": 0.12563274800777435, + "learning_rate": 5.7888373266741394e-05, + "loss": 1.0535, + "step": 7428 + }, + { + "epoch": 0.4513639953824655, + "grad_norm": 0.12478446215391159, + "learning_rate": 5.787892023580841e-05, + "loss": 1.0219, + "step": 7429 + }, + { + "epoch": 0.45142475241509206, + "grad_norm": 0.21333153545856476, + "learning_rate": 5.786946691607209e-05, + "loss": 1.0769, + "step": 7430 + }, + { + "epoch": 0.4514855094477186, + "grad_norm": 0.22072742879390717, + "learning_rate": 5.786001330787896e-05, + "loss": 1.083, + "step": 7431 + }, + { + "epoch": 0.4515462664803451, + "grad_norm": 0.16860386729240417, + "learning_rate": 5.78505594115755e-05, + "loss": 1.0954, + "step": 7432 + }, + { + "epoch": 0.4516070235129716, + "grad_norm": 0.16723968088626862, + "learning_rate": 5.784110522750828e-05, + "loss": 1.0567, + "step": 7433 + }, + { + "epoch": 0.45166778054559814, + "grad_norm": 0.11265820264816284, + "learning_rate": 5.783165075602381e-05, + "loss": 1.1, + "step": 7434 + }, + { + "epoch": 0.4517285375782247, + "grad_norm": 0.2543773055076599, + "learning_rate": 5.78221959974687e-05, + "loss": 1.2925, + "step": 7435 + }, + { + "epoch": 0.4517892946108512, + "grad_norm": 0.2166946679353714, + "learning_rate": 5.781274095218946e-05, + "loss": 1.0519, + "step": 7436 + }, + { + "epoch": 0.45185005164347775, + "grad_norm": 0.14822642505168915, + "learning_rate": 5.78032856205327e-05, + "loss": 1.1237, + "step": 7437 + }, + { + "epoch": 0.4519108086761043, + "grad_norm": 0.13509392738342285, + "learning_rate": 5.7793830002845e-05, + "loss": 1.0998, + "step": 7438 + }, + { + "epoch": 0.45197156570873076, + "grad_norm": 0.13430723547935486, + "learning_rate": 5.778437409947295e-05, + "loss": 1.0393, + "step": 7439 + }, + { + "epoch": 0.4520323227413573, + "grad_norm": 0.30174896121025085, + "learning_rate": 5.777491791076315e-05, + "loss": 1.2745, + "step": 7440 + }, + { + "epoch": 0.45209307977398383, + "grad_norm": 0.14422793686389923, + "learning_rate": 5.776546143706225e-05, + "loss": 1.1172, + "step": 7441 + }, + { + "epoch": 0.45215383680661037, + "grad_norm": 2.848076105117798, + "learning_rate": 5.7756004678716847e-05, + "loss": 1.0586, + "step": 7442 + }, + { + "epoch": 0.4522145938392369, + "grad_norm": 0.1597025990486145, + "learning_rate": 5.77465476360736e-05, + "loss": 1.0813, + "step": 7443 + }, + { + "epoch": 0.45227535087186344, + "grad_norm": 0.14917469024658203, + "learning_rate": 5.773709030947916e-05, + "loss": 1.0286, + "step": 7444 + }, + { + "epoch": 0.45233610790448997, + "grad_norm": 0.19608931243419647, + "learning_rate": 5.7727632699280166e-05, + "loss": 1.13, + "step": 7445 + }, + { + "epoch": 0.45239686493711645, + "grad_norm": 0.14525218307971954, + "learning_rate": 5.771817480582331e-05, + "loss": 1.1098, + "step": 7446 + }, + { + "epoch": 0.452457621969743, + "grad_norm": 0.17831704020500183, + "learning_rate": 5.7708716629455264e-05, + "loss": 1.0069, + "step": 7447 + }, + { + "epoch": 0.4525183790023695, + "grad_norm": 0.1452292799949646, + "learning_rate": 5.7699258170522705e-05, + "loss": 1.0734, + "step": 7448 + }, + { + "epoch": 0.45257913603499605, + "grad_norm": 0.11639715731143951, + "learning_rate": 5.768979942937237e-05, + "loss": 1.0521, + "step": 7449 + }, + { + "epoch": 0.4526398930676226, + "grad_norm": 0.2177213877439499, + "learning_rate": 5.768034040635093e-05, + "loss": 1.1612, + "step": 7450 + }, + { + "epoch": 0.4527006501002491, + "grad_norm": 0.17206627130508423, + "learning_rate": 5.767088110180515e-05, + "loss": 1.11, + "step": 7451 + }, + { + "epoch": 0.4527614071328756, + "grad_norm": 0.14323192834854126, + "learning_rate": 5.766142151608174e-05, + "loss": 1.0841, + "step": 7452 + }, + { + "epoch": 0.45282216416550214, + "grad_norm": 0.13353809714317322, + "learning_rate": 5.765196164952742e-05, + "loss": 1.0435, + "step": 7453 + }, + { + "epoch": 0.4528829211981287, + "grad_norm": 0.16537903249263763, + "learning_rate": 5.7642501502488986e-05, + "loss": 1.1244, + "step": 7454 + }, + { + "epoch": 0.4529436782307552, + "grad_norm": 0.12540307641029358, + "learning_rate": 5.7633041075313176e-05, + "loss": 1.0307, + "step": 7455 + }, + { + "epoch": 0.45300443526338174, + "grad_norm": 0.2685820460319519, + "learning_rate": 5.7623580368346765e-05, + "loss": 1.2347, + "step": 7456 + }, + { + "epoch": 0.4530651922960083, + "grad_norm": 0.11319020390510559, + "learning_rate": 5.761411938193656e-05, + "loss": 1.0311, + "step": 7457 + }, + { + "epoch": 0.4531259493286348, + "grad_norm": 0.19712980091571808, + "learning_rate": 5.760465811642932e-05, + "loss": 1.1076, + "step": 7458 + }, + { + "epoch": 0.4531867063612613, + "grad_norm": 1.0889067649841309, + "learning_rate": 5.759519657217187e-05, + "loss": 1.0751, + "step": 7459 + }, + { + "epoch": 0.45324746339388783, + "grad_norm": 0.16383759677410126, + "learning_rate": 5.758573474951102e-05, + "loss": 1.0686, + "step": 7460 + }, + { + "epoch": 0.45330822042651436, + "grad_norm": 10.906392097473145, + "learning_rate": 5.75762726487936e-05, + "loss": 1.1086, + "step": 7461 + }, + { + "epoch": 0.4533689774591409, + "grad_norm": 0.4894454777240753, + "learning_rate": 5.7566810270366445e-05, + "loss": 1.0495, + "step": 7462 + }, + { + "epoch": 0.45342973449176743, + "grad_norm": 0.20121070742607117, + "learning_rate": 5.7557347614576384e-05, + "loss": 1.2094, + "step": 7463 + }, + { + "epoch": 0.45349049152439397, + "grad_norm": 0.19886942207813263, + "learning_rate": 5.75478846817703e-05, + "loss": 1.0844, + "step": 7464 + }, + { + "epoch": 0.45355124855702045, + "grad_norm": 0.19015374779701233, + "learning_rate": 5.753842147229504e-05, + "loss": 1.1203, + "step": 7465 + }, + { + "epoch": 0.453612005589647, + "grad_norm": 0.24247772991657257, + "learning_rate": 5.752895798649748e-05, + "loss": 1.0626, + "step": 7466 + }, + { + "epoch": 0.4536727626222735, + "grad_norm": 0.18095257878303528, + "learning_rate": 5.751949422472452e-05, + "loss": 1.1557, + "step": 7467 + }, + { + "epoch": 0.45373351965490005, + "grad_norm": 0.1507074534893036, + "learning_rate": 5.751003018732303e-05, + "loss": 1.0508, + "step": 7468 + }, + { + "epoch": 0.4537942766875266, + "grad_norm": 0.1635165959596634, + "learning_rate": 5.750056587463995e-05, + "loss": 1.0823, + "step": 7469 + }, + { + "epoch": 0.4538550337201531, + "grad_norm": 0.16673457622528076, + "learning_rate": 5.7491101287022154e-05, + "loss": 1.0986, + "step": 7470 + }, + { + "epoch": 0.45391579075277966, + "grad_norm": 0.17124459147453308, + "learning_rate": 5.748163642481661e-05, + "loss": 1.0737, + "step": 7471 + }, + { + "epoch": 0.45397654778540614, + "grad_norm": 0.24389341473579407, + "learning_rate": 5.7472171288370235e-05, + "loss": 1.1192, + "step": 7472 + }, + { + "epoch": 0.45403730481803267, + "grad_norm": 0.5769345760345459, + "learning_rate": 5.746270587802998e-05, + "loss": 1.2811, + "step": 7473 + }, + { + "epoch": 0.4540980618506592, + "grad_norm": 0.15695974230766296, + "learning_rate": 5.745324019414279e-05, + "loss": 1.0716, + "step": 7474 + }, + { + "epoch": 0.45415881888328574, + "grad_norm": 3.043384552001953, + "learning_rate": 5.7443774237055644e-05, + "loss": 1.0556, + "step": 7475 + }, + { + "epoch": 0.4542195759159123, + "grad_norm": 0.2506196200847626, + "learning_rate": 5.74343080071155e-05, + "loss": 1.1081, + "step": 7476 + }, + { + "epoch": 0.4542803329485388, + "grad_norm": 0.21501842141151428, + "learning_rate": 5.742484150466939e-05, + "loss": 1.1938, + "step": 7477 + }, + { + "epoch": 0.45434108998116535, + "grad_norm": 0.15379853546619415, + "learning_rate": 5.741537473006425e-05, + "loss": 1.1079, + "step": 7478 + }, + { + "epoch": 0.4544018470137918, + "grad_norm": 0.658745527267456, + "learning_rate": 5.7405907683647134e-05, + "loss": 1.1059, + "step": 7479 + }, + { + "epoch": 0.45446260404641836, + "grad_norm": 0.21668709814548492, + "learning_rate": 5.739644036576502e-05, + "loss": 1.0096, + "step": 7480 + }, + { + "epoch": 0.4545233610790449, + "grad_norm": 0.14211909472942352, + "learning_rate": 5.738697277676498e-05, + "loss": 1.0324, + "step": 7481 + }, + { + "epoch": 0.45458411811167143, + "grad_norm": 1.3377307653427124, + "learning_rate": 5.737750491699402e-05, + "loss": 1.2082, + "step": 7482 + }, + { + "epoch": 0.45464487514429797, + "grad_norm": 0.1769695281982422, + "learning_rate": 5.736803678679918e-05, + "loss": 1.053, + "step": 7483 + }, + { + "epoch": 0.4547056321769245, + "grad_norm": 0.15515007078647614, + "learning_rate": 5.735856838652754e-05, + "loss": 1.0571, + "step": 7484 + }, + { + "epoch": 0.454766389209551, + "grad_norm": 0.24353644251823425, + "learning_rate": 5.734909971652614e-05, + "loss": 1.0264, + "step": 7485 + }, + { + "epoch": 0.4548271462421775, + "grad_norm": 0.19325301051139832, + "learning_rate": 5.733963077714206e-05, + "loss": 1.2901, + "step": 7486 + }, + { + "epoch": 0.45488790327480405, + "grad_norm": 0.16973786056041718, + "learning_rate": 5.7330161568722426e-05, + "loss": 1.0374, + "step": 7487 + }, + { + "epoch": 0.4549486603074306, + "grad_norm": 0.25033992528915405, + "learning_rate": 5.732069209161429e-05, + "loss": 1.2324, + "step": 7488 + }, + { + "epoch": 0.4550094173400571, + "grad_norm": 49.92509460449219, + "learning_rate": 5.731122234616476e-05, + "loss": 1.0527, + "step": 7489 + }, + { + "epoch": 0.45507017437268366, + "grad_norm": 0.21506592631340027, + "learning_rate": 5.730175233272097e-05, + "loss": 1.0632, + "step": 7490 + }, + { + "epoch": 0.4551309314053102, + "grad_norm": 3.1655020713806152, + "learning_rate": 5.729228205163003e-05, + "loss": 1.0467, + "step": 7491 + }, + { + "epoch": 0.45519168843793667, + "grad_norm": 0.2095041573047638, + "learning_rate": 5.7282811503239096e-05, + "loss": 1.1275, + "step": 7492 + }, + { + "epoch": 0.4552524454705632, + "grad_norm": 0.3740250766277313, + "learning_rate": 5.727334068789529e-05, + "loss": 1.1432, + "step": 7493 + }, + { + "epoch": 0.45531320250318974, + "grad_norm": 0.12217546254396439, + "learning_rate": 5.7263869605945775e-05, + "loss": 1.0468, + "step": 7494 + }, + { + "epoch": 0.4553739595358163, + "grad_norm": 0.16230589151382446, + "learning_rate": 5.725439825773772e-05, + "loss": 1.0893, + "step": 7495 + }, + { + "epoch": 0.4554347165684428, + "grad_norm": 0.13122041523456573, + "learning_rate": 5.724492664361829e-05, + "loss": 1.052, + "step": 7496 + }, + { + "epoch": 0.45549547360106935, + "grad_norm": 0.23191942274570465, + "learning_rate": 5.7235454763934657e-05, + "loss": 1.0789, + "step": 7497 + }, + { + "epoch": 0.4555562306336958, + "grad_norm": 0.13227961957454681, + "learning_rate": 5.722598261903407e-05, + "loss": 1.0579, + "step": 7498 + }, + { + "epoch": 0.45561698766632236, + "grad_norm": 0.1584184467792511, + "learning_rate": 5.721651020926366e-05, + "loss": 1.0694, + "step": 7499 + }, + { + "epoch": 0.4556777446989489, + "grad_norm": 4.345352649688721, + "learning_rate": 5.7207037534970686e-05, + "loss": 1.1363, + "step": 7500 + }, + { + "epoch": 0.45573850173157543, + "grad_norm": 0.3370049297809601, + "learning_rate": 5.719756459650237e-05, + "loss": 1.0893, + "step": 7501 + }, + { + "epoch": 0.45579925876420196, + "grad_norm": 0.1856507509946823, + "learning_rate": 5.718809139420591e-05, + "loss": 1.2368, + "step": 7502 + }, + { + "epoch": 0.4558600157968285, + "grad_norm": 0.4172630310058594, + "learning_rate": 5.717861792842858e-05, + "loss": 1.3132, + "step": 7503 + }, + { + "epoch": 0.45592077282945503, + "grad_norm": 0.22857438027858734, + "learning_rate": 5.716914419951762e-05, + "loss": 1.0564, + "step": 7504 + }, + { + "epoch": 0.4559815298620815, + "grad_norm": 0.15569455921649933, + "learning_rate": 5.7159670207820305e-05, + "loss": 1.0379, + "step": 7505 + }, + { + "epoch": 0.45604228689470805, + "grad_norm": 0.134469136595726, + "learning_rate": 5.715019595368388e-05, + "loss": 1.0775, + "step": 7506 + }, + { + "epoch": 0.4561030439273346, + "grad_norm": 0.11792246997356415, + "learning_rate": 5.714072143745564e-05, + "loss": 1.0335, + "step": 7507 + }, + { + "epoch": 0.4561638009599611, + "grad_norm": 0.18644648790359497, + "learning_rate": 5.71312466594829e-05, + "loss": 1.1581, + "step": 7508 + }, + { + "epoch": 0.45622455799258765, + "grad_norm": 0.11870857328176498, + "learning_rate": 5.712177162011291e-05, + "loss": 1.0161, + "step": 7509 + }, + { + "epoch": 0.4562853150252142, + "grad_norm": 0.13469670712947845, + "learning_rate": 5.7112296319693004e-05, + "loss": 1.0613, + "step": 7510 + }, + { + "epoch": 0.45634607205784067, + "grad_norm": 0.3815622329711914, + "learning_rate": 5.710282075857052e-05, + "loss": 1.1575, + "step": 7511 + }, + { + "epoch": 0.4564068290904672, + "grad_norm": 0.22992122173309326, + "learning_rate": 5.709334493709274e-05, + "loss": 1.0239, + "step": 7512 + }, + { + "epoch": 0.45646758612309374, + "grad_norm": 0.18595284223556519, + "learning_rate": 5.7083868855607045e-05, + "loss": 1.0607, + "step": 7513 + }, + { + "epoch": 0.4565283431557203, + "grad_norm": 0.1342955082654953, + "learning_rate": 5.707439251446077e-05, + "loss": 1.077, + "step": 7514 + }, + { + "epoch": 0.4565891001883468, + "grad_norm": 0.28029853105545044, + "learning_rate": 5.706491591400128e-05, + "loss": 1.0669, + "step": 7515 + }, + { + "epoch": 0.45664985722097334, + "grad_norm": 0.2969526946544647, + "learning_rate": 5.705543905457592e-05, + "loss": 1.1455, + "step": 7516 + }, + { + "epoch": 0.4567106142535999, + "grad_norm": 0.32625094056129456, + "learning_rate": 5.704596193653208e-05, + "loss": 1.1397, + "step": 7517 + }, + { + "epoch": 0.45677137128622636, + "grad_norm": 0.13720357418060303, + "learning_rate": 5.703648456021714e-05, + "loss": 1.0565, + "step": 7518 + }, + { + "epoch": 0.4568321283188529, + "grad_norm": 0.21891836822032928, + "learning_rate": 5.702700692597851e-05, + "loss": 1.1714, + "step": 7519 + }, + { + "epoch": 0.4568928853514794, + "grad_norm": 0.3338230550289154, + "learning_rate": 5.701752903416356e-05, + "loss": 1.1148, + "step": 7520 + }, + { + "epoch": 0.45695364238410596, + "grad_norm": 0.2304871827363968, + "learning_rate": 5.7008050885119754e-05, + "loss": 1.0731, + "step": 7521 + }, + { + "epoch": 0.4570143994167325, + "grad_norm": 0.2524801194667816, + "learning_rate": 5.699857247919447e-05, + "loss": 1.1334, + "step": 7522 + }, + { + "epoch": 0.45707515644935903, + "grad_norm": 0.25523653626441956, + "learning_rate": 5.698909381673516e-05, + "loss": 1.0527, + "step": 7523 + }, + { + "epoch": 0.45713591348198557, + "grad_norm": 0.24774916470050812, + "learning_rate": 5.697961489808927e-05, + "loss": 1.1022, + "step": 7524 + }, + { + "epoch": 0.45719667051461205, + "grad_norm": 0.4545222222805023, + "learning_rate": 5.697013572360424e-05, + "loss": 1.0904, + "step": 7525 + }, + { + "epoch": 0.4572574275472386, + "grad_norm": 0.2186569720506668, + "learning_rate": 5.696065629362755e-05, + "loss": 1.2372, + "step": 7526 + }, + { + "epoch": 0.4573181845798651, + "grad_norm": 0.44130274653434753, + "learning_rate": 5.695117660850665e-05, + "loss": 1.0495, + "step": 7527 + }, + { + "epoch": 0.45737894161249165, + "grad_norm": 0.1566094607114792, + "learning_rate": 5.694169666858903e-05, + "loss": 1.0535, + "step": 7528 + }, + { + "epoch": 0.4574396986451182, + "grad_norm": 0.25207868218421936, + "learning_rate": 5.693221647422218e-05, + "loss": 1.0542, + "step": 7529 + }, + { + "epoch": 0.4575004556777447, + "grad_norm": 0.2549058198928833, + "learning_rate": 5.692273602575358e-05, + "loss": 1.0901, + "step": 7530 + }, + { + "epoch": 0.4575612127103712, + "grad_norm": 0.2928144037723541, + "learning_rate": 5.691325532353078e-05, + "loss": 1.0308, + "step": 7531 + }, + { + "epoch": 0.45762196974299774, + "grad_norm": 1.2693920135498047, + "learning_rate": 5.690377436790125e-05, + "loss": 1.1426, + "step": 7532 + }, + { + "epoch": 0.45768272677562427, + "grad_norm": 2.5480690002441406, + "learning_rate": 5.689429315921253e-05, + "loss": 1.0497, + "step": 7533 + }, + { + "epoch": 0.4577434838082508, + "grad_norm": 0.18954262137413025, + "learning_rate": 5.688481169781218e-05, + "loss": 1.1384, + "step": 7534 + }, + { + "epoch": 0.45780424084087734, + "grad_norm": 0.23400700092315674, + "learning_rate": 5.687532998404771e-05, + "loss": 1.1508, + "step": 7535 + }, + { + "epoch": 0.4578649978735039, + "grad_norm": 0.15430310368537903, + "learning_rate": 5.68658480182667e-05, + "loss": 1.0806, + "step": 7536 + }, + { + "epoch": 0.4579257549061304, + "grad_norm": 0.24934610724449158, + "learning_rate": 5.68563658008167e-05, + "loss": 1.0373, + "step": 7537 + }, + { + "epoch": 0.4579865119387569, + "grad_norm": 0.3733794391155243, + "learning_rate": 5.6846883332045284e-05, + "loss": 1.1144, + "step": 7538 + }, + { + "epoch": 0.4580472689713834, + "grad_norm": 0.15330719947814941, + "learning_rate": 5.683740061230005e-05, + "loss": 1.0456, + "step": 7539 + }, + { + "epoch": 0.45810802600400996, + "grad_norm": 0.2150934636592865, + "learning_rate": 5.6827917641928566e-05, + "loss": 1.1329, + "step": 7540 + }, + { + "epoch": 0.4581687830366365, + "grad_norm": 0.2593918740749359, + "learning_rate": 5.681843442127844e-05, + "loss": 1.165, + "step": 7541 + }, + { + "epoch": 0.45822954006926303, + "grad_norm": 0.3031356930732727, + "learning_rate": 5.680895095069728e-05, + "loss": 1.0436, + "step": 7542 + }, + { + "epoch": 0.45829029710188957, + "grad_norm": 0.2594084143638611, + "learning_rate": 5.67994672305327e-05, + "loss": 1.2377, + "step": 7543 + }, + { + "epoch": 0.45835105413451604, + "grad_norm": 0.2219531387090683, + "learning_rate": 5.678998326113236e-05, + "loss": 1.0988, + "step": 7544 + }, + { + "epoch": 0.4584118111671426, + "grad_norm": 0.18685947358608246, + "learning_rate": 5.678049904284385e-05, + "loss": 1.0624, + "step": 7545 + }, + { + "epoch": 0.4584725681997691, + "grad_norm": 0.21842002868652344, + "learning_rate": 5.6771014576014836e-05, + "loss": 1.0707, + "step": 7546 + }, + { + "epoch": 0.45853332523239565, + "grad_norm": 0.3374932110309601, + "learning_rate": 5.676152986099298e-05, + "loss": 1.2428, + "step": 7547 + }, + { + "epoch": 0.4585940822650222, + "grad_norm": 0.1564943939447403, + "learning_rate": 5.675204489812594e-05, + "loss": 1.0611, + "step": 7548 + }, + { + "epoch": 0.4586548392976487, + "grad_norm": 0.2009422332048416, + "learning_rate": 5.674255968776139e-05, + "loss": 1.1097, + "step": 7549 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 0.29928141832351685, + "learning_rate": 5.673307423024701e-05, + "loss": 1.0432, + "step": 7550 + }, + { + "epoch": 0.45877635336290173, + "grad_norm": 0.17595785856246948, + "learning_rate": 5.6723588525930496e-05, + "loss": 1.1462, + "step": 7551 + }, + { + "epoch": 0.45883711039552827, + "grad_norm": 0.34310516715049744, + "learning_rate": 5.671410257515955e-05, + "loss": 1.2845, + "step": 7552 + }, + { + "epoch": 0.4588978674281548, + "grad_norm": 0.13096091151237488, + "learning_rate": 5.670461637828186e-05, + "loss": 1.0821, + "step": 7553 + }, + { + "epoch": 0.45895862446078134, + "grad_norm": 0.17904110252857208, + "learning_rate": 5.669512993564516e-05, + "loss": 1.1369, + "step": 7554 + }, + { + "epoch": 0.4590193814934079, + "grad_norm": 0.20433752238750458, + "learning_rate": 5.66856432475972e-05, + "loss": 1.1922, + "step": 7555 + }, + { + "epoch": 0.4590801385260344, + "grad_norm": 0.17755264043807983, + "learning_rate": 5.6676156314485665e-05, + "loss": 1.0175, + "step": 7556 + }, + { + "epoch": 0.4591408955586609, + "grad_norm": 0.1238652840256691, + "learning_rate": 5.6666669136658343e-05, + "loss": 1.0429, + "step": 7557 + }, + { + "epoch": 0.4592016525912874, + "grad_norm": 0.1593741774559021, + "learning_rate": 5.665718171446299e-05, + "loss": 1.079, + "step": 7558 + }, + { + "epoch": 0.45926240962391396, + "grad_norm": 0.14473868906497955, + "learning_rate": 5.6647694048247335e-05, + "loss": 1.0854, + "step": 7559 + }, + { + "epoch": 0.4593231666565405, + "grad_norm": 0.2751701772212982, + "learning_rate": 5.6638206138359185e-05, + "loss": 1.2336, + "step": 7560 + }, + { + "epoch": 0.45938392368916703, + "grad_norm": 0.15657395124435425, + "learning_rate": 5.662871798514629e-05, + "loss": 1.0734, + "step": 7561 + }, + { + "epoch": 0.45944468072179356, + "grad_norm": 0.39454635977745056, + "learning_rate": 5.661922958895647e-05, + "loss": 1.1759, + "step": 7562 + }, + { + "epoch": 0.4595054377544201, + "grad_norm": 0.2491476684808731, + "learning_rate": 5.66097409501375e-05, + "loss": 1.046, + "step": 7563 + }, + { + "epoch": 0.4595661947870466, + "grad_norm": 0.24620307981967926, + "learning_rate": 5.6600252069037206e-05, + "loss": 1.0304, + "step": 7564 + }, + { + "epoch": 0.4596269518196731, + "grad_norm": 0.16522061824798584, + "learning_rate": 5.65907629460034e-05, + "loss": 1.0829, + "step": 7565 + }, + { + "epoch": 0.45968770885229965, + "grad_norm": 0.23070620000362396, + "learning_rate": 5.65812735813839e-05, + "loss": 1.0888, + "step": 7566 + }, + { + "epoch": 0.4597484658849262, + "grad_norm": 0.13351227343082428, + "learning_rate": 5.6571783975526523e-05, + "loss": 1.1077, + "step": 7567 + }, + { + "epoch": 0.4598092229175527, + "grad_norm": 0.5957766771316528, + "learning_rate": 5.656229412877917e-05, + "loss": 1.0399, + "step": 7568 + }, + { + "epoch": 0.45986997995017925, + "grad_norm": 0.19671595096588135, + "learning_rate": 5.655280404148963e-05, + "loss": 1.0529, + "step": 7569 + }, + { + "epoch": 0.45993073698280573, + "grad_norm": 0.19638168811798096, + "learning_rate": 5.65433137140058e-05, + "loss": 1.1049, + "step": 7570 + }, + { + "epoch": 0.45999149401543227, + "grad_norm": 0.1827968806028366, + "learning_rate": 5.6533823146675546e-05, + "loss": 1.0553, + "step": 7571 + }, + { + "epoch": 0.4600522510480588, + "grad_norm": 0.6725801825523376, + "learning_rate": 5.652433233984673e-05, + "loss": 1.0822, + "step": 7572 + }, + { + "epoch": 0.46011300808068534, + "grad_norm": 0.1218404695391655, + "learning_rate": 5.651484129386726e-05, + "loss": 1.0449, + "step": 7573 + }, + { + "epoch": 0.4601737651133119, + "grad_norm": 0.2364734709262848, + "learning_rate": 5.6505350009085024e-05, + "loss": 1.0068, + "step": 7574 + }, + { + "epoch": 0.4602345221459384, + "grad_norm": 1.5241683721542358, + "learning_rate": 5.649585848584793e-05, + "loss": 1.1521, + "step": 7575 + }, + { + "epoch": 0.46029527917856494, + "grad_norm": 0.14631372690200806, + "learning_rate": 5.648636672450388e-05, + "loss": 1.0765, + "step": 7576 + }, + { + "epoch": 0.4603560362111914, + "grad_norm": 0.3273763060569763, + "learning_rate": 5.6476874725400795e-05, + "loss": 1.1211, + "step": 7577 + }, + { + "epoch": 0.46041679324381796, + "grad_norm": 0.15713606774806976, + "learning_rate": 5.646738248888663e-05, + "loss": 1.0777, + "step": 7578 + }, + { + "epoch": 0.4604775502764445, + "grad_norm": 0.29719293117523193, + "learning_rate": 5.64578900153093e-05, + "loss": 1.2381, + "step": 7579 + }, + { + "epoch": 0.460538307309071, + "grad_norm": 0.4006941318511963, + "learning_rate": 5.6448397305016766e-05, + "loss": 1.0431, + "step": 7580 + }, + { + "epoch": 0.46059906434169756, + "grad_norm": 0.3481408953666687, + "learning_rate": 5.6438904358357004e-05, + "loss": 1.0488, + "step": 7581 + }, + { + "epoch": 0.4606598213743241, + "grad_norm": 0.18954573571681976, + "learning_rate": 5.642941117567793e-05, + "loss": 1.0892, + "step": 7582 + }, + { + "epoch": 0.46072057840695063, + "grad_norm": 0.32978764176368713, + "learning_rate": 5.6419917757327555e-05, + "loss": 1.078, + "step": 7583 + }, + { + "epoch": 0.4607813354395771, + "grad_norm": 0.17325016856193542, + "learning_rate": 5.6410424103653844e-05, + "loss": 1.1239, + "step": 7584 + }, + { + "epoch": 0.46084209247220365, + "grad_norm": 0.19177311658859253, + "learning_rate": 5.640093021500481e-05, + "loss": 1.0542, + "step": 7585 + }, + { + "epoch": 0.4609028495048302, + "grad_norm": 0.18840324878692627, + "learning_rate": 5.6391436091728446e-05, + "loss": 1.1353, + "step": 7586 + }, + { + "epoch": 0.4609636065374567, + "grad_norm": 0.152621328830719, + "learning_rate": 5.638194173417274e-05, + "loss": 1.0428, + "step": 7587 + }, + { + "epoch": 0.46102436357008325, + "grad_norm": 0.17865662276744843, + "learning_rate": 5.637244714268575e-05, + "loss": 1.1341, + "step": 7588 + }, + { + "epoch": 0.4610851206027098, + "grad_norm": 1.0702911615371704, + "learning_rate": 5.636295231761547e-05, + "loss": 1.0964, + "step": 7589 + }, + { + "epoch": 0.46114587763533627, + "grad_norm": 0.18569421768188477, + "learning_rate": 5.6353457259309925e-05, + "loss": 1.0688, + "step": 7590 + }, + { + "epoch": 0.4612066346679628, + "grad_norm": 0.15140080451965332, + "learning_rate": 5.6343961968117196e-05, + "loss": 1.062, + "step": 7591 + }, + { + "epoch": 0.46126739170058934, + "grad_norm": 0.516845703125, + "learning_rate": 5.63344664443853e-05, + "loss": 1.0543, + "step": 7592 + }, + { + "epoch": 0.46132814873321587, + "grad_norm": 0.19516772031784058, + "learning_rate": 5.632497068846232e-05, + "loss": 1.0706, + "step": 7593 + }, + { + "epoch": 0.4613889057658424, + "grad_norm": 0.1170029565691948, + "learning_rate": 5.631547470069631e-05, + "loss": 1.0205, + "step": 7594 + }, + { + "epoch": 0.46144966279846894, + "grad_norm": 0.4323507249355316, + "learning_rate": 5.630597848143537e-05, + "loss": 1.0735, + "step": 7595 + }, + { + "epoch": 0.4615104198310955, + "grad_norm": 0.32684460282325745, + "learning_rate": 5.6296482031027565e-05, + "loss": 1.157, + "step": 7596 + }, + { + "epoch": 0.46157117686372195, + "grad_norm": 0.29347795248031616, + "learning_rate": 5.6286985349820986e-05, + "loss": 1.1309, + "step": 7597 + }, + { + "epoch": 0.4616319338963485, + "grad_norm": 0.18857432901859283, + "learning_rate": 5.627748843816375e-05, + "loss": 1.1816, + "step": 7598 + }, + { + "epoch": 0.461692690928975, + "grad_norm": 7.915004253387451, + "learning_rate": 5.626799129640395e-05, + "loss": 1.1834, + "step": 7599 + }, + { + "epoch": 0.46175344796160156, + "grad_norm": 0.2563534677028656, + "learning_rate": 5.625849392488972e-05, + "loss": 1.2312, + "step": 7600 + }, + { + "epoch": 0.4618142049942281, + "grad_norm": 0.18239572644233704, + "learning_rate": 5.62489963239692e-05, + "loss": 1.1721, + "step": 7601 + }, + { + "epoch": 0.46187496202685463, + "grad_norm": 0.14203126728534698, + "learning_rate": 5.623949849399052e-05, + "loss": 1.0179, + "step": 7602 + }, + { + "epoch": 0.4619357190594811, + "grad_norm": 0.12551337480545044, + "learning_rate": 5.623000043530179e-05, + "loss": 1.0602, + "step": 7603 + }, + { + "epoch": 0.46199647609210764, + "grad_norm": 0.1708313375711441, + "learning_rate": 5.6220502148251197e-05, + "loss": 1.1203, + "step": 7604 + }, + { + "epoch": 0.4620572331247342, + "grad_norm": 0.12785588204860687, + "learning_rate": 5.62110036331869e-05, + "loss": 1.0581, + "step": 7605 + }, + { + "epoch": 0.4621179901573607, + "grad_norm": 0.25426387786865234, + "learning_rate": 5.620150489045706e-05, + "loss": 1.1265, + "step": 7606 + }, + { + "epoch": 0.46217874718998725, + "grad_norm": 0.1477641463279724, + "learning_rate": 5.619200592040986e-05, + "loss": 1.0383, + "step": 7607 + }, + { + "epoch": 0.4622395042226138, + "grad_norm": 0.17011034488677979, + "learning_rate": 5.618250672339349e-05, + "loss": 1.0614, + "step": 7608 + }, + { + "epoch": 0.4623002612552403, + "grad_norm": 0.5043832063674927, + "learning_rate": 5.617300729975614e-05, + "loss": 1.1171, + "step": 7609 + }, + { + "epoch": 0.4623610182878668, + "grad_norm": 0.16781552135944366, + "learning_rate": 5.6163507649846026e-05, + "loss": 1.0651, + "step": 7610 + }, + { + "epoch": 0.46242177532049333, + "grad_norm": 0.1682317703962326, + "learning_rate": 5.615400777401133e-05, + "loss": 1.1196, + "step": 7611 + }, + { + "epoch": 0.46248253235311987, + "grad_norm": 0.1677902340888977, + "learning_rate": 5.61445076726003e-05, + "loss": 1.1295, + "step": 7612 + }, + { + "epoch": 0.4625432893857464, + "grad_norm": 3.1851911544799805, + "learning_rate": 5.613500734596113e-05, + "loss": 1.0841, + "step": 7613 + }, + { + "epoch": 0.46260404641837294, + "grad_norm": 0.19222573935985565, + "learning_rate": 5.612550679444211e-05, + "loss": 1.0268, + "step": 7614 + }, + { + "epoch": 0.4626648034509995, + "grad_norm": 0.30901357531547546, + "learning_rate": 5.611600601839144e-05, + "loss": 1.2294, + "step": 7615 + }, + { + "epoch": 0.46272556048362595, + "grad_norm": 0.1423233449459076, + "learning_rate": 5.610650501815739e-05, + "loss": 1.0767, + "step": 7616 + }, + { + "epoch": 0.4627863175162525, + "grad_norm": 0.1298656016588211, + "learning_rate": 5.609700379408822e-05, + "loss": 1.075, + "step": 7617 + }, + { + "epoch": 0.462847074548879, + "grad_norm": 0.27051857113838196, + "learning_rate": 5.60875023465322e-05, + "loss": 1.2075, + "step": 7618 + }, + { + "epoch": 0.46290783158150556, + "grad_norm": 0.12271862477064133, + "learning_rate": 5.6078000675837594e-05, + "loss": 1.0499, + "step": 7619 + }, + { + "epoch": 0.4629685886141321, + "grad_norm": 0.1604657918214798, + "learning_rate": 5.6068498782352707e-05, + "loss": 1.0606, + "step": 7620 + }, + { + "epoch": 0.4630293456467586, + "grad_norm": 0.14413392543792725, + "learning_rate": 5.605899666642582e-05, + "loss": 1.0435, + "step": 7621 + }, + { + "epoch": 0.46309010267938516, + "grad_norm": 0.2147381454706192, + "learning_rate": 5.604949432840525e-05, + "loss": 1.0417, + "step": 7622 + }, + { + "epoch": 0.46315085971201164, + "grad_norm": 0.15062421560287476, + "learning_rate": 5.6039991768639276e-05, + "loss": 1.1245, + "step": 7623 + }, + { + "epoch": 0.4632116167446382, + "grad_norm": 0.15868224203586578, + "learning_rate": 5.603048898747625e-05, + "loss": 1.1355, + "step": 7624 + }, + { + "epoch": 0.4632723737772647, + "grad_norm": 0.19187390804290771, + "learning_rate": 5.6020985985264486e-05, + "loss": 1.1011, + "step": 7625 + }, + { + "epoch": 0.46333313080989125, + "grad_norm": 0.20561785995960236, + "learning_rate": 5.601148276235231e-05, + "loss": 1.1224, + "step": 7626 + }, + { + "epoch": 0.4633938878425178, + "grad_norm": 0.17380262911319733, + "learning_rate": 5.600197931908807e-05, + "loss": 1.147, + "step": 7627 + }, + { + "epoch": 0.4634546448751443, + "grad_norm": 0.1822991967201233, + "learning_rate": 5.599247565582012e-05, + "loss": 1.1022, + "step": 7628 + }, + { + "epoch": 0.46351540190777085, + "grad_norm": 0.45203229784965515, + "learning_rate": 5.598297177289682e-05, + "loss": 1.1577, + "step": 7629 + }, + { + "epoch": 0.46357615894039733, + "grad_norm": 0.14951534569263458, + "learning_rate": 5.597346767066654e-05, + "loss": 1.0283, + "step": 7630 + }, + { + "epoch": 0.46363691597302387, + "grad_norm": 0.3066413104534149, + "learning_rate": 5.5963963349477636e-05, + "loss": 1.0693, + "step": 7631 + }, + { + "epoch": 0.4636976730056504, + "grad_norm": 0.8103142976760864, + "learning_rate": 5.59544588096785e-05, + "loss": 1.073, + "step": 7632 + }, + { + "epoch": 0.46375843003827694, + "grad_norm": 0.34123125672340393, + "learning_rate": 5.594495405161754e-05, + "loss": 1.1859, + "step": 7633 + }, + { + "epoch": 0.46381918707090347, + "grad_norm": 0.1669309288263321, + "learning_rate": 5.593544907564312e-05, + "loss": 1.0837, + "step": 7634 + }, + { + "epoch": 0.46387994410353, + "grad_norm": 0.22812524437904358, + "learning_rate": 5.592594388210369e-05, + "loss": 1.0743, + "step": 7635 + }, + { + "epoch": 0.4639407011361565, + "grad_norm": 1.2388594150543213, + "learning_rate": 5.591643847134762e-05, + "loss": 1.0607, + "step": 7636 + }, + { + "epoch": 0.464001458168783, + "grad_norm": 1.0885109901428223, + "learning_rate": 5.590693284372336e-05, + "loss": 1.0916, + "step": 7637 + }, + { + "epoch": 0.46406221520140956, + "grad_norm": 0.13822364807128906, + "learning_rate": 5.589742699957935e-05, + "loss": 1.0739, + "step": 7638 + }, + { + "epoch": 0.4641229722340361, + "grad_norm": 0.1727365404367447, + "learning_rate": 5.588792093926399e-05, + "loss": 1.0989, + "step": 7639 + }, + { + "epoch": 0.4641837292666626, + "grad_norm": 0.2509419322013855, + "learning_rate": 5.5878414663125765e-05, + "loss": 1.124, + "step": 7640 + }, + { + "epoch": 0.46424448629928916, + "grad_norm": 0.2248162031173706, + "learning_rate": 5.586890817151311e-05, + "loss": 1.0429, + "step": 7641 + }, + { + "epoch": 0.4643052433319157, + "grad_norm": 0.18183498084545135, + "learning_rate": 5.58594014647745e-05, + "loss": 1.0876, + "step": 7642 + }, + { + "epoch": 0.4643660003645422, + "grad_norm": 0.19940079748630524, + "learning_rate": 5.5849894543258396e-05, + "loss": 1.1727, + "step": 7643 + }, + { + "epoch": 0.4644267573971687, + "grad_norm": 0.12469612061977386, + "learning_rate": 5.5840387407313254e-05, + "loss": 1.0311, + "step": 7644 + }, + { + "epoch": 0.46448751442979525, + "grad_norm": 0.17444831132888794, + "learning_rate": 5.5830880057287616e-05, + "loss": 1.084, + "step": 7645 + }, + { + "epoch": 0.4645482714624218, + "grad_norm": 2.5646579265594482, + "learning_rate": 5.5821372493529924e-05, + "loss": 1.0094, + "step": 7646 + }, + { + "epoch": 0.4646090284950483, + "grad_norm": 0.12972000241279602, + "learning_rate": 5.581186471638868e-05, + "loss": 1.0733, + "step": 7647 + }, + { + "epoch": 0.46466978552767485, + "grad_norm": 0.4693816900253296, + "learning_rate": 5.580235672621246e-05, + "loss": 1.133, + "step": 7648 + }, + { + "epoch": 0.46473054256030133, + "grad_norm": 0.18356047570705414, + "learning_rate": 5.5792848523349684e-05, + "loss": 1.181, + "step": 7649 + }, + { + "epoch": 0.46479129959292786, + "grad_norm": 0.24481543898582458, + "learning_rate": 5.578334010814895e-05, + "loss": 1.0815, + "step": 7650 + }, + { + "epoch": 0.4648520566255544, + "grad_norm": 0.3973977565765381, + "learning_rate": 5.577383148095876e-05, + "loss": 1.072, + "step": 7651 + }, + { + "epoch": 0.46491281365818093, + "grad_norm": 0.29410886764526367, + "learning_rate": 5.5764322642127665e-05, + "loss": 1.1073, + "step": 7652 + }, + { + "epoch": 0.46497357069080747, + "grad_norm": 0.27756214141845703, + "learning_rate": 5.575481359200421e-05, + "loss": 1.1719, + "step": 7653 + }, + { + "epoch": 0.465034327723434, + "grad_norm": 0.24933436512947083, + "learning_rate": 5.5745304330936945e-05, + "loss": 1.3256, + "step": 7654 + }, + { + "epoch": 0.46509508475606054, + "grad_norm": 0.12517014145851135, + "learning_rate": 5.5735794859274436e-05, + "loss": 1.015, + "step": 7655 + }, + { + "epoch": 0.465155841788687, + "grad_norm": 0.16356950998306274, + "learning_rate": 5.5726285177365265e-05, + "loss": 1.1986, + "step": 7656 + }, + { + "epoch": 0.46521659882131355, + "grad_norm": 0.1978103518486023, + "learning_rate": 5.5716775285558e-05, + "loss": 1.0759, + "step": 7657 + }, + { + "epoch": 0.4652773558539401, + "grad_norm": 0.17229701578617096, + "learning_rate": 5.570726518420124e-05, + "loss": 1.1028, + "step": 7658 + }, + { + "epoch": 0.4653381128865666, + "grad_norm": 0.23430059850215912, + "learning_rate": 5.569775487364357e-05, + "loss": 1.1268, + "step": 7659 + }, + { + "epoch": 0.46539886991919316, + "grad_norm": 0.14466476440429688, + "learning_rate": 5.5688244354233577e-05, + "loss": 1.0849, + "step": 7660 + }, + { + "epoch": 0.4654596269518197, + "grad_norm": 0.165469229221344, + "learning_rate": 5.56787336263199e-05, + "loss": 1.0958, + "step": 7661 + }, + { + "epoch": 0.4655203839844462, + "grad_norm": 0.13033407926559448, + "learning_rate": 5.5669222690251146e-05, + "loss": 1.0625, + "step": 7662 + }, + { + "epoch": 0.4655811410170727, + "grad_norm": 0.13559192419052124, + "learning_rate": 5.5659711546375935e-05, + "loss": 1.0696, + "step": 7663 + }, + { + "epoch": 0.46564189804969924, + "grad_norm": 0.23872654139995575, + "learning_rate": 5.5650200195042914e-05, + "loss": 1.1407, + "step": 7664 + }, + { + "epoch": 0.4657026550823258, + "grad_norm": 0.2885741591453552, + "learning_rate": 5.56406886366007e-05, + "loss": 1.1699, + "step": 7665 + }, + { + "epoch": 0.4657634121149523, + "grad_norm": 0.2483404576778412, + "learning_rate": 5.563117687139796e-05, + "loss": 1.1875, + "step": 7666 + }, + { + "epoch": 0.46582416914757885, + "grad_norm": 0.17019130289554596, + "learning_rate": 5.5621664899783345e-05, + "loss": 1.1328, + "step": 7667 + }, + { + "epoch": 0.4658849261802054, + "grad_norm": 0.19060491025447845, + "learning_rate": 5.561215272210551e-05, + "loss": 1.1019, + "step": 7668 + }, + { + "epoch": 0.46594568321283186, + "grad_norm": 0.30430859327316284, + "learning_rate": 5.560264033871313e-05, + "loss": 1.1902, + "step": 7669 + }, + { + "epoch": 0.4660064402454584, + "grad_norm": 0.20313583314418793, + "learning_rate": 5.559312774995489e-05, + "loss": 1.1339, + "step": 7670 + }, + { + "epoch": 0.46606719727808493, + "grad_norm": 0.12355385720729828, + "learning_rate": 5.5583614956179464e-05, + "loss": 1.0667, + "step": 7671 + }, + { + "epoch": 0.46612795431071147, + "grad_norm": 0.13360348343849182, + "learning_rate": 5.5574101957735556e-05, + "loss": 1.0758, + "step": 7672 + }, + { + "epoch": 0.466188711343338, + "grad_norm": 0.16960491240024567, + "learning_rate": 5.556458875497187e-05, + "loss": 1.0487, + "step": 7673 + }, + { + "epoch": 0.46624946837596454, + "grad_norm": 0.14360320568084717, + "learning_rate": 5.555507534823711e-05, + "loss": 1.0692, + "step": 7674 + }, + { + "epoch": 0.466310225408591, + "grad_norm": 0.11860251426696777, + "learning_rate": 5.554556173787998e-05, + "loss": 1.0709, + "step": 7675 + }, + { + "epoch": 0.46637098244121755, + "grad_norm": 0.3663264513015747, + "learning_rate": 5.553604792424922e-05, + "loss": 1.072, + "step": 7676 + }, + { + "epoch": 0.4664317394738441, + "grad_norm": 0.39243003726005554, + "learning_rate": 5.552653390769356e-05, + "loss": 1.0582, + "step": 7677 + }, + { + "epoch": 0.4664924965064706, + "grad_norm": 0.15361706912517548, + "learning_rate": 5.551701968856171e-05, + "loss": 1.0509, + "step": 7678 + }, + { + "epoch": 0.46655325353909716, + "grad_norm": 0.21406547725200653, + "learning_rate": 5.5507505267202456e-05, + "loss": 1.1269, + "step": 7679 + }, + { + "epoch": 0.4666140105717237, + "grad_norm": 0.23460249602794647, + "learning_rate": 5.5497990643964505e-05, + "loss": 1.1227, + "step": 7680 + }, + { + "epoch": 0.4666747676043502, + "grad_norm": 0.14040905237197876, + "learning_rate": 5.5488475819196674e-05, + "loss": 1.1328, + "step": 7681 + }, + { + "epoch": 0.4667355246369767, + "grad_norm": 0.21273450553417206, + "learning_rate": 5.547896079324769e-05, + "loss": 1.0463, + "step": 7682 + }, + { + "epoch": 0.46679628166960324, + "grad_norm": 0.12193840742111206, + "learning_rate": 5.546944556646634e-05, + "loss": 1.0542, + "step": 7683 + }, + { + "epoch": 0.4668570387022298, + "grad_norm": 0.34236642718315125, + "learning_rate": 5.54599301392014e-05, + "loss": 1.0547, + "step": 7684 + }, + { + "epoch": 0.4669177957348563, + "grad_norm": 0.16691485047340393, + "learning_rate": 5.545041451180167e-05, + "loss": 1.127, + "step": 7685 + }, + { + "epoch": 0.46697855276748285, + "grad_norm": 0.19637003540992737, + "learning_rate": 5.544089868461594e-05, + "loss": 1.075, + "step": 7686 + }, + { + "epoch": 0.4670393098001094, + "grad_norm": 0.20217357575893402, + "learning_rate": 5.5431382657993025e-05, + "loss": 1.1611, + "step": 7687 + }, + { + "epoch": 0.4671000668327359, + "grad_norm": 0.14192472398281097, + "learning_rate": 5.542186643228172e-05, + "loss": 1.0641, + "step": 7688 + }, + { + "epoch": 0.4671608238653624, + "grad_norm": 0.2243974655866623, + "learning_rate": 5.541235000783086e-05, + "loss": 1.0851, + "step": 7689 + }, + { + "epoch": 0.46722158089798893, + "grad_norm": 0.15024195611476898, + "learning_rate": 5.540283338498926e-05, + "loss": 1.0835, + "step": 7690 + }, + { + "epoch": 0.46728233793061547, + "grad_norm": 0.14921025931835175, + "learning_rate": 5.539331656410575e-05, + "loss": 1.0471, + "step": 7691 + }, + { + "epoch": 0.467343094963242, + "grad_norm": 0.20188233256340027, + "learning_rate": 5.5383799545529205e-05, + "loss": 1.0741, + "step": 7692 + }, + { + "epoch": 0.46740385199586854, + "grad_norm": 0.17332637310028076, + "learning_rate": 5.5374282329608404e-05, + "loss": 1.0258, + "step": 7693 + }, + { + "epoch": 0.46746460902849507, + "grad_norm": 0.317302942276001, + "learning_rate": 5.5364764916692277e-05, + "loss": 1.1503, + "step": 7694 + }, + { + "epoch": 0.46752536606112155, + "grad_norm": 0.23128226399421692, + "learning_rate": 5.5355247307129664e-05, + "loss": 1.0749, + "step": 7695 + }, + { + "epoch": 0.4675861230937481, + "grad_norm": 0.21716414391994476, + "learning_rate": 5.5345729501269396e-05, + "loss": 1.2077, + "step": 7696 + }, + { + "epoch": 0.4676468801263746, + "grad_norm": 0.41507014632225037, + "learning_rate": 5.53362114994604e-05, + "loss": 1.0666, + "step": 7697 + }, + { + "epoch": 0.46770763715900115, + "grad_norm": 0.14391931891441345, + "learning_rate": 5.5326693302051537e-05, + "loss": 1.0611, + "step": 7698 + }, + { + "epoch": 0.4677683941916277, + "grad_norm": 0.15689924359321594, + "learning_rate": 5.531717490939169e-05, + "loss": 1.0617, + "step": 7699 + }, + { + "epoch": 0.4678291512242542, + "grad_norm": 0.15438872575759888, + "learning_rate": 5.530765632182978e-05, + "loss": 1.1145, + "step": 7700 + }, + { + "epoch": 0.46788990825688076, + "grad_norm": 0.2057807892560959, + "learning_rate": 5.529813753971469e-05, + "loss": 1.1581, + "step": 7701 + }, + { + "epoch": 0.46795066528950724, + "grad_norm": 29.648632049560547, + "learning_rate": 5.5288618563395366e-05, + "loss": 1.0382, + "step": 7702 + }, + { + "epoch": 0.4680114223221338, + "grad_norm": 0.15054571628570557, + "learning_rate": 5.527909939322069e-05, + "loss": 1.0575, + "step": 7703 + }, + { + "epoch": 0.4680721793547603, + "grad_norm": 0.1688757836818695, + "learning_rate": 5.5269580029539605e-05, + "loss": 1.1171, + "step": 7704 + }, + { + "epoch": 0.46813293638738684, + "grad_norm": 0.20824523270130157, + "learning_rate": 5.526006047270106e-05, + "loss": 1.0807, + "step": 7705 + }, + { + "epoch": 0.4681936934200134, + "grad_norm": 0.1984352320432663, + "learning_rate": 5.525054072305395e-05, + "loss": 1.0547, + "step": 7706 + }, + { + "epoch": 0.4682544504526399, + "grad_norm": 0.11433682590723038, + "learning_rate": 5.5241020780947284e-05, + "loss": 1.0176, + "step": 7707 + }, + { + "epoch": 0.4683152074852664, + "grad_norm": 0.24109522998332977, + "learning_rate": 5.523150064672999e-05, + "loss": 1.2032, + "step": 7708 + }, + { + "epoch": 0.46837596451789293, + "grad_norm": 0.17710252106189728, + "learning_rate": 5.522198032075101e-05, + "loss": 1.0757, + "step": 7709 + }, + { + "epoch": 0.46843672155051946, + "grad_norm": 0.1847608983516693, + "learning_rate": 5.521245980335934e-05, + "loss": 1.0615, + "step": 7710 + }, + { + "epoch": 0.468497478583146, + "grad_norm": 0.12425071746110916, + "learning_rate": 5.520293909490396e-05, + "loss": 1.0648, + "step": 7711 + }, + { + "epoch": 0.46855823561577253, + "grad_norm": 0.22357232868671417, + "learning_rate": 5.5193418195733825e-05, + "loss": 1.0918, + "step": 7712 + }, + { + "epoch": 0.46861899264839907, + "grad_norm": 0.19007642567157745, + "learning_rate": 5.5183897106197945e-05, + "loss": 1.1689, + "step": 7713 + }, + { + "epoch": 0.4686797496810256, + "grad_norm": 0.2071160525083542, + "learning_rate": 5.517437582664531e-05, + "loss": 1.0804, + "step": 7714 + }, + { + "epoch": 0.4687405067136521, + "grad_norm": 0.14910295605659485, + "learning_rate": 5.516485435742496e-05, + "loss": 1.1555, + "step": 7715 + }, + { + "epoch": 0.4688012637462786, + "grad_norm": 0.12315890938043594, + "learning_rate": 5.515533269888583e-05, + "loss": 1.0108, + "step": 7716 + }, + { + "epoch": 0.46886202077890515, + "grad_norm": 0.18095339834690094, + "learning_rate": 5.514581085137701e-05, + "loss": 1.0851, + "step": 7717 + }, + { + "epoch": 0.4689227778115317, + "grad_norm": 0.20210176706314087, + "learning_rate": 5.51362888152475e-05, + "loss": 1.0758, + "step": 7718 + }, + { + "epoch": 0.4689835348441582, + "grad_norm": 0.1383674144744873, + "learning_rate": 5.512676659084631e-05, + "loss": 1.0767, + "step": 7719 + }, + { + "epoch": 0.46904429187678476, + "grad_norm": 0.1770932674407959, + "learning_rate": 5.511724417852252e-05, + "loss": 1.1355, + "step": 7720 + }, + { + "epoch": 0.46910504890941124, + "grad_norm": 0.1914380043745041, + "learning_rate": 5.510772157862515e-05, + "loss": 1.0826, + "step": 7721 + }, + { + "epoch": 0.4691658059420378, + "grad_norm": 0.13793520629405975, + "learning_rate": 5.5098198791503265e-05, + "loss": 1.0382, + "step": 7722 + }, + { + "epoch": 0.4692265629746643, + "grad_norm": 0.18655921518802643, + "learning_rate": 5.5088675817505905e-05, + "loss": 1.1243, + "step": 7723 + }, + { + "epoch": 0.46928732000729084, + "grad_norm": 2.703308343887329, + "learning_rate": 5.507915265698215e-05, + "loss": 1.0956, + "step": 7724 + }, + { + "epoch": 0.4693480770399174, + "grad_norm": 0.5491136312484741, + "learning_rate": 5.506962931028109e-05, + "loss": 1.0964, + "step": 7725 + }, + { + "epoch": 0.4694088340725439, + "grad_norm": 1.9153352975845337, + "learning_rate": 5.506010577775177e-05, + "loss": 1.0614, + "step": 7726 + }, + { + "epoch": 0.46946959110517045, + "grad_norm": 0.22011426091194153, + "learning_rate": 5.5050582059743296e-05, + "loss": 1.1205, + "step": 7727 + }, + { + "epoch": 0.4695303481377969, + "grad_norm": 0.2554408311843872, + "learning_rate": 5.5041058156604766e-05, + "loss": 1.0518, + "step": 7728 + }, + { + "epoch": 0.46959110517042346, + "grad_norm": 0.25837868452072144, + "learning_rate": 5.503153406868527e-05, + "loss": 1.1819, + "step": 7729 + }, + { + "epoch": 0.46965186220305, + "grad_norm": 0.1197812557220459, + "learning_rate": 5.502200979633392e-05, + "loss": 1.0153, + "step": 7730 + }, + { + "epoch": 0.46971261923567653, + "grad_norm": 0.1912653148174286, + "learning_rate": 5.5012485339899835e-05, + "loss": 1.1036, + "step": 7731 + }, + { + "epoch": 0.46977337626830307, + "grad_norm": 0.17191733419895172, + "learning_rate": 5.5002960699732135e-05, + "loss": 1.1622, + "step": 7732 + }, + { + "epoch": 0.4698341333009296, + "grad_norm": 0.14717574417591095, + "learning_rate": 5.499343587617993e-05, + "loss": 1.0963, + "step": 7733 + }, + { + "epoch": 0.46989489033355614, + "grad_norm": 0.16133275628089905, + "learning_rate": 5.4983910869592384e-05, + "loss": 1.0896, + "step": 7734 + }, + { + "epoch": 0.4699556473661826, + "grad_norm": 0.15688002109527588, + "learning_rate": 5.497438568031861e-05, + "loss": 1.0214, + "step": 7735 + }, + { + "epoch": 0.47001640439880915, + "grad_norm": 0.24089637398719788, + "learning_rate": 5.496486030870777e-05, + "loss": 1.1374, + "step": 7736 + }, + { + "epoch": 0.4700771614314357, + "grad_norm": 0.1316508948802948, + "learning_rate": 5.4955334755109014e-05, + "loss": 1.0939, + "step": 7737 + }, + { + "epoch": 0.4701379184640622, + "grad_norm": 2.5219810009002686, + "learning_rate": 5.49458090198715e-05, + "loss": 1.0275, + "step": 7738 + }, + { + "epoch": 0.47019867549668876, + "grad_norm": 0.284229576587677, + "learning_rate": 5.493628310334442e-05, + "loss": 1.2056, + "step": 7739 + }, + { + "epoch": 0.4702594325293153, + "grad_norm": 0.1665724366903305, + "learning_rate": 5.4926757005876904e-05, + "loss": 1.1052, + "step": 7740 + }, + { + "epoch": 0.47032018956194177, + "grad_norm": 0.14781175553798676, + "learning_rate": 5.491723072781817e-05, + "loss": 1.1326, + "step": 7741 + }, + { + "epoch": 0.4703809465945683, + "grad_norm": 0.15880252420902252, + "learning_rate": 5.4907704269517394e-05, + "loss": 1.1092, + "step": 7742 + }, + { + "epoch": 0.47044170362719484, + "grad_norm": 0.22209881246089935, + "learning_rate": 5.489817763132377e-05, + "loss": 1.107, + "step": 7743 + }, + { + "epoch": 0.4705024606598214, + "grad_norm": 0.2783887982368469, + "learning_rate": 5.4888650813586494e-05, + "loss": 1.1081, + "step": 7744 + }, + { + "epoch": 0.4705632176924479, + "grad_norm": 0.22827184200286865, + "learning_rate": 5.4879123816654786e-05, + "loss": 1.1623, + "step": 7745 + }, + { + "epoch": 0.47062397472507445, + "grad_norm": 0.4204244911670685, + "learning_rate": 5.486959664087784e-05, + "loss": 1.0821, + "step": 7746 + }, + { + "epoch": 0.470684731757701, + "grad_norm": 0.12838798761367798, + "learning_rate": 5.4860069286604896e-05, + "loss": 1.0201, + "step": 7747 + }, + { + "epoch": 0.47074548879032746, + "grad_norm": 0.1261807531118393, + "learning_rate": 5.485054175418516e-05, + "loss": 1.0269, + "step": 7748 + }, + { + "epoch": 0.470806245822954, + "grad_norm": 0.25944215059280396, + "learning_rate": 5.48410140439679e-05, + "loss": 1.0849, + "step": 7749 + }, + { + "epoch": 0.47086700285558053, + "grad_norm": 0.19042851030826569, + "learning_rate": 5.483148615630231e-05, + "loss": 1.1666, + "step": 7750 + }, + { + "epoch": 0.47092775988820706, + "grad_norm": 0.6102296710014343, + "learning_rate": 5.4821958091537675e-05, + "loss": 1.0643, + "step": 7751 + }, + { + "epoch": 0.4709885169208336, + "grad_norm": 0.17072387039661407, + "learning_rate": 5.481242985002325e-05, + "loss": 1.1393, + "step": 7752 + }, + { + "epoch": 0.47104927395346013, + "grad_norm": 0.17193695902824402, + "learning_rate": 5.480290143210825e-05, + "loss": 1.1388, + "step": 7753 + }, + { + "epoch": 0.4711100309860866, + "grad_norm": 0.19930435717105865, + "learning_rate": 5.4793372838141974e-05, + "loss": 1.0579, + "step": 7754 + }, + { + "epoch": 0.47117078801871315, + "grad_norm": 0.20226283371448517, + "learning_rate": 5.47838440684737e-05, + "loss": 1.1862, + "step": 7755 + }, + { + "epoch": 0.4712315450513397, + "grad_norm": 0.22341449558734894, + "learning_rate": 5.4774315123452703e-05, + "loss": 1.19, + "step": 7756 + }, + { + "epoch": 0.4712923020839662, + "grad_norm": 0.14023500680923462, + "learning_rate": 5.476478600342825e-05, + "loss": 1.0576, + "step": 7757 + }, + { + "epoch": 0.47135305911659275, + "grad_norm": 0.17156977951526642, + "learning_rate": 5.4755256708749635e-05, + "loss": 1.0887, + "step": 7758 + }, + { + "epoch": 0.4714138161492193, + "grad_norm": 0.14423759281635284, + "learning_rate": 5.4745727239766195e-05, + "loss": 1.0629, + "step": 7759 + }, + { + "epoch": 0.4714745731818458, + "grad_norm": 0.12208879739046097, + "learning_rate": 5.4736197596827185e-05, + "loss": 1.0569, + "step": 7760 + }, + { + "epoch": 0.4715353302144723, + "grad_norm": 0.16867636144161224, + "learning_rate": 5.4726667780281935e-05, + "loss": 1.113, + "step": 7761 + }, + { + "epoch": 0.47159608724709884, + "grad_norm": 0.2090156078338623, + "learning_rate": 5.471713779047979e-05, + "loss": 1.07, + "step": 7762 + }, + { + "epoch": 0.4716568442797254, + "grad_norm": 0.5601214170455933, + "learning_rate": 5.470760762777001e-05, + "loss": 1.1139, + "step": 7763 + }, + { + "epoch": 0.4717176013123519, + "grad_norm": 0.14180058240890503, + "learning_rate": 5.469807729250198e-05, + "loss": 1.0703, + "step": 7764 + }, + { + "epoch": 0.47177835834497844, + "grad_norm": 0.20316222310066223, + "learning_rate": 5.468854678502503e-05, + "loss": 1.1128, + "step": 7765 + }, + { + "epoch": 0.471839115377605, + "grad_norm": 0.1542683094739914, + "learning_rate": 5.4679016105688475e-05, + "loss": 1.0391, + "step": 7766 + }, + { + "epoch": 0.47189987241023146, + "grad_norm": 0.20195795595645905, + "learning_rate": 5.466948525484169e-05, + "loss": 1.0544, + "step": 7767 + }, + { + "epoch": 0.471960629442858, + "grad_norm": 0.18277931213378906, + "learning_rate": 5.465995423283401e-05, + "loss": 1.1079, + "step": 7768 + }, + { + "epoch": 0.47202138647548453, + "grad_norm": 0.21681025624275208, + "learning_rate": 5.465042304001482e-05, + "loss": 1.1719, + "step": 7769 + }, + { + "epoch": 0.47208214350811106, + "grad_norm": 0.32704997062683105, + "learning_rate": 5.4640891676733466e-05, + "loss": 1.1155, + "step": 7770 + }, + { + "epoch": 0.4721429005407376, + "grad_norm": 0.25209930539131165, + "learning_rate": 5.463136014333933e-05, + "loss": 1.1418, + "step": 7771 + }, + { + "epoch": 0.47220365757336413, + "grad_norm": 0.13654296100139618, + "learning_rate": 5.46218284401818e-05, + "loss": 1.0462, + "step": 7772 + }, + { + "epoch": 0.47226441460599067, + "grad_norm": 0.1953367441892624, + "learning_rate": 5.4612296567610244e-05, + "loss": 1.1316, + "step": 7773 + }, + { + "epoch": 0.47232517163861715, + "grad_norm": 6.810418128967285, + "learning_rate": 5.4602764525974074e-05, + "loss": 1.1277, + "step": 7774 + }, + { + "epoch": 0.4723859286712437, + "grad_norm": 0.20527994632720947, + "learning_rate": 5.459323231562269e-05, + "loss": 1.0892, + "step": 7775 + }, + { + "epoch": 0.4724466857038702, + "grad_norm": 0.2602483034133911, + "learning_rate": 5.458369993690546e-05, + "loss": 1.1021, + "step": 7776 + }, + { + "epoch": 0.47250744273649675, + "grad_norm": 0.19205524027347565, + "learning_rate": 5.4574167390171836e-05, + "loss": 1.1201, + "step": 7777 + }, + { + "epoch": 0.4725681997691233, + "grad_norm": 0.14140817523002625, + "learning_rate": 5.456463467577123e-05, + "loss": 1.0448, + "step": 7778 + }, + { + "epoch": 0.4726289568017498, + "grad_norm": 0.20182453095912933, + "learning_rate": 5.4555101794053054e-05, + "loss": 1.1289, + "step": 7779 + }, + { + "epoch": 0.4726897138343763, + "grad_norm": 0.1880880892276764, + "learning_rate": 5.4545568745366734e-05, + "loss": 1.0591, + "step": 7780 + }, + { + "epoch": 0.47275047086700284, + "grad_norm": 0.23056897521018982, + "learning_rate": 5.4536035530061716e-05, + "loss": 1.0786, + "step": 7781 + }, + { + "epoch": 0.47281122789962937, + "grad_norm": 0.1171664372086525, + "learning_rate": 5.452650214848746e-05, + "loss": 0.9748, + "step": 7782 + }, + { + "epoch": 0.4728719849322559, + "grad_norm": 0.2350465953350067, + "learning_rate": 5.451696860099337e-05, + "loss": 1.0683, + "step": 7783 + }, + { + "epoch": 0.47293274196488244, + "grad_norm": 0.1455424576997757, + "learning_rate": 5.450743488792892e-05, + "loss": 1.1746, + "step": 7784 + }, + { + "epoch": 0.472993498997509, + "grad_norm": 0.12110444903373718, + "learning_rate": 5.4497901009643594e-05, + "loss": 1.0514, + "step": 7785 + }, + { + "epoch": 0.4730542560301355, + "grad_norm": 0.8733172416687012, + "learning_rate": 5.4488366966486804e-05, + "loss": 1.1994, + "step": 7786 + }, + { + "epoch": 0.473115013062762, + "grad_norm": 0.16160254180431366, + "learning_rate": 5.447883275880809e-05, + "loss": 1.0621, + "step": 7787 + }, + { + "epoch": 0.4731757700953885, + "grad_norm": 0.18750369548797607, + "learning_rate": 5.446929838695687e-05, + "loss": 1.2648, + "step": 7788 + }, + { + "epoch": 0.47323652712801506, + "grad_norm": 0.13489563763141632, + "learning_rate": 5.445976385128266e-05, + "loss": 1.0345, + "step": 7789 + }, + { + "epoch": 0.4732972841606416, + "grad_norm": 0.23016774654388428, + "learning_rate": 5.445022915213496e-05, + "loss": 1.1304, + "step": 7790 + }, + { + "epoch": 0.47335804119326813, + "grad_norm": 0.15345828235149384, + "learning_rate": 5.444069428986324e-05, + "loss": 1.0549, + "step": 7791 + }, + { + "epoch": 0.47341879822589467, + "grad_norm": 0.25145938992500305, + "learning_rate": 5.4431159264817e-05, + "loss": 1.0497, + "step": 7792 + }, + { + "epoch": 0.4734795552585212, + "grad_norm": 0.19060321152210236, + "learning_rate": 5.442162407734578e-05, + "loss": 1.0812, + "step": 7793 + }, + { + "epoch": 0.4735403122911477, + "grad_norm": 0.2530083656311035, + "learning_rate": 5.441208872779905e-05, + "loss": 1.1228, + "step": 7794 + }, + { + "epoch": 0.4736010693237742, + "grad_norm": 0.12563717365264893, + "learning_rate": 5.440255321652637e-05, + "loss": 1.0195, + "step": 7795 + }, + { + "epoch": 0.47366182635640075, + "grad_norm": 0.19678513705730438, + "learning_rate": 5.4393017543877266e-05, + "loss": 1.1552, + "step": 7796 + }, + { + "epoch": 0.4737225833890273, + "grad_norm": 0.28398069739341736, + "learning_rate": 5.438348171020123e-05, + "loss": 1.091, + "step": 7797 + }, + { + "epoch": 0.4737833404216538, + "grad_norm": 0.20876304805278778, + "learning_rate": 5.437394571584784e-05, + "loss": 1.0837, + "step": 7798 + }, + { + "epoch": 0.47384409745428036, + "grad_norm": 0.45880165696144104, + "learning_rate": 5.436440956116662e-05, + "loss": 1.3407, + "step": 7799 + }, + { + "epoch": 0.47390485448690683, + "grad_norm": 0.23414337635040283, + "learning_rate": 5.435487324650712e-05, + "loss": 1.0597, + "step": 7800 + }, + { + "epoch": 0.47396561151953337, + "grad_norm": 0.1596781462430954, + "learning_rate": 5.43453367722189e-05, + "loss": 1.0568, + "step": 7801 + }, + { + "epoch": 0.4740263685521599, + "grad_norm": 0.19410710036754608, + "learning_rate": 5.433580013865153e-05, + "loss": 1.0466, + "step": 7802 + }, + { + "epoch": 0.47408712558478644, + "grad_norm": 0.2315683811903, + "learning_rate": 5.432626334615456e-05, + "loss": 1.0954, + "step": 7803 + }, + { + "epoch": 0.474147882617413, + "grad_norm": 0.18427301943302155, + "learning_rate": 5.431672639507758e-05, + "loss": 1.0862, + "step": 7804 + }, + { + "epoch": 0.4742086396500395, + "grad_norm": 0.15700361132621765, + "learning_rate": 5.430718928577013e-05, + "loss": 1.065, + "step": 7805 + }, + { + "epoch": 0.47426939668266604, + "grad_norm": 0.19870778918266296, + "learning_rate": 5.429765201858186e-05, + "loss": 1.0405, + "step": 7806 + }, + { + "epoch": 0.4743301537152925, + "grad_norm": 0.18470308184623718, + "learning_rate": 5.4288114593862305e-05, + "loss": 1.1483, + "step": 7807 + }, + { + "epoch": 0.47439091074791906, + "grad_norm": 0.24553771317005157, + "learning_rate": 5.4278577011961084e-05, + "loss": 1.0676, + "step": 7808 + }, + { + "epoch": 0.4744516677805456, + "grad_norm": 0.1649901121854782, + "learning_rate": 5.4269039273227796e-05, + "loss": 1.1483, + "step": 7809 + }, + { + "epoch": 0.47451242481317213, + "grad_norm": 0.5021433234214783, + "learning_rate": 5.4259501378012055e-05, + "loss": 1.0552, + "step": 7810 + }, + { + "epoch": 0.47457318184579866, + "grad_norm": 0.17254818975925446, + "learning_rate": 5.424996332666347e-05, + "loss": 1.1915, + "step": 7811 + }, + { + "epoch": 0.4746339388784252, + "grad_norm": 0.21085822582244873, + "learning_rate": 5.424042511953165e-05, + "loss": 1.0015, + "step": 7812 + }, + { + "epoch": 0.4746946959110517, + "grad_norm": 0.12940984964370728, + "learning_rate": 5.4230886756966224e-05, + "loss": 1.0798, + "step": 7813 + }, + { + "epoch": 0.4747554529436782, + "grad_norm": 0.15791188180446625, + "learning_rate": 5.4221348239316826e-05, + "loss": 1.0185, + "step": 7814 + }, + { + "epoch": 0.47481620997630475, + "grad_norm": 0.19145667552947998, + "learning_rate": 5.421180956693309e-05, + "loss": 1.0688, + "step": 7815 + }, + { + "epoch": 0.4748769670089313, + "grad_norm": 0.1844419240951538, + "learning_rate": 5.4202270740164676e-05, + "loss": 1.0806, + "step": 7816 + }, + { + "epoch": 0.4749377240415578, + "grad_norm": 0.1794002801179886, + "learning_rate": 5.4192731759361194e-05, + "loss": 1.1276, + "step": 7817 + }, + { + "epoch": 0.47499848107418435, + "grad_norm": 0.19025595486164093, + "learning_rate": 5.418319262487234e-05, + "loss": 1.1032, + "step": 7818 + }, + { + "epoch": 0.4750592381068109, + "grad_norm": 0.16529054939746857, + "learning_rate": 5.417365333704775e-05, + "loss": 1.0487, + "step": 7819 + }, + { + "epoch": 0.47511999513943737, + "grad_norm": 0.3196588456630707, + "learning_rate": 5.4164113896237065e-05, + "loss": 1.1987, + "step": 7820 + }, + { + "epoch": 0.4751807521720639, + "grad_norm": 0.1594213843345642, + "learning_rate": 5.4154574302790006e-05, + "loss": 1.0485, + "step": 7821 + }, + { + "epoch": 0.47524150920469044, + "grad_norm": 0.23622652888298035, + "learning_rate": 5.4145034557056216e-05, + "loss": 1.0181, + "step": 7822 + }, + { + "epoch": 0.475302266237317, + "grad_norm": 0.14866940677165985, + "learning_rate": 5.4135494659385375e-05, + "loss": 1.0821, + "step": 7823 + }, + { + "epoch": 0.4753630232699435, + "grad_norm": 0.16754929721355438, + "learning_rate": 5.4125954610127194e-05, + "loss": 1.0843, + "step": 7824 + }, + { + "epoch": 0.47542378030257004, + "grad_norm": 0.15314020216464996, + "learning_rate": 5.4116414409631334e-05, + "loss": 1.1154, + "step": 7825 + }, + { + "epoch": 0.4754845373351965, + "grad_norm": 0.12049468606710434, + "learning_rate": 5.4106874058247526e-05, + "loss": 1.0753, + "step": 7826 + }, + { + "epoch": 0.47554529436782306, + "grad_norm": 0.16232872009277344, + "learning_rate": 5.409733355632544e-05, + "loss": 1.1097, + "step": 7827 + }, + { + "epoch": 0.4756060514004496, + "grad_norm": 0.13334311544895172, + "learning_rate": 5.40877929042148e-05, + "loss": 1.0695, + "step": 7828 + }, + { + "epoch": 0.4756668084330761, + "grad_norm": 0.23005901277065277, + "learning_rate": 5.407825210226534e-05, + "loss": 1.0816, + "step": 7829 + }, + { + "epoch": 0.47572756546570266, + "grad_norm": 1.7344071865081787, + "learning_rate": 5.406871115082672e-05, + "loss": 1.1305, + "step": 7830 + }, + { + "epoch": 0.4757883224983292, + "grad_norm": 0.22466632723808289, + "learning_rate": 5.405917005024874e-05, + "loss": 1.1343, + "step": 7831 + }, + { + "epoch": 0.47584907953095573, + "grad_norm": 0.16500264406204224, + "learning_rate": 5.4049628800881105e-05, + "loss": 1.0241, + "step": 7832 + }, + { + "epoch": 0.4759098365635822, + "grad_norm": 0.11504673957824707, + "learning_rate": 5.404008740307351e-05, + "loss": 1.0506, + "step": 7833 + }, + { + "epoch": 0.47597059359620875, + "grad_norm": 0.2061557024717331, + "learning_rate": 5.4030545857175754e-05, + "loss": 1.091, + "step": 7834 + }, + { + "epoch": 0.4760313506288353, + "grad_norm": 0.14971303939819336, + "learning_rate": 5.402100416353756e-05, + "loss": 1.1017, + "step": 7835 + }, + { + "epoch": 0.4760921076614618, + "grad_norm": 0.13605548441410065, + "learning_rate": 5.4011462322508674e-05, + "loss": 1.0577, + "step": 7836 + }, + { + "epoch": 0.47615286469408835, + "grad_norm": 0.13173580169677734, + "learning_rate": 5.400192033443886e-05, + "loss": 1.0498, + "step": 7837 + }, + { + "epoch": 0.4762136217267149, + "grad_norm": 0.16944468021392822, + "learning_rate": 5.399237819967788e-05, + "loss": 1.0513, + "step": 7838 + }, + { + "epoch": 0.4762743787593414, + "grad_norm": 0.12402139604091644, + "learning_rate": 5.398283591857552e-05, + "loss": 1.0867, + "step": 7839 + }, + { + "epoch": 0.4763351357919679, + "grad_norm": 0.7062398791313171, + "learning_rate": 5.397329349148154e-05, + "loss": 1.1476, + "step": 7840 + }, + { + "epoch": 0.47639589282459444, + "grad_norm": 0.9580641388893127, + "learning_rate": 5.3963750918745694e-05, + "loss": 1.0157, + "step": 7841 + }, + { + "epoch": 0.47645664985722097, + "grad_norm": 0.24378855526447296, + "learning_rate": 5.3954208200717814e-05, + "loss": 1.2185, + "step": 7842 + }, + { + "epoch": 0.4765174068898475, + "grad_norm": 0.22034133970737457, + "learning_rate": 5.3944665337747655e-05, + "loss": 1.0968, + "step": 7843 + }, + { + "epoch": 0.47657816392247404, + "grad_norm": 0.2139805108308792, + "learning_rate": 5.393512233018504e-05, + "loss": 1.0286, + "step": 7844 + }, + { + "epoch": 0.4766389209551006, + "grad_norm": 0.14700251817703247, + "learning_rate": 5.3925579178379745e-05, + "loss": 1.1194, + "step": 7845 + }, + { + "epoch": 0.47669967798772706, + "grad_norm": 0.20244181156158447, + "learning_rate": 5.3916035882681595e-05, + "loss": 1.1879, + "step": 7846 + }, + { + "epoch": 0.4767604350203536, + "grad_norm": 0.12988512217998505, + "learning_rate": 5.3906492443440384e-05, + "loss": 1.111, + "step": 7847 + }, + { + "epoch": 0.4768211920529801, + "grad_norm": 0.12920653820037842, + "learning_rate": 5.3896948861005935e-05, + "loss": 1.0324, + "step": 7848 + }, + { + "epoch": 0.47688194908560666, + "grad_norm": 0.13387957215309143, + "learning_rate": 5.388740513572809e-05, + "loss": 1.0631, + "step": 7849 + }, + { + "epoch": 0.4769427061182332, + "grad_norm": 0.24146319925785065, + "learning_rate": 5.3877861267956645e-05, + "loss": 1.2698, + "step": 7850 + }, + { + "epoch": 0.47700346315085973, + "grad_norm": 0.12807698547840118, + "learning_rate": 5.386831725804143e-05, + "loss": 1.0056, + "step": 7851 + }, + { + "epoch": 0.47706422018348627, + "grad_norm": 0.33459898829460144, + "learning_rate": 5.385877310633233e-05, + "loss": 1.2747, + "step": 7852 + }, + { + "epoch": 0.47712497721611274, + "grad_norm": 0.16809044778347015, + "learning_rate": 5.3849228813179144e-05, + "loss": 1.1117, + "step": 7853 + }, + { + "epoch": 0.4771857342487393, + "grad_norm": 0.20247362554073334, + "learning_rate": 5.3839684378931724e-05, + "loss": 1.0077, + "step": 7854 + }, + { + "epoch": 0.4772464912813658, + "grad_norm": 7.45745849609375, + "learning_rate": 5.383013980393993e-05, + "loss": 1.0652, + "step": 7855 + }, + { + "epoch": 0.47730724831399235, + "grad_norm": 0.2583889067173004, + "learning_rate": 5.3820595088553626e-05, + "loss": 1.1445, + "step": 7856 + }, + { + "epoch": 0.4773680053466189, + "grad_norm": 0.15075862407684326, + "learning_rate": 5.381105023312267e-05, + "loss": 1.0751, + "step": 7857 + }, + { + "epoch": 0.4774287623792454, + "grad_norm": 0.5770914554595947, + "learning_rate": 5.380150523799693e-05, + "loss": 1.1802, + "step": 7858 + }, + { + "epoch": 0.4774895194118719, + "grad_norm": 0.3534625172615051, + "learning_rate": 5.3791960103526284e-05, + "loss": 1.1225, + "step": 7859 + }, + { + "epoch": 0.47755027644449843, + "grad_norm": 0.2707405686378479, + "learning_rate": 5.378241483006061e-05, + "loss": 1.3073, + "step": 7860 + }, + { + "epoch": 0.47761103347712497, + "grad_norm": 0.180998295545578, + "learning_rate": 5.377286941794979e-05, + "loss": 1.0496, + "step": 7861 + }, + { + "epoch": 0.4776717905097515, + "grad_norm": 0.2080940157175064, + "learning_rate": 5.376332386754369e-05, + "loss": 1.1915, + "step": 7862 + }, + { + "epoch": 0.47773254754237804, + "grad_norm": 0.17377610504627228, + "learning_rate": 5.375377817919226e-05, + "loss": 1.017, + "step": 7863 + }, + { + "epoch": 0.4777933045750046, + "grad_norm": 0.29675769805908203, + "learning_rate": 5.374423235324534e-05, + "loss": 1.15, + "step": 7864 + }, + { + "epoch": 0.4778540616076311, + "grad_norm": 0.24133987724781036, + "learning_rate": 5.3734686390052866e-05, + "loss": 1.1266, + "step": 7865 + }, + { + "epoch": 0.4779148186402576, + "grad_norm": 0.16240110993385315, + "learning_rate": 5.372514028996475e-05, + "loss": 1.0719, + "step": 7866 + }, + { + "epoch": 0.4779755756728841, + "grad_norm": 0.27527379989624023, + "learning_rate": 5.371559405333089e-05, + "loss": 1.0623, + "step": 7867 + }, + { + "epoch": 0.47803633270551066, + "grad_norm": 0.17591215670108795, + "learning_rate": 5.370604768050121e-05, + "loss": 1.1256, + "step": 7868 + }, + { + "epoch": 0.4780970897381372, + "grad_norm": 0.21628862619400024, + "learning_rate": 5.369650117182564e-05, + "loss": 1.0643, + "step": 7869 + }, + { + "epoch": 0.47815784677076373, + "grad_norm": 0.1805424839258194, + "learning_rate": 5.3686954527654096e-05, + "loss": 1.1736, + "step": 7870 + }, + { + "epoch": 0.47821860380339026, + "grad_norm": 2.398637533187866, + "learning_rate": 5.367740774833653e-05, + "loss": 1.1197, + "step": 7871 + }, + { + "epoch": 0.47827936083601674, + "grad_norm": 0.18604539334774017, + "learning_rate": 5.366786083422286e-05, + "loss": 1.1382, + "step": 7872 + }, + { + "epoch": 0.4783401178686433, + "grad_norm": 0.3984617590904236, + "learning_rate": 5.365831378566306e-05, + "loss": 1.3101, + "step": 7873 + }, + { + "epoch": 0.4784008749012698, + "grad_norm": 0.1727215200662613, + "learning_rate": 5.364876660300705e-05, + "loss": 1.0559, + "step": 7874 + }, + { + "epoch": 0.47846163193389635, + "grad_norm": 0.19871488213539124, + "learning_rate": 5.363921928660479e-05, + "loss": 1.0094, + "step": 7875 + }, + { + "epoch": 0.4785223889665229, + "grad_norm": 0.2542242407798767, + "learning_rate": 5.362967183680627e-05, + "loss": 1.1125, + "step": 7876 + }, + { + "epoch": 0.4785831459991494, + "grad_norm": 0.1417936384677887, + "learning_rate": 5.36201242539614e-05, + "loss": 1.0357, + "step": 7877 + }, + { + "epoch": 0.47864390303177595, + "grad_norm": 0.2642741799354553, + "learning_rate": 5.3610576538420185e-05, + "loss": 1.1574, + "step": 7878 + }, + { + "epoch": 0.47870466006440243, + "grad_norm": 0.21164394915103912, + "learning_rate": 5.3601028690532586e-05, + "loss": 1.1746, + "step": 7879 + }, + { + "epoch": 0.47876541709702897, + "grad_norm": 0.32209643721580505, + "learning_rate": 5.359148071064859e-05, + "loss": 1.0919, + "step": 7880 + }, + { + "epoch": 0.4788261741296555, + "grad_norm": 0.2014092057943344, + "learning_rate": 5.358193259911817e-05, + "loss": 1.0956, + "step": 7881 + }, + { + "epoch": 0.47888693116228204, + "grad_norm": 0.1983584761619568, + "learning_rate": 5.35723843562913e-05, + "loss": 1.0658, + "step": 7882 + }, + { + "epoch": 0.47894768819490857, + "grad_norm": 0.2146424651145935, + "learning_rate": 5.356283598251803e-05, + "loss": 1.2607, + "step": 7883 + }, + { + "epoch": 0.4790084452275351, + "grad_norm": 0.2187325656414032, + "learning_rate": 5.3553287478148295e-05, + "loss": 1.012, + "step": 7884 + }, + { + "epoch": 0.4790692022601616, + "grad_norm": 0.202541783452034, + "learning_rate": 5.3543738843532124e-05, + "loss": 1.1371, + "step": 7885 + }, + { + "epoch": 0.4791299592927881, + "grad_norm": 0.17969128489494324, + "learning_rate": 5.353419007901953e-05, + "loss": 1.0692, + "step": 7886 + }, + { + "epoch": 0.47919071632541466, + "grad_norm": 0.4171610176563263, + "learning_rate": 5.352464118496049e-05, + "loss": 1.3031, + "step": 7887 + }, + { + "epoch": 0.4792514733580412, + "grad_norm": 0.7410226464271545, + "learning_rate": 5.351509216170507e-05, + "loss": 1.0556, + "step": 7888 + }, + { + "epoch": 0.4793122303906677, + "grad_norm": 0.1526513546705246, + "learning_rate": 5.350554300960326e-05, + "loss": 1.1034, + "step": 7889 + }, + { + "epoch": 0.47937298742329426, + "grad_norm": 0.17666850984096527, + "learning_rate": 5.349599372900509e-05, + "loss": 1.1811, + "step": 7890 + }, + { + "epoch": 0.4794337444559208, + "grad_norm": 1.730309009552002, + "learning_rate": 5.34864443202606e-05, + "loss": 1.077, + "step": 7891 + }, + { + "epoch": 0.4794945014885473, + "grad_norm": 0.19253453612327576, + "learning_rate": 5.347689478371982e-05, + "loss": 1.0733, + "step": 7892 + }, + { + "epoch": 0.4795552585211738, + "grad_norm": 0.4330640137195587, + "learning_rate": 5.346734511973278e-05, + "loss": 1.1161, + "step": 7893 + }, + { + "epoch": 0.47961601555380035, + "grad_norm": 0.1686244010925293, + "learning_rate": 5.345779532864954e-05, + "loss": 1.0456, + "step": 7894 + }, + { + "epoch": 0.4796767725864269, + "grad_norm": 0.13274863362312317, + "learning_rate": 5.344824541082014e-05, + "loss": 1.0379, + "step": 7895 + }, + { + "epoch": 0.4797375296190534, + "grad_norm": 0.13108132779598236, + "learning_rate": 5.343869536659465e-05, + "loss": 1.0185, + "step": 7896 + }, + { + "epoch": 0.47979828665167995, + "grad_norm": 0.24932914972305298, + "learning_rate": 5.342914519632312e-05, + "loss": 1.0402, + "step": 7897 + }, + { + "epoch": 0.4798590436843065, + "grad_norm": 0.17208437621593475, + "learning_rate": 5.3419594900355585e-05, + "loss": 1.0514, + "step": 7898 + }, + { + "epoch": 0.47991980071693296, + "grad_norm": 0.1338982880115509, + "learning_rate": 5.3410044479042166e-05, + "loss": 1.0036, + "step": 7899 + }, + { + "epoch": 0.4799805577495595, + "grad_norm": 0.2246599793434143, + "learning_rate": 5.3400493932732873e-05, + "loss": 1.1542, + "step": 7900 + }, + { + "epoch": 0.48004131478218603, + "grad_norm": 0.2111661583185196, + "learning_rate": 5.339094326177785e-05, + "loss": 1.1145, + "step": 7901 + }, + { + "epoch": 0.48010207181481257, + "grad_norm": 0.2084105759859085, + "learning_rate": 5.3381392466527135e-05, + "loss": 1.0655, + "step": 7902 + }, + { + "epoch": 0.4801628288474391, + "grad_norm": 0.20062394440174103, + "learning_rate": 5.3371841547330825e-05, + "loss": 1.0884, + "step": 7903 + }, + { + "epoch": 0.48022358588006564, + "grad_norm": 0.3038314878940582, + "learning_rate": 5.3362290504539005e-05, + "loss": 1.0506, + "step": 7904 + }, + { + "epoch": 0.4802843429126921, + "grad_norm": 0.23090730607509613, + "learning_rate": 5.335273933850179e-05, + "loss": 1.0537, + "step": 7905 + }, + { + "epoch": 0.48034509994531865, + "grad_norm": 0.1430114209651947, + "learning_rate": 5.334318804956926e-05, + "loss": 1.0321, + "step": 7906 + }, + { + "epoch": 0.4804058569779452, + "grad_norm": 0.30413711071014404, + "learning_rate": 5.333363663809152e-05, + "loss": 1.0682, + "step": 7907 + }, + { + "epoch": 0.4804666140105717, + "grad_norm": 0.21721361577510834, + "learning_rate": 5.3324085104418683e-05, + "loss": 1.0291, + "step": 7908 + }, + { + "epoch": 0.48052737104319826, + "grad_norm": 0.1294948160648346, + "learning_rate": 5.331453344890088e-05, + "loss": 1.0913, + "step": 7909 + }, + { + "epoch": 0.4805881280758248, + "grad_norm": 0.2155667543411255, + "learning_rate": 5.33049816718882e-05, + "loss": 1.2055, + "step": 7910 + }, + { + "epoch": 0.48064888510845133, + "grad_norm": 0.3649980425834656, + "learning_rate": 5.3295429773730774e-05, + "loss": 1.2326, + "step": 7911 + }, + { + "epoch": 0.4807096421410778, + "grad_norm": 0.2715924084186554, + "learning_rate": 5.328587775477875e-05, + "loss": 1.1139, + "step": 7912 + }, + { + "epoch": 0.48077039917370434, + "grad_norm": 0.13731925189495087, + "learning_rate": 5.327632561538223e-05, + "loss": 1.0574, + "step": 7913 + }, + { + "epoch": 0.4808311562063309, + "grad_norm": 0.15858833491802216, + "learning_rate": 5.3266773355891365e-05, + "loss": 1.0083, + "step": 7914 + }, + { + "epoch": 0.4808919132389574, + "grad_norm": 0.2582811415195465, + "learning_rate": 5.325722097665629e-05, + "loss": 1.0731, + "step": 7915 + }, + { + "epoch": 0.48095267027158395, + "grad_norm": 0.24416813254356384, + "learning_rate": 5.3247668478027144e-05, + "loss": 1.043, + "step": 7916 + }, + { + "epoch": 0.4810134273042105, + "grad_norm": 0.2177872210741043, + "learning_rate": 5.32381158603541e-05, + "loss": 1.0251, + "step": 7917 + }, + { + "epoch": 0.48107418433683696, + "grad_norm": 0.23981410264968872, + "learning_rate": 5.3228563123987266e-05, + "loss": 1.1534, + "step": 7918 + }, + { + "epoch": 0.4811349413694635, + "grad_norm": 0.24031716585159302, + "learning_rate": 5.321901026927685e-05, + "loss": 1.0805, + "step": 7919 + }, + { + "epoch": 0.48119569840209003, + "grad_norm": 0.2082665115594864, + "learning_rate": 5.320945729657298e-05, + "loss": 1.1479, + "step": 7920 + }, + { + "epoch": 0.48125645543471657, + "grad_norm": 0.1649162769317627, + "learning_rate": 5.319990420622583e-05, + "loss": 1.0763, + "step": 7921 + }, + { + "epoch": 0.4813172124673431, + "grad_norm": 0.1867951601743698, + "learning_rate": 5.319035099858558e-05, + "loss": 1.2107, + "step": 7922 + }, + { + "epoch": 0.48137796949996964, + "grad_norm": 0.19225385785102844, + "learning_rate": 5.3180797674002394e-05, + "loss": 1.3577, + "step": 7923 + }, + { + "epoch": 0.4814387265325962, + "grad_norm": 0.29202041029930115, + "learning_rate": 5.317124423282646e-05, + "loss": 1.0761, + "step": 7924 + }, + { + "epoch": 0.48149948356522265, + "grad_norm": 0.1566685438156128, + "learning_rate": 5.316169067540795e-05, + "loss": 1.0715, + "step": 7925 + }, + { + "epoch": 0.4815602405978492, + "grad_norm": 0.1629745215177536, + "learning_rate": 5.315213700209706e-05, + "loss": 1.0888, + "step": 7926 + }, + { + "epoch": 0.4816209976304757, + "grad_norm": 0.14564397931098938, + "learning_rate": 5.314258321324398e-05, + "loss": 1.0502, + "step": 7927 + }, + { + "epoch": 0.48168175466310226, + "grad_norm": 0.15911243855953217, + "learning_rate": 5.31330293091989e-05, + "loss": 1.0768, + "step": 7928 + }, + { + "epoch": 0.4817425116957288, + "grad_norm": 0.18059316277503967, + "learning_rate": 5.312347529031203e-05, + "loss": 1.176, + "step": 7929 + }, + { + "epoch": 0.4818032687283553, + "grad_norm": 0.15501949191093445, + "learning_rate": 5.3113921156933566e-05, + "loss": 1.0504, + "step": 7930 + }, + { + "epoch": 0.4818640257609818, + "grad_norm": 0.21993504464626312, + "learning_rate": 5.3104366909413705e-05, + "loss": 1.2641, + "step": 7931 + }, + { + "epoch": 0.48192478279360834, + "grad_norm": 0.20999452471733093, + "learning_rate": 5.309481254810269e-05, + "loss": 1.0181, + "step": 7932 + }, + { + "epoch": 0.4819855398262349, + "grad_norm": 0.24557897448539734, + "learning_rate": 5.308525807335074e-05, + "loss": 1.0617, + "step": 7933 + }, + { + "epoch": 0.4820462968588614, + "grad_norm": 0.16740214824676514, + "learning_rate": 5.3075703485508036e-05, + "loss": 1.0498, + "step": 7934 + }, + { + "epoch": 0.48210705389148795, + "grad_norm": 0.14449343085289001, + "learning_rate": 5.306614878492483e-05, + "loss": 1.0816, + "step": 7935 + }, + { + "epoch": 0.4821678109241145, + "grad_norm": 0.14462348818778992, + "learning_rate": 5.305659397195135e-05, + "loss": 1.0324, + "step": 7936 + }, + { + "epoch": 0.482228567956741, + "grad_norm": 0.24806468188762665, + "learning_rate": 5.304703904693783e-05, + "loss": 1.0069, + "step": 7937 + }, + { + "epoch": 0.4822893249893675, + "grad_norm": 0.3005404472351074, + "learning_rate": 5.303748401023451e-05, + "loss": 1.1536, + "step": 7938 + }, + { + "epoch": 0.48235008202199403, + "grad_norm": 0.1694445013999939, + "learning_rate": 5.30279288621916e-05, + "loss": 1.0587, + "step": 7939 + }, + { + "epoch": 0.48241083905462057, + "grad_norm": 0.49404528737068176, + "learning_rate": 5.30183736031594e-05, + "loss": 1.1646, + "step": 7940 + }, + { + "epoch": 0.4824715960872471, + "grad_norm": 0.16141681373119354, + "learning_rate": 5.3008818233488124e-05, + "loss": 1.0195, + "step": 7941 + }, + { + "epoch": 0.48253235311987364, + "grad_norm": 0.14614440500736237, + "learning_rate": 5.299926275352802e-05, + "loss": 1.0414, + "step": 7942 + }, + { + "epoch": 0.48259311015250017, + "grad_norm": 0.3665587902069092, + "learning_rate": 5.298970716362939e-05, + "loss": 1.0628, + "step": 7943 + }, + { + "epoch": 0.4826538671851267, + "grad_norm": 0.19485582411289215, + "learning_rate": 5.2980151464142434e-05, + "loss": 1.1274, + "step": 7944 + }, + { + "epoch": 0.4827146242177532, + "grad_norm": 0.2236337959766388, + "learning_rate": 5.297059565541746e-05, + "loss": 1.1642, + "step": 7945 + }, + { + "epoch": 0.4827753812503797, + "grad_norm": 0.470614492893219, + "learning_rate": 5.296103973780475e-05, + "loss": 1.0924, + "step": 7946 + }, + { + "epoch": 0.48283613828300626, + "grad_norm": 0.7623069882392883, + "learning_rate": 5.2951483711654534e-05, + "loss": 1.0472, + "step": 7947 + }, + { + "epoch": 0.4828968953156328, + "grad_norm": 0.19647791981697083, + "learning_rate": 5.294192757731712e-05, + "loss": 1.1392, + "step": 7948 + }, + { + "epoch": 0.4829576523482593, + "grad_norm": 0.23711782693862915, + "learning_rate": 5.2932371335142784e-05, + "loss": 1.0503, + "step": 7949 + }, + { + "epoch": 0.48301840938088586, + "grad_norm": 0.2014499455690384, + "learning_rate": 5.29228149854818e-05, + "loss": 1.0746, + "step": 7950 + }, + { + "epoch": 0.48307916641351234, + "grad_norm": 0.2196071445941925, + "learning_rate": 5.2913258528684476e-05, + "loss": 1.1907, + "step": 7951 + }, + { + "epoch": 0.4831399234461389, + "grad_norm": 0.2745140492916107, + "learning_rate": 5.290370196510108e-05, + "loss": 1.2252, + "step": 7952 + }, + { + "epoch": 0.4832006804787654, + "grad_norm": 0.19129444658756256, + "learning_rate": 5.2894145295081954e-05, + "loss": 1.0874, + "step": 7953 + }, + { + "epoch": 0.48326143751139194, + "grad_norm": 0.25906902551651, + "learning_rate": 5.2884588518977364e-05, + "loss": 1.1451, + "step": 7954 + }, + { + "epoch": 0.4833221945440185, + "grad_norm": 0.16583764553070068, + "learning_rate": 5.287503163713761e-05, + "loss": 1.1273, + "step": 7955 + }, + { + "epoch": 0.483382951576645, + "grad_norm": 0.9115603566169739, + "learning_rate": 5.286547464991304e-05, + "loss": 1.0457, + "step": 7956 + }, + { + "epoch": 0.48344370860927155, + "grad_norm": 0.11355254054069519, + "learning_rate": 5.285591755765391e-05, + "loss": 1.0668, + "step": 7957 + }, + { + "epoch": 0.48350446564189803, + "grad_norm": 0.9551360607147217, + "learning_rate": 5.28463603607106e-05, + "loss": 1.094, + "step": 7958 + }, + { + "epoch": 0.48356522267452456, + "grad_norm": 0.18994790315628052, + "learning_rate": 5.2836803059433385e-05, + "loss": 1.0461, + "step": 7959 + }, + { + "epoch": 0.4836259797071511, + "grad_norm": 0.1270156353712082, + "learning_rate": 5.282724565417261e-05, + "loss": 1.0836, + "step": 7960 + }, + { + "epoch": 0.48368673673977763, + "grad_norm": 0.22180868685245514, + "learning_rate": 5.28176881452786e-05, + "loss": 1.1258, + "step": 7961 + }, + { + "epoch": 0.48374749377240417, + "grad_norm": 0.2829572260379791, + "learning_rate": 5.280813053310168e-05, + "loss": 1.2545, + "step": 7962 + }, + { + "epoch": 0.4838082508050307, + "grad_norm": 0.12600886821746826, + "learning_rate": 5.279857281799221e-05, + "loss": 1.052, + "step": 7963 + }, + { + "epoch": 0.4838690078376572, + "grad_norm": 0.2003079205751419, + "learning_rate": 5.27890150003005e-05, + "loss": 1.0105, + "step": 7964 + }, + { + "epoch": 0.4839297648702837, + "grad_norm": 0.9814311265945435, + "learning_rate": 5.2779457080376895e-05, + "loss": 1.1166, + "step": 7965 + }, + { + "epoch": 0.48399052190291025, + "grad_norm": 0.1546151489019394, + "learning_rate": 5.276989905857178e-05, + "loss": 1.0787, + "step": 7966 + }, + { + "epoch": 0.4840512789355368, + "grad_norm": 0.5525694489479065, + "learning_rate": 5.276034093523545e-05, + "loss": 1.0378, + "step": 7967 + }, + { + "epoch": 0.4841120359681633, + "grad_norm": 0.2193448692560196, + "learning_rate": 5.27507827107183e-05, + "loss": 1.1484, + "step": 7968 + }, + { + "epoch": 0.48417279300078986, + "grad_norm": 0.18941007554531097, + "learning_rate": 5.274122438537069e-05, + "loss": 1.0993, + "step": 7969 + }, + { + "epoch": 0.4842335500334164, + "grad_norm": 0.38067716360092163, + "learning_rate": 5.2731665959542964e-05, + "loss": 1.095, + "step": 7970 + }, + { + "epoch": 0.4842943070660429, + "grad_norm": 0.1646842211484909, + "learning_rate": 5.272210743358549e-05, + "loss": 1.0901, + "step": 7971 + }, + { + "epoch": 0.4843550640986694, + "grad_norm": 0.1659906655550003, + "learning_rate": 5.271254880784865e-05, + "loss": 1.0534, + "step": 7972 + }, + { + "epoch": 0.48441582113129594, + "grad_norm": 0.16462163627147675, + "learning_rate": 5.27029900826828e-05, + "loss": 1.1142, + "step": 7973 + }, + { + "epoch": 0.4844765781639225, + "grad_norm": 0.12224703282117844, + "learning_rate": 5.269343125843833e-05, + "loss": 1.0569, + "step": 7974 + }, + { + "epoch": 0.484537335196549, + "grad_norm": 0.21103839576244354, + "learning_rate": 5.268387233546561e-05, + "loss": 1.0869, + "step": 7975 + }, + { + "epoch": 0.48459809222917555, + "grad_norm": 0.14903800189495087, + "learning_rate": 5.267431331411504e-05, + "loss": 1.04, + "step": 7976 + }, + { + "epoch": 0.484658849261802, + "grad_norm": 0.2757299244403839, + "learning_rate": 5.2664754194737007e-05, + "loss": 1.0386, + "step": 7977 + }, + { + "epoch": 0.48471960629442856, + "grad_norm": 0.11410175263881683, + "learning_rate": 5.2655194977681864e-05, + "loss": 1.0371, + "step": 7978 + }, + { + "epoch": 0.4847803633270551, + "grad_norm": 0.12369665503501892, + "learning_rate": 5.264563566330006e-05, + "loss": 1.0136, + "step": 7979 + }, + { + "epoch": 0.48484112035968163, + "grad_norm": 0.23373769223690033, + "learning_rate": 5.263607625194197e-05, + "loss": 1.0418, + "step": 7980 + }, + { + "epoch": 0.48490187739230817, + "grad_norm": 0.18790516257286072, + "learning_rate": 5.262651674395799e-05, + "loss": 1.1498, + "step": 7981 + }, + { + "epoch": 0.4849626344249347, + "grad_norm": 0.13051916658878326, + "learning_rate": 5.261695713969853e-05, + "loss": 1.0668, + "step": 7982 + }, + { + "epoch": 0.48502339145756124, + "grad_norm": 0.1773120015859604, + "learning_rate": 5.2607397439514e-05, + "loss": 1.1004, + "step": 7983 + }, + { + "epoch": 0.4850841484901877, + "grad_norm": 0.21541526913642883, + "learning_rate": 5.259783764375481e-05, + "loss": 1.2231, + "step": 7984 + }, + { + "epoch": 0.48514490552281425, + "grad_norm": 0.15957030653953552, + "learning_rate": 5.2588277752771383e-05, + "loss": 1.1902, + "step": 7985 + }, + { + "epoch": 0.4852056625554408, + "grad_norm": 0.12087242305278778, + "learning_rate": 5.257871776691413e-05, + "loss": 1.0611, + "step": 7986 + }, + { + "epoch": 0.4852664195880673, + "grad_norm": 0.1997787058353424, + "learning_rate": 5.2569157686533486e-05, + "loss": 1.1269, + "step": 7987 + }, + { + "epoch": 0.48532717662069386, + "grad_norm": 0.29204103350639343, + "learning_rate": 5.2559597511979865e-05, + "loss": 1.2454, + "step": 7988 + }, + { + "epoch": 0.4853879336533204, + "grad_norm": 0.12689681351184845, + "learning_rate": 5.25500372436037e-05, + "loss": 1.034, + "step": 7989 + }, + { + "epoch": 0.48544869068594687, + "grad_norm": 0.234115332365036, + "learning_rate": 5.254047688175544e-05, + "loss": 1.0758, + "step": 7990 + }, + { + "epoch": 0.4855094477185734, + "grad_norm": 0.20598378777503967, + "learning_rate": 5.253091642678549e-05, + "loss": 1.139, + "step": 7991 + }, + { + "epoch": 0.48557020475119994, + "grad_norm": 0.5636376142501831, + "learning_rate": 5.252135587904432e-05, + "loss": 1.1992, + "step": 7992 + }, + { + "epoch": 0.4856309617838265, + "grad_norm": 0.19981490075588226, + "learning_rate": 5.251179523888237e-05, + "loss": 1.13, + "step": 7993 + }, + { + "epoch": 0.485691718816453, + "grad_norm": 0.14565733075141907, + "learning_rate": 5.2502234506650074e-05, + "loss": 1.0073, + "step": 7994 + }, + { + "epoch": 0.48575247584907955, + "grad_norm": 0.2524283230304718, + "learning_rate": 5.249267368269788e-05, + "loss": 1.1495, + "step": 7995 + }, + { + "epoch": 0.4858132328817061, + "grad_norm": 0.2057437300682068, + "learning_rate": 5.2483112767376267e-05, + "loss": 1.0412, + "step": 7996 + }, + { + "epoch": 0.48587398991433256, + "grad_norm": 0.1583324372768402, + "learning_rate": 5.247355176103567e-05, + "loss": 1.275, + "step": 7997 + }, + { + "epoch": 0.4859347469469591, + "grad_norm": 0.14873246848583221, + "learning_rate": 5.246399066402654e-05, + "loss": 1.07, + "step": 7998 + }, + { + "epoch": 0.48599550397958563, + "grad_norm": 0.20360715687274933, + "learning_rate": 5.2454429476699365e-05, + "loss": 1.2321, + "step": 7999 + }, + { + "epoch": 0.48605626101221217, + "grad_norm": 0.1450817734003067, + "learning_rate": 5.2444868199404616e-05, + "loss": 1.0631, + "step": 8000 + }, + { + "epoch": 0.4861170180448387, + "grad_norm": 0.28071364760398865, + "learning_rate": 5.2435306832492724e-05, + "loss": 1.1646, + "step": 8001 + }, + { + "epoch": 0.48617777507746524, + "grad_norm": 0.1492142379283905, + "learning_rate": 5.2425745376314206e-05, + "loss": 1.0385, + "step": 8002 + }, + { + "epoch": 0.48623853211009177, + "grad_norm": 0.1308092325925827, + "learning_rate": 5.2416183831219535e-05, + "loss": 1.0746, + "step": 8003 + }, + { + "epoch": 0.48629928914271825, + "grad_norm": 0.16332940757274628, + "learning_rate": 5.240662219755915e-05, + "loss": 1.1104, + "step": 8004 + }, + { + "epoch": 0.4863600461753448, + "grad_norm": 0.13730299472808838, + "learning_rate": 5.239706047568358e-05, + "loss": 1.1203, + "step": 8005 + }, + { + "epoch": 0.4864208032079713, + "grad_norm": 0.22228753566741943, + "learning_rate": 5.2387498665943294e-05, + "loss": 1.2661, + "step": 8006 + }, + { + "epoch": 0.48648156024059785, + "grad_norm": 0.20831681787967682, + "learning_rate": 5.237793676868879e-05, + "loss": 1.1509, + "step": 8007 + }, + { + "epoch": 0.4865423172732244, + "grad_norm": 1.1074758768081665, + "learning_rate": 5.2368374784270544e-05, + "loss": 1.145, + "step": 8008 + }, + { + "epoch": 0.4866030743058509, + "grad_norm": 0.18163339793682098, + "learning_rate": 5.2358812713039054e-05, + "loss": 1.1492, + "step": 8009 + }, + { + "epoch": 0.4866638313384774, + "grad_norm": 0.17313796281814575, + "learning_rate": 5.234925055534485e-05, + "loss": 1.0973, + "step": 8010 + }, + { + "epoch": 0.48672458837110394, + "grad_norm": 0.14672242105007172, + "learning_rate": 5.233968831153839e-05, + "loss": 1.0585, + "step": 8011 + }, + { + "epoch": 0.4867853454037305, + "grad_norm": 0.2614966332912445, + "learning_rate": 5.233012598197021e-05, + "loss": 1.0867, + "step": 8012 + }, + { + "epoch": 0.486846102436357, + "grad_norm": 0.16797687113285065, + "learning_rate": 5.232056356699083e-05, + "loss": 1.0453, + "step": 8013 + }, + { + "epoch": 0.48690685946898354, + "grad_norm": 0.1362597644329071, + "learning_rate": 5.2311001066950715e-05, + "loss": 1.0838, + "step": 8014 + }, + { + "epoch": 0.4869676165016101, + "grad_norm": 0.1593841165304184, + "learning_rate": 5.2301438482200425e-05, + "loss": 1.0238, + "step": 8015 + }, + { + "epoch": 0.4870283735342366, + "grad_norm": 0.13067477941513062, + "learning_rate": 5.2291875813090465e-05, + "loss": 1.0082, + "step": 8016 + }, + { + "epoch": 0.4870891305668631, + "grad_norm": 0.2266821265220642, + "learning_rate": 5.228231305997136e-05, + "loss": 1.1389, + "step": 8017 + }, + { + "epoch": 0.48714988759948963, + "grad_norm": 0.11394955962896347, + "learning_rate": 5.227275022319361e-05, + "loss": 1.0428, + "step": 8018 + }, + { + "epoch": 0.48721064463211616, + "grad_norm": 0.16107165813446045, + "learning_rate": 5.226318730310776e-05, + "loss": 1.06, + "step": 8019 + }, + { + "epoch": 0.4872714016647427, + "grad_norm": 0.2283700853586197, + "learning_rate": 5.2253624300064365e-05, + "loss": 1.0908, + "step": 8020 + }, + { + "epoch": 0.48733215869736923, + "grad_norm": 0.1669009029865265, + "learning_rate": 5.224406121441392e-05, + "loss": 1.118, + "step": 8021 + }, + { + "epoch": 0.48739291572999577, + "grad_norm": 0.36684319376945496, + "learning_rate": 5.223449804650697e-05, + "loss": 1.006, + "step": 8022 + }, + { + "epoch": 0.48745367276262225, + "grad_norm": 0.15023773908615112, + "learning_rate": 5.222493479669409e-05, + "loss": 1.1483, + "step": 8023 + }, + { + "epoch": 0.4875144297952488, + "grad_norm": 0.6638458371162415, + "learning_rate": 5.221537146532577e-05, + "loss": 1.1516, + "step": 8024 + }, + { + "epoch": 0.4875751868278753, + "grad_norm": 0.2511345148086548, + "learning_rate": 5.220580805275258e-05, + "loss": 1.0713, + "step": 8025 + }, + { + "epoch": 0.48763594386050185, + "grad_norm": 0.14232602715492249, + "learning_rate": 5.219624455932508e-05, + "loss": 1.0901, + "step": 8026 + }, + { + "epoch": 0.4876967008931284, + "grad_norm": 0.205320805311203, + "learning_rate": 5.218668098539381e-05, + "loss": 1.2123, + "step": 8027 + }, + { + "epoch": 0.4877574579257549, + "grad_norm": 0.17856544256210327, + "learning_rate": 5.217711733130932e-05, + "loss": 1.0312, + "step": 8028 + }, + { + "epoch": 0.48781821495838146, + "grad_norm": 0.5738333463668823, + "learning_rate": 5.216755359742218e-05, + "loss": 1.0911, + "step": 8029 + }, + { + "epoch": 0.48787897199100794, + "grad_norm": 0.17153413593769073, + "learning_rate": 5.215798978408295e-05, + "loss": 1.0853, + "step": 8030 + }, + { + "epoch": 0.48793972902363447, + "grad_norm": 0.16926462948322296, + "learning_rate": 5.214842589164218e-05, + "loss": 1.0512, + "step": 8031 + }, + { + "epoch": 0.488000486056261, + "grad_norm": 0.14639419317245483, + "learning_rate": 5.213886192045043e-05, + "loss": 1.0356, + "step": 8032 + }, + { + "epoch": 0.48806124308888754, + "grad_norm": 0.12281551212072372, + "learning_rate": 5.2129297870858304e-05, + "loss": 1.0345, + "step": 8033 + }, + { + "epoch": 0.4881220001215141, + "grad_norm": 0.3668050467967987, + "learning_rate": 5.2119733743216346e-05, + "loss": 1.1403, + "step": 8034 + }, + { + "epoch": 0.4881827571541406, + "grad_norm": 0.18001694977283478, + "learning_rate": 5.2110169537875134e-05, + "loss": 1.2287, + "step": 8035 + }, + { + "epoch": 0.4882435141867671, + "grad_norm": 0.12598273158073425, + "learning_rate": 5.210060525518525e-05, + "loss": 1.0753, + "step": 8036 + }, + { + "epoch": 0.4883042712193936, + "grad_norm": 4.260048866271973, + "learning_rate": 5.209104089549728e-05, + "loss": 1.1117, + "step": 8037 + }, + { + "epoch": 0.48836502825202016, + "grad_norm": 0.20910169184207916, + "learning_rate": 5.2081476459161796e-05, + "loss": 1.0553, + "step": 8038 + }, + { + "epoch": 0.4884257852846467, + "grad_norm": 0.34444794058799744, + "learning_rate": 5.20719119465294e-05, + "loss": 1.1254, + "step": 8039 + }, + { + "epoch": 0.48848654231727323, + "grad_norm": 0.21555787324905396, + "learning_rate": 5.2062347357950655e-05, + "loss": 1.2044, + "step": 8040 + }, + { + "epoch": 0.48854729934989977, + "grad_norm": 0.38426005840301514, + "learning_rate": 5.205278269377618e-05, + "loss": 1.0867, + "step": 8041 + }, + { + "epoch": 0.4886080563825263, + "grad_norm": 0.17334817349910736, + "learning_rate": 5.204321795435656e-05, + "loss": 1.1405, + "step": 8042 + }, + { + "epoch": 0.4886688134151528, + "grad_norm": 0.1316339522600174, + "learning_rate": 5.2033653140042384e-05, + "loss": 1.0939, + "step": 8043 + }, + { + "epoch": 0.4887295704477793, + "grad_norm": 0.3123212158679962, + "learning_rate": 5.202408825118427e-05, + "loss": 1.1619, + "step": 8044 + }, + { + "epoch": 0.48879032748040585, + "grad_norm": 0.22630399465560913, + "learning_rate": 5.201452328813279e-05, + "loss": 1.046, + "step": 8045 + }, + { + "epoch": 0.4888510845130324, + "grad_norm": 0.15358251333236694, + "learning_rate": 5.200495825123858e-05, + "loss": 1.0724, + "step": 8046 + }, + { + "epoch": 0.4889118415456589, + "grad_norm": 0.14148572087287903, + "learning_rate": 5.199539314085224e-05, + "loss": 1.0066, + "step": 8047 + }, + { + "epoch": 0.48897259857828546, + "grad_norm": 0.20466063916683197, + "learning_rate": 5.1985827957324376e-05, + "loss": 1.1558, + "step": 8048 + }, + { + "epoch": 0.489033355610912, + "grad_norm": 0.5848605036735535, + "learning_rate": 5.197626270100561e-05, + "loss": 1.1226, + "step": 8049 + }, + { + "epoch": 0.48909411264353847, + "grad_norm": 0.17252375185489655, + "learning_rate": 5.196669737224654e-05, + "loss": 1.1444, + "step": 8050 + }, + { + "epoch": 0.489154869676165, + "grad_norm": 0.1706843376159668, + "learning_rate": 5.195713197139781e-05, + "loss": 1.1242, + "step": 8051 + }, + { + "epoch": 0.48921562670879154, + "grad_norm": 3.653329372406006, + "learning_rate": 5.194756649881003e-05, + "loss": 1.1025, + "step": 8052 + }, + { + "epoch": 0.4892763837414181, + "grad_norm": 0.368230402469635, + "learning_rate": 5.1938000954833824e-05, + "loss": 1.227, + "step": 8053 + }, + { + "epoch": 0.4893371407740446, + "grad_norm": 0.21818876266479492, + "learning_rate": 5.192843533981981e-05, + "loss": 1.1089, + "step": 8054 + }, + { + "epoch": 0.48939789780667114, + "grad_norm": 0.16938568651676178, + "learning_rate": 5.191886965411863e-05, + "loss": 1.0654, + "step": 8055 + }, + { + "epoch": 0.4894586548392976, + "grad_norm": 0.4583264887332916, + "learning_rate": 5.190930389808091e-05, + "loss": 1.2066, + "step": 8056 + }, + { + "epoch": 0.48951941187192416, + "grad_norm": 0.26365500688552856, + "learning_rate": 5.1899738072057294e-05, + "loss": 1.097, + "step": 8057 + }, + { + "epoch": 0.4895801689045507, + "grad_norm": 0.7178860306739807, + "learning_rate": 5.18901721763984e-05, + "loss": 1.0628, + "step": 8058 + }, + { + "epoch": 0.48964092593717723, + "grad_norm": 0.2661334276199341, + "learning_rate": 5.1880606211454886e-05, + "loss": 1.0743, + "step": 8059 + }, + { + "epoch": 0.48970168296980376, + "grad_norm": 0.2064826935529709, + "learning_rate": 5.187104017757738e-05, + "loss": 1.0727, + "step": 8060 + }, + { + "epoch": 0.4897624400024303, + "grad_norm": 0.1711578071117401, + "learning_rate": 5.186147407511654e-05, + "loss": 1.0394, + "step": 8061 + }, + { + "epoch": 0.48982319703505683, + "grad_norm": 0.25302138924598694, + "learning_rate": 5.185190790442301e-05, + "loss": 1.0506, + "step": 8062 + }, + { + "epoch": 0.4898839540676833, + "grad_norm": 0.14483009278774261, + "learning_rate": 5.184234166584745e-05, + "loss": 1.0901, + "step": 8063 + }, + { + "epoch": 0.48994471110030985, + "grad_norm": 0.2093842625617981, + "learning_rate": 5.1832775359740484e-05, + "loss": 1.0458, + "step": 8064 + }, + { + "epoch": 0.4900054681329364, + "grad_norm": 0.14583586156368256, + "learning_rate": 5.1823208986452786e-05, + "loss": 0.9877, + "step": 8065 + }, + { + "epoch": 0.4900662251655629, + "grad_norm": 0.14035290479660034, + "learning_rate": 5.181364254633501e-05, + "loss": 1.0456, + "step": 8066 + }, + { + "epoch": 0.49012698219818945, + "grad_norm": 0.4639998972415924, + "learning_rate": 5.180407603973783e-05, + "loss": 1.0946, + "step": 8067 + }, + { + "epoch": 0.490187739230816, + "grad_norm": 0.1791357845067978, + "learning_rate": 5.1794509467011864e-05, + "loss": 1.0854, + "step": 8068 + }, + { + "epoch": 0.49024849626344247, + "grad_norm": 0.15634910762310028, + "learning_rate": 5.178494282850782e-05, + "loss": 1.0984, + "step": 8069 + }, + { + "epoch": 0.490309253296069, + "grad_norm": 1.635263442993164, + "learning_rate": 5.177537612457637e-05, + "loss": 1.2299, + "step": 8070 + }, + { + "epoch": 0.49037001032869554, + "grad_norm": 0.13857577741146088, + "learning_rate": 5.176580935556814e-05, + "loss": 1.0622, + "step": 8071 + }, + { + "epoch": 0.4904307673613221, + "grad_norm": 0.12869401276111603, + "learning_rate": 5.175624252183383e-05, + "loss": 1.0787, + "step": 8072 + }, + { + "epoch": 0.4904915243939486, + "grad_norm": 0.15388746559619904, + "learning_rate": 5.174667562372413e-05, + "loss": 1.0931, + "step": 8073 + }, + { + "epoch": 0.49055228142657514, + "grad_norm": 0.17299987375736237, + "learning_rate": 5.1737108661589685e-05, + "loss": 1.1027, + "step": 8074 + }, + { + "epoch": 0.4906130384592017, + "grad_norm": 0.1280059665441513, + "learning_rate": 5.172754163578118e-05, + "loss": 1.0775, + "step": 8075 + }, + { + "epoch": 0.49067379549182816, + "grad_norm": 0.14672118425369263, + "learning_rate": 5.17179745466493e-05, + "loss": 1.1298, + "step": 8076 + }, + { + "epoch": 0.4907345525244547, + "grad_norm": 0.25295767188072205, + "learning_rate": 5.1708407394544755e-05, + "loss": 1.1434, + "step": 8077 + }, + { + "epoch": 0.4907953095570812, + "grad_norm": 0.16360440850257874, + "learning_rate": 5.1698840179818186e-05, + "loss": 1.0919, + "step": 8078 + }, + { + "epoch": 0.49085606658970776, + "grad_norm": 0.36694246530532837, + "learning_rate": 5.1689272902820296e-05, + "loss": 1.0782, + "step": 8079 + }, + { + "epoch": 0.4909168236223343, + "grad_norm": 0.2163536101579666, + "learning_rate": 5.16797055639018e-05, + "loss": 1.2255, + "step": 8080 + }, + { + "epoch": 0.49097758065496083, + "grad_norm": 0.2372448742389679, + "learning_rate": 5.167013816341336e-05, + "loss": 1.0121, + "step": 8081 + }, + { + "epoch": 0.4910383376875873, + "grad_norm": 0.23705677688121796, + "learning_rate": 5.166057070170569e-05, + "loss": 1.0877, + "step": 8082 + }, + { + "epoch": 0.49109909472021385, + "grad_norm": 0.1316308081150055, + "learning_rate": 5.1651003179129476e-05, + "loss": 1.0561, + "step": 8083 + }, + { + "epoch": 0.4911598517528404, + "grad_norm": 0.2815787196159363, + "learning_rate": 5.164143559603543e-05, + "loss": 1.0579, + "step": 8084 + }, + { + "epoch": 0.4912206087854669, + "grad_norm": 0.20056988298892975, + "learning_rate": 5.163186795277423e-05, + "loss": 1.1034, + "step": 8085 + }, + { + "epoch": 0.49128136581809345, + "grad_norm": 0.22818082571029663, + "learning_rate": 5.162230024969661e-05, + "loss": 1.0575, + "step": 8086 + }, + { + "epoch": 0.49134212285072, + "grad_norm": 0.28945690393447876, + "learning_rate": 5.161273248715325e-05, + "loss": 1.099, + "step": 8087 + }, + { + "epoch": 0.4914028798833465, + "grad_norm": 0.2143135666847229, + "learning_rate": 5.1603164665494877e-05, + "loss": 1.1362, + "step": 8088 + }, + { + "epoch": 0.491463636915973, + "grad_norm": 0.1453467160463333, + "learning_rate": 5.159359678507217e-05, + "loss": 1.0261, + "step": 8089 + }, + { + "epoch": 0.49152439394859954, + "grad_norm": 0.15251703560352325, + "learning_rate": 5.15840288462359e-05, + "loss": 1.0341, + "step": 8090 + }, + { + "epoch": 0.49158515098122607, + "grad_norm": 0.20740902423858643, + "learning_rate": 5.1574460849336724e-05, + "loss": 1.0742, + "step": 8091 + }, + { + "epoch": 0.4916459080138526, + "grad_norm": 0.48251283168792725, + "learning_rate": 5.156489279472537e-05, + "loss": 1.0298, + "step": 8092 + }, + { + "epoch": 0.49170666504647914, + "grad_norm": 0.31824442744255066, + "learning_rate": 5.155532468275259e-05, + "loss": 1.1727, + "step": 8093 + }, + { + "epoch": 0.4917674220791057, + "grad_norm": 0.14958001673221588, + "learning_rate": 5.1545756513769084e-05, + "loss": 1.0608, + "step": 8094 + }, + { + "epoch": 0.49182817911173216, + "grad_norm": 0.16572943329811096, + "learning_rate": 5.1536188288125565e-05, + "loss": 1.0245, + "step": 8095 + }, + { + "epoch": 0.4918889361443587, + "grad_norm": 0.20816749334335327, + "learning_rate": 5.152662000617277e-05, + "loss": 1.1165, + "step": 8096 + }, + { + "epoch": 0.4919496931769852, + "grad_norm": 4.0026726722717285, + "learning_rate": 5.151705166826143e-05, + "loss": 1.1173, + "step": 8097 + }, + { + "epoch": 0.49201045020961176, + "grad_norm": 1.2023611068725586, + "learning_rate": 5.1507483274742266e-05, + "loss": 1.0872, + "step": 8098 + }, + { + "epoch": 0.4920712072422383, + "grad_norm": 0.23394490778446198, + "learning_rate": 5.1497914825965996e-05, + "loss": 1.1161, + "step": 8099 + }, + { + "epoch": 0.49213196427486483, + "grad_norm": 0.23241440951824188, + "learning_rate": 5.1488346322283384e-05, + "loss": 1.1167, + "step": 8100 + }, + { + "epoch": 0.49219272130749137, + "grad_norm": 0.182684987783432, + "learning_rate": 5.147877776404515e-05, + "loss": 1.0474, + "step": 8101 + }, + { + "epoch": 0.49225347834011784, + "grad_norm": 0.185987189412117, + "learning_rate": 5.1469209151602014e-05, + "loss": 1.1285, + "step": 8102 + }, + { + "epoch": 0.4923142353727444, + "grad_norm": 0.18138177692890167, + "learning_rate": 5.145964048530475e-05, + "loss": 1.0419, + "step": 8103 + }, + { + "epoch": 0.4923749924053709, + "grad_norm": 0.1576085239648819, + "learning_rate": 5.145007176550408e-05, + "loss": 1.0747, + "step": 8104 + }, + { + "epoch": 0.49243574943799745, + "grad_norm": 0.4329274594783783, + "learning_rate": 5.144050299255074e-05, + "loss": 1.3283, + "step": 8105 + }, + { + "epoch": 0.492496506470624, + "grad_norm": 0.12247934192419052, + "learning_rate": 5.143093416679549e-05, + "loss": 1.0534, + "step": 8106 + }, + { + "epoch": 0.4925572635032505, + "grad_norm": 0.19341446459293365, + "learning_rate": 5.142136528858907e-05, + "loss": 1.095, + "step": 8107 + }, + { + "epoch": 0.49261802053587705, + "grad_norm": 0.3289068043231964, + "learning_rate": 5.1411796358282235e-05, + "loss": 1.1127, + "step": 8108 + }, + { + "epoch": 0.49267877756850353, + "grad_norm": 0.11702846735715866, + "learning_rate": 5.140222737622573e-05, + "loss": 1.0267, + "step": 8109 + }, + { + "epoch": 0.49273953460113007, + "grad_norm": 0.43668898940086365, + "learning_rate": 5.139265834277032e-05, + "loss": 1.1228, + "step": 8110 + }, + { + "epoch": 0.4928002916337566, + "grad_norm": 0.12550920248031616, + "learning_rate": 5.1383089258266725e-05, + "loss": 1.0298, + "step": 8111 + }, + { + "epoch": 0.49286104866638314, + "grad_norm": 0.13258221745491028, + "learning_rate": 5.1373520123065735e-05, + "loss": 1.0591, + "step": 8112 + }, + { + "epoch": 0.4929218056990097, + "grad_norm": 0.13505661487579346, + "learning_rate": 5.1363950937518104e-05, + "loss": 1.0526, + "step": 8113 + }, + { + "epoch": 0.4929825627316362, + "grad_norm": 0.1507914811372757, + "learning_rate": 5.1354381701974597e-05, + "loss": 1.0821, + "step": 8114 + }, + { + "epoch": 0.4930433197642627, + "grad_norm": 0.1936265230178833, + "learning_rate": 5.1344812416785946e-05, + "loss": 1.0794, + "step": 8115 + }, + { + "epoch": 0.4931040767968892, + "grad_norm": 0.23144285380840302, + "learning_rate": 5.133524308230295e-05, + "loss": 1.0622, + "step": 8116 + }, + { + "epoch": 0.49316483382951576, + "grad_norm": 0.20834168791770935, + "learning_rate": 5.1325673698876356e-05, + "loss": 1.1006, + "step": 8117 + }, + { + "epoch": 0.4932255908621423, + "grad_norm": 0.1457124948501587, + "learning_rate": 5.1316104266856936e-05, + "loss": 1.1049, + "step": 8118 + }, + { + "epoch": 0.49328634789476883, + "grad_norm": 0.13936744630336761, + "learning_rate": 5.130653478659546e-05, + "loss": 1.091, + "step": 8119 + }, + { + "epoch": 0.49334710492739536, + "grad_norm": 0.12499111145734787, + "learning_rate": 5.1296965258442706e-05, + "loss": 1.0565, + "step": 8120 + }, + { + "epoch": 0.4934078619600219, + "grad_norm": 0.12328989058732986, + "learning_rate": 5.128739568274944e-05, + "loss": 1.1194, + "step": 8121 + }, + { + "epoch": 0.4934686189926484, + "grad_norm": 0.11281615495681763, + "learning_rate": 5.127782605986643e-05, + "loss": 1.0856, + "step": 8122 + }, + { + "epoch": 0.4935293760252749, + "grad_norm": 0.19786526262760162, + "learning_rate": 5.1268256390144444e-05, + "loss": 1.1847, + "step": 8123 + }, + { + "epoch": 0.49359013305790145, + "grad_norm": 0.2402956634759903, + "learning_rate": 5.125868667393431e-05, + "loss": 1.1225, + "step": 8124 + }, + { + "epoch": 0.493650890090528, + "grad_norm": 0.21540434658527374, + "learning_rate": 5.124911691158674e-05, + "loss": 1.1308, + "step": 8125 + }, + { + "epoch": 0.4937116471231545, + "grad_norm": 0.12889930605888367, + "learning_rate": 5.123954710345256e-05, + "loss": 1.0247, + "step": 8126 + }, + { + "epoch": 0.49377240415578105, + "grad_norm": 0.20945194363594055, + "learning_rate": 5.122997724988255e-05, + "loss": 1.0621, + "step": 8127 + }, + { + "epoch": 0.49383316118840753, + "grad_norm": 0.22293387353420258, + "learning_rate": 5.122040735122747e-05, + "loss": 1.2814, + "step": 8128 + }, + { + "epoch": 0.49389391822103407, + "grad_norm": 0.31326350569725037, + "learning_rate": 5.121083740783813e-05, + "loss": 1.2136, + "step": 8129 + }, + { + "epoch": 0.4939546752536606, + "grad_norm": 0.1952691674232483, + "learning_rate": 5.120126742006531e-05, + "loss": 1.109, + "step": 8130 + }, + { + "epoch": 0.49401543228628714, + "grad_norm": 0.1779486984014511, + "learning_rate": 5.119169738825981e-05, + "loss": 1.1342, + "step": 8131 + }, + { + "epoch": 0.49407618931891367, + "grad_norm": 0.19886529445648193, + "learning_rate": 5.11821273127724e-05, + "loss": 1.1375, + "step": 8132 + }, + { + "epoch": 0.4941369463515402, + "grad_norm": 0.1656958907842636, + "learning_rate": 5.117255719395388e-05, + "loss": 1.0374, + "step": 8133 + }, + { + "epoch": 0.49419770338416674, + "grad_norm": 0.36190176010131836, + "learning_rate": 5.1162987032155074e-05, + "loss": 1.1571, + "step": 8134 + }, + { + "epoch": 0.4942584604167932, + "grad_norm": 0.27930328249931335, + "learning_rate": 5.115341682772674e-05, + "loss": 1.0269, + "step": 8135 + }, + { + "epoch": 0.49431921744941976, + "grad_norm": 0.2375454604625702, + "learning_rate": 5.114384658101967e-05, + "loss": 1.0736, + "step": 8136 + }, + { + "epoch": 0.4943799744820463, + "grad_norm": 0.2661468982696533, + "learning_rate": 5.113427629238472e-05, + "loss": 1.1042, + "step": 8137 + }, + { + "epoch": 0.4944407315146728, + "grad_norm": 0.18297788500785828, + "learning_rate": 5.112470596217261e-05, + "loss": 1.122, + "step": 8138 + }, + { + "epoch": 0.49450148854729936, + "grad_norm": 0.12859390676021576, + "learning_rate": 5.111513559073422e-05, + "loss": 1.0424, + "step": 8139 + }, + { + "epoch": 0.4945622455799259, + "grad_norm": 0.20417475700378418, + "learning_rate": 5.110556517842031e-05, + "loss": 1.1031, + "step": 8140 + }, + { + "epoch": 0.4946230026125524, + "grad_norm": 0.4640609323978424, + "learning_rate": 5.109599472558169e-05, + "loss": 1.1813, + "step": 8141 + }, + { + "epoch": 0.4946837596451789, + "grad_norm": 0.3602086901664734, + "learning_rate": 5.108642423256918e-05, + "loss": 1.2167, + "step": 8142 + }, + { + "epoch": 0.49474451667780545, + "grad_norm": 0.1720881164073944, + "learning_rate": 5.1076853699733576e-05, + "loss": 1.134, + "step": 8143 + }, + { + "epoch": 0.494805273710432, + "grad_norm": 0.16503708064556122, + "learning_rate": 5.106728312742568e-05, + "loss": 1.1311, + "step": 8144 + }, + { + "epoch": 0.4948660307430585, + "grad_norm": 1.6363108158111572, + "learning_rate": 5.105771251599633e-05, + "loss": 1.1103, + "step": 8145 + }, + { + "epoch": 0.49492678777568505, + "grad_norm": 0.1599103808403015, + "learning_rate": 5.104814186579631e-05, + "loss": 1.0433, + "step": 8146 + }, + { + "epoch": 0.4949875448083116, + "grad_norm": 0.135761097073555, + "learning_rate": 5.103857117717647e-05, + "loss": 1.0645, + "step": 8147 + }, + { + "epoch": 0.49504830184093807, + "grad_norm": 2.425630807876587, + "learning_rate": 5.102900045048758e-05, + "loss": 1.0743, + "step": 8148 + }, + { + "epoch": 0.4951090588735646, + "grad_norm": 0.20664967596530914, + "learning_rate": 5.101942968608048e-05, + "loss": 1.086, + "step": 8149 + }, + { + "epoch": 0.49516981590619114, + "grad_norm": 0.20991453528404236, + "learning_rate": 5.1009858884305995e-05, + "loss": 1.1199, + "step": 8150 + }, + { + "epoch": 0.49523057293881767, + "grad_norm": 0.21037231385707855, + "learning_rate": 5.100028804551493e-05, + "loss": 1.0924, + "step": 8151 + }, + { + "epoch": 0.4952913299714442, + "grad_norm": 0.2317184954881668, + "learning_rate": 5.099071717005812e-05, + "loss": 1.1425, + "step": 8152 + }, + { + "epoch": 0.49535208700407074, + "grad_norm": 0.20401878654956818, + "learning_rate": 5.0981146258286385e-05, + "loss": 1.0787, + "step": 8153 + }, + { + "epoch": 0.4954128440366973, + "grad_norm": 0.47571122646331787, + "learning_rate": 5.0971575310550546e-05, + "loss": 1.1633, + "step": 8154 + }, + { + "epoch": 0.49547360106932375, + "grad_norm": 0.18967245519161224, + "learning_rate": 5.0962004327201416e-05, + "loss": 1.0949, + "step": 8155 + }, + { + "epoch": 0.4955343581019503, + "grad_norm": 2.0704503059387207, + "learning_rate": 5.0952433308589844e-05, + "loss": 1.1116, + "step": 8156 + }, + { + "epoch": 0.4955951151345768, + "grad_norm": 0.2872239351272583, + "learning_rate": 5.094286225506664e-05, + "loss": 1.0119, + "step": 8157 + }, + { + "epoch": 0.49565587216720336, + "grad_norm": 0.14626392722129822, + "learning_rate": 5.093329116698263e-05, + "loss": 1.0656, + "step": 8158 + }, + { + "epoch": 0.4957166291998299, + "grad_norm": 0.22473354637622833, + "learning_rate": 5.092372004468865e-05, + "loss": 1.0813, + "step": 8159 + }, + { + "epoch": 0.49577738623245643, + "grad_norm": 0.25407880544662476, + "learning_rate": 5.091414888853554e-05, + "loss": 1.0723, + "step": 8160 + }, + { + "epoch": 0.4958381432650829, + "grad_norm": 0.14181669056415558, + "learning_rate": 5.090457769887412e-05, + "loss": 1.044, + "step": 8161 + }, + { + "epoch": 0.49589890029770944, + "grad_norm": 0.29317861795425415, + "learning_rate": 5.089500647605524e-05, + "loss": 1.1828, + "step": 8162 + }, + { + "epoch": 0.495959657330336, + "grad_norm": 0.1615181714296341, + "learning_rate": 5.088543522042972e-05, + "loss": 1.0371, + "step": 8163 + }, + { + "epoch": 0.4960204143629625, + "grad_norm": 0.16157956421375275, + "learning_rate": 5.08758639323484e-05, + "loss": 1.0826, + "step": 8164 + }, + { + "epoch": 0.49608117139558905, + "grad_norm": 0.14580108225345612, + "learning_rate": 5.086629261216211e-05, + "loss": 1.0761, + "step": 8165 + }, + { + "epoch": 0.4961419284282156, + "grad_norm": 0.14426328241825104, + "learning_rate": 5.0856721260221716e-05, + "loss": 1.0669, + "step": 8166 + }, + { + "epoch": 0.4962026854608421, + "grad_norm": 0.1830640435218811, + "learning_rate": 5.084714987687803e-05, + "loss": 1.0907, + "step": 8167 + }, + { + "epoch": 0.4962634424934686, + "grad_norm": 0.23111943900585175, + "learning_rate": 5.08375784624819e-05, + "loss": 1.144, + "step": 8168 + }, + { + "epoch": 0.49632419952609513, + "grad_norm": 0.26133355498313904, + "learning_rate": 5.082800701738416e-05, + "loss": 1.0952, + "step": 8169 + }, + { + "epoch": 0.49638495655872167, + "grad_norm": 0.20605742931365967, + "learning_rate": 5.0818435541935675e-05, + "loss": 1.12, + "step": 8170 + }, + { + "epoch": 0.4964457135913482, + "grad_norm": 0.1918751299381256, + "learning_rate": 5.080886403648728e-05, + "loss": 1.142, + "step": 8171 + }, + { + "epoch": 0.49650647062397474, + "grad_norm": 1.250995397567749, + "learning_rate": 5.079929250138981e-05, + "loss": 1.0772, + "step": 8172 + }, + { + "epoch": 0.4965672276566013, + "grad_norm": 0.2142866849899292, + "learning_rate": 5.0789720936994115e-05, + "loss": 1.0732, + "step": 8173 + }, + { + "epoch": 0.49662798468922775, + "grad_norm": 0.29276618361473083, + "learning_rate": 5.078014934365105e-05, + "loss": 1.0331, + "step": 8174 + }, + { + "epoch": 0.4966887417218543, + "grad_norm": 0.2521336078643799, + "learning_rate": 5.077057772171146e-05, + "loss": 1.1482, + "step": 8175 + }, + { + "epoch": 0.4967494987544808, + "grad_norm": 0.22795985639095306, + "learning_rate": 5.076100607152621e-05, + "loss": 1.031, + "step": 8176 + }, + { + "epoch": 0.49681025578710736, + "grad_norm": 0.26999640464782715, + "learning_rate": 5.0751434393446116e-05, + "loss": 1.01, + "step": 8177 + }, + { + "epoch": 0.4968710128197339, + "grad_norm": 0.1529611051082611, + "learning_rate": 5.074186268782205e-05, + "loss": 1.0345, + "step": 8178 + }, + { + "epoch": 0.4969317698523604, + "grad_norm": 0.21202926337718964, + "learning_rate": 5.073229095500488e-05, + "loss": 1.1232, + "step": 8179 + }, + { + "epoch": 0.49699252688498696, + "grad_norm": 0.1875039041042328, + "learning_rate": 5.0722719195345415e-05, + "loss": 1.1163, + "step": 8180 + }, + { + "epoch": 0.49705328391761344, + "grad_norm": 0.19611190259456635, + "learning_rate": 5.071314740919457e-05, + "loss": 1.1717, + "step": 8181 + }, + { + "epoch": 0.49711404095024, + "grad_norm": 0.21463800966739655, + "learning_rate": 5.070357559690314e-05, + "loss": 1.0346, + "step": 8182 + }, + { + "epoch": 0.4971747979828665, + "grad_norm": 0.21034328639507294, + "learning_rate": 5.069400375882203e-05, + "loss": 1.1268, + "step": 8183 + }, + { + "epoch": 0.49723555501549305, + "grad_norm": 0.18212296068668365, + "learning_rate": 5.0684431895302086e-05, + "loss": 1.064, + "step": 8184 + }, + { + "epoch": 0.4972963120481196, + "grad_norm": 0.2803708612918854, + "learning_rate": 5.067486000669414e-05, + "loss": 1.155, + "step": 8185 + }, + { + "epoch": 0.4973570690807461, + "grad_norm": 0.2102777659893036, + "learning_rate": 5.066528809334907e-05, + "loss": 1.1148, + "step": 8186 + }, + { + "epoch": 0.4974178261133726, + "grad_norm": 0.17017783224582672, + "learning_rate": 5.065571615561774e-05, + "loss": 1.0931, + "step": 8187 + }, + { + "epoch": 0.49747858314599913, + "grad_norm": 0.2427312582731247, + "learning_rate": 5.064614419385101e-05, + "loss": 1.2314, + "step": 8188 + }, + { + "epoch": 0.49753934017862567, + "grad_norm": 3.5309224128723145, + "learning_rate": 5.0636572208399743e-05, + "loss": 1.0487, + "step": 8189 + }, + { + "epoch": 0.4976000972112522, + "grad_norm": 0.747995138168335, + "learning_rate": 5.062700019961478e-05, + "loss": 1.1525, + "step": 8190 + }, + { + "epoch": 0.49766085424387874, + "grad_norm": 0.23654013872146606, + "learning_rate": 5.061742816784704e-05, + "loss": 1.1189, + "step": 8191 + }, + { + "epoch": 0.49772161127650527, + "grad_norm": 0.1926470249891281, + "learning_rate": 5.0607856113447324e-05, + "loss": 1.1093, + "step": 8192 + }, + { + "epoch": 0.4977823683091318, + "grad_norm": 0.2659144997596741, + "learning_rate": 5.059828403676652e-05, + "loss": 1.0888, + "step": 8193 + }, + { + "epoch": 0.4978431253417583, + "grad_norm": 0.17904962599277496, + "learning_rate": 5.058871193815552e-05, + "loss": 1.0981, + "step": 8194 + }, + { + "epoch": 0.4979038823743848, + "grad_norm": 0.16388067603111267, + "learning_rate": 5.0579139817965146e-05, + "loss": 1.0259, + "step": 8195 + }, + { + "epoch": 0.49796463940701136, + "grad_norm": 0.8273473381996155, + "learning_rate": 5.0569567676546304e-05, + "loss": 1.0974, + "step": 8196 + }, + { + "epoch": 0.4980253964396379, + "grad_norm": 0.9225310683250427, + "learning_rate": 5.055999551424986e-05, + "loss": 1.0881, + "step": 8197 + }, + { + "epoch": 0.4980861534722644, + "grad_norm": 0.20532894134521484, + "learning_rate": 5.055042333142666e-05, + "loss": 1.0351, + "step": 8198 + }, + { + "epoch": 0.49814691050489096, + "grad_norm": 0.13360542058944702, + "learning_rate": 5.054085112842758e-05, + "loss": 1.0417, + "step": 8199 + }, + { + "epoch": 0.49820766753751744, + "grad_norm": 0.2290326952934265, + "learning_rate": 5.053127890560351e-05, + "loss": 1.1253, + "step": 8200 + }, + { + "epoch": 0.498268424570144, + "grad_norm": 0.17761391401290894, + "learning_rate": 5.05217066633053e-05, + "loss": 1.3029, + "step": 8201 + }, + { + "epoch": 0.4983291816027705, + "grad_norm": 0.17378677427768707, + "learning_rate": 5.051213440188383e-05, + "loss": 1.1049, + "step": 8202 + }, + { + "epoch": 0.49838993863539705, + "grad_norm": 0.153955340385437, + "learning_rate": 5.050256212168997e-05, + "loss": 1.0613, + "step": 8203 + }, + { + "epoch": 0.4984506956680236, + "grad_norm": 0.2369835525751114, + "learning_rate": 5.049298982307462e-05, + "loss": 1.1353, + "step": 8204 + }, + { + "epoch": 0.4985114527006501, + "grad_norm": 0.3454437851905823, + "learning_rate": 5.048341750638862e-05, + "loss": 1.0689, + "step": 8205 + }, + { + "epoch": 0.49857220973327665, + "grad_norm": 0.35425442457199097, + "learning_rate": 5.0473845171982844e-05, + "loss": 1.043, + "step": 8206 + }, + { + "epoch": 0.49863296676590313, + "grad_norm": 0.17250806093215942, + "learning_rate": 5.0464272820208214e-05, + "loss": 1.0134, + "step": 8207 + }, + { + "epoch": 0.49869372379852966, + "grad_norm": 0.9371998906135559, + "learning_rate": 5.045470045141554e-05, + "loss": 1.0978, + "step": 8208 + }, + { + "epoch": 0.4987544808311562, + "grad_norm": 0.12351718544960022, + "learning_rate": 5.044512806595575e-05, + "loss": 1.0731, + "step": 8209 + }, + { + "epoch": 0.49881523786378273, + "grad_norm": 0.19571208953857422, + "learning_rate": 5.04355556641797e-05, + "loss": 1.1871, + "step": 8210 + }, + { + "epoch": 0.49887599489640927, + "grad_norm": 0.13838797807693481, + "learning_rate": 5.042598324643828e-05, + "loss": 1.078, + "step": 8211 + }, + { + "epoch": 0.4989367519290358, + "grad_norm": 0.3813706636428833, + "learning_rate": 5.041641081308236e-05, + "loss": 1.0104, + "step": 8212 + }, + { + "epoch": 0.49899750896166234, + "grad_norm": 0.14353570342063904, + "learning_rate": 5.0406838364462804e-05, + "loss": 1.0368, + "step": 8213 + }, + { + "epoch": 0.4990582659942888, + "grad_norm": 0.21878044307231903, + "learning_rate": 5.039726590093053e-05, + "loss": 1.2278, + "step": 8214 + }, + { + "epoch": 0.49911902302691535, + "grad_norm": 1.1359318494796753, + "learning_rate": 5.03876934228364e-05, + "loss": 1.0577, + "step": 8215 + }, + { + "epoch": 0.4991797800595419, + "grad_norm": 0.15227779746055603, + "learning_rate": 5.037812093053127e-05, + "loss": 1.0808, + "step": 8216 + }, + { + "epoch": 0.4992405370921684, + "grad_norm": 0.21359845995903015, + "learning_rate": 5.036854842436608e-05, + "loss": 1.1212, + "step": 8217 + }, + { + "epoch": 0.49930129412479496, + "grad_norm": 0.1925724446773529, + "learning_rate": 5.035897590469164e-05, + "loss": 1.1241, + "step": 8218 + }, + { + "epoch": 0.4993620511574215, + "grad_norm": 0.20035238564014435, + "learning_rate": 5.0349403371858886e-05, + "loss": 1.0131, + "step": 8219 + }, + { + "epoch": 0.499422808190048, + "grad_norm": 0.18301323056221008, + "learning_rate": 5.033983082621868e-05, + "loss": 1.1172, + "step": 8220 + }, + { + "epoch": 0.4994835652226745, + "grad_norm": 0.17572705447673798, + "learning_rate": 5.033025826812191e-05, + "loss": 1.0583, + "step": 8221 + }, + { + "epoch": 0.49954432225530104, + "grad_norm": 0.19760817289352417, + "learning_rate": 5.0320685697919465e-05, + "loss": 1.1087, + "step": 8222 + }, + { + "epoch": 0.4996050792879276, + "grad_norm": 0.26191139221191406, + "learning_rate": 5.0311113115962215e-05, + "loss": 1.0983, + "step": 8223 + }, + { + "epoch": 0.4996658363205541, + "grad_norm": 0.30545851588249207, + "learning_rate": 5.030154052260106e-05, + "loss": 1.1286, + "step": 8224 + }, + { + "epoch": 0.49972659335318065, + "grad_norm": 0.24228700995445251, + "learning_rate": 5.0291967918186875e-05, + "loss": 1.1084, + "step": 8225 + }, + { + "epoch": 0.4997873503858072, + "grad_norm": 0.16840411722660065, + "learning_rate": 5.028239530307054e-05, + "loss": 1.0668, + "step": 8226 + }, + { + "epoch": 0.49984810741843366, + "grad_norm": 0.16302527487277985, + "learning_rate": 5.027282267760296e-05, + "loss": 0.9963, + "step": 8227 + }, + { + "epoch": 0.4999088644510602, + "grad_norm": 0.14595066010951996, + "learning_rate": 5.026325004213504e-05, + "loss": 1.0778, + "step": 8228 + }, + { + "epoch": 0.49996962148368673, + "grad_norm": 0.5399507284164429, + "learning_rate": 5.02536773970176e-05, + "loss": 1.0661, + "step": 8229 + }, + { + "epoch": 0.5000303785163133, + "grad_norm": 0.14924471080303192, + "learning_rate": 5.024410474260157e-05, + "loss": 1.0671, + "step": 8230 + }, + { + "epoch": 0.5000911355489398, + "grad_norm": 0.8931715488433838, + "learning_rate": 5.023453207923784e-05, + "loss": 1.4328, + "step": 8231 + }, + { + "epoch": 0.5001518925815663, + "grad_norm": 0.16851367056369781, + "learning_rate": 5.0224959407277296e-05, + "loss": 1.0402, + "step": 8232 + }, + { + "epoch": 0.5002126496141929, + "grad_norm": 0.13297845423221588, + "learning_rate": 5.0215386727070824e-05, + "loss": 1.0906, + "step": 8233 + }, + { + "epoch": 0.5002734066468194, + "grad_norm": 0.42528483271598816, + "learning_rate": 5.02058140389693e-05, + "loss": 1.078, + "step": 8234 + }, + { + "epoch": 0.5003341636794459, + "grad_norm": 0.1264801323413849, + "learning_rate": 5.019624134332363e-05, + "loss": 1.048, + "step": 8235 + }, + { + "epoch": 0.5003949207120725, + "grad_norm": 0.2873716950416565, + "learning_rate": 5.018666864048469e-05, + "loss": 1.1872, + "step": 8236 + }, + { + "epoch": 0.5004556777446989, + "grad_norm": 0.16146931052207947, + "learning_rate": 5.017709593080338e-05, + "loss": 1.0365, + "step": 8237 + }, + { + "epoch": 0.5005164347773254, + "grad_norm": 0.35578781366348267, + "learning_rate": 5.0167523214630586e-05, + "loss": 1.0896, + "step": 8238 + }, + { + "epoch": 0.500577191809952, + "grad_norm": 0.16928507387638092, + "learning_rate": 5.015795049231717e-05, + "loss": 1.047, + "step": 8239 + }, + { + "epoch": 0.5006379488425785, + "grad_norm": 0.3534541130065918, + "learning_rate": 5.0148377764214084e-05, + "loss": 1.0634, + "step": 8240 + }, + { + "epoch": 0.500698705875205, + "grad_norm": 0.1801421344280243, + "learning_rate": 5.0138805030672176e-05, + "loss": 1.0528, + "step": 8241 + }, + { + "epoch": 0.5007594629078316, + "grad_norm": 0.32417336106300354, + "learning_rate": 5.0129232292042325e-05, + "loss": 1.0255, + "step": 8242 + }, + { + "epoch": 0.5008202199404581, + "grad_norm": 0.1729975789785385, + "learning_rate": 5.0119659548675446e-05, + "loss": 1.0761, + "step": 8243 + }, + { + "epoch": 0.5008809769730846, + "grad_norm": 0.20362064242362976, + "learning_rate": 5.011008680092243e-05, + "loss": 1.102, + "step": 8244 + }, + { + "epoch": 0.5009417340057112, + "grad_norm": 0.42174428701400757, + "learning_rate": 5.010051404913416e-05, + "loss": 1.0797, + "step": 8245 + }, + { + "epoch": 0.5010024910383377, + "grad_norm": 0.13610652089118958, + "learning_rate": 5.009094129366153e-05, + "loss": 1.1002, + "step": 8246 + }, + { + "epoch": 0.5010632480709643, + "grad_norm": 0.14378514885902405, + "learning_rate": 5.008136853485541e-05, + "loss": 1.0406, + "step": 8247 + }, + { + "epoch": 0.5011240051035908, + "grad_norm": 0.14423348009586334, + "learning_rate": 5.007179577306674e-05, + "loss": 1.0981, + "step": 8248 + }, + { + "epoch": 0.5011847621362173, + "grad_norm": 0.22621090710163116, + "learning_rate": 5.006222300864637e-05, + "loss": 1.075, + "step": 8249 + }, + { + "epoch": 0.5012455191688437, + "grad_norm": 0.20006893575191498, + "learning_rate": 5.005265024194519e-05, + "loss": 1.1468, + "step": 8250 + }, + { + "epoch": 0.5013062762014703, + "grad_norm": 0.18724669516086578, + "learning_rate": 5.004307747331413e-05, + "loss": 1.0676, + "step": 8251 + }, + { + "epoch": 0.5013670332340968, + "grad_norm": 0.4504494369029999, + "learning_rate": 5.003350470310403e-05, + "loss": 1.198, + "step": 8252 + }, + { + "epoch": 0.5014277902667233, + "grad_norm": 0.20448404550552368, + "learning_rate": 5.002393193166582e-05, + "loss": 1.1066, + "step": 8253 + }, + { + "epoch": 0.5014885472993499, + "grad_norm": 0.26566049456596375, + "learning_rate": 5.0014359159350385e-05, + "loss": 1.0701, + "step": 8254 + }, + { + "epoch": 0.5015493043319764, + "grad_norm": 0.1513049602508545, + "learning_rate": 5.0004786386508616e-05, + "loss": 1.1191, + "step": 8255 + }, + { + "epoch": 0.501610061364603, + "grad_norm": 0.11535308510065079, + "learning_rate": 4.9995213613491396e-05, + "loss": 1.0038, + "step": 8256 + }, + { + "epoch": 0.5016708183972295, + "grad_norm": 0.18777461349964142, + "learning_rate": 4.998564084064962e-05, + "loss": 1.0201, + "step": 8257 + }, + { + "epoch": 0.501731575429856, + "grad_norm": 0.31867924332618713, + "learning_rate": 4.9976068068334176e-05, + "loss": 1.0506, + "step": 8258 + }, + { + "epoch": 0.5017923324624826, + "grad_norm": 0.14262045919895172, + "learning_rate": 4.9966495296895974e-05, + "loss": 1.136, + "step": 8259 + }, + { + "epoch": 0.5018530894951091, + "grad_norm": 0.1401367485523224, + "learning_rate": 4.995692252668589e-05, + "loss": 1.0649, + "step": 8260 + }, + { + "epoch": 0.5019138465277356, + "grad_norm": 0.1476631611585617, + "learning_rate": 4.9947349758054816e-05, + "loss": 1.1078, + "step": 8261 + }, + { + "epoch": 0.5019746035603622, + "grad_norm": 0.1467423439025879, + "learning_rate": 4.993777699135364e-05, + "loss": 1.078, + "step": 8262 + }, + { + "epoch": 0.5020353605929886, + "grad_norm": 0.36051467061042786, + "learning_rate": 4.9928204226933264e-05, + "loss": 1.2199, + "step": 8263 + }, + { + "epoch": 0.5020961176256151, + "grad_norm": 0.42102688550949097, + "learning_rate": 4.991863146514459e-05, + "loss": 1.045, + "step": 8264 + }, + { + "epoch": 0.5021568746582417, + "grad_norm": 0.12762510776519775, + "learning_rate": 4.990905870633848e-05, + "loss": 1.0754, + "step": 8265 + }, + { + "epoch": 0.5022176316908682, + "grad_norm": 0.18396292626857758, + "learning_rate": 4.989948595086586e-05, + "loss": 1.2735, + "step": 8266 + }, + { + "epoch": 0.5022783887234947, + "grad_norm": 0.13720965385437012, + "learning_rate": 4.988991319907758e-05, + "loss": 1.0712, + "step": 8267 + }, + { + "epoch": 0.5023391457561213, + "grad_norm": 0.14273101091384888, + "learning_rate": 4.988034045132456e-05, + "loss": 1.0345, + "step": 8268 + }, + { + "epoch": 0.5023999027887478, + "grad_norm": 1.1191110610961914, + "learning_rate": 4.9870767707957686e-05, + "loss": 1.172, + "step": 8269 + }, + { + "epoch": 0.5024606598213743, + "grad_norm": 0.1882486194372177, + "learning_rate": 4.9861194969327835e-05, + "loss": 1.0999, + "step": 8270 + }, + { + "epoch": 0.5025214168540009, + "grad_norm": 0.15251389145851135, + "learning_rate": 4.985162223578592e-05, + "loss": 1.0377, + "step": 8271 + }, + { + "epoch": 0.5025821738866274, + "grad_norm": 0.2699245810508728, + "learning_rate": 4.984204950768282e-05, + "loss": 1.1296, + "step": 8272 + }, + { + "epoch": 0.5026429309192539, + "grad_norm": 1.4700466394424438, + "learning_rate": 4.983247678536944e-05, + "loss": 1.1121, + "step": 8273 + }, + { + "epoch": 0.5027036879518805, + "grad_norm": 0.9751190543174744, + "learning_rate": 4.982290406919664e-05, + "loss": 1.0971, + "step": 8274 + }, + { + "epoch": 0.502764444984507, + "grad_norm": 0.29272234439849854, + "learning_rate": 4.981333135951532e-05, + "loss": 1.1349, + "step": 8275 + }, + { + "epoch": 0.5028252020171334, + "grad_norm": 0.31304648518562317, + "learning_rate": 4.980375865667638e-05, + "loss": 1.072, + "step": 8276 + }, + { + "epoch": 0.50288595904976, + "grad_norm": 0.20816628634929657, + "learning_rate": 4.979418596103071e-05, + "loss": 1.0687, + "step": 8277 + }, + { + "epoch": 0.5029467160823865, + "grad_norm": 0.11504765599966049, + "learning_rate": 4.978461327292918e-05, + "loss": 1.067, + "step": 8278 + }, + { + "epoch": 0.503007473115013, + "grad_norm": 0.1644960343837738, + "learning_rate": 4.977504059272273e-05, + "loss": 1.1067, + "step": 8279 + }, + { + "epoch": 0.5030682301476396, + "grad_norm": 0.17996364831924438, + "learning_rate": 4.9765467920762164e-05, + "loss": 1.1757, + "step": 8280 + }, + { + "epoch": 0.5031289871802661, + "grad_norm": 0.24800243973731995, + "learning_rate": 4.975589525739844e-05, + "loss": 1.1103, + "step": 8281 + }, + { + "epoch": 0.5031897442128926, + "grad_norm": 0.12489693611860275, + "learning_rate": 4.974632260298242e-05, + "loss": 1.0354, + "step": 8282 + }, + { + "epoch": 0.5032505012455192, + "grad_norm": 0.2894044518470764, + "learning_rate": 4.973674995786498e-05, + "loss": 1.1608, + "step": 8283 + }, + { + "epoch": 0.5033112582781457, + "grad_norm": 0.1456850916147232, + "learning_rate": 4.9727177322397034e-05, + "loss": 1.0766, + "step": 8284 + }, + { + "epoch": 0.5033720153107722, + "grad_norm": 0.1728762984275818, + "learning_rate": 4.971760469692946e-05, + "loss": 1.1459, + "step": 8285 + }, + { + "epoch": 0.5034327723433988, + "grad_norm": 0.17219896614551544, + "learning_rate": 4.9708032081813144e-05, + "loss": 1.0324, + "step": 8286 + }, + { + "epoch": 0.5034935293760253, + "grad_norm": 0.1751197874546051, + "learning_rate": 4.969845947739896e-05, + "loss": 1.0958, + "step": 8287 + }, + { + "epoch": 0.5035542864086519, + "grad_norm": 0.1838088184595108, + "learning_rate": 4.96888868840378e-05, + "loss": 1.2078, + "step": 8288 + }, + { + "epoch": 0.5036150434412783, + "grad_norm": 0.20298756659030914, + "learning_rate": 4.967931430208055e-05, + "loss": 1.064, + "step": 8289 + }, + { + "epoch": 0.5036758004739048, + "grad_norm": 0.182186096906662, + "learning_rate": 4.96697417318781e-05, + "loss": 1.0071, + "step": 8290 + }, + { + "epoch": 0.5037365575065313, + "grad_norm": 0.22610075771808624, + "learning_rate": 4.9660169173781316e-05, + "loss": 1.2349, + "step": 8291 + }, + { + "epoch": 0.5037973145391579, + "grad_norm": 0.15375541150569916, + "learning_rate": 4.965059662814112e-05, + "loss": 1.0728, + "step": 8292 + }, + { + "epoch": 0.5038580715717844, + "grad_norm": 0.23978929221630096, + "learning_rate": 4.964102409530837e-05, + "loss": 1.004, + "step": 8293 + }, + { + "epoch": 0.503918828604411, + "grad_norm": 0.1773739457130432, + "learning_rate": 4.963145157563395e-05, + "loss": 1.0872, + "step": 8294 + }, + { + "epoch": 0.5039795856370375, + "grad_norm": 0.11503232270479202, + "learning_rate": 4.962187906946873e-05, + "loss": 1.0313, + "step": 8295 + }, + { + "epoch": 0.504040342669664, + "grad_norm": 0.14824290573596954, + "learning_rate": 4.961230657716361e-05, + "loss": 1.0599, + "step": 8296 + }, + { + "epoch": 0.5041010997022906, + "grad_norm": 0.21134766936302185, + "learning_rate": 4.960273409906947e-05, + "loss": 1.0274, + "step": 8297 + }, + { + "epoch": 0.5041618567349171, + "grad_norm": 0.2069268524646759, + "learning_rate": 4.9593161635537194e-05, + "loss": 1.1218, + "step": 8298 + }, + { + "epoch": 0.5042226137675436, + "grad_norm": 2.4647109508514404, + "learning_rate": 4.958358918691764e-05, + "loss": 1.0774, + "step": 8299 + }, + { + "epoch": 0.5042833708001702, + "grad_norm": 1.0594382286071777, + "learning_rate": 4.9574016753561744e-05, + "loss": 1.086, + "step": 8300 + }, + { + "epoch": 0.5043441278327967, + "grad_norm": 0.39914965629577637, + "learning_rate": 4.956444433582031e-05, + "loss": 1.0667, + "step": 8301 + }, + { + "epoch": 0.5044048848654231, + "grad_norm": 8.242327690124512, + "learning_rate": 4.955487193404426e-05, + "loss": 1.0377, + "step": 8302 + }, + { + "epoch": 0.5044656418980497, + "grad_norm": 0.35576769709587097, + "learning_rate": 4.954529954858447e-05, + "loss": 1.1649, + "step": 8303 + }, + { + "epoch": 0.5045263989306762, + "grad_norm": 0.2670916020870209, + "learning_rate": 4.9535727179791804e-05, + "loss": 1.0443, + "step": 8304 + }, + { + "epoch": 0.5045871559633027, + "grad_norm": 13.714424133300781, + "learning_rate": 4.9526154828017154e-05, + "loss": 1.2174, + "step": 8305 + }, + { + "epoch": 0.5046479129959293, + "grad_norm": 0.15306521952152252, + "learning_rate": 4.95165824936114e-05, + "loss": 1.0686, + "step": 8306 + }, + { + "epoch": 0.5047086700285558, + "grad_norm": 0.4864562451839447, + "learning_rate": 4.950701017692541e-05, + "loss": 1.1131, + "step": 8307 + }, + { + "epoch": 0.5047694270611823, + "grad_norm": 0.12344438582658768, + "learning_rate": 4.9497437878310045e-05, + "loss": 1.0559, + "step": 8308 + }, + { + "epoch": 0.5048301840938089, + "grad_norm": 0.17158269882202148, + "learning_rate": 4.9487865598116177e-05, + "loss": 1.0564, + "step": 8309 + }, + { + "epoch": 0.5048909411264354, + "grad_norm": 0.14548590779304504, + "learning_rate": 4.947829333669471e-05, + "loss": 1.0802, + "step": 8310 + }, + { + "epoch": 0.5049516981590619, + "grad_norm": 0.1790032833814621, + "learning_rate": 4.946872109439651e-05, + "loss": 1.029, + "step": 8311 + }, + { + "epoch": 0.5050124551916885, + "grad_norm": 2.055565357208252, + "learning_rate": 4.945914887157242e-05, + "loss": 1.1672, + "step": 8312 + }, + { + "epoch": 0.505073212224315, + "grad_norm": 0.1323356032371521, + "learning_rate": 4.9449576668573374e-05, + "loss": 1.0071, + "step": 8313 + }, + { + "epoch": 0.5051339692569415, + "grad_norm": 1.1997883319854736, + "learning_rate": 4.9440004485750165e-05, + "loss": 1.0596, + "step": 8314 + }, + { + "epoch": 0.505194726289568, + "grad_norm": 5.971543788909912, + "learning_rate": 4.943043232345371e-05, + "loss": 1.1915, + "step": 8315 + }, + { + "epoch": 0.5052554833221945, + "grad_norm": 0.6543163657188416, + "learning_rate": 4.942086018203486e-05, + "loss": 1.1988, + "step": 8316 + }, + { + "epoch": 0.505316240354821, + "grad_norm": 0.17534765601158142, + "learning_rate": 4.941128806184449e-05, + "loss": 1.0399, + "step": 8317 + }, + { + "epoch": 0.5053769973874476, + "grad_norm": 0.15541040897369385, + "learning_rate": 4.940171596323348e-05, + "loss": 1.085, + "step": 8318 + }, + { + "epoch": 0.5054377544200741, + "grad_norm": 0.44979262351989746, + "learning_rate": 4.939214388655269e-05, + "loss": 1.2206, + "step": 8319 + }, + { + "epoch": 0.5054985114527006, + "grad_norm": 0.1236925721168518, + "learning_rate": 4.938257183215299e-05, + "loss": 1.0607, + "step": 8320 + }, + { + "epoch": 0.5055592684853272, + "grad_norm": 0.17472024261951447, + "learning_rate": 4.937299980038523e-05, + "loss": 1.1329, + "step": 8321 + }, + { + "epoch": 0.5056200255179537, + "grad_norm": 0.1306833028793335, + "learning_rate": 4.936342779160027e-05, + "loss": 1.1088, + "step": 8322 + }, + { + "epoch": 0.5056807825505802, + "grad_norm": 0.14495404064655304, + "learning_rate": 4.9353855806149003e-05, + "loss": 1.0233, + "step": 8323 + }, + { + "epoch": 0.5057415395832068, + "grad_norm": 0.13575270771980286, + "learning_rate": 4.934428384438227e-05, + "loss": 1.0718, + "step": 8324 + }, + { + "epoch": 0.5058022966158333, + "grad_norm": 0.26696622371673584, + "learning_rate": 4.933471190665093e-05, + "loss": 1.1328, + "step": 8325 + }, + { + "epoch": 0.5058630536484598, + "grad_norm": 0.18755652010440826, + "learning_rate": 4.9325139993305894e-05, + "loss": 1.1105, + "step": 8326 + }, + { + "epoch": 0.5059238106810864, + "grad_norm": 0.2036897838115692, + "learning_rate": 4.931556810469794e-05, + "loss": 1.0683, + "step": 8327 + }, + { + "epoch": 0.5059845677137128, + "grad_norm": 0.1344156563282013, + "learning_rate": 4.930599624117798e-05, + "loss": 1.0316, + "step": 8328 + }, + { + "epoch": 0.5060453247463393, + "grad_norm": 0.11509817093610764, + "learning_rate": 4.929642440309687e-05, + "loss": 1.0582, + "step": 8329 + }, + { + "epoch": 0.5061060817789659, + "grad_norm": 0.33605095744132996, + "learning_rate": 4.9286852590805446e-05, + "loss": 1.0665, + "step": 8330 + }, + { + "epoch": 0.5061668388115924, + "grad_norm": 0.22083233296871185, + "learning_rate": 4.927728080465458e-05, + "loss": 1.1513, + "step": 8331 + }, + { + "epoch": 0.506227595844219, + "grad_norm": 0.2308490127325058, + "learning_rate": 4.9267709044995136e-05, + "loss": 1.0228, + "step": 8332 + }, + { + "epoch": 0.5062883528768455, + "grad_norm": 0.23748978972434998, + "learning_rate": 4.925813731217797e-05, + "loss": 1.2818, + "step": 8333 + }, + { + "epoch": 0.506349109909472, + "grad_norm": 0.41256120800971985, + "learning_rate": 4.92485656065539e-05, + "loss": 1.2997, + "step": 8334 + }, + { + "epoch": 0.5064098669420986, + "grad_norm": 0.18186990916728973, + "learning_rate": 4.923899392847381e-05, + "loss": 1.1078, + "step": 8335 + }, + { + "epoch": 0.5064706239747251, + "grad_norm": 0.26134192943573, + "learning_rate": 4.922942227828855e-05, + "loss": 1.1419, + "step": 8336 + }, + { + "epoch": 0.5065313810073516, + "grad_norm": 0.33839961886405945, + "learning_rate": 4.921985065634896e-05, + "loss": 1.0407, + "step": 8337 + }, + { + "epoch": 0.5065921380399782, + "grad_norm": 0.1901620626449585, + "learning_rate": 4.921027906300588e-05, + "loss": 1.0358, + "step": 8338 + }, + { + "epoch": 0.5066528950726047, + "grad_norm": 0.23898373544216156, + "learning_rate": 4.920070749861019e-05, + "loss": 1.0535, + "step": 8339 + }, + { + "epoch": 0.5067136521052312, + "grad_norm": 0.17903980612754822, + "learning_rate": 4.9191135963512744e-05, + "loss": 1.047, + "step": 8340 + }, + { + "epoch": 0.5067744091378578, + "grad_norm": 0.12277766317129135, + "learning_rate": 4.918156445806434e-05, + "loss": 1.0323, + "step": 8341 + }, + { + "epoch": 0.5068351661704842, + "grad_norm": 1.125687599182129, + "learning_rate": 4.917199298261585e-05, + "loss": 1.175, + "step": 8342 + }, + { + "epoch": 0.5068959232031107, + "grad_norm": 0.12285816669464111, + "learning_rate": 4.916242153751811e-05, + "loss": 1.0709, + "step": 8343 + }, + { + "epoch": 0.5069566802357373, + "grad_norm": 0.15012361109256744, + "learning_rate": 4.9152850123121977e-05, + "loss": 1.0389, + "step": 8344 + }, + { + "epoch": 0.5070174372683638, + "grad_norm": 0.8612470030784607, + "learning_rate": 4.9143278739778296e-05, + "loss": 1.1423, + "step": 8345 + }, + { + "epoch": 0.5070781943009903, + "grad_norm": 0.22222037613391876, + "learning_rate": 4.91337073878379e-05, + "loss": 1.068, + "step": 8346 + }, + { + "epoch": 0.5071389513336169, + "grad_norm": 40.13809585571289, + "learning_rate": 4.912413606765161e-05, + "loss": 1.2587, + "step": 8347 + }, + { + "epoch": 0.5071997083662434, + "grad_norm": 0.25332459807395935, + "learning_rate": 4.911456477957029e-05, + "loss": 1.0471, + "step": 8348 + }, + { + "epoch": 0.5072604653988699, + "grad_norm": 0.20554561913013458, + "learning_rate": 4.910499352394478e-05, + "loss": 1.0618, + "step": 8349 + }, + { + "epoch": 0.5073212224314965, + "grad_norm": 0.20227853953838348, + "learning_rate": 4.909542230112589e-05, + "loss": 1.1739, + "step": 8350 + }, + { + "epoch": 0.507381979464123, + "grad_norm": 0.16589759290218353, + "learning_rate": 4.9085851111464464e-05, + "loss": 1.0338, + "step": 8351 + }, + { + "epoch": 0.5074427364967495, + "grad_norm": 0.25207895040512085, + "learning_rate": 4.9076279955311357e-05, + "loss": 1.0719, + "step": 8352 + }, + { + "epoch": 0.5075034935293761, + "grad_norm": 0.5040931105613708, + "learning_rate": 4.9066708833017396e-05, + "loss": 1.1578, + "step": 8353 + }, + { + "epoch": 0.5075642505620026, + "grad_norm": 0.238718181848526, + "learning_rate": 4.905713774493339e-05, + "loss": 1.1963, + "step": 8354 + }, + { + "epoch": 0.507625007594629, + "grad_norm": 0.24442973732948303, + "learning_rate": 4.9047566691410174e-05, + "loss": 1.0836, + "step": 8355 + }, + { + "epoch": 0.5076857646272556, + "grad_norm": 0.6389700770378113, + "learning_rate": 4.9037995672798595e-05, + "loss": 1.0674, + "step": 8356 + }, + { + "epoch": 0.5077465216598821, + "grad_norm": 0.16048896312713623, + "learning_rate": 4.9028424689449466e-05, + "loss": 1.0854, + "step": 8357 + }, + { + "epoch": 0.5078072786925086, + "grad_norm": 0.23621916770935059, + "learning_rate": 4.901885374171362e-05, + "loss": 1.0831, + "step": 8358 + }, + { + "epoch": 0.5078680357251352, + "grad_norm": 0.1800125539302826, + "learning_rate": 4.900928282994187e-05, + "loss": 1.0643, + "step": 8359 + }, + { + "epoch": 0.5079287927577617, + "grad_norm": 0.23138806223869324, + "learning_rate": 4.8999711954485075e-05, + "loss": 1.1036, + "step": 8360 + }, + { + "epoch": 0.5079895497903882, + "grad_norm": 0.13819749653339386, + "learning_rate": 4.8990141115694024e-05, + "loss": 1.0323, + "step": 8361 + }, + { + "epoch": 0.5080503068230148, + "grad_norm": 0.14696162939071655, + "learning_rate": 4.898057031391953e-05, + "loss": 1.1208, + "step": 8362 + }, + { + "epoch": 0.5081110638556413, + "grad_norm": 0.12947984039783478, + "learning_rate": 4.897099954951243e-05, + "loss": 1.0478, + "step": 8363 + }, + { + "epoch": 0.5081718208882678, + "grad_norm": 0.1823125034570694, + "learning_rate": 4.896142882282355e-05, + "loss": 1.1095, + "step": 8364 + }, + { + "epoch": 0.5082325779208944, + "grad_norm": 0.5740890502929688, + "learning_rate": 4.895185813420369e-05, + "loss": 1.1337, + "step": 8365 + }, + { + "epoch": 0.5082933349535209, + "grad_norm": 0.18487799167633057, + "learning_rate": 4.8942287484003684e-05, + "loss": 1.1344, + "step": 8366 + }, + { + "epoch": 0.5083540919861474, + "grad_norm": 0.14665530622005463, + "learning_rate": 4.893271687257434e-05, + "loss": 1.0444, + "step": 8367 + }, + { + "epoch": 0.5084148490187739, + "grad_norm": 0.18262310326099396, + "learning_rate": 4.892314630026644e-05, + "loss": 1.1154, + "step": 8368 + }, + { + "epoch": 0.5084756060514004, + "grad_norm": 0.16470301151275635, + "learning_rate": 4.8913575767430834e-05, + "loss": 1.066, + "step": 8369 + }, + { + "epoch": 0.5085363630840269, + "grad_norm": 0.12337921559810638, + "learning_rate": 4.890400527441832e-05, + "loss": 1.0216, + "step": 8370 + }, + { + "epoch": 0.5085971201166535, + "grad_norm": 0.42846640944480896, + "learning_rate": 4.889443482157969e-05, + "loss": 1.2121, + "step": 8371 + }, + { + "epoch": 0.50865787714928, + "grad_norm": 0.332369863986969, + "learning_rate": 4.8884864409265786e-05, + "loss": 1.113, + "step": 8372 + }, + { + "epoch": 0.5087186341819065, + "grad_norm": 0.39648205041885376, + "learning_rate": 4.887529403782739e-05, + "loss": 1.0549, + "step": 8373 + }, + { + "epoch": 0.5087793912145331, + "grad_norm": 0.1325053721666336, + "learning_rate": 4.886572370761531e-05, + "loss": 1.0496, + "step": 8374 + }, + { + "epoch": 0.5088401482471596, + "grad_norm": 0.38651201128959656, + "learning_rate": 4.885615341898034e-05, + "loss": 1.0689, + "step": 8375 + }, + { + "epoch": 0.5089009052797862, + "grad_norm": 0.1649765968322754, + "learning_rate": 4.8846583172273275e-05, + "loss": 1.0749, + "step": 8376 + }, + { + "epoch": 0.5089616623124127, + "grad_norm": 0.11553426831960678, + "learning_rate": 4.8837012967844944e-05, + "loss": 1.0396, + "step": 8377 + }, + { + "epoch": 0.5090224193450392, + "grad_norm": 0.12951795756816864, + "learning_rate": 4.882744280604612e-05, + "loss": 1.0914, + "step": 8378 + }, + { + "epoch": 0.5090831763776658, + "grad_norm": 0.20966719090938568, + "learning_rate": 4.8817872687227594e-05, + "loss": 1.088, + "step": 8379 + }, + { + "epoch": 0.5091439334102923, + "grad_norm": 0.21130825579166412, + "learning_rate": 4.8808302611740215e-05, + "loss": 1.0573, + "step": 8380 + }, + { + "epoch": 0.5092046904429187, + "grad_norm": 0.19537219405174255, + "learning_rate": 4.87987325799347e-05, + "loss": 1.0652, + "step": 8381 + }, + { + "epoch": 0.5092654474755453, + "grad_norm": 0.2663423418998718, + "learning_rate": 4.878916259216188e-05, + "loss": 1.0784, + "step": 8382 + }, + { + "epoch": 0.5093262045081718, + "grad_norm": 0.6188851594924927, + "learning_rate": 4.877959264877254e-05, + "loss": 1.0337, + "step": 8383 + }, + { + "epoch": 0.5093869615407983, + "grad_norm": 0.2325194925069809, + "learning_rate": 4.877002275011746e-05, + "loss": 1.1417, + "step": 8384 + }, + { + "epoch": 0.5094477185734249, + "grad_norm": 0.13020442426204681, + "learning_rate": 4.876045289654744e-05, + "loss": 1.0677, + "step": 8385 + }, + { + "epoch": 0.5095084756060514, + "grad_norm": 0.1499943882226944, + "learning_rate": 4.875088308841326e-05, + "loss": 1.0139, + "step": 8386 + }, + { + "epoch": 0.5095692326386779, + "grad_norm": 0.18690651655197144, + "learning_rate": 4.8741313326065715e-05, + "loss": 1.0417, + "step": 8387 + }, + { + "epoch": 0.5096299896713045, + "grad_norm": 0.21670256555080414, + "learning_rate": 4.873174360985557e-05, + "loss": 1.0929, + "step": 8388 + }, + { + "epoch": 0.509690746703931, + "grad_norm": 0.15587611496448517, + "learning_rate": 4.872217394013359e-05, + "loss": 1.0635, + "step": 8389 + }, + { + "epoch": 0.5097515037365575, + "grad_norm": 0.6354724168777466, + "learning_rate": 4.8712604317250576e-05, + "loss": 1.1001, + "step": 8390 + }, + { + "epoch": 0.5098122607691841, + "grad_norm": 0.15006624162197113, + "learning_rate": 4.87030347415573e-05, + "loss": 1.1248, + "step": 8391 + }, + { + "epoch": 0.5098730178018106, + "grad_norm": 0.12928888201713562, + "learning_rate": 4.869346521340453e-05, + "loss": 1.0064, + "step": 8392 + }, + { + "epoch": 0.5099337748344371, + "grad_norm": 0.18467842042446136, + "learning_rate": 4.868389573314308e-05, + "loss": 1.1236, + "step": 8393 + }, + { + "epoch": 0.5099945318670636, + "grad_norm": 0.74225914478302, + "learning_rate": 4.867432630112365e-05, + "loss": 1.0268, + "step": 8394 + }, + { + "epoch": 0.5100552888996901, + "grad_norm": 0.1632264256477356, + "learning_rate": 4.8664756917697066e-05, + "loss": 1.1005, + "step": 8395 + }, + { + "epoch": 0.5101160459323166, + "grad_norm": 0.17403481900691986, + "learning_rate": 4.8655187583214066e-05, + "loss": 1.0288, + "step": 8396 + }, + { + "epoch": 0.5101768029649432, + "grad_norm": 0.14741408824920654, + "learning_rate": 4.8645618298025415e-05, + "loss": 1.0774, + "step": 8397 + }, + { + "epoch": 0.5102375599975697, + "grad_norm": 0.20694880187511444, + "learning_rate": 4.86360490624819e-05, + "loss": 1.1291, + "step": 8398 + }, + { + "epoch": 0.5102983170301962, + "grad_norm": 0.15443269908428192, + "learning_rate": 4.862647987693427e-05, + "loss": 1.0092, + "step": 8399 + }, + { + "epoch": 0.5103590740628228, + "grad_norm": 0.17777439951896667, + "learning_rate": 4.8616910741733294e-05, + "loss": 1.0872, + "step": 8400 + }, + { + "epoch": 0.5104198310954493, + "grad_norm": 0.12636573612689972, + "learning_rate": 4.860734165722971e-05, + "loss": 1.0549, + "step": 8401 + }, + { + "epoch": 0.5104805881280758, + "grad_norm": 0.21173891425132751, + "learning_rate": 4.859777262377428e-05, + "loss": 1.0293, + "step": 8402 + }, + { + "epoch": 0.5105413451607024, + "grad_norm": 0.2152070701122284, + "learning_rate": 4.858820364171777e-05, + "loss": 1.0677, + "step": 8403 + }, + { + "epoch": 0.5106021021933289, + "grad_norm": 0.2382003664970398, + "learning_rate": 4.8578634711410933e-05, + "loss": 1.0634, + "step": 8404 + }, + { + "epoch": 0.5106628592259554, + "grad_norm": 0.310569703578949, + "learning_rate": 4.856906583320451e-05, + "loss": 1.0424, + "step": 8405 + }, + { + "epoch": 0.510723616258582, + "grad_norm": 0.2448827475309372, + "learning_rate": 4.855949700744926e-05, + "loss": 1.0607, + "step": 8406 + }, + { + "epoch": 0.5107843732912084, + "grad_norm": 0.59648197889328, + "learning_rate": 4.854992823449594e-05, + "loss": 1.1614, + "step": 8407 + }, + { + "epoch": 0.5108451303238349, + "grad_norm": 0.13383783400058746, + "learning_rate": 4.854035951469527e-05, + "loss": 1.0891, + "step": 8408 + }, + { + "epoch": 0.5109058873564615, + "grad_norm": 0.1836208999156952, + "learning_rate": 4.8530790848398e-05, + "loss": 1.1025, + "step": 8409 + }, + { + "epoch": 0.510966644389088, + "grad_norm": 0.16916318237781525, + "learning_rate": 4.8521222235954864e-05, + "loss": 1.1377, + "step": 8410 + }, + { + "epoch": 0.5110274014217145, + "grad_norm": 0.22267036139965057, + "learning_rate": 4.851165367771663e-05, + "loss": 1.0696, + "step": 8411 + }, + { + "epoch": 0.5110881584543411, + "grad_norm": 0.12919865548610687, + "learning_rate": 4.8502085174034015e-05, + "loss": 1.0257, + "step": 8412 + }, + { + "epoch": 0.5111489154869676, + "grad_norm": 0.16686958074569702, + "learning_rate": 4.8492516725257766e-05, + "loss": 1.0933, + "step": 8413 + }, + { + "epoch": 0.5112096725195941, + "grad_norm": 0.9743154048919678, + "learning_rate": 4.84829483317386e-05, + "loss": 1.0628, + "step": 8414 + }, + { + "epoch": 0.5112704295522207, + "grad_norm": 0.20970475673675537, + "learning_rate": 4.847337999382724e-05, + "loss": 1.1368, + "step": 8415 + }, + { + "epoch": 0.5113311865848472, + "grad_norm": 0.6008890867233276, + "learning_rate": 4.8463811711874446e-05, + "loss": 1.0215, + "step": 8416 + }, + { + "epoch": 0.5113919436174738, + "grad_norm": 0.15277495980262756, + "learning_rate": 4.8454243486230934e-05, + "loss": 1.1052, + "step": 8417 + }, + { + "epoch": 0.5114527006501003, + "grad_norm": 0.278396874666214, + "learning_rate": 4.844467531724741e-05, + "loss": 1.2837, + "step": 8418 + }, + { + "epoch": 0.5115134576827268, + "grad_norm": 0.13862337172031403, + "learning_rate": 4.843510720527462e-05, + "loss": 1.0813, + "step": 8419 + }, + { + "epoch": 0.5115742147153532, + "grad_norm": 0.13305699825286865, + "learning_rate": 4.8425539150663294e-05, + "loss": 1.0853, + "step": 8420 + }, + { + "epoch": 0.5116349717479798, + "grad_norm": 0.10755746066570282, + "learning_rate": 4.8415971153764126e-05, + "loss": 1.0058, + "step": 8421 + }, + { + "epoch": 0.5116957287806063, + "grad_norm": 0.1701010763645172, + "learning_rate": 4.840640321492784e-05, + "loss": 1.1213, + "step": 8422 + }, + { + "epoch": 0.5117564858132329, + "grad_norm": 0.20484969019889832, + "learning_rate": 4.839683533450514e-05, + "loss": 1.0533, + "step": 8423 + }, + { + "epoch": 0.5118172428458594, + "grad_norm": 0.15652777254581451, + "learning_rate": 4.8387267512846764e-05, + "loss": 1.0716, + "step": 8424 + }, + { + "epoch": 0.5118779998784859, + "grad_norm": 0.1799188107252121, + "learning_rate": 4.8377699750303404e-05, + "loss": 1.0912, + "step": 8425 + }, + { + "epoch": 0.5119387569111125, + "grad_norm": 0.3735145628452301, + "learning_rate": 4.836813204722577e-05, + "loss": 1.2903, + "step": 8426 + }, + { + "epoch": 0.511999513943739, + "grad_norm": 0.14639516174793243, + "learning_rate": 4.83585644039646e-05, + "loss": 1.0672, + "step": 8427 + }, + { + "epoch": 0.5120602709763655, + "grad_norm": 0.17273227870464325, + "learning_rate": 4.8348996820870535e-05, + "loss": 1.1334, + "step": 8428 + }, + { + "epoch": 0.5121210280089921, + "grad_norm": 0.14567801356315613, + "learning_rate": 4.833942929829433e-05, + "loss": 1.0344, + "step": 8429 + }, + { + "epoch": 0.5121817850416186, + "grad_norm": 0.13478966057300568, + "learning_rate": 4.8329861836586645e-05, + "loss": 1.0883, + "step": 8430 + }, + { + "epoch": 0.5122425420742451, + "grad_norm": 0.23374485969543457, + "learning_rate": 4.832029443609821e-05, + "loss": 1.011, + "step": 8431 + }, + { + "epoch": 0.5123032991068717, + "grad_norm": 0.13496573269367218, + "learning_rate": 4.83107270971797e-05, + "loss": 1.0662, + "step": 8432 + }, + { + "epoch": 0.5123640561394981, + "grad_norm": 0.17448122799396515, + "learning_rate": 4.830115982018183e-05, + "loss": 1.0729, + "step": 8433 + }, + { + "epoch": 0.5124248131721246, + "grad_norm": 0.4021522104740143, + "learning_rate": 4.829159260545527e-05, + "loss": 1.104, + "step": 8434 + }, + { + "epoch": 0.5124855702047512, + "grad_norm": 0.29043328762054443, + "learning_rate": 4.828202545335071e-05, + "loss": 1.1243, + "step": 8435 + }, + { + "epoch": 0.5125463272373777, + "grad_norm": 0.13032442331314087, + "learning_rate": 4.827245836421883e-05, + "loss": 1.063, + "step": 8436 + }, + { + "epoch": 0.5126070842700042, + "grad_norm": 0.17315416038036346, + "learning_rate": 4.8262891338410334e-05, + "loss": 1.0945, + "step": 8437 + }, + { + "epoch": 0.5126678413026308, + "grad_norm": 0.3445857763290405, + "learning_rate": 4.825332437627589e-05, + "loss": 1.1487, + "step": 8438 + }, + { + "epoch": 0.5127285983352573, + "grad_norm": 0.1359899640083313, + "learning_rate": 4.824375747816616e-05, + "loss": 1.0677, + "step": 8439 + }, + { + "epoch": 0.5127893553678838, + "grad_norm": 0.2265770435333252, + "learning_rate": 4.8234190644431874e-05, + "loss": 1.1656, + "step": 8440 + }, + { + "epoch": 0.5128501124005104, + "grad_norm": 0.14991723001003265, + "learning_rate": 4.822462387542365e-05, + "loss": 1.095, + "step": 8441 + }, + { + "epoch": 0.5129108694331369, + "grad_norm": 0.26123762130737305, + "learning_rate": 4.821505717149219e-05, + "loss": 1.1039, + "step": 8442 + }, + { + "epoch": 0.5129716264657634, + "grad_norm": 0.38255685567855835, + "learning_rate": 4.8205490532988154e-05, + "loss": 1.0928, + "step": 8443 + }, + { + "epoch": 0.51303238349839, + "grad_norm": 0.31451302766799927, + "learning_rate": 4.819592396026219e-05, + "loss": 1.118, + "step": 8444 + }, + { + "epoch": 0.5130931405310165, + "grad_norm": 0.2046777606010437, + "learning_rate": 4.8186357453665e-05, + "loss": 1.0964, + "step": 8445 + }, + { + "epoch": 0.513153897563643, + "grad_norm": 0.18283958733081818, + "learning_rate": 4.8176791013547226e-05, + "loss": 1.0803, + "step": 8446 + }, + { + "epoch": 0.5132146545962695, + "grad_norm": 0.11106229573488235, + "learning_rate": 4.816722464025954e-05, + "loss": 1.0044, + "step": 8447 + }, + { + "epoch": 0.513275411628896, + "grad_norm": 0.25317859649658203, + "learning_rate": 4.8157658334152566e-05, + "loss": 1.2103, + "step": 8448 + }, + { + "epoch": 0.5133361686615225, + "grad_norm": 0.20891964435577393, + "learning_rate": 4.8148092095576994e-05, + "loss": 1.1547, + "step": 8449 + }, + { + "epoch": 0.5133969256941491, + "grad_norm": 0.21446585655212402, + "learning_rate": 4.813852592488346e-05, + "loss": 1.3115, + "step": 8450 + }, + { + "epoch": 0.5134576827267756, + "grad_norm": 0.1956537365913391, + "learning_rate": 4.812895982242262e-05, + "loss": 1.1286, + "step": 8451 + }, + { + "epoch": 0.5135184397594021, + "grad_norm": 0.15341708064079285, + "learning_rate": 4.811939378854512e-05, + "loss": 1.058, + "step": 8452 + }, + { + "epoch": 0.5135791967920287, + "grad_norm": 0.14769281446933746, + "learning_rate": 4.81098278236016e-05, + "loss": 1.0884, + "step": 8453 + }, + { + "epoch": 0.5136399538246552, + "grad_norm": 0.21183790266513824, + "learning_rate": 4.810026192794273e-05, + "loss": 1.0245, + "step": 8454 + }, + { + "epoch": 0.5137007108572818, + "grad_norm": 0.14863301813602448, + "learning_rate": 4.809069610191911e-05, + "loss": 1.1093, + "step": 8455 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 0.12465561926364899, + "learning_rate": 4.808113034588138e-05, + "loss": 1.0241, + "step": 8456 + }, + { + "epoch": 0.5138222249225348, + "grad_norm": 0.16065973043441772, + "learning_rate": 4.80715646601802e-05, + "loss": 1.0863, + "step": 8457 + }, + { + "epoch": 0.5138829819551614, + "grad_norm": 0.1811482161283493, + "learning_rate": 4.806199904516619e-05, + "loss": 1.05, + "step": 8458 + }, + { + "epoch": 0.5139437389877879, + "grad_norm": 0.13222797214984894, + "learning_rate": 4.805243350118998e-05, + "loss": 1.0773, + "step": 8459 + }, + { + "epoch": 0.5140044960204143, + "grad_norm": 0.24850741028785706, + "learning_rate": 4.804286802860221e-05, + "loss": 1.2645, + "step": 8460 + }, + { + "epoch": 0.5140652530530408, + "grad_norm": 0.13444441556930542, + "learning_rate": 4.803330262775346e-05, + "loss": 1.085, + "step": 8461 + }, + { + "epoch": 0.5141260100856674, + "grad_norm": 1.200520396232605, + "learning_rate": 4.8023737298994404e-05, + "loss": 1.1363, + "step": 8462 + }, + { + "epoch": 0.5141867671182939, + "grad_norm": 0.22339344024658203, + "learning_rate": 4.8014172042675636e-05, + "loss": 1.1247, + "step": 8463 + }, + { + "epoch": 0.5142475241509205, + "grad_norm": 0.12136029452085495, + "learning_rate": 4.800460685914776e-05, + "loss": 1.0682, + "step": 8464 + }, + { + "epoch": 0.514308281183547, + "grad_norm": 0.16660837829113007, + "learning_rate": 4.799504174876142e-05, + "loss": 1.0777, + "step": 8465 + }, + { + "epoch": 0.5143690382161735, + "grad_norm": 0.24574171006679535, + "learning_rate": 4.7985476711867215e-05, + "loss": 1.2804, + "step": 8466 + }, + { + "epoch": 0.5144297952488001, + "grad_norm": 0.26523008942604065, + "learning_rate": 4.797591174881576e-05, + "loss": 1.1038, + "step": 8467 + }, + { + "epoch": 0.5144905522814266, + "grad_norm": 0.9609251618385315, + "learning_rate": 4.796634685995764e-05, + "loss": 1.1237, + "step": 8468 + }, + { + "epoch": 0.5145513093140531, + "grad_norm": 0.2267611175775528, + "learning_rate": 4.795678204564345e-05, + "loss": 1.0742, + "step": 8469 + }, + { + "epoch": 0.5146120663466797, + "grad_norm": 0.1811976581811905, + "learning_rate": 4.794721730622383e-05, + "loss": 1.0544, + "step": 8470 + }, + { + "epoch": 0.5146728233793062, + "grad_norm": 0.2098594605922699, + "learning_rate": 4.793765264204935e-05, + "loss": 1.13, + "step": 8471 + }, + { + "epoch": 0.5147335804119327, + "grad_norm": 0.19684365391731262, + "learning_rate": 4.79280880534706e-05, + "loss": 1.0976, + "step": 8472 + }, + { + "epoch": 0.5147943374445592, + "grad_norm": 0.12757156789302826, + "learning_rate": 4.791852354083821e-05, + "loss": 1.0299, + "step": 8473 + }, + { + "epoch": 0.5148550944771857, + "grad_norm": 0.1435091644525528, + "learning_rate": 4.790895910450273e-05, + "loss": 1.1124, + "step": 8474 + }, + { + "epoch": 0.5149158515098122, + "grad_norm": 0.19999675452709198, + "learning_rate": 4.789939474481476e-05, + "loss": 1.0936, + "step": 8475 + }, + { + "epoch": 0.5149766085424388, + "grad_norm": 0.14922617375850677, + "learning_rate": 4.788983046212488e-05, + "loss": 1.0786, + "step": 8476 + }, + { + "epoch": 0.5150373655750653, + "grad_norm": 0.21757040917873383, + "learning_rate": 4.7880266256783666e-05, + "loss": 1.0763, + "step": 8477 + }, + { + "epoch": 0.5150981226076918, + "grad_norm": 0.12935255467891693, + "learning_rate": 4.78707021291417e-05, + "loss": 1.0149, + "step": 8478 + }, + { + "epoch": 0.5151588796403184, + "grad_norm": 0.14098288118839264, + "learning_rate": 4.7861138079549566e-05, + "loss": 1.0358, + "step": 8479 + }, + { + "epoch": 0.5152196366729449, + "grad_norm": 0.12675568461418152, + "learning_rate": 4.7851574108357845e-05, + "loss": 1.0817, + "step": 8480 + }, + { + "epoch": 0.5152803937055714, + "grad_norm": 0.16993513703346252, + "learning_rate": 4.784201021591707e-05, + "loss": 1.0767, + "step": 8481 + }, + { + "epoch": 0.515341150738198, + "grad_norm": 0.9628027677536011, + "learning_rate": 4.7832446402577825e-05, + "loss": 1.0496, + "step": 8482 + }, + { + "epoch": 0.5154019077708245, + "grad_norm": 0.16150809824466705, + "learning_rate": 4.7822882668690685e-05, + "loss": 1.1095, + "step": 8483 + }, + { + "epoch": 0.515462664803451, + "grad_norm": 0.1148170679807663, + "learning_rate": 4.78133190146062e-05, + "loss": 0.9828, + "step": 8484 + }, + { + "epoch": 0.5155234218360776, + "grad_norm": 0.15926486253738403, + "learning_rate": 4.7803755440674916e-05, + "loss": 1.1507, + "step": 8485 + }, + { + "epoch": 0.515584178868704, + "grad_norm": 0.1537747085094452, + "learning_rate": 4.7794191947247414e-05, + "loss": 1.0779, + "step": 8486 + }, + { + "epoch": 0.5156449359013305, + "grad_norm": 0.17183583974838257, + "learning_rate": 4.778462853467425e-05, + "loss": 1.1285, + "step": 8487 + }, + { + "epoch": 0.5157056929339571, + "grad_norm": 0.11602478474378586, + "learning_rate": 4.777506520330593e-05, + "loss": 1.0038, + "step": 8488 + }, + { + "epoch": 0.5157664499665836, + "grad_norm": 0.22357726097106934, + "learning_rate": 4.7765501953493033e-05, + "loss": 1.156, + "step": 8489 + }, + { + "epoch": 0.5158272069992101, + "grad_norm": 0.1359436810016632, + "learning_rate": 4.7755938785586086e-05, + "loss": 1.0895, + "step": 8490 + }, + { + "epoch": 0.5158879640318367, + "grad_norm": 0.14483585953712463, + "learning_rate": 4.7746375699935646e-05, + "loss": 1.0647, + "step": 8491 + }, + { + "epoch": 0.5159487210644632, + "grad_norm": 0.1440819501876831, + "learning_rate": 4.773681269689224e-05, + "loss": 1.0651, + "step": 8492 + }, + { + "epoch": 0.5160094780970897, + "grad_norm": 0.7805384397506714, + "learning_rate": 4.772724977680639e-05, + "loss": 1.2545, + "step": 8493 + }, + { + "epoch": 0.5160702351297163, + "grad_norm": 0.14209496974945068, + "learning_rate": 4.7717686940028675e-05, + "loss": 1.0452, + "step": 8494 + }, + { + "epoch": 0.5161309921623428, + "grad_norm": 0.13533027470111847, + "learning_rate": 4.770812418690954e-05, + "loss": 1.0473, + "step": 8495 + }, + { + "epoch": 0.5161917491949694, + "grad_norm": 0.4063666760921478, + "learning_rate": 4.769856151779959e-05, + "loss": 1.1717, + "step": 8496 + }, + { + "epoch": 0.5162525062275959, + "grad_norm": 0.16200560331344604, + "learning_rate": 4.768899893304929e-05, + "loss": 1.1014, + "step": 8497 + }, + { + "epoch": 0.5163132632602224, + "grad_norm": 2.216160774230957, + "learning_rate": 4.767943643300918e-05, + "loss": 1.0507, + "step": 8498 + }, + { + "epoch": 0.5163740202928488, + "grad_norm": 0.19600948691368103, + "learning_rate": 4.7669874018029785e-05, + "loss": 1.154, + "step": 8499 + }, + { + "epoch": 0.5164347773254754, + "grad_norm": 0.2035873681306839, + "learning_rate": 4.766031168846162e-05, + "loss": 1.1135, + "step": 8500 + }, + { + "epoch": 0.5164955343581019, + "grad_norm": 0.1573156863451004, + "learning_rate": 4.765074944465517e-05, + "loss": 1.0793, + "step": 8501 + }, + { + "epoch": 0.5165562913907285, + "grad_norm": 0.17408792674541473, + "learning_rate": 4.764118728696096e-05, + "loss": 1.0975, + "step": 8502 + }, + { + "epoch": 0.516617048423355, + "grad_norm": 0.14286479353904724, + "learning_rate": 4.763162521572947e-05, + "loss": 0.995, + "step": 8503 + }, + { + "epoch": 0.5166778054559815, + "grad_norm": 0.15472820401191711, + "learning_rate": 4.7622063231311225e-05, + "loss": 1.0947, + "step": 8504 + }, + { + "epoch": 0.516738562488608, + "grad_norm": 0.2096824198961258, + "learning_rate": 4.761250133405671e-05, + "loss": 1.0966, + "step": 8505 + }, + { + "epoch": 0.5167993195212346, + "grad_norm": 0.13473306596279144, + "learning_rate": 4.760293952431642e-05, + "loss": 1.0508, + "step": 8506 + }, + { + "epoch": 0.5168600765538611, + "grad_norm": 0.20115388929843903, + "learning_rate": 4.759337780244087e-05, + "loss": 1.0652, + "step": 8507 + }, + { + "epoch": 0.5169208335864877, + "grad_norm": 0.14727814495563507, + "learning_rate": 4.758381616878049e-05, + "loss": 1.1178, + "step": 8508 + }, + { + "epoch": 0.5169815906191142, + "grad_norm": 0.36924317479133606, + "learning_rate": 4.7574254623685806e-05, + "loss": 1.3663, + "step": 8509 + }, + { + "epoch": 0.5170423476517407, + "grad_norm": 0.3887315094470978, + "learning_rate": 4.756469316750729e-05, + "loss": 1.0526, + "step": 8510 + }, + { + "epoch": 0.5171031046843673, + "grad_norm": 0.2359502613544464, + "learning_rate": 4.7555131800595395e-05, + "loss": 1.0543, + "step": 8511 + }, + { + "epoch": 0.5171638617169937, + "grad_norm": 0.19352594017982483, + "learning_rate": 4.754557052330063e-05, + "loss": 1.0995, + "step": 8512 + }, + { + "epoch": 0.5172246187496202, + "grad_norm": 0.15768180787563324, + "learning_rate": 4.753600933597346e-05, + "loss": 1.0783, + "step": 8513 + }, + { + "epoch": 0.5172853757822468, + "grad_norm": 0.12654650211334229, + "learning_rate": 4.752644823896436e-05, + "loss": 1.0843, + "step": 8514 + }, + { + "epoch": 0.5173461328148733, + "grad_norm": 0.12636877596378326, + "learning_rate": 4.751688723262376e-05, + "loss": 1.0595, + "step": 8515 + }, + { + "epoch": 0.5174068898474998, + "grad_norm": 0.14403897523880005, + "learning_rate": 4.750732631730212e-05, + "loss": 1.0554, + "step": 8516 + }, + { + "epoch": 0.5174676468801264, + "grad_norm": 0.15285925567150116, + "learning_rate": 4.749776549334994e-05, + "loss": 1.1014, + "step": 8517 + }, + { + "epoch": 0.5175284039127529, + "grad_norm": 0.1536620855331421, + "learning_rate": 4.7488204761117645e-05, + "loss": 1.0784, + "step": 8518 + }, + { + "epoch": 0.5175891609453794, + "grad_norm": 0.15283985435962677, + "learning_rate": 4.747864412095567e-05, + "loss": 1.0696, + "step": 8519 + }, + { + "epoch": 0.517649917978006, + "grad_norm": 0.12930315732955933, + "learning_rate": 4.7469083573214507e-05, + "loss": 1.0957, + "step": 8520 + }, + { + "epoch": 0.5177106750106325, + "grad_norm": 0.18128079175949097, + "learning_rate": 4.745952311824458e-05, + "loss": 1.2746, + "step": 8521 + }, + { + "epoch": 0.517771432043259, + "grad_norm": 0.20560994744300842, + "learning_rate": 4.744996275639631e-05, + "loss": 1.1442, + "step": 8522 + }, + { + "epoch": 0.5178321890758856, + "grad_norm": 0.1893935650587082, + "learning_rate": 4.744040248802015e-05, + "loss": 1.1339, + "step": 8523 + }, + { + "epoch": 0.5178929461085121, + "grad_norm": 0.2803191542625427, + "learning_rate": 4.7430842313466526e-05, + "loss": 1.1397, + "step": 8524 + }, + { + "epoch": 0.5179537031411385, + "grad_norm": 0.14333286881446838, + "learning_rate": 4.742128223308587e-05, + "loss": 1.0146, + "step": 8525 + }, + { + "epoch": 0.5180144601737651, + "grad_norm": 0.11949789524078369, + "learning_rate": 4.741172224722863e-05, + "loss": 1.0436, + "step": 8526 + }, + { + "epoch": 0.5180752172063916, + "grad_norm": 0.7034077048301697, + "learning_rate": 4.740216235624521e-05, + "loss": 1.1816, + "step": 8527 + }, + { + "epoch": 0.5181359742390181, + "grad_norm": 0.12296085804700851, + "learning_rate": 4.739260256048603e-05, + "loss": 1.0484, + "step": 8528 + }, + { + "epoch": 0.5181967312716447, + "grad_norm": 0.2202768325805664, + "learning_rate": 4.7383042860301486e-05, + "loss": 1.1118, + "step": 8529 + }, + { + "epoch": 0.5182574883042712, + "grad_norm": 0.13064312934875488, + "learning_rate": 4.737348325604203e-05, + "loss": 1.0228, + "step": 8530 + }, + { + "epoch": 0.5183182453368977, + "grad_norm": 0.21044398844242096, + "learning_rate": 4.736392374805805e-05, + "loss": 1.2567, + "step": 8531 + }, + { + "epoch": 0.5183790023695243, + "grad_norm": 0.14555367827415466, + "learning_rate": 4.7354364336699944e-05, + "loss": 1.0687, + "step": 8532 + }, + { + "epoch": 0.5184397594021508, + "grad_norm": 0.17461712658405304, + "learning_rate": 4.734480502231813e-05, + "loss": 1.2131, + "step": 8533 + }, + { + "epoch": 0.5185005164347773, + "grad_norm": 0.6179195642471313, + "learning_rate": 4.733524580526302e-05, + "loss": 1.3834, + "step": 8534 + }, + { + "epoch": 0.5185612734674039, + "grad_norm": 0.15406516194343567, + "learning_rate": 4.732568668588497e-05, + "loss": 1.0398, + "step": 8535 + }, + { + "epoch": 0.5186220305000304, + "grad_norm": 0.3225206434726715, + "learning_rate": 4.7316127664534406e-05, + "loss": 1.1191, + "step": 8536 + }, + { + "epoch": 0.518682787532657, + "grad_norm": 0.16026699542999268, + "learning_rate": 4.730656874156168e-05, + "loss": 1.0895, + "step": 8537 + }, + { + "epoch": 0.5187435445652834, + "grad_norm": 0.13030384480953217, + "learning_rate": 4.729700991731721e-05, + "loss": 0.9668, + "step": 8538 + }, + { + "epoch": 0.5188043015979099, + "grad_norm": 0.9277085661888123, + "learning_rate": 4.728745119215136e-05, + "loss": 1.0626, + "step": 8539 + }, + { + "epoch": 0.5188650586305364, + "grad_norm": 0.155452698469162, + "learning_rate": 4.7277892566414505e-05, + "loss": 1.0366, + "step": 8540 + }, + { + "epoch": 0.518925815663163, + "grad_norm": 0.19903744757175446, + "learning_rate": 4.726833404045705e-05, + "loss": 1.1415, + "step": 8541 + }, + { + "epoch": 0.5189865726957895, + "grad_norm": 0.13731133937835693, + "learning_rate": 4.725877561462932e-05, + "loss": 1.1048, + "step": 8542 + }, + { + "epoch": 0.519047329728416, + "grad_norm": 0.14890733361244202, + "learning_rate": 4.72492172892817e-05, + "loss": 1.0721, + "step": 8543 + }, + { + "epoch": 0.5191080867610426, + "grad_norm": 1.7920825481414795, + "learning_rate": 4.7239659064764555e-05, + "loss": 1.1146, + "step": 8544 + }, + { + "epoch": 0.5191688437936691, + "grad_norm": 2.9327590465545654, + "learning_rate": 4.723010094142824e-05, + "loss": 1.0877, + "step": 8545 + }, + { + "epoch": 0.5192296008262957, + "grad_norm": 0.27135559916496277, + "learning_rate": 4.722054291962311e-05, + "loss": 1.1143, + "step": 8546 + }, + { + "epoch": 0.5192903578589222, + "grad_norm": 0.25059840083122253, + "learning_rate": 4.7210984999699525e-05, + "loss": 1.0972, + "step": 8547 + }, + { + "epoch": 0.5193511148915487, + "grad_norm": 0.12423308193683624, + "learning_rate": 4.720142718200781e-05, + "loss": 1.0428, + "step": 8548 + }, + { + "epoch": 0.5194118719241753, + "grad_norm": 0.22610218822956085, + "learning_rate": 4.719186946689832e-05, + "loss": 1.0909, + "step": 8549 + }, + { + "epoch": 0.5194726289568018, + "grad_norm": 0.211140975356102, + "learning_rate": 4.7182311854721405e-05, + "loss": 1.1883, + "step": 8550 + }, + { + "epoch": 0.5195333859894283, + "grad_norm": 0.13000492751598358, + "learning_rate": 4.7172754345827395e-05, + "loss": 1.0572, + "step": 8551 + }, + { + "epoch": 0.5195941430220548, + "grad_norm": 0.32262375950813293, + "learning_rate": 4.716319694056662e-05, + "loss": 1.1451, + "step": 8552 + }, + { + "epoch": 0.5196549000546813, + "grad_norm": 0.14049126207828522, + "learning_rate": 4.7153639639289405e-05, + "loss": 1.0213, + "step": 8553 + }, + { + "epoch": 0.5197156570873078, + "grad_norm": 0.23663240671157837, + "learning_rate": 4.7144082442346096e-05, + "loss": 1.1021, + "step": 8554 + }, + { + "epoch": 0.5197764141199344, + "grad_norm": 0.15460443496704102, + "learning_rate": 4.713452535008698e-05, + "loss": 1.0654, + "step": 8555 + }, + { + "epoch": 0.5198371711525609, + "grad_norm": 0.14666052162647247, + "learning_rate": 4.71249683628624e-05, + "loss": 1.1444, + "step": 8556 + }, + { + "epoch": 0.5198979281851874, + "grad_norm": 0.13057032227516174, + "learning_rate": 4.711541148102265e-05, + "loss": 1.014, + "step": 8557 + }, + { + "epoch": 0.519958685217814, + "grad_norm": 0.32210874557495117, + "learning_rate": 4.710585470491806e-05, + "loss": 1.117, + "step": 8558 + }, + { + "epoch": 0.5200194422504405, + "grad_norm": 0.28795522451400757, + "learning_rate": 4.7096298034898916e-05, + "loss": 1.0833, + "step": 8559 + }, + { + "epoch": 0.520080199283067, + "grad_norm": 0.14907464385032654, + "learning_rate": 4.7086741471315535e-05, + "loss": 1.0744, + "step": 8560 + }, + { + "epoch": 0.5201409563156936, + "grad_norm": 0.18301212787628174, + "learning_rate": 4.707718501451822e-05, + "loss": 1.1399, + "step": 8561 + }, + { + "epoch": 0.5202017133483201, + "grad_norm": 0.24117273092269897, + "learning_rate": 4.7067628664857234e-05, + "loss": 1.1392, + "step": 8562 + }, + { + "epoch": 0.5202624703809466, + "grad_norm": 0.47974660992622375, + "learning_rate": 4.705807242268289e-05, + "loss": 1.053, + "step": 8563 + }, + { + "epoch": 0.5203232274135732, + "grad_norm": 0.1926921308040619, + "learning_rate": 4.704851628834548e-05, + "loss": 1.1231, + "step": 8564 + }, + { + "epoch": 0.5203839844461996, + "grad_norm": 0.12540577352046967, + "learning_rate": 4.703896026219525e-05, + "loss": 1.0977, + "step": 8565 + }, + { + "epoch": 0.5204447414788261, + "grad_norm": 0.8470072746276855, + "learning_rate": 4.702940434458253e-05, + "loss": 1.0427, + "step": 8566 + }, + { + "epoch": 0.5205054985114527, + "grad_norm": 0.2913875877857208, + "learning_rate": 4.701984853585757e-05, + "loss": 1.1211, + "step": 8567 + }, + { + "epoch": 0.5205662555440792, + "grad_norm": 0.18532782793045044, + "learning_rate": 4.7010292836370636e-05, + "loss": 1.2509, + "step": 8568 + }, + { + "epoch": 0.5206270125767057, + "grad_norm": 0.1036534383893013, + "learning_rate": 4.7000737246471985e-05, + "loss": 1.0159, + "step": 8569 + }, + { + "epoch": 0.5206877696093323, + "grad_norm": 0.258127897977829, + "learning_rate": 4.699118176651188e-05, + "loss": 1.0991, + "step": 8570 + }, + { + "epoch": 0.5207485266419588, + "grad_norm": 0.14909648895263672, + "learning_rate": 4.69816263968406e-05, + "loss": 1.0391, + "step": 8571 + }, + { + "epoch": 0.5208092836745853, + "grad_norm": 0.18307402729988098, + "learning_rate": 4.6972071137808396e-05, + "loss": 1.0576, + "step": 8572 + }, + { + "epoch": 0.5208700407072119, + "grad_norm": 0.18096312880516052, + "learning_rate": 4.69625159897655e-05, + "loss": 1.1762, + "step": 8573 + }, + { + "epoch": 0.5209307977398384, + "grad_norm": 0.14437976479530334, + "learning_rate": 4.6952960953062194e-05, + "loss": 1.0641, + "step": 8574 + }, + { + "epoch": 0.520991554772465, + "grad_norm": 0.1318669617176056, + "learning_rate": 4.694340602804866e-05, + "loss": 1.0619, + "step": 8575 + }, + { + "epoch": 0.5210523118050915, + "grad_norm": 0.3191777765750885, + "learning_rate": 4.693385121507518e-05, + "loss": 1.1428, + "step": 8576 + }, + { + "epoch": 0.521113068837718, + "grad_norm": 0.16258472204208374, + "learning_rate": 4.6924296514491976e-05, + "loss": 1.1067, + "step": 8577 + }, + { + "epoch": 0.5211738258703444, + "grad_norm": 0.19094589352607727, + "learning_rate": 4.691474192664927e-05, + "loss": 1.1186, + "step": 8578 + }, + { + "epoch": 0.521234582902971, + "grad_norm": 0.1727590262889862, + "learning_rate": 4.6905187451897306e-05, + "loss": 1.0734, + "step": 8579 + }, + { + "epoch": 0.5212953399355975, + "grad_norm": 0.20080527663230896, + "learning_rate": 4.6895633090586287e-05, + "loss": 1.0286, + "step": 8580 + }, + { + "epoch": 0.521356096968224, + "grad_norm": 0.3128844201564789, + "learning_rate": 4.688607884306645e-05, + "loss": 1.1375, + "step": 8581 + }, + { + "epoch": 0.5214168540008506, + "grad_norm": 0.30854642391204834, + "learning_rate": 4.6876524709687995e-05, + "loss": 1.036, + "step": 8582 + }, + { + "epoch": 0.5214776110334771, + "grad_norm": 0.11264657229185104, + "learning_rate": 4.686697069080111e-05, + "loss": 1.0065, + "step": 8583 + }, + { + "epoch": 0.5215383680661037, + "grad_norm": 0.22289027273654938, + "learning_rate": 4.6857416786756036e-05, + "loss": 1.1451, + "step": 8584 + }, + { + "epoch": 0.5215991250987302, + "grad_norm": 0.15465950965881348, + "learning_rate": 4.684786299790295e-05, + "loss": 1.0474, + "step": 8585 + }, + { + "epoch": 0.5216598821313567, + "grad_norm": 0.21442024409770966, + "learning_rate": 4.683830932459205e-05, + "loss": 1.1101, + "step": 8586 + }, + { + "epoch": 0.5217206391639833, + "grad_norm": 0.16656796634197235, + "learning_rate": 4.682875576717354e-05, + "loss": 1.1451, + "step": 8587 + }, + { + "epoch": 0.5217813961966098, + "grad_norm": 0.40059763193130493, + "learning_rate": 4.681920232599761e-05, + "loss": 1.0725, + "step": 8588 + }, + { + "epoch": 0.5218421532292363, + "grad_norm": 0.24076776206493378, + "learning_rate": 4.680964900141443e-05, + "loss": 1.0504, + "step": 8589 + }, + { + "epoch": 0.5219029102618629, + "grad_norm": 0.3196810781955719, + "learning_rate": 4.680009579377418e-05, + "loss": 1.0874, + "step": 8590 + }, + { + "epoch": 0.5219636672944893, + "grad_norm": 0.1243172362446785, + "learning_rate": 4.679054270342703e-05, + "loss": 1.0737, + "step": 8591 + }, + { + "epoch": 0.5220244243271158, + "grad_norm": 0.15369655191898346, + "learning_rate": 4.6780989730723155e-05, + "loss": 1.0562, + "step": 8592 + }, + { + "epoch": 0.5220851813597424, + "grad_norm": 0.15701650083065033, + "learning_rate": 4.677143687601274e-05, + "loss": 1.0065, + "step": 8593 + }, + { + "epoch": 0.5221459383923689, + "grad_norm": 0.2327444851398468, + "learning_rate": 4.676188413964593e-05, + "loss": 1.0102, + "step": 8594 + }, + { + "epoch": 0.5222066954249954, + "grad_norm": 0.20564907789230347, + "learning_rate": 4.675233152197287e-05, + "loss": 1.0518, + "step": 8595 + }, + { + "epoch": 0.522267452457622, + "grad_norm": 0.14837133884429932, + "learning_rate": 4.6742779023343725e-05, + "loss": 1.0464, + "step": 8596 + }, + { + "epoch": 0.5223282094902485, + "grad_norm": 0.13209976255893707, + "learning_rate": 4.673322664410865e-05, + "loss": 1.0408, + "step": 8597 + }, + { + "epoch": 0.522388966522875, + "grad_norm": 0.1632174402475357, + "learning_rate": 4.672367438461778e-05, + "loss": 1.1276, + "step": 8598 + }, + { + "epoch": 0.5224497235555016, + "grad_norm": 0.247183695435524, + "learning_rate": 4.671412224522126e-05, + "loss": 1.0894, + "step": 8599 + }, + { + "epoch": 0.5225104805881281, + "grad_norm": 0.1803322732448578, + "learning_rate": 4.670457022626922e-05, + "loss": 1.0554, + "step": 8600 + }, + { + "epoch": 0.5225712376207546, + "grad_norm": 0.43978366255760193, + "learning_rate": 4.6695018328111814e-05, + "loss": 1.1711, + "step": 8601 + }, + { + "epoch": 0.5226319946533812, + "grad_norm": 0.18564847111701965, + "learning_rate": 4.668546655109914e-05, + "loss": 1.1763, + "step": 8602 + }, + { + "epoch": 0.5226927516860077, + "grad_norm": 0.16568569839000702, + "learning_rate": 4.667591489558133e-05, + "loss": 1.0638, + "step": 8603 + }, + { + "epoch": 0.5227535087186341, + "grad_norm": 0.14004436135292053, + "learning_rate": 4.666636336190849e-05, + "loss": 1.0573, + "step": 8604 + }, + { + "epoch": 0.5228142657512607, + "grad_norm": 0.1846337616443634, + "learning_rate": 4.6656811950430754e-05, + "loss": 1.044, + "step": 8605 + }, + { + "epoch": 0.5228750227838872, + "grad_norm": 0.12017328292131424, + "learning_rate": 4.664726066149822e-05, + "loss": 1.0496, + "step": 8606 + }, + { + "epoch": 0.5229357798165137, + "grad_norm": 0.24024394154548645, + "learning_rate": 4.663770949546099e-05, + "loss": 1.1732, + "step": 8607 + }, + { + "epoch": 0.5229965368491403, + "grad_norm": 0.22270292043685913, + "learning_rate": 4.66281584526692e-05, + "loss": 1.2911, + "step": 8608 + }, + { + "epoch": 0.5230572938817668, + "grad_norm": 0.15339328348636627, + "learning_rate": 4.661860753347288e-05, + "loss": 1.0564, + "step": 8609 + }, + { + "epoch": 0.5231180509143933, + "grad_norm": 0.2351030707359314, + "learning_rate": 4.660905673822217e-05, + "loss": 1.0272, + "step": 8610 + }, + { + "epoch": 0.5231788079470199, + "grad_norm": 0.2053232491016388, + "learning_rate": 4.659950606726713e-05, + "loss": 1.1677, + "step": 8611 + }, + { + "epoch": 0.5232395649796464, + "grad_norm": 0.14381182193756104, + "learning_rate": 4.658995552095785e-05, + "loss": 1.0556, + "step": 8612 + }, + { + "epoch": 0.523300322012273, + "grad_norm": 0.867560863494873, + "learning_rate": 4.658040509964441e-05, + "loss": 1.1115, + "step": 8613 + }, + { + "epoch": 0.5233610790448995, + "grad_norm": 0.22009895741939545, + "learning_rate": 4.657085480367691e-05, + "loss": 1.0878, + "step": 8614 + }, + { + "epoch": 0.523421836077526, + "grad_norm": 0.12190107256174088, + "learning_rate": 4.6561304633405365e-05, + "loss": 1.0959, + "step": 8615 + }, + { + "epoch": 0.5234825931101525, + "grad_norm": 0.17178240418434143, + "learning_rate": 4.6551754589179867e-05, + "loss": 1.0113, + "step": 8616 + }, + { + "epoch": 0.523543350142779, + "grad_norm": 0.18270833790302277, + "learning_rate": 4.654220467135047e-05, + "loss": 1.1768, + "step": 8617 + }, + { + "epoch": 0.5236041071754055, + "grad_norm": 0.21078373491764069, + "learning_rate": 4.653265488026723e-05, + "loss": 1.2156, + "step": 8618 + }, + { + "epoch": 0.523664864208032, + "grad_norm": 0.16738082468509674, + "learning_rate": 4.6523105216280196e-05, + "loss": 1.1635, + "step": 8619 + }, + { + "epoch": 0.5237256212406586, + "grad_norm": 0.15675511956214905, + "learning_rate": 4.65135556797394e-05, + "loss": 1.0399, + "step": 8620 + }, + { + "epoch": 0.5237863782732851, + "grad_norm": 0.12584108114242554, + "learning_rate": 4.6504006270994934e-05, + "loss": 1.026, + "step": 8621 + }, + { + "epoch": 0.5238471353059116, + "grad_norm": 0.13266484439373016, + "learning_rate": 4.649445699039675e-05, + "loss": 1.052, + "step": 8622 + }, + { + "epoch": 0.5239078923385382, + "grad_norm": 0.1908525973558426, + "learning_rate": 4.648490783829495e-05, + "loss": 1.0591, + "step": 8623 + }, + { + "epoch": 0.5239686493711647, + "grad_norm": 0.17659196257591248, + "learning_rate": 4.6475358815039514e-05, + "loss": 1.0764, + "step": 8624 + }, + { + "epoch": 0.5240294064037913, + "grad_norm": 0.14164158701896667, + "learning_rate": 4.6465809920980485e-05, + "loss": 1.1083, + "step": 8625 + }, + { + "epoch": 0.5240901634364178, + "grad_norm": 0.15035857260227203, + "learning_rate": 4.645626115646788e-05, + "loss": 0.9924, + "step": 8626 + }, + { + "epoch": 0.5241509204690443, + "grad_norm": 0.16510003805160522, + "learning_rate": 4.644671252185171e-05, + "loss": 1.1492, + "step": 8627 + }, + { + "epoch": 0.5242116775016709, + "grad_norm": 0.19746075570583344, + "learning_rate": 4.643716401748199e-05, + "loss": 1.068, + "step": 8628 + }, + { + "epoch": 0.5242724345342974, + "grad_norm": 0.13315987586975098, + "learning_rate": 4.64276156437087e-05, + "loss": 1.0826, + "step": 8629 + }, + { + "epoch": 0.5243331915669238, + "grad_norm": 0.1473248302936554, + "learning_rate": 4.641806740088184e-05, + "loss": 1.064, + "step": 8630 + }, + { + "epoch": 0.5243939485995504, + "grad_norm": 0.21956190466880798, + "learning_rate": 4.640851928935142e-05, + "loss": 1.0046, + "step": 8631 + }, + { + "epoch": 0.5244547056321769, + "grad_norm": 0.12775710225105286, + "learning_rate": 4.6398971309467426e-05, + "loss": 1.0448, + "step": 8632 + }, + { + "epoch": 0.5245154626648034, + "grad_norm": 0.13934054970741272, + "learning_rate": 4.638942346157982e-05, + "loss": 1.0606, + "step": 8633 + }, + { + "epoch": 0.52457621969743, + "grad_norm": 0.85642409324646, + "learning_rate": 4.6379875746038625e-05, + "loss": 1.2726, + "step": 8634 + }, + { + "epoch": 0.5246369767300565, + "grad_norm": 0.21907411515712738, + "learning_rate": 4.6370328163193754e-05, + "loss": 1.0919, + "step": 8635 + }, + { + "epoch": 0.524697733762683, + "grad_norm": 0.20063483715057373, + "learning_rate": 4.6360780713395214e-05, + "loss": 1.1787, + "step": 8636 + }, + { + "epoch": 0.5247584907953096, + "grad_norm": 0.15670740604400635, + "learning_rate": 4.6351233396992966e-05, + "loss": 1.1677, + "step": 8637 + }, + { + "epoch": 0.5248192478279361, + "grad_norm": 0.2139139324426651, + "learning_rate": 4.634168621433694e-05, + "loss": 1.0672, + "step": 8638 + }, + { + "epoch": 0.5248800048605626, + "grad_norm": 0.1647370159626007, + "learning_rate": 4.633213916577714e-05, + "loss": 1.0713, + "step": 8639 + }, + { + "epoch": 0.5249407618931892, + "grad_norm": 0.1305478811264038, + "learning_rate": 4.632259225166348e-05, + "loss": 1.033, + "step": 8640 + }, + { + "epoch": 0.5250015189258157, + "grad_norm": 0.15238302946090698, + "learning_rate": 4.631304547234592e-05, + "loss": 1.1097, + "step": 8641 + }, + { + "epoch": 0.5250622759584422, + "grad_norm": 0.20471568405628204, + "learning_rate": 4.630349882817437e-05, + "loss": 1.1534, + "step": 8642 + }, + { + "epoch": 0.5251230329910687, + "grad_norm": 0.18734177947044373, + "learning_rate": 4.62939523194988e-05, + "loss": 1.1485, + "step": 8643 + }, + { + "epoch": 0.5251837900236952, + "grad_norm": 0.14287249743938446, + "learning_rate": 4.628440594666912e-05, + "loss": 1.1792, + "step": 8644 + }, + { + "epoch": 0.5252445470563217, + "grad_norm": 0.1861511468887329, + "learning_rate": 4.627485971003527e-05, + "loss": 0.9983, + "step": 8645 + }, + { + "epoch": 0.5253053040889483, + "grad_norm": 0.1467745006084442, + "learning_rate": 4.626531360994713e-05, + "loss": 1.0541, + "step": 8646 + }, + { + "epoch": 0.5253660611215748, + "grad_norm": 0.7447397112846375, + "learning_rate": 4.625576764675466e-05, + "loss": 1.1406, + "step": 8647 + }, + { + "epoch": 0.5254268181542013, + "grad_norm": 0.20171573758125305, + "learning_rate": 4.624622182080776e-05, + "loss": 1.0727, + "step": 8648 + }, + { + "epoch": 0.5254875751868279, + "grad_norm": 0.24454587697982788, + "learning_rate": 4.623667613245632e-05, + "loss": 1.097, + "step": 8649 + }, + { + "epoch": 0.5255483322194544, + "grad_norm": 0.33716270327568054, + "learning_rate": 4.622713058205023e-05, + "loss": 1.0666, + "step": 8650 + }, + { + "epoch": 0.5256090892520809, + "grad_norm": 3.97310733795166, + "learning_rate": 4.62175851699394e-05, + "loss": 1.0461, + "step": 8651 + }, + { + "epoch": 0.5256698462847075, + "grad_norm": 0.1582850217819214, + "learning_rate": 4.620803989647373e-05, + "loss": 1.1161, + "step": 8652 + }, + { + "epoch": 0.525730603317334, + "grad_norm": 0.2617885172367096, + "learning_rate": 4.6198494762003076e-05, + "loss": 1.0628, + "step": 8653 + }, + { + "epoch": 0.5257913603499605, + "grad_norm": 0.17769746482372284, + "learning_rate": 4.618894976687733e-05, + "loss": 1.1235, + "step": 8654 + }, + { + "epoch": 0.5258521173825871, + "grad_norm": 0.33753904700279236, + "learning_rate": 4.6179404911446386e-05, + "loss": 1.1388, + "step": 8655 + }, + { + "epoch": 0.5259128744152136, + "grad_norm": 0.18174488842487335, + "learning_rate": 4.6169860196060084e-05, + "loss": 1.1416, + "step": 8656 + }, + { + "epoch": 0.52597363144784, + "grad_norm": 0.1649746149778366, + "learning_rate": 4.6160315621068294e-05, + "loss": 1.0226, + "step": 8657 + }, + { + "epoch": 0.5260343884804666, + "grad_norm": 0.1672050803899765, + "learning_rate": 4.615077118682087e-05, + "loss": 1.0787, + "step": 8658 + }, + { + "epoch": 0.5260951455130931, + "grad_norm": 0.17580434679985046, + "learning_rate": 4.6141226893667684e-05, + "loss": 1.0415, + "step": 8659 + }, + { + "epoch": 0.5261559025457196, + "grad_norm": 0.5573168396949768, + "learning_rate": 4.6131682741958566e-05, + "loss": 1.092, + "step": 8660 + }, + { + "epoch": 0.5262166595783462, + "grad_norm": 0.3986499607563019, + "learning_rate": 4.612213873204338e-05, + "loss": 1.1489, + "step": 8661 + }, + { + "epoch": 0.5262774166109727, + "grad_norm": 0.21384546160697937, + "learning_rate": 4.611259486427194e-05, + "loss": 1.098, + "step": 8662 + }, + { + "epoch": 0.5263381736435992, + "grad_norm": 0.17524658143520355, + "learning_rate": 4.610305113899407e-05, + "loss": 1.0754, + "step": 8663 + }, + { + "epoch": 0.5263989306762258, + "grad_norm": 0.1727045774459839, + "learning_rate": 4.609350755655963e-05, + "loss": 1.0928, + "step": 8664 + }, + { + "epoch": 0.5264596877088523, + "grad_norm": 0.1485423445701599, + "learning_rate": 4.608396411731842e-05, + "loss": 1.0425, + "step": 8665 + }, + { + "epoch": 0.5265204447414789, + "grad_norm": 0.2882673442363739, + "learning_rate": 4.607442082162026e-05, + "loss": 1.2019, + "step": 8666 + }, + { + "epoch": 0.5265812017741054, + "grad_norm": 0.1373392641544342, + "learning_rate": 4.606487766981496e-05, + "loss": 1.0457, + "step": 8667 + }, + { + "epoch": 0.5266419588067319, + "grad_norm": 0.5117956399917603, + "learning_rate": 4.605533466225235e-05, + "loss": 1.2386, + "step": 8668 + }, + { + "epoch": 0.5267027158393585, + "grad_norm": 0.7526136636734009, + "learning_rate": 4.60457917992822e-05, + "loss": 1.1093, + "step": 8669 + }, + { + "epoch": 0.5267634728719849, + "grad_norm": 0.31235063076019287, + "learning_rate": 4.603624908125432e-05, + "loss": 1.1961, + "step": 8670 + }, + { + "epoch": 0.5268242299046114, + "grad_norm": 0.4177379906177521, + "learning_rate": 4.6026706508518474e-05, + "loss": 1.1936, + "step": 8671 + }, + { + "epoch": 0.526884986937238, + "grad_norm": 0.15043529868125916, + "learning_rate": 4.601716408142448e-05, + "loss": 1.0563, + "step": 8672 + }, + { + "epoch": 0.5269457439698645, + "grad_norm": 0.17881150543689728, + "learning_rate": 4.600762180032212e-05, + "loss": 1.0621, + "step": 8673 + }, + { + "epoch": 0.527006501002491, + "grad_norm": 0.2779545783996582, + "learning_rate": 4.599807966556113e-05, + "loss": 1.0724, + "step": 8674 + }, + { + "epoch": 0.5270672580351176, + "grad_norm": 1.8141210079193115, + "learning_rate": 4.598853767749135e-05, + "loss": 1.0501, + "step": 8675 + }, + { + "epoch": 0.5271280150677441, + "grad_norm": 0.20107562839984894, + "learning_rate": 4.597899583646245e-05, + "loss": 1.0175, + "step": 8676 + }, + { + "epoch": 0.5271887721003706, + "grad_norm": 0.19702781736850739, + "learning_rate": 4.596945414282425e-05, + "loss": 1.1646, + "step": 8677 + }, + { + "epoch": 0.5272495291329972, + "grad_norm": 0.3360294997692108, + "learning_rate": 4.59599125969265e-05, + "loss": 1.2875, + "step": 8678 + }, + { + "epoch": 0.5273102861656237, + "grad_norm": 0.2003525346517563, + "learning_rate": 4.5950371199118906e-05, + "loss": 1.0849, + "step": 8679 + }, + { + "epoch": 0.5273710431982502, + "grad_norm": 0.1472960114479065, + "learning_rate": 4.5940829949751255e-05, + "loss": 0.9855, + "step": 8680 + }, + { + "epoch": 0.5274318002308768, + "grad_norm": 0.19163544476032257, + "learning_rate": 4.593128884917328e-05, + "loss": 1.0969, + "step": 8681 + }, + { + "epoch": 0.5274925572635033, + "grad_norm": 0.16724400222301483, + "learning_rate": 4.592174789773469e-05, + "loss": 1.0859, + "step": 8682 + }, + { + "epoch": 0.5275533142961297, + "grad_norm": 15.174552917480469, + "learning_rate": 4.591220709578521e-05, + "loss": 1.0845, + "step": 8683 + }, + { + "epoch": 0.5276140713287563, + "grad_norm": 0.14118710160255432, + "learning_rate": 4.590266644367457e-05, + "loss": 1.0497, + "step": 8684 + }, + { + "epoch": 0.5276748283613828, + "grad_norm": 0.2985968291759491, + "learning_rate": 4.589312594175249e-05, + "loss": 1.1774, + "step": 8685 + }, + { + "epoch": 0.5277355853940093, + "grad_norm": 0.4103105962276459, + "learning_rate": 4.588358559036867e-05, + "loss": 1.1844, + "step": 8686 + }, + { + "epoch": 0.5277963424266359, + "grad_norm": 0.417360782623291, + "learning_rate": 4.5874045389872804e-05, + "loss": 1.2113, + "step": 8687 + }, + { + "epoch": 0.5278570994592624, + "grad_norm": 0.24194343388080597, + "learning_rate": 4.5864505340614636e-05, + "loss": 1.3424, + "step": 8688 + }, + { + "epoch": 0.5279178564918889, + "grad_norm": 0.1654786765575409, + "learning_rate": 4.5854965442943796e-05, + "loss": 1.088, + "step": 8689 + }, + { + "epoch": 0.5279786135245155, + "grad_norm": 0.1660112887620926, + "learning_rate": 4.5845425697210005e-05, + "loss": 1.0636, + "step": 8690 + }, + { + "epoch": 0.528039370557142, + "grad_norm": 1.0987192392349243, + "learning_rate": 4.583588610376294e-05, + "loss": 1.2099, + "step": 8691 + }, + { + "epoch": 0.5281001275897685, + "grad_norm": 1.8631782531738281, + "learning_rate": 4.582634666295226e-05, + "loss": 1.1296, + "step": 8692 + }, + { + "epoch": 0.5281608846223951, + "grad_norm": 0.24905526638031006, + "learning_rate": 4.581680737512767e-05, + "loss": 1.0537, + "step": 8693 + }, + { + "epoch": 0.5282216416550216, + "grad_norm": 0.2177335023880005, + "learning_rate": 4.58072682406388e-05, + "loss": 1.0183, + "step": 8694 + }, + { + "epoch": 0.5282823986876481, + "grad_norm": 0.26563701033592224, + "learning_rate": 4.579772925983535e-05, + "loss": 1.1127, + "step": 8695 + }, + { + "epoch": 0.5283431557202746, + "grad_norm": 0.4449535310268402, + "learning_rate": 4.578819043306692e-05, + "loss": 1.144, + "step": 8696 + }, + { + "epoch": 0.5284039127529011, + "grad_norm": 0.22405296564102173, + "learning_rate": 4.5778651760683185e-05, + "loss": 1.0336, + "step": 8697 + }, + { + "epoch": 0.5284646697855276, + "grad_norm": 0.5492637157440186, + "learning_rate": 4.576911324303379e-05, + "loss": 1.2332, + "step": 8698 + }, + { + "epoch": 0.5285254268181542, + "grad_norm": 0.6914724111557007, + "learning_rate": 4.575957488046837e-05, + "loss": 1.1727, + "step": 8699 + }, + { + "epoch": 0.5285861838507807, + "grad_norm": 0.25794705748558044, + "learning_rate": 4.5750036673336535e-05, + "loss": 1.0396, + "step": 8700 + }, + { + "epoch": 0.5286469408834072, + "grad_norm": 0.2574668824672699, + "learning_rate": 4.574049862198797e-05, + "loss": 1.0947, + "step": 8701 + }, + { + "epoch": 0.5287076979160338, + "grad_norm": 0.4927690625190735, + "learning_rate": 4.573096072677221e-05, + "loss": 1.0905, + "step": 8702 + }, + { + "epoch": 0.5287684549486603, + "grad_norm": 0.25256097316741943, + "learning_rate": 4.572142298803893e-05, + "loss": 1.017, + "step": 8703 + }, + { + "epoch": 0.5288292119812869, + "grad_norm": 0.25896692276000977, + "learning_rate": 4.571188540613771e-05, + "loss": 1.1333, + "step": 8704 + }, + { + "epoch": 0.5288899690139134, + "grad_norm": 0.2356635481119156, + "learning_rate": 4.5702347981418146e-05, + "loss": 1.118, + "step": 8705 + }, + { + "epoch": 0.5289507260465399, + "grad_norm": 0.27731621265411377, + "learning_rate": 4.569281071422987e-05, + "loss": 1.094, + "step": 8706 + }, + { + "epoch": 0.5290114830791665, + "grad_norm": 0.23638027906417847, + "learning_rate": 4.568327360492244e-05, + "loss": 1.1346, + "step": 8707 + }, + { + "epoch": 0.529072240111793, + "grad_norm": 0.1733100712299347, + "learning_rate": 4.567373665384546e-05, + "loss": 1.0479, + "step": 8708 + }, + { + "epoch": 0.5291329971444194, + "grad_norm": 0.18118803203105927, + "learning_rate": 4.5664199861348495e-05, + "loss": 1.0569, + "step": 8709 + }, + { + "epoch": 0.529193754177046, + "grad_norm": 0.20814651250839233, + "learning_rate": 4.5654663227781106e-05, + "loss": 1.0691, + "step": 8710 + }, + { + "epoch": 0.5292545112096725, + "grad_norm": 0.27674707770347595, + "learning_rate": 4.564512675349289e-05, + "loss": 1.0877, + "step": 8711 + }, + { + "epoch": 0.529315268242299, + "grad_norm": 0.2000836879014969, + "learning_rate": 4.563559043883339e-05, + "loss": 1.1641, + "step": 8712 + }, + { + "epoch": 0.5293760252749256, + "grad_norm": 0.30237486958503723, + "learning_rate": 4.5626054284152165e-05, + "loss": 1.187, + "step": 8713 + }, + { + "epoch": 0.5294367823075521, + "grad_norm": 0.21813370287418365, + "learning_rate": 4.561651828979877e-05, + "loss": 1.1632, + "step": 8714 + }, + { + "epoch": 0.5294975393401786, + "grad_norm": 0.13433311879634857, + "learning_rate": 4.560698245612275e-05, + "loss": 1.0518, + "step": 8715 + }, + { + "epoch": 0.5295582963728052, + "grad_norm": 0.14734883606433868, + "learning_rate": 4.559744678347363e-05, + "loss": 1.0382, + "step": 8716 + }, + { + "epoch": 0.5296190534054317, + "grad_norm": 0.11894255131483078, + "learning_rate": 4.558791127220096e-05, + "loss": 1.007, + "step": 8717 + }, + { + "epoch": 0.5296798104380582, + "grad_norm": 0.7833021879196167, + "learning_rate": 4.5578375922654235e-05, + "loss": 1.109, + "step": 8718 + }, + { + "epoch": 0.5297405674706848, + "grad_norm": 0.14893975853919983, + "learning_rate": 4.5568840735183005e-05, + "loss": 1.0708, + "step": 8719 + }, + { + "epoch": 0.5298013245033113, + "grad_norm": 0.139544278383255, + "learning_rate": 4.555930571013678e-05, + "loss": 1.1002, + "step": 8720 + }, + { + "epoch": 0.5298620815359378, + "grad_norm": 0.19169317185878754, + "learning_rate": 4.554977084786507e-05, + "loss": 1.0939, + "step": 8721 + }, + { + "epoch": 0.5299228385685643, + "grad_norm": 0.14398300647735596, + "learning_rate": 4.5540236148717356e-05, + "loss": 1.0891, + "step": 8722 + }, + { + "epoch": 0.5299835956011908, + "grad_norm": 0.1250181943178177, + "learning_rate": 4.553070161304314e-05, + "loss": 1.0149, + "step": 8723 + }, + { + "epoch": 0.5300443526338173, + "grad_norm": 0.28524330258369446, + "learning_rate": 4.552116724119193e-05, + "loss": 1.0837, + "step": 8724 + }, + { + "epoch": 0.5301051096664439, + "grad_norm": 0.13696500658988953, + "learning_rate": 4.55116330335132e-05, + "loss": 1.045, + "step": 8725 + }, + { + "epoch": 0.5301658666990704, + "grad_norm": 0.1360851377248764, + "learning_rate": 4.5502098990356425e-05, + "loss": 1.0537, + "step": 8726 + }, + { + "epoch": 0.5302266237316969, + "grad_norm": 0.1495683640241623, + "learning_rate": 4.549256511207108e-05, + "loss": 1.0636, + "step": 8727 + }, + { + "epoch": 0.5302873807643235, + "grad_norm": 0.15629592537879944, + "learning_rate": 4.548303139900666e-05, + "loss": 1.0077, + "step": 8728 + }, + { + "epoch": 0.53034813779695, + "grad_norm": 0.14614929258823395, + "learning_rate": 4.547349785151257e-05, + "loss": 1.0294, + "step": 8729 + }, + { + "epoch": 0.5304088948295765, + "grad_norm": 0.1470760852098465, + "learning_rate": 4.546396446993829e-05, + "loss": 1.0315, + "step": 8730 + }, + { + "epoch": 0.5304696518622031, + "grad_norm": 0.15036186575889587, + "learning_rate": 4.545443125463327e-05, + "loss": 1.0804, + "step": 8731 + }, + { + "epoch": 0.5305304088948296, + "grad_norm": 0.5035548806190491, + "learning_rate": 4.544489820594696e-05, + "loss": 1.1199, + "step": 8732 + }, + { + "epoch": 0.5305911659274561, + "grad_norm": 0.23608504235744476, + "learning_rate": 4.543536532422878e-05, + "loss": 1.158, + "step": 8733 + }, + { + "epoch": 0.5306519229600827, + "grad_norm": 0.2268620878458023, + "learning_rate": 4.5425832609828156e-05, + "loss": 1.1468, + "step": 8734 + }, + { + "epoch": 0.5307126799927091, + "grad_norm": 0.14344771206378937, + "learning_rate": 4.541630006309456e-05, + "loss": 1.0013, + "step": 8735 + }, + { + "epoch": 0.5307734370253356, + "grad_norm": 0.1313079297542572, + "learning_rate": 4.5406767684377336e-05, + "loss": 1.0578, + "step": 8736 + }, + { + "epoch": 0.5308341940579622, + "grad_norm": 0.17482100427150726, + "learning_rate": 4.539723547402594e-05, + "loss": 0.9898, + "step": 8737 + }, + { + "epoch": 0.5308949510905887, + "grad_norm": 0.1623125970363617, + "learning_rate": 4.5387703432389775e-05, + "loss": 1.1636, + "step": 8738 + }, + { + "epoch": 0.5309557081232152, + "grad_norm": 0.1889430284500122, + "learning_rate": 4.537817155981821e-05, + "loss": 1.1606, + "step": 8739 + }, + { + "epoch": 0.5310164651558418, + "grad_norm": 0.20790928602218628, + "learning_rate": 4.5368639856660674e-05, + "loss": 1.2408, + "step": 8740 + }, + { + "epoch": 0.5310772221884683, + "grad_norm": 0.1391948163509369, + "learning_rate": 4.535910832326654e-05, + "loss": 1.1023, + "step": 8741 + }, + { + "epoch": 0.5311379792210948, + "grad_norm": 0.13725924491882324, + "learning_rate": 4.53495769599852e-05, + "loss": 1.055, + "step": 8742 + }, + { + "epoch": 0.5311987362537214, + "grad_norm": 0.19517844915390015, + "learning_rate": 4.534004576716599e-05, + "loss": 1.1158, + "step": 8743 + }, + { + "epoch": 0.5312594932863479, + "grad_norm": 5.072870254516602, + "learning_rate": 4.5330514745158324e-05, + "loss": 1.1402, + "step": 8744 + }, + { + "epoch": 0.5313202503189745, + "grad_norm": 0.24879388511180878, + "learning_rate": 4.532098389431154e-05, + "loss": 1.1199, + "step": 8745 + }, + { + "epoch": 0.531381007351601, + "grad_norm": 0.17267988622188568, + "learning_rate": 4.531145321497498e-05, + "loss": 1.0712, + "step": 8746 + }, + { + "epoch": 0.5314417643842275, + "grad_norm": 0.14354780316352844, + "learning_rate": 4.530192270749801e-05, + "loss": 1.0917, + "step": 8747 + }, + { + "epoch": 0.531502521416854, + "grad_norm": 0.12416389584541321, + "learning_rate": 4.5292392372229996e-05, + "loss": 1.0396, + "step": 8748 + }, + { + "epoch": 0.5315632784494805, + "grad_norm": 0.17430990934371948, + "learning_rate": 4.5282862209520236e-05, + "loss": 1.0602, + "step": 8749 + }, + { + "epoch": 0.531624035482107, + "grad_norm": 0.14102205634117126, + "learning_rate": 4.5273332219718076e-05, + "loss": 1.0283, + "step": 8750 + }, + { + "epoch": 0.5316847925147336, + "grad_norm": 0.1749001443386078, + "learning_rate": 4.526380240317282e-05, + "loss": 1.1014, + "step": 8751 + }, + { + "epoch": 0.5317455495473601, + "grad_norm": 0.17561006546020508, + "learning_rate": 4.525427276023382e-05, + "loss": 1.1112, + "step": 8752 + }, + { + "epoch": 0.5318063065799866, + "grad_norm": 0.15018552541732788, + "learning_rate": 4.524474329125036e-05, + "loss": 1.0339, + "step": 8753 + }, + { + "epoch": 0.5318670636126132, + "grad_norm": 0.17170754075050354, + "learning_rate": 4.523521399657176e-05, + "loss": 1.0689, + "step": 8754 + }, + { + "epoch": 0.5319278206452397, + "grad_norm": 0.19590263068675995, + "learning_rate": 4.522568487654733e-05, + "loss": 1.2279, + "step": 8755 + }, + { + "epoch": 0.5319885776778662, + "grad_norm": 0.18176360428333282, + "learning_rate": 4.5216155931526306e-05, + "loss": 1.041, + "step": 8756 + }, + { + "epoch": 0.5320493347104928, + "grad_norm": 0.1707795262336731, + "learning_rate": 4.520662716185803e-05, + "loss": 1.0765, + "step": 8757 + }, + { + "epoch": 0.5321100917431193, + "grad_norm": 0.1737699806690216, + "learning_rate": 4.519709856789176e-05, + "loss": 1.1436, + "step": 8758 + }, + { + "epoch": 0.5321708487757458, + "grad_norm": 0.20430248975753784, + "learning_rate": 4.5187570149976764e-05, + "loss": 1.1429, + "step": 8759 + }, + { + "epoch": 0.5322316058083724, + "grad_norm": 0.14655305445194244, + "learning_rate": 4.517804190846232e-05, + "loss": 1.0815, + "step": 8760 + }, + { + "epoch": 0.5322923628409989, + "grad_norm": 0.12511833012104034, + "learning_rate": 4.5168513843697686e-05, + "loss": 1.0549, + "step": 8761 + }, + { + "epoch": 0.5323531198736253, + "grad_norm": 0.1219484955072403, + "learning_rate": 4.5158985956032127e-05, + "loss": 1.0129, + "step": 8762 + }, + { + "epoch": 0.5324138769062519, + "grad_norm": 0.5499448180198669, + "learning_rate": 4.5149458245814854e-05, + "loss": 1.1436, + "step": 8763 + }, + { + "epoch": 0.5324746339388784, + "grad_norm": 0.1544366329908371, + "learning_rate": 4.513993071339511e-05, + "loss": 1.1108, + "step": 8764 + }, + { + "epoch": 0.5325353909715049, + "grad_norm": 0.14449705183506012, + "learning_rate": 4.513040335912217e-05, + "loss": 1.0283, + "step": 8765 + }, + { + "epoch": 0.5325961480041315, + "grad_norm": 0.1301535815000534, + "learning_rate": 4.5120876183345226e-05, + "loss": 1.0468, + "step": 8766 + }, + { + "epoch": 0.532656905036758, + "grad_norm": 0.25364330410957336, + "learning_rate": 4.5111349186413504e-05, + "loss": 1.1584, + "step": 8767 + }, + { + "epoch": 0.5327176620693845, + "grad_norm": 0.14642125368118286, + "learning_rate": 4.510182236867625e-05, + "loss": 1.0952, + "step": 8768 + }, + { + "epoch": 0.5327784191020111, + "grad_norm": 0.21068856120109558, + "learning_rate": 4.509229573048261e-05, + "loss": 1.0406, + "step": 8769 + }, + { + "epoch": 0.5328391761346376, + "grad_norm": 0.11859681457281113, + "learning_rate": 4.5082769272181844e-05, + "loss": 1.0723, + "step": 8770 + }, + { + "epoch": 0.5328999331672641, + "grad_norm": 0.12620341777801514, + "learning_rate": 4.507324299412311e-05, + "loss": 1.0329, + "step": 8771 + }, + { + "epoch": 0.5329606901998907, + "grad_norm": 0.2836582064628601, + "learning_rate": 4.50637168966556e-05, + "loss": 1.0677, + "step": 8772 + }, + { + "epoch": 0.5330214472325172, + "grad_norm": 2.033245086669922, + "learning_rate": 4.5054190980128494e-05, + "loss": 1.1083, + "step": 8773 + }, + { + "epoch": 0.5330822042651437, + "grad_norm": 0.1591397076845169, + "learning_rate": 4.504466524489099e-05, + "loss": 1.0262, + "step": 8774 + }, + { + "epoch": 0.5331429612977702, + "grad_norm": 0.18666645884513855, + "learning_rate": 4.503513969129225e-05, + "loss": 1.133, + "step": 8775 + }, + { + "epoch": 0.5332037183303967, + "grad_norm": 0.20969098806381226, + "learning_rate": 4.502561431968141e-05, + "loss": 1.0419, + "step": 8776 + }, + { + "epoch": 0.5332644753630232, + "grad_norm": 0.19253887236118317, + "learning_rate": 4.501608913040763e-05, + "loss": 1.043, + "step": 8777 + }, + { + "epoch": 0.5333252323956498, + "grad_norm": 0.15176169574260712, + "learning_rate": 4.5006564123820074e-05, + "loss": 1.0511, + "step": 8778 + }, + { + "epoch": 0.5333859894282763, + "grad_norm": 2.6306838989257812, + "learning_rate": 4.499703930026788e-05, + "loss": 1.0436, + "step": 8779 + }, + { + "epoch": 0.5334467464609028, + "grad_norm": 0.22063086926937103, + "learning_rate": 4.498751466010017e-05, + "loss": 1.0458, + "step": 8780 + }, + { + "epoch": 0.5335075034935294, + "grad_norm": 0.17193427681922913, + "learning_rate": 4.497799020366608e-05, + "loss": 1.1182, + "step": 8781 + }, + { + "epoch": 0.5335682605261559, + "grad_norm": 0.8149873614311218, + "learning_rate": 4.496846593131474e-05, + "loss": 1.0971, + "step": 8782 + }, + { + "epoch": 0.5336290175587824, + "grad_norm": 0.16165894269943237, + "learning_rate": 4.495894184339525e-05, + "loss": 1.036, + "step": 8783 + }, + { + "epoch": 0.533689774591409, + "grad_norm": 0.5210099816322327, + "learning_rate": 4.494941794025672e-05, + "loss": 1.0178, + "step": 8784 + }, + { + "epoch": 0.5337505316240355, + "grad_norm": 0.11411449313163757, + "learning_rate": 4.493989422224824e-05, + "loss": 1.0394, + "step": 8785 + }, + { + "epoch": 0.533811288656662, + "grad_norm": 0.13185474276542664, + "learning_rate": 4.493037068971892e-05, + "loss": 1.0346, + "step": 8786 + }, + { + "epoch": 0.5338720456892886, + "grad_norm": 1.2379601001739502, + "learning_rate": 4.492084734301785e-05, + "loss": 1.1314, + "step": 8787 + }, + { + "epoch": 0.533932802721915, + "grad_norm": 0.18772807717323303, + "learning_rate": 4.491132418249411e-05, + "loss": 1.0133, + "step": 8788 + }, + { + "epoch": 0.5339935597545415, + "grad_norm": 0.22782091796398163, + "learning_rate": 4.490180120849676e-05, + "loss": 1.172, + "step": 8789 + }, + { + "epoch": 0.5340543167871681, + "grad_norm": 4.431173801422119, + "learning_rate": 4.489227842137485e-05, + "loss": 1.1203, + "step": 8790 + }, + { + "epoch": 0.5341150738197946, + "grad_norm": 0.16473138332366943, + "learning_rate": 4.488275582147749e-05, + "loss": 1.0887, + "step": 8791 + }, + { + "epoch": 0.5341758308524212, + "grad_norm": 0.17537109553813934, + "learning_rate": 4.4873233409153695e-05, + "loss": 1.093, + "step": 8792 + }, + { + "epoch": 0.5342365878850477, + "grad_norm": 0.13717277348041534, + "learning_rate": 4.486371118475251e-05, + "loss": 1.0312, + "step": 8793 + }, + { + "epoch": 0.5342973449176742, + "grad_norm": 0.14005355536937714, + "learning_rate": 4.4854189148622994e-05, + "loss": 1.045, + "step": 8794 + }, + { + "epoch": 0.5343581019503008, + "grad_norm": 0.14777666330337524, + "learning_rate": 4.484466730111418e-05, + "loss": 1.1127, + "step": 8795 + }, + { + "epoch": 0.5344188589829273, + "grad_norm": 0.1594071239233017, + "learning_rate": 4.4835145642575074e-05, + "loss": 1.0486, + "step": 8796 + }, + { + "epoch": 0.5344796160155538, + "grad_norm": 0.7868282794952393, + "learning_rate": 4.48256241733547e-05, + "loss": 1.0866, + "step": 8797 + }, + { + "epoch": 0.5345403730481804, + "grad_norm": 0.1698208898305893, + "learning_rate": 4.481610289380206e-05, + "loss": 1.1494, + "step": 8798 + }, + { + "epoch": 0.5346011300808069, + "grad_norm": 2.915067195892334, + "learning_rate": 4.4806581804266186e-05, + "loss": 1.1736, + "step": 8799 + }, + { + "epoch": 0.5346618871134334, + "grad_norm": 0.19487714767456055, + "learning_rate": 4.4797060905096055e-05, + "loss": 1.1308, + "step": 8800 + }, + { + "epoch": 0.5347226441460599, + "grad_norm": 0.42165955901145935, + "learning_rate": 4.478754019664065e-05, + "loss": 1.114, + "step": 8801 + }, + { + "epoch": 0.5347834011786864, + "grad_norm": 0.13892394304275513, + "learning_rate": 4.4778019679249004e-05, + "loss": 1.1049, + "step": 8802 + }, + { + "epoch": 0.5348441582113129, + "grad_norm": 0.17071881890296936, + "learning_rate": 4.4768499353270026e-05, + "loss": 1.1136, + "step": 8803 + }, + { + "epoch": 0.5349049152439395, + "grad_norm": 0.12368696182966232, + "learning_rate": 4.475897921905272e-05, + "loss": 1.0697, + "step": 8804 + }, + { + "epoch": 0.534965672276566, + "grad_norm": 0.12244225293397903, + "learning_rate": 4.474945927694605e-05, + "loss": 1.0742, + "step": 8805 + }, + { + "epoch": 0.5350264293091925, + "grad_norm": 0.11974979192018509, + "learning_rate": 4.473993952729895e-05, + "loss": 1.066, + "step": 8806 + }, + { + "epoch": 0.5350871863418191, + "grad_norm": 0.11680539697408676, + "learning_rate": 4.47304199704604e-05, + "loss": 1.0343, + "step": 8807 + }, + { + "epoch": 0.5351479433744456, + "grad_norm": 0.1846323311328888, + "learning_rate": 4.472090060677932e-05, + "loss": 1.0626, + "step": 8808 + }, + { + "epoch": 0.5352087004070721, + "grad_norm": 1.0687099695205688, + "learning_rate": 4.471138143660465e-05, + "loss": 1.0661, + "step": 8809 + }, + { + "epoch": 0.5352694574396987, + "grad_norm": 0.12311764806509018, + "learning_rate": 4.470186246028532e-05, + "loss": 1.0031, + "step": 8810 + }, + { + "epoch": 0.5353302144723252, + "grad_norm": 0.3085007071495056, + "learning_rate": 4.469234367817023e-05, + "loss": 1.1088, + "step": 8811 + }, + { + "epoch": 0.5353909715049517, + "grad_norm": 0.17489825189113617, + "learning_rate": 4.4682825090608314e-05, + "loss": 1.1509, + "step": 8812 + }, + { + "epoch": 0.5354517285375783, + "grad_norm": 0.7380678653717041, + "learning_rate": 4.467330669794848e-05, + "loss": 1.0371, + "step": 8813 + }, + { + "epoch": 0.5355124855702047, + "grad_norm": 0.15315717458724976, + "learning_rate": 4.4663788500539603e-05, + "loss": 1.103, + "step": 8814 + }, + { + "epoch": 0.5355732426028312, + "grad_norm": 0.21286073327064514, + "learning_rate": 4.465427049873062e-05, + "loss": 1.0372, + "step": 8815 + }, + { + "epoch": 0.5356339996354578, + "grad_norm": 0.47610294818878174, + "learning_rate": 4.464475269287036e-05, + "loss": 1.0699, + "step": 8816 + }, + { + "epoch": 0.5356947566680843, + "grad_norm": 0.1400757133960724, + "learning_rate": 4.463523508330773e-05, + "loss": 1.0679, + "step": 8817 + }, + { + "epoch": 0.5357555137007108, + "grad_norm": 0.2324218899011612, + "learning_rate": 4.46257176703916e-05, + "loss": 1.0718, + "step": 8818 + }, + { + "epoch": 0.5358162707333374, + "grad_norm": 0.17135950922966003, + "learning_rate": 4.461620045447081e-05, + "loss": 1.1083, + "step": 8819 + }, + { + "epoch": 0.5358770277659639, + "grad_norm": 0.2737594246864319, + "learning_rate": 4.4606683435894255e-05, + "loss": 1.1433, + "step": 8820 + }, + { + "epoch": 0.5359377847985904, + "grad_norm": 0.22625966370105743, + "learning_rate": 4.459716661501075e-05, + "loss": 1.0643, + "step": 8821 + }, + { + "epoch": 0.535998541831217, + "grad_norm": 0.3096715211868286, + "learning_rate": 4.458764999216916e-05, + "loss": 1.1407, + "step": 8822 + }, + { + "epoch": 0.5360592988638435, + "grad_norm": 0.20227383077144623, + "learning_rate": 4.45781335677183e-05, + "loss": 1.1686, + "step": 8823 + }, + { + "epoch": 0.53612005589647, + "grad_norm": 0.15743187069892883, + "learning_rate": 4.4568617342006986e-05, + "loss": 1.1052, + "step": 8824 + }, + { + "epoch": 0.5361808129290966, + "grad_norm": 0.16367174685001373, + "learning_rate": 4.4559101315384066e-05, + "loss": 1.0887, + "step": 8825 + }, + { + "epoch": 0.5362415699617231, + "grad_norm": 1.1347402334213257, + "learning_rate": 4.454958548819834e-05, + "loss": 1.135, + "step": 8826 + }, + { + "epoch": 0.5363023269943495, + "grad_norm": 0.2929964065551758, + "learning_rate": 4.45400698607986e-05, + "loss": 1.2995, + "step": 8827 + }, + { + "epoch": 0.5363630840269761, + "grad_norm": 0.15751983225345612, + "learning_rate": 4.4530554433533666e-05, + "loss": 1.1234, + "step": 8828 + }, + { + "epoch": 0.5364238410596026, + "grad_norm": 0.25171127915382385, + "learning_rate": 4.452103920675232e-05, + "loss": 1.1662, + "step": 8829 + }, + { + "epoch": 0.5364845980922291, + "grad_norm": 0.20062485337257385, + "learning_rate": 4.4511524180803345e-05, + "loss": 1.1854, + "step": 8830 + }, + { + "epoch": 0.5365453551248557, + "grad_norm": 0.15805010497570038, + "learning_rate": 4.45020093560355e-05, + "loss": 1.1129, + "step": 8831 + }, + { + "epoch": 0.5366061121574822, + "grad_norm": 0.17778664827346802, + "learning_rate": 4.4492494732797556e-05, + "loss": 1.0276, + "step": 8832 + }, + { + "epoch": 0.5366668691901088, + "grad_norm": 0.15899530053138733, + "learning_rate": 4.44829803114383e-05, + "loss": 1.1698, + "step": 8833 + }, + { + "epoch": 0.5367276262227353, + "grad_norm": 0.16488373279571533, + "learning_rate": 4.447346609230646e-05, + "loss": 1.0858, + "step": 8834 + }, + { + "epoch": 0.5367883832553618, + "grad_norm": 0.12887528538703918, + "learning_rate": 4.44639520757508e-05, + "loss": 1.0697, + "step": 8835 + }, + { + "epoch": 0.5368491402879884, + "grad_norm": 0.21981944143772125, + "learning_rate": 4.445443826212003e-05, + "loss": 1.1858, + "step": 8836 + }, + { + "epoch": 0.5369098973206149, + "grad_norm": 0.29560816287994385, + "learning_rate": 4.44449246517629e-05, + "loss": 1.1876, + "step": 8837 + }, + { + "epoch": 0.5369706543532414, + "grad_norm": 0.14979951083660126, + "learning_rate": 4.4435411245028134e-05, + "loss": 1.0613, + "step": 8838 + }, + { + "epoch": 0.537031411385868, + "grad_norm": 1.7466373443603516, + "learning_rate": 4.442589804226445e-05, + "loss": 1.0611, + "step": 8839 + }, + { + "epoch": 0.5370921684184944, + "grad_norm": 0.21137700974941254, + "learning_rate": 4.4416385043820534e-05, + "loss": 1.0438, + "step": 8840 + }, + { + "epoch": 0.5371529254511209, + "grad_norm": 0.21308887004852295, + "learning_rate": 4.4406872250045115e-05, + "loss": 1.0869, + "step": 8841 + }, + { + "epoch": 0.5372136824837475, + "grad_norm": 0.3430609405040741, + "learning_rate": 4.4397359661286886e-05, + "loss": 1.1257, + "step": 8842 + }, + { + "epoch": 0.537274439516374, + "grad_norm": 0.14277590811252594, + "learning_rate": 4.438784727789451e-05, + "loss": 1.1124, + "step": 8843 + }, + { + "epoch": 0.5373351965490005, + "grad_norm": 0.21899861097335815, + "learning_rate": 4.437833510021667e-05, + "loss": 1.097, + "step": 8844 + }, + { + "epoch": 0.5373959535816271, + "grad_norm": 0.15400643646717072, + "learning_rate": 4.436882312860205e-05, + "loss": 1.0893, + "step": 8845 + }, + { + "epoch": 0.5374567106142536, + "grad_norm": 0.21397428214550018, + "learning_rate": 4.435931136339931e-05, + "loss": 1.115, + "step": 8846 + }, + { + "epoch": 0.5375174676468801, + "grad_norm": 0.16637156903743744, + "learning_rate": 4.43497998049571e-05, + "loss": 1.0701, + "step": 8847 + }, + { + "epoch": 0.5375782246795067, + "grad_norm": 0.17225094139575958, + "learning_rate": 4.4340288453624057e-05, + "loss": 1.1407, + "step": 8848 + }, + { + "epoch": 0.5376389817121332, + "grad_norm": 0.20830106735229492, + "learning_rate": 4.433077730974886e-05, + "loss": 1.1055, + "step": 8849 + }, + { + "epoch": 0.5376997387447597, + "grad_norm": 0.16150915622711182, + "learning_rate": 4.432126637368011e-05, + "loss": 1.0615, + "step": 8850 + }, + { + "epoch": 0.5377604957773863, + "grad_norm": 0.13671712577342987, + "learning_rate": 4.4311755645766435e-05, + "loss": 1.0434, + "step": 8851 + }, + { + "epoch": 0.5378212528100128, + "grad_norm": 0.15423385798931122, + "learning_rate": 4.430224512635644e-05, + "loss": 1.0734, + "step": 8852 + }, + { + "epoch": 0.5378820098426392, + "grad_norm": 0.1592642217874527, + "learning_rate": 4.4292734815798767e-05, + "loss": 1.1297, + "step": 8853 + }, + { + "epoch": 0.5379427668752658, + "grad_norm": 0.2576007843017578, + "learning_rate": 4.4283224714442e-05, + "loss": 1.0747, + "step": 8854 + }, + { + "epoch": 0.5380035239078923, + "grad_norm": 0.285372793674469, + "learning_rate": 4.4273714822634754e-05, + "loss": 1.1583, + "step": 8855 + }, + { + "epoch": 0.5380642809405188, + "grad_norm": 0.13003991544246674, + "learning_rate": 4.426420514072558e-05, + "loss": 1.0867, + "step": 8856 + }, + { + "epoch": 0.5381250379731454, + "grad_norm": 0.21347947418689728, + "learning_rate": 4.4254695669063066e-05, + "loss": 1.1144, + "step": 8857 + }, + { + "epoch": 0.5381857950057719, + "grad_norm": 0.1835072636604309, + "learning_rate": 4.4245186407995804e-05, + "loss": 1.1567, + "step": 8858 + }, + { + "epoch": 0.5382465520383984, + "grad_norm": 0.2373637557029724, + "learning_rate": 4.4235677357872346e-05, + "loss": 1.1071, + "step": 8859 + }, + { + "epoch": 0.538307309071025, + "grad_norm": 0.12043152749538422, + "learning_rate": 4.422616851904124e-05, + "loss": 1.0436, + "step": 8860 + }, + { + "epoch": 0.5383680661036515, + "grad_norm": 0.19565355777740479, + "learning_rate": 4.4216659891851054e-05, + "loss": 1.0849, + "step": 8861 + }, + { + "epoch": 0.538428823136278, + "grad_norm": 0.12992137670516968, + "learning_rate": 4.420715147665033e-05, + "loss": 1.0452, + "step": 8862 + }, + { + "epoch": 0.5384895801689046, + "grad_norm": 0.15971948206424713, + "learning_rate": 4.419764327378757e-05, + "loss": 1.1674, + "step": 8863 + }, + { + "epoch": 0.5385503372015311, + "grad_norm": 17.72208023071289, + "learning_rate": 4.4188135283611324e-05, + "loss": 1.051, + "step": 8864 + }, + { + "epoch": 0.5386110942341577, + "grad_norm": 0.14163905382156372, + "learning_rate": 4.417862750647009e-05, + "loss": 1.0574, + "step": 8865 + }, + { + "epoch": 0.5386718512667842, + "grad_norm": 0.27367568016052246, + "learning_rate": 4.41691199427124e-05, + "loss": 1.0457, + "step": 8866 + }, + { + "epoch": 0.5387326082994106, + "grad_norm": 0.18459880352020264, + "learning_rate": 4.4159612592686744e-05, + "loss": 1.1731, + "step": 8867 + }, + { + "epoch": 0.5387933653320371, + "grad_norm": 0.12797421216964722, + "learning_rate": 4.415010545674161e-05, + "loss": 1.0332, + "step": 8868 + }, + { + "epoch": 0.5388541223646637, + "grad_norm": 0.1594666838645935, + "learning_rate": 4.414059853522552e-05, + "loss": 1.151, + "step": 8869 + }, + { + "epoch": 0.5389148793972902, + "grad_norm": 0.22706584632396698, + "learning_rate": 4.4131091828486894e-05, + "loss": 1.0859, + "step": 8870 + }, + { + "epoch": 0.5389756364299167, + "grad_norm": 0.21176759898662567, + "learning_rate": 4.412158533687424e-05, + "loss": 1.2333, + "step": 8871 + }, + { + "epoch": 0.5390363934625433, + "grad_norm": 0.17553776502609253, + "learning_rate": 4.411207906073601e-05, + "loss": 1.1174, + "step": 8872 + }, + { + "epoch": 0.5390971504951698, + "grad_norm": 0.2820688486099243, + "learning_rate": 4.4102573000420657e-05, + "loss": 1.0816, + "step": 8873 + }, + { + "epoch": 0.5391579075277964, + "grad_norm": 0.13691022992134094, + "learning_rate": 4.409306715627663e-05, + "loss": 1.0404, + "step": 8874 + }, + { + "epoch": 0.5392186645604229, + "grad_norm": 0.15847517549991608, + "learning_rate": 4.4083561528652384e-05, + "loss": 1.0776, + "step": 8875 + }, + { + "epoch": 0.5392794215930494, + "grad_norm": 0.2586681842803955, + "learning_rate": 4.407405611789633e-05, + "loss": 1.0797, + "step": 8876 + }, + { + "epoch": 0.539340178625676, + "grad_norm": 0.18156175315380096, + "learning_rate": 4.406455092435689e-05, + "loss": 1.0736, + "step": 8877 + }, + { + "epoch": 0.5394009356583025, + "grad_norm": 0.24516716599464417, + "learning_rate": 4.405504594838247e-05, + "loss": 1.1073, + "step": 8878 + }, + { + "epoch": 0.539461692690929, + "grad_norm": 0.19556109607219696, + "learning_rate": 4.40455411903215e-05, + "loss": 1.1026, + "step": 8879 + }, + { + "epoch": 0.5395224497235555, + "grad_norm": 0.3313329219818115, + "learning_rate": 4.4036036650522376e-05, + "loss": 1.0721, + "step": 8880 + }, + { + "epoch": 0.539583206756182, + "grad_norm": 0.15644875168800354, + "learning_rate": 4.402653232933346e-05, + "loss": 1.1154, + "step": 8881 + }, + { + "epoch": 0.5396439637888085, + "grad_norm": 0.30278852581977844, + "learning_rate": 4.40170282271032e-05, + "loss": 1.1149, + "step": 8882 + }, + { + "epoch": 0.5397047208214351, + "grad_norm": 0.12426209449768066, + "learning_rate": 4.400752434417989e-05, + "loss": 1.0553, + "step": 8883 + }, + { + "epoch": 0.5397654778540616, + "grad_norm": 0.12561176717281342, + "learning_rate": 4.399802068091195e-05, + "loss": 1.0521, + "step": 8884 + }, + { + "epoch": 0.5398262348866881, + "grad_norm": 0.4633282721042633, + "learning_rate": 4.398851723764771e-05, + "loss": 1.3691, + "step": 8885 + }, + { + "epoch": 0.5398869919193147, + "grad_norm": 0.1240839883685112, + "learning_rate": 4.397901401473552e-05, + "loss": 1.0433, + "step": 8886 + }, + { + "epoch": 0.5399477489519412, + "grad_norm": 0.14549250900745392, + "learning_rate": 4.396951101252376e-05, + "loss": 1.0515, + "step": 8887 + }, + { + "epoch": 0.5400085059845677, + "grad_norm": 0.19717401266098022, + "learning_rate": 4.396000823136073e-05, + "loss": 1.1299, + "step": 8888 + }, + { + "epoch": 0.5400692630171943, + "grad_norm": 0.15542292594909668, + "learning_rate": 4.395050567159478e-05, + "loss": 1.0373, + "step": 8889 + }, + { + "epoch": 0.5401300200498208, + "grad_norm": 0.24425075948238373, + "learning_rate": 4.39410033335742e-05, + "loss": 1.0766, + "step": 8890 + }, + { + "epoch": 0.5401907770824473, + "grad_norm": 0.26645809412002563, + "learning_rate": 4.3931501217647305e-05, + "loss": 1.1077, + "step": 8891 + }, + { + "epoch": 0.5402515341150739, + "grad_norm": 0.15452319383621216, + "learning_rate": 4.392199932416242e-05, + "loss": 1.0618, + "step": 8892 + }, + { + "epoch": 0.5403122911477003, + "grad_norm": 0.170376256108284, + "learning_rate": 4.3912497653467815e-05, + "loss": 1.0709, + "step": 8893 + }, + { + "epoch": 0.5403730481803268, + "grad_norm": 0.2439974695444107, + "learning_rate": 4.3902996205911784e-05, + "loss": 1.1064, + "step": 8894 + }, + { + "epoch": 0.5404338052129534, + "grad_norm": 0.1813730001449585, + "learning_rate": 4.389349498184261e-05, + "loss": 1.1105, + "step": 8895 + }, + { + "epoch": 0.5404945622455799, + "grad_norm": 0.1947130560874939, + "learning_rate": 4.3883993981608576e-05, + "loss": 1.0493, + "step": 8896 + }, + { + "epoch": 0.5405553192782064, + "grad_norm": 0.20357480645179749, + "learning_rate": 4.387449320555791e-05, + "loss": 1.0732, + "step": 8897 + }, + { + "epoch": 0.540616076310833, + "grad_norm": 0.15034297108650208, + "learning_rate": 4.3864992654038875e-05, + "loss": 1.0392, + "step": 8898 + }, + { + "epoch": 0.5406768333434595, + "grad_norm": 0.15569765865802765, + "learning_rate": 4.3855492327399713e-05, + "loss": 1.1124, + "step": 8899 + }, + { + "epoch": 0.540737590376086, + "grad_norm": 0.15072257816791534, + "learning_rate": 4.384599222598868e-05, + "loss": 1.1082, + "step": 8900 + }, + { + "epoch": 0.5407983474087126, + "grad_norm": 0.19490168988704681, + "learning_rate": 4.383649235015399e-05, + "loss": 1.0999, + "step": 8901 + }, + { + "epoch": 0.5408591044413391, + "grad_norm": 0.16578565537929535, + "learning_rate": 4.3826992700243876e-05, + "loss": 1.1026, + "step": 8902 + }, + { + "epoch": 0.5409198614739656, + "grad_norm": 0.21928803622722626, + "learning_rate": 4.381749327660652e-05, + "loss": 1.0478, + "step": 8903 + }, + { + "epoch": 0.5409806185065922, + "grad_norm": 0.12841108441352844, + "learning_rate": 4.3807994079590145e-05, + "loss": 1.0183, + "step": 8904 + }, + { + "epoch": 0.5410413755392187, + "grad_norm": 0.18991781771183014, + "learning_rate": 4.3798495109542946e-05, + "loss": 1.0409, + "step": 8905 + }, + { + "epoch": 0.5411021325718451, + "grad_norm": 0.1959805190563202, + "learning_rate": 4.378899636681311e-05, + "loss": 1.1746, + "step": 8906 + }, + { + "epoch": 0.5411628896044717, + "grad_norm": 0.17164644598960876, + "learning_rate": 4.37794978517488e-05, + "loss": 1.135, + "step": 8907 + }, + { + "epoch": 0.5412236466370982, + "grad_norm": 0.2689679265022278, + "learning_rate": 4.376999956469822e-05, + "loss": 1.0805, + "step": 8908 + }, + { + "epoch": 0.5412844036697247, + "grad_norm": 0.22434532642364502, + "learning_rate": 4.376050150600951e-05, + "loss": 1.0775, + "step": 8909 + }, + { + "epoch": 0.5413451607023513, + "grad_norm": 0.22144941985607147, + "learning_rate": 4.375100367603081e-05, + "loss": 1.1149, + "step": 8910 + }, + { + "epoch": 0.5414059177349778, + "grad_norm": 0.22280794382095337, + "learning_rate": 4.3741506075110285e-05, + "loss": 1.1546, + "step": 8911 + }, + { + "epoch": 0.5414666747676044, + "grad_norm": 0.1654522567987442, + "learning_rate": 4.373200870359605e-05, + "loss": 1.056, + "step": 8912 + }, + { + "epoch": 0.5415274318002309, + "grad_norm": 0.24848440289497375, + "learning_rate": 4.372251156183626e-05, + "loss": 1.1273, + "step": 8913 + }, + { + "epoch": 0.5415881888328574, + "grad_norm": 0.25273504853248596, + "learning_rate": 4.3713014650179026e-05, + "loss": 1.0653, + "step": 8914 + }, + { + "epoch": 0.541648945865484, + "grad_norm": 0.15259887278079987, + "learning_rate": 4.370351796897244e-05, + "loss": 1.0733, + "step": 8915 + }, + { + "epoch": 0.5417097028981105, + "grad_norm": 0.16894663870334625, + "learning_rate": 4.369402151856465e-05, + "loss": 1.0846, + "step": 8916 + }, + { + "epoch": 0.541770459930737, + "grad_norm": 0.5015411376953125, + "learning_rate": 4.368452529930369e-05, + "loss": 1.2142, + "step": 8917 + }, + { + "epoch": 0.5418312169633636, + "grad_norm": 0.1331171989440918, + "learning_rate": 4.367502931153769e-05, + "loss": 1.0917, + "step": 8918 + }, + { + "epoch": 0.54189197399599, + "grad_norm": 0.16969387233257294, + "learning_rate": 4.366553355561471e-05, + "loss": 1.084, + "step": 8919 + }, + { + "epoch": 0.5419527310286165, + "grad_norm": 0.19851133227348328, + "learning_rate": 4.3656038031882816e-05, + "loss": 1.187, + "step": 8920 + }, + { + "epoch": 0.542013488061243, + "grad_norm": 0.17347542941570282, + "learning_rate": 4.364654274069008e-05, + "loss": 1.0572, + "step": 8921 + }, + { + "epoch": 0.5420742450938696, + "grad_norm": 0.2708236277103424, + "learning_rate": 4.363704768238456e-05, + "loss": 1.1133, + "step": 8922 + }, + { + "epoch": 0.5421350021264961, + "grad_norm": 0.7941281199455261, + "learning_rate": 4.362755285731427e-05, + "loss": 0.9987, + "step": 8923 + }, + { + "epoch": 0.5421957591591227, + "grad_norm": 0.21143792569637299, + "learning_rate": 4.3618058265827264e-05, + "loss": 1.1074, + "step": 8924 + }, + { + "epoch": 0.5422565161917492, + "grad_norm": 0.16356296837329865, + "learning_rate": 4.360856390827156e-05, + "loss": 1.1182, + "step": 8925 + }, + { + "epoch": 0.5423172732243757, + "grad_norm": 0.1267087757587433, + "learning_rate": 4.359906978499519e-05, + "loss": 1.0231, + "step": 8926 + }, + { + "epoch": 0.5423780302570023, + "grad_norm": 0.12112586200237274, + "learning_rate": 4.358957589634616e-05, + "loss": 1.082, + "step": 8927 + }, + { + "epoch": 0.5424387872896288, + "grad_norm": 0.8716529011726379, + "learning_rate": 4.358008224267245e-05, + "loss": 1.207, + "step": 8928 + }, + { + "epoch": 0.5424995443222553, + "grad_norm": 2.495262384414673, + "learning_rate": 4.3570588824322096e-05, + "loss": 1.0556, + "step": 8929 + }, + { + "epoch": 0.5425603013548819, + "grad_norm": 0.145907923579216, + "learning_rate": 4.356109564164302e-05, + "loss": 1.0894, + "step": 8930 + }, + { + "epoch": 0.5426210583875084, + "grad_norm": 0.13720855116844177, + "learning_rate": 4.355160269498324e-05, + "loss": 1.0515, + "step": 8931 + }, + { + "epoch": 0.5426818154201348, + "grad_norm": 0.24630962312221527, + "learning_rate": 4.354210998469071e-05, + "loss": 1.1706, + "step": 8932 + }, + { + "epoch": 0.5427425724527614, + "grad_norm": 0.12788785994052887, + "learning_rate": 4.353261751111337e-05, + "loss": 1.0484, + "step": 8933 + }, + { + "epoch": 0.5428033294853879, + "grad_norm": 0.17123116552829742, + "learning_rate": 4.352312527459921e-05, + "loss": 1.0325, + "step": 8934 + }, + { + "epoch": 0.5428640865180144, + "grad_norm": 0.30883681774139404, + "learning_rate": 4.351363327549613e-05, + "loss": 1.3499, + "step": 8935 + }, + { + "epoch": 0.542924843550641, + "grad_norm": 0.2096819281578064, + "learning_rate": 4.35041415141521e-05, + "loss": 1.2215, + "step": 8936 + }, + { + "epoch": 0.5429856005832675, + "grad_norm": 0.158378005027771, + "learning_rate": 4.349464999091498e-05, + "loss": 1.1658, + "step": 8937 + }, + { + "epoch": 0.543046357615894, + "grad_norm": 0.5119233131408691, + "learning_rate": 4.3485158706132744e-05, + "loss": 1.0704, + "step": 8938 + }, + { + "epoch": 0.5431071146485206, + "grad_norm": 0.2655501961708069, + "learning_rate": 4.347566766015327e-05, + "loss": 1.1108, + "step": 8939 + }, + { + "epoch": 0.5431678716811471, + "grad_norm": 0.13762451708316803, + "learning_rate": 4.346617685332447e-05, + "loss": 1.0482, + "step": 8940 + }, + { + "epoch": 0.5432286287137736, + "grad_norm": 0.18351638317108154, + "learning_rate": 4.3456686285994205e-05, + "loss": 1.0011, + "step": 8941 + }, + { + "epoch": 0.5432893857464002, + "grad_norm": 0.17542748153209686, + "learning_rate": 4.3447195958510384e-05, + "loss": 1.0055, + "step": 8942 + }, + { + "epoch": 0.5433501427790267, + "grad_norm": 0.16803203523159027, + "learning_rate": 4.343770587122086e-05, + "loss": 1.1914, + "step": 8943 + }, + { + "epoch": 0.5434108998116532, + "grad_norm": 0.11520200222730637, + "learning_rate": 4.342821602447348e-05, + "loss": 1.0222, + "step": 8944 + }, + { + "epoch": 0.5434716568442797, + "grad_norm": 0.14066025614738464, + "learning_rate": 4.341872641861612e-05, + "loss": 1.1117, + "step": 8945 + }, + { + "epoch": 0.5435324138769062, + "grad_norm": 0.13849855959415436, + "learning_rate": 4.3409237053996615e-05, + "loss": 1.0811, + "step": 8946 + }, + { + "epoch": 0.5435931709095327, + "grad_norm": 0.12274722754955292, + "learning_rate": 4.3399747930962805e-05, + "loss": 1.0202, + "step": 8947 + }, + { + "epoch": 0.5436539279421593, + "grad_norm": 0.1560252159833908, + "learning_rate": 4.3390259049862503e-05, + "loss": 1.0629, + "step": 8948 + }, + { + "epoch": 0.5437146849747858, + "grad_norm": 0.28013524413108826, + "learning_rate": 4.338077041104355e-05, + "loss": 1.0796, + "step": 8949 + }, + { + "epoch": 0.5437754420074123, + "grad_norm": 0.15444281697273254, + "learning_rate": 4.337128201485371e-05, + "loss": 1.1592, + "step": 8950 + }, + { + "epoch": 0.5438361990400389, + "grad_norm": 0.26170778274536133, + "learning_rate": 4.336179386164083e-05, + "loss": 1.2249, + "step": 8951 + }, + { + "epoch": 0.5438969560726654, + "grad_norm": 0.17421099543571472, + "learning_rate": 4.335230595175267e-05, + "loss": 1.1489, + "step": 8952 + }, + { + "epoch": 0.543957713105292, + "grad_norm": 0.2767316699028015, + "learning_rate": 4.334281828553701e-05, + "loss": 1.14, + "step": 8953 + }, + { + "epoch": 0.5440184701379185, + "grad_norm": 0.19828414916992188, + "learning_rate": 4.333333086334165e-05, + "loss": 1.1125, + "step": 8954 + }, + { + "epoch": 0.544079227170545, + "grad_norm": 0.3742527365684509, + "learning_rate": 4.332384368551433e-05, + "loss": 1.0893, + "step": 8955 + }, + { + "epoch": 0.5441399842031716, + "grad_norm": 0.22007344663143158, + "learning_rate": 4.3314356752402826e-05, + "loss": 1.1368, + "step": 8956 + }, + { + "epoch": 0.5442007412357981, + "grad_norm": 0.23593324422836304, + "learning_rate": 4.330487006435485e-05, + "loss": 1.036, + "step": 8957 + }, + { + "epoch": 0.5442614982684245, + "grad_norm": 0.2624753415584564, + "learning_rate": 4.329538362171815e-05, + "loss": 1.0548, + "step": 8958 + }, + { + "epoch": 0.544322255301051, + "grad_norm": 0.44120582938194275, + "learning_rate": 4.328589742484047e-05, + "loss": 1.2736, + "step": 8959 + }, + { + "epoch": 0.5443830123336776, + "grad_norm": 0.16598977148532867, + "learning_rate": 4.3276411474069515e-05, + "loss": 1.1385, + "step": 8960 + }, + { + "epoch": 0.5444437693663041, + "grad_norm": 0.13173899054527283, + "learning_rate": 4.326692576975299e-05, + "loss": 1.0448, + "step": 8961 + }, + { + "epoch": 0.5445045263989307, + "grad_norm": 0.9805957078933716, + "learning_rate": 4.325744031223861e-05, + "loss": 1.1726, + "step": 8962 + }, + { + "epoch": 0.5445652834315572, + "grad_norm": 0.21744363009929657, + "learning_rate": 4.324795510187407e-05, + "loss": 1.146, + "step": 8963 + }, + { + "epoch": 0.5446260404641837, + "grad_norm": 0.11330989003181458, + "learning_rate": 4.323847013900703e-05, + "loss": 1.0502, + "step": 8964 + }, + { + "epoch": 0.5446867974968103, + "grad_norm": 0.22659528255462646, + "learning_rate": 4.3228985423985176e-05, + "loss": 1.0651, + "step": 8965 + }, + { + "epoch": 0.5447475545294368, + "grad_norm": 0.16389329731464386, + "learning_rate": 4.3219500957156157e-05, + "loss": 1.0999, + "step": 8966 + }, + { + "epoch": 0.5448083115620633, + "grad_norm": 0.17884069681167603, + "learning_rate": 4.321001673886765e-05, + "loss": 1.1442, + "step": 8967 + }, + { + "epoch": 0.5448690685946899, + "grad_norm": 0.11576376855373383, + "learning_rate": 4.32005327694673e-05, + "loss": 1.004, + "step": 8968 + }, + { + "epoch": 0.5449298256273164, + "grad_norm": 0.19715000689029694, + "learning_rate": 4.319104904930274e-05, + "loss": 1.1733, + "step": 8969 + }, + { + "epoch": 0.5449905826599429, + "grad_norm": 0.191041499376297, + "learning_rate": 4.3181565578721584e-05, + "loss": 1.1468, + "step": 8970 + }, + { + "epoch": 0.5450513396925695, + "grad_norm": 0.30083873867988586, + "learning_rate": 4.3172082358071446e-05, + "loss": 1.2533, + "step": 8971 + }, + { + "epoch": 0.5451120967251959, + "grad_norm": 0.2145347148180008, + "learning_rate": 4.3162599387699966e-05, + "loss": 1.1295, + "step": 8972 + }, + { + "epoch": 0.5451728537578224, + "grad_norm": 0.12549899518489838, + "learning_rate": 4.315311666795472e-05, + "loss": 1.0227, + "step": 8973 + }, + { + "epoch": 0.545233610790449, + "grad_norm": 0.12783335149288177, + "learning_rate": 4.31436341991833e-05, + "loss": 1.0562, + "step": 8974 + }, + { + "epoch": 0.5452943678230755, + "grad_norm": 0.1302906572818756, + "learning_rate": 4.31341519817333e-05, + "loss": 1.0648, + "step": 8975 + }, + { + "epoch": 0.545355124855702, + "grad_norm": 0.12566597759723663, + "learning_rate": 4.31246700159523e-05, + "loss": 1.04, + "step": 8976 + }, + { + "epoch": 0.5454158818883286, + "grad_norm": 0.17287570238113403, + "learning_rate": 4.3115188302187844e-05, + "loss": 1.0558, + "step": 8977 + }, + { + "epoch": 0.5454766389209551, + "grad_norm": 0.20188038051128387, + "learning_rate": 4.310570684078749e-05, + "loss": 1.1919, + "step": 8978 + }, + { + "epoch": 0.5455373959535816, + "grad_norm": 0.2005751132965088, + "learning_rate": 4.309622563209876e-05, + "loss": 1.0822, + "step": 8979 + }, + { + "epoch": 0.5455981529862082, + "grad_norm": 0.2326764315366745, + "learning_rate": 4.308674467646924e-05, + "loss": 1.0903, + "step": 8980 + }, + { + "epoch": 0.5456589100188347, + "grad_norm": 0.12649978697299957, + "learning_rate": 4.307726397424642e-05, + "loss": 1.044, + "step": 8981 + }, + { + "epoch": 0.5457196670514612, + "grad_norm": 0.1572638750076294, + "learning_rate": 4.306778352577783e-05, + "loss": 1.061, + "step": 8982 + }, + { + "epoch": 0.5457804240840878, + "grad_norm": 0.14660786092281342, + "learning_rate": 4.3058303331410994e-05, + "loss": 1.0848, + "step": 8983 + }, + { + "epoch": 0.5458411811167143, + "grad_norm": 0.12938623130321503, + "learning_rate": 4.3048823391493366e-05, + "loss": 1.0715, + "step": 8984 + }, + { + "epoch": 0.5459019381493407, + "grad_norm": 0.25859758257865906, + "learning_rate": 4.3039343706372465e-05, + "loss": 1.0771, + "step": 8985 + }, + { + "epoch": 0.5459626951819673, + "grad_norm": 0.16211803257465363, + "learning_rate": 4.302986427639577e-05, + "loss": 1.1068, + "step": 8986 + }, + { + "epoch": 0.5460234522145938, + "grad_norm": 0.32541561126708984, + "learning_rate": 4.302038510191073e-05, + "loss": 1.0978, + "step": 8987 + }, + { + "epoch": 0.5460842092472203, + "grad_norm": 0.11973492801189423, + "learning_rate": 4.301090618326484e-05, + "loss": 0.9997, + "step": 8988 + }, + { + "epoch": 0.5461449662798469, + "grad_norm": 0.17381417751312256, + "learning_rate": 4.300142752080555e-05, + "loss": 1.1196, + "step": 8989 + }, + { + "epoch": 0.5462057233124734, + "grad_norm": 0.2164960354566574, + "learning_rate": 4.299194911488027e-05, + "loss": 1.0808, + "step": 8990 + }, + { + "epoch": 0.5462664803451, + "grad_norm": 0.18794627487659454, + "learning_rate": 4.2982470965836455e-05, + "loss": 1.018, + "step": 8991 + }, + { + "epoch": 0.5463272373777265, + "grad_norm": 0.33291032910346985, + "learning_rate": 4.297299307402151e-05, + "loss": 1.1558, + "step": 8992 + }, + { + "epoch": 0.546387994410353, + "grad_norm": 0.21408331394195557, + "learning_rate": 4.2963515439782864e-05, + "loss": 1.0628, + "step": 8993 + }, + { + "epoch": 0.5464487514429796, + "grad_norm": 0.1658337414264679, + "learning_rate": 4.295403806346793e-05, + "loss": 1.1061, + "step": 8994 + }, + { + "epoch": 0.5465095084756061, + "grad_norm": 0.17583677172660828, + "learning_rate": 4.294456094542408e-05, + "loss": 1.0672, + "step": 8995 + }, + { + "epoch": 0.5465702655082326, + "grad_norm": 0.24074803292751312, + "learning_rate": 4.2935084085998745e-05, + "loss": 1.1618, + "step": 8996 + }, + { + "epoch": 0.5466310225408592, + "grad_norm": 0.1395314484834671, + "learning_rate": 4.2925607485539235e-05, + "loss": 1.0544, + "step": 8997 + }, + { + "epoch": 0.5466917795734856, + "grad_norm": 0.23153254389762878, + "learning_rate": 4.291613114439296e-05, + "loss": 1.1302, + "step": 8998 + }, + { + "epoch": 0.5467525366061121, + "grad_norm": 0.20346395671367645, + "learning_rate": 4.290665506290727e-05, + "loss": 1.1132, + "step": 8999 + }, + { + "epoch": 0.5468132936387387, + "grad_norm": 5.548253536224365, + "learning_rate": 4.28971792414295e-05, + "loss": 1.0077, + "step": 9000 + }, + { + "epoch": 0.5468740506713652, + "grad_norm": 0.23009063303470612, + "learning_rate": 4.2887703680307e-05, + "loss": 1.0841, + "step": 9001 + }, + { + "epoch": 0.5469348077039917, + "grad_norm": 1.0316535234451294, + "learning_rate": 4.28782283798871e-05, + "loss": 1.1485, + "step": 9002 + }, + { + "epoch": 0.5469955647366183, + "grad_norm": 0.23427841067314148, + "learning_rate": 4.2868753340517135e-05, + "loss": 1.1046, + "step": 9003 + }, + { + "epoch": 0.5470563217692448, + "grad_norm": 2.872511625289917, + "learning_rate": 4.2859278562544373e-05, + "loss": 1.0494, + "step": 9004 + }, + { + "epoch": 0.5471170788018713, + "grad_norm": 0.7402791380882263, + "learning_rate": 4.284980404631613e-05, + "loss": 1.0541, + "step": 9005 + }, + { + "epoch": 0.5471778358344979, + "grad_norm": 0.15071167051792145, + "learning_rate": 4.2840329792179713e-05, + "loss": 1.1449, + "step": 9006 + }, + { + "epoch": 0.5472385928671244, + "grad_norm": 0.24968582391738892, + "learning_rate": 4.283085580048239e-05, + "loss": 1.2006, + "step": 9007 + }, + { + "epoch": 0.5472993498997509, + "grad_norm": 0.31403857469558716, + "learning_rate": 4.282138207157142e-05, + "loss": 1.121, + "step": 9008 + }, + { + "epoch": 0.5473601069323775, + "grad_norm": 0.32856741547584534, + "learning_rate": 4.281190860579411e-05, + "loss": 1.1303, + "step": 9009 + }, + { + "epoch": 0.547420863965004, + "grad_norm": 0.16440510749816895, + "learning_rate": 4.2802435403497654e-05, + "loss": 1.051, + "step": 9010 + }, + { + "epoch": 0.5474816209976304, + "grad_norm": 0.1508004516363144, + "learning_rate": 4.2792962465029326e-05, + "loss": 1.0438, + "step": 9011 + }, + { + "epoch": 0.547542378030257, + "grad_norm": 0.1534392535686493, + "learning_rate": 4.278348979073635e-05, + "loss": 1.0537, + "step": 9012 + }, + { + "epoch": 0.5476031350628835, + "grad_norm": 0.11253668367862701, + "learning_rate": 4.2774017380965944e-05, + "loss": 1.0282, + "step": 9013 + }, + { + "epoch": 0.54766389209551, + "grad_norm": 0.21295097470283508, + "learning_rate": 4.2764545236065335e-05, + "loss": 1.0956, + "step": 9014 + }, + { + "epoch": 0.5477246491281366, + "grad_norm": 0.17431528866291046, + "learning_rate": 4.275507335638172e-05, + "loss": 1.1214, + "step": 9015 + }, + { + "epoch": 0.5477854061607631, + "grad_norm": 0.19457079470157623, + "learning_rate": 4.2745601742262306e-05, + "loss": 1.1526, + "step": 9016 + }, + { + "epoch": 0.5478461631933896, + "grad_norm": 0.20628224313259125, + "learning_rate": 4.273613039405425e-05, + "loss": 1.1626, + "step": 9017 + }, + { + "epoch": 0.5479069202260162, + "grad_norm": 0.27337104082107544, + "learning_rate": 4.272665931210472e-05, + "loss": 1.109, + "step": 9018 + }, + { + "epoch": 0.5479676772586427, + "grad_norm": 0.14324428141117096, + "learning_rate": 4.2717188496760916e-05, + "loss": 1.0948, + "step": 9019 + }, + { + "epoch": 0.5480284342912692, + "grad_norm": 0.24862165749073029, + "learning_rate": 4.2707717948369974e-05, + "loss": 1.0717, + "step": 9020 + }, + { + "epoch": 0.5480891913238958, + "grad_norm": 0.20165401697158813, + "learning_rate": 4.2698247667279036e-05, + "loss": 1.0216, + "step": 9021 + }, + { + "epoch": 0.5481499483565223, + "grad_norm": 0.133982315659523, + "learning_rate": 4.268877765383524e-05, + "loss": 1.0226, + "step": 9022 + }, + { + "epoch": 0.5482107053891488, + "grad_norm": 0.2255735695362091, + "learning_rate": 4.267930790838573e-05, + "loss": 1.0981, + "step": 9023 + }, + { + "epoch": 0.5482714624217753, + "grad_norm": 0.15962889790534973, + "learning_rate": 4.266983843127759e-05, + "loss": 1.0329, + "step": 9024 + }, + { + "epoch": 0.5483322194544018, + "grad_norm": 0.11453757435083389, + "learning_rate": 4.2660369222857944e-05, + "loss": 1.0353, + "step": 9025 + }, + { + "epoch": 0.5483929764870283, + "grad_norm": 0.1269068419933319, + "learning_rate": 4.265090028347387e-05, + "loss": 1.051, + "step": 9026 + }, + { + "epoch": 0.5484537335196549, + "grad_norm": 0.19441090524196625, + "learning_rate": 4.2641431613472475e-05, + "loss": 1.0557, + "step": 9027 + }, + { + "epoch": 0.5485144905522814, + "grad_norm": 0.13083462417125702, + "learning_rate": 4.263196321320083e-05, + "loss": 1.0419, + "step": 9028 + }, + { + "epoch": 0.5485752475849079, + "grad_norm": 0.14303791522979736, + "learning_rate": 4.262249508300598e-05, + "loss": 1.0674, + "step": 9029 + }, + { + "epoch": 0.5486360046175345, + "grad_norm": 0.1453258991241455, + "learning_rate": 4.261302722323504e-05, + "loss": 1.1258, + "step": 9030 + }, + { + "epoch": 0.548696761650161, + "grad_norm": 0.12882418930530548, + "learning_rate": 4.260355963423498e-05, + "loss": 1.0553, + "step": 9031 + }, + { + "epoch": 0.5487575186827875, + "grad_norm": 0.18207204341888428, + "learning_rate": 4.2594092316352885e-05, + "loss": 1.1068, + "step": 9032 + }, + { + "epoch": 0.5488182757154141, + "grad_norm": 0.16854768991470337, + "learning_rate": 4.258462526993576e-05, + "loss": 1.0493, + "step": 9033 + }, + { + "epoch": 0.5488790327480406, + "grad_norm": 0.14280124008655548, + "learning_rate": 4.257515849533063e-05, + "loss": 1.079, + "step": 9034 + }, + { + "epoch": 0.5489397897806672, + "grad_norm": 0.13142609596252441, + "learning_rate": 4.25656919928845e-05, + "loss": 1.0689, + "step": 9035 + }, + { + "epoch": 0.5490005468132937, + "grad_norm": 0.22240585088729858, + "learning_rate": 4.255622576294438e-05, + "loss": 1.1404, + "step": 9036 + }, + { + "epoch": 0.5490613038459201, + "grad_norm": 0.1691696047782898, + "learning_rate": 4.254675980585723e-05, + "loss": 1.1479, + "step": 9037 + }, + { + "epoch": 0.5491220608785466, + "grad_norm": 0.22538401186466217, + "learning_rate": 4.253729412197004e-05, + "loss": 1.1245, + "step": 9038 + }, + { + "epoch": 0.5491828179111732, + "grad_norm": 0.1962536722421646, + "learning_rate": 4.2527828711629784e-05, + "loss": 1.0722, + "step": 9039 + }, + { + "epoch": 0.5492435749437997, + "grad_norm": 0.1319868564605713, + "learning_rate": 4.25183635751834e-05, + "loss": 1.0729, + "step": 9040 + }, + { + "epoch": 0.5493043319764263, + "grad_norm": 0.17093893885612488, + "learning_rate": 4.250889871297785e-05, + "loss": 1.1271, + "step": 9041 + }, + { + "epoch": 0.5493650890090528, + "grad_norm": 0.132630854845047, + "learning_rate": 4.249943412536006e-05, + "loss": 1.0778, + "step": 9042 + }, + { + "epoch": 0.5494258460416793, + "grad_norm": 0.11211725324392319, + "learning_rate": 4.248996981267698e-05, + "loss": 1.0427, + "step": 9043 + }, + { + "epoch": 0.5494866030743059, + "grad_norm": 0.13270297646522522, + "learning_rate": 4.24805057752755e-05, + "loss": 1.0222, + "step": 9044 + }, + { + "epoch": 0.5495473601069324, + "grad_norm": 0.14900323748588562, + "learning_rate": 4.2471042013502534e-05, + "loss": 1.0251, + "step": 9045 + }, + { + "epoch": 0.5496081171395589, + "grad_norm": 0.866405189037323, + "learning_rate": 4.246157852770496e-05, + "loss": 1.2466, + "step": 9046 + }, + { + "epoch": 0.5496688741721855, + "grad_norm": 0.24184077978134155, + "learning_rate": 4.245211531822971e-05, + "loss": 1.1705, + "step": 9047 + }, + { + "epoch": 0.549729631204812, + "grad_norm": 0.19942255318164825, + "learning_rate": 4.244265238542362e-05, + "loss": 1.1178, + "step": 9048 + }, + { + "epoch": 0.5497903882374385, + "grad_norm": 0.11792849749326706, + "learning_rate": 4.2433189729633566e-05, + "loss": 1.0116, + "step": 9049 + }, + { + "epoch": 0.549851145270065, + "grad_norm": 0.14112327992916107, + "learning_rate": 4.242372735120642e-05, + "loss": 1.0749, + "step": 9050 + }, + { + "epoch": 0.5499119023026915, + "grad_norm": 0.1418844312429428, + "learning_rate": 4.241426525048899e-05, + "loss": 1.0325, + "step": 9051 + }, + { + "epoch": 0.549972659335318, + "grad_norm": 0.15489812195301056, + "learning_rate": 4.240480342782814e-05, + "loss": 1.1121, + "step": 9052 + }, + { + "epoch": 0.5500334163679446, + "grad_norm": 0.14163701236248016, + "learning_rate": 4.239534188357069e-05, + "loss": 1.0602, + "step": 9053 + }, + { + "epoch": 0.5500941734005711, + "grad_norm": 0.11680836230516434, + "learning_rate": 4.238588061806345e-05, + "loss": 1.0147, + "step": 9054 + }, + { + "epoch": 0.5501549304331976, + "grad_norm": 0.20463886857032776, + "learning_rate": 4.237641963165323e-05, + "loss": 1.0429, + "step": 9055 + }, + { + "epoch": 0.5502156874658242, + "grad_norm": 0.4137899577617645, + "learning_rate": 4.236695892468684e-05, + "loss": 1.0282, + "step": 9056 + }, + { + "epoch": 0.5502764444984507, + "grad_norm": 48.9248161315918, + "learning_rate": 4.2357498497511026e-05, + "loss": 1.2152, + "step": 9057 + }, + { + "epoch": 0.5503372015310772, + "grad_norm": 0.3811236023902893, + "learning_rate": 4.234803835047259e-05, + "loss": 1.1267, + "step": 9058 + }, + { + "epoch": 0.5503979585637038, + "grad_norm": 0.12349487096071243, + "learning_rate": 4.233857848391828e-05, + "loss": 1.037, + "step": 9059 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 0.2883005738258362, + "learning_rate": 4.232911889819486e-05, + "loss": 1.1034, + "step": 9060 + }, + { + "epoch": 0.5505194726289568, + "grad_norm": 0.21447651088237762, + "learning_rate": 4.231965959364907e-05, + "loss": 1.0715, + "step": 9061 + }, + { + "epoch": 0.5505802296615834, + "grad_norm": 0.1179421916604042, + "learning_rate": 4.231020057062763e-05, + "loss": 1.0245, + "step": 9062 + }, + { + "epoch": 0.5506409866942098, + "grad_norm": 0.1727469563484192, + "learning_rate": 4.230074182947731e-05, + "loss": 1.1492, + "step": 9063 + }, + { + "epoch": 0.5507017437268363, + "grad_norm": 1.4222290515899658, + "learning_rate": 4.2291283370544754e-05, + "loss": 1.0728, + "step": 9064 + }, + { + "epoch": 0.5507625007594629, + "grad_norm": 0.1375548392534256, + "learning_rate": 4.228182519417671e-05, + "loss": 1.0859, + "step": 9065 + }, + { + "epoch": 0.5508232577920894, + "grad_norm": 0.17478792369365692, + "learning_rate": 4.2272367300719846e-05, + "loss": 1.1092, + "step": 9066 + }, + { + "epoch": 0.5508840148247159, + "grad_norm": 0.163072407245636, + "learning_rate": 4.2262909690520845e-05, + "loss": 1.0759, + "step": 9067 + }, + { + "epoch": 0.5509447718573425, + "grad_norm": 0.1808890849351883, + "learning_rate": 4.22534523639264e-05, + "loss": 1.0488, + "step": 9068 + }, + { + "epoch": 0.551005528889969, + "grad_norm": 1.1838573217391968, + "learning_rate": 4.224399532128315e-05, + "loss": 1.0856, + "step": 9069 + }, + { + "epoch": 0.5510662859225955, + "grad_norm": 0.13643936812877655, + "learning_rate": 4.223453856293777e-05, + "loss": 1.0473, + "step": 9070 + }, + { + "epoch": 0.5511270429552221, + "grad_norm": 0.36972951889038086, + "learning_rate": 4.222508208923686e-05, + "loss": 1.0227, + "step": 9071 + }, + { + "epoch": 0.5511877999878486, + "grad_norm": 0.1670450121164322, + "learning_rate": 4.221562590052707e-05, + "loss": 1.1461, + "step": 9072 + }, + { + "epoch": 0.5512485570204751, + "grad_norm": 0.20135442912578583, + "learning_rate": 4.220616999715501e-05, + "loss": 1.0979, + "step": 9073 + }, + { + "epoch": 0.5513093140531017, + "grad_norm": 0.12519875168800354, + "learning_rate": 4.219671437946731e-05, + "loss": 1.0521, + "step": 9074 + }, + { + "epoch": 0.5513700710857282, + "grad_norm": 0.7594452500343323, + "learning_rate": 4.218725904781054e-05, + "loss": 1.1522, + "step": 9075 + }, + { + "epoch": 0.5514308281183548, + "grad_norm": 0.15588246285915375, + "learning_rate": 4.2177804002531326e-05, + "loss": 1.1389, + "step": 9076 + }, + { + "epoch": 0.5514915851509812, + "grad_norm": 4.16948127746582, + "learning_rate": 4.2168349243976194e-05, + "loss": 1.1676, + "step": 9077 + }, + { + "epoch": 0.5515523421836077, + "grad_norm": 0.6780422329902649, + "learning_rate": 4.215889477249174e-05, + "loss": 1.1329, + "step": 9078 + }, + { + "epoch": 0.5516130992162342, + "grad_norm": 0.2095414400100708, + "learning_rate": 4.2149440588424515e-05, + "loss": 1.0194, + "step": 9079 + }, + { + "epoch": 0.5516738562488608, + "grad_norm": 0.1834208220243454, + "learning_rate": 4.2139986692121045e-05, + "loss": 1.162, + "step": 9080 + }, + { + "epoch": 0.5517346132814873, + "grad_norm": 0.18039526045322418, + "learning_rate": 4.21305330839279e-05, + "loss": 1.0437, + "step": 9081 + }, + { + "epoch": 0.5517953703141139, + "grad_norm": 0.22322294116020203, + "learning_rate": 4.212107976419159e-05, + "loss": 1.0523, + "step": 9082 + }, + { + "epoch": 0.5518561273467404, + "grad_norm": 0.2128293216228485, + "learning_rate": 4.211162673325863e-05, + "loss": 1.1181, + "step": 9083 + }, + { + "epoch": 0.5519168843793669, + "grad_norm": 0.17772603034973145, + "learning_rate": 4.210217399147551e-05, + "loss": 1.1684, + "step": 9084 + }, + { + "epoch": 0.5519776414119935, + "grad_norm": 0.3734113872051239, + "learning_rate": 4.209272153918871e-05, + "loss": 1.0578, + "step": 9085 + }, + { + "epoch": 0.55203839844462, + "grad_norm": 0.2782934308052063, + "learning_rate": 4.208326937674475e-05, + "loss": 1.1397, + "step": 9086 + }, + { + "epoch": 0.5520991554772465, + "grad_norm": 0.6208287477493286, + "learning_rate": 4.2073817504490076e-05, + "loss": 1.1481, + "step": 9087 + }, + { + "epoch": 0.5521599125098731, + "grad_norm": 0.2628735899925232, + "learning_rate": 4.2064365922771145e-05, + "loss": 1.0267, + "step": 9088 + }, + { + "epoch": 0.5522206695424996, + "grad_norm": 0.19605694711208344, + "learning_rate": 4.205491463193442e-05, + "loss": 1.0894, + "step": 9089 + }, + { + "epoch": 0.552281426575126, + "grad_norm": 0.16692741215229034, + "learning_rate": 4.204546363232635e-05, + "loss": 1.0414, + "step": 9090 + }, + { + "epoch": 0.5523421836077526, + "grad_norm": 0.11422184854745865, + "learning_rate": 4.203601292429334e-05, + "loss": 1.0077, + "step": 9091 + }, + { + "epoch": 0.5524029406403791, + "grad_norm": 0.19897115230560303, + "learning_rate": 4.202656250818181e-05, + "loss": 1.0989, + "step": 9092 + }, + { + "epoch": 0.5524636976730056, + "grad_norm": 0.1422666758298874, + "learning_rate": 4.201711238433817e-05, + "loss": 1.0557, + "step": 9093 + }, + { + "epoch": 0.5525244547056322, + "grad_norm": 0.20156043767929077, + "learning_rate": 4.2007662553108825e-05, + "loss": 1.029, + "step": 9094 + }, + { + "epoch": 0.5525852117382587, + "grad_norm": 0.19875234365463257, + "learning_rate": 4.199821301484015e-05, + "loss": 1.013, + "step": 9095 + }, + { + "epoch": 0.5526459687708852, + "grad_norm": 0.1583237498998642, + "learning_rate": 4.198876376987851e-05, + "loss": 1.0777, + "step": 9096 + }, + { + "epoch": 0.5527067258035118, + "grad_norm": 0.12764251232147217, + "learning_rate": 4.197931481857033e-05, + "loss": 1.0936, + "step": 9097 + }, + { + "epoch": 0.5527674828361383, + "grad_norm": 0.1488335132598877, + "learning_rate": 4.196986616126187e-05, + "loss": 1.0507, + "step": 9098 + }, + { + "epoch": 0.5528282398687648, + "grad_norm": 0.16056792438030243, + "learning_rate": 4.1960417798299536e-05, + "loss": 1.0813, + "step": 9099 + }, + { + "epoch": 0.5528889969013914, + "grad_norm": 0.1478900909423828, + "learning_rate": 4.195096973002964e-05, + "loss": 1.0775, + "step": 9100 + }, + { + "epoch": 0.5529497539340179, + "grad_norm": 0.17586633563041687, + "learning_rate": 4.194152195679849e-05, + "loss": 1.105, + "step": 9101 + }, + { + "epoch": 0.5530105109666444, + "grad_norm": 0.12817417085170746, + "learning_rate": 4.193207447895243e-05, + "loss": 1.084, + "step": 9102 + }, + { + "epoch": 0.5530712679992709, + "grad_norm": 0.1798471212387085, + "learning_rate": 4.192262729683774e-05, + "loss": 1.1244, + "step": 9103 + }, + { + "epoch": 0.5531320250318974, + "grad_norm": 0.39058050513267517, + "learning_rate": 4.19131804108007e-05, + "loss": 1.172, + "step": 9104 + }, + { + "epoch": 0.5531927820645239, + "grad_norm": 0.1737455278635025, + "learning_rate": 4.1903733821187594e-05, + "loss": 1.0815, + "step": 9105 + }, + { + "epoch": 0.5532535390971505, + "grad_norm": 0.21475253999233246, + "learning_rate": 4.189428752834468e-05, + "loss": 1.1136, + "step": 9106 + }, + { + "epoch": 0.553314296129777, + "grad_norm": 0.6660194993019104, + "learning_rate": 4.188484153261824e-05, + "loss": 1.41, + "step": 9107 + }, + { + "epoch": 0.5533750531624035, + "grad_norm": 1.6338199377059937, + "learning_rate": 4.18753958343545e-05, + "loss": 1.0513, + "step": 9108 + }, + { + "epoch": 0.5534358101950301, + "grad_norm": 0.2118992656469345, + "learning_rate": 4.186595043389968e-05, + "loss": 1.0665, + "step": 9109 + }, + { + "epoch": 0.5534965672276566, + "grad_norm": 0.20369698107242584, + "learning_rate": 4.185650533160005e-05, + "loss": 1.0997, + "step": 9110 + }, + { + "epoch": 0.5535573242602831, + "grad_norm": 0.13237892091274261, + "learning_rate": 4.1847060527801766e-05, + "loss": 1.0507, + "step": 9111 + }, + { + "epoch": 0.5536180812929097, + "grad_norm": 0.29496198892593384, + "learning_rate": 4.183761602285106e-05, + "loss": 1.0301, + "step": 9112 + }, + { + "epoch": 0.5536788383255362, + "grad_norm": 0.1493644416332245, + "learning_rate": 4.182817181709412e-05, + "loss": 1.0194, + "step": 9113 + }, + { + "epoch": 0.5537395953581628, + "grad_norm": 0.1697172373533249, + "learning_rate": 4.181872791087711e-05, + "loss": 1.091, + "step": 9114 + }, + { + "epoch": 0.5538003523907893, + "grad_norm": 0.1491381973028183, + "learning_rate": 4.180928430454622e-05, + "loss": 1.0115, + "step": 9115 + }, + { + "epoch": 0.5538611094234157, + "grad_norm": 0.15601514279842377, + "learning_rate": 4.17998409984476e-05, + "loss": 1.0507, + "step": 9116 + }, + { + "epoch": 0.5539218664560422, + "grad_norm": 0.14003917574882507, + "learning_rate": 4.17903979929274e-05, + "loss": 1.0675, + "step": 9117 + }, + { + "epoch": 0.5539826234886688, + "grad_norm": 0.14665600657463074, + "learning_rate": 4.178095528833174e-05, + "loss": 1.0589, + "step": 9118 + }, + { + "epoch": 0.5540433805212953, + "grad_norm": 0.14360341429710388, + "learning_rate": 4.177151288500674e-05, + "loss": 1.0291, + "step": 9119 + }, + { + "epoch": 0.5541041375539218, + "grad_norm": 0.1472918689250946, + "learning_rate": 4.176207078329853e-05, + "loss": 1.0682, + "step": 9120 + }, + { + "epoch": 0.5541648945865484, + "grad_norm": 0.21565595269203186, + "learning_rate": 4.175262898355322e-05, + "loss": 1.0591, + "step": 9121 + }, + { + "epoch": 0.5542256516191749, + "grad_norm": 0.2663908898830414, + "learning_rate": 4.174318748611687e-05, + "loss": 1.1274, + "step": 9122 + }, + { + "epoch": 0.5542864086518015, + "grad_norm": 0.16110442578792572, + "learning_rate": 4.1733746291335616e-05, + "loss": 1.0243, + "step": 9123 + }, + { + "epoch": 0.554347165684428, + "grad_norm": 0.11592574417591095, + "learning_rate": 4.172430539955546e-05, + "loss": 1.0567, + "step": 9124 + }, + { + "epoch": 0.5544079227170545, + "grad_norm": 0.1892186403274536, + "learning_rate": 4.1714864811122495e-05, + "loss": 1.21, + "step": 9125 + }, + { + "epoch": 0.5544686797496811, + "grad_norm": 0.15835748612880707, + "learning_rate": 4.1705424526382765e-05, + "loss": 1.0296, + "step": 9126 + }, + { + "epoch": 0.5545294367823076, + "grad_norm": 0.7885303497314453, + "learning_rate": 4.1695984545682285e-05, + "loss": 1.0375, + "step": 9127 + }, + { + "epoch": 0.5545901938149341, + "grad_norm": 0.37598615884780884, + "learning_rate": 4.1686544869367114e-05, + "loss": 1.0165, + "step": 9128 + }, + { + "epoch": 0.5546509508475606, + "grad_norm": 0.14977487921714783, + "learning_rate": 4.167710549778324e-05, + "loss": 1.0525, + "step": 9129 + }, + { + "epoch": 0.5547117078801871, + "grad_norm": 0.12446727603673935, + "learning_rate": 4.166766643127669e-05, + "loss": 1.0609, + "step": 9130 + }, + { + "epoch": 0.5547724649128136, + "grad_norm": 1.2196428775787354, + "learning_rate": 4.165822767019343e-05, + "loss": 1.1014, + "step": 9131 + }, + { + "epoch": 0.5548332219454402, + "grad_norm": 0.18719960749149323, + "learning_rate": 4.164878921487943e-05, + "loss": 1.0826, + "step": 9132 + }, + { + "epoch": 0.5548939789780667, + "grad_norm": 0.15722621977329254, + "learning_rate": 4.163935106568069e-05, + "loss": 1.1298, + "step": 9133 + }, + { + "epoch": 0.5549547360106932, + "grad_norm": 0.17909014225006104, + "learning_rate": 4.1629913222943146e-05, + "loss": 1.1346, + "step": 9134 + }, + { + "epoch": 0.5550154930433198, + "grad_norm": 0.14320820569992065, + "learning_rate": 4.162047568701275e-05, + "loss": 1.1234, + "step": 9135 + }, + { + "epoch": 0.5550762500759463, + "grad_norm": 0.1808628886938095, + "learning_rate": 4.1611038458235435e-05, + "loss": 1.2227, + "step": 9136 + }, + { + "epoch": 0.5551370071085728, + "grad_norm": 0.12627863883972168, + "learning_rate": 4.1601601536957144e-05, + "loss": 1.0702, + "step": 9137 + }, + { + "epoch": 0.5551977641411994, + "grad_norm": 0.18002478778362274, + "learning_rate": 4.159216492352377e-05, + "loss": 1.0489, + "step": 9138 + }, + { + "epoch": 0.5552585211738259, + "grad_norm": 0.14645852148532867, + "learning_rate": 4.15827286182812e-05, + "loss": 1.0984, + "step": 9139 + }, + { + "epoch": 0.5553192782064524, + "grad_norm": 0.11596376448869705, + "learning_rate": 4.1573292621575344e-05, + "loss": 1.0727, + "step": 9140 + }, + { + "epoch": 0.555380035239079, + "grad_norm": 0.1481286734342575, + "learning_rate": 4.1563856933752075e-05, + "loss": 1.0694, + "step": 9141 + }, + { + "epoch": 0.5554407922717054, + "grad_norm": 0.16426613926887512, + "learning_rate": 4.1554421555157266e-05, + "loss": 1.047, + "step": 9142 + }, + { + "epoch": 0.5555015493043319, + "grad_norm": 0.1251041740179062, + "learning_rate": 4.1544986486136784e-05, + "loss": 1.0583, + "step": 9143 + }, + { + "epoch": 0.5555623063369585, + "grad_norm": 0.6372851729393005, + "learning_rate": 4.153555172703643e-05, + "loss": 1.0703, + "step": 9144 + }, + { + "epoch": 0.555623063369585, + "grad_norm": 0.6742607951164246, + "learning_rate": 4.152611727820207e-05, + "loss": 1.2313, + "step": 9145 + }, + { + "epoch": 0.5556838204022115, + "grad_norm": 0.2668459117412567, + "learning_rate": 4.151668313997952e-05, + "loss": 1.0941, + "step": 9146 + }, + { + "epoch": 0.5557445774348381, + "grad_norm": 0.1434379518032074, + "learning_rate": 4.150724931271458e-05, + "loss": 1.0497, + "step": 9147 + }, + { + "epoch": 0.5558053344674646, + "grad_norm": 0.18460677564144135, + "learning_rate": 4.149781579675307e-05, + "loss": 1.1759, + "step": 9148 + }, + { + "epoch": 0.5558660915000911, + "grad_norm": 0.33510252833366394, + "learning_rate": 4.148838259244077e-05, + "loss": 1.0058, + "step": 9149 + }, + { + "epoch": 0.5559268485327177, + "grad_norm": 0.14448833465576172, + "learning_rate": 4.147894970012345e-05, + "loss": 1.0497, + "step": 9150 + }, + { + "epoch": 0.5559876055653442, + "grad_norm": 0.1655113399028778, + "learning_rate": 4.146951712014688e-05, + "loss": 1.0536, + "step": 9151 + }, + { + "epoch": 0.5560483625979707, + "grad_norm": 0.12680092453956604, + "learning_rate": 4.146008485285678e-05, + "loss": 1.0814, + "step": 9152 + }, + { + "epoch": 0.5561091196305973, + "grad_norm": 0.24189986288547516, + "learning_rate": 4.145065289859895e-05, + "loss": 1.177, + "step": 9153 + }, + { + "epoch": 0.5561698766632238, + "grad_norm": 0.1563245952129364, + "learning_rate": 4.144122125771908e-05, + "loss": 1.0078, + "step": 9154 + }, + { + "epoch": 0.5562306336958502, + "grad_norm": 0.119784876704216, + "learning_rate": 4.143178993056289e-05, + "loss": 1.066, + "step": 9155 + }, + { + "epoch": 0.5562913907284768, + "grad_norm": 0.17714856564998627, + "learning_rate": 4.1422358917476114e-05, + "loss": 1.1419, + "step": 9156 + }, + { + "epoch": 0.5563521477611033, + "grad_norm": 0.13944889605045319, + "learning_rate": 4.1412928218804434e-05, + "loss": 1.063, + "step": 9157 + }, + { + "epoch": 0.5564129047937298, + "grad_norm": 0.15701355040073395, + "learning_rate": 4.140349783489352e-05, + "loss": 1.1095, + "step": 9158 + }, + { + "epoch": 0.5564736618263564, + "grad_norm": 0.1993296891450882, + "learning_rate": 4.139406776608905e-05, + "loss": 1.1329, + "step": 9159 + }, + { + "epoch": 0.5565344188589829, + "grad_norm": 1.117037296295166, + "learning_rate": 4.138463801273668e-05, + "loss": 1.0801, + "step": 9160 + }, + { + "epoch": 0.5565951758916095, + "grad_norm": 0.14102678000926971, + "learning_rate": 4.1375208575182085e-05, + "loss": 1.1481, + "step": 9161 + }, + { + "epoch": 0.556655932924236, + "grad_norm": 0.19037580490112305, + "learning_rate": 4.136577945377088e-05, + "loss": 1.0346, + "step": 9162 + }, + { + "epoch": 0.5567166899568625, + "grad_norm": 0.48535385727882385, + "learning_rate": 4.13563506488487e-05, + "loss": 1.1743, + "step": 9163 + }, + { + "epoch": 0.556777446989489, + "grad_norm": 0.20781023800373077, + "learning_rate": 4.1346922160761184e-05, + "loss": 1.1715, + "step": 9164 + }, + { + "epoch": 0.5568382040221156, + "grad_norm": 0.15947094559669495, + "learning_rate": 4.133749398985387e-05, + "loss": 1.1245, + "step": 9165 + }, + { + "epoch": 0.5568989610547421, + "grad_norm": 0.17786350846290588, + "learning_rate": 4.1328066136472405e-05, + "loss": 1.2078, + "step": 9166 + }, + { + "epoch": 0.5569597180873687, + "grad_norm": 0.16088789701461792, + "learning_rate": 4.1318638600962346e-05, + "loss": 1.127, + "step": 9167 + }, + { + "epoch": 0.5570204751199951, + "grad_norm": 0.12699060142040253, + "learning_rate": 4.130921138366926e-05, + "loss": 1.0874, + "step": 9168 + }, + { + "epoch": 0.5570812321526216, + "grad_norm": 0.12009410560131073, + "learning_rate": 4.1299784484938726e-05, + "loss": 1.0541, + "step": 9169 + }, + { + "epoch": 0.5571419891852482, + "grad_norm": 0.12249850481748581, + "learning_rate": 4.1290357905116276e-05, + "loss": 1.0293, + "step": 9170 + }, + { + "epoch": 0.5572027462178747, + "grad_norm": 0.22208324074745178, + "learning_rate": 4.1280931644547435e-05, + "loss": 1.1401, + "step": 9171 + }, + { + "epoch": 0.5572635032505012, + "grad_norm": 0.12204176187515259, + "learning_rate": 4.1271505703577726e-05, + "loss": 1.067, + "step": 9172 + }, + { + "epoch": 0.5573242602831278, + "grad_norm": 0.11667222529649734, + "learning_rate": 4.1262080082552656e-05, + "loss": 1.0292, + "step": 9173 + }, + { + "epoch": 0.5573850173157543, + "grad_norm": 0.24544478952884674, + "learning_rate": 4.1252654781817746e-05, + "loss": 1.2436, + "step": 9174 + }, + { + "epoch": 0.5574457743483808, + "grad_norm": 0.17343677580356598, + "learning_rate": 4.124322980171845e-05, + "loss": 1.1382, + "step": 9175 + }, + { + "epoch": 0.5575065313810074, + "grad_norm": 0.19015738368034363, + "learning_rate": 4.1233805142600265e-05, + "loss": 1.0339, + "step": 9176 + }, + { + "epoch": 0.5575672884136339, + "grad_norm": 0.18894977867603302, + "learning_rate": 4.122438080480867e-05, + "loss": 1.0983, + "step": 9177 + }, + { + "epoch": 0.5576280454462604, + "grad_norm": 0.23062144219875336, + "learning_rate": 4.121495678868906e-05, + "loss": 1.0913, + "step": 9178 + }, + { + "epoch": 0.557688802478887, + "grad_norm": 0.18861408531665802, + "learning_rate": 4.1205533094586916e-05, + "loss": 1.1221, + "step": 9179 + }, + { + "epoch": 0.5577495595115135, + "grad_norm": 0.18230387568473816, + "learning_rate": 4.1196109722847666e-05, + "loss": 1.1506, + "step": 9180 + }, + { + "epoch": 0.55781031654414, + "grad_norm": 0.5075845718383789, + "learning_rate": 4.118668667381669e-05, + "loss": 1.0723, + "step": 9181 + }, + { + "epoch": 0.5578710735767665, + "grad_norm": 0.1514529436826706, + "learning_rate": 4.117726394783944e-05, + "loss": 1.0702, + "step": 9182 + }, + { + "epoch": 0.557931830609393, + "grad_norm": 0.27151474356651306, + "learning_rate": 4.116784154526128e-05, + "loss": 1.1091, + "step": 9183 + }, + { + "epoch": 0.5579925876420195, + "grad_norm": 2.193075180053711, + "learning_rate": 4.115841946642761e-05, + "loss": 1.0424, + "step": 9184 + }, + { + "epoch": 0.5580533446746461, + "grad_norm": 0.16766516864299774, + "learning_rate": 4.114899771168377e-05, + "loss": 1.1557, + "step": 9185 + }, + { + "epoch": 0.5581141017072726, + "grad_norm": 0.2055845707654953, + "learning_rate": 4.1139576281375125e-05, + "loss": 1.1078, + "step": 9186 + }, + { + "epoch": 0.5581748587398991, + "grad_norm": 0.1501486748456955, + "learning_rate": 4.1130155175847026e-05, + "loss": 1.0784, + "step": 9187 + }, + { + "epoch": 0.5582356157725257, + "grad_norm": 0.7084944248199463, + "learning_rate": 4.112073439544481e-05, + "loss": 1.0515, + "step": 9188 + }, + { + "epoch": 0.5582963728051522, + "grad_norm": 0.16138140857219696, + "learning_rate": 4.1111313940513776e-05, + "loss": 1.0415, + "step": 9189 + }, + { + "epoch": 0.5583571298377787, + "grad_norm": 0.23819054663181305, + "learning_rate": 4.110189381139928e-05, + "loss": 1.1669, + "step": 9190 + }, + { + "epoch": 0.5584178868704053, + "grad_norm": 0.13890103995800018, + "learning_rate": 4.109247400844656e-05, + "loss": 1.076, + "step": 9191 + }, + { + "epoch": 0.5584786439030318, + "grad_norm": 0.23677076399326324, + "learning_rate": 4.108305453200093e-05, + "loss": 1.1462, + "step": 9192 + }, + { + "epoch": 0.5585394009356583, + "grad_norm": 0.21250909566879272, + "learning_rate": 4.107363538240767e-05, + "loss": 1.15, + "step": 9193 + }, + { + "epoch": 0.5586001579682849, + "grad_norm": 0.11922785639762878, + "learning_rate": 4.106421656001201e-05, + "loss": 0.9903, + "step": 9194 + }, + { + "epoch": 0.5586609150009113, + "grad_norm": 0.1227865144610405, + "learning_rate": 4.105479806515923e-05, + "loss": 1.0515, + "step": 9195 + }, + { + "epoch": 0.5587216720335378, + "grad_norm": 0.19078896939754486, + "learning_rate": 4.104537989819456e-05, + "loss": 1.1567, + "step": 9196 + }, + { + "epoch": 0.5587824290661644, + "grad_norm": 0.24832095205783844, + "learning_rate": 4.103596205946323e-05, + "loss": 1.0544, + "step": 9197 + }, + { + "epoch": 0.5588431860987909, + "grad_norm": 0.2160288244485855, + "learning_rate": 4.102654454931043e-05, + "loss": 1.1281, + "step": 9198 + }, + { + "epoch": 0.5589039431314174, + "grad_norm": 0.1606760025024414, + "learning_rate": 4.101712736808137e-05, + "loss": 1.014, + "step": 9199 + }, + { + "epoch": 0.558964700164044, + "grad_norm": 2.069169282913208, + "learning_rate": 4.100771051612125e-05, + "loss": 1.1125, + "step": 9200 + }, + { + "epoch": 0.5590254571966705, + "grad_norm": 0.28553861379623413, + "learning_rate": 4.0998293993775237e-05, + "loss": 1.1188, + "step": 9201 + }, + { + "epoch": 0.559086214229297, + "grad_norm": 0.21476872265338898, + "learning_rate": 4.098887780138849e-05, + "loss": 1.0517, + "step": 9202 + }, + { + "epoch": 0.5591469712619236, + "grad_norm": 0.2278563678264618, + "learning_rate": 4.097946193930618e-05, + "loss": 1.1718, + "step": 9203 + }, + { + "epoch": 0.5592077282945501, + "grad_norm": 0.9798266291618347, + "learning_rate": 4.0970046407873444e-05, + "loss": 1.2012, + "step": 9204 + }, + { + "epoch": 0.5592684853271767, + "grad_norm": 1.392189621925354, + "learning_rate": 4.0960631207435394e-05, + "loss": 1.0678, + "step": 9205 + }, + { + "epoch": 0.5593292423598032, + "grad_norm": 0.19495056569576263, + "learning_rate": 4.095121633833715e-05, + "loss": 1.1045, + "step": 9206 + }, + { + "epoch": 0.5593899993924297, + "grad_norm": 0.18605756759643555, + "learning_rate": 4.094180180092382e-05, + "loss": 1.0737, + "step": 9207 + }, + { + "epoch": 0.5594507564250562, + "grad_norm": 0.13544100522994995, + "learning_rate": 4.0932387595540496e-05, + "loss": 1.2418, + "step": 9208 + }, + { + "epoch": 0.5595115134576827, + "grad_norm": 0.1522100865840912, + "learning_rate": 4.0922973722532264e-05, + "loss": 1.1121, + "step": 9209 + }, + { + "epoch": 0.5595722704903092, + "grad_norm": 0.17265434563159943, + "learning_rate": 4.091356018224419e-05, + "loss": 1.0583, + "step": 9210 + }, + { + "epoch": 0.5596330275229358, + "grad_norm": 0.18241240084171295, + "learning_rate": 4.090414697502132e-05, + "loss": 1.1417, + "step": 9211 + }, + { + "epoch": 0.5596937845555623, + "grad_norm": 0.13621163368225098, + "learning_rate": 4.089473410120868e-05, + "loss": 1.0356, + "step": 9212 + }, + { + "epoch": 0.5597545415881888, + "grad_norm": 0.20651018619537354, + "learning_rate": 4.088532156115134e-05, + "loss": 1.0771, + "step": 9213 + }, + { + "epoch": 0.5598152986208154, + "grad_norm": 1.1205204725265503, + "learning_rate": 4.08759093551943e-05, + "loss": 1.0865, + "step": 9214 + }, + { + "epoch": 0.5598760556534419, + "grad_norm": 0.15205805003643036, + "learning_rate": 4.0866497483682556e-05, + "loss": 1.0847, + "step": 9215 + }, + { + "epoch": 0.5599368126860684, + "grad_norm": 0.29775378108024597, + "learning_rate": 4.0857085946961115e-05, + "loss": 1.2864, + "step": 9216 + }, + { + "epoch": 0.559997569718695, + "grad_norm": 0.13234169781208038, + "learning_rate": 4.0847674745374974e-05, + "loss": 1.0042, + "step": 9217 + }, + { + "epoch": 0.5600583267513215, + "grad_norm": 0.16491706669330597, + "learning_rate": 4.0838263879269073e-05, + "loss": 1.0903, + "step": 9218 + }, + { + "epoch": 0.560119083783948, + "grad_norm": 0.1890789419412613, + "learning_rate": 4.082885334898838e-05, + "loss": 1.0531, + "step": 9219 + }, + { + "epoch": 0.5601798408165746, + "grad_norm": 0.16434350609779358, + "learning_rate": 4.081944315487783e-05, + "loss": 1.1153, + "step": 9220 + }, + { + "epoch": 0.560240597849201, + "grad_norm": 0.7292875051498413, + "learning_rate": 4.0810033297282375e-05, + "loss": 1.0567, + "step": 9221 + }, + { + "epoch": 0.5603013548818275, + "grad_norm": 0.16977371275424957, + "learning_rate": 4.080062377654692e-05, + "loss": 1.1081, + "step": 9222 + }, + { + "epoch": 0.5603621119144541, + "grad_norm": 0.18402603268623352, + "learning_rate": 4.079121459301638e-05, + "loss": 1.1288, + "step": 9223 + }, + { + "epoch": 0.5604228689470806, + "grad_norm": 0.12298519909381866, + "learning_rate": 4.078180574703567e-05, + "loss": 1.0805, + "step": 9224 + }, + { + "epoch": 0.5604836259797071, + "grad_norm": 0.1397022157907486, + "learning_rate": 4.077239723894963e-05, + "loss": 1.0769, + "step": 9225 + }, + { + "epoch": 0.5605443830123337, + "grad_norm": 0.2981606125831604, + "learning_rate": 4.0762989069103156e-05, + "loss": 1.0848, + "step": 9226 + }, + { + "epoch": 0.5606051400449602, + "grad_norm": 0.1549970656633377, + "learning_rate": 4.075358123784111e-05, + "loss": 1.1055, + "step": 9227 + }, + { + "epoch": 0.5606658970775867, + "grad_norm": 0.19356030225753784, + "learning_rate": 4.074417374550831e-05, + "loss": 1.1577, + "step": 9228 + }, + { + "epoch": 0.5607266541102133, + "grad_norm": 0.278775155544281, + "learning_rate": 4.073476659244963e-05, + "loss": 1.0363, + "step": 9229 + }, + { + "epoch": 0.5607874111428398, + "grad_norm": 0.26943546533584595, + "learning_rate": 4.072535977900988e-05, + "loss": 1.0827, + "step": 9230 + }, + { + "epoch": 0.5608481681754663, + "grad_norm": 0.19701923429965973, + "learning_rate": 4.071595330553385e-05, + "loss": 1.1149, + "step": 9231 + }, + { + "epoch": 0.5609089252080929, + "grad_norm": 0.13717126846313477, + "learning_rate": 4.070654717236634e-05, + "loss": 1.0747, + "step": 9232 + }, + { + "epoch": 0.5609696822407194, + "grad_norm": 0.15929082036018372, + "learning_rate": 4.069714137985214e-05, + "loss": 1.0392, + "step": 9233 + }, + { + "epoch": 0.5610304392733458, + "grad_norm": 0.2341483235359192, + "learning_rate": 4.0687735928336024e-05, + "loss": 1.1599, + "step": 9234 + }, + { + "epoch": 0.5610911963059724, + "grad_norm": 0.24456939101219177, + "learning_rate": 4.067833081816275e-05, + "loss": 0.9986, + "step": 9235 + }, + { + "epoch": 0.5611519533385989, + "grad_norm": 0.1813986897468567, + "learning_rate": 4.066892604967705e-05, + "loss": 1.1335, + "step": 9236 + }, + { + "epoch": 0.5612127103712254, + "grad_norm": 0.13461871445178986, + "learning_rate": 4.06595216232237e-05, + "loss": 1.0855, + "step": 9237 + }, + { + "epoch": 0.561273467403852, + "grad_norm": 0.2357327789068222, + "learning_rate": 4.0650117539147366e-05, + "loss": 1.1281, + "step": 9238 + }, + { + "epoch": 0.5613342244364785, + "grad_norm": 0.9234658479690552, + "learning_rate": 4.064071379779278e-05, + "loss": 1.1923, + "step": 9239 + }, + { + "epoch": 0.561394981469105, + "grad_norm": 0.42659205198287964, + "learning_rate": 4.0631310399504634e-05, + "loss": 1.2212, + "step": 9240 + }, + { + "epoch": 0.5614557385017316, + "grad_norm": 0.15744329988956451, + "learning_rate": 4.062190734462763e-05, + "loss": 1.1453, + "step": 9241 + }, + { + "epoch": 0.5615164955343581, + "grad_norm": 0.22790741920471191, + "learning_rate": 4.061250463350642e-05, + "loss": 1.2416, + "step": 9242 + }, + { + "epoch": 0.5615772525669847, + "grad_norm": 0.20907776057720184, + "learning_rate": 4.060310226648567e-05, + "loss": 1.0399, + "step": 9243 + }, + { + "epoch": 0.5616380095996112, + "grad_norm": 0.16667909920215607, + "learning_rate": 4.059370024391003e-05, + "loss": 1.1031, + "step": 9244 + }, + { + "epoch": 0.5616987666322377, + "grad_norm": 0.3520148992538452, + "learning_rate": 4.058429856612411e-05, + "loss": 1.1062, + "step": 9245 + }, + { + "epoch": 0.5617595236648643, + "grad_norm": 0.21632179617881775, + "learning_rate": 4.057489723347255e-05, + "loss": 1.126, + "step": 9246 + }, + { + "epoch": 0.5618202806974907, + "grad_norm": 0.2084217667579651, + "learning_rate": 4.056549624629996e-05, + "loss": 1.0224, + "step": 9247 + }, + { + "epoch": 0.5618810377301172, + "grad_norm": 0.2183791548013687, + "learning_rate": 4.055609560495092e-05, + "loss": 1.0627, + "step": 9248 + }, + { + "epoch": 0.5619417947627438, + "grad_norm": 8.142712593078613, + "learning_rate": 4.054669530977002e-05, + "loss": 1.0978, + "step": 9249 + }, + { + "epoch": 0.5620025517953703, + "grad_norm": 0.13680914044380188, + "learning_rate": 4.053729536110184e-05, + "loss": 1.0367, + "step": 9250 + }, + { + "epoch": 0.5620633088279968, + "grad_norm": 0.21016928553581238, + "learning_rate": 4.052789575929094e-05, + "loss": 1.0802, + "step": 9251 + }, + { + "epoch": 0.5621240658606234, + "grad_norm": 0.13444884121418, + "learning_rate": 4.051849650468184e-05, + "loss": 1.0312, + "step": 9252 + }, + { + "epoch": 0.5621848228932499, + "grad_norm": 0.5965390205383301, + "learning_rate": 4.050909759761907e-05, + "loss": 1.0374, + "step": 9253 + }, + { + "epoch": 0.5622455799258764, + "grad_norm": 0.23923075199127197, + "learning_rate": 4.049969903844717e-05, + "loss": 1.1398, + "step": 9254 + }, + { + "epoch": 0.562306336958503, + "grad_norm": 0.15571463108062744, + "learning_rate": 4.049030082751064e-05, + "loss": 1.0744, + "step": 9255 + }, + { + "epoch": 0.5623670939911295, + "grad_norm": 0.19940346479415894, + "learning_rate": 4.048090296515396e-05, + "loss": 1.1872, + "step": 9256 + }, + { + "epoch": 0.562427851023756, + "grad_norm": 0.1466224640607834, + "learning_rate": 4.047150545172166e-05, + "loss": 1.0977, + "step": 9257 + }, + { + "epoch": 0.5624886080563826, + "grad_norm": 0.1474834829568863, + "learning_rate": 4.046210828755813e-05, + "loss": 1.1366, + "step": 9258 + }, + { + "epoch": 0.5625493650890091, + "grad_norm": 0.13714270293712616, + "learning_rate": 4.0452711473007873e-05, + "loss": 1.0169, + "step": 9259 + }, + { + "epoch": 0.5626101221216355, + "grad_norm": 0.15613627433776855, + "learning_rate": 4.0443315008415334e-05, + "loss": 1.1062, + "step": 9260 + }, + { + "epoch": 0.5626708791542621, + "grad_norm": 1.8506195545196533, + "learning_rate": 4.043391889412491e-05, + "loss": 1.0487, + "step": 9261 + }, + { + "epoch": 0.5627316361868886, + "grad_norm": 0.1302403211593628, + "learning_rate": 4.042452313048105e-05, + "loss": 1.0356, + "step": 9262 + }, + { + "epoch": 0.5627923932195151, + "grad_norm": 0.21262824535369873, + "learning_rate": 4.041512771782815e-05, + "loss": 1.2119, + "step": 9263 + }, + { + "epoch": 0.5628531502521417, + "grad_norm": 0.15733003616333008, + "learning_rate": 4.040573265651061e-05, + "loss": 1.1157, + "step": 9264 + }, + { + "epoch": 0.5629139072847682, + "grad_norm": 0.24077053368091583, + "learning_rate": 4.0396337946872774e-05, + "loss": 1.1276, + "step": 9265 + }, + { + "epoch": 0.5629746643173947, + "grad_norm": 0.16773982346057892, + "learning_rate": 4.038694358925902e-05, + "loss": 1.0722, + "step": 9266 + }, + { + "epoch": 0.5630354213500213, + "grad_norm": 0.15182141959667206, + "learning_rate": 4.037754958401372e-05, + "loss": 1.0635, + "step": 9267 + }, + { + "epoch": 0.5630961783826478, + "grad_norm": 0.4050828218460083, + "learning_rate": 4.036815593148119e-05, + "loss": 1.1695, + "step": 9268 + }, + { + "epoch": 0.5631569354152743, + "grad_norm": 0.12399948388338089, + "learning_rate": 4.035876263200577e-05, + "loss": 1.0449, + "step": 9269 + }, + { + "epoch": 0.5632176924479009, + "grad_norm": 0.12388220429420471, + "learning_rate": 4.034936968593177e-05, + "loss": 1.0941, + "step": 9270 + }, + { + "epoch": 0.5632784494805274, + "grad_norm": 0.16216684877872467, + "learning_rate": 4.033997709360351e-05, + "loss": 1.0491, + "step": 9271 + }, + { + "epoch": 0.563339206513154, + "grad_norm": 0.5428144931793213, + "learning_rate": 4.033058485536524e-05, + "loss": 1.1478, + "step": 9272 + }, + { + "epoch": 0.5633999635457804, + "grad_norm": 0.1771230399608612, + "learning_rate": 4.0321192971561245e-05, + "loss": 1.1018, + "step": 9273 + }, + { + "epoch": 0.5634607205784069, + "grad_norm": 9.403618812561035, + "learning_rate": 4.031180144253579e-05, + "loss": 1.0903, + "step": 9274 + }, + { + "epoch": 0.5635214776110334, + "grad_norm": 0.2488657385110855, + "learning_rate": 4.0302410268633134e-05, + "loss": 1.0739, + "step": 9275 + }, + { + "epoch": 0.56358223464366, + "grad_norm": 0.14782527089118958, + "learning_rate": 4.02930194501975e-05, + "loss": 1.1432, + "step": 9276 + }, + { + "epoch": 0.5636429916762865, + "grad_norm": 0.14933501183986664, + "learning_rate": 4.028362898757313e-05, + "loss": 1.0251, + "step": 9277 + }, + { + "epoch": 0.563703748708913, + "grad_norm": 0.13341885805130005, + "learning_rate": 4.027423888110421e-05, + "loss": 1.0581, + "step": 9278 + }, + { + "epoch": 0.5637645057415396, + "grad_norm": 0.3196311891078949, + "learning_rate": 4.026484913113493e-05, + "loss": 1.0843, + "step": 9279 + }, + { + "epoch": 0.5638252627741661, + "grad_norm": 1.8453270196914673, + "learning_rate": 4.0255459738009496e-05, + "loss": 1.1265, + "step": 9280 + }, + { + "epoch": 0.5638860198067926, + "grad_norm": 0.15659084916114807, + "learning_rate": 4.0246070702072066e-05, + "loss": 1.0409, + "step": 9281 + }, + { + "epoch": 0.5639467768394192, + "grad_norm": 0.125941663980484, + "learning_rate": 4.0236682023666796e-05, + "loss": 1.0557, + "step": 9282 + }, + { + "epoch": 0.5640075338720457, + "grad_norm": 0.1870090216398239, + "learning_rate": 4.022729370313784e-05, + "loss": 1.162, + "step": 9283 + }, + { + "epoch": 0.5640682909046723, + "grad_norm": 0.17505106329917908, + "learning_rate": 4.0217905740829334e-05, + "loss": 0.9839, + "step": 9284 + }, + { + "epoch": 0.5641290479372988, + "grad_norm": 1.1048821210861206, + "learning_rate": 4.020851813708537e-05, + "loss": 1.026, + "step": 9285 + }, + { + "epoch": 0.5641898049699253, + "grad_norm": 0.15395544469356537, + "learning_rate": 4.0199130892250075e-05, + "loss": 1.1568, + "step": 9286 + }, + { + "epoch": 0.5642505620025517, + "grad_norm": 0.12581133842468262, + "learning_rate": 4.018974400666751e-05, + "loss": 1.0514, + "step": 9287 + }, + { + "epoch": 0.5643113190351783, + "grad_norm": 0.2663700580596924, + "learning_rate": 4.0180357480681794e-05, + "loss": 1.1256, + "step": 9288 + }, + { + "epoch": 0.5643720760678048, + "grad_norm": 0.19487228989601135, + "learning_rate": 4.0170971314636974e-05, + "loss": 1.2091, + "step": 9289 + }, + { + "epoch": 0.5644328331004314, + "grad_norm": 0.17007489502429962, + "learning_rate": 4.0161585508877084e-05, + "loss": 1.1693, + "step": 9290 + }, + { + "epoch": 0.5644935901330579, + "grad_norm": 1.9008747339248657, + "learning_rate": 4.015220006374621e-05, + "loss": 1.0776, + "step": 9291 + }, + { + "epoch": 0.5645543471656844, + "grad_norm": 0.13507528603076935, + "learning_rate": 4.0142814979588314e-05, + "loss": 1.0141, + "step": 9292 + }, + { + "epoch": 0.564615104198311, + "grad_norm": 0.18734753131866455, + "learning_rate": 4.0133430256747456e-05, + "loss": 1.1283, + "step": 9293 + }, + { + "epoch": 0.5646758612309375, + "grad_norm": 0.1442510187625885, + "learning_rate": 4.012404589556762e-05, + "loss": 1.0857, + "step": 9294 + }, + { + "epoch": 0.564736618263564, + "grad_norm": 0.1793176680803299, + "learning_rate": 4.011466189639277e-05, + "loss": 1.0161, + "step": 9295 + }, + { + "epoch": 0.5647973752961906, + "grad_norm": 0.17933893203735352, + "learning_rate": 4.010527825956692e-05, + "loss": 1.1322, + "step": 9296 + }, + { + "epoch": 0.5648581323288171, + "grad_norm": 0.20198547840118408, + "learning_rate": 4.009589498543401e-05, + "loss": 1.0565, + "step": 9297 + }, + { + "epoch": 0.5649188893614436, + "grad_norm": 0.28964295983314514, + "learning_rate": 4.008651207433798e-05, + "loss": 1.1322, + "step": 9298 + }, + { + "epoch": 0.5649796463940702, + "grad_norm": 0.3297141194343567, + "learning_rate": 4.007712952662277e-05, + "loss": 1.1392, + "step": 9299 + }, + { + "epoch": 0.5650404034266966, + "grad_norm": 0.18128494918346405, + "learning_rate": 4.006774734263227e-05, + "loss": 1.1262, + "step": 9300 + }, + { + "epoch": 0.5651011604593231, + "grad_norm": 0.25018155574798584, + "learning_rate": 4.0058365522710435e-05, + "loss": 1.0916, + "step": 9301 + }, + { + "epoch": 0.5651619174919497, + "grad_norm": 0.2777166962623596, + "learning_rate": 4.0048984067201126e-05, + "loss": 1.3092, + "step": 9302 + }, + { + "epoch": 0.5652226745245762, + "grad_norm": 0.16668292880058289, + "learning_rate": 4.003960297644822e-05, + "loss": 1.0368, + "step": 9303 + }, + { + "epoch": 0.5652834315572027, + "grad_norm": 0.214456707239151, + "learning_rate": 4.003022225079563e-05, + "loss": 1.1412, + "step": 9304 + }, + { + "epoch": 0.5653441885898293, + "grad_norm": 0.14797136187553406, + "learning_rate": 4.002084189058714e-05, + "loss": 1.0916, + "step": 9305 + }, + { + "epoch": 0.5654049456224558, + "grad_norm": 0.12118365615606308, + "learning_rate": 4.001146189616662e-05, + "loss": 1.0001, + "step": 9306 + }, + { + "epoch": 0.5654657026550823, + "grad_norm": 0.13804593682289124, + "learning_rate": 4.00020822678779e-05, + "loss": 1.0719, + "step": 9307 + }, + { + "epoch": 0.5655264596877089, + "grad_norm": 0.1874879151582718, + "learning_rate": 3.999270300606477e-05, + "loss": 1.1724, + "step": 9308 + }, + { + "epoch": 0.5655872167203354, + "grad_norm": 0.11761140078306198, + "learning_rate": 3.9983324111071064e-05, + "loss": 1.0454, + "step": 9309 + }, + { + "epoch": 0.5656479737529619, + "grad_norm": 0.8463904857635498, + "learning_rate": 3.9973945583240545e-05, + "loss": 1.0446, + "step": 9310 + }, + { + "epoch": 0.5657087307855885, + "grad_norm": 0.30176177620887756, + "learning_rate": 3.9964567422917e-05, + "loss": 1.1424, + "step": 9311 + }, + { + "epoch": 0.565769487818215, + "grad_norm": 0.18117481470108032, + "learning_rate": 3.995518963044417e-05, + "loss": 1.1793, + "step": 9312 + }, + { + "epoch": 0.5658302448508414, + "grad_norm": 0.20154625177383423, + "learning_rate": 3.994581220616579e-05, + "loss": 1.0541, + "step": 9313 + }, + { + "epoch": 0.565891001883468, + "grad_norm": 0.15985903143882751, + "learning_rate": 3.993643515042562e-05, + "loss": 1.0832, + "step": 9314 + }, + { + "epoch": 0.5659517589160945, + "grad_norm": 0.4219246506690979, + "learning_rate": 3.992705846356737e-05, + "loss": 1.0448, + "step": 9315 + }, + { + "epoch": 0.566012515948721, + "grad_norm": 0.13238303363323212, + "learning_rate": 3.991768214593472e-05, + "loss": 1.0338, + "step": 9316 + }, + { + "epoch": 0.5660732729813476, + "grad_norm": 0.15834259986877441, + "learning_rate": 3.990830619787139e-05, + "loss": 1.0897, + "step": 9317 + }, + { + "epoch": 0.5661340300139741, + "grad_norm": 7.561474800109863, + "learning_rate": 3.9898930619721065e-05, + "loss": 1.0701, + "step": 9318 + }, + { + "epoch": 0.5661947870466006, + "grad_norm": 0.3598170876502991, + "learning_rate": 3.9889555411827374e-05, + "loss": 1.0967, + "step": 9319 + }, + { + "epoch": 0.5662555440792272, + "grad_norm": 0.23852452635765076, + "learning_rate": 3.988018057453398e-05, + "loss": 1.0428, + "step": 9320 + }, + { + "epoch": 0.5663163011118537, + "grad_norm": 0.13089148700237274, + "learning_rate": 3.987080610818452e-05, + "loss": 1.0212, + "step": 9321 + }, + { + "epoch": 0.5663770581444802, + "grad_norm": 0.15455998480319977, + "learning_rate": 3.986143201312263e-05, + "loss": 1.0467, + "step": 9322 + }, + { + "epoch": 0.5664378151771068, + "grad_norm": 0.1878483146429062, + "learning_rate": 3.9852058289691905e-05, + "loss": 1.0567, + "step": 9323 + }, + { + "epoch": 0.5664985722097333, + "grad_norm": 0.21425405144691467, + "learning_rate": 3.9842684938235966e-05, + "loss": 1.0102, + "step": 9324 + }, + { + "epoch": 0.5665593292423599, + "grad_norm": 0.16678059101104736, + "learning_rate": 3.9833311959098354e-05, + "loss": 1.0462, + "step": 9325 + }, + { + "epoch": 0.5666200862749863, + "grad_norm": 0.45588287711143494, + "learning_rate": 3.982393935262264e-05, + "loss": 1.2509, + "step": 9326 + }, + { + "epoch": 0.5666808433076128, + "grad_norm": 0.14263463020324707, + "learning_rate": 3.9814567119152416e-05, + "loss": 1.0678, + "step": 9327 + }, + { + "epoch": 0.5667416003402393, + "grad_norm": 0.13871221244335175, + "learning_rate": 3.98051952590312e-05, + "loss": 1.1122, + "step": 9328 + }, + { + "epoch": 0.5668023573728659, + "grad_norm": 0.41435712575912476, + "learning_rate": 3.979582377260252e-05, + "loss": 1.0951, + "step": 9329 + }, + { + "epoch": 0.5668631144054924, + "grad_norm": 0.15884922444820404, + "learning_rate": 3.9786452660209895e-05, + "loss": 1.1005, + "step": 9330 + }, + { + "epoch": 0.566923871438119, + "grad_norm": 0.3118976950645447, + "learning_rate": 3.977708192219683e-05, + "loss": 1.1567, + "step": 9331 + }, + { + "epoch": 0.5669846284707455, + "grad_norm": 0.13443267345428467, + "learning_rate": 3.97677115589068e-05, + "loss": 1.0394, + "step": 9332 + }, + { + "epoch": 0.567045385503372, + "grad_norm": 0.13821937143802643, + "learning_rate": 3.9758341570683265e-05, + "loss": 1.0507, + "step": 9333 + }, + { + "epoch": 0.5671061425359986, + "grad_norm": 0.16394339501857758, + "learning_rate": 3.974897195786971e-05, + "loss": 1.0217, + "step": 9334 + }, + { + "epoch": 0.5671668995686251, + "grad_norm": 0.20001637935638428, + "learning_rate": 3.9739602720809575e-05, + "loss": 1.1347, + "step": 9335 + }, + { + "epoch": 0.5672276566012516, + "grad_norm": 0.15204745531082153, + "learning_rate": 3.9730233859846284e-05, + "loss": 1.0489, + "step": 9336 + }, + { + "epoch": 0.5672884136338782, + "grad_norm": 0.27701836824417114, + "learning_rate": 3.972086537532324e-05, + "loss": 1.0803, + "step": 9337 + }, + { + "epoch": 0.5673491706665047, + "grad_norm": 0.28854477405548096, + "learning_rate": 3.971149726758389e-05, + "loss": 1.1778, + "step": 9338 + }, + { + "epoch": 0.5674099276991311, + "grad_norm": 0.15160194039344788, + "learning_rate": 3.9702129536971577e-05, + "loss": 1.1792, + "step": 9339 + }, + { + "epoch": 0.5674706847317577, + "grad_norm": 0.1626320481300354, + "learning_rate": 3.96927621838297e-05, + "loss": 1.067, + "step": 9340 + }, + { + "epoch": 0.5675314417643842, + "grad_norm": 0.21114283800125122, + "learning_rate": 3.968339520850161e-05, + "loss": 1.1389, + "step": 9341 + }, + { + "epoch": 0.5675921987970107, + "grad_norm": 0.13761281967163086, + "learning_rate": 3.967402861133068e-05, + "loss": 1.0398, + "step": 9342 + }, + { + "epoch": 0.5676529558296373, + "grad_norm": 0.1349709928035736, + "learning_rate": 3.966466239266021e-05, + "loss": 1.1146, + "step": 9343 + }, + { + "epoch": 0.5677137128622638, + "grad_norm": 0.14251908659934998, + "learning_rate": 3.9655296552833565e-05, + "loss": 1.0173, + "step": 9344 + }, + { + "epoch": 0.5677744698948903, + "grad_norm": 0.15189701318740845, + "learning_rate": 3.9645931092194004e-05, + "loss": 1.0556, + "step": 9345 + }, + { + "epoch": 0.5678352269275169, + "grad_norm": 0.16033907234668732, + "learning_rate": 3.963656601108483e-05, + "loss": 1.0844, + "step": 9346 + }, + { + "epoch": 0.5678959839601434, + "grad_norm": 0.12061890214681625, + "learning_rate": 3.962720130984935e-05, + "loss": 1.0828, + "step": 9347 + }, + { + "epoch": 0.5679567409927699, + "grad_norm": 0.3561362028121948, + "learning_rate": 3.96178369888308e-05, + "loss": 1.1843, + "step": 9348 + }, + { + "epoch": 0.5680174980253965, + "grad_norm": 0.20875217020511627, + "learning_rate": 3.9608473048372444e-05, + "loss": 1.111, + "step": 9349 + }, + { + "epoch": 0.568078255058023, + "grad_norm": 0.22136706113815308, + "learning_rate": 3.959910948881751e-05, + "loss": 1.0714, + "step": 9350 + }, + { + "epoch": 0.5681390120906495, + "grad_norm": 0.14425520598888397, + "learning_rate": 3.958974631050926e-05, + "loss": 1.067, + "step": 9351 + }, + { + "epoch": 0.568199769123276, + "grad_norm": 0.1793167144060135, + "learning_rate": 3.9580383513790845e-05, + "loss": 1.0601, + "step": 9352 + }, + { + "epoch": 0.5682605261559025, + "grad_norm": 0.1604180485010147, + "learning_rate": 3.9571021099005495e-05, + "loss": 1.0436, + "step": 9353 + }, + { + "epoch": 0.568321283188529, + "grad_norm": 0.12525637447834015, + "learning_rate": 3.956165906649637e-05, + "loss": 1.1014, + "step": 9354 + }, + { + "epoch": 0.5683820402211556, + "grad_norm": 0.3861655294895172, + "learning_rate": 3.955229741660665e-05, + "loss": 1.1394, + "step": 9355 + }, + { + "epoch": 0.5684427972537821, + "grad_norm": 0.3664751946926117, + "learning_rate": 3.95429361496795e-05, + "loss": 1.2092, + "step": 9356 + }, + { + "epoch": 0.5685035542864086, + "grad_norm": 0.1381242573261261, + "learning_rate": 3.953357526605803e-05, + "loss": 1.0252, + "step": 9357 + }, + { + "epoch": 0.5685643113190352, + "grad_norm": 0.11836809664964676, + "learning_rate": 3.952421476608542e-05, + "loss": 1.0592, + "step": 9358 + }, + { + "epoch": 0.5686250683516617, + "grad_norm": 0.15251173079013824, + "learning_rate": 3.95148546501047e-05, + "loss": 1.1237, + "step": 9359 + }, + { + "epoch": 0.5686858253842882, + "grad_norm": 0.27714094519615173, + "learning_rate": 3.9505494918459034e-05, + "loss": 1.0587, + "step": 9360 + }, + { + "epoch": 0.5687465824169148, + "grad_norm": 0.1492842435836792, + "learning_rate": 3.9496135571491475e-05, + "loss": 1.0524, + "step": 9361 + }, + { + "epoch": 0.5688073394495413, + "grad_norm": 0.13584201037883759, + "learning_rate": 3.9486776609545086e-05, + "loss": 1.1166, + "step": 9362 + }, + { + "epoch": 0.5688680964821679, + "grad_norm": 0.20638935267925262, + "learning_rate": 3.947741803296294e-05, + "loss": 1.1507, + "step": 9363 + }, + { + "epoch": 0.5689288535147944, + "grad_norm": 0.16127735376358032, + "learning_rate": 3.946805984208808e-05, + "loss": 1.049, + "step": 9364 + }, + { + "epoch": 0.5689896105474208, + "grad_norm": 0.3948340117931366, + "learning_rate": 3.9458702037263515e-05, + "loss": 1.0238, + "step": 9365 + }, + { + "epoch": 0.5690503675800473, + "grad_norm": 0.13667432963848114, + "learning_rate": 3.9449344618832265e-05, + "loss": 1.1048, + "step": 9366 + }, + { + "epoch": 0.5691111246126739, + "grad_norm": 0.14048372209072113, + "learning_rate": 3.943998758713732e-05, + "loss": 1.061, + "step": 9367 + }, + { + "epoch": 0.5691718816453004, + "grad_norm": 0.2314092516899109, + "learning_rate": 3.943063094252167e-05, + "loss": 1.2434, + "step": 9368 + }, + { + "epoch": 0.569232638677927, + "grad_norm": 0.20214729011058807, + "learning_rate": 3.94212746853283e-05, + "loss": 1.0621, + "step": 9369 + }, + { + "epoch": 0.5692933957105535, + "grad_norm": 0.20059621334075928, + "learning_rate": 3.9411918815900126e-05, + "loss": 1.0898, + "step": 9370 + }, + { + "epoch": 0.56935415274318, + "grad_norm": 0.2255912721157074, + "learning_rate": 3.9402563334580155e-05, + "loss": 1.2409, + "step": 9371 + }, + { + "epoch": 0.5694149097758066, + "grad_norm": 0.1617220938205719, + "learning_rate": 3.939320824171124e-05, + "loss": 1.1072, + "step": 9372 + }, + { + "epoch": 0.5694756668084331, + "grad_norm": 0.19971393048763275, + "learning_rate": 3.938385353763634e-05, + "loss": 1.0481, + "step": 9373 + }, + { + "epoch": 0.5695364238410596, + "grad_norm": 0.16343924403190613, + "learning_rate": 3.937449922269834e-05, + "loss": 1.0222, + "step": 9374 + }, + { + "epoch": 0.5695971808736862, + "grad_norm": 0.18706795573234558, + "learning_rate": 3.9365145297240114e-05, + "loss": 1.1051, + "step": 9375 + }, + { + "epoch": 0.5696579379063127, + "grad_norm": 0.16287298500537872, + "learning_rate": 3.935579176160455e-05, + "loss": 1.1133, + "step": 9376 + }, + { + "epoch": 0.5697186949389392, + "grad_norm": 0.35146385431289673, + "learning_rate": 3.93464386161345e-05, + "loss": 1.1841, + "step": 9377 + }, + { + "epoch": 0.5697794519715657, + "grad_norm": 0.11296513676643372, + "learning_rate": 3.933708586117281e-05, + "loss": 1.0019, + "step": 9378 + }, + { + "epoch": 0.5698402090041922, + "grad_norm": 0.17266689240932465, + "learning_rate": 3.9327733497062294e-05, + "loss": 1.049, + "step": 9379 + }, + { + "epoch": 0.5699009660368187, + "grad_norm": 0.2204691767692566, + "learning_rate": 3.931838152414575e-05, + "loss": 1.091, + "step": 9380 + }, + { + "epoch": 0.5699617230694453, + "grad_norm": 0.19983020424842834, + "learning_rate": 3.930902994276601e-05, + "loss": 1.082, + "step": 9381 + }, + { + "epoch": 0.5700224801020718, + "grad_norm": 0.20253503322601318, + "learning_rate": 3.929967875326584e-05, + "loss": 1.0606, + "step": 9382 + }, + { + "epoch": 0.5700832371346983, + "grad_norm": 0.1941344290971756, + "learning_rate": 3.9290327955988014e-05, + "loss": 1.1429, + "step": 9383 + }, + { + "epoch": 0.5701439941673249, + "grad_norm": 0.3806479573249817, + "learning_rate": 3.9280977551275296e-05, + "loss": 1.1604, + "step": 9384 + }, + { + "epoch": 0.5702047511999514, + "grad_norm": 0.2293899804353714, + "learning_rate": 3.9271627539470425e-05, + "loss": 1.0977, + "step": 9385 + }, + { + "epoch": 0.5702655082325779, + "grad_norm": 0.20892605185508728, + "learning_rate": 3.926227792091611e-05, + "loss": 1.0988, + "step": 9386 + }, + { + "epoch": 0.5703262652652045, + "grad_norm": 0.14298801124095917, + "learning_rate": 3.925292869595507e-05, + "loss": 1.0489, + "step": 9387 + }, + { + "epoch": 0.570387022297831, + "grad_norm": 0.29442140460014343, + "learning_rate": 3.9243579864929995e-05, + "loss": 1.0525, + "step": 9388 + }, + { + "epoch": 0.5704477793304575, + "grad_norm": 0.23393961787223816, + "learning_rate": 3.9234231428183594e-05, + "loss": 1.0799, + "step": 9389 + }, + { + "epoch": 0.5705085363630841, + "grad_norm": 0.16348634660243988, + "learning_rate": 3.922488338605852e-05, + "loss": 1.0522, + "step": 9390 + }, + { + "epoch": 0.5705692933957106, + "grad_norm": 0.5351850986480713, + "learning_rate": 3.9215535738897435e-05, + "loss": 1.0977, + "step": 9391 + }, + { + "epoch": 0.570630050428337, + "grad_norm": 0.18496620655059814, + "learning_rate": 3.920618848704297e-05, + "loss": 1.1478, + "step": 9392 + }, + { + "epoch": 0.5706908074609636, + "grad_norm": 0.20554541051387787, + "learning_rate": 3.919684163083773e-05, + "loss": 1.1469, + "step": 9393 + }, + { + "epoch": 0.5707515644935901, + "grad_norm": 0.43232595920562744, + "learning_rate": 3.918749517062437e-05, + "loss": 1.2087, + "step": 9394 + }, + { + "epoch": 0.5708123215262166, + "grad_norm": 0.2660805583000183, + "learning_rate": 3.9178149106745454e-05, + "loss": 1.0644, + "step": 9395 + }, + { + "epoch": 0.5708730785588432, + "grad_norm": 0.1719880998134613, + "learning_rate": 3.9168803439543575e-05, + "loss": 1.05, + "step": 9396 + }, + { + "epoch": 0.5709338355914697, + "grad_norm": 0.1952056735754013, + "learning_rate": 3.91594581693613e-05, + "loss": 1.1386, + "step": 9397 + }, + { + "epoch": 0.5709945926240962, + "grad_norm": 0.18509529531002045, + "learning_rate": 3.91501132965412e-05, + "loss": 1.052, + "step": 9398 + }, + { + "epoch": 0.5710553496567228, + "grad_norm": 0.20501284301280975, + "learning_rate": 3.9140768821425775e-05, + "loss": 1.0618, + "step": 9399 + }, + { + "epoch": 0.5711161066893493, + "grad_norm": 0.11503874510526657, + "learning_rate": 3.9131424744357574e-05, + "loss": 1.0255, + "step": 9400 + }, + { + "epoch": 0.5711768637219758, + "grad_norm": 0.23324188590049744, + "learning_rate": 3.912208106567909e-05, + "loss": 1.1138, + "step": 9401 + }, + { + "epoch": 0.5712376207546024, + "grad_norm": 0.12903693318367004, + "learning_rate": 3.911273778573284e-05, + "loss": 1.0259, + "step": 9402 + }, + { + "epoch": 0.5712983777872289, + "grad_norm": 0.23123428225517273, + "learning_rate": 3.910339490486128e-05, + "loss": 1.1218, + "step": 9403 + }, + { + "epoch": 0.5713591348198555, + "grad_norm": 0.18961231410503387, + "learning_rate": 3.9094052423406886e-05, + "loss": 1.0625, + "step": 9404 + }, + { + "epoch": 0.5714198918524819, + "grad_norm": 0.13013260066509247, + "learning_rate": 3.908471034171214e-05, + "loss": 1.0603, + "step": 9405 + }, + { + "epoch": 0.5714806488851084, + "grad_norm": 0.13224995136260986, + "learning_rate": 3.907536866011941e-05, + "loss": 1.1031, + "step": 9406 + }, + { + "epoch": 0.571541405917735, + "grad_norm": 0.1757730394601822, + "learning_rate": 3.906602737897116e-05, + "loss": 1.0995, + "step": 9407 + }, + { + "epoch": 0.5716021629503615, + "grad_norm": 1.349341630935669, + "learning_rate": 3.9056686498609803e-05, + "loss": 1.2023, + "step": 9408 + }, + { + "epoch": 0.571662919982988, + "grad_norm": 0.13470570743083954, + "learning_rate": 3.9047346019377695e-05, + "loss": 1.0785, + "step": 9409 + }, + { + "epoch": 0.5717236770156146, + "grad_norm": 0.1447269320487976, + "learning_rate": 3.903800594161725e-05, + "loss": 1.1064, + "step": 9410 + }, + { + "epoch": 0.5717844340482411, + "grad_norm": 0.22651201486587524, + "learning_rate": 3.9028666265670835e-05, + "loss": 1.0325, + "step": 9411 + }, + { + "epoch": 0.5718451910808676, + "grad_norm": 0.14072097837924957, + "learning_rate": 3.901932699188076e-05, + "loss": 1.0215, + "step": 9412 + }, + { + "epoch": 0.5719059481134942, + "grad_norm": 0.15580034255981445, + "learning_rate": 3.900998812058937e-05, + "loss": 1.0087, + "step": 9413 + }, + { + "epoch": 0.5719667051461207, + "grad_norm": 0.13120998442173004, + "learning_rate": 3.900064965213899e-05, + "loss": 1.0517, + "step": 9414 + }, + { + "epoch": 0.5720274621787472, + "grad_norm": 0.16143788397312164, + "learning_rate": 3.8991311586871925e-05, + "loss": 1.1034, + "step": 9415 + }, + { + "epoch": 0.5720882192113738, + "grad_norm": 0.1284245252609253, + "learning_rate": 3.898197392513047e-05, + "loss": 1.0483, + "step": 9416 + }, + { + "epoch": 0.5721489762440003, + "grad_norm": 0.1260843425989151, + "learning_rate": 3.897263666725688e-05, + "loss": 1.0085, + "step": 9417 + }, + { + "epoch": 0.5722097332766267, + "grad_norm": 0.14079874753952026, + "learning_rate": 3.896329981359345e-05, + "loss": 1.0296, + "step": 9418 + }, + { + "epoch": 0.5722704903092533, + "grad_norm": 0.15053793787956238, + "learning_rate": 3.895396336448237e-05, + "loss": 1.0672, + "step": 9419 + }, + { + "epoch": 0.5723312473418798, + "grad_norm": 0.6577506065368652, + "learning_rate": 3.894462732026591e-05, + "loss": 1.2259, + "step": 9420 + }, + { + "epoch": 0.5723920043745063, + "grad_norm": 0.1847575306892395, + "learning_rate": 3.893529168128627e-05, + "loss": 1.1215, + "step": 9421 + }, + { + "epoch": 0.5724527614071329, + "grad_norm": 0.2351631075143814, + "learning_rate": 3.8925956447885645e-05, + "loss": 1.1323, + "step": 9422 + }, + { + "epoch": 0.5725135184397594, + "grad_norm": 0.19972263276576996, + "learning_rate": 3.8916621620406236e-05, + "loss": 1.0736, + "step": 9423 + }, + { + "epoch": 0.5725742754723859, + "grad_norm": 0.13498350977897644, + "learning_rate": 3.89072871991902e-05, + "loss": 1.0143, + "step": 9424 + }, + { + "epoch": 0.5726350325050125, + "grad_norm": 0.30430951714515686, + "learning_rate": 3.889795318457971e-05, + "loss": 1.2341, + "step": 9425 + }, + { + "epoch": 0.572695789537639, + "grad_norm": 0.14694227278232574, + "learning_rate": 3.888861957691688e-05, + "loss": 1.0595, + "step": 9426 + }, + { + "epoch": 0.5727565465702655, + "grad_norm": 0.257284551858902, + "learning_rate": 3.8879286376543835e-05, + "loss": 1.173, + "step": 9427 + }, + { + "epoch": 0.5728173036028921, + "grad_norm": 0.4094320237636566, + "learning_rate": 3.886995358380272e-05, + "loss": 1.0723, + "step": 9428 + }, + { + "epoch": 0.5728780606355186, + "grad_norm": 0.15119077265262604, + "learning_rate": 3.886062119903559e-05, + "loss": 1.0735, + "step": 9429 + }, + { + "epoch": 0.5729388176681451, + "grad_norm": 0.15245527029037476, + "learning_rate": 3.885128922258454e-05, + "loss": 1.0173, + "step": 9430 + }, + { + "epoch": 0.5729995747007716, + "grad_norm": 0.17560645937919617, + "learning_rate": 3.884195765479166e-05, + "loss": 1.0854, + "step": 9431 + }, + { + "epoch": 0.5730603317333981, + "grad_norm": 1.8061186075210571, + "learning_rate": 3.8832626495998976e-05, + "loss": 1.044, + "step": 9432 + }, + { + "epoch": 0.5731210887660246, + "grad_norm": 0.15292610228061676, + "learning_rate": 3.882329574654852e-05, + "loss": 1.0392, + "step": 9433 + }, + { + "epoch": 0.5731818457986512, + "grad_norm": 0.4316026568412781, + "learning_rate": 3.88139654067823e-05, + "loss": 1.1387, + "step": 9434 + }, + { + "epoch": 0.5732426028312777, + "grad_norm": 0.17155036330223083, + "learning_rate": 3.880463547704236e-05, + "loss": 1.1373, + "step": 9435 + }, + { + "epoch": 0.5733033598639042, + "grad_norm": 0.14839480817317963, + "learning_rate": 3.879530595767067e-05, + "loss": 1.0884, + "step": 9436 + }, + { + "epoch": 0.5733641168965308, + "grad_norm": 0.13140985369682312, + "learning_rate": 3.878597684900921e-05, + "loss": 1.0844, + "step": 9437 + }, + { + "epoch": 0.5734248739291573, + "grad_norm": 0.472240686416626, + "learning_rate": 3.877664815139994e-05, + "loss": 1.0934, + "step": 9438 + }, + { + "epoch": 0.5734856309617838, + "grad_norm": 0.1632692962884903, + "learning_rate": 3.8767319865184785e-05, + "loss": 1.0508, + "step": 9439 + }, + { + "epoch": 0.5735463879944104, + "grad_norm": 0.24705441296100616, + "learning_rate": 3.875799199070569e-05, + "loss": 1.0675, + "step": 9440 + }, + { + "epoch": 0.5736071450270369, + "grad_norm": 0.22303584218025208, + "learning_rate": 3.8748664528304585e-05, + "loss": 1.1092, + "step": 9441 + }, + { + "epoch": 0.5736679020596634, + "grad_norm": 0.9875159859657288, + "learning_rate": 3.8739337478323346e-05, + "loss": 1.374, + "step": 9442 + }, + { + "epoch": 0.57372865909229, + "grad_norm": 0.2708924114704132, + "learning_rate": 3.873001084110387e-05, + "loss": 1.0546, + "step": 9443 + }, + { + "epoch": 0.5737894161249164, + "grad_norm": 0.16349385678768158, + "learning_rate": 3.8720684616988035e-05, + "loss": 1.0983, + "step": 9444 + }, + { + "epoch": 0.5738501731575429, + "grad_norm": 0.17874056100845337, + "learning_rate": 3.871135880631769e-05, + "loss": 1.1012, + "step": 9445 + }, + { + "epoch": 0.5739109301901695, + "grad_norm": 0.14569275081157684, + "learning_rate": 3.870203340943466e-05, + "loss": 1.0513, + "step": 9446 + }, + { + "epoch": 0.573971687222796, + "grad_norm": 0.19844374060630798, + "learning_rate": 3.869270842668077e-05, + "loss": 1.0311, + "step": 9447 + }, + { + "epoch": 0.5740324442554225, + "grad_norm": 0.13414768874645233, + "learning_rate": 3.868338385839785e-05, + "loss": 0.9853, + "step": 9448 + }, + { + "epoch": 0.5740932012880491, + "grad_norm": 0.17769423127174377, + "learning_rate": 3.867405970492767e-05, + "loss": 1.0348, + "step": 9449 + }, + { + "epoch": 0.5741539583206756, + "grad_norm": 0.20692425966262817, + "learning_rate": 3.866473596661202e-05, + "loss": 1.067, + "step": 9450 + }, + { + "epoch": 0.5742147153533022, + "grad_norm": 0.1440236121416092, + "learning_rate": 3.865541264379268e-05, + "loss": 1.0152, + "step": 9451 + }, + { + "epoch": 0.5742754723859287, + "grad_norm": 0.16100431978702545, + "learning_rate": 3.864608973681139e-05, + "loss": 1.0976, + "step": 9452 + }, + { + "epoch": 0.5743362294185552, + "grad_norm": 0.12844322621822357, + "learning_rate": 3.863676724600986e-05, + "loss": 1.0736, + "step": 9453 + }, + { + "epoch": 0.5743969864511818, + "grad_norm": 0.19452571868896484, + "learning_rate": 3.862744517172982e-05, + "loss": 1.1387, + "step": 9454 + }, + { + "epoch": 0.5744577434838083, + "grad_norm": 0.15475571155548096, + "learning_rate": 3.8618123514312976e-05, + "loss": 1.0415, + "step": 9455 + }, + { + "epoch": 0.5745185005164348, + "grad_norm": 0.1372871994972229, + "learning_rate": 3.860880227410102e-05, + "loss": 1.0488, + "step": 9456 + }, + { + "epoch": 0.5745792575490613, + "grad_norm": 0.15570376813411713, + "learning_rate": 3.859948145143562e-05, + "loss": 1.1063, + "step": 9457 + }, + { + "epoch": 0.5746400145816878, + "grad_norm": 0.22972489893436432, + "learning_rate": 3.859016104665844e-05, + "loss": 1.0534, + "step": 9458 + }, + { + "epoch": 0.5747007716143143, + "grad_norm": 0.12764061987400055, + "learning_rate": 3.85808410601111e-05, + "loss": 1.0705, + "step": 9459 + }, + { + "epoch": 0.5747615286469409, + "grad_norm": 0.25844812393188477, + "learning_rate": 3.8571521492135235e-05, + "loss": 1.0179, + "step": 9460 + }, + { + "epoch": 0.5748222856795674, + "grad_norm": 0.26326504349708557, + "learning_rate": 3.856220234307246e-05, + "loss": 0.9958, + "step": 9461 + }, + { + "epoch": 0.5748830427121939, + "grad_norm": 0.18367226421833038, + "learning_rate": 3.855288361326437e-05, + "loss": 1.0969, + "step": 9462 + }, + { + "epoch": 0.5749437997448205, + "grad_norm": 0.17877331376075745, + "learning_rate": 3.8543565303052545e-05, + "loss": 1.0697, + "step": 9463 + }, + { + "epoch": 0.575004556777447, + "grad_norm": 0.2133195698261261, + "learning_rate": 3.8534247412778545e-05, + "loss": 1.0795, + "step": 9464 + }, + { + "epoch": 0.5750653138100735, + "grad_norm": 0.1494211107492447, + "learning_rate": 3.852492994278394e-05, + "loss": 1.1312, + "step": 9465 + }, + { + "epoch": 0.5751260708427001, + "grad_norm": 0.18412959575653076, + "learning_rate": 3.851561289341023e-05, + "loss": 1.0814, + "step": 9466 + }, + { + "epoch": 0.5751868278753266, + "grad_norm": 0.16899055242538452, + "learning_rate": 3.850629626499895e-05, + "loss": 1.1157, + "step": 9467 + }, + { + "epoch": 0.5752475849079531, + "grad_norm": 0.11114361137151718, + "learning_rate": 3.84969800578916e-05, + "loss": 1.0274, + "step": 9468 + }, + { + "epoch": 0.5753083419405797, + "grad_norm": 0.15761998295783997, + "learning_rate": 3.848766427242967e-05, + "loss": 1.0926, + "step": 9469 + }, + { + "epoch": 0.5753690989732061, + "grad_norm": 1.3900434970855713, + "learning_rate": 3.847834890895464e-05, + "loss": 1.1706, + "step": 9470 + }, + { + "epoch": 0.5754298560058326, + "grad_norm": 0.17364750802516937, + "learning_rate": 3.8469033967807944e-05, + "loss": 1.1114, + "step": 9471 + }, + { + "epoch": 0.5754906130384592, + "grad_norm": 0.17179855704307556, + "learning_rate": 3.845971944933107e-05, + "loss": 1.1155, + "step": 9472 + }, + { + "epoch": 0.5755513700710857, + "grad_norm": 0.24840620160102844, + "learning_rate": 3.845040535386537e-05, + "loss": 1.1415, + "step": 9473 + }, + { + "epoch": 0.5756121271037122, + "grad_norm": 0.13832220435142517, + "learning_rate": 3.844109168175231e-05, + "loss": 1.0976, + "step": 9474 + }, + { + "epoch": 0.5756728841363388, + "grad_norm": 0.14361411333084106, + "learning_rate": 3.8431778433333274e-05, + "loss": 1.0617, + "step": 9475 + }, + { + "epoch": 0.5757336411689653, + "grad_norm": 0.39725133776664734, + "learning_rate": 3.842246560894962e-05, + "loss": 1.3927, + "step": 9476 + }, + { + "epoch": 0.5757943982015918, + "grad_norm": 1.257039189338684, + "learning_rate": 3.841315320894274e-05, + "loss": 1.1913, + "step": 9477 + }, + { + "epoch": 0.5758551552342184, + "grad_norm": 0.24458445608615875, + "learning_rate": 3.840384123365397e-05, + "loss": 1.0635, + "step": 9478 + }, + { + "epoch": 0.5759159122668449, + "grad_norm": 0.1508876085281372, + "learning_rate": 3.839452968342464e-05, + "loss": 1.0651, + "step": 9479 + }, + { + "epoch": 0.5759766692994714, + "grad_norm": 0.20164048671722412, + "learning_rate": 3.838521855859607e-05, + "loss": 1.1319, + "step": 9480 + }, + { + "epoch": 0.576037426332098, + "grad_norm": 0.2970738112926483, + "learning_rate": 3.837590785950954e-05, + "loss": 1.2593, + "step": 9481 + }, + { + "epoch": 0.5760981833647245, + "grad_norm": 0.1766209453344345, + "learning_rate": 3.836659758650637e-05, + "loss": 1.0936, + "step": 9482 + }, + { + "epoch": 0.5761589403973509, + "grad_norm": 0.18700683116912842, + "learning_rate": 3.835728773992781e-05, + "loss": 1.158, + "step": 9483 + }, + { + "epoch": 0.5762196974299775, + "grad_norm": 0.2654655873775482, + "learning_rate": 3.8347978320115116e-05, + "loss": 1.0342, + "step": 9484 + }, + { + "epoch": 0.576280454462604, + "grad_norm": 0.20301446318626404, + "learning_rate": 3.8338669327409555e-05, + "loss": 1.0669, + "step": 9485 + }, + { + "epoch": 0.5763412114952305, + "grad_norm": 0.12816840410232544, + "learning_rate": 3.832936076215229e-05, + "loss": 1.0647, + "step": 9486 + }, + { + "epoch": 0.5764019685278571, + "grad_norm": 0.19805611670017242, + "learning_rate": 3.832005262468458e-05, + "loss": 1.0501, + "step": 9487 + }, + { + "epoch": 0.5764627255604836, + "grad_norm": 0.23758670687675476, + "learning_rate": 3.831074491534759e-05, + "loss": 1.0757, + "step": 9488 + }, + { + "epoch": 0.5765234825931101, + "grad_norm": 0.12797591090202332, + "learning_rate": 3.83014376344825e-05, + "loss": 1.0124, + "step": 9489 + }, + { + "epoch": 0.5765842396257367, + "grad_norm": 0.19223390519618988, + "learning_rate": 3.829213078243048e-05, + "loss": 1.1857, + "step": 9490 + }, + { + "epoch": 0.5766449966583632, + "grad_norm": 0.16237464547157288, + "learning_rate": 3.828282435953267e-05, + "loss": 1.0556, + "step": 9491 + }, + { + "epoch": 0.5767057536909898, + "grad_norm": 0.1579771488904953, + "learning_rate": 3.827351836613021e-05, + "loss": 1.0399, + "step": 9492 + }, + { + "epoch": 0.5767665107236163, + "grad_norm": 2.5476765632629395, + "learning_rate": 3.82642128025642e-05, + "loss": 1.0881, + "step": 9493 + }, + { + "epoch": 0.5768272677562428, + "grad_norm": 0.33077988028526306, + "learning_rate": 3.825490766917572e-05, + "loss": 1.0575, + "step": 9494 + }, + { + "epoch": 0.5768880247888694, + "grad_norm": 0.17774169147014618, + "learning_rate": 3.8245602966305876e-05, + "loss": 1.0979, + "step": 9495 + }, + { + "epoch": 0.5769487818214959, + "grad_norm": 0.14809660613536835, + "learning_rate": 3.823629869429573e-05, + "loss": 1.0374, + "step": 9496 + }, + { + "epoch": 0.5770095388541223, + "grad_norm": 0.1711939573287964, + "learning_rate": 3.822699485348632e-05, + "loss": 1.1024, + "step": 9497 + }, + { + "epoch": 0.5770702958867489, + "grad_norm": 0.3284071683883667, + "learning_rate": 3.8217691444218725e-05, + "loss": 1.0745, + "step": 9498 + }, + { + "epoch": 0.5771310529193754, + "grad_norm": 0.1721469610929489, + "learning_rate": 3.8208388466833886e-05, + "loss": 1.114, + "step": 9499 + }, + { + "epoch": 0.5771918099520019, + "grad_norm": 0.14575400948524475, + "learning_rate": 3.8199085921672864e-05, + "loss": 1.036, + "step": 9500 + }, + { + "epoch": 0.5772525669846285, + "grad_norm": 0.14909052848815918, + "learning_rate": 3.818978380907663e-05, + "loss": 1.0338, + "step": 9501 + }, + { + "epoch": 0.577313324017255, + "grad_norm": 0.4301423132419586, + "learning_rate": 3.818048212938614e-05, + "loss": 1.1253, + "step": 9502 + }, + { + "epoch": 0.5773740810498815, + "grad_norm": 0.6998014450073242, + "learning_rate": 3.817118088294237e-05, + "loss": 1.0611, + "step": 9503 + }, + { + "epoch": 0.5774348380825081, + "grad_norm": 0.16323049366474152, + "learning_rate": 3.8161880070086254e-05, + "loss": 1.0187, + "step": 9504 + }, + { + "epoch": 0.5774955951151346, + "grad_norm": 0.18943989276885986, + "learning_rate": 3.8152579691158726e-05, + "loss": 1.1636, + "step": 9505 + }, + { + "epoch": 0.5775563521477611, + "grad_norm": 0.14730489253997803, + "learning_rate": 3.814327974650067e-05, + "loss": 1.0257, + "step": 9506 + }, + { + "epoch": 0.5776171091803877, + "grad_norm": 0.17674776911735535, + "learning_rate": 3.813398023645297e-05, + "loss": 1.039, + "step": 9507 + }, + { + "epoch": 0.5776778662130142, + "grad_norm": 0.1440037190914154, + "learning_rate": 3.812468116135653e-05, + "loss": 1.0839, + "step": 9508 + }, + { + "epoch": 0.5777386232456407, + "grad_norm": 0.15408442914485931, + "learning_rate": 3.81153825215522e-05, + "loss": 1.0754, + "step": 9509 + }, + { + "epoch": 0.5777993802782672, + "grad_norm": 0.19904424250125885, + "learning_rate": 3.8106084317380806e-05, + "loss": 1.1259, + "step": 9510 + }, + { + "epoch": 0.5778601373108937, + "grad_norm": 0.17540007829666138, + "learning_rate": 3.80967865491832e-05, + "loss": 1.2089, + "step": 9511 + }, + { + "epoch": 0.5779208943435202, + "grad_norm": 0.13899482786655426, + "learning_rate": 3.8087489217300205e-05, + "loss": 1.0441, + "step": 9512 + }, + { + "epoch": 0.5779816513761468, + "grad_norm": 0.14518341422080994, + "learning_rate": 3.807819232207258e-05, + "loss": 1.0838, + "step": 9513 + }, + { + "epoch": 0.5780424084087733, + "grad_norm": 0.13078469038009644, + "learning_rate": 3.8068895863841114e-05, + "loss": 1.052, + "step": 9514 + }, + { + "epoch": 0.5781031654413998, + "grad_norm": 0.13291147351264954, + "learning_rate": 3.805959984294657e-05, + "loss": 1.0534, + "step": 9515 + }, + { + "epoch": 0.5781639224740264, + "grad_norm": 0.15113776922225952, + "learning_rate": 3.8050304259729705e-05, + "loss": 1.0779, + "step": 9516 + }, + { + "epoch": 0.5782246795066529, + "grad_norm": 0.1307903677225113, + "learning_rate": 3.804100911453126e-05, + "loss": 1.0643, + "step": 9517 + }, + { + "epoch": 0.5782854365392794, + "grad_norm": 0.29275310039520264, + "learning_rate": 3.803171440769194e-05, + "loss": 1.0843, + "step": 9518 + }, + { + "epoch": 0.578346193571906, + "grad_norm": 0.2584379315376282, + "learning_rate": 3.802242013955243e-05, + "loss": 1.0756, + "step": 9519 + }, + { + "epoch": 0.5784069506045325, + "grad_norm": 0.14966893196105957, + "learning_rate": 3.8013126310453426e-05, + "loss": 1.0374, + "step": 9520 + }, + { + "epoch": 0.578467707637159, + "grad_norm": 0.1569487303495407, + "learning_rate": 3.800383292073559e-05, + "loss": 1.1168, + "step": 9521 + }, + { + "epoch": 0.5785284646697856, + "grad_norm": 6.4563188552856445, + "learning_rate": 3.799453997073959e-05, + "loss": 1.1016, + "step": 9522 + }, + { + "epoch": 0.578589221702412, + "grad_norm": 0.25884515047073364, + "learning_rate": 3.798524746080603e-05, + "loss": 1.073, + "step": 9523 + }, + { + "epoch": 0.5786499787350385, + "grad_norm": 0.18514546751976013, + "learning_rate": 3.797595539127556e-05, + "loss": 1.1444, + "step": 9524 + }, + { + "epoch": 0.5787107357676651, + "grad_norm": 0.18107715249061584, + "learning_rate": 3.7966663762488775e-05, + "loss": 1.0531, + "step": 9525 + }, + { + "epoch": 0.5787714928002916, + "grad_norm": 0.19950293004512787, + "learning_rate": 3.795737257478625e-05, + "loss": 1.0847, + "step": 9526 + }, + { + "epoch": 0.5788322498329181, + "grad_norm": 0.16721925139427185, + "learning_rate": 3.794808182850856e-05, + "loss": 1.0912, + "step": 9527 + }, + { + "epoch": 0.5788930068655447, + "grad_norm": 0.11909819394350052, + "learning_rate": 3.793879152399625e-05, + "loss": 1.0477, + "step": 9528 + }, + { + "epoch": 0.5789537638981712, + "grad_norm": 0.2933345139026642, + "learning_rate": 3.7929501661589875e-05, + "loss": 1.0891, + "step": 9529 + }, + { + "epoch": 0.5790145209307977, + "grad_norm": 0.1371058076620102, + "learning_rate": 3.7920212241629954e-05, + "loss": 1.108, + "step": 9530 + }, + { + "epoch": 0.5790752779634243, + "grad_norm": 0.14041735231876373, + "learning_rate": 3.7910923264456974e-05, + "loss": 1.0633, + "step": 9531 + }, + { + "epoch": 0.5791360349960508, + "grad_norm": 0.10737396031618118, + "learning_rate": 3.790163473041146e-05, + "loss": 1.0046, + "step": 9532 + }, + { + "epoch": 0.5791967920286774, + "grad_norm": 0.18709294497966766, + "learning_rate": 3.789234663983386e-05, + "loss": 1.0852, + "step": 9533 + }, + { + "epoch": 0.5792575490613039, + "grad_norm": 0.45671042799949646, + "learning_rate": 3.788305899306463e-05, + "loss": 1.1648, + "step": 9534 + }, + { + "epoch": 0.5793183060939304, + "grad_norm": 0.1386062651872635, + "learning_rate": 3.7873771790444205e-05, + "loss": 1.049, + "step": 9535 + }, + { + "epoch": 0.5793790631265568, + "grad_norm": 0.16956038773059845, + "learning_rate": 3.786448503231302e-05, + "loss": 1.0065, + "step": 9536 + }, + { + "epoch": 0.5794398201591834, + "grad_norm": 0.15613463521003723, + "learning_rate": 3.785519871901149e-05, + "loss": 1.1221, + "step": 9537 + }, + { + "epoch": 0.5795005771918099, + "grad_norm": 0.12265809625387192, + "learning_rate": 3.784591285088e-05, + "loss": 1.0162, + "step": 9538 + }, + { + "epoch": 0.5795613342244365, + "grad_norm": 0.18768829107284546, + "learning_rate": 3.783662742825893e-05, + "loss": 1.1512, + "step": 9539 + }, + { + "epoch": 0.579622091257063, + "grad_norm": 0.13276711106300354, + "learning_rate": 3.7827342451488616e-05, + "loss": 1.0593, + "step": 9540 + }, + { + "epoch": 0.5796828482896895, + "grad_norm": 0.37872061133384705, + "learning_rate": 3.7818057920909425e-05, + "loss": 1.0451, + "step": 9541 + }, + { + "epoch": 0.5797436053223161, + "grad_norm": 0.33937597274780273, + "learning_rate": 3.7808773836861676e-05, + "loss": 1.0868, + "step": 9542 + }, + { + "epoch": 0.5798043623549426, + "grad_norm": 0.16435493528842926, + "learning_rate": 3.7799490199685675e-05, + "loss": 1.0944, + "step": 9543 + }, + { + "epoch": 0.5798651193875691, + "grad_norm": 0.23212830722332, + "learning_rate": 3.7790207009721724e-05, + "loss": 1.1096, + "step": 9544 + }, + { + "epoch": 0.5799258764201957, + "grad_norm": 0.1821647435426712, + "learning_rate": 3.77809242673101e-05, + "loss": 1.1037, + "step": 9545 + }, + { + "epoch": 0.5799866334528222, + "grad_norm": 0.3730841875076294, + "learning_rate": 3.7771641972791056e-05, + "loss": 1.008, + "step": 9546 + }, + { + "epoch": 0.5800473904854487, + "grad_norm": 0.1395702064037323, + "learning_rate": 3.776236012650484e-05, + "loss": 1.0522, + "step": 9547 + }, + { + "epoch": 0.5801081475180753, + "grad_norm": 0.13790613412857056, + "learning_rate": 3.7753078728791663e-05, + "loss": 1.0399, + "step": 9548 + }, + { + "epoch": 0.5801689045507017, + "grad_norm": 0.22079253196716309, + "learning_rate": 3.774379777999176e-05, + "loss": 1.1603, + "step": 9549 + }, + { + "epoch": 0.5802296615833282, + "grad_norm": 0.18304385244846344, + "learning_rate": 3.773451728044533e-05, + "loss": 1.0375, + "step": 9550 + }, + { + "epoch": 0.5802904186159548, + "grad_norm": 0.14945372939109802, + "learning_rate": 3.772523723049252e-05, + "loss": 1.0865, + "step": 9551 + }, + { + "epoch": 0.5803511756485813, + "grad_norm": 0.1659063994884491, + "learning_rate": 3.771595763047355e-05, + "loss": 1.1383, + "step": 9552 + }, + { + "epoch": 0.5804119326812078, + "grad_norm": 0.2993048429489136, + "learning_rate": 3.770667848072848e-05, + "loss": 1.1015, + "step": 9553 + }, + { + "epoch": 0.5804726897138344, + "grad_norm": 0.21550501883029938, + "learning_rate": 3.769739978159752e-05, + "loss": 1.1564, + "step": 9554 + }, + { + "epoch": 0.5805334467464609, + "grad_norm": 0.14772237837314606, + "learning_rate": 3.768812153342074e-05, + "loss": 1.1567, + "step": 9555 + }, + { + "epoch": 0.5805942037790874, + "grad_norm": 0.14872927963733673, + "learning_rate": 3.7678843736538224e-05, + "loss": 1.0764, + "step": 9556 + }, + { + "epoch": 0.580654960811714, + "grad_norm": 0.23177458345890045, + "learning_rate": 3.76695663912901e-05, + "loss": 1.2135, + "step": 9557 + }, + { + "epoch": 0.5807157178443405, + "grad_norm": 0.21273943781852722, + "learning_rate": 3.7660289498016395e-05, + "loss": 1.0559, + "step": 9558 + }, + { + "epoch": 0.580776474876967, + "grad_norm": 0.18735463917255402, + "learning_rate": 3.7651013057057174e-05, + "loss": 1.1357, + "step": 9559 + }, + { + "epoch": 0.5808372319095936, + "grad_norm": 0.33305954933166504, + "learning_rate": 3.764173706875245e-05, + "loss": 1.2752, + "step": 9560 + }, + { + "epoch": 0.5808979889422201, + "grad_norm": 0.19825394451618195, + "learning_rate": 3.7632461533442223e-05, + "loss": 1.1545, + "step": 9561 + }, + { + "epoch": 0.5809587459748465, + "grad_norm": 0.2466171681880951, + "learning_rate": 3.7623186451466526e-05, + "loss": 1.1579, + "step": 9562 + }, + { + "epoch": 0.5810195030074731, + "grad_norm": 0.2018802911043167, + "learning_rate": 3.761391182316532e-05, + "loss": 1.0114, + "step": 9563 + }, + { + "epoch": 0.5810802600400996, + "grad_norm": 0.15394359827041626, + "learning_rate": 3.760463764887856e-05, + "loss": 1.0898, + "step": 9564 + }, + { + "epoch": 0.5811410170727261, + "grad_norm": 0.16687077283859253, + "learning_rate": 3.759536392894624e-05, + "loss": 1.0372, + "step": 9565 + }, + { + "epoch": 0.5812017741053527, + "grad_norm": 0.15514753758907318, + "learning_rate": 3.758609066370821e-05, + "loss": 1.1511, + "step": 9566 + }, + { + "epoch": 0.5812625311379792, + "grad_norm": 0.16094811260700226, + "learning_rate": 3.757681785350445e-05, + "loss": 1.1445, + "step": 9567 + }, + { + "epoch": 0.5813232881706057, + "grad_norm": 0.1270761787891388, + "learning_rate": 3.7567545498674824e-05, + "loss": 1.0394, + "step": 9568 + }, + { + "epoch": 0.5813840452032323, + "grad_norm": 0.32100990414619446, + "learning_rate": 3.7558273599559216e-05, + "loss": 1.1557, + "step": 9569 + }, + { + "epoch": 0.5814448022358588, + "grad_norm": 0.16068701446056366, + "learning_rate": 3.75490021564975e-05, + "loss": 1.0526, + "step": 9570 + }, + { + "epoch": 0.5815055592684854, + "grad_norm": 0.21692292392253876, + "learning_rate": 3.753973116982952e-05, + "loss": 1.0261, + "step": 9571 + }, + { + "epoch": 0.5815663163011119, + "grad_norm": 0.2852819859981537, + "learning_rate": 3.7530460639895106e-05, + "loss": 1.1158, + "step": 9572 + }, + { + "epoch": 0.5816270733337384, + "grad_norm": 0.1359856277704239, + "learning_rate": 3.7521190567034065e-05, + "loss": 0.9819, + "step": 9573 + }, + { + "epoch": 0.581687830366365, + "grad_norm": 0.3075811564922333, + "learning_rate": 3.7511920951586185e-05, + "loss": 1.2604, + "step": 9574 + }, + { + "epoch": 0.5817485873989914, + "grad_norm": 0.30809861421585083, + "learning_rate": 3.750265179389126e-05, + "loss": 1.2348, + "step": 9575 + }, + { + "epoch": 0.5818093444316179, + "grad_norm": 2.467280626296997, + "learning_rate": 3.7493383094289054e-05, + "loss": 1.0171, + "step": 9576 + }, + { + "epoch": 0.5818701014642444, + "grad_norm": 0.19968968629837036, + "learning_rate": 3.74841148531193e-05, + "loss": 1.0411, + "step": 9577 + }, + { + "epoch": 0.581930858496871, + "grad_norm": 0.16926982998847961, + "learning_rate": 3.747484707072174e-05, + "loss": 1.0171, + "step": 9578 + }, + { + "epoch": 0.5819916155294975, + "grad_norm": 0.19006602466106415, + "learning_rate": 3.7465579747436094e-05, + "loss": 1.2035, + "step": 9579 + }, + { + "epoch": 0.582052372562124, + "grad_norm": 0.13153447210788727, + "learning_rate": 3.745631288360204e-05, + "loss": 1.0737, + "step": 9580 + }, + { + "epoch": 0.5821131295947506, + "grad_norm": 0.13281063735485077, + "learning_rate": 3.744704647955926e-05, + "loss": 1.0744, + "step": 9581 + }, + { + "epoch": 0.5821738866273771, + "grad_norm": 0.18007522821426392, + "learning_rate": 3.74377805356474e-05, + "loss": 1.057, + "step": 9582 + }, + { + "epoch": 0.5822346436600037, + "grad_norm": 0.2089676558971405, + "learning_rate": 3.7428515052206134e-05, + "loss": 1.1189, + "step": 9583 + }, + { + "epoch": 0.5822954006926302, + "grad_norm": 0.11739115417003632, + "learning_rate": 3.741925002957509e-05, + "loss": 1.0181, + "step": 9584 + }, + { + "epoch": 0.5823561577252567, + "grad_norm": 0.1435522735118866, + "learning_rate": 3.740998546809387e-05, + "loss": 1.0409, + "step": 9585 + }, + { + "epoch": 0.5824169147578833, + "grad_norm": 0.16813106834888458, + "learning_rate": 3.7400721368102054e-05, + "loss": 1.0666, + "step": 9586 + }, + { + "epoch": 0.5824776717905098, + "grad_norm": 0.30288517475128174, + "learning_rate": 3.7391457729939226e-05, + "loss": 1.0876, + "step": 9587 + }, + { + "epoch": 0.5825384288231362, + "grad_norm": 0.13969860970973969, + "learning_rate": 3.738219455394496e-05, + "loss": 1.0342, + "step": 9588 + }, + { + "epoch": 0.5825991858557628, + "grad_norm": 0.18582023680210114, + "learning_rate": 3.737293184045879e-05, + "loss": 1.1074, + "step": 9589 + }, + { + "epoch": 0.5826599428883893, + "grad_norm": 0.18492519855499268, + "learning_rate": 3.736366958982024e-05, + "loss": 1.1016, + "step": 9590 + }, + { + "epoch": 0.5827206999210158, + "grad_norm": 0.1521170288324356, + "learning_rate": 3.735440780236883e-05, + "loss": 1.1054, + "step": 9591 + }, + { + "epoch": 0.5827814569536424, + "grad_norm": 0.12238254398107529, + "learning_rate": 3.734514647844406e-05, + "loss": 1.0188, + "step": 9592 + }, + { + "epoch": 0.5828422139862689, + "grad_norm": 0.1778760552406311, + "learning_rate": 3.733588561838538e-05, + "loss": 1.1207, + "step": 9593 + }, + { + "epoch": 0.5829029710188954, + "grad_norm": 0.33001652359962463, + "learning_rate": 3.7326625222532256e-05, + "loss": 1.242, + "step": 9594 + }, + { + "epoch": 0.582963728051522, + "grad_norm": 0.22192518413066864, + "learning_rate": 3.7317365291224124e-05, + "loss": 1.0981, + "step": 9595 + }, + { + "epoch": 0.5830244850841485, + "grad_norm": 0.18330062925815582, + "learning_rate": 3.7308105824800436e-05, + "loss": 1.0298, + "step": 9596 + }, + { + "epoch": 0.583085242116775, + "grad_norm": 0.12442557513713837, + "learning_rate": 3.729884682360058e-05, + "loss": 1.0617, + "step": 9597 + }, + { + "epoch": 0.5831459991494016, + "grad_norm": 0.14353442192077637, + "learning_rate": 3.728958828796394e-05, + "loss": 1.0985, + "step": 9598 + }, + { + "epoch": 0.5832067561820281, + "grad_norm": 0.16315993666648865, + "learning_rate": 3.728033021822992e-05, + "loss": 1.1067, + "step": 9599 + }, + { + "epoch": 0.5832675132146546, + "grad_norm": 0.32369089126586914, + "learning_rate": 3.7271072614737834e-05, + "loss": 1.1238, + "step": 9600 + }, + { + "epoch": 0.5833282702472812, + "grad_norm": 0.14030922949314117, + "learning_rate": 3.7261815477827044e-05, + "loss": 1.0612, + "step": 9601 + }, + { + "epoch": 0.5833890272799076, + "grad_norm": 0.1400366872549057, + "learning_rate": 3.725255880783688e-05, + "loss": 1.0251, + "step": 9602 + }, + { + "epoch": 0.5834497843125341, + "grad_norm": 0.221337229013443, + "learning_rate": 3.724330260510662e-05, + "loss": 1.1251, + "step": 9603 + }, + { + "epoch": 0.5835105413451607, + "grad_norm": 0.22027553617954254, + "learning_rate": 3.7234046869975575e-05, + "loss": 1.0546, + "step": 9604 + }, + { + "epoch": 0.5835712983777872, + "grad_norm": 0.15704768896102905, + "learning_rate": 3.722479160278301e-05, + "loss": 1.1272, + "step": 9605 + }, + { + "epoch": 0.5836320554104137, + "grad_norm": 0.3534877896308899, + "learning_rate": 3.7215536803868204e-05, + "loss": 1.034, + "step": 9606 + }, + { + "epoch": 0.5836928124430403, + "grad_norm": 2.277421236038208, + "learning_rate": 3.720628247357034e-05, + "loss": 1.0541, + "step": 9607 + }, + { + "epoch": 0.5837535694756668, + "grad_norm": 0.17706996202468872, + "learning_rate": 3.719702861222866e-05, + "loss": 1.0062, + "step": 9608 + }, + { + "epoch": 0.5838143265082933, + "grad_norm": 0.12530963122844696, + "learning_rate": 3.718777522018238e-05, + "loss": 1.0443, + "step": 9609 + }, + { + "epoch": 0.5838750835409199, + "grad_norm": 0.2009391486644745, + "learning_rate": 3.717852229777068e-05, + "loss": 1.074, + "step": 9610 + }, + { + "epoch": 0.5839358405735464, + "grad_norm": 0.23084455728530884, + "learning_rate": 3.7169269845332714e-05, + "loss": 1.0247, + "step": 9611 + }, + { + "epoch": 0.583996597606173, + "grad_norm": 0.1966819018125534, + "learning_rate": 3.716001786320766e-05, + "loss": 1.0508, + "step": 9612 + }, + { + "epoch": 0.5840573546387995, + "grad_norm": 0.10911506414413452, + "learning_rate": 3.7150766351734616e-05, + "loss": 1.0649, + "step": 9613 + }, + { + "epoch": 0.584118111671426, + "grad_norm": 0.24463769793510437, + "learning_rate": 3.7141515311252724e-05, + "loss": 1.08, + "step": 9614 + }, + { + "epoch": 0.5841788687040524, + "grad_norm": 0.6713007092475891, + "learning_rate": 3.713226474210107e-05, + "loss": 1.0709, + "step": 9615 + }, + { + "epoch": 0.584239625736679, + "grad_norm": 0.17672321200370789, + "learning_rate": 3.7123014644618726e-05, + "loss": 1.0346, + "step": 9616 + }, + { + "epoch": 0.5843003827693055, + "grad_norm": 0.19808872044086456, + "learning_rate": 3.711376501914479e-05, + "loss": 1.1261, + "step": 9617 + }, + { + "epoch": 0.584361139801932, + "grad_norm": 0.2292851060628891, + "learning_rate": 3.7104515866018274e-05, + "loss": 1.1067, + "step": 9618 + }, + { + "epoch": 0.5844218968345586, + "grad_norm": 0.1645694226026535, + "learning_rate": 3.7095267185578244e-05, + "loss": 1.1376, + "step": 9619 + }, + { + "epoch": 0.5844826538671851, + "grad_norm": 0.13933876156806946, + "learning_rate": 3.708601897816367e-05, + "loss": 1.0809, + "step": 9620 + }, + { + "epoch": 0.5845434108998117, + "grad_norm": 0.17309273779392242, + "learning_rate": 3.7076771244113565e-05, + "loss": 1.1881, + "step": 9621 + }, + { + "epoch": 0.5846041679324382, + "grad_norm": 0.15303075313568115, + "learning_rate": 3.7067523983766915e-05, + "loss": 1.0673, + "step": 9622 + }, + { + "epoch": 0.5846649249650647, + "grad_norm": 0.2508925795555115, + "learning_rate": 3.705827719746267e-05, + "loss": 1.0556, + "step": 9623 + }, + { + "epoch": 0.5847256819976913, + "grad_norm": 0.382703572511673, + "learning_rate": 3.704903088553977e-05, + "loss": 1.1719, + "step": 9624 + }, + { + "epoch": 0.5847864390303178, + "grad_norm": 0.300736665725708, + "learning_rate": 3.703978504833716e-05, + "loss": 1.2541, + "step": 9625 + }, + { + "epoch": 0.5848471960629443, + "grad_norm": 0.15248076617717743, + "learning_rate": 3.7030539686193734e-05, + "loss": 1.0604, + "step": 9626 + }, + { + "epoch": 0.5849079530955709, + "grad_norm": 0.998816192150116, + "learning_rate": 3.702129479944838e-05, + "loss": 1.0413, + "step": 9627 + }, + { + "epoch": 0.5849687101281973, + "grad_norm": 0.18758432567119598, + "learning_rate": 3.7012050388439966e-05, + "loss": 1.3046, + "step": 9628 + }, + { + "epoch": 0.5850294671608238, + "grad_norm": 0.22952404618263245, + "learning_rate": 3.7002806453507356e-05, + "loss": 1.0774, + "step": 9629 + }, + { + "epoch": 0.5850902241934504, + "grad_norm": 0.1458398550748825, + "learning_rate": 3.69935629949894e-05, + "loss": 1.0456, + "step": 9630 + }, + { + "epoch": 0.5851509812260769, + "grad_norm": 0.11816703528165817, + "learning_rate": 3.6984320013224894e-05, + "loss": 1.0271, + "step": 9631 + }, + { + "epoch": 0.5852117382587034, + "grad_norm": 1.8252034187316895, + "learning_rate": 3.697507750855267e-05, + "loss": 1.0597, + "step": 9632 + }, + { + "epoch": 0.58527249529133, + "grad_norm": 0.21289880573749542, + "learning_rate": 3.696583548131147e-05, + "loss": 1.2213, + "step": 9633 + }, + { + "epoch": 0.5853332523239565, + "grad_norm": 0.11876389384269714, + "learning_rate": 3.6956593931840114e-05, + "loss": 1.0309, + "step": 9634 + }, + { + "epoch": 0.585394009356583, + "grad_norm": 0.17165471613407135, + "learning_rate": 3.694735286047731e-05, + "loss": 1.0515, + "step": 9635 + }, + { + "epoch": 0.5854547663892096, + "grad_norm": 0.2886955440044403, + "learning_rate": 3.693811226756182e-05, + "loss": 1.0805, + "step": 9636 + }, + { + "epoch": 0.5855155234218361, + "grad_norm": 0.1725347489118576, + "learning_rate": 3.692887215343235e-05, + "loss": 1.1446, + "step": 9637 + }, + { + "epoch": 0.5855762804544626, + "grad_norm": 0.16695545613765717, + "learning_rate": 3.691963251842759e-05, + "loss": 1.0156, + "step": 9638 + }, + { + "epoch": 0.5856370374870892, + "grad_norm": 0.11789147555828094, + "learning_rate": 3.6910393362886254e-05, + "loss": 1.0422, + "step": 9639 + }, + { + "epoch": 0.5856977945197157, + "grad_norm": 0.10687675327062607, + "learning_rate": 3.690115468714697e-05, + "loss": 1.0517, + "step": 9640 + }, + { + "epoch": 0.5857585515523421, + "grad_norm": 0.20679590106010437, + "learning_rate": 3.6891916491548366e-05, + "loss": 1.1821, + "step": 9641 + }, + { + "epoch": 0.5858193085849687, + "grad_norm": 0.35516971349716187, + "learning_rate": 3.688267877642912e-05, + "loss": 1.2186, + "step": 9642 + }, + { + "epoch": 0.5858800656175952, + "grad_norm": 0.167977437376976, + "learning_rate": 3.687344154212781e-05, + "loss": 1.1288, + "step": 9643 + }, + { + "epoch": 0.5859408226502217, + "grad_norm": 0.22867953777313232, + "learning_rate": 3.686420478898304e-05, + "loss": 1.0763, + "step": 9644 + }, + { + "epoch": 0.5860015796828483, + "grad_norm": 0.16651295125484467, + "learning_rate": 3.685496851733338e-05, + "loss": 1.0758, + "step": 9645 + }, + { + "epoch": 0.5860623367154748, + "grad_norm": 0.13324297964572906, + "learning_rate": 3.6845732727517404e-05, + "loss": 1.0628, + "step": 9646 + }, + { + "epoch": 0.5861230937481013, + "grad_norm": 0.13318656384944916, + "learning_rate": 3.6836497419873626e-05, + "loss": 1.0798, + "step": 9647 + }, + { + "epoch": 0.5861838507807279, + "grad_norm": 0.4447791874408722, + "learning_rate": 3.682726259474059e-05, + "loss": 1.0934, + "step": 9648 + }, + { + "epoch": 0.5862446078133544, + "grad_norm": 0.1209213063120842, + "learning_rate": 3.681802825245677e-05, + "loss": 1.0489, + "step": 9649 + }, + { + "epoch": 0.586305364845981, + "grad_norm": 0.21711570024490356, + "learning_rate": 3.680879439336069e-05, + "loss": 1.3267, + "step": 9650 + }, + { + "epoch": 0.5863661218786075, + "grad_norm": 0.22704561054706573, + "learning_rate": 3.679956101779079e-05, + "loss": 1.2459, + "step": 9651 + }, + { + "epoch": 0.586426878911234, + "grad_norm": 0.17967826128005981, + "learning_rate": 3.679032812608555e-05, + "loss": 1.0259, + "step": 9652 + }, + { + "epoch": 0.5864876359438606, + "grad_norm": 0.13895003497600555, + "learning_rate": 3.678109571858337e-05, + "loss": 1.0909, + "step": 9653 + }, + { + "epoch": 0.586548392976487, + "grad_norm": 0.10685412585735321, + "learning_rate": 3.6771863795622674e-05, + "loss": 1.0384, + "step": 9654 + }, + { + "epoch": 0.5866091500091135, + "grad_norm": 0.12601348757743835, + "learning_rate": 3.6762632357541884e-05, + "loss": 1.0736, + "step": 9655 + }, + { + "epoch": 0.58666990704174, + "grad_norm": 0.16080918908119202, + "learning_rate": 3.675340140467936e-05, + "loss": 1.0346, + "step": 9656 + }, + { + "epoch": 0.5867306640743666, + "grad_norm": 0.14532466232776642, + "learning_rate": 3.6744170937373454e-05, + "loss": 1.0835, + "step": 9657 + }, + { + "epoch": 0.5867914211069931, + "grad_norm": 0.14446935057640076, + "learning_rate": 3.673494095596254e-05, + "loss": 1.1205, + "step": 9658 + }, + { + "epoch": 0.5868521781396197, + "grad_norm": 0.1682351976633072, + "learning_rate": 3.672571146078494e-05, + "loss": 1.095, + "step": 9659 + }, + { + "epoch": 0.5869129351722462, + "grad_norm": 0.29009154438972473, + "learning_rate": 3.6716482452178934e-05, + "loss": 1.1829, + "step": 9660 + }, + { + "epoch": 0.5869736922048727, + "grad_norm": 0.15475969016551971, + "learning_rate": 3.6707253930482835e-05, + "loss": 1.1304, + "step": 9661 + }, + { + "epoch": 0.5870344492374993, + "grad_norm": 0.1583895981311798, + "learning_rate": 3.66980258960349e-05, + "loss": 1.0853, + "step": 9662 + }, + { + "epoch": 0.5870952062701258, + "grad_norm": 0.17046944797039032, + "learning_rate": 3.6688798349173416e-05, + "loss": 1.0419, + "step": 9663 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 0.140001580119133, + "learning_rate": 3.66795712902366e-05, + "loss": 1.0849, + "step": 9664 + }, + { + "epoch": 0.5872167203353789, + "grad_norm": 0.1780005395412445, + "learning_rate": 3.6670344719562655e-05, + "loss": 1.2036, + "step": 9665 + }, + { + "epoch": 0.5872774773680054, + "grad_norm": 0.25022220611572266, + "learning_rate": 3.666111863748984e-05, + "loss": 1.0155, + "step": 9666 + }, + { + "epoch": 0.5873382344006318, + "grad_norm": 0.17986616492271423, + "learning_rate": 3.6651893044356264e-05, + "loss": 1.1346, + "step": 9667 + }, + { + "epoch": 0.5873989914332584, + "grad_norm": 0.2608937621116638, + "learning_rate": 3.664266794050014e-05, + "loss": 1.0366, + "step": 9668 + }, + { + "epoch": 0.5874597484658849, + "grad_norm": 0.2139994502067566, + "learning_rate": 3.66334433262596e-05, + "loss": 1.1001, + "step": 9669 + }, + { + "epoch": 0.5875205054985114, + "grad_norm": 0.26043063402175903, + "learning_rate": 3.6624219201972776e-05, + "loss": 1.1152, + "step": 9670 + }, + { + "epoch": 0.587581262531138, + "grad_norm": 0.48465073108673096, + "learning_rate": 3.661499556797779e-05, + "loss": 1.1041, + "step": 9671 + }, + { + "epoch": 0.5876420195637645, + "grad_norm": 0.16609665751457214, + "learning_rate": 3.660577242461273e-05, + "loss": 1.0641, + "step": 9672 + }, + { + "epoch": 0.587702776596391, + "grad_norm": 0.1469862461090088, + "learning_rate": 3.6596549772215674e-05, + "loss": 1.0916, + "step": 9673 + }, + { + "epoch": 0.5877635336290176, + "grad_norm": 0.254740446805954, + "learning_rate": 3.658732761112468e-05, + "loss": 1.1355, + "step": 9674 + }, + { + "epoch": 0.5878242906616441, + "grad_norm": 0.2856753468513489, + "learning_rate": 3.657810594167777e-05, + "loss": 1.0557, + "step": 9675 + }, + { + "epoch": 0.5878850476942706, + "grad_norm": 0.13355521857738495, + "learning_rate": 3.656888476421298e-05, + "loss": 1.0546, + "step": 9676 + }, + { + "epoch": 0.5879458047268972, + "grad_norm": 0.1959685981273651, + "learning_rate": 3.655966407906832e-05, + "loss": 1.0779, + "step": 9677 + }, + { + "epoch": 0.5880065617595237, + "grad_norm": 0.15109848976135254, + "learning_rate": 3.655044388658176e-05, + "loss": 1.0887, + "step": 9678 + }, + { + "epoch": 0.5880673187921502, + "grad_norm": 0.15172675251960754, + "learning_rate": 3.6541224187091324e-05, + "loss": 1.1211, + "step": 9679 + }, + { + "epoch": 0.5881280758247767, + "grad_norm": 0.15554292500019073, + "learning_rate": 3.6532004980934876e-05, + "loss": 1.0886, + "step": 9680 + }, + { + "epoch": 0.5881888328574032, + "grad_norm": 0.16062697768211365, + "learning_rate": 3.65227862684504e-05, + "loss": 1.0527, + "step": 9681 + }, + { + "epoch": 0.5882495898900297, + "grad_norm": 12.580662727355957, + "learning_rate": 3.65135680499758e-05, + "loss": 1.0685, + "step": 9682 + }, + { + "epoch": 0.5883103469226563, + "grad_norm": 0.26436224579811096, + "learning_rate": 3.650435032584895e-05, + "loss": 1.1265, + "step": 9683 + }, + { + "epoch": 0.5883711039552828, + "grad_norm": 0.16329185664653778, + "learning_rate": 3.649513309640777e-05, + "loss": 1.1335, + "step": 9684 + }, + { + "epoch": 0.5884318609879093, + "grad_norm": 3.0906193256378174, + "learning_rate": 3.6485916361990094e-05, + "loss": 1.0864, + "step": 9685 + }, + { + "epoch": 0.5884926180205359, + "grad_norm": 0.23503564298152924, + "learning_rate": 3.647670012293377e-05, + "loss": 1.2349, + "step": 9686 + }, + { + "epoch": 0.5885533750531624, + "grad_norm": 0.18491913378238678, + "learning_rate": 3.646748437957661e-05, + "loss": 1.0454, + "step": 9687 + }, + { + "epoch": 0.5886141320857889, + "grad_norm": 0.14460882544517517, + "learning_rate": 3.645826913225642e-05, + "loss": 1.0482, + "step": 9688 + }, + { + "epoch": 0.5886748891184155, + "grad_norm": 0.16084350645542145, + "learning_rate": 3.6449054381311e-05, + "loss": 1.1088, + "step": 9689 + }, + { + "epoch": 0.588735646151042, + "grad_norm": 0.32970014214515686, + "learning_rate": 3.643984012707811e-05, + "loss": 1.1329, + "step": 9690 + }, + { + "epoch": 0.5887964031836685, + "grad_norm": 0.14513203501701355, + "learning_rate": 3.643062636989549e-05, + "loss": 1.0926, + "step": 9691 + }, + { + "epoch": 0.5888571602162951, + "grad_norm": 0.13109713792800903, + "learning_rate": 3.642141311010089e-05, + "loss": 1.0657, + "step": 9692 + }, + { + "epoch": 0.5889179172489215, + "grad_norm": 0.12235511839389801, + "learning_rate": 3.641220034803203e-05, + "loss": 1.0773, + "step": 9693 + }, + { + "epoch": 0.588978674281548, + "grad_norm": 0.15629084408283234, + "learning_rate": 3.640298808402659e-05, + "loss": 1.0427, + "step": 9694 + }, + { + "epoch": 0.5890394313141746, + "grad_norm": 0.15765614807605743, + "learning_rate": 3.639377631842225e-05, + "loss": 1.0123, + "step": 9695 + }, + { + "epoch": 0.5891001883468011, + "grad_norm": 6.2862348556518555, + "learning_rate": 3.6384565051556654e-05, + "loss": 1.0683, + "step": 9696 + }, + { + "epoch": 0.5891609453794276, + "grad_norm": 0.18151108920574188, + "learning_rate": 3.637535428376746e-05, + "loss": 1.1443, + "step": 9697 + }, + { + "epoch": 0.5892217024120542, + "grad_norm": 0.136151984333992, + "learning_rate": 3.63661440153923e-05, + "loss": 1.0831, + "step": 9698 + }, + { + "epoch": 0.5892824594446807, + "grad_norm": 0.1270492523908615, + "learning_rate": 3.635693424676877e-05, + "loss": 1.0711, + "step": 9699 + }, + { + "epoch": 0.5893432164773073, + "grad_norm": 0.2637779116630554, + "learning_rate": 3.6347724978234444e-05, + "loss": 1.2085, + "step": 9700 + }, + { + "epoch": 0.5894039735099338, + "grad_norm": 0.1644567996263504, + "learning_rate": 3.633851621012689e-05, + "loss": 1.0329, + "step": 9701 + }, + { + "epoch": 0.5894647305425603, + "grad_norm": 0.12515617907047272, + "learning_rate": 3.6329307942783676e-05, + "loss": 1.0566, + "step": 9702 + }, + { + "epoch": 0.5895254875751869, + "grad_norm": 0.19501769542694092, + "learning_rate": 3.632010017654232e-05, + "loss": 1.1064, + "step": 9703 + }, + { + "epoch": 0.5895862446078134, + "grad_norm": 0.17413097620010376, + "learning_rate": 3.6310892911740325e-05, + "loss": 1.1467, + "step": 9704 + }, + { + "epoch": 0.5896470016404399, + "grad_norm": 0.1883806437253952, + "learning_rate": 3.63016861487152e-05, + "loss": 1.0681, + "step": 9705 + }, + { + "epoch": 0.5897077586730665, + "grad_norm": 0.11197453737258911, + "learning_rate": 3.629247988780444e-05, + "loss": 1.0395, + "step": 9706 + }, + { + "epoch": 0.5897685157056929, + "grad_norm": 0.129778653383255, + "learning_rate": 3.628327412934546e-05, + "loss": 1.1012, + "step": 9707 + }, + { + "epoch": 0.5898292727383194, + "grad_norm": 0.13288386166095734, + "learning_rate": 3.627406887367572e-05, + "loss": 1.0753, + "step": 9708 + }, + { + "epoch": 0.589890029770946, + "grad_norm": 0.15064646303653717, + "learning_rate": 3.626486412113264e-05, + "loss": 1.1315, + "step": 9709 + }, + { + "epoch": 0.5899507868035725, + "grad_norm": 0.11687958240509033, + "learning_rate": 3.6255659872053616e-05, + "loss": 1.0245, + "step": 9710 + }, + { + "epoch": 0.590011543836199, + "grad_norm": 0.13441386818885803, + "learning_rate": 3.6246456126776044e-05, + "loss": 1.2249, + "step": 9711 + }, + { + "epoch": 0.5900723008688256, + "grad_norm": 0.45595914125442505, + "learning_rate": 3.623725288563726e-05, + "loss": 1.1587, + "step": 9712 + }, + { + "epoch": 0.5901330579014521, + "grad_norm": 0.17567183077335358, + "learning_rate": 3.6228050148974674e-05, + "loss": 1.0741, + "step": 9713 + }, + { + "epoch": 0.5901938149340786, + "grad_norm": 0.16142162680625916, + "learning_rate": 3.6218847917125535e-05, + "loss": 1.0423, + "step": 9714 + }, + { + "epoch": 0.5902545719667052, + "grad_norm": 0.21234942972660065, + "learning_rate": 3.62096461904272e-05, + "loss": 1.133, + "step": 9715 + }, + { + "epoch": 0.5903153289993317, + "grad_norm": 0.24975788593292236, + "learning_rate": 3.620044496921695e-05, + "loss": 1.2539, + "step": 9716 + }, + { + "epoch": 0.5903760860319582, + "grad_norm": 0.17137493193149567, + "learning_rate": 3.619124425383206e-05, + "loss": 1.036, + "step": 9717 + }, + { + "epoch": 0.5904368430645848, + "grad_norm": 0.19582246243953705, + "learning_rate": 3.618204404460978e-05, + "loss": 1.1645, + "step": 9718 + }, + { + "epoch": 0.5904976000972113, + "grad_norm": 0.2093893587589264, + "learning_rate": 3.617284434188735e-05, + "loss": 1.1117, + "step": 9719 + }, + { + "epoch": 0.5905583571298377, + "grad_norm": 0.12619788944721222, + "learning_rate": 3.616364514600198e-05, + "loss": 1.0607, + "step": 9720 + }, + { + "epoch": 0.5906191141624643, + "grad_norm": 0.21111953258514404, + "learning_rate": 3.615444645729087e-05, + "loss": 1.1281, + "step": 9721 + }, + { + "epoch": 0.5906798711950908, + "grad_norm": 0.3733457922935486, + "learning_rate": 3.6145248276091194e-05, + "loss": 1.0778, + "step": 9722 + }, + { + "epoch": 0.5907406282277173, + "grad_norm": 0.14821258187294006, + "learning_rate": 3.613605060274013e-05, + "loss": 1.0515, + "step": 9723 + }, + { + "epoch": 0.5908013852603439, + "grad_norm": 0.18887345492839813, + "learning_rate": 3.612685343757481e-05, + "loss": 1.0697, + "step": 9724 + }, + { + "epoch": 0.5908621422929704, + "grad_norm": 0.1876954287290573, + "learning_rate": 3.611765678093235e-05, + "loss": 1.0477, + "step": 9725 + }, + { + "epoch": 0.5909228993255969, + "grad_norm": 0.23776161670684814, + "learning_rate": 3.610846063314988e-05, + "loss": 1.1025, + "step": 9726 + }, + { + "epoch": 0.5909836563582235, + "grad_norm": 0.19449111819267273, + "learning_rate": 3.609926499456446e-05, + "loss": 1.0667, + "step": 9727 + }, + { + "epoch": 0.59104441339085, + "grad_norm": 0.21076911687850952, + "learning_rate": 3.609006986551318e-05, + "loss": 1.03, + "step": 9728 + }, + { + "epoch": 0.5911051704234765, + "grad_norm": 0.18143071234226227, + "learning_rate": 3.6080875246333065e-05, + "loss": 1.0281, + "step": 9729 + }, + { + "epoch": 0.5911659274561031, + "grad_norm": 0.1307360976934433, + "learning_rate": 3.6071681137361155e-05, + "loss": 1.081, + "step": 9730 + }, + { + "epoch": 0.5912266844887296, + "grad_norm": 0.22783619165420532, + "learning_rate": 3.606248753893448e-05, + "loss": 1.0565, + "step": 9731 + }, + { + "epoch": 0.5912874415213561, + "grad_norm": 0.1415441781282425, + "learning_rate": 3.605329445139001e-05, + "loss": 1.0448, + "step": 9732 + }, + { + "epoch": 0.5913481985539826, + "grad_norm": 4.329713344573975, + "learning_rate": 3.604410187506474e-05, + "loss": 1.0548, + "step": 9733 + }, + { + "epoch": 0.5914089555866091, + "grad_norm": 0.16169285774230957, + "learning_rate": 3.603490981029559e-05, + "loss": 1.071, + "step": 9734 + }, + { + "epoch": 0.5914697126192356, + "grad_norm": 0.1344682276248932, + "learning_rate": 3.602571825741953e-05, + "loss": 1.0402, + "step": 9735 + }, + { + "epoch": 0.5915304696518622, + "grad_norm": 0.17793437838554382, + "learning_rate": 3.6016527216773475e-05, + "loss": 1.0594, + "step": 9736 + }, + { + "epoch": 0.5915912266844887, + "grad_norm": 0.10395093262195587, + "learning_rate": 3.60073366886943e-05, + "loss": 1.0068, + "step": 9737 + }, + { + "epoch": 0.5916519837171152, + "grad_norm": 0.13307540118694305, + "learning_rate": 3.5998146673518915e-05, + "loss": 1.0896, + "step": 9738 + }, + { + "epoch": 0.5917127407497418, + "grad_norm": 0.2078952044248581, + "learning_rate": 3.598895717158418e-05, + "loss": 1.09, + "step": 9739 + }, + { + "epoch": 0.5917734977823683, + "grad_norm": 0.19054830074310303, + "learning_rate": 3.597976818322694e-05, + "loss": 1.1146, + "step": 9740 + }, + { + "epoch": 0.5918342548149949, + "grad_norm": 0.1147465705871582, + "learning_rate": 3.597057970878398e-05, + "loss": 1.0438, + "step": 9741 + }, + { + "epoch": 0.5918950118476214, + "grad_norm": 0.15593543648719788, + "learning_rate": 3.596139174859213e-05, + "loss": 1.055, + "step": 9742 + }, + { + "epoch": 0.5919557688802479, + "grad_norm": 0.16292649507522583, + "learning_rate": 3.5952204302988194e-05, + "loss": 1.0582, + "step": 9743 + }, + { + "epoch": 0.5920165259128745, + "grad_norm": 0.1694033443927765, + "learning_rate": 3.594301737230892e-05, + "loss": 1.0674, + "step": 9744 + }, + { + "epoch": 0.592077282945501, + "grad_norm": 0.12481171637773514, + "learning_rate": 3.5933830956891054e-05, + "loss": 1.0597, + "step": 9745 + }, + { + "epoch": 0.5921380399781274, + "grad_norm": 0.1326330453157425, + "learning_rate": 3.592464505707136e-05, + "loss": 1.0534, + "step": 9746 + }, + { + "epoch": 0.592198797010754, + "grad_norm": 0.9225171208381653, + "learning_rate": 3.591545967318649e-05, + "loss": 1.0924, + "step": 9747 + }, + { + "epoch": 0.5922595540433805, + "grad_norm": 0.13176541030406952, + "learning_rate": 3.590627480557318e-05, + "loss": 1.0137, + "step": 9748 + }, + { + "epoch": 0.592320311076007, + "grad_norm": 0.12434348464012146, + "learning_rate": 3.5897090454568097e-05, + "loss": 1.0294, + "step": 9749 + }, + { + "epoch": 0.5923810681086336, + "grad_norm": 0.21365907788276672, + "learning_rate": 3.588790662050788e-05, + "loss": 1.0716, + "step": 9750 + }, + { + "epoch": 0.5924418251412601, + "grad_norm": 0.11712610721588135, + "learning_rate": 3.587872330372918e-05, + "loss": 1.0333, + "step": 9751 + }, + { + "epoch": 0.5925025821738866, + "grad_norm": 0.1176094338297844, + "learning_rate": 3.58695405045686e-05, + "loss": 1.0008, + "step": 9752 + }, + { + "epoch": 0.5925633392065132, + "grad_norm": 0.14124080538749695, + "learning_rate": 3.586035822336277e-05, + "loss": 1.0401, + "step": 9753 + }, + { + "epoch": 0.5926240962391397, + "grad_norm": 0.11916171759366989, + "learning_rate": 3.5851176460448216e-05, + "loss": 1.0361, + "step": 9754 + }, + { + "epoch": 0.5926848532717662, + "grad_norm": 0.13915252685546875, + "learning_rate": 3.5841995216161514e-05, + "loss": 1.1064, + "step": 9755 + }, + { + "epoch": 0.5927456103043928, + "grad_norm": 0.10760362446308136, + "learning_rate": 3.583281449083924e-05, + "loss": 1.0268, + "step": 9756 + }, + { + "epoch": 0.5928063673370193, + "grad_norm": 0.21113231778144836, + "learning_rate": 3.582363428481788e-05, + "loss": 1.0781, + "step": 9757 + }, + { + "epoch": 0.5928671243696458, + "grad_norm": 0.41020911931991577, + "learning_rate": 3.5814454598433936e-05, + "loss": 1.2678, + "step": 9758 + }, + { + "epoch": 0.5929278814022723, + "grad_norm": 0.11867465078830719, + "learning_rate": 3.58052754320239e-05, + "loss": 1.0106, + "step": 9759 + }, + { + "epoch": 0.5929886384348988, + "grad_norm": 0.18245571851730347, + "learning_rate": 3.579609678592426e-05, + "loss": 1.0353, + "step": 9760 + }, + { + "epoch": 0.5930493954675253, + "grad_norm": 0.1418563574552536, + "learning_rate": 3.5786918660471424e-05, + "loss": 1.0813, + "step": 9761 + }, + { + "epoch": 0.5931101525001519, + "grad_norm": 0.2041768878698349, + "learning_rate": 3.5777741056001834e-05, + "loss": 1.0359, + "step": 9762 + }, + { + "epoch": 0.5931709095327784, + "grad_norm": 0.13422484695911407, + "learning_rate": 3.5768563972851886e-05, + "loss": 1.0514, + "step": 9763 + }, + { + "epoch": 0.5932316665654049, + "grad_norm": 0.30140751600265503, + "learning_rate": 3.575938741135798e-05, + "loss": 1.0959, + "step": 9764 + }, + { + "epoch": 0.5932924235980315, + "grad_norm": 0.2591914236545563, + "learning_rate": 3.575021137185649e-05, + "loss": 1.0814, + "step": 9765 + }, + { + "epoch": 0.593353180630658, + "grad_norm": 0.6737829446792603, + "learning_rate": 3.574103585468376e-05, + "loss": 1.1226, + "step": 9766 + }, + { + "epoch": 0.5934139376632845, + "grad_norm": 0.17212356626987457, + "learning_rate": 3.57318608601761e-05, + "loss": 1.1293, + "step": 9767 + }, + { + "epoch": 0.5934746946959111, + "grad_norm": 0.12765397131443024, + "learning_rate": 3.572268638866985e-05, + "loss": 1.0669, + "step": 9768 + }, + { + "epoch": 0.5935354517285376, + "grad_norm": 1.4577455520629883, + "learning_rate": 3.5713512440501276e-05, + "loss": 1.1219, + "step": 9769 + }, + { + "epoch": 0.5935962087611641, + "grad_norm": 0.15666432678699493, + "learning_rate": 3.570433901600668e-05, + "loss": 1.0548, + "step": 9770 + }, + { + "epoch": 0.5936569657937907, + "grad_norm": 0.1695893555879593, + "learning_rate": 3.569516611552228e-05, + "loss": 1.0468, + "step": 9771 + }, + { + "epoch": 0.5937177228264171, + "grad_norm": 0.13557523488998413, + "learning_rate": 3.568599373938435e-05, + "loss": 1.0182, + "step": 9772 + }, + { + "epoch": 0.5937784798590436, + "grad_norm": 0.1633175015449524, + "learning_rate": 3.5676821887929095e-05, + "loss": 1.0461, + "step": 9773 + }, + { + "epoch": 0.5938392368916702, + "grad_norm": 0.21911802887916565, + "learning_rate": 3.566765056149269e-05, + "loss": 1.2169, + "step": 9774 + }, + { + "epoch": 0.5938999939242967, + "grad_norm": 0.17607736587524414, + "learning_rate": 3.5658479760411326e-05, + "loss": 1.08, + "step": 9775 + }, + { + "epoch": 0.5939607509569232, + "grad_norm": 0.3913068175315857, + "learning_rate": 3.564930948502115e-05, + "loss": 1.1745, + "step": 9776 + }, + { + "epoch": 0.5940215079895498, + "grad_norm": 0.14123575389385223, + "learning_rate": 3.564013973565832e-05, + "loss": 1.0913, + "step": 9777 + }, + { + "epoch": 0.5940822650221763, + "grad_norm": 0.18459638953208923, + "learning_rate": 3.5630970512658935e-05, + "loss": 1.0579, + "step": 9778 + }, + { + "epoch": 0.5941430220548028, + "grad_norm": 0.2039920538663864, + "learning_rate": 3.56218018163591e-05, + "loss": 1.0352, + "step": 9779 + }, + { + "epoch": 0.5942037790874294, + "grad_norm": 0.16813644766807556, + "learning_rate": 3.561263364709493e-05, + "loss": 1.1772, + "step": 9780 + }, + { + "epoch": 0.5942645361200559, + "grad_norm": 0.2098015695810318, + "learning_rate": 3.560346600520242e-05, + "loss": 1.1171, + "step": 9781 + }, + { + "epoch": 0.5943252931526825, + "grad_norm": 0.1300709992647171, + "learning_rate": 3.5594298891017654e-05, + "loss": 1.0222, + "step": 9782 + }, + { + "epoch": 0.594386050185309, + "grad_norm": 0.11240896582603455, + "learning_rate": 3.5585132304876655e-05, + "loss": 1.0746, + "step": 9783 + }, + { + "epoch": 0.5944468072179355, + "grad_norm": 0.27615344524383545, + "learning_rate": 3.55759662471154e-05, + "loss": 1.0439, + "step": 9784 + }, + { + "epoch": 0.594507564250562, + "grad_norm": 0.47950446605682373, + "learning_rate": 3.5566800718069904e-05, + "loss": 1.1026, + "step": 9785 + }, + { + "epoch": 0.5945683212831885, + "grad_norm": 0.14027227461338043, + "learning_rate": 3.555763571807613e-05, + "loss": 1.0509, + "step": 9786 + }, + { + "epoch": 0.594629078315815, + "grad_norm": 0.11497654020786285, + "learning_rate": 3.5548471247469994e-05, + "loss": 1.0836, + "step": 9787 + }, + { + "epoch": 0.5946898353484416, + "grad_norm": 0.19415542483329773, + "learning_rate": 3.5539307306587444e-05, + "loss": 1.0364, + "step": 9788 + }, + { + "epoch": 0.5947505923810681, + "grad_norm": 0.15232017636299133, + "learning_rate": 3.553014389576437e-05, + "loss": 1.0243, + "step": 9789 + }, + { + "epoch": 0.5948113494136946, + "grad_norm": 0.13926400244235992, + "learning_rate": 3.552098101533667e-05, + "loss": 1.0284, + "step": 9790 + }, + { + "epoch": 0.5948721064463212, + "grad_norm": 0.1758466362953186, + "learning_rate": 3.5511818665640215e-05, + "loss": 1.0182, + "step": 9791 + }, + { + "epoch": 0.5949328634789477, + "grad_norm": 0.1317562609910965, + "learning_rate": 3.550265684701084e-05, + "loss": 1.0125, + "step": 9792 + }, + { + "epoch": 0.5949936205115742, + "grad_norm": 0.1492239236831665, + "learning_rate": 3.5493495559784405e-05, + "loss": 0.9802, + "step": 9793 + }, + { + "epoch": 0.5950543775442008, + "grad_norm": 0.1287379264831543, + "learning_rate": 3.5484334804296674e-05, + "loss": 1.0813, + "step": 9794 + }, + { + "epoch": 0.5951151345768273, + "grad_norm": 0.10461851209402084, + "learning_rate": 3.5475174580883456e-05, + "loss": 1.0211, + "step": 9795 + }, + { + "epoch": 0.5951758916094538, + "grad_norm": 0.22647830843925476, + "learning_rate": 3.546601488988053e-05, + "loss": 1.1452, + "step": 9796 + }, + { + "epoch": 0.5952366486420804, + "grad_norm": 0.259226530790329, + "learning_rate": 3.545685573162362e-05, + "loss": 1.0191, + "step": 9797 + }, + { + "epoch": 0.5952974056747068, + "grad_norm": 0.11776480823755264, + "learning_rate": 3.544769710644849e-05, + "loss": 1.0224, + "step": 9798 + }, + { + "epoch": 0.5953581627073333, + "grad_norm": 0.12759307026863098, + "learning_rate": 3.543853901469083e-05, + "loss": 1.0401, + "step": 9799 + }, + { + "epoch": 0.5954189197399599, + "grad_norm": 0.17199723422527313, + "learning_rate": 3.542938145668635e-05, + "loss": 1.1237, + "step": 9800 + }, + { + "epoch": 0.5954796767725864, + "grad_norm": 0.11512981355190277, + "learning_rate": 3.54202244327707e-05, + "loss": 1.0424, + "step": 9801 + }, + { + "epoch": 0.5955404338052129, + "grad_norm": 0.11472322791814804, + "learning_rate": 3.541106794327953e-05, + "loss": 1.0312, + "step": 9802 + }, + { + "epoch": 0.5956011908378395, + "grad_norm": 0.11672493070363998, + "learning_rate": 3.54019119885485e-05, + "loss": 1.0568, + "step": 9803 + }, + { + "epoch": 0.595661947870466, + "grad_norm": 0.2715204954147339, + "learning_rate": 3.53927565689132e-05, + "loss": 1.0753, + "step": 9804 + }, + { + "epoch": 0.5957227049030925, + "grad_norm": 0.1581743210554123, + "learning_rate": 3.538360168470922e-05, + "loss": 1.0758, + "step": 9805 + }, + { + "epoch": 0.5957834619357191, + "grad_norm": 0.1087142825126648, + "learning_rate": 3.537444733627219e-05, + "loss": 1.0104, + "step": 9806 + }, + { + "epoch": 0.5958442189683456, + "grad_norm": 0.12890246510505676, + "learning_rate": 3.536529352393757e-05, + "loss": 1.0681, + "step": 9807 + }, + { + "epoch": 0.5959049760009721, + "grad_norm": 0.12123747169971466, + "learning_rate": 3.535614024804097e-05, + "loss": 1.0417, + "step": 9808 + }, + { + "epoch": 0.5959657330335987, + "grad_norm": 0.2113116830587387, + "learning_rate": 3.5346987508917876e-05, + "loss": 1.0876, + "step": 9809 + }, + { + "epoch": 0.5960264900662252, + "grad_norm": 0.36501821875572205, + "learning_rate": 3.533783530690377e-05, + "loss": 1.2459, + "step": 9810 + }, + { + "epoch": 0.5960872470988517, + "grad_norm": 0.141757994890213, + "learning_rate": 3.532868364233416e-05, + "loss": 1.0997, + "step": 9811 + }, + { + "epoch": 0.5961480041314782, + "grad_norm": 0.22064721584320068, + "learning_rate": 3.531953251554449e-05, + "loss": 1.2755, + "step": 9812 + }, + { + "epoch": 0.5962087611641047, + "grad_norm": 0.2296675741672516, + "learning_rate": 3.531038192687021e-05, + "loss": 1.1259, + "step": 9813 + }, + { + "epoch": 0.5962695181967312, + "grad_norm": 0.13480280339717865, + "learning_rate": 3.530123187664669e-05, + "loss": 1.0737, + "step": 9814 + }, + { + "epoch": 0.5963302752293578, + "grad_norm": 0.1989314705133438, + "learning_rate": 3.529208236520936e-05, + "loss": 1.0736, + "step": 9815 + }, + { + "epoch": 0.5963910322619843, + "grad_norm": 0.2590467035770416, + "learning_rate": 3.52829333928936e-05, + "loss": 0.9889, + "step": 9816 + }, + { + "epoch": 0.5964517892946108, + "grad_norm": 0.19125080108642578, + "learning_rate": 3.527378496003476e-05, + "loss": 1.1253, + "step": 9817 + }, + { + "epoch": 0.5965125463272374, + "grad_norm": 0.12208377569913864, + "learning_rate": 3.5264637066968175e-05, + "loss": 1.0019, + "step": 9818 + }, + { + "epoch": 0.5965733033598639, + "grad_norm": 0.18308749794960022, + "learning_rate": 3.5255489714029176e-05, + "loss": 1.0547, + "step": 9819 + }, + { + "epoch": 0.5966340603924905, + "grad_norm": 0.22826820611953735, + "learning_rate": 3.524634290155306e-05, + "loss": 1.1794, + "step": 9820 + }, + { + "epoch": 0.596694817425117, + "grad_norm": 0.12926438450813293, + "learning_rate": 3.523719662987508e-05, + "loss": 1.0652, + "step": 9821 + }, + { + "epoch": 0.5967555744577435, + "grad_norm": 0.23061497509479523, + "learning_rate": 3.522805089933052e-05, + "loss": 1.1796, + "step": 9822 + }, + { + "epoch": 0.5968163314903701, + "grad_norm": 0.18201753497123718, + "learning_rate": 3.52189057102546e-05, + "loss": 1.1523, + "step": 9823 + }, + { + "epoch": 0.5968770885229966, + "grad_norm": 0.39602550864219666, + "learning_rate": 3.520976106298255e-05, + "loss": 1.132, + "step": 9824 + }, + { + "epoch": 0.596937845555623, + "grad_norm": 0.9689731597900391, + "learning_rate": 3.520061695784958e-05, + "loss": 1.0133, + "step": 9825 + }, + { + "epoch": 0.5969986025882495, + "grad_norm": 0.1577349752187729, + "learning_rate": 3.5191473395190834e-05, + "loss": 1.0683, + "step": 9826 + }, + { + "epoch": 0.5970593596208761, + "grad_norm": 0.12769648432731628, + "learning_rate": 3.518233037534151e-05, + "loss": 1.075, + "step": 9827 + }, + { + "epoch": 0.5971201166535026, + "grad_norm": 8.29349422454834, + "learning_rate": 3.517318789863673e-05, + "loss": 1.039, + "step": 9828 + }, + { + "epoch": 0.5971808736861292, + "grad_norm": 1.2124171257019043, + "learning_rate": 3.516404596541161e-05, + "loss": 1.0738, + "step": 9829 + }, + { + "epoch": 0.5972416307187557, + "grad_norm": 0.1338082104921341, + "learning_rate": 3.515490457600126e-05, + "loss": 1.0306, + "step": 9830 + }, + { + "epoch": 0.5973023877513822, + "grad_norm": 0.12332873791456223, + "learning_rate": 3.5145763730740735e-05, + "loss": 1.0568, + "step": 9831 + }, + { + "epoch": 0.5973631447840088, + "grad_norm": 0.1417461782693863, + "learning_rate": 3.513662342996512e-05, + "loss": 1.0009, + "step": 9832 + }, + { + "epoch": 0.5974239018166353, + "grad_norm": 0.16883361339569092, + "learning_rate": 3.512748367400945e-05, + "loss": 1.0215, + "step": 9833 + }, + { + "epoch": 0.5974846588492618, + "grad_norm": 0.1710953414440155, + "learning_rate": 3.511834446320875e-05, + "loss": 1.1178, + "step": 9834 + }, + { + "epoch": 0.5975454158818884, + "grad_norm": 0.1153365820646286, + "learning_rate": 3.510920579789799e-05, + "loss": 1.0419, + "step": 9835 + }, + { + "epoch": 0.5976061729145149, + "grad_norm": 0.2657296061515808, + "learning_rate": 3.510006767841218e-05, + "loss": 1.1383, + "step": 9836 + }, + { + "epoch": 0.5976669299471414, + "grad_norm": 0.15079621970653534, + "learning_rate": 3.5090930105086264e-05, + "loss": 1.0884, + "step": 9837 + }, + { + "epoch": 0.5977276869797679, + "grad_norm": 0.1657806783914566, + "learning_rate": 3.508179307825518e-05, + "loss": 1.1339, + "step": 9838 + }, + { + "epoch": 0.5977884440123944, + "grad_norm": 0.22033043205738068, + "learning_rate": 3.5072656598253875e-05, + "loss": 1.0688, + "step": 9839 + }, + { + "epoch": 0.5978492010450209, + "grad_norm": 0.19386543333530426, + "learning_rate": 3.506352066541723e-05, + "loss": 1.1037, + "step": 9840 + }, + { + "epoch": 0.5979099580776475, + "grad_norm": 0.18420758843421936, + "learning_rate": 3.5054385280080104e-05, + "loss": 1.132, + "step": 9841 + }, + { + "epoch": 0.597970715110274, + "grad_norm": 0.13463185727596283, + "learning_rate": 3.504525044257738e-05, + "loss": 1.1056, + "step": 9842 + }, + { + "epoch": 0.5980314721429005, + "grad_norm": 0.1635916382074356, + "learning_rate": 3.503611615324388e-05, + "loss": 1.087, + "step": 9843 + }, + { + "epoch": 0.5980922291755271, + "grad_norm": 0.2804458439350128, + "learning_rate": 3.502698241241445e-05, + "loss": 1.0993, + "step": 9844 + }, + { + "epoch": 0.5981529862081536, + "grad_norm": 0.1420588195323944, + "learning_rate": 3.501784922042387e-05, + "loss": 1.0947, + "step": 9845 + }, + { + "epoch": 0.5982137432407801, + "grad_norm": 0.18791642785072327, + "learning_rate": 3.500871657760692e-05, + "loss": 1.0317, + "step": 9846 + }, + { + "epoch": 0.5982745002734067, + "grad_norm": 0.25479090213775635, + "learning_rate": 3.4999584484298385e-05, + "loss": 1.012, + "step": 9847 + }, + { + "epoch": 0.5983352573060332, + "grad_norm": 0.22574377059936523, + "learning_rate": 3.4990452940832954e-05, + "loss": 1.112, + "step": 9848 + }, + { + "epoch": 0.5983960143386597, + "grad_norm": 0.16704055666923523, + "learning_rate": 3.4981321947545374e-05, + "loss": 1.062, + "step": 9849 + }, + { + "epoch": 0.5984567713712863, + "grad_norm": 0.15069197118282318, + "learning_rate": 3.497219150477035e-05, + "loss": 1.0945, + "step": 9850 + }, + { + "epoch": 0.5985175284039127, + "grad_norm": 0.2017909288406372, + "learning_rate": 3.496306161284254e-05, + "loss": 1.1254, + "step": 9851 + }, + { + "epoch": 0.5985782854365392, + "grad_norm": 0.1575460433959961, + "learning_rate": 3.495393227209662e-05, + "loss": 1.095, + "step": 9852 + }, + { + "epoch": 0.5986390424691658, + "grad_norm": 0.19820109009742737, + "learning_rate": 3.4944803482867225e-05, + "loss": 1.0684, + "step": 9853 + }, + { + "epoch": 0.5986997995017923, + "grad_norm": 0.1419120877981186, + "learning_rate": 3.4935675245488976e-05, + "loss": 1.1057, + "step": 9854 + }, + { + "epoch": 0.5987605565344188, + "grad_norm": 0.22121503949165344, + "learning_rate": 3.4926547560296445e-05, + "loss": 1.0533, + "step": 9855 + }, + { + "epoch": 0.5988213135670454, + "grad_norm": 0.22658799588680267, + "learning_rate": 3.4917420427624224e-05, + "loss": 1.0575, + "step": 9856 + }, + { + "epoch": 0.5988820705996719, + "grad_norm": 0.13354939222335815, + "learning_rate": 3.4908293847806885e-05, + "loss": 1.092, + "step": 9857 + }, + { + "epoch": 0.5989428276322984, + "grad_norm": 0.18550904095172882, + "learning_rate": 3.489916782117895e-05, + "loss": 1.096, + "step": 9858 + }, + { + "epoch": 0.599003584664925, + "grad_norm": 0.14349329471588135, + "learning_rate": 3.489004234807492e-05, + "loss": 1.0385, + "step": 9859 + }, + { + "epoch": 0.5990643416975515, + "grad_norm": 0.11136182397603989, + "learning_rate": 3.488091742882935e-05, + "loss": 1.066, + "step": 9860 + }, + { + "epoch": 0.599125098730178, + "grad_norm": 0.17109361290931702, + "learning_rate": 3.4871793063776636e-05, + "loss": 1.0444, + "step": 9861 + }, + { + "epoch": 0.5991858557628046, + "grad_norm": 0.4811965227127075, + "learning_rate": 3.486266925325129e-05, + "loss": 1.1447, + "step": 9862 + }, + { + "epoch": 0.5992466127954311, + "grad_norm": 0.15174521505832672, + "learning_rate": 3.4853545997587725e-05, + "loss": 1.1018, + "step": 9863 + }, + { + "epoch": 0.5993073698280575, + "grad_norm": 0.17021965980529785, + "learning_rate": 3.4844423297120345e-05, + "loss": 1.1157, + "step": 9864 + }, + { + "epoch": 0.5993681268606841, + "grad_norm": 10.048443794250488, + "learning_rate": 3.4835301152183575e-05, + "loss": 1.1272, + "step": 9865 + }, + { + "epoch": 0.5994288838933106, + "grad_norm": 0.16117344796657562, + "learning_rate": 3.4826179563111764e-05, + "loss": 1.0734, + "step": 9866 + }, + { + "epoch": 0.5994896409259372, + "grad_norm": 0.14404891431331635, + "learning_rate": 3.4817058530239285e-05, + "loss": 1.0993, + "step": 9867 + }, + { + "epoch": 0.5995503979585637, + "grad_norm": 0.14917537569999695, + "learning_rate": 3.480793805390046e-05, + "loss": 1.036, + "step": 9868 + }, + { + "epoch": 0.5996111549911902, + "grad_norm": 0.2056112140417099, + "learning_rate": 3.479881813442959e-05, + "loss": 1.0609, + "step": 9869 + }, + { + "epoch": 0.5996719120238168, + "grad_norm": 0.15713055431842804, + "learning_rate": 3.4789698772160986e-05, + "loss": 0.9829, + "step": 9870 + }, + { + "epoch": 0.5997326690564433, + "grad_norm": 0.13040582835674286, + "learning_rate": 3.478057996742891e-05, + "loss": 1.0383, + "step": 9871 + }, + { + "epoch": 0.5997934260890698, + "grad_norm": 0.32370424270629883, + "learning_rate": 3.477146172056761e-05, + "loss": 1.0757, + "step": 9872 + }, + { + "epoch": 0.5998541831216964, + "grad_norm": 0.2965570390224457, + "learning_rate": 3.4762344031911354e-05, + "loss": 1.2418, + "step": 9873 + }, + { + "epoch": 0.5999149401543229, + "grad_norm": 0.21298748254776, + "learning_rate": 3.4753226901794294e-05, + "loss": 1.2357, + "step": 9874 + }, + { + "epoch": 0.5999756971869494, + "grad_norm": 0.2686533033847809, + "learning_rate": 3.474411033055066e-05, + "loss": 1.141, + "step": 9875 + }, + { + "epoch": 0.600036454219576, + "grad_norm": 0.2947900593280792, + "learning_rate": 3.473499431851461e-05, + "loss": 1.1134, + "step": 9876 + }, + { + "epoch": 0.6000972112522024, + "grad_norm": 0.1630922257900238, + "learning_rate": 3.472587886602028e-05, + "loss": 1.0854, + "step": 9877 + }, + { + "epoch": 0.6001579682848289, + "grad_norm": 0.27238163352012634, + "learning_rate": 3.471676397340183e-05, + "loss": 1.1999, + "step": 9878 + }, + { + "epoch": 0.6002187253174555, + "grad_norm": 0.23262737691402435, + "learning_rate": 3.470764964099334e-05, + "loss": 1.092, + "step": 9879 + }, + { + "epoch": 0.600279482350082, + "grad_norm": 0.15897086262702942, + "learning_rate": 3.4698535869128936e-05, + "loss": 1.0942, + "step": 9880 + }, + { + "epoch": 0.6003402393827085, + "grad_norm": 0.20158566534519196, + "learning_rate": 3.4689422658142635e-05, + "loss": 1.0326, + "step": 9881 + }, + { + "epoch": 0.6004009964153351, + "grad_norm": 0.13681265711784363, + "learning_rate": 3.46803100083685e-05, + "loss": 1.0768, + "step": 9882 + }, + { + "epoch": 0.6004617534479616, + "grad_norm": 0.1403191089630127, + "learning_rate": 3.4671197920140564e-05, + "loss": 1.0179, + "step": 9883 + }, + { + "epoch": 0.6005225104805881, + "grad_norm": 0.14691096544265747, + "learning_rate": 3.466208639379284e-05, + "loss": 1.1037, + "step": 9884 + }, + { + "epoch": 0.6005832675132147, + "grad_norm": 0.1478205919265747, + "learning_rate": 3.465297542965929e-05, + "loss": 1.0709, + "step": 9885 + }, + { + "epoch": 0.6006440245458412, + "grad_norm": 1.6108773946762085, + "learning_rate": 3.464386502807391e-05, + "loss": 1.0479, + "step": 9886 + }, + { + "epoch": 0.6007047815784677, + "grad_norm": 0.10805761814117432, + "learning_rate": 3.463475518937063e-05, + "loss": 1.0305, + "step": 9887 + }, + { + "epoch": 0.6007655386110943, + "grad_norm": 0.14876708388328552, + "learning_rate": 3.4625645913883355e-05, + "loss": 1.1031, + "step": 9888 + }, + { + "epoch": 0.6008262956437208, + "grad_norm": 0.2512546479701996, + "learning_rate": 3.4616537201946e-05, + "loss": 1.1995, + "step": 9889 + }, + { + "epoch": 0.6008870526763472, + "grad_norm": 0.1403788924217224, + "learning_rate": 3.460742905389244e-05, + "loss": 1.0758, + "step": 9890 + }, + { + "epoch": 0.6009478097089738, + "grad_norm": 0.12643319368362427, + "learning_rate": 3.459832147005656e-05, + "loss": 1.0355, + "step": 9891 + }, + { + "epoch": 0.6010085667416003, + "grad_norm": 0.2717328667640686, + "learning_rate": 3.458921445077218e-05, + "loss": 1.0379, + "step": 9892 + }, + { + "epoch": 0.6010693237742268, + "grad_norm": 0.14643657207489014, + "learning_rate": 3.4580107996373106e-05, + "loss": 1.0712, + "step": 9893 + }, + { + "epoch": 0.6011300808068534, + "grad_norm": 0.11006568372249603, + "learning_rate": 3.4571002107193195e-05, + "loss": 1.0195, + "step": 9894 + }, + { + "epoch": 0.6011908378394799, + "grad_norm": 0.1936243176460266, + "learning_rate": 3.4561896783566145e-05, + "loss": 1.1409, + "step": 9895 + }, + { + "epoch": 0.6012515948721064, + "grad_norm": 0.16525816917419434, + "learning_rate": 3.455279202582577e-05, + "loss": 1.1443, + "step": 9896 + }, + { + "epoch": 0.601312351904733, + "grad_norm": 0.2687567472457886, + "learning_rate": 3.454368783430578e-05, + "loss": 1.236, + "step": 9897 + }, + { + "epoch": 0.6013731089373595, + "grad_norm": 0.19922126829624176, + "learning_rate": 3.453458420933989e-05, + "loss": 1.0721, + "step": 9898 + }, + { + "epoch": 0.601433865969986, + "grad_norm": 0.1761442869901657, + "learning_rate": 3.452548115126183e-05, + "loss": 1.0249, + "step": 9899 + }, + { + "epoch": 0.6014946230026126, + "grad_norm": 0.3242648243904114, + "learning_rate": 3.451637866040524e-05, + "loss": 1.2274, + "step": 9900 + }, + { + "epoch": 0.6015553800352391, + "grad_norm": 0.17694208025932312, + "learning_rate": 3.450727673710379e-05, + "loss": 1.1644, + "step": 9901 + }, + { + "epoch": 0.6016161370678657, + "grad_norm": 0.14571638405323029, + "learning_rate": 3.44981753816911e-05, + "loss": 1.0457, + "step": 9902 + }, + { + "epoch": 0.6016768941004921, + "grad_norm": 0.2872547209262848, + "learning_rate": 3.448907459450078e-05, + "loss": 1.1491, + "step": 9903 + }, + { + "epoch": 0.6017376511331186, + "grad_norm": 0.11442230641841888, + "learning_rate": 3.447997437586643e-05, + "loss": 1.0131, + "step": 9904 + }, + { + "epoch": 0.6017984081657451, + "grad_norm": 0.1830238550901413, + "learning_rate": 3.447087472612163e-05, + "loss": 1.0277, + "step": 9905 + }, + { + "epoch": 0.6018591651983717, + "grad_norm": 0.12001951783895493, + "learning_rate": 3.44617756455999e-05, + "loss": 1.0789, + "step": 9906 + }, + { + "epoch": 0.6019199222309982, + "grad_norm": 0.21640633046627045, + "learning_rate": 3.445267713463482e-05, + "loss": 1.1854, + "step": 9907 + }, + { + "epoch": 0.6019806792636248, + "grad_norm": 0.2306807041168213, + "learning_rate": 3.444357919355984e-05, + "loss": 1.1046, + "step": 9908 + }, + { + "epoch": 0.6020414362962513, + "grad_norm": 0.22462685406208038, + "learning_rate": 3.443448182270848e-05, + "loss": 1.1133, + "step": 9909 + }, + { + "epoch": 0.6021021933288778, + "grad_norm": 0.1434704214334488, + "learning_rate": 3.442538502241419e-05, + "loss": 1.061, + "step": 9910 + }, + { + "epoch": 0.6021629503615044, + "grad_norm": 0.17077481746673584, + "learning_rate": 3.441628879301042e-05, + "loss": 1.0613, + "step": 9911 + }, + { + "epoch": 0.6022237073941309, + "grad_norm": 0.25269031524658203, + "learning_rate": 3.44071931348306e-05, + "loss": 1.0108, + "step": 9912 + }, + { + "epoch": 0.6022844644267574, + "grad_norm": 0.17380735278129578, + "learning_rate": 3.439809804820814e-05, + "loss": 1.0152, + "step": 9913 + }, + { + "epoch": 0.602345221459384, + "grad_norm": 0.13962875306606293, + "learning_rate": 3.438900353347641e-05, + "loss": 1.0166, + "step": 9914 + }, + { + "epoch": 0.6024059784920105, + "grad_norm": 0.1959177702665329, + "learning_rate": 3.437990959096877e-05, + "loss": 1.0248, + "step": 9915 + }, + { + "epoch": 0.602466735524637, + "grad_norm": 0.1793917566537857, + "learning_rate": 3.437081622101855e-05, + "loss": 1.0951, + "step": 9916 + }, + { + "epoch": 0.6025274925572635, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.436172342395909e-05, + "loss": 1.075, + "step": 9917 + }, + { + "epoch": 0.60258824958989, + "grad_norm": 0.18728770315647125, + "learning_rate": 3.435263120012369e-05, + "loss": 1.0791, + "step": 9918 + }, + { + "epoch": 0.6026490066225165, + "grad_norm": 0.16851933300495148, + "learning_rate": 3.4343539549845604e-05, + "loss": 1.0714, + "step": 9919 + }, + { + "epoch": 0.6027097636551431, + "grad_norm": 0.20317442715168, + "learning_rate": 3.433444847345812e-05, + "loss": 1.1057, + "step": 9920 + }, + { + "epoch": 0.6027705206877696, + "grad_norm": 0.1339462697505951, + "learning_rate": 3.4325357971294447e-05, + "loss": 1.0451, + "step": 9921 + }, + { + "epoch": 0.6028312777203961, + "grad_norm": 0.6338938474655151, + "learning_rate": 3.4316268043687806e-05, + "loss": 1.0922, + "step": 9922 + }, + { + "epoch": 0.6028920347530227, + "grad_norm": 0.24797533452510834, + "learning_rate": 3.430717869097139e-05, + "loss": 1.0616, + "step": 9923 + }, + { + "epoch": 0.6029527917856492, + "grad_norm": 0.16027776896953583, + "learning_rate": 3.4298089913478365e-05, + "loss": 1.0415, + "step": 9924 + }, + { + "epoch": 0.6030135488182757, + "grad_norm": 3.4841666221618652, + "learning_rate": 3.42890017115419e-05, + "loss": 1.0353, + "step": 9925 + }, + { + "epoch": 0.6030743058509023, + "grad_norm": 0.1936003565788269, + "learning_rate": 3.427991408549511e-05, + "loss": 1.0495, + "step": 9926 + }, + { + "epoch": 0.6031350628835288, + "grad_norm": 0.26577481627464294, + "learning_rate": 3.427082703567113e-05, + "loss": 1.1489, + "step": 9927 + }, + { + "epoch": 0.6031958199161553, + "grad_norm": 0.1490364819765091, + "learning_rate": 3.426174056240299e-05, + "loss": 1.1031, + "step": 9928 + }, + { + "epoch": 0.6032565769487819, + "grad_norm": 0.1860506385564804, + "learning_rate": 3.425265466602381e-05, + "loss": 1.0159, + "step": 9929 + }, + { + "epoch": 0.6033173339814083, + "grad_norm": 0.22505608201026917, + "learning_rate": 3.424356934686661e-05, + "loss": 1.1298, + "step": 9930 + }, + { + "epoch": 0.6033780910140348, + "grad_norm": 0.18013067543506622, + "learning_rate": 3.4234484605264416e-05, + "loss": 1.1182, + "step": 9931 + }, + { + "epoch": 0.6034388480466614, + "grad_norm": 0.12454739958047867, + "learning_rate": 3.422540044155025e-05, + "loss": 1.0455, + "step": 9932 + }, + { + "epoch": 0.6034996050792879, + "grad_norm": 0.1451205462217331, + "learning_rate": 3.421631685605707e-05, + "loss": 1.0411, + "step": 9933 + }, + { + "epoch": 0.6035603621119144, + "grad_norm": 0.18871988356113434, + "learning_rate": 3.420723384911787e-05, + "loss": 1.0932, + "step": 9934 + }, + { + "epoch": 0.603621119144541, + "grad_norm": 0.17945139110088348, + "learning_rate": 3.4198151421065546e-05, + "loss": 1.1294, + "step": 9935 + }, + { + "epoch": 0.6036818761771675, + "grad_norm": 0.13867826759815216, + "learning_rate": 3.418906957223303e-05, + "loss": 1.0384, + "step": 9936 + }, + { + "epoch": 0.603742633209794, + "grad_norm": 0.1369640976190567, + "learning_rate": 3.4179988302953235e-05, + "loss": 1.0742, + "step": 9937 + }, + { + "epoch": 0.6038033902424206, + "grad_norm": 0.6449360847473145, + "learning_rate": 3.417090761355902e-05, + "loss": 1.0601, + "step": 9938 + }, + { + "epoch": 0.6038641472750471, + "grad_norm": 0.17826220393180847, + "learning_rate": 3.416182750438325e-05, + "loss": 1.0608, + "step": 9939 + }, + { + "epoch": 0.6039249043076736, + "grad_norm": 0.14630477130413055, + "learning_rate": 3.415274797575878e-05, + "loss": 1.1097, + "step": 9940 + }, + { + "epoch": 0.6039856613403002, + "grad_norm": 0.11577188223600388, + "learning_rate": 3.414366902801836e-05, + "loss": 1.0183, + "step": 9941 + }, + { + "epoch": 0.6040464183729267, + "grad_norm": 0.1503218412399292, + "learning_rate": 3.413459066149483e-05, + "loss": 1.0807, + "step": 9942 + }, + { + "epoch": 0.6041071754055531, + "grad_norm": 0.15518291294574738, + "learning_rate": 3.412551287652094e-05, + "loss": 1.0353, + "step": 9943 + }, + { + "epoch": 0.6041679324381797, + "grad_norm": 0.9498729705810547, + "learning_rate": 3.411643567342944e-05, + "loss": 1.078, + "step": 9944 + }, + { + "epoch": 0.6042286894708062, + "grad_norm": 0.5091332197189331, + "learning_rate": 3.410735905255307e-05, + "loss": 1.1926, + "step": 9945 + }, + { + "epoch": 0.6042894465034327, + "grad_norm": 1.295035719871521, + "learning_rate": 3.409828301422453e-05, + "loss": 1.1119, + "step": 9946 + }, + { + "epoch": 0.6043502035360593, + "grad_norm": 0.13873019814491272, + "learning_rate": 3.408920755877651e-05, + "loss": 1.0396, + "step": 9947 + }, + { + "epoch": 0.6044109605686858, + "grad_norm": 0.19527703523635864, + "learning_rate": 3.4080132686541644e-05, + "loss": 1.113, + "step": 9948 + }, + { + "epoch": 0.6044717176013124, + "grad_norm": 0.3224121332168579, + "learning_rate": 3.4071058397852586e-05, + "loss": 1.1502, + "step": 9949 + }, + { + "epoch": 0.6045324746339389, + "grad_norm": 0.13711580634117126, + "learning_rate": 3.406198469304197e-05, + "loss": 1.0632, + "step": 9950 + }, + { + "epoch": 0.6045932316665654, + "grad_norm": 0.28841862082481384, + "learning_rate": 3.405291157244238e-05, + "loss": 1.14, + "step": 9951 + }, + { + "epoch": 0.604653988699192, + "grad_norm": 0.15258243680000305, + "learning_rate": 3.40438390363864e-05, + "loss": 1.033, + "step": 9952 + }, + { + "epoch": 0.6047147457318185, + "grad_norm": 0.40781161189079285, + "learning_rate": 3.4034767085206586e-05, + "loss": 1.1834, + "step": 9953 + }, + { + "epoch": 0.604775502764445, + "grad_norm": 0.13136610388755798, + "learning_rate": 3.4025695719235484e-05, + "loss": 0.9972, + "step": 9954 + }, + { + "epoch": 0.6048362597970716, + "grad_norm": 0.13902992010116577, + "learning_rate": 3.401662493880559e-05, + "loss": 1.1063, + "step": 9955 + }, + { + "epoch": 0.604897016829698, + "grad_norm": 0.7766736149787903, + "learning_rate": 3.400755474424939e-05, + "loss": 1.1366, + "step": 9956 + }, + { + "epoch": 0.6049577738623245, + "grad_norm": 0.2598614990711212, + "learning_rate": 3.399848513589935e-05, + "loss": 1.0349, + "step": 9957 + }, + { + "epoch": 0.6050185308949511, + "grad_norm": 0.1781376600265503, + "learning_rate": 3.3989416114087946e-05, + "loss": 1.0489, + "step": 9958 + }, + { + "epoch": 0.6050792879275776, + "grad_norm": 0.11530880630016327, + "learning_rate": 3.398034767914759e-05, + "loss": 1.0068, + "step": 9959 + }, + { + "epoch": 0.6051400449602041, + "grad_norm": 0.1621524542570114, + "learning_rate": 3.397127983141067e-05, + "loss": 1.0955, + "step": 9960 + }, + { + "epoch": 0.6052008019928307, + "grad_norm": 0.21690423786640167, + "learning_rate": 3.396221257120962e-05, + "loss": 1.0444, + "step": 9961 + }, + { + "epoch": 0.6052615590254572, + "grad_norm": 0.9953030943870544, + "learning_rate": 3.3953145898876746e-05, + "loss": 1.0638, + "step": 9962 + }, + { + "epoch": 0.6053223160580837, + "grad_norm": 0.13792288303375244, + "learning_rate": 3.394407981474441e-05, + "loss": 1.095, + "step": 9963 + }, + { + "epoch": 0.6053830730907103, + "grad_norm": 0.18514662981033325, + "learning_rate": 3.393501431914493e-05, + "loss": 1.0625, + "step": 9964 + }, + { + "epoch": 0.6054438301233368, + "grad_norm": 0.21107499301433563, + "learning_rate": 3.3925949412410605e-05, + "loss": 1.0457, + "step": 9965 + }, + { + "epoch": 0.6055045871559633, + "grad_norm": 0.20659013092517853, + "learning_rate": 3.391688509487372e-05, + "loss": 1.0561, + "step": 9966 + }, + { + "epoch": 0.6055653441885899, + "grad_norm": 0.15880298614501953, + "learning_rate": 3.390782136686652e-05, + "loss": 1.1561, + "step": 9967 + }, + { + "epoch": 0.6056261012212164, + "grad_norm": 0.16636249423027039, + "learning_rate": 3.389875822872124e-05, + "loss": 1.094, + "step": 9968 + }, + { + "epoch": 0.6056868582538428, + "grad_norm": 0.1496499478816986, + "learning_rate": 3.3889695680770085e-05, + "loss": 1.0398, + "step": 9969 + }, + { + "epoch": 0.6057476152864694, + "grad_norm": 0.1958138793706894, + "learning_rate": 3.388063372334524e-05, + "loss": 1.0476, + "step": 9970 + }, + { + "epoch": 0.6058083723190959, + "grad_norm": 0.14606082439422607, + "learning_rate": 3.387157235677889e-05, + "loss": 1.0819, + "step": 9971 + }, + { + "epoch": 0.6058691293517224, + "grad_norm": 0.9565188884735107, + "learning_rate": 3.386251158140317e-05, + "loss": 1.0515, + "step": 9972 + }, + { + "epoch": 0.605929886384349, + "grad_norm": 0.18806388974189758, + "learning_rate": 3.385345139755021e-05, + "loss": 1.1578, + "step": 9973 + }, + { + "epoch": 0.6059906434169755, + "grad_norm": 0.21009288728237152, + "learning_rate": 3.3844391805552124e-05, + "loss": 1.1221, + "step": 9974 + }, + { + "epoch": 0.606051400449602, + "grad_norm": 0.17453421652317047, + "learning_rate": 3.383533280574096e-05, + "loss": 1.1219, + "step": 9975 + }, + { + "epoch": 0.6061121574822286, + "grad_norm": 0.133493110537529, + "learning_rate": 3.38262743984488e-05, + "loss": 1.0319, + "step": 9976 + }, + { + "epoch": 0.6061729145148551, + "grad_norm": 0.21481958031654358, + "learning_rate": 3.3817216584007685e-05, + "loss": 1.1577, + "step": 9977 + }, + { + "epoch": 0.6062336715474816, + "grad_norm": 0.12405280768871307, + "learning_rate": 3.3808159362749605e-05, + "loss": 1.051, + "step": 9978 + }, + { + "epoch": 0.6062944285801082, + "grad_norm": 0.20788033306598663, + "learning_rate": 3.379910273500659e-05, + "loss": 1.1289, + "step": 9979 + }, + { + "epoch": 0.6063551856127347, + "grad_norm": 0.12333488464355469, + "learning_rate": 3.3790046701110596e-05, + "loss": 1.1233, + "step": 9980 + }, + { + "epoch": 0.6064159426453613, + "grad_norm": 0.23280581831932068, + "learning_rate": 3.378099126139358e-05, + "loss": 1.0311, + "step": 9981 + }, + { + "epoch": 0.6064766996779877, + "grad_norm": 0.17630217969417572, + "learning_rate": 3.3771936416187456e-05, + "loss": 1.1104, + "step": 9982 + }, + { + "epoch": 0.6065374567106142, + "grad_norm": 0.15698201954364777, + "learning_rate": 3.376288216582413e-05, + "loss": 1.077, + "step": 9983 + }, + { + "epoch": 0.6065982137432407, + "grad_norm": 0.13801109790802002, + "learning_rate": 3.375382851063551e-05, + "loss": 1.052, + "step": 9984 + }, + { + "epoch": 0.6066589707758673, + "grad_norm": 0.12702766060829163, + "learning_rate": 3.374477545095344e-05, + "loss": 1.0186, + "step": 9985 + }, + { + "epoch": 0.6067197278084938, + "grad_norm": 0.21862299740314484, + "learning_rate": 3.3735722987109764e-05, + "loss": 1.0767, + "step": 9986 + }, + { + "epoch": 0.6067804848411203, + "grad_norm": 2.080496311187744, + "learning_rate": 3.3726671119436325e-05, + "loss": 1.074, + "step": 9987 + }, + { + "epoch": 0.6068412418737469, + "grad_norm": 0.14575068652629852, + "learning_rate": 3.371761984826488e-05, + "loss": 1.0791, + "step": 9988 + }, + { + "epoch": 0.6069019989063734, + "grad_norm": 0.4230603277683258, + "learning_rate": 3.370856917392724e-05, + "loss": 1.1887, + "step": 9989 + }, + { + "epoch": 0.606962755939, + "grad_norm": 0.11740444600582123, + "learning_rate": 3.369951909675514e-05, + "loss": 1.024, + "step": 9990 + }, + { + "epoch": 0.6070235129716265, + "grad_norm": 0.14493657648563385, + "learning_rate": 3.3690469617080315e-05, + "loss": 1.084, + "step": 9991 + }, + { + "epoch": 0.607084270004253, + "grad_norm": 4.207291603088379, + "learning_rate": 3.368142073523448e-05, + "loss": 1.0914, + "step": 9992 + }, + { + "epoch": 0.6071450270368796, + "grad_norm": 0.20452556014060974, + "learning_rate": 3.367237245154933e-05, + "loss": 1.0837, + "step": 9993 + }, + { + "epoch": 0.6072057840695061, + "grad_norm": 0.22807788848876953, + "learning_rate": 3.366332476635653e-05, + "loss": 1.0819, + "step": 9994 + }, + { + "epoch": 0.6072665411021325, + "grad_norm": 0.15235096216201782, + "learning_rate": 3.36542776799877e-05, + "loss": 1.0217, + "step": 9995 + }, + { + "epoch": 0.607327298134759, + "grad_norm": 0.13611027598381042, + "learning_rate": 3.3645231192774486e-05, + "loss": 1.0813, + "step": 9996 + }, + { + "epoch": 0.6073880551673856, + "grad_norm": 0.14824730157852173, + "learning_rate": 3.363618530504848e-05, + "loss": 1.0464, + "step": 9997 + }, + { + "epoch": 0.6074488122000121, + "grad_norm": 0.23028042912483215, + "learning_rate": 3.362714001714128e-05, + "loss": 1.1059, + "step": 9998 + }, + { + "epoch": 0.6075095692326387, + "grad_norm": 0.1377759426832199, + "learning_rate": 3.361809532938441e-05, + "loss": 1.0555, + "step": 9999 + }, + { + "epoch": 0.6075703262652652, + "grad_norm": 0.31320837140083313, + "learning_rate": 3.360905124210943e-05, + "loss": 1.0533, + "step": 10000 + }, + { + "epoch": 0.6076310832978917, + "grad_norm": 0.1362789273262024, + "learning_rate": 3.360000775564785e-05, + "loss": 0.9948, + "step": 10001 + }, + { + "epoch": 0.6076918403305183, + "grad_norm": 0.13190710544586182, + "learning_rate": 3.3590964870331157e-05, + "loss": 1.0373, + "step": 10002 + }, + { + "epoch": 0.6077525973631448, + "grad_norm": 0.14964397251605988, + "learning_rate": 3.3581922586490815e-05, + "loss": 1.0544, + "step": 10003 + }, + { + "epoch": 0.6078133543957713, + "grad_norm": 0.21665766835212708, + "learning_rate": 3.357288090445827e-05, + "loss": 1.157, + "step": 10004 + }, + { + "epoch": 0.6078741114283979, + "grad_norm": 0.15179190039634705, + "learning_rate": 3.356383982456495e-05, + "loss": 1.0442, + "step": 10005 + }, + { + "epoch": 0.6079348684610244, + "grad_norm": 0.1554936021566391, + "learning_rate": 3.355479934714226e-05, + "loss": 1.0531, + "step": 10006 + }, + { + "epoch": 0.6079956254936509, + "grad_norm": 0.16013725101947784, + "learning_rate": 3.354575947252159e-05, + "loss": 1.034, + "step": 10007 + }, + { + "epoch": 0.6080563825262774, + "grad_norm": 0.16513048112392426, + "learning_rate": 3.353672020103428e-05, + "loss": 1.0643, + "step": 10008 + }, + { + "epoch": 0.6081171395589039, + "grad_norm": 0.1938750445842743, + "learning_rate": 3.352768153301166e-05, + "loss": 1.1527, + "step": 10009 + }, + { + "epoch": 0.6081778965915304, + "grad_norm": 0.1725156307220459, + "learning_rate": 3.351864346878507e-05, + "loss": 1.1187, + "step": 10010 + }, + { + "epoch": 0.608238653624157, + "grad_norm": 0.1336262822151184, + "learning_rate": 3.350960600868579e-05, + "loss": 1.0731, + "step": 10011 + }, + { + "epoch": 0.6082994106567835, + "grad_norm": 0.38251498341560364, + "learning_rate": 3.350056915304508e-05, + "loss": 1.0273, + "step": 10012 + }, + { + "epoch": 0.60836016768941, + "grad_norm": 0.14297010004520416, + "learning_rate": 3.34915329021942e-05, + "loss": 1.0649, + "step": 10013 + }, + { + "epoch": 0.6084209247220366, + "grad_norm": 0.12223240733146667, + "learning_rate": 3.348249725646439e-05, + "loss": 1.0048, + "step": 10014 + }, + { + "epoch": 0.6084816817546631, + "grad_norm": 0.123337522149086, + "learning_rate": 3.347346221618682e-05, + "loss": 1.1941, + "step": 10015 + }, + { + "epoch": 0.6085424387872896, + "grad_norm": 0.12456902861595154, + "learning_rate": 3.3464427781692695e-05, + "loss": 1.0506, + "step": 10016 + }, + { + "epoch": 0.6086031958199162, + "grad_norm": 0.5937848091125488, + "learning_rate": 3.345539395331315e-05, + "loss": 1.165, + "step": 10017 + }, + { + "epoch": 0.6086639528525427, + "grad_norm": 0.15157192945480347, + "learning_rate": 3.344636073137934e-05, + "loss": 1.0648, + "step": 10018 + }, + { + "epoch": 0.6087247098851692, + "grad_norm": 0.2541963458061218, + "learning_rate": 3.343732811622239e-05, + "loss": 1.0739, + "step": 10019 + }, + { + "epoch": 0.6087854669177958, + "grad_norm": 0.11770328879356384, + "learning_rate": 3.342829610817337e-05, + "loss": 1.0604, + "step": 10020 + }, + { + "epoch": 0.6088462239504223, + "grad_norm": 0.19118331372737885, + "learning_rate": 3.341926470756337e-05, + "loss": 1.1141, + "step": 10021 + }, + { + "epoch": 0.6089069809830487, + "grad_norm": 0.14297187328338623, + "learning_rate": 3.341023391472342e-05, + "loss": 1.0649, + "step": 10022 + }, + { + "epoch": 0.6089677380156753, + "grad_norm": 0.15951859951019287, + "learning_rate": 3.340120372998455e-05, + "loss": 1.0616, + "step": 10023 + }, + { + "epoch": 0.6090284950483018, + "grad_norm": 0.11807980388402939, + "learning_rate": 3.339217415367778e-05, + "loss": 1.0619, + "step": 10024 + }, + { + "epoch": 0.6090892520809283, + "grad_norm": 0.2047649472951889, + "learning_rate": 3.338314518613406e-05, + "loss": 1.2266, + "step": 10025 + }, + { + "epoch": 0.6091500091135549, + "grad_norm": 0.11830107122659683, + "learning_rate": 3.337411682768436e-05, + "loss": 1.0404, + "step": 10026 + }, + { + "epoch": 0.6092107661461814, + "grad_norm": 0.12327075004577637, + "learning_rate": 3.336508907865964e-05, + "loss": 1.0397, + "step": 10027 + }, + { + "epoch": 0.609271523178808, + "grad_norm": 0.12254289537668228, + "learning_rate": 3.3356061939390795e-05, + "loss": 1.0602, + "step": 10028 + }, + { + "epoch": 0.6093322802114345, + "grad_norm": 0.16362264752388, + "learning_rate": 3.3347035410208705e-05, + "loss": 1.2008, + "step": 10029 + }, + { + "epoch": 0.609393037244061, + "grad_norm": 0.23142939805984497, + "learning_rate": 3.333800949144426e-05, + "loss": 1.019, + "step": 10030 + }, + { + "epoch": 0.6094537942766876, + "grad_norm": 0.118656225502491, + "learning_rate": 3.332898418342829e-05, + "loss": 1.0602, + "step": 10031 + }, + { + "epoch": 0.6095145513093141, + "grad_norm": 0.15101879835128784, + "learning_rate": 3.331995948649161e-05, + "loss": 1.0141, + "step": 10032 + }, + { + "epoch": 0.6095753083419406, + "grad_norm": 0.2564082443714142, + "learning_rate": 3.331093540096507e-05, + "loss": 1.08, + "step": 10033 + }, + { + "epoch": 0.6096360653745672, + "grad_norm": 0.23222987353801727, + "learning_rate": 3.330191192717942e-05, + "loss": 1.191, + "step": 10034 + }, + { + "epoch": 0.6096968224071936, + "grad_norm": 0.3385956287384033, + "learning_rate": 3.3292889065465405e-05, + "loss": 1.2905, + "step": 10035 + }, + { + "epoch": 0.6097575794398201, + "grad_norm": 0.1074041798710823, + "learning_rate": 3.328386681615377e-05, + "loss": 1.0584, + "step": 10036 + }, + { + "epoch": 0.6098183364724467, + "grad_norm": 0.12528693675994873, + "learning_rate": 3.327484517957522e-05, + "loss": 1.0424, + "step": 10037 + }, + { + "epoch": 0.6098790935050732, + "grad_norm": 0.18149995803833008, + "learning_rate": 3.326582415606046e-05, + "loss": 1.031, + "step": 10038 + }, + { + "epoch": 0.6099398505376997, + "grad_norm": 0.13242071866989136, + "learning_rate": 3.325680374594015e-05, + "loss": 1.2254, + "step": 10039 + }, + { + "epoch": 0.6100006075703263, + "grad_norm": 0.175632044672966, + "learning_rate": 3.324778394954493e-05, + "loss": 1.0641, + "step": 10040 + }, + { + "epoch": 0.6100613646029528, + "grad_norm": 0.18444722890853882, + "learning_rate": 3.323876476720546e-05, + "loss": 1.113, + "step": 10041 + }, + { + "epoch": 0.6101221216355793, + "grad_norm": 0.15609140694141388, + "learning_rate": 3.322974619925227e-05, + "loss": 1.0359, + "step": 10042 + }, + { + "epoch": 0.6101828786682059, + "grad_norm": 0.1381586790084839, + "learning_rate": 3.322072824601598e-05, + "loss": 1.0405, + "step": 10043 + }, + { + "epoch": 0.6102436357008324, + "grad_norm": 0.4765521287918091, + "learning_rate": 3.321171090782715e-05, + "loss": 1.2477, + "step": 10044 + }, + { + "epoch": 0.6103043927334589, + "grad_norm": 0.18282434344291687, + "learning_rate": 3.320269418501628e-05, + "loss": 1.0574, + "step": 10045 + }, + { + "epoch": 0.6103651497660855, + "grad_norm": 0.19166181981563568, + "learning_rate": 3.3193678077913926e-05, + "loss": 1.084, + "step": 10046 + }, + { + "epoch": 0.610425906798712, + "grad_norm": 0.13756051659584045, + "learning_rate": 3.3184662586850545e-05, + "loss": 1.0684, + "step": 10047 + }, + { + "epoch": 0.6104866638313384, + "grad_norm": 0.2356358766555786, + "learning_rate": 3.317564771215661e-05, + "loss": 1.0379, + "step": 10048 + }, + { + "epoch": 0.610547420863965, + "grad_norm": 0.15451419353485107, + "learning_rate": 3.316663345416256e-05, + "loss": 1.1197, + "step": 10049 + }, + { + "epoch": 0.6106081778965915, + "grad_norm": 3.828979730606079, + "learning_rate": 3.315761981319881e-05, + "loss": 1.1231, + "step": 10050 + }, + { + "epoch": 0.610668934929218, + "grad_norm": 0.2020713984966278, + "learning_rate": 3.314860678959575e-05, + "loss": 1.036, + "step": 10051 + }, + { + "epoch": 0.6107296919618446, + "grad_norm": 0.1029885783791542, + "learning_rate": 3.313959438368378e-05, + "loss": 1.0235, + "step": 10052 + }, + { + "epoch": 0.6107904489944711, + "grad_norm": 0.15715427696704865, + "learning_rate": 3.3130582595793226e-05, + "loss": 1.1494, + "step": 10053 + }, + { + "epoch": 0.6108512060270976, + "grad_norm": 0.1653818041086197, + "learning_rate": 3.3121571426254456e-05, + "loss": 1.0726, + "step": 10054 + }, + { + "epoch": 0.6109119630597242, + "grad_norm": 0.33791765570640564, + "learning_rate": 3.311256087539771e-05, + "loss": 1.1035, + "step": 10055 + }, + { + "epoch": 0.6109727200923507, + "grad_norm": 0.2315925508737564, + "learning_rate": 3.310355094355333e-05, + "loss": 1.1706, + "step": 10056 + }, + { + "epoch": 0.6110334771249772, + "grad_norm": 0.24786247313022614, + "learning_rate": 3.309454163105156e-05, + "loss": 1.0292, + "step": 10057 + }, + { + "epoch": 0.6110942341576038, + "grad_norm": 0.48100122809410095, + "learning_rate": 3.308553293822262e-05, + "loss": 1.1718, + "step": 10058 + }, + { + "epoch": 0.6111549911902303, + "grad_norm": 0.1851678490638733, + "learning_rate": 3.307652486539674e-05, + "loss": 1.0945, + "step": 10059 + }, + { + "epoch": 0.6112157482228568, + "grad_norm": 0.2552066445350647, + "learning_rate": 3.306751741290413e-05, + "loss": 1.1719, + "step": 10060 + }, + { + "epoch": 0.6112765052554833, + "grad_norm": 0.204861581325531, + "learning_rate": 3.305851058107494e-05, + "loss": 1.1591, + "step": 10061 + }, + { + "epoch": 0.6113372622881098, + "grad_norm": 0.13126778602600098, + "learning_rate": 3.30495043702393e-05, + "loss": 1.0683, + "step": 10062 + }, + { + "epoch": 0.6113980193207363, + "grad_norm": 0.21355962753295898, + "learning_rate": 3.3040498780727355e-05, + "loss": 1.0148, + "step": 10063 + }, + { + "epoch": 0.6114587763533629, + "grad_norm": 0.1705038696527481, + "learning_rate": 3.303149381286922e-05, + "loss": 1.0905, + "step": 10064 + }, + { + "epoch": 0.6115195333859894, + "grad_norm": 0.14262165129184723, + "learning_rate": 3.3022489466994954e-05, + "loss": 1.052, + "step": 10065 + }, + { + "epoch": 0.611580290418616, + "grad_norm": 0.26569920778274536, + "learning_rate": 3.3013485743434615e-05, + "loss": 1.2143, + "step": 10066 + }, + { + "epoch": 0.6116410474512425, + "grad_norm": 0.17504224181175232, + "learning_rate": 3.300448264251824e-05, + "loss": 1.0714, + "step": 10067 + }, + { + "epoch": 0.611701804483869, + "grad_norm": 0.31215140223503113, + "learning_rate": 3.299548016457586e-05, + "loss": 1.0429, + "step": 10068 + }, + { + "epoch": 0.6117625615164956, + "grad_norm": 0.11999605596065521, + "learning_rate": 3.298647830993742e-05, + "loss": 0.9838, + "step": 10069 + }, + { + "epoch": 0.6118233185491221, + "grad_norm": 0.22586029767990112, + "learning_rate": 3.2977477078932904e-05, + "loss": 1.0422, + "step": 10070 + }, + { + "epoch": 0.6118840755817486, + "grad_norm": 0.1299959421157837, + "learning_rate": 3.296847647189225e-05, + "loss": 1.0724, + "step": 10071 + }, + { + "epoch": 0.6119448326143752, + "grad_norm": 0.13999871909618378, + "learning_rate": 3.29594764891454e-05, + "loss": 1.0004, + "step": 10072 + }, + { + "epoch": 0.6120055896470017, + "grad_norm": 0.13911172747612, + "learning_rate": 3.295047713102223e-05, + "loss": 1.0468, + "step": 10073 + }, + { + "epoch": 0.6120663466796281, + "grad_norm": 0.1452186107635498, + "learning_rate": 3.294147839785262e-05, + "loss": 1.0418, + "step": 10074 + }, + { + "epoch": 0.6121271037122546, + "grad_norm": 0.16490453481674194, + "learning_rate": 3.293248028996642e-05, + "loss": 1.0837, + "step": 10075 + }, + { + "epoch": 0.6121878607448812, + "grad_norm": 0.12922118604183197, + "learning_rate": 3.292348280769343e-05, + "loss": 0.9894, + "step": 10076 + }, + { + "epoch": 0.6122486177775077, + "grad_norm": 0.13093970715999603, + "learning_rate": 3.29144859513635e-05, + "loss": 1.0333, + "step": 10077 + }, + { + "epoch": 0.6123093748101343, + "grad_norm": 0.15148712694644928, + "learning_rate": 3.2905489721306385e-05, + "loss": 1.0701, + "step": 10078 + }, + { + "epoch": 0.6123701318427608, + "grad_norm": 0.25300851464271545, + "learning_rate": 3.289649411785183e-05, + "loss": 1.1538, + "step": 10079 + }, + { + "epoch": 0.6124308888753873, + "grad_norm": 0.13811343908309937, + "learning_rate": 3.2887499141329606e-05, + "loss": 1.0605, + "step": 10080 + }, + { + "epoch": 0.6124916459080139, + "grad_norm": 0.20048639178276062, + "learning_rate": 3.287850479206942e-05, + "loss": 1.0471, + "step": 10081 + }, + { + "epoch": 0.6125524029406404, + "grad_norm": 0.22141355276107788, + "learning_rate": 3.286951107040094e-05, + "loss": 1.0536, + "step": 10082 + }, + { + "epoch": 0.6126131599732669, + "grad_norm": 0.42613619565963745, + "learning_rate": 3.2860517976653845e-05, + "loss": 0.9982, + "step": 10083 + }, + { + "epoch": 0.6126739170058935, + "grad_norm": 0.5643616318702698, + "learning_rate": 3.2851525511157764e-05, + "loss": 1.0455, + "step": 10084 + }, + { + "epoch": 0.61273467403852, + "grad_norm": 0.14203448593616486, + "learning_rate": 3.284253367424234e-05, + "loss": 1.0232, + "step": 10085 + }, + { + "epoch": 0.6127954310711465, + "grad_norm": 0.23522542417049408, + "learning_rate": 3.283354246623716e-05, + "loss": 1.035, + "step": 10086 + }, + { + "epoch": 0.612856188103773, + "grad_norm": 0.1335030496120453, + "learning_rate": 3.282455188747179e-05, + "loss": 1.0774, + "step": 10087 + }, + { + "epoch": 0.6129169451363995, + "grad_norm": 0.1179598867893219, + "learning_rate": 3.2815561938275815e-05, + "loss": 1.0189, + "step": 10088 + }, + { + "epoch": 0.612977702169026, + "grad_norm": 0.2610608637332916, + "learning_rate": 3.280657261897872e-05, + "loss": 1.1764, + "step": 10089 + }, + { + "epoch": 0.6130384592016526, + "grad_norm": 1.9669466018676758, + "learning_rate": 3.279758392991004e-05, + "loss": 1.0661, + "step": 10090 + }, + { + "epoch": 0.6130992162342791, + "grad_norm": 0.4825950860977173, + "learning_rate": 3.278859587139923e-05, + "loss": 1.02, + "step": 10091 + }, + { + "epoch": 0.6131599732669056, + "grad_norm": 0.4671441316604614, + "learning_rate": 3.277960844377577e-05, + "loss": 1.2157, + "step": 10092 + }, + { + "epoch": 0.6132207302995322, + "grad_norm": 0.11444177478551865, + "learning_rate": 3.277062164736909e-05, + "loss": 1.0302, + "step": 10093 + }, + { + "epoch": 0.6132814873321587, + "grad_norm": 0.16415248811244965, + "learning_rate": 3.276163548250862e-05, + "loss": 1.0196, + "step": 10094 + }, + { + "epoch": 0.6133422443647852, + "grad_norm": 0.30551454424858093, + "learning_rate": 3.275264994952373e-05, + "loss": 1.1052, + "step": 10095 + }, + { + "epoch": 0.6134030013974118, + "grad_norm": 0.15775832533836365, + "learning_rate": 3.2743665048743785e-05, + "loss": 1.1031, + "step": 10096 + }, + { + "epoch": 0.6134637584300383, + "grad_norm": 0.3249801695346832, + "learning_rate": 3.273468078049812e-05, + "loss": 1.0651, + "step": 10097 + }, + { + "epoch": 0.6135245154626648, + "grad_norm": 0.12081403285264969, + "learning_rate": 3.272569714511607e-05, + "loss": 1.0351, + "step": 10098 + }, + { + "epoch": 0.6135852724952914, + "grad_norm": 0.13086235523223877, + "learning_rate": 3.2716714142926953e-05, + "loss": 1.0114, + "step": 10099 + }, + { + "epoch": 0.6136460295279178, + "grad_norm": 0.17534758150577545, + "learning_rate": 3.270773177425999e-05, + "loss": 1.1343, + "step": 10100 + }, + { + "epoch": 0.6137067865605443, + "grad_norm": 0.15806272625923157, + "learning_rate": 3.2698750039444504e-05, + "loss": 1.0546, + "step": 10101 + }, + { + "epoch": 0.6137675435931709, + "grad_norm": 0.12268459796905518, + "learning_rate": 3.268976893880964e-05, + "loss": 1.0859, + "step": 10102 + }, + { + "epoch": 0.6138283006257974, + "grad_norm": 0.2153148502111435, + "learning_rate": 3.268078847268465e-05, + "loss": 1.1813, + "step": 10103 + }, + { + "epoch": 0.6138890576584239, + "grad_norm": 0.1714666187763214, + "learning_rate": 3.2671808641398714e-05, + "loss": 1.0427, + "step": 10104 + }, + { + "epoch": 0.6139498146910505, + "grad_norm": 0.231259286403656, + "learning_rate": 3.266282944528097e-05, + "loss": 1.0358, + "step": 10105 + }, + { + "epoch": 0.614010571723677, + "grad_norm": 0.1366434097290039, + "learning_rate": 3.2653850884660576e-05, + "loss": 1.079, + "step": 10106 + }, + { + "epoch": 0.6140713287563035, + "grad_norm": 0.3615018129348755, + "learning_rate": 3.2644872959866626e-05, + "loss": 1.2342, + "step": 10107 + }, + { + "epoch": 0.6141320857889301, + "grad_norm": 0.11281843483448029, + "learning_rate": 3.263589567122823e-05, + "loss": 1.0384, + "step": 10108 + }, + { + "epoch": 0.6141928428215566, + "grad_norm": 0.25387972593307495, + "learning_rate": 3.262691901907442e-05, + "loss": 1.2153, + "step": 10109 + }, + { + "epoch": 0.6142535998541832, + "grad_norm": 0.1566266417503357, + "learning_rate": 3.261794300373424e-05, + "loss": 1.0753, + "step": 10110 + }, + { + "epoch": 0.6143143568868097, + "grad_norm": 0.15186652541160583, + "learning_rate": 3.260896762553673e-05, + "loss": 1.1173, + "step": 10111 + }, + { + "epoch": 0.6143751139194362, + "grad_norm": 0.384333074092865, + "learning_rate": 3.259999288481087e-05, + "loss": 1.2207, + "step": 10112 + }, + { + "epoch": 0.6144358709520626, + "grad_norm": 0.18710604310035706, + "learning_rate": 3.259101878188563e-05, + "loss": 1.04, + "step": 10113 + }, + { + "epoch": 0.6144966279846892, + "grad_norm": 0.11207471042871475, + "learning_rate": 3.2582045317089966e-05, + "loss": 1.0117, + "step": 10114 + }, + { + "epoch": 0.6145573850173157, + "grad_norm": 0.3282966911792755, + "learning_rate": 3.257307249075281e-05, + "loss": 1.2205, + "step": 10115 + }, + { + "epoch": 0.6146181420499423, + "grad_norm": 0.20154400169849396, + "learning_rate": 3.2564100303203035e-05, + "loss": 1.0794, + "step": 10116 + }, + { + "epoch": 0.6146788990825688, + "grad_norm": 0.14722199738025665, + "learning_rate": 3.2555128754769544e-05, + "loss": 1.0998, + "step": 10117 + }, + { + "epoch": 0.6147396561151953, + "grad_norm": 0.12835252285003662, + "learning_rate": 3.254615784578117e-05, + "loss": 1.0498, + "step": 10118 + }, + { + "epoch": 0.6148004131478219, + "grad_norm": 0.15189583599567413, + "learning_rate": 3.253718757656675e-05, + "loss": 1.0448, + "step": 10119 + }, + { + "epoch": 0.6148611701804484, + "grad_norm": 0.16218426823616028, + "learning_rate": 3.252821794745511e-05, + "loss": 1.101, + "step": 10120 + }, + { + "epoch": 0.6149219272130749, + "grad_norm": 0.13713757693767548, + "learning_rate": 3.251924895877502e-05, + "loss": 1.0704, + "step": 10121 + }, + { + "epoch": 0.6149826842457015, + "grad_norm": 0.17553897202014923, + "learning_rate": 3.251028061085522e-05, + "loss": 1.1297, + "step": 10122 + }, + { + "epoch": 0.615043441278328, + "grad_norm": 0.14513979852199554, + "learning_rate": 3.250131290402448e-05, + "loss": 1.0314, + "step": 10123 + }, + { + "epoch": 0.6151041983109545, + "grad_norm": 0.2382880449295044, + "learning_rate": 3.249234583861149e-05, + "loss": 1.0434, + "step": 10124 + }, + { + "epoch": 0.6151649553435811, + "grad_norm": 0.1594681739807129, + "learning_rate": 3.248337941494496e-05, + "loss": 1.0743, + "step": 10125 + }, + { + "epoch": 0.6152257123762076, + "grad_norm": 0.14009036123752594, + "learning_rate": 3.2474413633353515e-05, + "loss": 1.0436, + "step": 10126 + }, + { + "epoch": 0.615286469408834, + "grad_norm": 0.40217599272727966, + "learning_rate": 3.246544849416585e-05, + "loss": 1.0653, + "step": 10127 + }, + { + "epoch": 0.6153472264414606, + "grad_norm": 0.7947143912315369, + "learning_rate": 3.245648399771057e-05, + "loss": 1.0742, + "step": 10128 + }, + { + "epoch": 0.6154079834740871, + "grad_norm": 0.142560213804245, + "learning_rate": 3.2447520144316246e-05, + "loss": 1.1069, + "step": 10129 + }, + { + "epoch": 0.6154687405067136, + "grad_norm": 0.11180705577135086, + "learning_rate": 3.243855693431146e-05, + "loss": 1.0368, + "step": 10130 + }, + { + "epoch": 0.6155294975393402, + "grad_norm": 0.274832159280777, + "learning_rate": 3.2429594368024766e-05, + "loss": 1.0994, + "step": 10131 + }, + { + "epoch": 0.6155902545719667, + "grad_norm": 0.12490338087081909, + "learning_rate": 3.2420632445784685e-05, + "loss": 1.0175, + "step": 10132 + }, + { + "epoch": 0.6156510116045932, + "grad_norm": 0.17404744029045105, + "learning_rate": 3.241167116791971e-05, + "loss": 1.1929, + "step": 10133 + }, + { + "epoch": 0.6157117686372198, + "grad_norm": 0.16612984240055084, + "learning_rate": 3.240271053475834e-05, + "loss": 1.0293, + "step": 10134 + }, + { + "epoch": 0.6157725256698463, + "grad_norm": 0.16274787485599518, + "learning_rate": 3.239375054662903e-05, + "loss": 1.06, + "step": 10135 + }, + { + "epoch": 0.6158332827024728, + "grad_norm": 0.16788017749786377, + "learning_rate": 3.238479120386019e-05, + "loss": 1.0531, + "step": 10136 + }, + { + "epoch": 0.6158940397350994, + "grad_norm": 0.15702417492866516, + "learning_rate": 3.237583250678022e-05, + "loss": 1.037, + "step": 10137 + }, + { + "epoch": 0.6159547967677259, + "grad_norm": 0.18851259350776672, + "learning_rate": 3.236687445571752e-05, + "loss": 1.0325, + "step": 10138 + }, + { + "epoch": 0.6160155538003524, + "grad_norm": 0.21135739982128143, + "learning_rate": 3.2357917051000454e-05, + "loss": 1.3347, + "step": 10139 + }, + { + "epoch": 0.6160763108329789, + "grad_norm": 0.23360225558280945, + "learning_rate": 3.2348960292957345e-05, + "loss": 1.1451, + "step": 10140 + }, + { + "epoch": 0.6161370678656054, + "grad_norm": 0.11978757381439209, + "learning_rate": 3.234000418191652e-05, + "loss": 1.0201, + "step": 10141 + }, + { + "epoch": 0.6161978248982319, + "grad_norm": 0.8820341229438782, + "learning_rate": 3.233104871820623e-05, + "loss": 1.2746, + "step": 10142 + }, + { + "epoch": 0.6162585819308585, + "grad_norm": 0.1710776388645172, + "learning_rate": 3.2322093902154776e-05, + "loss": 1.0617, + "step": 10143 + }, + { + "epoch": 0.616319338963485, + "grad_norm": 0.1668907105922699, + "learning_rate": 3.2313139734090395e-05, + "loss": 1.0229, + "step": 10144 + }, + { + "epoch": 0.6163800959961115, + "grad_norm": 17.193498611450195, + "learning_rate": 3.2304186214341284e-05, + "loss": 1.2036, + "step": 10145 + }, + { + "epoch": 0.6164408530287381, + "grad_norm": 0.16899652779102325, + "learning_rate": 3.2295233343235645e-05, + "loss": 1.1058, + "step": 10146 + }, + { + "epoch": 0.6165016100613646, + "grad_norm": 0.11913230270147324, + "learning_rate": 3.228628112110166e-05, + "loss": 1.0108, + "step": 10147 + }, + { + "epoch": 0.6165623670939911, + "grad_norm": 0.15516933798789978, + "learning_rate": 3.227732954826747e-05, + "loss": 1.0876, + "step": 10148 + }, + { + "epoch": 0.6166231241266177, + "grad_norm": 0.23348292708396912, + "learning_rate": 3.2268378625061185e-05, + "loss": 1.1455, + "step": 10149 + }, + { + "epoch": 0.6166838811592442, + "grad_norm": 0.39024418592453003, + "learning_rate": 3.225942835181091e-05, + "loss": 1.2577, + "step": 10150 + }, + { + "epoch": 0.6167446381918708, + "grad_norm": 0.14650747179985046, + "learning_rate": 3.2250478728844704e-05, + "loss": 1.0989, + "step": 10151 + }, + { + "epoch": 0.6168053952244973, + "grad_norm": 0.17481721937656403, + "learning_rate": 3.224152975649064e-05, + "loss": 1.022, + "step": 10152 + }, + { + "epoch": 0.6168661522571237, + "grad_norm": 0.14200538396835327, + "learning_rate": 3.223258143507674e-05, + "loss": 1.0547, + "step": 10153 + }, + { + "epoch": 0.6169269092897502, + "grad_norm": 0.1690562665462494, + "learning_rate": 3.2223633764931e-05, + "loss": 1.1528, + "step": 10154 + }, + { + "epoch": 0.6169876663223768, + "grad_norm": 0.14175823330879211, + "learning_rate": 3.2214686746381415e-05, + "loss": 1.04, + "step": 10155 + }, + { + "epoch": 0.6170484233550033, + "grad_norm": 0.16546085476875305, + "learning_rate": 3.22057403797559e-05, + "loss": 1.1701, + "step": 10156 + }, + { + "epoch": 0.6171091803876299, + "grad_norm": 0.12204727530479431, + "learning_rate": 3.219679466538242e-05, + "loss": 1.0554, + "step": 10157 + }, + { + "epoch": 0.6171699374202564, + "grad_norm": 0.19949981570243835, + "learning_rate": 3.218784960358888e-05, + "loss": 1.019, + "step": 10158 + }, + { + "epoch": 0.6172306944528829, + "grad_norm": 0.2087552398443222, + "learning_rate": 3.217890519470313e-05, + "loss": 1.0144, + "step": 10159 + }, + { + "epoch": 0.6172914514855095, + "grad_norm": 0.19289065897464752, + "learning_rate": 3.216996143905308e-05, + "loss": 1.0775, + "step": 10160 + }, + { + "epoch": 0.617352208518136, + "grad_norm": 0.14146707952022552, + "learning_rate": 3.216101833696655e-05, + "loss": 1.0725, + "step": 10161 + }, + { + "epoch": 0.6174129655507625, + "grad_norm": 0.10160516202449799, + "learning_rate": 3.215207588877133e-05, + "loss": 1.0466, + "step": 10162 + }, + { + "epoch": 0.6174737225833891, + "grad_norm": 0.16701847314834595, + "learning_rate": 3.214313409479521e-05, + "loss": 1.0712, + "step": 10163 + }, + { + "epoch": 0.6175344796160156, + "grad_norm": 0.14244811236858368, + "learning_rate": 3.2134192955365954e-05, + "loss": 1.0711, + "step": 10164 + }, + { + "epoch": 0.6175952366486421, + "grad_norm": 0.11480254679918289, + "learning_rate": 3.2125252470811315e-05, + "loss": 1.0228, + "step": 10165 + }, + { + "epoch": 0.6176559936812686, + "grad_norm": 0.10029196739196777, + "learning_rate": 3.211631264145901e-05, + "loss": 1.0352, + "step": 10166 + }, + { + "epoch": 0.6177167507138951, + "grad_norm": 0.11403120309114456, + "learning_rate": 3.2107373467636715e-05, + "loss": 1.0478, + "step": 10167 + }, + { + "epoch": 0.6177775077465216, + "grad_norm": 0.1351073831319809, + "learning_rate": 3.209843494967213e-05, + "loss": 1.0764, + "step": 10168 + }, + { + "epoch": 0.6178382647791482, + "grad_norm": 0.10452364385128021, + "learning_rate": 3.208949708789284e-05, + "loss": 1.0281, + "step": 10169 + }, + { + "epoch": 0.6178990218117747, + "grad_norm": 0.27388057112693787, + "learning_rate": 3.2080559882626505e-05, + "loss": 1.0349, + "step": 10170 + }, + { + "epoch": 0.6179597788444012, + "grad_norm": 0.13104680180549622, + "learning_rate": 3.2071623334200714e-05, + "loss": 1.023, + "step": 10171 + }, + { + "epoch": 0.6180205358770278, + "grad_norm": 0.13915517926216125, + "learning_rate": 3.2062687442943023e-05, + "loss": 1.0893, + "step": 10172 + }, + { + "epoch": 0.6180812929096543, + "grad_norm": 0.16732257604599, + "learning_rate": 3.2053752209181004e-05, + "loss": 1.1302, + "step": 10173 + }, + { + "epoch": 0.6181420499422808, + "grad_norm": 0.1678365021944046, + "learning_rate": 3.2044817633242165e-05, + "loss": 1.1527, + "step": 10174 + }, + { + "epoch": 0.6182028069749074, + "grad_norm": 0.15535657107830048, + "learning_rate": 3.203588371545402e-05, + "loss": 0.997, + "step": 10175 + }, + { + "epoch": 0.6182635640075339, + "grad_norm": 0.2423597276210785, + "learning_rate": 3.2026950456144013e-05, + "loss": 1.0559, + "step": 10176 + }, + { + "epoch": 0.6183243210401604, + "grad_norm": 0.11685813963413239, + "learning_rate": 3.2018017855639606e-05, + "loss": 1.0282, + "step": 10177 + }, + { + "epoch": 0.618385078072787, + "grad_norm": 0.16012737154960632, + "learning_rate": 3.200908591426823e-05, + "loss": 1.0619, + "step": 10178 + }, + { + "epoch": 0.6184458351054134, + "grad_norm": 0.10786806046962738, + "learning_rate": 3.2000154632357305e-05, + "loss": 1.0494, + "step": 10179 + }, + { + "epoch": 0.6185065921380399, + "grad_norm": 0.1822032630443573, + "learning_rate": 3.199122401023417e-05, + "loss": 1.159, + "step": 10180 + }, + { + "epoch": 0.6185673491706665, + "grad_norm": 0.20492428541183472, + "learning_rate": 3.198229404822621e-05, + "loss": 1.0545, + "step": 10181 + }, + { + "epoch": 0.618628106203293, + "grad_norm": 4.97695779800415, + "learning_rate": 3.197336474666076e-05, + "loss": 1.1523, + "step": 10182 + }, + { + "epoch": 0.6186888632359195, + "grad_norm": 0.16445136070251465, + "learning_rate": 3.196443610586509e-05, + "loss": 1.1553, + "step": 10183 + }, + { + "epoch": 0.6187496202685461, + "grad_norm": 0.31077760457992554, + "learning_rate": 3.195550812616651e-05, + "loss": 1.1558, + "step": 10184 + }, + { + "epoch": 0.6188103773011726, + "grad_norm": 0.1357533186674118, + "learning_rate": 3.1946580807892255e-05, + "loss": 1.0948, + "step": 10185 + }, + { + "epoch": 0.6188711343337991, + "grad_norm": 0.11395826190710068, + "learning_rate": 3.1937654151369584e-05, + "loss": 1.0557, + "step": 10186 + }, + { + "epoch": 0.6189318913664257, + "grad_norm": 0.2083105444908142, + "learning_rate": 3.192872815692569e-05, + "loss": 1.0904, + "step": 10187 + }, + { + "epoch": 0.6189926483990522, + "grad_norm": 0.6378744840621948, + "learning_rate": 3.191980282488776e-05, + "loss": 1.0577, + "step": 10188 + }, + { + "epoch": 0.6190534054316787, + "grad_norm": 0.3138388395309448, + "learning_rate": 3.191087815558295e-05, + "loss": 1.0547, + "step": 10189 + }, + { + "epoch": 0.6191141624643053, + "grad_norm": 0.2703898549079895, + "learning_rate": 3.190195414933839e-05, + "loss": 1.1554, + "step": 10190 + }, + { + "epoch": 0.6191749194969318, + "grad_norm": 0.1886947900056839, + "learning_rate": 3.18930308064812e-05, + "loss": 1.1593, + "step": 10191 + }, + { + "epoch": 0.6192356765295582, + "grad_norm": 0.1358928382396698, + "learning_rate": 3.188410812733847e-05, + "loss": 1.0587, + "step": 10192 + }, + { + "epoch": 0.6192964335621848, + "grad_norm": 0.1611601561307907, + "learning_rate": 3.187518611223725e-05, + "loss": 1.0332, + "step": 10193 + }, + { + "epoch": 0.6193571905948113, + "grad_norm": 0.22348247468471527, + "learning_rate": 3.186626476150459e-05, + "loss": 1.0699, + "step": 10194 + }, + { + "epoch": 0.6194179476274378, + "grad_norm": 0.16034063696861267, + "learning_rate": 3.185734407546752e-05, + "loss": 1.0695, + "step": 10195 + }, + { + "epoch": 0.6194787046600644, + "grad_norm": 0.1393730640411377, + "learning_rate": 3.1848424054452994e-05, + "loss": 1.0583, + "step": 10196 + }, + { + "epoch": 0.6195394616926909, + "grad_norm": 0.15678107738494873, + "learning_rate": 3.1839504698787995e-05, + "loss": 1.0268, + "step": 10197 + }, + { + "epoch": 0.6196002187253175, + "grad_norm": 1.5592325925827026, + "learning_rate": 3.1830586008799454e-05, + "loss": 1.0568, + "step": 10198 + }, + { + "epoch": 0.619660975757944, + "grad_norm": 0.3552964925765991, + "learning_rate": 3.1821667984814305e-05, + "loss": 1.0747, + "step": 10199 + }, + { + "epoch": 0.6197217327905705, + "grad_norm": 0.12567734718322754, + "learning_rate": 3.181275062715943e-05, + "loss": 1.0094, + "step": 10200 + }, + { + "epoch": 0.6197824898231971, + "grad_norm": 0.21178902685642242, + "learning_rate": 3.1803833936161676e-05, + "loss": 1.1242, + "step": 10201 + }, + { + "epoch": 0.6198432468558236, + "grad_norm": 0.20213253796100616, + "learning_rate": 3.179491791214795e-05, + "loss": 1.0352, + "step": 10202 + }, + { + "epoch": 0.6199040038884501, + "grad_norm": 0.4800195097923279, + "learning_rate": 3.178600255544498e-05, + "loss": 1.027, + "step": 10203 + }, + { + "epoch": 0.6199647609210767, + "grad_norm": 0.23955950140953064, + "learning_rate": 3.177708786637963e-05, + "loss": 1.1343, + "step": 10204 + }, + { + "epoch": 0.6200255179537031, + "grad_norm": 0.15821386873722076, + "learning_rate": 3.1768173845278645e-05, + "loss": 1.1236, + "step": 10205 + }, + { + "epoch": 0.6200862749863296, + "grad_norm": 0.14776530861854553, + "learning_rate": 3.1759260492468754e-05, + "loss": 1.0658, + "step": 10206 + }, + { + "epoch": 0.6201470320189562, + "grad_norm": 0.1479075849056244, + "learning_rate": 3.175034780827672e-05, + "loss": 0.9996, + "step": 10207 + }, + { + "epoch": 0.6202077890515827, + "grad_norm": 6.934096336364746, + "learning_rate": 3.174143579302922e-05, + "loss": 1.0476, + "step": 10208 + }, + { + "epoch": 0.6202685460842092, + "grad_norm": 0.17443256080150604, + "learning_rate": 3.1732524447052904e-05, + "loss": 1.0868, + "step": 10209 + }, + { + "epoch": 0.6203293031168358, + "grad_norm": 0.3047315180301666, + "learning_rate": 3.1723613770674436e-05, + "loss": 1.1481, + "step": 10210 + }, + { + "epoch": 0.6203900601494623, + "grad_norm": 0.20682884752750397, + "learning_rate": 3.171470376422043e-05, + "loss": 1.0244, + "step": 10211 + }, + { + "epoch": 0.6204508171820888, + "grad_norm": 0.18574631214141846, + "learning_rate": 3.1705794428017506e-05, + "loss": 1.0227, + "step": 10212 + }, + { + "epoch": 0.6205115742147154, + "grad_norm": 0.11193668842315674, + "learning_rate": 3.169688576239222e-05, + "loss": 1.04, + "step": 10213 + }, + { + "epoch": 0.6205723312473419, + "grad_norm": 0.42144519090652466, + "learning_rate": 3.168797776767112e-05, + "loss": 1.1116, + "step": 10214 + }, + { + "epoch": 0.6206330882799684, + "grad_norm": 0.15046969056129456, + "learning_rate": 3.167907044418074e-05, + "loss": 1.0039, + "step": 10215 + }, + { + "epoch": 0.620693845312595, + "grad_norm": 0.14643318951129913, + "learning_rate": 3.1670163792247574e-05, + "loss": 1.0977, + "step": 10216 + }, + { + "epoch": 0.6207546023452215, + "grad_norm": 0.7566734552383423, + "learning_rate": 3.16612578121981e-05, + "loss": 1.0646, + "step": 10217 + }, + { + "epoch": 0.6208153593778479, + "grad_norm": 0.17360030114650726, + "learning_rate": 3.165235250435876e-05, + "loss": 1.1453, + "step": 10218 + }, + { + "epoch": 0.6208761164104745, + "grad_norm": 0.2574431598186493, + "learning_rate": 3.164344786905598e-05, + "loss": 1.0381, + "step": 10219 + }, + { + "epoch": 0.620936873443101, + "grad_norm": 0.17490963637828827, + "learning_rate": 3.163454390661617e-05, + "loss": 1.0717, + "step": 10220 + }, + { + "epoch": 0.6209976304757275, + "grad_norm": 0.3749448359012604, + "learning_rate": 3.162564061736571e-05, + "loss": 1.0627, + "step": 10221 + }, + { + "epoch": 0.6210583875083541, + "grad_norm": 1.0792803764343262, + "learning_rate": 3.1616738001630954e-05, + "loss": 1.1325, + "step": 10222 + }, + { + "epoch": 0.6211191445409806, + "grad_norm": 0.19147220253944397, + "learning_rate": 3.1607836059738196e-05, + "loss": 1.0971, + "step": 10223 + }, + { + "epoch": 0.6211799015736071, + "grad_norm": 0.3345095217227936, + "learning_rate": 3.159893479201377e-05, + "loss": 1.18, + "step": 10224 + }, + { + "epoch": 0.6212406586062337, + "grad_norm": 0.2410873919725418, + "learning_rate": 3.1590034198783956e-05, + "loss": 1.0383, + "step": 10225 + }, + { + "epoch": 0.6213014156388602, + "grad_norm": 0.24107584357261658, + "learning_rate": 3.158113428037499e-05, + "loss": 1.1225, + "step": 10226 + }, + { + "epoch": 0.6213621726714867, + "grad_norm": 0.19483482837677002, + "learning_rate": 3.15722350371131e-05, + "loss": 1.0414, + "step": 10227 + }, + { + "epoch": 0.6214229297041133, + "grad_norm": 0.14700210094451904, + "learning_rate": 3.156333646932451e-05, + "loss": 1.0837, + "step": 10228 + }, + { + "epoch": 0.6214836867367398, + "grad_norm": 1.29800546169281, + "learning_rate": 3.155443857733539e-05, + "loss": 1.0978, + "step": 10229 + }, + { + "epoch": 0.6215444437693664, + "grad_norm": 0.27275630831718445, + "learning_rate": 3.1545541361471876e-05, + "loss": 1.0979, + "step": 10230 + }, + { + "epoch": 0.6216052008019929, + "grad_norm": 0.16258209943771362, + "learning_rate": 3.1536644822060105e-05, + "loss": 1.0831, + "step": 10231 + }, + { + "epoch": 0.6216659578346193, + "grad_norm": 0.1788981854915619, + "learning_rate": 3.1527748959426194e-05, + "loss": 1.156, + "step": 10232 + }, + { + "epoch": 0.6217267148672458, + "grad_norm": 0.12015987187623978, + "learning_rate": 3.151885377389622e-05, + "loss": 1.0424, + "step": 10233 + }, + { + "epoch": 0.6217874718998724, + "grad_norm": 0.16199100017547607, + "learning_rate": 3.150995926579622e-05, + "loss": 1.0768, + "step": 10234 + }, + { + "epoch": 0.6218482289324989, + "grad_norm": 0.3051406443119049, + "learning_rate": 3.150106543545227e-05, + "loss": 1.13, + "step": 10235 + }, + { + "epoch": 0.6219089859651254, + "grad_norm": 0.12980064749717712, + "learning_rate": 3.1492172283190314e-05, + "loss": 1.056, + "step": 10236 + }, + { + "epoch": 0.621969742997752, + "grad_norm": 0.1398184895515442, + "learning_rate": 3.148327980933636e-05, + "loss": 1.0652, + "step": 10237 + }, + { + "epoch": 0.6220305000303785, + "grad_norm": 0.2077093869447708, + "learning_rate": 3.1474388014216376e-05, + "loss": 1.1068, + "step": 10238 + }, + { + "epoch": 0.622091257063005, + "grad_norm": 0.150908961892128, + "learning_rate": 3.1465496898156266e-05, + "loss": 1.0443, + "step": 10239 + }, + { + "epoch": 0.6221520140956316, + "grad_norm": 0.1392572820186615, + "learning_rate": 3.145660646148196e-05, + "loss": 1.0423, + "step": 10240 + }, + { + "epoch": 0.6222127711282581, + "grad_norm": 0.4874022603034973, + "learning_rate": 3.1447716704519326e-05, + "loss": 1.1987, + "step": 10241 + }, + { + "epoch": 0.6222735281608847, + "grad_norm": 2.573227643966675, + "learning_rate": 3.143882762759424e-05, + "loss": 1.2781, + "step": 10242 + }, + { + "epoch": 0.6223342851935112, + "grad_norm": 0.21928462386131287, + "learning_rate": 3.1429939231032494e-05, + "loss": 1.1183, + "step": 10243 + }, + { + "epoch": 0.6223950422261377, + "grad_norm": 0.24733304977416992, + "learning_rate": 3.142105151515991e-05, + "loss": 1.131, + "step": 10244 + }, + { + "epoch": 0.6224557992587642, + "grad_norm": 0.28779125213623047, + "learning_rate": 3.1412164480302286e-05, + "loss": 1.1175, + "step": 10245 + }, + { + "epoch": 0.6225165562913907, + "grad_norm": 0.21045023202896118, + "learning_rate": 3.140327812678536e-05, + "loss": 1.0034, + "step": 10246 + }, + { + "epoch": 0.6225773133240172, + "grad_norm": 0.9853264689445496, + "learning_rate": 3.139439245493486e-05, + "loss": 1.0436, + "step": 10247 + }, + { + "epoch": 0.6226380703566438, + "grad_norm": 0.14474140107631683, + "learning_rate": 3.138550746507652e-05, + "loss": 1.0312, + "step": 10248 + }, + { + "epoch": 0.6226988273892703, + "grad_norm": 0.14982201159000397, + "learning_rate": 3.137662315753601e-05, + "loss": 1.0111, + "step": 10249 + }, + { + "epoch": 0.6227595844218968, + "grad_norm": 0.17669539153575897, + "learning_rate": 3.1367739532638964e-05, + "loss": 1.1114, + "step": 10250 + }, + { + "epoch": 0.6228203414545234, + "grad_norm": 0.5805287957191467, + "learning_rate": 3.135885659071102e-05, + "loss": 1.28, + "step": 10251 + }, + { + "epoch": 0.6228810984871499, + "grad_norm": 0.19784559309482574, + "learning_rate": 3.134997433207779e-05, + "loss": 1.0695, + "step": 10252 + }, + { + "epoch": 0.6229418555197764, + "grad_norm": 0.1529422402381897, + "learning_rate": 3.1341092757064864e-05, + "loss": 1.1249, + "step": 10253 + }, + { + "epoch": 0.623002612552403, + "grad_norm": 0.11288879811763763, + "learning_rate": 3.1332211865997786e-05, + "loss": 1.0073, + "step": 10254 + }, + { + "epoch": 0.6230633695850295, + "grad_norm": 0.2541508972644806, + "learning_rate": 3.132333165920211e-05, + "loss": 1.0554, + "step": 10255 + }, + { + "epoch": 0.623124126617656, + "grad_norm": 1.5954954624176025, + "learning_rate": 3.1314452137003305e-05, + "loss": 1.0949, + "step": 10256 + }, + { + "epoch": 0.6231848836502826, + "grad_norm": 0.1637292206287384, + "learning_rate": 3.130557329972685e-05, + "loss": 1.1201, + "step": 10257 + }, + { + "epoch": 0.623245640682909, + "grad_norm": 0.15132907032966614, + "learning_rate": 3.129669514769825e-05, + "loss": 1.0161, + "step": 10258 + }, + { + "epoch": 0.6233063977155355, + "grad_norm": 0.20064134895801544, + "learning_rate": 3.128781768124289e-05, + "loss": 1.1348, + "step": 10259 + }, + { + "epoch": 0.6233671547481621, + "grad_norm": 1.8573087453842163, + "learning_rate": 3.127894090068618e-05, + "loss": 1.1136, + "step": 10260 + }, + { + "epoch": 0.6234279117807886, + "grad_norm": 0.17055903375148773, + "learning_rate": 3.1270064806353525e-05, + "loss": 1.0872, + "step": 10261 + }, + { + "epoch": 0.6234886688134151, + "grad_norm": 0.34187719225883484, + "learning_rate": 3.126118939857027e-05, + "loss": 1.091, + "step": 10262 + }, + { + "epoch": 0.6235494258460417, + "grad_norm": 0.12974116206169128, + "learning_rate": 3.1252314677661734e-05, + "loss": 1.1349, + "step": 10263 + }, + { + "epoch": 0.6236101828786682, + "grad_norm": 0.15073318779468536, + "learning_rate": 3.124344064395322e-05, + "loss": 1.0491, + "step": 10264 + }, + { + "epoch": 0.6236709399112947, + "grad_norm": 0.12552976608276367, + "learning_rate": 3.1234567297770025e-05, + "loss": 1.0792, + "step": 10265 + }, + { + "epoch": 0.6237316969439213, + "grad_norm": 0.15528784692287445, + "learning_rate": 3.122569463943739e-05, + "loss": 1.1446, + "step": 10266 + }, + { + "epoch": 0.6237924539765478, + "grad_norm": 0.1845795214176178, + "learning_rate": 3.1216822669280545e-05, + "loss": 1.1041, + "step": 10267 + }, + { + "epoch": 0.6238532110091743, + "grad_norm": 0.5201460123062134, + "learning_rate": 3.120795138762468e-05, + "loss": 1.1157, + "step": 10268 + }, + { + "epoch": 0.6239139680418009, + "grad_norm": 0.221136212348938, + "learning_rate": 3.119908079479503e-05, + "loss": 1.1601, + "step": 10269 + }, + { + "epoch": 0.6239747250744274, + "grad_norm": 0.7188616394996643, + "learning_rate": 3.1190210891116674e-05, + "loss": 1.0447, + "step": 10270 + }, + { + "epoch": 0.6240354821070538, + "grad_norm": 0.14929260313510895, + "learning_rate": 3.118134167691478e-05, + "loss": 1.0258, + "step": 10271 + }, + { + "epoch": 0.6240962391396804, + "grad_norm": 0.19001224637031555, + "learning_rate": 3.117247315251445e-05, + "loss": 1.0106, + "step": 10272 + }, + { + "epoch": 0.6241569961723069, + "grad_norm": 0.3410162329673767, + "learning_rate": 3.116360531824074e-05, + "loss": 1.1658, + "step": 10273 + }, + { + "epoch": 0.6242177532049334, + "grad_norm": 0.15914678573608398, + "learning_rate": 3.115473817441873e-05, + "loss": 1.0854, + "step": 10274 + }, + { + "epoch": 0.62427851023756, + "grad_norm": 0.15766802430152893, + "learning_rate": 3.114587172137345e-05, + "loss": 1.0955, + "step": 10275 + }, + { + "epoch": 0.6243392672701865, + "grad_norm": 0.1829780787229538, + "learning_rate": 3.113700595942988e-05, + "loss": 1.1395, + "step": 10276 + }, + { + "epoch": 0.624400024302813, + "grad_norm": 0.4062417149543762, + "learning_rate": 3.1128140888912994e-05, + "loss": 1.3405, + "step": 10277 + }, + { + "epoch": 0.6244607813354396, + "grad_norm": 0.45306307077407837, + "learning_rate": 3.1119276510147744e-05, + "loss": 1.0387, + "step": 10278 + }, + { + "epoch": 0.6245215383680661, + "grad_norm": 0.1936022788286209, + "learning_rate": 3.111041282345907e-05, + "loss": 1.1977, + "step": 10279 + }, + { + "epoch": 0.6245822954006927, + "grad_norm": 0.12848900258541107, + "learning_rate": 3.110154982917187e-05, + "loss": 1.0389, + "step": 10280 + }, + { + "epoch": 0.6246430524333192, + "grad_norm": 0.16689981520175934, + "learning_rate": 3.1092687527610994e-05, + "loss": 1.0021, + "step": 10281 + }, + { + "epoch": 0.6247038094659457, + "grad_norm": 0.18329235911369324, + "learning_rate": 3.1083825919101354e-05, + "loss": 1.0268, + "step": 10282 + }, + { + "epoch": 0.6247645664985723, + "grad_norm": 0.11465802043676376, + "learning_rate": 3.107496500396769e-05, + "loss": 1.0389, + "step": 10283 + }, + { + "epoch": 0.6248253235311987, + "grad_norm": 0.13188806176185608, + "learning_rate": 3.1066104782534866e-05, + "loss": 1.0612, + "step": 10284 + }, + { + "epoch": 0.6248860805638252, + "grad_norm": 0.13338439166545868, + "learning_rate": 3.105724525512762e-05, + "loss": 1.0394, + "step": 10285 + }, + { + "epoch": 0.6249468375964518, + "grad_norm": 0.13666728138923645, + "learning_rate": 3.1048386422070704e-05, + "loss": 1.0665, + "step": 10286 + }, + { + "epoch": 0.6250075946290783, + "grad_norm": 0.6837540864944458, + "learning_rate": 3.1039528283688854e-05, + "loss": 1.0765, + "step": 10287 + }, + { + "epoch": 0.6250683516617048, + "grad_norm": 0.1986069679260254, + "learning_rate": 3.1030670840306766e-05, + "loss": 1.161, + "step": 10288 + }, + { + "epoch": 0.6251291086943314, + "grad_norm": 0.8365297913551331, + "learning_rate": 3.102181409224911e-05, + "loss": 1.0698, + "step": 10289 + }, + { + "epoch": 0.6251898657269579, + "grad_norm": 0.14574310183525085, + "learning_rate": 3.101295803984052e-05, + "loss": 1.0741, + "step": 10290 + }, + { + "epoch": 0.6252506227595844, + "grad_norm": 0.19370320439338684, + "learning_rate": 3.100410268340561e-05, + "loss": 1.0286, + "step": 10291 + }, + { + "epoch": 0.625311379792211, + "grad_norm": 0.13562780618667603, + "learning_rate": 3.0995248023269e-05, + "loss": 1.0542, + "step": 10292 + }, + { + "epoch": 0.6253721368248375, + "grad_norm": 0.1542542725801468, + "learning_rate": 3.098639405975525e-05, + "loss": 1.0348, + "step": 10293 + }, + { + "epoch": 0.625432893857464, + "grad_norm": 0.13755856454372406, + "learning_rate": 3.097754079318889e-05, + "loss": 1.0213, + "step": 10294 + }, + { + "epoch": 0.6254936508900906, + "grad_norm": 0.13255921006202698, + "learning_rate": 3.096868822389448e-05, + "loss": 1.0832, + "step": 10295 + }, + { + "epoch": 0.6255544079227171, + "grad_norm": 0.11111794412136078, + "learning_rate": 3.095983635219645e-05, + "loss": 1.0273, + "step": 10296 + }, + { + "epoch": 0.6256151649553435, + "grad_norm": 0.12085297703742981, + "learning_rate": 3.09509851784193e-05, + "loss": 1.0675, + "step": 10297 + }, + { + "epoch": 0.6256759219879701, + "grad_norm": 0.12704680860042572, + "learning_rate": 3.0942134702887485e-05, + "loss": 1.0619, + "step": 10298 + }, + { + "epoch": 0.6257366790205966, + "grad_norm": 0.14826908707618713, + "learning_rate": 3.0933284925925386e-05, + "loss": 1.1516, + "step": 10299 + }, + { + "epoch": 0.6257974360532231, + "grad_norm": 0.25654202699661255, + "learning_rate": 3.092443584785742e-05, + "loss": 1.0558, + "step": 10300 + }, + { + "epoch": 0.6258581930858497, + "grad_norm": 0.14167040586471558, + "learning_rate": 3.0915587469007954e-05, + "loss": 1.1049, + "step": 10301 + }, + { + "epoch": 0.6259189501184762, + "grad_norm": 0.13542619347572327, + "learning_rate": 3.090673978970133e-05, + "loss": 1.1049, + "step": 10302 + }, + { + "epoch": 0.6259797071511027, + "grad_norm": 0.9043248891830444, + "learning_rate": 3.0897892810261844e-05, + "loss": 1.0825, + "step": 10303 + }, + { + "epoch": 0.6260404641837293, + "grad_norm": 0.19542792439460754, + "learning_rate": 3.0889046531013765e-05, + "loss": 1.1743, + "step": 10304 + }, + { + "epoch": 0.6261012212163558, + "grad_norm": 0.12149197608232498, + "learning_rate": 3.0880200952281395e-05, + "loss": 1.0514, + "step": 10305 + }, + { + "epoch": 0.6261619782489823, + "grad_norm": 0.13701939582824707, + "learning_rate": 3.087135607438896e-05, + "loss": 1.033, + "step": 10306 + }, + { + "epoch": 0.6262227352816089, + "grad_norm": 0.14510488510131836, + "learning_rate": 3.086251189766065e-05, + "loss": 1.0288, + "step": 10307 + }, + { + "epoch": 0.6262834923142354, + "grad_norm": 0.2603748142719269, + "learning_rate": 3.0853668422420675e-05, + "loss": 1.0656, + "step": 10308 + }, + { + "epoch": 0.626344249346862, + "grad_norm": 0.18288971483707428, + "learning_rate": 3.08448256489932e-05, + "loss": 1.0943, + "step": 10309 + }, + { + "epoch": 0.6264050063794884, + "grad_norm": 1.9514892101287842, + "learning_rate": 3.083598357770233e-05, + "loss": 1.0917, + "step": 10310 + }, + { + "epoch": 0.6264657634121149, + "grad_norm": 0.15267646312713623, + "learning_rate": 3.082714220887218e-05, + "loss": 1.0294, + "step": 10311 + }, + { + "epoch": 0.6265265204447414, + "grad_norm": 0.1306448131799698, + "learning_rate": 3.081830154282684e-05, + "loss": 1.048, + "step": 10312 + }, + { + "epoch": 0.626587277477368, + "grad_norm": 0.2210254669189453, + "learning_rate": 3.080946157989037e-05, + "loss": 1.1473, + "step": 10313 + }, + { + "epoch": 0.6266480345099945, + "grad_norm": 0.13626231253147125, + "learning_rate": 3.0800622320386795e-05, + "loss": 1.0451, + "step": 10314 + }, + { + "epoch": 0.626708791542621, + "grad_norm": 0.1890278458595276, + "learning_rate": 3.0791783764640134e-05, + "loss": 1.0647, + "step": 10315 + }, + { + "epoch": 0.6267695485752476, + "grad_norm": 0.13382858037948608, + "learning_rate": 3.078294591297432e-05, + "loss": 1.0338, + "step": 10316 + }, + { + "epoch": 0.6268303056078741, + "grad_norm": 0.17762376368045807, + "learning_rate": 3.077410876571336e-05, + "loss": 1.0598, + "step": 10317 + }, + { + "epoch": 0.6268910626405007, + "grad_norm": 0.17865866422653198, + "learning_rate": 3.076527232318116e-05, + "loss": 1.1162, + "step": 10318 + }, + { + "epoch": 0.6269518196731272, + "grad_norm": 0.20925891399383545, + "learning_rate": 3.075643658570161e-05, + "loss": 1.1072, + "step": 10319 + }, + { + "epoch": 0.6270125767057537, + "grad_norm": 0.1224282830953598, + "learning_rate": 3.0747601553598596e-05, + "loss": 1.0703, + "step": 10320 + }, + { + "epoch": 0.6270733337383803, + "grad_norm": 0.16769208014011383, + "learning_rate": 3.0738767227195976e-05, + "loss": 1.1299, + "step": 10321 + }, + { + "epoch": 0.6271340907710068, + "grad_norm": 0.18002669513225555, + "learning_rate": 3.072993360681757e-05, + "loss": 1.0029, + "step": 10322 + }, + { + "epoch": 0.6271948478036332, + "grad_norm": 0.16207602620124817, + "learning_rate": 3.0721100692787177e-05, + "loss": 1.0968, + "step": 10323 + }, + { + "epoch": 0.6272556048362598, + "grad_norm": 0.14748461544513702, + "learning_rate": 3.0712268485428544e-05, + "loss": 1.1545, + "step": 10324 + }, + { + "epoch": 0.6273163618688863, + "grad_norm": 0.6007484197616577, + "learning_rate": 3.070343698506545e-05, + "loss": 1.1102, + "step": 10325 + }, + { + "epoch": 0.6273771189015128, + "grad_norm": 0.6533710360527039, + "learning_rate": 3.06946061920216e-05, + "loss": 1.2175, + "step": 10326 + }, + { + "epoch": 0.6274378759341394, + "grad_norm": 0.248569518327713, + "learning_rate": 3.0685776106620704e-05, + "loss": 1.2088, + "step": 10327 + }, + { + "epoch": 0.6274986329667659, + "grad_norm": 0.17653091251850128, + "learning_rate": 3.0676946729186395e-05, + "loss": 1.0316, + "step": 10328 + }, + { + "epoch": 0.6275593899993924, + "grad_norm": 0.21529167890548706, + "learning_rate": 3.066811806004236e-05, + "loss": 1.013, + "step": 10329 + }, + { + "epoch": 0.627620147032019, + "grad_norm": 0.24814148247241974, + "learning_rate": 3.065929009951219e-05, + "loss": 1.0888, + "step": 10330 + }, + { + "epoch": 0.6276809040646455, + "grad_norm": 0.16948087513446808, + "learning_rate": 3.0650462847919476e-05, + "loss": 1.0208, + "step": 10331 + }, + { + "epoch": 0.627741661097272, + "grad_norm": 0.25379952788352966, + "learning_rate": 3.0641636305587776e-05, + "loss": 1.0766, + "step": 10332 + }, + { + "epoch": 0.6278024181298986, + "grad_norm": 0.4796024560928345, + "learning_rate": 3.063281047284065e-05, + "loss": 1.3597, + "step": 10333 + }, + { + "epoch": 0.6278631751625251, + "grad_norm": 0.13805541396141052, + "learning_rate": 3.0623985350001606e-05, + "loss": 1.0247, + "step": 10334 + }, + { + "epoch": 0.6279239321951516, + "grad_norm": 0.2653367519378662, + "learning_rate": 3.0615160937394105e-05, + "loss": 1.1992, + "step": 10335 + }, + { + "epoch": 0.6279846892277782, + "grad_norm": 0.2275865077972412, + "learning_rate": 3.060633723534167e-05, + "loss": 1.1838, + "step": 10336 + }, + { + "epoch": 0.6280454462604046, + "grad_norm": 0.2066417783498764, + "learning_rate": 3.0597514244167647e-05, + "loss": 1.0618, + "step": 10337 + }, + { + "epoch": 0.6281062032930311, + "grad_norm": 0.15471604466438293, + "learning_rate": 3.05886919641955e-05, + "loss": 1.0433, + "step": 10338 + }, + { + "epoch": 0.6281669603256577, + "grad_norm": 0.2105868011713028, + "learning_rate": 3.057987039574861e-05, + "loss": 1.0913, + "step": 10339 + }, + { + "epoch": 0.6282277173582842, + "grad_norm": 0.196090430021286, + "learning_rate": 3.057104953915031e-05, + "loss": 1.1626, + "step": 10340 + }, + { + "epoch": 0.6282884743909107, + "grad_norm": 0.12826426327228546, + "learning_rate": 3.0562229394723956e-05, + "loss": 1.0358, + "step": 10341 + }, + { + "epoch": 0.6283492314235373, + "grad_norm": 0.17824378609657288, + "learning_rate": 3.055340996279285e-05, + "loss": 1.0445, + "step": 10342 + }, + { + "epoch": 0.6284099884561638, + "grad_norm": 0.14610296487808228, + "learning_rate": 3.054459124368025e-05, + "loss": 1.0577, + "step": 10343 + }, + { + "epoch": 0.6284707454887903, + "grad_norm": 0.18150828778743744, + "learning_rate": 3.053577323770942e-05, + "loss": 1.0247, + "step": 10344 + }, + { + "epoch": 0.6285315025214169, + "grad_norm": 0.2705344557762146, + "learning_rate": 3.0526955945203576e-05, + "loss": 1.0826, + "step": 10345 + }, + { + "epoch": 0.6285922595540434, + "grad_norm": 0.19752287864685059, + "learning_rate": 3.051813936648593e-05, + "loss": 1.1792, + "step": 10346 + }, + { + "epoch": 0.6286530165866699, + "grad_norm": 0.1566229909658432, + "learning_rate": 3.050932350187966e-05, + "loss": 1.0608, + "step": 10347 + }, + { + "epoch": 0.6287137736192965, + "grad_norm": 0.13781581819057465, + "learning_rate": 3.050050835170789e-05, + "loss": 1.0477, + "step": 10348 + }, + { + "epoch": 0.628774530651923, + "grad_norm": 0.5694911479949951, + "learning_rate": 3.0491693916293783e-05, + "loss": 1.1422, + "step": 10349 + }, + { + "epoch": 0.6288352876845494, + "grad_norm": 0.14116372168064117, + "learning_rate": 3.0482880195960382e-05, + "loss": 1.0572, + "step": 10350 + }, + { + "epoch": 0.628896044717176, + "grad_norm": 0.18642696738243103, + "learning_rate": 3.0474067191030785e-05, + "loss": 1.0727, + "step": 10351 + }, + { + "epoch": 0.6289568017498025, + "grad_norm": 0.1293736845254898, + "learning_rate": 3.0465254901828032e-05, + "loss": 1.0733, + "step": 10352 + }, + { + "epoch": 0.629017558782429, + "grad_norm": 0.2832860052585602, + "learning_rate": 3.0456443328675133e-05, + "loss": 1.0717, + "step": 10353 + }, + { + "epoch": 0.6290783158150556, + "grad_norm": 0.18724596500396729, + "learning_rate": 3.0447632471895083e-05, + "loss": 1.1637, + "step": 10354 + }, + { + "epoch": 0.6291390728476821, + "grad_norm": 0.1231849417090416, + "learning_rate": 3.0438822331810846e-05, + "loss": 1.0353, + "step": 10355 + }, + { + "epoch": 0.6291998298803086, + "grad_norm": 0.13783642649650574, + "learning_rate": 3.0430012908745363e-05, + "loss": 1.0222, + "step": 10356 + }, + { + "epoch": 0.6292605869129352, + "grad_norm": 0.12766417860984802, + "learning_rate": 3.0421204203021536e-05, + "loss": 1.007, + "step": 10357 + }, + { + "epoch": 0.6293213439455617, + "grad_norm": 0.2922574579715729, + "learning_rate": 3.041239621496224e-05, + "loss": 1.311, + "step": 10358 + }, + { + "epoch": 0.6293821009781883, + "grad_norm": 0.16196677088737488, + "learning_rate": 3.040358894489036e-05, + "loss": 1.0634, + "step": 10359 + }, + { + "epoch": 0.6294428580108148, + "grad_norm": 0.1410250961780548, + "learning_rate": 3.0394782393128712e-05, + "loss": 1.0788, + "step": 10360 + }, + { + "epoch": 0.6295036150434413, + "grad_norm": 0.16530294716358185, + "learning_rate": 3.0385976560000096e-05, + "loss": 1.1272, + "step": 10361 + }, + { + "epoch": 0.6295643720760679, + "grad_norm": 0.12859342992305756, + "learning_rate": 3.037717144582733e-05, + "loss": 1.0155, + "step": 10362 + }, + { + "epoch": 0.6296251291086943, + "grad_norm": 0.16427485644817352, + "learning_rate": 3.036836705093311e-05, + "loss": 1.0501, + "step": 10363 + }, + { + "epoch": 0.6296858861413208, + "grad_norm": 0.1354958713054657, + "learning_rate": 3.0359563375640198e-05, + "loss": 1.0097, + "step": 10364 + }, + { + "epoch": 0.6297466431739474, + "grad_norm": 0.15866374969482422, + "learning_rate": 3.0350760420271285e-05, + "loss": 1.1355, + "step": 10365 + }, + { + "epoch": 0.6298074002065739, + "grad_norm": 0.21070945262908936, + "learning_rate": 3.0341958185149043e-05, + "loss": 1.151, + "step": 10366 + }, + { + "epoch": 0.6298681572392004, + "grad_norm": 3.1443984508514404, + "learning_rate": 3.033315667059613e-05, + "loss": 1.0752, + "step": 10367 + }, + { + "epoch": 0.629928914271827, + "grad_norm": 0.18908613920211792, + "learning_rate": 3.0324355876935163e-05, + "loss": 1.0556, + "step": 10368 + }, + { + "epoch": 0.6299896713044535, + "grad_norm": 0.22113458812236786, + "learning_rate": 3.0315555804488737e-05, + "loss": 1.045, + "step": 10369 + }, + { + "epoch": 0.63005042833708, + "grad_norm": 0.16768483817577362, + "learning_rate": 3.0306756453579417e-05, + "loss": 0.9816, + "step": 10370 + }, + { + "epoch": 0.6301111853697066, + "grad_norm": 0.12660662829875946, + "learning_rate": 3.029795782452973e-05, + "loss": 1.1073, + "step": 10371 + }, + { + "epoch": 0.6301719424023331, + "grad_norm": 0.15312685072422028, + "learning_rate": 3.0289159917662223e-05, + "loss": 1.0493, + "step": 10372 + }, + { + "epoch": 0.6302326994349596, + "grad_norm": 0.2500297725200653, + "learning_rate": 3.0280362733299362e-05, + "loss": 1.1539, + "step": 10373 + }, + { + "epoch": 0.6302934564675862, + "grad_norm": 0.17541056871414185, + "learning_rate": 3.0271566271763613e-05, + "loss": 1.1442, + "step": 10374 + }, + { + "epoch": 0.6303542135002127, + "grad_norm": 0.24468013644218445, + "learning_rate": 3.026277053337742e-05, + "loss": 1.1894, + "step": 10375 + }, + { + "epoch": 0.6304149705328391, + "grad_norm": 0.1420433223247528, + "learning_rate": 3.0253975518463206e-05, + "loss": 1.0121, + "step": 10376 + }, + { + "epoch": 0.6304757275654657, + "grad_norm": 0.5652739405632019, + "learning_rate": 3.0245181227343322e-05, + "loss": 1.0751, + "step": 10377 + }, + { + "epoch": 0.6305364845980922, + "grad_norm": 0.22571349143981934, + "learning_rate": 3.023638766034014e-05, + "loss": 1.1303, + "step": 10378 + }, + { + "epoch": 0.6305972416307187, + "grad_norm": 0.14158114790916443, + "learning_rate": 3.0227594817775974e-05, + "loss": 1.0511, + "step": 10379 + }, + { + "epoch": 0.6306579986633453, + "grad_norm": 0.11062348634004593, + "learning_rate": 3.0218802699973152e-05, + "loss": 1.0611, + "step": 10380 + }, + { + "epoch": 0.6307187556959718, + "grad_norm": 0.14190226793289185, + "learning_rate": 3.0210011307253945e-05, + "loss": 1.0565, + "step": 10381 + }, + { + "epoch": 0.6307795127285983, + "grad_norm": 0.24531146883964539, + "learning_rate": 3.0201220639940608e-05, + "loss": 1.1492, + "step": 10382 + }, + { + "epoch": 0.6308402697612249, + "grad_norm": 0.11381656676530838, + "learning_rate": 3.0192430698355346e-05, + "loss": 1.0532, + "step": 10383 + }, + { + "epoch": 0.6309010267938514, + "grad_norm": 0.15549519658088684, + "learning_rate": 3.0183641482820353e-05, + "loss": 1.1218, + "step": 10384 + }, + { + "epoch": 0.6309617838264779, + "grad_norm": 0.5387110710144043, + "learning_rate": 3.017485299365782e-05, + "loss": 1.17, + "step": 10385 + }, + { + "epoch": 0.6310225408591045, + "grad_norm": 0.11693233996629715, + "learning_rate": 3.0166065231189888e-05, + "loss": 1.0525, + "step": 10386 + }, + { + "epoch": 0.631083297891731, + "grad_norm": 0.2796156704425812, + "learning_rate": 3.015727819573865e-05, + "loss": 1.1601, + "step": 10387 + }, + { + "epoch": 0.6311440549243575, + "grad_norm": 0.1062614768743515, + "learning_rate": 3.0148491887626235e-05, + "loss": 1.0041, + "step": 10388 + }, + { + "epoch": 0.631204811956984, + "grad_norm": 0.19674816727638245, + "learning_rate": 3.0139706307174698e-05, + "loss": 1.1256, + "step": 10389 + }, + { + "epoch": 0.6312655689896105, + "grad_norm": 0.24845124781131744, + "learning_rate": 3.013092145470605e-05, + "loss": 1.0809, + "step": 10390 + }, + { + "epoch": 0.631326326022237, + "grad_norm": 0.1545766294002533, + "learning_rate": 3.0122137330542318e-05, + "loss": 1.065, + "step": 10391 + }, + { + "epoch": 0.6313870830548636, + "grad_norm": 0.1449037492275238, + "learning_rate": 3.0113353935005474e-05, + "loss": 1.0079, + "step": 10392 + }, + { + "epoch": 0.6314478400874901, + "grad_norm": 0.2765040993690491, + "learning_rate": 3.010457126841749e-05, + "loss": 1.1477, + "step": 10393 + }, + { + "epoch": 0.6315085971201166, + "grad_norm": 0.14687050879001617, + "learning_rate": 3.00957893311003e-05, + "loss": 1.0985, + "step": 10394 + }, + { + "epoch": 0.6315693541527432, + "grad_norm": 0.6837555170059204, + "learning_rate": 3.0087008123375787e-05, + "loss": 1.0365, + "step": 10395 + }, + { + "epoch": 0.6316301111853697, + "grad_norm": 0.14196206629276276, + "learning_rate": 3.007822764556587e-05, + "loss": 0.999, + "step": 10396 + }, + { + "epoch": 0.6316908682179962, + "grad_norm": 0.16935744881629944, + "learning_rate": 3.0069447897992342e-05, + "loss": 1.1195, + "step": 10397 + }, + { + "epoch": 0.6317516252506228, + "grad_norm": 0.13387644290924072, + "learning_rate": 3.006066888097706e-05, + "loss": 1.0343, + "step": 10398 + }, + { + "epoch": 0.6318123822832493, + "grad_norm": 0.1921854168176651, + "learning_rate": 3.0051890594841815e-05, + "loss": 1.0586, + "step": 10399 + }, + { + "epoch": 0.6318731393158759, + "grad_norm": 0.36501345038414, + "learning_rate": 3.0043113039908367e-05, + "loss": 1.1813, + "step": 10400 + }, + { + "epoch": 0.6319338963485024, + "grad_norm": 0.15174295008182526, + "learning_rate": 3.0034336216498477e-05, + "loss": 1.0601, + "step": 10401 + }, + { + "epoch": 0.6319946533811288, + "grad_norm": 0.19241754710674286, + "learning_rate": 3.0025560124933856e-05, + "loss": 1.0191, + "step": 10402 + }, + { + "epoch": 0.6320554104137553, + "grad_norm": 0.1643778681755066, + "learning_rate": 3.0016784765536194e-05, + "loss": 1.1427, + "step": 10403 + }, + { + "epoch": 0.6321161674463819, + "grad_norm": 0.23425880074501038, + "learning_rate": 3.0008010138627142e-05, + "loss": 1.1788, + "step": 10404 + }, + { + "epoch": 0.6321769244790084, + "grad_norm": 0.23254826664924622, + "learning_rate": 2.9999236244528333e-05, + "loss": 1.0011, + "step": 10405 + }, + { + "epoch": 0.632237681511635, + "grad_norm": 0.4401704668998718, + "learning_rate": 2.999046308356139e-05, + "loss": 1.1651, + "step": 10406 + }, + { + "epoch": 0.6322984385442615, + "grad_norm": 0.1550683230161667, + "learning_rate": 2.99816906560479e-05, + "loss": 1.0353, + "step": 10407 + }, + { + "epoch": 0.632359195576888, + "grad_norm": 0.20659492909908295, + "learning_rate": 2.9972918962309393e-05, + "loss": 1.0239, + "step": 10408 + }, + { + "epoch": 0.6324199526095146, + "grad_norm": 0.14924851059913635, + "learning_rate": 2.9964148002667437e-05, + "loss": 1.0378, + "step": 10409 + }, + { + "epoch": 0.6324807096421411, + "grad_norm": 0.15685848891735077, + "learning_rate": 2.9955377777443493e-05, + "loss": 1.1113, + "step": 10410 + }, + { + "epoch": 0.6325414666747676, + "grad_norm": 0.16926518082618713, + "learning_rate": 2.9946608286959056e-05, + "loss": 1.0968, + "step": 10411 + }, + { + "epoch": 0.6326022237073942, + "grad_norm": 0.1526908278465271, + "learning_rate": 2.993783953153556e-05, + "loss": 1.0395, + "step": 10412 + }, + { + "epoch": 0.6326629807400207, + "grad_norm": 0.11706577986478806, + "learning_rate": 2.9929071511494433e-05, + "loss": 1.0492, + "step": 10413 + }, + { + "epoch": 0.6327237377726472, + "grad_norm": 0.13299399614334106, + "learning_rate": 2.9920304227157082e-05, + "loss": 1.0153, + "step": 10414 + }, + { + "epoch": 0.6327844948052737, + "grad_norm": 0.1327035278081894, + "learning_rate": 2.9911537678844846e-05, + "loss": 1.1181, + "step": 10415 + }, + { + "epoch": 0.6328452518379002, + "grad_norm": 0.12151546776294708, + "learning_rate": 2.9902771866879104e-05, + "loss": 1.1217, + "step": 10416 + }, + { + "epoch": 0.6329060088705267, + "grad_norm": 0.11267746984958649, + "learning_rate": 2.989400679158112e-05, + "loss": 1.0755, + "step": 10417 + }, + { + "epoch": 0.6329667659031533, + "grad_norm": 0.4333186745643616, + "learning_rate": 2.988524245327221e-05, + "loss": 1.0757, + "step": 10418 + }, + { + "epoch": 0.6330275229357798, + "grad_norm": 0.22246746718883514, + "learning_rate": 2.9876478852273626e-05, + "loss": 1.2279, + "step": 10419 + }, + { + "epoch": 0.6330882799684063, + "grad_norm": 0.1636882722377777, + "learning_rate": 2.9867715988906602e-05, + "loss": 1.1471, + "step": 10420 + }, + { + "epoch": 0.6331490370010329, + "grad_norm": 0.16150227189064026, + "learning_rate": 2.9858953863492334e-05, + "loss": 1.1191, + "step": 10421 + }, + { + "epoch": 0.6332097940336594, + "grad_norm": 0.1300274282693863, + "learning_rate": 2.9850192476352006e-05, + "loss": 1.0367, + "step": 10422 + }, + { + "epoch": 0.6332705510662859, + "grad_norm": 0.16975072026252747, + "learning_rate": 2.9841431827806787e-05, + "loss": 1.0628, + "step": 10423 + }, + { + "epoch": 0.6333313080989125, + "grad_norm": 0.17236849665641785, + "learning_rate": 2.9832671918177762e-05, + "loss": 1.0494, + "step": 10424 + }, + { + "epoch": 0.633392065131539, + "grad_norm": 0.1864895075559616, + "learning_rate": 2.9823912747786043e-05, + "loss": 1.1578, + "step": 10425 + }, + { + "epoch": 0.6334528221641655, + "grad_norm": 0.1635923981666565, + "learning_rate": 2.9815154316952708e-05, + "loss": 1.0333, + "step": 10426 + }, + { + "epoch": 0.6335135791967921, + "grad_norm": 0.30224552750587463, + "learning_rate": 2.9806396625998794e-05, + "loss": 1.1293, + "step": 10427 + }, + { + "epoch": 0.6335743362294185, + "grad_norm": 0.15837016701698303, + "learning_rate": 2.9797639675245305e-05, + "loss": 1.0523, + "step": 10428 + }, + { + "epoch": 0.633635093262045, + "grad_norm": 0.2511090636253357, + "learning_rate": 2.9788883465013274e-05, + "loss": 1.0683, + "step": 10429 + }, + { + "epoch": 0.6336958502946716, + "grad_norm": 0.13588260114192963, + "learning_rate": 2.9780127995623587e-05, + "loss": 1.0483, + "step": 10430 + }, + { + "epoch": 0.6337566073272981, + "grad_norm": 0.11286661028862, + "learning_rate": 2.977137326739723e-05, + "loss": 1.0583, + "step": 10431 + }, + { + "epoch": 0.6338173643599246, + "grad_norm": 0.18842312693595886, + "learning_rate": 2.97626192806551e-05, + "loss": 1.0946, + "step": 10432 + }, + { + "epoch": 0.6338781213925512, + "grad_norm": 0.45880672335624695, + "learning_rate": 2.9753866035718058e-05, + "loss": 1.2263, + "step": 10433 + }, + { + "epoch": 0.6339388784251777, + "grad_norm": 0.30782759189605713, + "learning_rate": 2.974511353290698e-05, + "loss": 1.1732, + "step": 10434 + }, + { + "epoch": 0.6339996354578042, + "grad_norm": 0.1627044677734375, + "learning_rate": 2.9736361772542687e-05, + "loss": 1.06, + "step": 10435 + }, + { + "epoch": 0.6340603924904308, + "grad_norm": 0.13880908489227295, + "learning_rate": 2.9727610754945972e-05, + "loss": 1.0873, + "step": 10436 + }, + { + "epoch": 0.6341211495230573, + "grad_norm": 0.1428736448287964, + "learning_rate": 2.9718860480437594e-05, + "loss": 1.077, + "step": 10437 + }, + { + "epoch": 0.6341819065556838, + "grad_norm": 0.10837585479021072, + "learning_rate": 2.9710110949338298e-05, + "loss": 0.9905, + "step": 10438 + }, + { + "epoch": 0.6342426635883104, + "grad_norm": 0.13783054053783417, + "learning_rate": 2.970136216196882e-05, + "loss": 1.0985, + "step": 10439 + }, + { + "epoch": 0.6343034206209369, + "grad_norm": 0.1523531973361969, + "learning_rate": 2.969261411864983e-05, + "loss": 1.0682, + "step": 10440 + }, + { + "epoch": 0.6343641776535635, + "grad_norm": 0.19871826469898224, + "learning_rate": 2.968386681970199e-05, + "loss": 1.0593, + "step": 10441 + }, + { + "epoch": 0.6344249346861899, + "grad_norm": 0.13012000918388367, + "learning_rate": 2.9675120265445944e-05, + "loss": 1.069, + "step": 10442 + }, + { + "epoch": 0.6344856917188164, + "grad_norm": 0.4994335174560547, + "learning_rate": 2.9666374456202306e-05, + "loss": 1.0828, + "step": 10443 + }, + { + "epoch": 0.634546448751443, + "grad_norm": 0.19163455069065094, + "learning_rate": 2.9657629392291632e-05, + "loss": 1.0965, + "step": 10444 + }, + { + "epoch": 0.6346072057840695, + "grad_norm": 0.15011663734912872, + "learning_rate": 2.9648885074034483e-05, + "loss": 1.1319, + "step": 10445 + }, + { + "epoch": 0.634667962816696, + "grad_norm": 0.27260082960128784, + "learning_rate": 2.9640141501751384e-05, + "loss": 1.3209, + "step": 10446 + }, + { + "epoch": 0.6347287198493226, + "grad_norm": 0.12057328224182129, + "learning_rate": 2.9631398675762835e-05, + "loss": 1.0218, + "step": 10447 + }, + { + "epoch": 0.6347894768819491, + "grad_norm": 0.1451476365327835, + "learning_rate": 2.9622656596389308e-05, + "loss": 1.079, + "step": 10448 + }, + { + "epoch": 0.6348502339145756, + "grad_norm": 0.12040580809116364, + "learning_rate": 2.9613915263951247e-05, + "loss": 1.0153, + "step": 10449 + }, + { + "epoch": 0.6349109909472022, + "grad_norm": 0.22193381190299988, + "learning_rate": 2.9605174678769054e-05, + "loss": 1.1129, + "step": 10450 + }, + { + "epoch": 0.6349717479798287, + "grad_norm": 0.3113779127597809, + "learning_rate": 2.9596434841163124e-05, + "loss": 1.1277, + "step": 10451 + }, + { + "epoch": 0.6350325050124552, + "grad_norm": 0.17773836851119995, + "learning_rate": 2.958769575145382e-05, + "loss": 1.1239, + "step": 10452 + }, + { + "epoch": 0.6350932620450818, + "grad_norm": 0.20376388728618622, + "learning_rate": 2.9578957409961478e-05, + "loss": 1.0878, + "step": 10453 + }, + { + "epoch": 0.6351540190777083, + "grad_norm": 0.1555800437927246, + "learning_rate": 2.9570219817006384e-05, + "loss": 1.2266, + "step": 10454 + }, + { + "epoch": 0.6352147761103347, + "grad_norm": 0.1502917855978012, + "learning_rate": 2.956148297290885e-05, + "loss": 1.1663, + "step": 10455 + }, + { + "epoch": 0.6352755331429613, + "grad_norm": 0.5676726698875427, + "learning_rate": 2.9552746877989113e-05, + "loss": 1.2282, + "step": 10456 + }, + { + "epoch": 0.6353362901755878, + "grad_norm": 0.2551223635673523, + "learning_rate": 2.9544011532567385e-05, + "loss": 1.1928, + "step": 10457 + }, + { + "epoch": 0.6353970472082143, + "grad_norm": 0.17729078233242035, + "learning_rate": 2.9535276936963864e-05, + "loss": 1.1115, + "step": 10458 + }, + { + "epoch": 0.6354578042408409, + "grad_norm": 0.1536313146352768, + "learning_rate": 2.9526543091498715e-05, + "loss": 1.1336, + "step": 10459 + }, + { + "epoch": 0.6355185612734674, + "grad_norm": 0.1466873288154602, + "learning_rate": 2.9517809996492096e-05, + "loss": 1.1072, + "step": 10460 + }, + { + "epoch": 0.6355793183060939, + "grad_norm": 0.14344127476215363, + "learning_rate": 2.9509077652264106e-05, + "loss": 1.0029, + "step": 10461 + }, + { + "epoch": 0.6356400753387205, + "grad_norm": 0.6649088263511658, + "learning_rate": 2.950034605913483e-05, + "loss": 1.1936, + "step": 10462 + }, + { + "epoch": 0.635700832371347, + "grad_norm": 0.17644619941711426, + "learning_rate": 2.949161521742436e-05, + "loss": 1.0747, + "step": 10463 + }, + { + "epoch": 0.6357615894039735, + "grad_norm": 0.11065807938575745, + "learning_rate": 2.9482885127452676e-05, + "loss": 1.0062, + "step": 10464 + }, + { + "epoch": 0.6358223464366001, + "grad_norm": 0.13071666657924652, + "learning_rate": 2.9474155789539803e-05, + "loss": 1.0401, + "step": 10465 + }, + { + "epoch": 0.6358831034692266, + "grad_norm": 0.26256754994392395, + "learning_rate": 2.9465427204005725e-05, + "loss": 1.2293, + "step": 10466 + }, + { + "epoch": 0.6359438605018531, + "grad_norm": 0.20208019018173218, + "learning_rate": 2.9456699371170372e-05, + "loss": 1.0935, + "step": 10467 + }, + { + "epoch": 0.6360046175344796, + "grad_norm": 0.11011306941509247, + "learning_rate": 2.9447972291353676e-05, + "loss": 1.0297, + "step": 10468 + }, + { + "epoch": 0.6360653745671061, + "grad_norm": 0.6539217829704285, + "learning_rate": 2.943924596487554e-05, + "loss": 1.1171, + "step": 10469 + }, + { + "epoch": 0.6361261315997326, + "grad_norm": 0.12469131499528885, + "learning_rate": 2.9430520392055817e-05, + "loss": 1.0509, + "step": 10470 + }, + { + "epoch": 0.6361868886323592, + "grad_norm": 0.176783487200737, + "learning_rate": 2.9421795573214337e-05, + "loss": 1.1422, + "step": 10471 + }, + { + "epoch": 0.6362476456649857, + "grad_norm": 0.15529657900333405, + "learning_rate": 2.9413071508670907e-05, + "loss": 1.1086, + "step": 10472 + }, + { + "epoch": 0.6363084026976122, + "grad_norm": 0.13071373105049133, + "learning_rate": 2.9404348198745334e-05, + "loss": 1.0379, + "step": 10473 + }, + { + "epoch": 0.6363691597302388, + "grad_norm": 0.1773880273103714, + "learning_rate": 2.9395625643757365e-05, + "loss": 1.0744, + "step": 10474 + }, + { + "epoch": 0.6364299167628653, + "grad_norm": 0.18066534399986267, + "learning_rate": 2.93869038440267e-05, + "loss": 1.1468, + "step": 10475 + }, + { + "epoch": 0.6364906737954918, + "grad_norm": 0.1577855795621872, + "learning_rate": 2.9378182799873094e-05, + "loss": 1.1089, + "step": 10476 + }, + { + "epoch": 0.6365514308281184, + "grad_norm": 0.2647530436515808, + "learning_rate": 2.9369462511616154e-05, + "loss": 1.1218, + "step": 10477 + }, + { + "epoch": 0.6366121878607449, + "grad_norm": 0.1956561654806137, + "learning_rate": 2.936074297957556e-05, + "loss": 1.1213, + "step": 10478 + }, + { + "epoch": 0.6366729448933715, + "grad_norm": 0.1589953899383545, + "learning_rate": 2.9352024204070927e-05, + "loss": 1.0411, + "step": 10479 + }, + { + "epoch": 0.636733701925998, + "grad_norm": 0.5645908117294312, + "learning_rate": 2.934330618542182e-05, + "loss": 1.2063, + "step": 10480 + }, + { + "epoch": 0.6367944589586244, + "grad_norm": 0.1910964995622635, + "learning_rate": 2.9334588923947838e-05, + "loss": 1.016, + "step": 10481 + }, + { + "epoch": 0.636855215991251, + "grad_norm": 0.14719392359256744, + "learning_rate": 2.932587241996848e-05, + "loss": 1.0743, + "step": 10482 + }, + { + "epoch": 0.6369159730238775, + "grad_norm": 0.17193061113357544, + "learning_rate": 2.9317156673803282e-05, + "loss": 1.0588, + "step": 10483 + }, + { + "epoch": 0.636976730056504, + "grad_norm": 0.13495969772338867, + "learning_rate": 2.9308441685771692e-05, + "loss": 1.0239, + "step": 10484 + }, + { + "epoch": 0.6370374870891305, + "grad_norm": 0.18823975324630737, + "learning_rate": 2.9299727456193172e-05, + "loss": 1.1506, + "step": 10485 + }, + { + "epoch": 0.6370982441217571, + "grad_norm": 0.16708354651927948, + "learning_rate": 2.929101398538714e-05, + "loss": 1.0709, + "step": 10486 + }, + { + "epoch": 0.6371590011543836, + "grad_norm": 0.1167384460568428, + "learning_rate": 2.9282301273672996e-05, + "loss": 1.0237, + "step": 10487 + }, + { + "epoch": 0.6372197581870102, + "grad_norm": 0.18278738856315613, + "learning_rate": 2.9273589321370098e-05, + "loss": 1.0955, + "step": 10488 + }, + { + "epoch": 0.6372805152196367, + "grad_norm": 0.14713147282600403, + "learning_rate": 2.9264878128797802e-05, + "loss": 1.0901, + "step": 10489 + }, + { + "epoch": 0.6373412722522632, + "grad_norm": 0.21573582291603088, + "learning_rate": 2.925616769627541e-05, + "loss": 1.0395, + "step": 10490 + }, + { + "epoch": 0.6374020292848898, + "grad_norm": 0.1737573891878128, + "learning_rate": 2.9247458024122193e-05, + "loss": 1.0336, + "step": 10491 + }, + { + "epoch": 0.6374627863175163, + "grad_norm": 0.38860005140304565, + "learning_rate": 2.9238749112657414e-05, + "loss": 1.0884, + "step": 10492 + }, + { + "epoch": 0.6375235433501428, + "grad_norm": 0.14887909591197968, + "learning_rate": 2.923004096220029e-05, + "loss": 1.0261, + "step": 10493 + }, + { + "epoch": 0.6375843003827693, + "grad_norm": 0.15561585128307343, + "learning_rate": 2.922133357307003e-05, + "loss": 1.0969, + "step": 10494 + }, + { + "epoch": 0.6376450574153958, + "grad_norm": 0.15924541652202606, + "learning_rate": 2.921262694558582e-05, + "loss": 1.0666, + "step": 10495 + }, + { + "epoch": 0.6377058144480223, + "grad_norm": 0.12230934947729111, + "learning_rate": 2.9203921080066786e-05, + "loss": 0.9906, + "step": 10496 + }, + { + "epoch": 0.6377665714806489, + "grad_norm": 0.21622680127620697, + "learning_rate": 2.9195215976832034e-05, + "loss": 1.1063, + "step": 10497 + }, + { + "epoch": 0.6378273285132754, + "grad_norm": 0.27397438883781433, + "learning_rate": 2.9186511636200666e-05, + "loss": 1.0661, + "step": 10498 + }, + { + "epoch": 0.6378880855459019, + "grad_norm": 8.239015579223633, + "learning_rate": 2.917780805849173e-05, + "loss": 1.1225, + "step": 10499 + }, + { + "epoch": 0.6379488425785285, + "grad_norm": 0.17071060836315155, + "learning_rate": 2.9169105244024275e-05, + "loss": 1.1244, + "step": 10500 + }, + { + "epoch": 0.638009599611155, + "grad_norm": 0.14276830852031708, + "learning_rate": 2.9160403193117292e-05, + "loss": 1.1034, + "step": 10501 + }, + { + "epoch": 0.6380703566437815, + "grad_norm": 0.30681511759757996, + "learning_rate": 2.9151701906089735e-05, + "loss": 1.1993, + "step": 10502 + }, + { + "epoch": 0.6381311136764081, + "grad_norm": 0.18345583975315094, + "learning_rate": 2.9143001383260614e-05, + "loss": 1.1962, + "step": 10503 + }, + { + "epoch": 0.6381918707090346, + "grad_norm": 0.11108536273241043, + "learning_rate": 2.913430162494879e-05, + "loss": 1.0579, + "step": 10504 + }, + { + "epoch": 0.6382526277416611, + "grad_norm": 0.14707481861114502, + "learning_rate": 2.9125602631473176e-05, + "loss": 1.0368, + "step": 10505 + }, + { + "epoch": 0.6383133847742877, + "grad_norm": 0.19864681363105774, + "learning_rate": 2.911690440315263e-05, + "loss": 1.1421, + "step": 10506 + }, + { + "epoch": 0.6383741418069141, + "grad_norm": 0.16419346630573273, + "learning_rate": 2.9108206940305972e-05, + "loss": 1.08, + "step": 10507 + }, + { + "epoch": 0.6384348988395406, + "grad_norm": 0.6969701647758484, + "learning_rate": 2.9099510243252042e-05, + "loss": 1.0362, + "step": 10508 + }, + { + "epoch": 0.6384956558721672, + "grad_norm": 0.32078859210014343, + "learning_rate": 2.909081431230961e-05, + "loss": 1.0556, + "step": 10509 + }, + { + "epoch": 0.6385564129047937, + "grad_norm": 0.14009247720241547, + "learning_rate": 2.9082119147797437e-05, + "loss": 1.0372, + "step": 10510 + }, + { + "epoch": 0.6386171699374202, + "grad_norm": 0.9470230340957642, + "learning_rate": 2.907342475003421e-05, + "loss": 1.0585, + "step": 10511 + }, + { + "epoch": 0.6386779269700468, + "grad_norm": 0.14937447011470795, + "learning_rate": 2.906473111933863e-05, + "loss": 1.2305, + "step": 10512 + }, + { + "epoch": 0.6387386840026733, + "grad_norm": 0.17845764756202698, + "learning_rate": 2.9056038256029388e-05, + "loss": 1.1116, + "step": 10513 + }, + { + "epoch": 0.6387994410352998, + "grad_norm": 0.19338946044445038, + "learning_rate": 2.9047346160425114e-05, + "loss": 1.0547, + "step": 10514 + }, + { + "epoch": 0.6388601980679264, + "grad_norm": 0.7391157746315002, + "learning_rate": 2.9038654832844425e-05, + "loss": 1.0786, + "step": 10515 + }, + { + "epoch": 0.6389209551005529, + "grad_norm": 12.721567153930664, + "learning_rate": 2.9029964273605902e-05, + "loss": 1.0449, + "step": 10516 + }, + { + "epoch": 0.6389817121331794, + "grad_norm": 0.18733571469783783, + "learning_rate": 2.902127448302806e-05, + "loss": 1.0663, + "step": 10517 + }, + { + "epoch": 0.639042469165806, + "grad_norm": 0.1624296009540558, + "learning_rate": 2.9012585461429475e-05, + "loss": 1.036, + "step": 10518 + }, + { + "epoch": 0.6391032261984325, + "grad_norm": 0.1422363519668579, + "learning_rate": 2.9003897209128627e-05, + "loss": 1.02, + "step": 10519 + }, + { + "epoch": 0.6391639832310589, + "grad_norm": 0.15627799928188324, + "learning_rate": 2.8995209726443988e-05, + "loss": 1.0093, + "step": 10520 + }, + { + "epoch": 0.6392247402636855, + "grad_norm": 0.43517640233039856, + "learning_rate": 2.8986523013694e-05, + "loss": 1.0599, + "step": 10521 + }, + { + "epoch": 0.639285497296312, + "grad_norm": 0.1844719499349594, + "learning_rate": 2.8977837071197067e-05, + "loss": 1.1382, + "step": 10522 + }, + { + "epoch": 0.6393462543289385, + "grad_norm": 1.516577959060669, + "learning_rate": 2.8969151899271573e-05, + "loss": 1.0643, + "step": 10523 + }, + { + "epoch": 0.6394070113615651, + "grad_norm": 0.1679154336452484, + "learning_rate": 2.8960467498235887e-05, + "loss": 1.0892, + "step": 10524 + }, + { + "epoch": 0.6394677683941916, + "grad_norm": 0.18547673523426056, + "learning_rate": 2.895178386840834e-05, + "loss": 1.0279, + "step": 10525 + }, + { + "epoch": 0.6395285254268182, + "grad_norm": 0.23361608386039734, + "learning_rate": 2.8943101010107214e-05, + "loss": 1.1299, + "step": 10526 + }, + { + "epoch": 0.6395892824594447, + "grad_norm": 0.19014549255371094, + "learning_rate": 2.8934418923650795e-05, + "loss": 1.032, + "step": 10527 + }, + { + "epoch": 0.6396500394920712, + "grad_norm": 0.18541701138019562, + "learning_rate": 2.8925737609357296e-05, + "loss": 1.0271, + "step": 10528 + }, + { + "epoch": 0.6397107965246978, + "grad_norm": 0.18043692409992218, + "learning_rate": 2.8917057067544983e-05, + "loss": 1.1251, + "step": 10529 + }, + { + "epoch": 0.6397715535573243, + "grad_norm": 0.14186227321624756, + "learning_rate": 2.8908377298532042e-05, + "loss": 1.0757, + "step": 10530 + }, + { + "epoch": 0.6398323105899508, + "grad_norm": 0.19671863317489624, + "learning_rate": 2.8899698302636586e-05, + "loss": 1.1594, + "step": 10531 + }, + { + "epoch": 0.6398930676225774, + "grad_norm": 0.14527735114097595, + "learning_rate": 2.8891020080176767e-05, + "loss": 1.0831, + "step": 10532 + }, + { + "epoch": 0.6399538246552038, + "grad_norm": 1.8104153871536255, + "learning_rate": 2.888234263147066e-05, + "loss": 1.0576, + "step": 10533 + }, + { + "epoch": 0.6400145816878303, + "grad_norm": 0.1322147399187088, + "learning_rate": 2.887366595683639e-05, + "loss": 1.0787, + "step": 10534 + }, + { + "epoch": 0.6400753387204569, + "grad_norm": 0.3149656057357788, + "learning_rate": 2.8864990056591984e-05, + "loss": 1.1864, + "step": 10535 + }, + { + "epoch": 0.6401360957530834, + "grad_norm": 0.32331764698028564, + "learning_rate": 2.885631493105545e-05, + "loss": 1.0815, + "step": 10536 + }, + { + "epoch": 0.6401968527857099, + "grad_norm": 0.2264787256717682, + "learning_rate": 2.88476405805448e-05, + "loss": 1.262, + "step": 10537 + }, + { + "epoch": 0.6402576098183365, + "grad_norm": 0.16247673332691193, + "learning_rate": 2.8838967005377927e-05, + "loss": 1.1141, + "step": 10538 + }, + { + "epoch": 0.640318366850963, + "grad_norm": 0.26086339354515076, + "learning_rate": 2.8830294205872833e-05, + "loss": 1.1664, + "step": 10539 + }, + { + "epoch": 0.6403791238835895, + "grad_norm": 0.14740754663944244, + "learning_rate": 2.8821622182347395e-05, + "loss": 1.1476, + "step": 10540 + }, + { + "epoch": 0.6404398809162161, + "grad_norm": 0.5690014362335205, + "learning_rate": 2.8812950935119498e-05, + "loss": 1.1018, + "step": 10541 + }, + { + "epoch": 0.6405006379488426, + "grad_norm": 0.18084542453289032, + "learning_rate": 2.8804280464506973e-05, + "loss": 1.1514, + "step": 10542 + }, + { + "epoch": 0.6405613949814691, + "grad_norm": 0.16567806899547577, + "learning_rate": 2.8795610770827643e-05, + "loss": 1.0586, + "step": 10543 + }, + { + "epoch": 0.6406221520140957, + "grad_norm": 1.2102230787277222, + "learning_rate": 2.8786941854399297e-05, + "loss": 1.0514, + "step": 10544 + }, + { + "epoch": 0.6406829090467222, + "grad_norm": 0.3500695824623108, + "learning_rate": 2.8778273715539706e-05, + "loss": 1.0911, + "step": 10545 + }, + { + "epoch": 0.6407436660793487, + "grad_norm": 0.5379045009613037, + "learning_rate": 2.8769606354566585e-05, + "loss": 1.0863, + "step": 10546 + }, + { + "epoch": 0.6408044231119752, + "grad_norm": 0.22588512301445007, + "learning_rate": 2.8760939771797656e-05, + "loss": 1.0849, + "step": 10547 + }, + { + "epoch": 0.6408651801446017, + "grad_norm": 0.133767768740654, + "learning_rate": 2.875227396755058e-05, + "loss": 1.0827, + "step": 10548 + }, + { + "epoch": 0.6409259371772282, + "grad_norm": 0.22321519255638123, + "learning_rate": 2.8743608942142986e-05, + "loss": 1.1239, + "step": 10549 + }, + { + "epoch": 0.6409866942098548, + "grad_norm": 0.23637591302394867, + "learning_rate": 2.8734944695892564e-05, + "loss": 1.177, + "step": 10550 + }, + { + "epoch": 0.6410474512424813, + "grad_norm": 0.13198213279247284, + "learning_rate": 2.8726281229116826e-05, + "loss": 1.0856, + "step": 10551 + }, + { + "epoch": 0.6411082082751078, + "grad_norm": 0.16887839138507843, + "learning_rate": 2.8717618542133362e-05, + "loss": 1.0927, + "step": 10552 + }, + { + "epoch": 0.6411689653077344, + "grad_norm": 0.2495092898607254, + "learning_rate": 2.870895663525971e-05, + "loss": 1.0293, + "step": 10553 + }, + { + "epoch": 0.6412297223403609, + "grad_norm": 0.17197994887828827, + "learning_rate": 2.8700295508813342e-05, + "loss": 1.2436, + "step": 10554 + }, + { + "epoch": 0.6412904793729874, + "grad_norm": 0.21060949563980103, + "learning_rate": 2.869163516311178e-05, + "loss": 1.112, + "step": 10555 + }, + { + "epoch": 0.641351236405614, + "grad_norm": 0.1529807299375534, + "learning_rate": 2.868297559847245e-05, + "loss": 1.0931, + "step": 10556 + }, + { + "epoch": 0.6414119934382405, + "grad_norm": 0.16953857243061066, + "learning_rate": 2.8674316815212797e-05, + "loss": 1.0739, + "step": 10557 + }, + { + "epoch": 0.641472750470867, + "grad_norm": 0.35685837268829346, + "learning_rate": 2.8665658813650153e-05, + "loss": 1.3015, + "step": 10558 + }, + { + "epoch": 0.6415335075034936, + "grad_norm": 0.11360722035169601, + "learning_rate": 2.8657001594101895e-05, + "loss": 1.0546, + "step": 10559 + }, + { + "epoch": 0.64159426453612, + "grad_norm": 0.22124333679676056, + "learning_rate": 2.864834515688539e-05, + "loss": 1.0973, + "step": 10560 + }, + { + "epoch": 0.6416550215687465, + "grad_norm": 0.14160633087158203, + "learning_rate": 2.8639689502317917e-05, + "loss": 1.0706, + "step": 10561 + }, + { + "epoch": 0.6417157786013731, + "grad_norm": 0.2581811845302582, + "learning_rate": 2.863103463071676e-05, + "loss": 1.0185, + "step": 10562 + }, + { + "epoch": 0.6417765356339996, + "grad_norm": 0.11835815757513046, + "learning_rate": 2.862238054239915e-05, + "loss": 1.0791, + "step": 10563 + }, + { + "epoch": 0.6418372926666261, + "grad_norm": 0.24685421586036682, + "learning_rate": 2.861372723768232e-05, + "loss": 1.0465, + "step": 10564 + }, + { + "epoch": 0.6418980496992527, + "grad_norm": 0.15230706334114075, + "learning_rate": 2.8605074716883452e-05, + "loss": 1.0551, + "step": 10565 + }, + { + "epoch": 0.6419588067318792, + "grad_norm": 0.6146996021270752, + "learning_rate": 2.8596422980319704e-05, + "loss": 1.0456, + "step": 10566 + }, + { + "epoch": 0.6420195637645058, + "grad_norm": 5.732402801513672, + "learning_rate": 2.8587772028308208e-05, + "loss": 1.0217, + "step": 10567 + }, + { + "epoch": 0.6420803207971323, + "grad_norm": 0.1213982030749321, + "learning_rate": 2.857912186116607e-05, + "loss": 1.0752, + "step": 10568 + }, + { + "epoch": 0.6421410778297588, + "grad_norm": 0.12367095053195953, + "learning_rate": 2.8570472479210354e-05, + "loss": 1.0006, + "step": 10569 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 0.14044378697872162, + "learning_rate": 2.856182388275812e-05, + "loss": 1.0052, + "step": 10570 + }, + { + "epoch": 0.6422625918950119, + "grad_norm": 0.14465275406837463, + "learning_rate": 2.855317607212637e-05, + "loss": 1.04, + "step": 10571 + }, + { + "epoch": 0.6423233489276384, + "grad_norm": 0.14098306000232697, + "learning_rate": 2.8544529047632095e-05, + "loss": 1.143, + "step": 10572 + }, + { + "epoch": 0.6423841059602649, + "grad_norm": 0.10634411126375198, + "learning_rate": 2.853588280959226e-05, + "loss": 1.0059, + "step": 10573 + }, + { + "epoch": 0.6424448629928914, + "grad_norm": 0.13710854947566986, + "learning_rate": 2.8527237358323768e-05, + "loss": 1.1003, + "step": 10574 + }, + { + "epoch": 0.6425056200255179, + "grad_norm": 0.1620447039604187, + "learning_rate": 2.8518592694143564e-05, + "loss": 1.0908, + "step": 10575 + }, + { + "epoch": 0.6425663770581445, + "grad_norm": 0.2394874393939972, + "learning_rate": 2.8509948817368492e-05, + "loss": 1.0776, + "step": 10576 + }, + { + "epoch": 0.642627134090771, + "grad_norm": 0.1275450736284256, + "learning_rate": 2.850130572831542e-05, + "loss": 1.0457, + "step": 10577 + }, + { + "epoch": 0.6426878911233975, + "grad_norm": 0.10778269916772842, + "learning_rate": 2.8492663427301127e-05, + "loss": 1.0434, + "step": 10578 + }, + { + "epoch": 0.6427486481560241, + "grad_norm": 0.3755713403224945, + "learning_rate": 2.848402191464239e-05, + "loss": 1.1312, + "step": 10579 + }, + { + "epoch": 0.6428094051886506, + "grad_norm": 0.17444919049739838, + "learning_rate": 2.8475381190656013e-05, + "loss": 1.0979, + "step": 10580 + }, + { + "epoch": 0.6428701622212771, + "grad_norm": 0.13567398488521576, + "learning_rate": 2.8466741255658693e-05, + "loss": 1.1309, + "step": 10581 + }, + { + "epoch": 0.6429309192539037, + "grad_norm": 0.5624414086341858, + "learning_rate": 2.8458102109967133e-05, + "loss": 1.2213, + "step": 10582 + }, + { + "epoch": 0.6429916762865302, + "grad_norm": 0.16844765841960907, + "learning_rate": 2.844946375389803e-05, + "loss": 1.1224, + "step": 10583 + }, + { + "epoch": 0.6430524333191567, + "grad_norm": 1.1051650047302246, + "learning_rate": 2.844082618776796e-05, + "loss": 1.0876, + "step": 10584 + }, + { + "epoch": 0.6431131903517833, + "grad_norm": 0.619744598865509, + "learning_rate": 2.843218941189359e-05, + "loss": 1.084, + "step": 10585 + }, + { + "epoch": 0.6431739473844097, + "grad_norm": 0.15182097256183624, + "learning_rate": 2.8423553426591486e-05, + "loss": 1.0927, + "step": 10586 + }, + { + "epoch": 0.6432347044170362, + "grad_norm": 0.14888085424900055, + "learning_rate": 2.84149182321782e-05, + "loss": 1.0007, + "step": 10587 + }, + { + "epoch": 0.6432954614496628, + "grad_norm": 0.5216175317764282, + "learning_rate": 2.840628382897026e-05, + "loss": 1.073, + "step": 10588 + }, + { + "epoch": 0.6433562184822893, + "grad_norm": 0.19235432147979736, + "learning_rate": 2.8397650217284165e-05, + "loss": 1.1685, + "step": 10589 + }, + { + "epoch": 0.6434169755149158, + "grad_norm": 0.12589845061302185, + "learning_rate": 2.838901739743637e-05, + "loss": 1.0045, + "step": 10590 + }, + { + "epoch": 0.6434777325475424, + "grad_norm": 0.5008223652839661, + "learning_rate": 2.8380385369743325e-05, + "loss": 1.1294, + "step": 10591 + }, + { + "epoch": 0.6435384895801689, + "grad_norm": 0.15247169137001038, + "learning_rate": 2.837175413452143e-05, + "loss": 1.0924, + "step": 10592 + }, + { + "epoch": 0.6435992466127954, + "grad_norm": 0.15467919409275055, + "learning_rate": 2.836312369208707e-05, + "loss": 1.0603, + "step": 10593 + }, + { + "epoch": 0.643660003645422, + "grad_norm": 0.23309412598609924, + "learning_rate": 2.835449404275659e-05, + "loss": 1.0223, + "step": 10594 + }, + { + "epoch": 0.6437207606780485, + "grad_norm": 0.12963947653770447, + "learning_rate": 2.83458651868463e-05, + "loss": 1.0094, + "step": 10595 + }, + { + "epoch": 0.643781517710675, + "grad_norm": 0.1775350123643875, + "learning_rate": 2.833723712467252e-05, + "loss": 1.163, + "step": 10596 + }, + { + "epoch": 0.6438422747433016, + "grad_norm": 0.11030242592096329, + "learning_rate": 2.8328609856551524e-05, + "loss": 1.002, + "step": 10597 + }, + { + "epoch": 0.6439030317759281, + "grad_norm": 0.1264825314283371, + "learning_rate": 2.83199833827995e-05, + "loss": 1.0638, + "step": 10598 + }, + { + "epoch": 0.6439637888085545, + "grad_norm": 0.16903752088546753, + "learning_rate": 2.8311357703732683e-05, + "loss": 1.1039, + "step": 10599 + }, + { + "epoch": 0.6440245458411811, + "grad_norm": 0.14801651239395142, + "learning_rate": 2.830273281966722e-05, + "loss": 1.0716, + "step": 10600 + }, + { + "epoch": 0.6440853028738076, + "grad_norm": 0.120904341340065, + "learning_rate": 2.8294108730919295e-05, + "loss": 1.0582, + "step": 10601 + }, + { + "epoch": 0.6441460599064341, + "grad_norm": 0.23208607733249664, + "learning_rate": 2.8285485437805016e-05, + "loss": 1.1607, + "step": 10602 + }, + { + "epoch": 0.6442068169390607, + "grad_norm": 0.1501462310552597, + "learning_rate": 2.827686294064048e-05, + "loss": 1.0478, + "step": 10603 + }, + { + "epoch": 0.6442675739716872, + "grad_norm": 0.18621088564395905, + "learning_rate": 2.8268241239741712e-05, + "loss": 1.1535, + "step": 10604 + }, + { + "epoch": 0.6443283310043137, + "grad_norm": 0.13974034786224365, + "learning_rate": 2.825962033542474e-05, + "loss": 1.0888, + "step": 10605 + }, + { + "epoch": 0.6443890880369403, + "grad_norm": 0.18857252597808838, + "learning_rate": 2.8251000228005614e-05, + "loss": 1.1022, + "step": 10606 + }, + { + "epoch": 0.6444498450695668, + "grad_norm": 0.12503036856651306, + "learning_rate": 2.8242380917800278e-05, + "loss": 1.0141, + "step": 10607 + }, + { + "epoch": 0.6445106021021934, + "grad_norm": 0.13200336694717407, + "learning_rate": 2.823376240512467e-05, + "loss": 1.0721, + "step": 10608 + }, + { + "epoch": 0.6445713591348199, + "grad_norm": 2.661329507827759, + "learning_rate": 2.8225144690294714e-05, + "loss": 1.0722, + "step": 10609 + }, + { + "epoch": 0.6446321161674464, + "grad_norm": 0.31004250049591064, + "learning_rate": 2.8216527773626277e-05, + "loss": 1.3008, + "step": 10610 + }, + { + "epoch": 0.644692873200073, + "grad_norm": 0.11052168905735016, + "learning_rate": 2.8207911655435238e-05, + "loss": 1.1473, + "step": 10611 + }, + { + "epoch": 0.6447536302326994, + "grad_norm": 0.13838598132133484, + "learning_rate": 2.8199296336037406e-05, + "loss": 1.0776, + "step": 10612 + }, + { + "epoch": 0.6448143872653259, + "grad_norm": 0.22109512984752655, + "learning_rate": 2.8190681815748577e-05, + "loss": 1.0097, + "step": 10613 + }, + { + "epoch": 0.6448751442979525, + "grad_norm": 0.5599778294563293, + "learning_rate": 2.8182068094884527e-05, + "loss": 1.1104, + "step": 10614 + }, + { + "epoch": 0.644935901330579, + "grad_norm": 0.15470832586288452, + "learning_rate": 2.8173455173760987e-05, + "loss": 1.1861, + "step": 10615 + }, + { + "epoch": 0.6449966583632055, + "grad_norm": 0.11315450072288513, + "learning_rate": 2.8164843052693645e-05, + "loss": 1.004, + "step": 10616 + }, + { + "epoch": 0.6450574153958321, + "grad_norm": 0.26550528407096863, + "learning_rate": 2.815623173199824e-05, + "loss": 1.1666, + "step": 10617 + }, + { + "epoch": 0.6451181724284586, + "grad_norm": 0.22693970799446106, + "learning_rate": 2.814762121199036e-05, + "loss": 1.0744, + "step": 10618 + }, + { + "epoch": 0.6451789294610851, + "grad_norm": 0.1459822803735733, + "learning_rate": 2.8139011492985657e-05, + "loss": 1.0092, + "step": 10619 + }, + { + "epoch": 0.6452396864937117, + "grad_norm": 0.21222250163555145, + "learning_rate": 2.813040257529971e-05, + "loss": 1.1666, + "step": 10620 + }, + { + "epoch": 0.6453004435263382, + "grad_norm": 0.1804226189851761, + "learning_rate": 2.8121794459248063e-05, + "loss": 1.1138, + "step": 10621 + }, + { + "epoch": 0.6453612005589647, + "grad_norm": 0.1522487848997116, + "learning_rate": 2.811318714514629e-05, + "loss": 1.0307, + "step": 10622 + }, + { + "epoch": 0.6454219575915913, + "grad_norm": 0.13227388262748718, + "learning_rate": 2.8104580633309872e-05, + "loss": 1.0279, + "step": 10623 + }, + { + "epoch": 0.6454827146242178, + "grad_norm": 0.1361694186925888, + "learning_rate": 2.8095974924054303e-05, + "loss": 1.0827, + "step": 10624 + }, + { + "epoch": 0.6455434716568442, + "grad_norm": 0.12480373680591583, + "learning_rate": 2.8087370017694992e-05, + "loss": 1.0458, + "step": 10625 + }, + { + "epoch": 0.6456042286894708, + "grad_norm": 0.17040294408798218, + "learning_rate": 2.8078765914547345e-05, + "loss": 1.0888, + "step": 10626 + }, + { + "epoch": 0.6456649857220973, + "grad_norm": 0.17852675914764404, + "learning_rate": 2.8070162614926787e-05, + "loss": 1.1456, + "step": 10627 + }, + { + "epoch": 0.6457257427547238, + "grad_norm": 0.11578996479511261, + "learning_rate": 2.8061560119148662e-05, + "loss": 1.0609, + "step": 10628 + }, + { + "epoch": 0.6457864997873504, + "grad_norm": 0.11707695573568344, + "learning_rate": 2.8052958427528285e-05, + "loss": 0.9949, + "step": 10629 + }, + { + "epoch": 0.6458472568199769, + "grad_norm": 0.12889371812343597, + "learning_rate": 2.8044357540380984e-05, + "loss": 1.0478, + "step": 10630 + }, + { + "epoch": 0.6459080138526034, + "grad_norm": 0.1696360856294632, + "learning_rate": 2.803575745802196e-05, + "loss": 1.08, + "step": 10631 + }, + { + "epoch": 0.64596877088523, + "grad_norm": 0.12604303658008575, + "learning_rate": 2.8027158180766518e-05, + "loss": 1.0659, + "step": 10632 + }, + { + "epoch": 0.6460295279178565, + "grad_norm": 0.16087479889392853, + "learning_rate": 2.8018559708929837e-05, + "loss": 1.048, + "step": 10633 + }, + { + "epoch": 0.646090284950483, + "grad_norm": 0.14207258820533752, + "learning_rate": 2.8009962042827098e-05, + "loss": 1.0718, + "step": 10634 + }, + { + "epoch": 0.6461510419831096, + "grad_norm": 0.161500945687294, + "learning_rate": 2.8001365182773447e-05, + "loss": 1.1255, + "step": 10635 + }, + { + "epoch": 0.6462117990157361, + "grad_norm": 0.17684707045555115, + "learning_rate": 2.7992769129084005e-05, + "loss": 1.1101, + "step": 10636 + }, + { + "epoch": 0.6462725560483626, + "grad_norm": 0.1507025808095932, + "learning_rate": 2.7984173882073872e-05, + "loss": 1.0229, + "step": 10637 + }, + { + "epoch": 0.6463333130809891, + "grad_norm": 0.1448822170495987, + "learning_rate": 2.7975579442058102e-05, + "loss": 1.0163, + "step": 10638 + }, + { + "epoch": 0.6463940701136156, + "grad_norm": 0.17672374844551086, + "learning_rate": 2.796698580935172e-05, + "loss": 1.0837, + "step": 10639 + }, + { + "epoch": 0.6464548271462421, + "grad_norm": 0.1632174402475357, + "learning_rate": 2.795839298426973e-05, + "loss": 1.124, + "step": 10640 + }, + { + "epoch": 0.6465155841788687, + "grad_norm": 0.2601765990257263, + "learning_rate": 2.7949800967127116e-05, + "loss": 1.1536, + "step": 10641 + }, + { + "epoch": 0.6465763412114952, + "grad_norm": 0.37360918521881104, + "learning_rate": 2.7941209758238786e-05, + "loss": 1.2292, + "step": 10642 + }, + { + "epoch": 0.6466370982441217, + "grad_norm": 0.3798327147960663, + "learning_rate": 2.7932619357919687e-05, + "loss": 1.133, + "step": 10643 + }, + { + "epoch": 0.6466978552767483, + "grad_norm": 0.11715422570705414, + "learning_rate": 2.7924029766484717e-05, + "loss": 1.0242, + "step": 10644 + }, + { + "epoch": 0.6467586123093748, + "grad_norm": 0.11328744888305664, + "learning_rate": 2.7915440984248686e-05, + "loss": 1.0338, + "step": 10645 + }, + { + "epoch": 0.6468193693420013, + "grad_norm": 0.189795583486557, + "learning_rate": 2.7906853011526435e-05, + "loss": 1.0895, + "step": 10646 + }, + { + "epoch": 0.6468801263746279, + "grad_norm": 0.12398957461118698, + "learning_rate": 2.7898265848632733e-05, + "loss": 1.0871, + "step": 10647 + }, + { + "epoch": 0.6469408834072544, + "grad_norm": 0.12003659456968307, + "learning_rate": 2.7889679495882385e-05, + "loss": 1.0596, + "step": 10648 + }, + { + "epoch": 0.647001640439881, + "grad_norm": 0.19922269880771637, + "learning_rate": 2.788109395359012e-05, + "loss": 1.0853, + "step": 10649 + }, + { + "epoch": 0.6470623974725075, + "grad_norm": 0.12221385538578033, + "learning_rate": 2.7872509222070642e-05, + "loss": 1.0815, + "step": 10650 + }, + { + "epoch": 0.647123154505134, + "grad_norm": 0.21900205314159393, + "learning_rate": 2.7863925301638598e-05, + "loss": 1.0685, + "step": 10651 + }, + { + "epoch": 0.6471839115377604, + "grad_norm": 0.16819904744625092, + "learning_rate": 2.7855342192608637e-05, + "loss": 1.1614, + "step": 10652 + }, + { + "epoch": 0.647244668570387, + "grad_norm": 0.1284661889076233, + "learning_rate": 2.7846759895295397e-05, + "loss": 1.0313, + "step": 10653 + }, + { + "epoch": 0.6473054256030135, + "grad_norm": 0.1185019314289093, + "learning_rate": 2.783817841001346e-05, + "loss": 1.0814, + "step": 10654 + }, + { + "epoch": 0.64736618263564, + "grad_norm": 0.14398090541362762, + "learning_rate": 2.7829597737077385e-05, + "loss": 1.086, + "step": 10655 + }, + { + "epoch": 0.6474269396682666, + "grad_norm": 0.14481377601623535, + "learning_rate": 2.782101787680168e-05, + "loss": 1.0586, + "step": 10656 + }, + { + "epoch": 0.6474876967008931, + "grad_norm": 0.13460202515125275, + "learning_rate": 2.781243882950085e-05, + "loss": 1.063, + "step": 10657 + }, + { + "epoch": 0.6475484537335197, + "grad_norm": 0.3281172811985016, + "learning_rate": 2.7803860595489373e-05, + "loss": 1.1042, + "step": 10658 + }, + { + "epoch": 0.6476092107661462, + "grad_norm": 0.15195156633853912, + "learning_rate": 2.7795283175081678e-05, + "loss": 1.1059, + "step": 10659 + }, + { + "epoch": 0.6476699677987727, + "grad_norm": 0.3348652720451355, + "learning_rate": 2.7786706568592163e-05, + "loss": 1.1535, + "step": 10660 + }, + { + "epoch": 0.6477307248313993, + "grad_norm": 0.20453037321567535, + "learning_rate": 2.7778130776335217e-05, + "loss": 1.0573, + "step": 10661 + }, + { + "epoch": 0.6477914818640258, + "grad_norm": 0.20219139754772186, + "learning_rate": 2.776955579862518e-05, + "loss": 1.051, + "step": 10662 + }, + { + "epoch": 0.6478522388966523, + "grad_norm": 0.1860876977443695, + "learning_rate": 2.776098163577635e-05, + "loss": 1.1814, + "step": 10663 + }, + { + "epoch": 0.6479129959292789, + "grad_norm": 0.7542808055877686, + "learning_rate": 2.7752408288103082e-05, + "loss": 1.096, + "step": 10664 + }, + { + "epoch": 0.6479737529619053, + "grad_norm": 0.17214106023311615, + "learning_rate": 2.774383575591956e-05, + "loss": 1.0859, + "step": 10665 + }, + { + "epoch": 0.6480345099945318, + "grad_norm": 0.17734161019325256, + "learning_rate": 2.773526403954005e-05, + "loss": 1.0953, + "step": 10666 + }, + { + "epoch": 0.6480952670271584, + "grad_norm": 0.1600418984889984, + "learning_rate": 2.7726693139278716e-05, + "loss": 1.0923, + "step": 10667 + }, + { + "epoch": 0.6481560240597849, + "grad_norm": 0.11942606419324875, + "learning_rate": 2.7718123055449773e-05, + "loss": 1.0367, + "step": 10668 + }, + { + "epoch": 0.6482167810924114, + "grad_norm": 0.1817196160554886, + "learning_rate": 2.770955378836733e-05, + "loss": 1.1031, + "step": 10669 + }, + { + "epoch": 0.648277538125038, + "grad_norm": 0.12810561060905457, + "learning_rate": 2.7700985338345525e-05, + "loss": 1.129, + "step": 10670 + }, + { + "epoch": 0.6483382951576645, + "grad_norm": 0.15423093736171722, + "learning_rate": 2.769241770569839e-05, + "loss": 1.0173, + "step": 10671 + }, + { + "epoch": 0.648399052190291, + "grad_norm": 0.1347077190876007, + "learning_rate": 2.768385089073998e-05, + "loss": 1.0857, + "step": 10672 + }, + { + "epoch": 0.6484598092229176, + "grad_norm": 0.09607825428247452, + "learning_rate": 2.7675284893784353e-05, + "loss": 1.0281, + "step": 10673 + }, + { + "epoch": 0.6485205662555441, + "grad_norm": 0.11636609584093094, + "learning_rate": 2.766671971514547e-05, + "loss": 1.0655, + "step": 10674 + }, + { + "epoch": 0.6485813232881706, + "grad_norm": 4.822450160980225, + "learning_rate": 2.7658155355137295e-05, + "loss": 1.173, + "step": 10675 + }, + { + "epoch": 0.6486420803207972, + "grad_norm": 0.14269623160362244, + "learning_rate": 2.764959181407375e-05, + "loss": 1.0798, + "step": 10676 + }, + { + "epoch": 0.6487028373534237, + "grad_norm": 0.21184715628623962, + "learning_rate": 2.7641029092268733e-05, + "loss": 1.1282, + "step": 10677 + }, + { + "epoch": 0.6487635943860501, + "grad_norm": 0.19763189554214478, + "learning_rate": 2.7632467190036128e-05, + "loss": 1.1804, + "step": 10678 + }, + { + "epoch": 0.6488243514186767, + "grad_norm": 0.16328218579292297, + "learning_rate": 2.762390610768975e-05, + "loss": 1.134, + "step": 10679 + }, + { + "epoch": 0.6488851084513032, + "grad_norm": 0.1238720566034317, + "learning_rate": 2.761534584554343e-05, + "loss": 1.0484, + "step": 10680 + }, + { + "epoch": 0.6489458654839297, + "grad_norm": 0.1482764333486557, + "learning_rate": 2.7606786403910923e-05, + "loss": 1.1044, + "step": 10681 + }, + { + "epoch": 0.6490066225165563, + "grad_norm": 0.12053503841161728, + "learning_rate": 2.7598227783105995e-05, + "loss": 1.0508, + "step": 10682 + }, + { + "epoch": 0.6490673795491828, + "grad_norm": 0.2628561556339264, + "learning_rate": 2.758966998344234e-05, + "loss": 1.2877, + "step": 10683 + }, + { + "epoch": 0.6491281365818093, + "grad_norm": 0.12281625717878342, + "learning_rate": 2.75811130052337e-05, + "loss": 0.9922, + "step": 10684 + }, + { + "epoch": 0.6491888936144359, + "grad_norm": 0.11099492013454437, + "learning_rate": 2.757255684879368e-05, + "loss": 1.0338, + "step": 10685 + }, + { + "epoch": 0.6492496506470624, + "grad_norm": 0.12935888767242432, + "learning_rate": 2.756400151443592e-05, + "loss": 1.2749, + "step": 10686 + }, + { + "epoch": 0.649310407679689, + "grad_norm": 0.11092004179954529, + "learning_rate": 2.7555447002474023e-05, + "loss": 1.0279, + "step": 10687 + }, + { + "epoch": 0.6493711647123155, + "grad_norm": 1.9871572256088257, + "learning_rate": 2.7546893313221535e-05, + "loss": 1.0454, + "step": 10688 + }, + { + "epoch": 0.649431921744942, + "grad_norm": 0.19476854801177979, + "learning_rate": 2.753834044699203e-05, + "loss": 1.2029, + "step": 10689 + }, + { + "epoch": 0.6494926787775686, + "grad_norm": 0.4752329885959625, + "learning_rate": 2.7529788404098998e-05, + "loss": 1.0481, + "step": 10690 + }, + { + "epoch": 0.649553435810195, + "grad_norm": 0.1420932561159134, + "learning_rate": 2.752123718485594e-05, + "loss": 1.0116, + "step": 10691 + }, + { + "epoch": 0.6496141928428215, + "grad_norm": 0.15726050734519958, + "learning_rate": 2.7512686789576253e-05, + "loss": 1.076, + "step": 10692 + }, + { + "epoch": 0.649674949875448, + "grad_norm": 0.1392820030450821, + "learning_rate": 2.750413721857336e-05, + "loss": 1.0952, + "step": 10693 + }, + { + "epoch": 0.6497357069080746, + "grad_norm": 0.1012481153011322, + "learning_rate": 2.7495588472160687e-05, + "loss": 1.0534, + "step": 10694 + }, + { + "epoch": 0.6497964639407011, + "grad_norm": 0.1261141300201416, + "learning_rate": 2.7487040550651564e-05, + "loss": 1.0513, + "step": 10695 + }, + { + "epoch": 0.6498572209733277, + "grad_norm": 0.1541125625371933, + "learning_rate": 2.7478493454359334e-05, + "loss": 1.1897, + "step": 10696 + }, + { + "epoch": 0.6499179780059542, + "grad_norm": 0.10333828628063202, + "learning_rate": 2.746994718359729e-05, + "loss": 1.069, + "step": 10697 + }, + { + "epoch": 0.6499787350385807, + "grad_norm": 0.2629009485244751, + "learning_rate": 2.7461401738678643e-05, + "loss": 1.0752, + "step": 10698 + }, + { + "epoch": 0.6500394920712073, + "grad_norm": 0.7989603281021118, + "learning_rate": 2.7452857119916696e-05, + "loss": 1.3133, + "step": 10699 + }, + { + "epoch": 0.6501002491038338, + "grad_norm": 0.3149626851081848, + "learning_rate": 2.7444313327624632e-05, + "loss": 1.2889, + "step": 10700 + }, + { + "epoch": 0.6501610061364603, + "grad_norm": 0.11603736877441406, + "learning_rate": 2.743577036211562e-05, + "loss": 1.1079, + "step": 10701 + }, + { + "epoch": 0.6502217631690869, + "grad_norm": 0.13387934863567352, + "learning_rate": 2.7427228223702805e-05, + "loss": 1.0513, + "step": 10702 + }, + { + "epoch": 0.6502825202017134, + "grad_norm": 0.13568900525569916, + "learning_rate": 2.741868691269931e-05, + "loss": 1.0568, + "step": 10703 + }, + { + "epoch": 0.6503432772343398, + "grad_norm": 0.13343168795108795, + "learning_rate": 2.7410146429418206e-05, + "loss": 1.0544, + "step": 10704 + }, + { + "epoch": 0.6504040342669664, + "grad_norm": 0.12232920527458191, + "learning_rate": 2.740160677417255e-05, + "loss": 1.0325, + "step": 10705 + }, + { + "epoch": 0.6504647912995929, + "grad_norm": 0.7413262724876404, + "learning_rate": 2.739306794727537e-05, + "loss": 1.2225, + "step": 10706 + }, + { + "epoch": 0.6505255483322194, + "grad_norm": 0.15551714599132538, + "learning_rate": 2.738452994903965e-05, + "loss": 1.1337, + "step": 10707 + }, + { + "epoch": 0.650586305364846, + "grad_norm": 0.1549120545387268, + "learning_rate": 2.737599277977836e-05, + "loss": 1.0281, + "step": 10708 + }, + { + "epoch": 0.6506470623974725, + "grad_norm": 0.21087591350078583, + "learning_rate": 2.7367456439804407e-05, + "loss": 1.0843, + "step": 10709 + }, + { + "epoch": 0.650707819430099, + "grad_norm": 0.12115465104579926, + "learning_rate": 2.735892092943073e-05, + "loss": 1.0472, + "step": 10710 + }, + { + "epoch": 0.6507685764627256, + "grad_norm": 0.19685101509094238, + "learning_rate": 2.7350386248970207e-05, + "loss": 1.1719, + "step": 10711 + }, + { + "epoch": 0.6508293334953521, + "grad_norm": 0.12675067782402039, + "learning_rate": 2.734185239873563e-05, + "loss": 1.0328, + "step": 10712 + }, + { + "epoch": 0.6508900905279786, + "grad_norm": 0.2035692185163498, + "learning_rate": 2.7333319379039834e-05, + "loss": 1.1672, + "step": 10713 + }, + { + "epoch": 0.6509508475606052, + "grad_norm": 0.26875996589660645, + "learning_rate": 2.7324787190195577e-05, + "loss": 1.1056, + "step": 10714 + }, + { + "epoch": 0.6510116045932317, + "grad_norm": 0.12651421129703522, + "learning_rate": 2.731625583251565e-05, + "loss": 1.0526, + "step": 10715 + }, + { + "epoch": 0.6510723616258582, + "grad_norm": 0.18824221193790436, + "learning_rate": 2.730772530631275e-05, + "loss": 1.0381, + "step": 10716 + }, + { + "epoch": 0.6511331186584847, + "grad_norm": 0.13760711252689362, + "learning_rate": 2.7299195611899587e-05, + "loss": 1.0657, + "step": 10717 + }, + { + "epoch": 0.6511938756911112, + "grad_norm": 0.14851705729961395, + "learning_rate": 2.729066674958878e-05, + "loss": 1.1141, + "step": 10718 + }, + { + "epoch": 0.6512546327237377, + "grad_norm": 0.1626603901386261, + "learning_rate": 2.728213871969294e-05, + "loss": 1.0949, + "step": 10719 + }, + { + "epoch": 0.6513153897563643, + "grad_norm": 0.14219172298908234, + "learning_rate": 2.7273611522524728e-05, + "loss": 1.0762, + "step": 10720 + }, + { + "epoch": 0.6513761467889908, + "grad_norm": 0.16569769382476807, + "learning_rate": 2.7265085158396675e-05, + "loss": 1.0168, + "step": 10721 + }, + { + "epoch": 0.6514369038216173, + "grad_norm": 0.1216806098818779, + "learning_rate": 2.725655962762132e-05, + "loss": 1.0689, + "step": 10722 + }, + { + "epoch": 0.6514976608542439, + "grad_norm": 0.15485937893390656, + "learning_rate": 2.724803493051117e-05, + "loss": 1.0672, + "step": 10723 + }, + { + "epoch": 0.6515584178868704, + "grad_norm": 0.12191330641508102, + "learning_rate": 2.7239511067378692e-05, + "loss": 1.0433, + "step": 10724 + }, + { + "epoch": 0.651619174919497, + "grad_norm": 0.10191213339567184, + "learning_rate": 2.723098803853634e-05, + "loss": 1.0429, + "step": 10725 + }, + { + "epoch": 0.6516799319521235, + "grad_norm": 0.16689635813236237, + "learning_rate": 2.722246584429652e-05, + "loss": 1.1634, + "step": 10726 + }, + { + "epoch": 0.65174068898475, + "grad_norm": 0.13971641659736633, + "learning_rate": 2.7213944484971622e-05, + "loss": 1.0802, + "step": 10727 + }, + { + "epoch": 0.6518014460173766, + "grad_norm": 0.20681428909301758, + "learning_rate": 2.7205423960873987e-05, + "loss": 1.0837, + "step": 10728 + }, + { + "epoch": 0.6518622030500031, + "grad_norm": 0.4985653758049011, + "learning_rate": 2.7196904272315947e-05, + "loss": 1.1315, + "step": 10729 + }, + { + "epoch": 0.6519229600826295, + "grad_norm": 0.12481482326984406, + "learning_rate": 2.718838541960977e-05, + "loss": 1.1028, + "step": 10730 + }, + { + "epoch": 0.651983717115256, + "grad_norm": 0.1376858800649643, + "learning_rate": 2.717986740306777e-05, + "loss": 1.1033, + "step": 10731 + }, + { + "epoch": 0.6520444741478826, + "grad_norm": 0.3027942180633545, + "learning_rate": 2.7171350223002123e-05, + "loss": 1.1748, + "step": 10732 + }, + { + "epoch": 0.6521052311805091, + "grad_norm": 0.16355596482753754, + "learning_rate": 2.716283387972505e-05, + "loss": 1.0503, + "step": 10733 + }, + { + "epoch": 0.6521659882131357, + "grad_norm": 0.12199196964502335, + "learning_rate": 2.715431837354871e-05, + "loss": 1.0452, + "step": 10734 + }, + { + "epoch": 0.6522267452457622, + "grad_norm": 0.17801955342292786, + "learning_rate": 2.7145803704785234e-05, + "loss": 1.1548, + "step": 10735 + }, + { + "epoch": 0.6522875022783887, + "grad_norm": 0.15752169489860535, + "learning_rate": 2.7137289873746752e-05, + "loss": 1.1457, + "step": 10736 + }, + { + "epoch": 0.6523482593110153, + "grad_norm": 0.16104267537593842, + "learning_rate": 2.7128776880745354e-05, + "loss": 1.0534, + "step": 10737 + }, + { + "epoch": 0.6524090163436418, + "grad_norm": 0.16529831290245056, + "learning_rate": 2.7120264726093038e-05, + "loss": 1.1007, + "step": 10738 + }, + { + "epoch": 0.6524697733762683, + "grad_norm": 0.14979039132595062, + "learning_rate": 2.7111753410101838e-05, + "loss": 1.1359, + "step": 10739 + }, + { + "epoch": 0.6525305304088949, + "grad_norm": 0.129634290933609, + "learning_rate": 2.710324293308373e-05, + "loss": 1.0222, + "step": 10740 + }, + { + "epoch": 0.6525912874415214, + "grad_norm": 0.10899734497070312, + "learning_rate": 2.709473329535069e-05, + "loss": 1.0715, + "step": 10741 + }, + { + "epoch": 0.6526520444741479, + "grad_norm": 0.227709099650383, + "learning_rate": 2.708622449721463e-05, + "loss": 1.0479, + "step": 10742 + }, + { + "epoch": 0.6527128015067744, + "grad_norm": 0.1568709909915924, + "learning_rate": 2.707771653898744e-05, + "loss": 1.0215, + "step": 10743 + }, + { + "epoch": 0.6527735585394009, + "grad_norm": 0.14843539893627167, + "learning_rate": 2.7069209420981002e-05, + "loss": 1.1602, + "step": 10744 + }, + { + "epoch": 0.6528343155720274, + "grad_norm": 0.2109106034040451, + "learning_rate": 2.7060703143507088e-05, + "loss": 1.0605, + "step": 10745 + }, + { + "epoch": 0.652895072604654, + "grad_norm": 0.12173515558242798, + "learning_rate": 2.7052197706877553e-05, + "loss": 1.0649, + "step": 10746 + }, + { + "epoch": 0.6529558296372805, + "grad_norm": 0.14209100604057312, + "learning_rate": 2.7043693111404146e-05, + "loss": 1.0505, + "step": 10747 + }, + { + "epoch": 0.653016586669907, + "grad_norm": 0.15922392904758453, + "learning_rate": 2.7035189357398605e-05, + "loss": 1.0151, + "step": 10748 + }, + { + "epoch": 0.6530773437025336, + "grad_norm": 0.19324637949466705, + "learning_rate": 2.7026686445172634e-05, + "loss": 1.0934, + "step": 10749 + }, + { + "epoch": 0.6531381007351601, + "grad_norm": 0.3077450096607208, + "learning_rate": 2.7018184375037913e-05, + "loss": 1.11, + "step": 10750 + }, + { + "epoch": 0.6531988577677866, + "grad_norm": 0.30469441413879395, + "learning_rate": 2.700968314730609e-05, + "loss": 1.1193, + "step": 10751 + }, + { + "epoch": 0.6532596148004132, + "grad_norm": 0.14849016070365906, + "learning_rate": 2.7001182762288774e-05, + "loss": 1.0774, + "step": 10752 + }, + { + "epoch": 0.6533203718330397, + "grad_norm": 0.18265512585639954, + "learning_rate": 2.699268322029755e-05, + "loss": 1.0336, + "step": 10753 + }, + { + "epoch": 0.6533811288656662, + "grad_norm": 0.3886958062648773, + "learning_rate": 2.6984184521643968e-05, + "loss": 1.3925, + "step": 10754 + }, + { + "epoch": 0.6534418858982928, + "grad_norm": 0.12576435506343842, + "learning_rate": 2.6975686666639556e-05, + "loss": 1.0325, + "step": 10755 + }, + { + "epoch": 0.6535026429309193, + "grad_norm": 0.1552131474018097, + "learning_rate": 2.696718965559577e-05, + "loss": 1.1016, + "step": 10756 + }, + { + "epoch": 0.6535633999635457, + "grad_norm": 0.15450161695480347, + "learning_rate": 2.6958693488824127e-05, + "loss": 1.1005, + "step": 10757 + }, + { + "epoch": 0.6536241569961723, + "grad_norm": 0.12544409930706024, + "learning_rate": 2.6950198166636042e-05, + "loss": 1.024, + "step": 10758 + }, + { + "epoch": 0.6536849140287988, + "grad_norm": 0.21194180846214294, + "learning_rate": 2.6941703689342885e-05, + "loss": 1.1391, + "step": 10759 + }, + { + "epoch": 0.6537456710614253, + "grad_norm": 0.18956564366817474, + "learning_rate": 2.6933210057256014e-05, + "loss": 0.9917, + "step": 10760 + }, + { + "epoch": 0.6538064280940519, + "grad_norm": 0.21792666614055634, + "learning_rate": 2.6924717270686807e-05, + "loss": 1.1133, + "step": 10761 + }, + { + "epoch": 0.6538671851266784, + "grad_norm": 0.12324512004852295, + "learning_rate": 2.691622532994655e-05, + "loss": 1.0287, + "step": 10762 + }, + { + "epoch": 0.6539279421593049, + "grad_norm": 0.21461355686187744, + "learning_rate": 2.6907734235346515e-05, + "loss": 1.1449, + "step": 10763 + }, + { + "epoch": 0.6539886991919315, + "grad_norm": 0.16204892098903656, + "learning_rate": 2.6899243987197963e-05, + "loss": 1.0116, + "step": 10764 + }, + { + "epoch": 0.654049456224558, + "grad_norm": 0.5267785787582397, + "learning_rate": 2.689075458581205e-05, + "loss": 1.2177, + "step": 10765 + }, + { + "epoch": 0.6541102132571845, + "grad_norm": 0.1675375998020172, + "learning_rate": 2.6882266031500016e-05, + "loss": 1.0919, + "step": 10766 + }, + { + "epoch": 0.6541709702898111, + "grad_norm": 0.27366146445274353, + "learning_rate": 2.6873778324572986e-05, + "loss": 1.0982, + "step": 10767 + }, + { + "epoch": 0.6542317273224376, + "grad_norm": 0.18844632804393768, + "learning_rate": 2.686529146534208e-05, + "loss": 1.0637, + "step": 10768 + }, + { + "epoch": 0.6542924843550642, + "grad_norm": 0.16073046624660492, + "learning_rate": 2.685680545411839e-05, + "loss": 1.0852, + "step": 10769 + }, + { + "epoch": 0.6543532413876906, + "grad_norm": 0.1605251580476761, + "learning_rate": 2.684832029121297e-05, + "loss": 1.103, + "step": 10770 + }, + { + "epoch": 0.6544139984203171, + "grad_norm": 0.12175038456916809, + "learning_rate": 2.6839835976936843e-05, + "loss": 1.0475, + "step": 10771 + }, + { + "epoch": 0.6544747554529436, + "grad_norm": 0.22805438935756683, + "learning_rate": 2.6831352511601006e-05, + "loss": 1.1575, + "step": 10772 + }, + { + "epoch": 0.6545355124855702, + "grad_norm": 0.16298699378967285, + "learning_rate": 2.6822869895516424e-05, + "loss": 1.0302, + "step": 10773 + }, + { + "epoch": 0.6545962695181967, + "grad_norm": 0.15765489637851715, + "learning_rate": 2.6814388128994023e-05, + "loss": 1.0899, + "step": 10774 + }, + { + "epoch": 0.6546570265508233, + "grad_norm": 0.17365390062332153, + "learning_rate": 2.6805907212344705e-05, + "loss": 1.1037, + "step": 10775 + }, + { + "epoch": 0.6547177835834498, + "grad_norm": 0.12098949402570724, + "learning_rate": 2.6797427145879327e-05, + "loss": 1.0278, + "step": 10776 + }, + { + "epoch": 0.6547785406160763, + "grad_norm": 0.13909979164600372, + "learning_rate": 2.6788947929908757e-05, + "loss": 1.0753, + "step": 10777 + }, + { + "epoch": 0.6548392976487029, + "grad_norm": 0.1650567203760147, + "learning_rate": 2.678046956474381e-05, + "loss": 1.0894, + "step": 10778 + }, + { + "epoch": 0.6549000546813294, + "grad_norm": 0.4109768271446228, + "learning_rate": 2.6771992050695215e-05, + "loss": 1.0635, + "step": 10779 + }, + { + "epoch": 0.6549608117139559, + "grad_norm": 0.12718398869037628, + "learning_rate": 2.6763515388073745e-05, + "loss": 1.0622, + "step": 10780 + }, + { + "epoch": 0.6550215687465825, + "grad_norm": 2.839046001434326, + "learning_rate": 2.6755039577190087e-05, + "loss": 1.1341, + "step": 10781 + }, + { + "epoch": 0.655082325779209, + "grad_norm": 0.3101520538330078, + "learning_rate": 2.6746564618354974e-05, + "loss": 1.0789, + "step": 10782 + }, + { + "epoch": 0.6551430828118354, + "grad_norm": 0.159837543964386, + "learning_rate": 2.6738090511879022e-05, + "loss": 1.0418, + "step": 10783 + }, + { + "epoch": 0.655203839844462, + "grad_norm": 1.0854791402816772, + "learning_rate": 2.6729617258072883e-05, + "loss": 1.0776, + "step": 10784 + }, + { + "epoch": 0.6552645968770885, + "grad_norm": 0.6836337447166443, + "learning_rate": 2.67211448572471e-05, + "loss": 1.1007, + "step": 10785 + }, + { + "epoch": 0.655325353909715, + "grad_norm": 0.31533119082450867, + "learning_rate": 2.6712673309712232e-05, + "loss": 1.1158, + "step": 10786 + }, + { + "epoch": 0.6553861109423416, + "grad_norm": 0.17787127196788788, + "learning_rate": 2.6704202615778844e-05, + "loss": 1.1277, + "step": 10787 + }, + { + "epoch": 0.6554468679749681, + "grad_norm": 0.16356894373893738, + "learning_rate": 2.669573277575741e-05, + "loss": 1.1824, + "step": 10788 + }, + { + "epoch": 0.6555076250075946, + "grad_norm": 0.23183858394622803, + "learning_rate": 2.6687263789958407e-05, + "loss": 1.0299, + "step": 10789 + }, + { + "epoch": 0.6555683820402212, + "grad_norm": 0.11725512892007828, + "learning_rate": 2.6678795658692247e-05, + "loss": 1.0734, + "step": 10790 + }, + { + "epoch": 0.6556291390728477, + "grad_norm": 0.4919125437736511, + "learning_rate": 2.6670328382269338e-05, + "loss": 0.9961, + "step": 10791 + }, + { + "epoch": 0.6556898961054742, + "grad_norm": 0.185536190867424, + "learning_rate": 2.6661861961000058e-05, + "loss": 1.1405, + "step": 10792 + }, + { + "epoch": 0.6557506531381008, + "grad_norm": 0.1605636030435562, + "learning_rate": 2.6653396395194728e-05, + "loss": 1.1675, + "step": 10793 + }, + { + "epoch": 0.6558114101707273, + "grad_norm": 0.1225455105304718, + "learning_rate": 2.6644931685163678e-05, + "loss": 1.0777, + "step": 10794 + }, + { + "epoch": 0.6558721672033538, + "grad_norm": 0.13150855898857117, + "learning_rate": 2.663646783121716e-05, + "loss": 1.0669, + "step": 10795 + }, + { + "epoch": 0.6559329242359803, + "grad_norm": 0.9739035964012146, + "learning_rate": 2.6628004833665443e-05, + "loss": 1.1015, + "step": 10796 + }, + { + "epoch": 0.6559936812686068, + "grad_norm": 0.10610534995794296, + "learning_rate": 2.66195426928187e-05, + "loss": 1.0493, + "step": 10797 + }, + { + "epoch": 0.6560544383012333, + "grad_norm": 0.20631539821624756, + "learning_rate": 2.6611081408987177e-05, + "loss": 1.2029, + "step": 10798 + }, + { + "epoch": 0.6561151953338599, + "grad_norm": 0.10467001795768738, + "learning_rate": 2.660262098248097e-05, + "loss": 1.0358, + "step": 10799 + }, + { + "epoch": 0.6561759523664864, + "grad_norm": 0.1426679641008377, + "learning_rate": 2.6594161413610207e-05, + "loss": 1.041, + "step": 10800 + }, + { + "epoch": 0.6562367093991129, + "grad_norm": 0.16207429766654968, + "learning_rate": 2.6585702702684988e-05, + "loss": 1.0635, + "step": 10801 + }, + { + "epoch": 0.6562974664317395, + "grad_norm": 0.17047178745269775, + "learning_rate": 2.6577244850015337e-05, + "loss": 1.0086, + "step": 10802 + }, + { + "epoch": 0.656358223464366, + "grad_norm": 0.1677495539188385, + "learning_rate": 2.656878785591133e-05, + "loss": 1.0827, + "step": 10803 + }, + { + "epoch": 0.6564189804969925, + "grad_norm": 0.19676806032657623, + "learning_rate": 2.6560331720682952e-05, + "loss": 1.1453, + "step": 10804 + }, + { + "epoch": 0.6564797375296191, + "grad_norm": 0.12434057891368866, + "learning_rate": 2.6551876444640123e-05, + "loss": 1.0852, + "step": 10805 + }, + { + "epoch": 0.6565404945622456, + "grad_norm": 0.130045548081398, + "learning_rate": 2.6543422028092797e-05, + "loss": 1.0049, + "step": 10806 + }, + { + "epoch": 0.6566012515948721, + "grad_norm": 0.31346607208251953, + "learning_rate": 2.6534968471350852e-05, + "loss": 1.0867, + "step": 10807 + }, + { + "epoch": 0.6566620086274987, + "grad_norm": 0.15215517580509186, + "learning_rate": 2.6526515774724193e-05, + "loss": 1.0468, + "step": 10808 + }, + { + "epoch": 0.6567227656601251, + "grad_norm": 0.15309739112854004, + "learning_rate": 2.651806393852263e-05, + "loss": 1.0681, + "step": 10809 + }, + { + "epoch": 0.6567835226927516, + "grad_norm": 0.12594451010227203, + "learning_rate": 2.6509612963055974e-05, + "loss": 1.0111, + "step": 10810 + }, + { + "epoch": 0.6568442797253782, + "grad_norm": 0.1614355444908142, + "learning_rate": 2.650116284863402e-05, + "loss": 1.0984, + "step": 10811 + }, + { + "epoch": 0.6569050367580047, + "grad_norm": 0.13417722284793854, + "learning_rate": 2.6492713595566443e-05, + "loss": 1.0524, + "step": 10812 + }, + { + "epoch": 0.6569657937906312, + "grad_norm": 0.13377425074577332, + "learning_rate": 2.6484265204163005e-05, + "loss": 1.0751, + "step": 10813 + }, + { + "epoch": 0.6570265508232578, + "grad_norm": 0.14764824509620667, + "learning_rate": 2.6475817674733383e-05, + "loss": 1.075, + "step": 10814 + }, + { + "epoch": 0.6570873078558843, + "grad_norm": 0.11233925819396973, + "learning_rate": 2.6467371007587203e-05, + "loss": 1.0509, + "step": 10815 + }, + { + "epoch": 0.6571480648885109, + "grad_norm": 0.13913224637508392, + "learning_rate": 2.645892520303409e-05, + "loss": 1.0815, + "step": 10816 + }, + { + "epoch": 0.6572088219211374, + "grad_norm": 0.6653662919998169, + "learning_rate": 2.6450480261383626e-05, + "loss": 1.2268, + "step": 10817 + }, + { + "epoch": 0.6572695789537639, + "grad_norm": 0.24871478974819183, + "learning_rate": 2.644203618294536e-05, + "loss": 1.1791, + "step": 10818 + }, + { + "epoch": 0.6573303359863905, + "grad_norm": 0.14004692435264587, + "learning_rate": 2.6433592968028807e-05, + "loss": 1.0971, + "step": 10819 + }, + { + "epoch": 0.657391093019017, + "grad_norm": 0.12995389103889465, + "learning_rate": 2.6425150616943463e-05, + "loss": 1.0689, + "step": 10820 + }, + { + "epoch": 0.6574518500516435, + "grad_norm": 0.16400907933712006, + "learning_rate": 2.6416709129998783e-05, + "loss": 1.1193, + "step": 10821 + }, + { + "epoch": 0.65751260708427, + "grad_norm": 0.19335129857063293, + "learning_rate": 2.6408268507504186e-05, + "loss": 1.1282, + "step": 10822 + }, + { + "epoch": 0.6575733641168965, + "grad_norm": 0.14659562706947327, + "learning_rate": 2.639982874976904e-05, + "loss": 1.0798, + "step": 10823 + }, + { + "epoch": 0.657634121149523, + "grad_norm": 0.20350515842437744, + "learning_rate": 2.6391389857102762e-05, + "loss": 1.0526, + "step": 10824 + }, + { + "epoch": 0.6576948781821496, + "grad_norm": 0.14672085642814636, + "learning_rate": 2.638295182981467e-05, + "loss": 1.0872, + "step": 10825 + }, + { + "epoch": 0.6577556352147761, + "grad_norm": 0.19582423567771912, + "learning_rate": 2.6374514668214024e-05, + "loss": 1.0811, + "step": 10826 + }, + { + "epoch": 0.6578163922474026, + "grad_norm": 0.11572594195604324, + "learning_rate": 2.6366078372610103e-05, + "loss": 1.0072, + "step": 10827 + }, + { + "epoch": 0.6578771492800292, + "grad_norm": 0.2721400260925293, + "learning_rate": 2.6357642943312134e-05, + "loss": 1.0856, + "step": 10828 + }, + { + "epoch": 0.6579379063126557, + "grad_norm": 0.217032328248024, + "learning_rate": 2.6349208380629353e-05, + "loss": 1.1815, + "step": 10829 + }, + { + "epoch": 0.6579986633452822, + "grad_norm": 0.16566398739814758, + "learning_rate": 2.6340774684870904e-05, + "loss": 1.1236, + "step": 10830 + }, + { + "epoch": 0.6580594203779088, + "grad_norm": 0.21326296031475067, + "learning_rate": 2.633234185634596e-05, + "loss": 1.0908, + "step": 10831 + }, + { + "epoch": 0.6581201774105353, + "grad_norm": 0.12842196226119995, + "learning_rate": 2.6323909895363575e-05, + "loss": 1.0734, + "step": 10832 + }, + { + "epoch": 0.6581809344431618, + "grad_norm": 0.18694041669368744, + "learning_rate": 2.6315478802232835e-05, + "loss": 1.1869, + "step": 10833 + }, + { + "epoch": 0.6582416914757884, + "grad_norm": 0.13833561539649963, + "learning_rate": 2.6307048577262815e-05, + "loss": 1.0638, + "step": 10834 + }, + { + "epoch": 0.6583024485084148, + "grad_norm": 0.15762682259082794, + "learning_rate": 2.629861922076251e-05, + "loss": 1.0498, + "step": 10835 + }, + { + "epoch": 0.6583632055410413, + "grad_norm": 0.1437039077281952, + "learning_rate": 2.6290190733040897e-05, + "loss": 1.0247, + "step": 10836 + }, + { + "epoch": 0.6584239625736679, + "grad_norm": 0.23634670674800873, + "learning_rate": 2.6281763114406923e-05, + "loss": 1.1219, + "step": 10837 + }, + { + "epoch": 0.6584847196062944, + "grad_norm": 0.13402417302131653, + "learning_rate": 2.627333636516951e-05, + "loss": 1.0591, + "step": 10838 + }, + { + "epoch": 0.6585454766389209, + "grad_norm": 0.15598180890083313, + "learning_rate": 2.626491048563754e-05, + "loss": 1.0986, + "step": 10839 + }, + { + "epoch": 0.6586062336715475, + "grad_norm": 0.12586364150047302, + "learning_rate": 2.6256485476119863e-05, + "loss": 1.0323, + "step": 10840 + }, + { + "epoch": 0.658666990704174, + "grad_norm": 0.9201759696006775, + "learning_rate": 2.6248061336925302e-05, + "loss": 1.1661, + "step": 10841 + }, + { + "epoch": 0.6587277477368005, + "grad_norm": 0.09927939623594284, + "learning_rate": 2.6239638068362636e-05, + "loss": 1.0428, + "step": 10842 + }, + { + "epoch": 0.6587885047694271, + "grad_norm": 0.16602684557437897, + "learning_rate": 2.623121567074064e-05, + "loss": 1.0052, + "step": 10843 + }, + { + "epoch": 0.6588492618020536, + "grad_norm": 0.22177504003047943, + "learning_rate": 2.6222794144368e-05, + "loss": 1.1179, + "step": 10844 + }, + { + "epoch": 0.6589100188346801, + "grad_norm": 0.13188055157661438, + "learning_rate": 2.621437348955348e-05, + "loss": 1.0621, + "step": 10845 + }, + { + "epoch": 0.6589707758673067, + "grad_norm": 0.12681160867214203, + "learning_rate": 2.620595370660568e-05, + "loss": 1.0535, + "step": 10846 + }, + { + "epoch": 0.6590315328999332, + "grad_norm": 0.162693053483963, + "learning_rate": 2.619753479583324e-05, + "loss": 1.0399, + "step": 10847 + }, + { + "epoch": 0.6590922899325596, + "grad_norm": 0.13260555267333984, + "learning_rate": 2.6189116757544764e-05, + "loss": 1.0678, + "step": 10848 + }, + { + "epoch": 0.6591530469651862, + "grad_norm": 0.16105875372886658, + "learning_rate": 2.6180699592048795e-05, + "loss": 1.0822, + "step": 10849 + }, + { + "epoch": 0.6592138039978127, + "grad_norm": 0.20426367223262787, + "learning_rate": 2.617228329965391e-05, + "loss": 1.1188, + "step": 10850 + }, + { + "epoch": 0.6592745610304392, + "grad_norm": 0.14040818810462952, + "learning_rate": 2.6163867880668614e-05, + "loss": 1.0195, + "step": 10851 + }, + { + "epoch": 0.6593353180630658, + "grad_norm": 0.2034810185432434, + "learning_rate": 2.6155453335401315e-05, + "loss": 1.1623, + "step": 10852 + }, + { + "epoch": 0.6593960750956923, + "grad_norm": 0.14832821488380432, + "learning_rate": 2.6147039664160488e-05, + "loss": 1.0944, + "step": 10853 + }, + { + "epoch": 0.6594568321283188, + "grad_norm": 0.15600016713142395, + "learning_rate": 2.6138626867254512e-05, + "loss": 1.0418, + "step": 10854 + }, + { + "epoch": 0.6595175891609454, + "grad_norm": 0.11755774170160294, + "learning_rate": 2.6130214944991804e-05, + "loss": 1.0597, + "step": 10855 + }, + { + "epoch": 0.6595783461935719, + "grad_norm": 0.14025630056858063, + "learning_rate": 2.612180389768068e-05, + "loss": 1.0553, + "step": 10856 + }, + { + "epoch": 0.6596391032261985, + "grad_norm": 0.15888909995555878, + "learning_rate": 2.6113393725629447e-05, + "loss": 1.0819, + "step": 10857 + }, + { + "epoch": 0.659699860258825, + "grad_norm": 2.1332199573516846, + "learning_rate": 2.6104984429146384e-05, + "loss": 1.1181, + "step": 10858 + }, + { + "epoch": 0.6597606172914515, + "grad_norm": 6.67297887802124, + "learning_rate": 2.609657600853974e-05, + "loss": 1.1161, + "step": 10859 + }, + { + "epoch": 0.6598213743240781, + "grad_norm": 0.16308479011058807, + "learning_rate": 2.608816846411772e-05, + "loss": 1.1502, + "step": 10860 + }, + { + "epoch": 0.6598821313567046, + "grad_norm": 0.12557394802570343, + "learning_rate": 2.6079761796188508e-05, + "loss": 1.0559, + "step": 10861 + }, + { + "epoch": 0.659942888389331, + "grad_norm": 0.4175530672073364, + "learning_rate": 2.6071356005060248e-05, + "loss": 1.0965, + "step": 10862 + }, + { + "epoch": 0.6600036454219576, + "grad_norm": 0.111270472407341, + "learning_rate": 2.606295109104106e-05, + "loss": 1.0279, + "step": 10863 + }, + { + "epoch": 0.6600644024545841, + "grad_norm": 0.11335918307304382, + "learning_rate": 2.6054547054439026e-05, + "loss": 1.0291, + "step": 10864 + }, + { + "epoch": 0.6601251594872106, + "grad_norm": 0.15028226375579834, + "learning_rate": 2.60461438955622e-05, + "loss": 1.0169, + "step": 10865 + }, + { + "epoch": 0.6601859165198372, + "grad_norm": 0.12362881749868393, + "learning_rate": 2.6037741614718602e-05, + "loss": 1.0731, + "step": 10866 + }, + { + "epoch": 0.6602466735524637, + "grad_norm": 0.1826833039522171, + "learning_rate": 2.602934021221621e-05, + "loss": 1.1245, + "step": 10867 + }, + { + "epoch": 0.6603074305850902, + "grad_norm": 0.15229396522045135, + "learning_rate": 2.6020939688362995e-05, + "loss": 1.1351, + "step": 10868 + }, + { + "epoch": 0.6603681876177168, + "grad_norm": 0.5797024965286255, + "learning_rate": 2.601254004346685e-05, + "loss": 1.4027, + "step": 10869 + }, + { + "epoch": 0.6604289446503433, + "grad_norm": 0.3920063078403473, + "learning_rate": 2.6004141277835703e-05, + "loss": 1.154, + "step": 10870 + }, + { + "epoch": 0.6604897016829698, + "grad_norm": 0.12447533756494522, + "learning_rate": 2.599574339177741e-05, + "loss": 1.0448, + "step": 10871 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 0.16003093123435974, + "learning_rate": 2.598734638559977e-05, + "loss": 1.0382, + "step": 10872 + }, + { + "epoch": 0.6606112157482229, + "grad_norm": 0.17947323620319366, + "learning_rate": 2.597895025961059e-05, + "loss": 1.1228, + "step": 10873 + }, + { + "epoch": 0.6606719727808494, + "grad_norm": 0.22580967843532562, + "learning_rate": 2.5970555014117605e-05, + "loss": 1.2768, + "step": 10874 + }, + { + "epoch": 0.6607327298134759, + "grad_norm": 0.496307909488678, + "learning_rate": 2.59621606494286e-05, + "loss": 1.0932, + "step": 10875 + }, + { + "epoch": 0.6607934868461024, + "grad_norm": 0.1197834238409996, + "learning_rate": 2.5953767165851238e-05, + "loss": 1.0422, + "step": 10876 + }, + { + "epoch": 0.6608542438787289, + "grad_norm": 0.15819180011749268, + "learning_rate": 2.5945374563693185e-05, + "loss": 1.0388, + "step": 10877 + }, + { + "epoch": 0.6609150009113555, + "grad_norm": 0.19857493042945862, + "learning_rate": 2.5936982843262102e-05, + "loss": 1.0388, + "step": 10878 + }, + { + "epoch": 0.660975757943982, + "grad_norm": 0.11137481033802032, + "learning_rate": 2.5928592004865526e-05, + "loss": 1.0291, + "step": 10879 + }, + { + "epoch": 0.6610365149766085, + "grad_norm": 0.29282453656196594, + "learning_rate": 2.5920202048811075e-05, + "loss": 1.0689, + "step": 10880 + }, + { + "epoch": 0.6610972720092351, + "grad_norm": 0.17970162630081177, + "learning_rate": 2.591181297540628e-05, + "loss": 1.1914, + "step": 10881 + }, + { + "epoch": 0.6611580290418616, + "grad_norm": 0.1269974559545517, + "learning_rate": 2.590342478495863e-05, + "loss": 1.053, + "step": 10882 + }, + { + "epoch": 0.6612187860744881, + "grad_norm": 0.13030850887298584, + "learning_rate": 2.589503747777561e-05, + "loss": 1.0744, + "step": 10883 + }, + { + "epoch": 0.6612795431071147, + "grad_norm": 0.13220517337322235, + "learning_rate": 2.5886651054164647e-05, + "loss": 1.0142, + "step": 10884 + }, + { + "epoch": 0.6613403001397412, + "grad_norm": 0.14280858635902405, + "learning_rate": 2.5878265514433148e-05, + "loss": 1.1011, + "step": 10885 + }, + { + "epoch": 0.6614010571723677, + "grad_norm": 0.15455102920532227, + "learning_rate": 2.586988085888849e-05, + "loss": 1.1168, + "step": 10886 + }, + { + "epoch": 0.6614618142049943, + "grad_norm": 0.13458912074565887, + "learning_rate": 2.5861497087838016e-05, + "loss": 0.9959, + "step": 10887 + }, + { + "epoch": 0.6615225712376207, + "grad_norm": 0.22777879238128662, + "learning_rate": 2.585311420158903e-05, + "loss": 1.0927, + "step": 10888 + }, + { + "epoch": 0.6615833282702472, + "grad_norm": 0.18483905494213104, + "learning_rate": 2.5844732200448813e-05, + "loss": 1.4205, + "step": 10889 + }, + { + "epoch": 0.6616440853028738, + "grad_norm": 0.23490746319293976, + "learning_rate": 2.5836351084724587e-05, + "loss": 1.056, + "step": 10890 + }, + { + "epoch": 0.6617048423355003, + "grad_norm": 0.23946048319339752, + "learning_rate": 2.5827970854723615e-05, + "loss": 1.2313, + "step": 10891 + }, + { + "epoch": 0.6617655993681268, + "grad_norm": 0.21814477443695068, + "learning_rate": 2.581959151075303e-05, + "loss": 1.1219, + "step": 10892 + }, + { + "epoch": 0.6618263564007534, + "grad_norm": 0.15023307502269745, + "learning_rate": 2.581121305311999e-05, + "loss": 1.0767, + "step": 10893 + }, + { + "epoch": 0.6618871134333799, + "grad_norm": 0.4145217835903168, + "learning_rate": 2.5802835482131616e-05, + "loss": 1.0436, + "step": 10894 + }, + { + "epoch": 0.6619478704660064, + "grad_norm": 0.21603821218013763, + "learning_rate": 2.5794458798094955e-05, + "loss": 1.0727, + "step": 10895 + }, + { + "epoch": 0.662008627498633, + "grad_norm": 0.13193009793758392, + "learning_rate": 2.578608300131712e-05, + "loss": 1.0198, + "step": 10896 + }, + { + "epoch": 0.6620693845312595, + "grad_norm": 0.1871628314256668, + "learning_rate": 2.577770809210508e-05, + "loss": 1.1044, + "step": 10897 + }, + { + "epoch": 0.662130141563886, + "grad_norm": 0.1724691241979599, + "learning_rate": 2.5769334070765856e-05, + "loss": 1.0182, + "step": 10898 + }, + { + "epoch": 0.6621908985965126, + "grad_norm": 0.1359548419713974, + "learning_rate": 2.5760960937606353e-05, + "loss": 1.0623, + "step": 10899 + }, + { + "epoch": 0.6622516556291391, + "grad_norm": 0.20106449723243713, + "learning_rate": 2.57525886929335e-05, + "loss": 1.2013, + "step": 10900 + }, + { + "epoch": 0.6623124126617655, + "grad_norm": 0.2684769928455353, + "learning_rate": 2.574421733705421e-05, + "loss": 1.0162, + "step": 10901 + }, + { + "epoch": 0.6623731696943921, + "grad_norm": 0.12225805222988129, + "learning_rate": 2.5735846870275322e-05, + "loss": 0.9762, + "step": 10902 + }, + { + "epoch": 0.6624339267270186, + "grad_norm": 0.1472126990556717, + "learning_rate": 2.5727477292903655e-05, + "loss": 1.1052, + "step": 10903 + }, + { + "epoch": 0.6624946837596452, + "grad_norm": 2.2278921604156494, + "learning_rate": 2.5719108605246e-05, + "loss": 1.0728, + "step": 10904 + }, + { + "epoch": 0.6625554407922717, + "grad_norm": 0.2266005277633667, + "learning_rate": 2.5710740807609117e-05, + "loss": 1.1619, + "step": 10905 + }, + { + "epoch": 0.6626161978248982, + "grad_norm": 0.998564600944519, + "learning_rate": 2.5702373900299725e-05, + "loss": 1.3294, + "step": 10906 + }, + { + "epoch": 0.6626769548575248, + "grad_norm": 0.15773192048072815, + "learning_rate": 2.569400788362451e-05, + "loss": 1.0401, + "step": 10907 + }, + { + "epoch": 0.6627377118901513, + "grad_norm": 0.11532074213027954, + "learning_rate": 2.5685642757890138e-05, + "loss": 0.9735, + "step": 10908 + }, + { + "epoch": 0.6627984689227778, + "grad_norm": 4.395709991455078, + "learning_rate": 2.5677278523403226e-05, + "loss": 1.1131, + "step": 10909 + }, + { + "epoch": 0.6628592259554044, + "grad_norm": 0.2470182329416275, + "learning_rate": 2.5668915180470376e-05, + "loss": 1.1702, + "step": 10910 + }, + { + "epoch": 0.6629199829880309, + "grad_norm": 0.17347879707813263, + "learning_rate": 2.5660552729398118e-05, + "loss": 1.1674, + "step": 10911 + }, + { + "epoch": 0.6629807400206574, + "grad_norm": 0.20448634028434753, + "learning_rate": 2.5652191170493045e-05, + "loss": 1.1145, + "step": 10912 + }, + { + "epoch": 0.663041497053284, + "grad_norm": 0.13824328780174255, + "learning_rate": 2.5643830504061594e-05, + "loss": 1.0981, + "step": 10913 + }, + { + "epoch": 0.6631022540859104, + "grad_norm": 0.2224777191877365, + "learning_rate": 2.5635470730410243e-05, + "loss": 1.0506, + "step": 10914 + }, + { + "epoch": 0.6631630111185369, + "grad_norm": 0.17914369702339172, + "learning_rate": 2.562711184984542e-05, + "loss": 1.0237, + "step": 10915 + }, + { + "epoch": 0.6632237681511635, + "grad_norm": 0.16801179945468903, + "learning_rate": 2.5618753862673506e-05, + "loss": 1.1143, + "step": 10916 + }, + { + "epoch": 0.66328452518379, + "grad_norm": 0.13636700809001923, + "learning_rate": 2.5610396769200906e-05, + "loss": 1.0748, + "step": 10917 + }, + { + "epoch": 0.6633452822164165, + "grad_norm": 0.17905090749263763, + "learning_rate": 2.5602040569733936e-05, + "loss": 1.1598, + "step": 10918 + }, + { + "epoch": 0.6634060392490431, + "grad_norm": 0.14596080780029297, + "learning_rate": 2.559368526457887e-05, + "loss": 1.0201, + "step": 10919 + }, + { + "epoch": 0.6634667962816696, + "grad_norm": 0.31742027401924133, + "learning_rate": 2.5585330854041988e-05, + "loss": 1.0801, + "step": 10920 + }, + { + "epoch": 0.6635275533142961, + "grad_norm": 0.33845844864845276, + "learning_rate": 2.557697733842951e-05, + "loss": 1.1417, + "step": 10921 + }, + { + "epoch": 0.6635883103469227, + "grad_norm": 0.12948045134544373, + "learning_rate": 2.556862471804766e-05, + "loss": 1.0289, + "step": 10922 + }, + { + "epoch": 0.6636490673795492, + "grad_norm": 0.15239079296588898, + "learning_rate": 2.5560272993202594e-05, + "loss": 1.0488, + "step": 10923 + }, + { + "epoch": 0.6637098244121757, + "grad_norm": 0.3333180844783783, + "learning_rate": 2.555192216420045e-05, + "loss": 1.2639, + "step": 10924 + }, + { + "epoch": 0.6637705814448023, + "grad_norm": 0.13333414494991302, + "learning_rate": 2.5543572231347345e-05, + "loss": 1.066, + "step": 10925 + }, + { + "epoch": 0.6638313384774288, + "grad_norm": 0.2191387116909027, + "learning_rate": 2.553522319494928e-05, + "loss": 1.0223, + "step": 10926 + }, + { + "epoch": 0.6638920955100552, + "grad_norm": 0.10697758942842484, + "learning_rate": 2.552687505531236e-05, + "loss": 1.0463, + "step": 10927 + }, + { + "epoch": 0.6639528525426818, + "grad_norm": 0.12608550488948822, + "learning_rate": 2.551852781274257e-05, + "loss": 1.039, + "step": 10928 + }, + { + "epoch": 0.6640136095753083, + "grad_norm": 0.16342447698116302, + "learning_rate": 2.5510181467545873e-05, + "loss": 1.0519, + "step": 10929 + }, + { + "epoch": 0.6640743666079348, + "grad_norm": 0.16439864039421082, + "learning_rate": 2.5501836020028208e-05, + "loss": 1.0758, + "step": 10930 + }, + { + "epoch": 0.6641351236405614, + "grad_norm": 0.1616610437631607, + "learning_rate": 2.549349147049548e-05, + "loss": 1.0354, + "step": 10931 + }, + { + "epoch": 0.6641958806731879, + "grad_norm": 0.17169110476970673, + "learning_rate": 2.5485147819253556e-05, + "loss": 1.0739, + "step": 10932 + }, + { + "epoch": 0.6642566377058144, + "grad_norm": 0.11216206848621368, + "learning_rate": 2.547680506660828e-05, + "loss": 1.0598, + "step": 10933 + }, + { + "epoch": 0.664317394738441, + "grad_norm": 0.265135258436203, + "learning_rate": 2.5468463212865455e-05, + "loss": 1.21, + "step": 10934 + }, + { + "epoch": 0.6643781517710675, + "grad_norm": 0.1278349906206131, + "learning_rate": 2.5460122258330853e-05, + "loss": 1.0443, + "step": 10935 + }, + { + "epoch": 0.664438908803694, + "grad_norm": 0.1615898460149765, + "learning_rate": 2.5451782203310208e-05, + "loss": 1.1094, + "step": 10936 + }, + { + "epoch": 0.6644996658363206, + "grad_norm": 0.1697765439748764, + "learning_rate": 2.5443443048109217e-05, + "loss": 1.0075, + "step": 10937 + }, + { + "epoch": 0.6645604228689471, + "grad_norm": 0.2195979505777359, + "learning_rate": 2.5435104793033605e-05, + "loss": 1.0721, + "step": 10938 + }, + { + "epoch": 0.6646211799015737, + "grad_norm": 0.25098177790641785, + "learning_rate": 2.5426767438388954e-05, + "loss": 1.1371, + "step": 10939 + }, + { + "epoch": 0.6646819369342001, + "grad_norm": 0.3639383316040039, + "learning_rate": 2.54184309844809e-05, + "loss": 1.1418, + "step": 10940 + }, + { + "epoch": 0.6647426939668266, + "grad_norm": 0.18113672733306885, + "learning_rate": 2.5410095431615e-05, + "loss": 1.1016, + "step": 10941 + }, + { + "epoch": 0.6648034509994531, + "grad_norm": 0.1271345317363739, + "learning_rate": 2.5401760780096794e-05, + "loss": 1.036, + "step": 10942 + }, + { + "epoch": 0.6648642080320797, + "grad_norm": 0.19353890419006348, + "learning_rate": 2.5393427030231816e-05, + "loss": 1.1762, + "step": 10943 + }, + { + "epoch": 0.6649249650647062, + "grad_norm": 0.14769570529460907, + "learning_rate": 2.5385094182325525e-05, + "loss": 1.1017, + "step": 10944 + }, + { + "epoch": 0.6649857220973328, + "grad_norm": 0.13683219254016876, + "learning_rate": 2.537676223668338e-05, + "loss": 1.052, + "step": 10945 + }, + { + "epoch": 0.6650464791299593, + "grad_norm": 0.10853105783462524, + "learning_rate": 2.5368431193610763e-05, + "loss": 1.0403, + "step": 10946 + }, + { + "epoch": 0.6651072361625858, + "grad_norm": 0.18989138305187225, + "learning_rate": 2.5360101053413045e-05, + "loss": 1.078, + "step": 10947 + }, + { + "epoch": 0.6651679931952124, + "grad_norm": 0.12131781131029129, + "learning_rate": 2.5351771816395597e-05, + "loss": 1.0689, + "step": 10948 + }, + { + "epoch": 0.6652287502278389, + "grad_norm": 0.11907931417226791, + "learning_rate": 2.5343443482863728e-05, + "loss": 1.0483, + "step": 10949 + }, + { + "epoch": 0.6652895072604654, + "grad_norm": 0.10854718089103699, + "learning_rate": 2.53351160531227e-05, + "loss": 0.9932, + "step": 10950 + }, + { + "epoch": 0.665350264293092, + "grad_norm": 0.28004562854766846, + "learning_rate": 2.5326789527477756e-05, + "loss": 1.1395, + "step": 10951 + }, + { + "epoch": 0.6654110213257185, + "grad_norm": 0.20339399576187134, + "learning_rate": 2.5318463906234113e-05, + "loss": 1.1368, + "step": 10952 + }, + { + "epoch": 0.6654717783583449, + "grad_norm": 0.37540802359580994, + "learning_rate": 2.5310139189696945e-05, + "loss": 1.2658, + "step": 10953 + }, + { + "epoch": 0.6655325353909715, + "grad_norm": 0.13918866217136383, + "learning_rate": 2.53018153781714e-05, + "loss": 0.9907, + "step": 10954 + }, + { + "epoch": 0.665593292423598, + "grad_norm": 0.3053690195083618, + "learning_rate": 2.5293492471962587e-05, + "loss": 1.1532, + "step": 10955 + }, + { + "epoch": 0.6656540494562245, + "grad_norm": 0.24435748159885406, + "learning_rate": 2.528517047137558e-05, + "loss": 1.1165, + "step": 10956 + }, + { + "epoch": 0.6657148064888511, + "grad_norm": 0.2918207347393036, + "learning_rate": 2.527684937671543e-05, + "loss": 1.2466, + "step": 10957 + }, + { + "epoch": 0.6657755635214776, + "grad_norm": 0.10876744985580444, + "learning_rate": 2.5268529188287137e-05, + "loss": 1.0431, + "step": 10958 + }, + { + "epoch": 0.6658363205541041, + "grad_norm": 3.3927807807922363, + "learning_rate": 2.52602099063957e-05, + "loss": 1.0761, + "step": 10959 + }, + { + "epoch": 0.6658970775867307, + "grad_norm": 0.20972682535648346, + "learning_rate": 2.5251891531346038e-05, + "loss": 1.0863, + "step": 10960 + }, + { + "epoch": 0.6659578346193572, + "grad_norm": 0.2238680124282837, + "learning_rate": 2.524357406344309e-05, + "loss": 1.0027, + "step": 10961 + }, + { + "epoch": 0.6660185916519837, + "grad_norm": 1.1363712549209595, + "learning_rate": 2.523525750299169e-05, + "loss": 1.1623, + "step": 10962 + }, + { + "epoch": 0.6660793486846103, + "grad_norm": 0.13125662505626678, + "learning_rate": 2.5226941850296737e-05, + "loss": 1.1134, + "step": 10963 + }, + { + "epoch": 0.6661401057172368, + "grad_norm": 0.10065761208534241, + "learning_rate": 2.5218627105663017e-05, + "loss": 1.0206, + "step": 10964 + }, + { + "epoch": 0.6662008627498633, + "grad_norm": 0.16558073461055756, + "learning_rate": 2.5210313269395326e-05, + "loss": 1.0784, + "step": 10965 + }, + { + "epoch": 0.6662616197824899, + "grad_norm": 0.16034385561943054, + "learning_rate": 2.520200034179838e-05, + "loss": 1.0932, + "step": 10966 + }, + { + "epoch": 0.6663223768151163, + "grad_norm": 0.13057871162891388, + "learning_rate": 2.5193688323176882e-05, + "loss": 1.0698, + "step": 10967 + }, + { + "epoch": 0.6663831338477428, + "grad_norm": 0.10659851133823395, + "learning_rate": 2.518537721383556e-05, + "loss": 1.0417, + "step": 10968 + }, + { + "epoch": 0.6664438908803694, + "grad_norm": 0.1049768477678299, + "learning_rate": 2.5177067014079026e-05, + "loss": 1.0249, + "step": 10969 + }, + { + "epoch": 0.6665046479129959, + "grad_norm": 0.1295790672302246, + "learning_rate": 2.51687577242119e-05, + "loss": 1.0233, + "step": 10970 + }, + { + "epoch": 0.6665654049456224, + "grad_norm": 0.140318363904953, + "learning_rate": 2.516044934453876e-05, + "loss": 1.0856, + "step": 10971 + }, + { + "epoch": 0.666626161978249, + "grad_norm": 0.10695676505565643, + "learning_rate": 2.5152141875364148e-05, + "loss": 1.0294, + "step": 10972 + }, + { + "epoch": 0.6666869190108755, + "grad_norm": 0.14935553073883057, + "learning_rate": 2.514383531699258e-05, + "loss": 1.0153, + "step": 10973 + }, + { + "epoch": 0.666747676043502, + "grad_norm": 0.15685604512691498, + "learning_rate": 2.5135529669728537e-05, + "loss": 1.1119, + "step": 10974 + }, + { + "epoch": 0.6668084330761286, + "grad_norm": 0.13498510420322418, + "learning_rate": 2.512722493387646e-05, + "loss": 1.0648, + "step": 10975 + }, + { + "epoch": 0.6668691901087551, + "grad_norm": 0.1312147080898285, + "learning_rate": 2.5118921109740757e-05, + "loss": 1.0833, + "step": 10976 + }, + { + "epoch": 0.6669299471413817, + "grad_norm": 0.09532418102025986, + "learning_rate": 2.5110618197625806e-05, + "loss": 1.0211, + "step": 10977 + }, + { + "epoch": 0.6669907041740082, + "grad_norm": 0.126339852809906, + "learning_rate": 2.510231619783594e-05, + "loss": 1.0884, + "step": 10978 + }, + { + "epoch": 0.6670514612066347, + "grad_norm": 0.14915476739406586, + "learning_rate": 2.509401511067553e-05, + "loss": 1.0718, + "step": 10979 + }, + { + "epoch": 0.6671122182392611, + "grad_norm": 0.5402998924255371, + "learning_rate": 2.508571493644879e-05, + "loss": 1.0478, + "step": 10980 + }, + { + "epoch": 0.6671729752718877, + "grad_norm": 0.3424145579338074, + "learning_rate": 2.5077415675459982e-05, + "loss": 1.0312, + "step": 10981 + }, + { + "epoch": 0.6672337323045142, + "grad_norm": 0.3049122393131256, + "learning_rate": 2.506911732801332e-05, + "loss": 1.1784, + "step": 10982 + }, + { + "epoch": 0.6672944893371408, + "grad_norm": 0.25699368119239807, + "learning_rate": 2.5060819894412963e-05, + "loss": 1.1183, + "step": 10983 + }, + { + "epoch": 0.6673552463697673, + "grad_norm": 1.1047624349594116, + "learning_rate": 2.505252337496309e-05, + "loss": 1.0959, + "step": 10984 + }, + { + "epoch": 0.6674160034023938, + "grad_norm": 0.1271514594554901, + "learning_rate": 2.504422776996782e-05, + "loss": 1.053, + "step": 10985 + }, + { + "epoch": 0.6674767604350204, + "grad_norm": 0.17269587516784668, + "learning_rate": 2.5035933079731176e-05, + "loss": 1.0488, + "step": 10986 + }, + { + "epoch": 0.6675375174676469, + "grad_norm": 0.21040619909763336, + "learning_rate": 2.5027639304557236e-05, + "loss": 1.1364, + "step": 10987 + }, + { + "epoch": 0.6675982745002734, + "grad_norm": 0.1320820450782776, + "learning_rate": 2.5019346444749992e-05, + "loss": 1.0364, + "step": 10988 + }, + { + "epoch": 0.6676590315329, + "grad_norm": 0.11433776468038559, + "learning_rate": 2.5011054500613446e-05, + "loss": 1.0156, + "step": 10989 + }, + { + "epoch": 0.6677197885655265, + "grad_norm": 0.13606613874435425, + "learning_rate": 2.5002763472451528e-05, + "loss": 1.0408, + "step": 10990 + }, + { + "epoch": 0.667780545598153, + "grad_norm": 0.14474043250083923, + "learning_rate": 2.4994473360568145e-05, + "loss": 1.0466, + "step": 10991 + }, + { + "epoch": 0.6678413026307796, + "grad_norm": 0.20358166098594666, + "learning_rate": 2.4986184165267195e-05, + "loss": 1.0616, + "step": 10992 + }, + { + "epoch": 0.667902059663406, + "grad_norm": 0.11155235022306442, + "learning_rate": 2.497789588685246e-05, + "loss": 1.0487, + "step": 10993 + }, + { + "epoch": 0.6679628166960325, + "grad_norm": 0.18793247640132904, + "learning_rate": 2.4969608525627807e-05, + "loss": 1.0539, + "step": 10994 + }, + { + "epoch": 0.6680235737286591, + "grad_norm": 0.1075793206691742, + "learning_rate": 2.4961322081896994e-05, + "loss": 1.068, + "step": 10995 + }, + { + "epoch": 0.6680843307612856, + "grad_norm": 1.0784072875976562, + "learning_rate": 2.4953036555963754e-05, + "loss": 1.0165, + "step": 10996 + }, + { + "epoch": 0.6681450877939121, + "grad_norm": 0.17310203611850739, + "learning_rate": 2.4944751948131806e-05, + "loss": 1.0761, + "step": 10997 + }, + { + "epoch": 0.6682058448265387, + "grad_norm": 0.17463995516300201, + "learning_rate": 2.493646825870482e-05, + "loss": 1.0954, + "step": 10998 + }, + { + "epoch": 0.6682666018591652, + "grad_norm": 0.13798128068447113, + "learning_rate": 2.4928185487986433e-05, + "loss": 1.0186, + "step": 10999 + }, + { + "epoch": 0.6683273588917917, + "grad_norm": 0.12489543855190277, + "learning_rate": 2.491990363628025e-05, + "loss": 1.113, + "step": 11000 + }, + { + "epoch": 0.6683881159244183, + "grad_norm": 0.30142515897750854, + "learning_rate": 2.4911622703889848e-05, + "loss": 1.1783, + "step": 11001 + }, + { + "epoch": 0.6684488729570448, + "grad_norm": 0.7820678353309631, + "learning_rate": 2.490334269111877e-05, + "loss": 1.1694, + "step": 11002 + }, + { + "epoch": 0.6685096299896713, + "grad_norm": 0.45104581117630005, + "learning_rate": 2.489506359827051e-05, + "loss": 1.1753, + "step": 11003 + }, + { + "epoch": 0.6685703870222979, + "grad_norm": 0.20650026202201843, + "learning_rate": 2.4886785425648533e-05, + "loss": 1.0191, + "step": 11004 + }, + { + "epoch": 0.6686311440549244, + "grad_norm": 0.18666957318782806, + "learning_rate": 2.4878508173556327e-05, + "loss": 1.0125, + "step": 11005 + }, + { + "epoch": 0.6686919010875508, + "grad_norm": 0.43002381920814514, + "learning_rate": 2.4870231842297236e-05, + "loss": 1.1658, + "step": 11006 + }, + { + "epoch": 0.6687526581201774, + "grad_norm": 0.16620266437530518, + "learning_rate": 2.486195643217466e-05, + "loss": 1.0975, + "step": 11007 + }, + { + "epoch": 0.6688134151528039, + "grad_norm": 0.21435941755771637, + "learning_rate": 2.485368194349193e-05, + "loss": 1.0637, + "step": 11008 + }, + { + "epoch": 0.6688741721854304, + "grad_norm": 0.18640761077404022, + "learning_rate": 2.4845408376552325e-05, + "loss": 1.0572, + "step": 11009 + }, + { + "epoch": 0.668934929218057, + "grad_norm": 0.1668463796377182, + "learning_rate": 2.4837135731659168e-05, + "loss": 1.2412, + "step": 11010 + }, + { + "epoch": 0.6689956862506835, + "grad_norm": 0.14010213315486908, + "learning_rate": 2.4828864009115654e-05, + "loss": 1.058, + "step": 11011 + }, + { + "epoch": 0.66905644328331, + "grad_norm": 0.18175546824932098, + "learning_rate": 2.4820593209225017e-05, + "loss": 1.1308, + "step": 11012 + }, + { + "epoch": 0.6691172003159366, + "grad_norm": 0.2023133635520935, + "learning_rate": 2.4812323332290384e-05, + "loss": 1.0996, + "step": 11013 + }, + { + "epoch": 0.6691779573485631, + "grad_norm": 0.11518340557813644, + "learning_rate": 2.4804054378614887e-05, + "loss": 1.0337, + "step": 11014 + }, + { + "epoch": 0.6692387143811896, + "grad_norm": 0.10573253035545349, + "learning_rate": 2.479578634850167e-05, + "loss": 1.0204, + "step": 11015 + }, + { + "epoch": 0.6692994714138162, + "grad_norm": 0.12148343026638031, + "learning_rate": 2.4787519242253776e-05, + "loss": 1.0455, + "step": 11016 + }, + { + "epoch": 0.6693602284464427, + "grad_norm": 0.3998861312866211, + "learning_rate": 2.4779253060174233e-05, + "loss": 1.069, + "step": 11017 + }, + { + "epoch": 0.6694209854790693, + "grad_norm": 0.14318203926086426, + "learning_rate": 2.4770987802566054e-05, + "loss": 1.0613, + "step": 11018 + }, + { + "epoch": 0.6694817425116957, + "grad_norm": 0.15260010957717896, + "learning_rate": 2.4762723469732186e-05, + "loss": 1.0182, + "step": 11019 + }, + { + "epoch": 0.6695424995443222, + "grad_norm": 0.18030261993408203, + "learning_rate": 2.475446006197557e-05, + "loss": 1.0358, + "step": 11020 + }, + { + "epoch": 0.6696032565769487, + "grad_norm": 0.18028858304023743, + "learning_rate": 2.4746197579599095e-05, + "loss": 1.1461, + "step": 11021 + }, + { + "epoch": 0.6696640136095753, + "grad_norm": 0.14516298472881317, + "learning_rate": 2.4737936022905632e-05, + "loss": 1.1109, + "step": 11022 + }, + { + "epoch": 0.6697247706422018, + "grad_norm": 0.15879233181476593, + "learning_rate": 2.472967539219801e-05, + "loss": 1.067, + "step": 11023 + }, + { + "epoch": 0.6697855276748284, + "grad_norm": 0.16960769891738892, + "learning_rate": 2.472141568777902e-05, + "loss": 1.0521, + "step": 11024 + }, + { + "epoch": 0.6698462847074549, + "grad_norm": 0.16727851331233978, + "learning_rate": 2.4713156909951424e-05, + "loss": 1.0142, + "step": 11025 + }, + { + "epoch": 0.6699070417400814, + "grad_norm": 0.20663772523403168, + "learning_rate": 2.470489905901795e-05, + "loss": 1.1121, + "step": 11026 + }, + { + "epoch": 0.669967798772708, + "grad_norm": 0.1411844938993454, + "learning_rate": 2.469664213528129e-05, + "loss": 1.053, + "step": 11027 + }, + { + "epoch": 0.6700285558053345, + "grad_norm": 0.13510943949222565, + "learning_rate": 2.46883861390441e-05, + "loss": 1.0884, + "step": 11028 + }, + { + "epoch": 0.670089312837961, + "grad_norm": 0.16399888694286346, + "learning_rate": 2.4680131070609015e-05, + "loss": 1.0639, + "step": 11029 + }, + { + "epoch": 0.6701500698705876, + "grad_norm": 0.22690443694591522, + "learning_rate": 2.4671876930278598e-05, + "loss": 1.1131, + "step": 11030 + }, + { + "epoch": 0.6702108269032141, + "grad_norm": 0.12540382146835327, + "learning_rate": 2.4663623718355444e-05, + "loss": 1.1023, + "step": 11031 + }, + { + "epoch": 0.6702715839358405, + "grad_norm": 0.18903689086437225, + "learning_rate": 2.4655371435142084e-05, + "loss": 1.103, + "step": 11032 + }, + { + "epoch": 0.670332340968467, + "grad_norm": 0.11257229000329971, + "learning_rate": 2.4647120080940962e-05, + "loss": 1.0481, + "step": 11033 + }, + { + "epoch": 0.6703930980010936, + "grad_norm": 0.14155890047550201, + "learning_rate": 2.4638869656054554e-05, + "loss": 1.1018, + "step": 11034 + }, + { + "epoch": 0.6704538550337201, + "grad_norm": 0.2668917179107666, + "learning_rate": 2.463062016078526e-05, + "loss": 1.1144, + "step": 11035 + }, + { + "epoch": 0.6705146120663467, + "grad_norm": 0.10257581621408463, + "learning_rate": 2.4622371595435506e-05, + "loss": 1.0188, + "step": 11036 + }, + { + "epoch": 0.6705753690989732, + "grad_norm": 0.2013516128063202, + "learning_rate": 2.4614123960307634e-05, + "loss": 1.1496, + "step": 11037 + }, + { + "epoch": 0.6706361261315997, + "grad_norm": 1.106760859489441, + "learning_rate": 2.4605877255703947e-05, + "loss": 1.0553, + "step": 11038 + }, + { + "epoch": 0.6706968831642263, + "grad_norm": 0.2565065324306488, + "learning_rate": 2.459763148192676e-05, + "loss": 1.029, + "step": 11039 + }, + { + "epoch": 0.6707576401968528, + "grad_norm": 0.16055181622505188, + "learning_rate": 2.458938663927826e-05, + "loss": 0.9793, + "step": 11040 + }, + { + "epoch": 0.6708183972294793, + "grad_norm": 0.2202807515859604, + "learning_rate": 2.458114272806073e-05, + "loss": 1.0342, + "step": 11041 + }, + { + "epoch": 0.6708791542621059, + "grad_norm": 0.13346879184246063, + "learning_rate": 2.457289974857632e-05, + "loss": 1.0884, + "step": 11042 + }, + { + "epoch": 0.6709399112947324, + "grad_norm": 0.1459265798330307, + "learning_rate": 2.4564657701127193e-05, + "loss": 1.0081, + "step": 11043 + }, + { + "epoch": 0.6710006683273589, + "grad_norm": 0.15224696695804596, + "learning_rate": 2.4556416586015446e-05, + "loss": 1.0997, + "step": 11044 + }, + { + "epoch": 0.6710614253599854, + "grad_norm": 0.22559818625450134, + "learning_rate": 2.454817640354317e-05, + "loss": 1.108, + "step": 11045 + }, + { + "epoch": 0.6711221823926119, + "grad_norm": 0.3387386202812195, + "learning_rate": 2.453993715401241e-05, + "loss": 1.1044, + "step": 11046 + }, + { + "epoch": 0.6711829394252384, + "grad_norm": 0.19596999883651733, + "learning_rate": 2.4531698837725175e-05, + "loss": 1.1141, + "step": 11047 + }, + { + "epoch": 0.671243696457865, + "grad_norm": 0.12199502438306808, + "learning_rate": 2.4523461454983436e-05, + "loss": 1.0643, + "step": 11048 + }, + { + "epoch": 0.6713044534904915, + "grad_norm": 0.13556912541389465, + "learning_rate": 2.4515225006089147e-05, + "loss": 1.0302, + "step": 11049 + }, + { + "epoch": 0.671365210523118, + "grad_norm": 0.19067586958408356, + "learning_rate": 2.4506989491344213e-05, + "loss": 1.1643, + "step": 11050 + }, + { + "epoch": 0.6714259675557446, + "grad_norm": 0.1274019181728363, + "learning_rate": 2.4498754911050485e-05, + "loss": 0.9875, + "step": 11051 + }, + { + "epoch": 0.6714867245883711, + "grad_norm": 0.1863957643508911, + "learning_rate": 2.4490521265509863e-05, + "loss": 1.1279, + "step": 11052 + }, + { + "epoch": 0.6715474816209976, + "grad_norm": 0.1458347737789154, + "learning_rate": 2.4482288555024097e-05, + "loss": 1.0337, + "step": 11053 + }, + { + "epoch": 0.6716082386536242, + "grad_norm": 0.2231624722480774, + "learning_rate": 2.4474056779894978e-05, + "loss": 1.1914, + "step": 11054 + }, + { + "epoch": 0.6716689956862507, + "grad_norm": 0.18422985076904297, + "learning_rate": 2.4465825940424236e-05, + "loss": 1.1243, + "step": 11055 + }, + { + "epoch": 0.6717297527188772, + "grad_norm": 0.178894504904747, + "learning_rate": 2.4457596036913567e-05, + "loss": 1.0835, + "step": 11056 + }, + { + "epoch": 0.6717905097515038, + "grad_norm": 0.15781378746032715, + "learning_rate": 2.444936706966467e-05, + "loss": 1.0731, + "step": 11057 + }, + { + "epoch": 0.6718512667841302, + "grad_norm": 0.15718761086463928, + "learning_rate": 2.444113903897916e-05, + "loss": 1.039, + "step": 11058 + }, + { + "epoch": 0.6719120238167567, + "grad_norm": 0.1350620836019516, + "learning_rate": 2.4432911945158653e-05, + "loss": 1.0101, + "step": 11059 + }, + { + "epoch": 0.6719727808493833, + "grad_norm": 0.355625718832016, + "learning_rate": 2.442468578850467e-05, + "loss": 1.179, + "step": 11060 + }, + { + "epoch": 0.6720335378820098, + "grad_norm": 0.1486140340566635, + "learning_rate": 2.4416460569318796e-05, + "loss": 1.0926, + "step": 11061 + }, + { + "epoch": 0.6720942949146363, + "grad_norm": 0.2064017951488495, + "learning_rate": 2.4408236287902497e-05, + "loss": 1.0995, + "step": 11062 + }, + { + "epoch": 0.6721550519472629, + "grad_norm": 0.19788654148578644, + "learning_rate": 2.4400012944557244e-05, + "loss": 1.1173, + "step": 11063 + }, + { + "epoch": 0.6722158089798894, + "grad_norm": 0.2470221221446991, + "learning_rate": 2.439179053958447e-05, + "loss": 1.0547, + "step": 11064 + }, + { + "epoch": 0.672276566012516, + "grad_norm": 0.7126647233963013, + "learning_rate": 2.438356907328556e-05, + "loss": 1.2172, + "step": 11065 + }, + { + "epoch": 0.6723373230451425, + "grad_norm": 0.25403162837028503, + "learning_rate": 2.4375348545961878e-05, + "loss": 1.1078, + "step": 11066 + }, + { + "epoch": 0.672398080077769, + "grad_norm": 0.16500109434127808, + "learning_rate": 2.4367128957914746e-05, + "loss": 1.0563, + "step": 11067 + }, + { + "epoch": 0.6724588371103956, + "grad_norm": 0.1750573068857193, + "learning_rate": 2.435891030944546e-05, + "loss": 1.1769, + "step": 11068 + }, + { + "epoch": 0.6725195941430221, + "grad_norm": 0.14198659360408783, + "learning_rate": 2.4350692600855274e-05, + "loss": 0.985, + "step": 11069 + }, + { + "epoch": 0.6725803511756486, + "grad_norm": 0.15442423522472382, + "learning_rate": 2.4342475832445405e-05, + "loss": 1.0744, + "step": 11070 + }, + { + "epoch": 0.6726411082082752, + "grad_norm": 0.41567301750183105, + "learning_rate": 2.4334260004517022e-05, + "loss": 1.2301, + "step": 11071 + }, + { + "epoch": 0.6727018652409016, + "grad_norm": 0.1515897661447525, + "learning_rate": 2.4326045117371343e-05, + "loss": 1.1096, + "step": 11072 + }, + { + "epoch": 0.6727626222735281, + "grad_norm": 0.9282751679420471, + "learning_rate": 2.4317831171309423e-05, + "loss": 1.0783, + "step": 11073 + }, + { + "epoch": 0.6728233793061547, + "grad_norm": 0.21398164331912994, + "learning_rate": 2.430961816663237e-05, + "loss": 1.0437, + "step": 11074 + }, + { + "epoch": 0.6728841363387812, + "grad_norm": 0.15392647683620453, + "learning_rate": 2.430140610364122e-05, + "loss": 1.0981, + "step": 11075 + }, + { + "epoch": 0.6729448933714077, + "grad_norm": 0.23070895671844482, + "learning_rate": 2.4293194982636986e-05, + "loss": 1.1094, + "step": 11076 + }, + { + "epoch": 0.6730056504040343, + "grad_norm": 0.13760994374752045, + "learning_rate": 2.4284984803920673e-05, + "loss": 1.0755, + "step": 11077 + }, + { + "epoch": 0.6730664074366608, + "grad_norm": 0.22460895776748657, + "learning_rate": 2.4276775567793214e-05, + "loss": 1.0105, + "step": 11078 + }, + { + "epoch": 0.6731271644692873, + "grad_norm": 0.11733175069093704, + "learning_rate": 2.4268567274555538e-05, + "loss": 1.0348, + "step": 11079 + }, + { + "epoch": 0.6731879215019139, + "grad_norm": 0.11407577246427536, + "learning_rate": 2.426035992450848e-05, + "loss": 0.9997, + "step": 11080 + }, + { + "epoch": 0.6732486785345404, + "grad_norm": 0.15263628959655762, + "learning_rate": 2.4252153517952884e-05, + "loss": 1.1412, + "step": 11081 + }, + { + "epoch": 0.6733094355671669, + "grad_norm": 0.19868876039981842, + "learning_rate": 2.4243948055189602e-05, + "loss": 1.1893, + "step": 11082 + }, + { + "epoch": 0.6733701925997935, + "grad_norm": 0.4263297915458679, + "learning_rate": 2.4235743536519383e-05, + "loss": 1.1724, + "step": 11083 + }, + { + "epoch": 0.67343094963242, + "grad_norm": 0.2823430001735687, + "learning_rate": 2.422753996224296e-05, + "loss": 1.0477, + "step": 11084 + }, + { + "epoch": 0.6734917066650464, + "grad_norm": 0.13645730912685394, + "learning_rate": 2.4219337332661045e-05, + "loss": 1.1096, + "step": 11085 + }, + { + "epoch": 0.673552463697673, + "grad_norm": 0.28503063321113586, + "learning_rate": 2.421113564807431e-05, + "loss": 1.1504, + "step": 11086 + }, + { + "epoch": 0.6736132207302995, + "grad_norm": 0.17133508622646332, + "learning_rate": 2.420293490878337e-05, + "loss": 1.1179, + "step": 11087 + }, + { + "epoch": 0.673673977762926, + "grad_norm": 0.24579021334648132, + "learning_rate": 2.4194735115088846e-05, + "loss": 1.1998, + "step": 11088 + }, + { + "epoch": 0.6737347347955526, + "grad_norm": 0.12109159678220749, + "learning_rate": 2.4186536267291287e-05, + "loss": 1.0868, + "step": 11089 + }, + { + "epoch": 0.6737954918281791, + "grad_norm": 0.12620693445205688, + "learning_rate": 2.4178338365691238e-05, + "loss": 1.0801, + "step": 11090 + }, + { + "epoch": 0.6738562488608056, + "grad_norm": 0.24572831392288208, + "learning_rate": 2.4170141410589185e-05, + "loss": 1.2019, + "step": 11091 + }, + { + "epoch": 0.6739170058934322, + "grad_norm": 0.11499699205160141, + "learning_rate": 2.416194540228559e-05, + "loss": 1.0401, + "step": 11092 + }, + { + "epoch": 0.6739777629260587, + "grad_norm": 0.2524833381175995, + "learning_rate": 2.415375034108088e-05, + "loss": 1.3011, + "step": 11093 + }, + { + "epoch": 0.6740385199586852, + "grad_norm": 0.14241334795951843, + "learning_rate": 2.414555622727545e-05, + "loss": 1.0803, + "step": 11094 + }, + { + "epoch": 0.6740992769913118, + "grad_norm": 0.21915780007839203, + "learning_rate": 2.4137363061169656e-05, + "loss": 1.1197, + "step": 11095 + }, + { + "epoch": 0.6741600340239383, + "grad_norm": 0.11939509958028793, + "learning_rate": 2.4129170843063816e-05, + "loss": 1.0567, + "step": 11096 + }, + { + "epoch": 0.6742207910565648, + "grad_norm": 0.19560158252716064, + "learning_rate": 2.4120979573258195e-05, + "loss": 1.1072, + "step": 11097 + }, + { + "epoch": 0.6742815480891913, + "grad_norm": 2.686720848083496, + "learning_rate": 2.4112789252053092e-05, + "loss": 1.1111, + "step": 11098 + }, + { + "epoch": 0.6743423051218178, + "grad_norm": 0.22586117684841156, + "learning_rate": 2.4104599879748724e-05, + "loss": 1.1085, + "step": 11099 + }, + { + "epoch": 0.6744030621544443, + "grad_norm": 0.1262160837650299, + "learning_rate": 2.4096411456645235e-05, + "loss": 1.0541, + "step": 11100 + }, + { + "epoch": 0.6744638191870709, + "grad_norm": 2.3152763843536377, + "learning_rate": 2.408822398304279e-05, + "loss": 1.0395, + "step": 11101 + }, + { + "epoch": 0.6745245762196974, + "grad_norm": 0.15275660157203674, + "learning_rate": 2.4080037459241482e-05, + "loss": 1.0939, + "step": 11102 + }, + { + "epoch": 0.674585333252324, + "grad_norm": 0.5373700857162476, + "learning_rate": 2.4071851885541435e-05, + "loss": 1.0866, + "step": 11103 + }, + { + "epoch": 0.6746460902849505, + "grad_norm": 0.13188719749450684, + "learning_rate": 2.4063667262242662e-05, + "loss": 1.1003, + "step": 11104 + }, + { + "epoch": 0.674706847317577, + "grad_norm": 0.17168322205543518, + "learning_rate": 2.4055483589645185e-05, + "loss": 1.0888, + "step": 11105 + }, + { + "epoch": 0.6747676043502036, + "grad_norm": 0.17305390536785126, + "learning_rate": 2.404730086804899e-05, + "loss": 1.0685, + "step": 11106 + }, + { + "epoch": 0.6748283613828301, + "grad_norm": 0.3638569712638855, + "learning_rate": 2.403911909775396e-05, + "loss": 1.0102, + "step": 11107 + }, + { + "epoch": 0.6748891184154566, + "grad_norm": 0.47309207916259766, + "learning_rate": 2.4030938279060054e-05, + "loss": 1.1301, + "step": 11108 + }, + { + "epoch": 0.6749498754480832, + "grad_norm": 0.1250740885734558, + "learning_rate": 2.4022758412267126e-05, + "loss": 1.0399, + "step": 11109 + }, + { + "epoch": 0.6750106324807097, + "grad_norm": 0.1532147377729416, + "learning_rate": 2.401457949767501e-05, + "loss": 1.0876, + "step": 11110 + }, + { + "epoch": 0.6750713895133361, + "grad_norm": 0.23548004031181335, + "learning_rate": 2.4006401535583507e-05, + "loss": 0.9906, + "step": 11111 + }, + { + "epoch": 0.6751321465459627, + "grad_norm": 0.21655581891536713, + "learning_rate": 2.3998224526292377e-05, + "loss": 1.1653, + "step": 11112 + }, + { + "epoch": 0.6751929035785892, + "grad_norm": 0.14888663589954376, + "learning_rate": 2.3990048470101355e-05, + "loss": 1.1008, + "step": 11113 + }, + { + "epoch": 0.6752536606112157, + "grad_norm": 0.18984763324260712, + "learning_rate": 2.3981873367310136e-05, + "loss": 1.1244, + "step": 11114 + }, + { + "epoch": 0.6753144176438423, + "grad_norm": 0.1585441529750824, + "learning_rate": 2.3973699218218375e-05, + "loss": 1.113, + "step": 11115 + }, + { + "epoch": 0.6753751746764688, + "grad_norm": 0.11531566083431244, + "learning_rate": 2.39655260231257e-05, + "loss": 1.05, + "step": 11116 + }, + { + "epoch": 0.6754359317090953, + "grad_norm": 0.17235319316387177, + "learning_rate": 2.3957353782331706e-05, + "loss": 1.1019, + "step": 11117 + }, + { + "epoch": 0.6754966887417219, + "grad_norm": 0.12548208236694336, + "learning_rate": 2.3949182496135913e-05, + "loss": 1.018, + "step": 11118 + }, + { + "epoch": 0.6755574457743484, + "grad_norm": 0.14066247642040253, + "learning_rate": 2.3941012164837917e-05, + "loss": 1.0494, + "step": 11119 + }, + { + "epoch": 0.6756182028069749, + "grad_norm": 0.16103151440620422, + "learning_rate": 2.3932842788737135e-05, + "loss": 1.1113, + "step": 11120 + }, + { + "epoch": 0.6756789598396015, + "grad_norm": 0.2735982835292816, + "learning_rate": 2.3924674368133042e-05, + "loss": 1.113, + "step": 11121 + }, + { + "epoch": 0.675739716872228, + "grad_norm": 0.1598770171403885, + "learning_rate": 2.3916506903325053e-05, + "loss": 1.0469, + "step": 11122 + }, + { + "epoch": 0.6758004739048545, + "grad_norm": 0.20828813314437866, + "learning_rate": 2.390834039461252e-05, + "loss": 1.1276, + "step": 11123 + }, + { + "epoch": 0.675861230937481, + "grad_norm": 0.1409241110086441, + "learning_rate": 2.3900174842294836e-05, + "loss": 1.0763, + "step": 11124 + }, + { + "epoch": 0.6759219879701075, + "grad_norm": 0.12823322415351868, + "learning_rate": 2.3892010246671287e-05, + "loss": 1.0786, + "step": 11125 + }, + { + "epoch": 0.675982745002734, + "grad_norm": 0.16855323314666748, + "learning_rate": 2.388384660804117e-05, + "loss": 1.1017, + "step": 11126 + }, + { + "epoch": 0.6760435020353606, + "grad_norm": 0.16470292210578918, + "learning_rate": 2.3875683926703683e-05, + "loss": 1.0998, + "step": 11127 + }, + { + "epoch": 0.6761042590679871, + "grad_norm": 0.13120052218437195, + "learning_rate": 2.386752220295803e-05, + "loss": 1.073, + "step": 11128 + }, + { + "epoch": 0.6761650161006136, + "grad_norm": 0.31621286273002625, + "learning_rate": 2.3859361437103428e-05, + "loss": 1.2871, + "step": 11129 + }, + { + "epoch": 0.6762257731332402, + "grad_norm": 0.4286743998527527, + "learning_rate": 2.385120162943898e-05, + "loss": 1.0333, + "step": 11130 + }, + { + "epoch": 0.6762865301658667, + "grad_norm": 0.16835333406925201, + "learning_rate": 2.384304278026379e-05, + "loss": 1.1153, + "step": 11131 + }, + { + "epoch": 0.6763472871984932, + "grad_norm": 0.12320169061422348, + "learning_rate": 2.3834884889876925e-05, + "loss": 1.0807, + "step": 11132 + }, + { + "epoch": 0.6764080442311198, + "grad_norm": 0.19194099307060242, + "learning_rate": 2.3826727958577412e-05, + "loss": 1.0582, + "step": 11133 + }, + { + "epoch": 0.6764688012637463, + "grad_norm": 0.11436954885721207, + "learning_rate": 2.381857198666424e-05, + "loss": 1.0233, + "step": 11134 + }, + { + "epoch": 0.6765295582963728, + "grad_norm": 0.1854429394006729, + "learning_rate": 2.3810416974436378e-05, + "loss": 1.1164, + "step": 11135 + }, + { + "epoch": 0.6765903153289994, + "grad_norm": 0.10896814614534378, + "learning_rate": 2.3802262922192742e-05, + "loss": 1.0463, + "step": 11136 + }, + { + "epoch": 0.6766510723616258, + "grad_norm": 0.15312983095645905, + "learning_rate": 2.3794109830232226e-05, + "loss": 1.1177, + "step": 11137 + }, + { + "epoch": 0.6767118293942523, + "grad_norm": 0.1960471272468567, + "learning_rate": 2.378595769885368e-05, + "loss": 1.1229, + "step": 11138 + }, + { + "epoch": 0.6767725864268789, + "grad_norm": 0.21762306988239288, + "learning_rate": 2.3777806528355918e-05, + "loss": 1.0554, + "step": 11139 + }, + { + "epoch": 0.6768333434595054, + "grad_norm": 0.15447430312633514, + "learning_rate": 2.3769656319037724e-05, + "loss": 1.0138, + "step": 11140 + }, + { + "epoch": 0.676894100492132, + "grad_norm": 0.13208043575286865, + "learning_rate": 2.3761507071197854e-05, + "loss": 1.0548, + "step": 11141 + }, + { + "epoch": 0.6769548575247585, + "grad_norm": 0.12053914368152618, + "learning_rate": 2.3753358785135005e-05, + "loss": 1.064, + "step": 11142 + }, + { + "epoch": 0.677015614557385, + "grad_norm": 0.2518937587738037, + "learning_rate": 2.374521146114787e-05, + "loss": 1.1704, + "step": 11143 + }, + { + "epoch": 0.6770763715900115, + "grad_norm": 0.14782080054283142, + "learning_rate": 2.3737065099535066e-05, + "loss": 1.0677, + "step": 11144 + }, + { + "epoch": 0.6771371286226381, + "grad_norm": 0.1666611135005951, + "learning_rate": 2.3728919700595233e-05, + "loss": 1.1756, + "step": 11145 + }, + { + "epoch": 0.6771978856552646, + "grad_norm": 0.11371220648288727, + "learning_rate": 2.372077526462695e-05, + "loss": 1.0639, + "step": 11146 + }, + { + "epoch": 0.6772586426878912, + "grad_norm": 0.15326513350009918, + "learning_rate": 2.3712631791928703e-05, + "loss": 1.1054, + "step": 11147 + }, + { + "epoch": 0.6773193997205177, + "grad_norm": 0.14061607420444489, + "learning_rate": 2.3704489282799025e-05, + "loss": 1.0928, + "step": 11148 + }, + { + "epoch": 0.6773801567531442, + "grad_norm": 0.1306842565536499, + "learning_rate": 2.3696347737536352e-05, + "loss": 1.0713, + "step": 11149 + }, + { + "epoch": 0.6774409137857706, + "grad_norm": 0.17402581870555878, + "learning_rate": 2.3688207156439156e-05, + "loss": 1.158, + "step": 11150 + }, + { + "epoch": 0.6775016708183972, + "grad_norm": 0.1504419595003128, + "learning_rate": 2.3680067539805817e-05, + "loss": 1.0521, + "step": 11151 + }, + { + "epoch": 0.6775624278510237, + "grad_norm": 0.15994706749916077, + "learning_rate": 2.3671928887934687e-05, + "loss": 0.995, + "step": 11152 + }, + { + "epoch": 0.6776231848836503, + "grad_norm": 0.2099013328552246, + "learning_rate": 2.366379120112409e-05, + "loss": 1.0981, + "step": 11153 + }, + { + "epoch": 0.6776839419162768, + "grad_norm": 0.32417577505111694, + "learning_rate": 2.3655654479672324e-05, + "loss": 1.1424, + "step": 11154 + }, + { + "epoch": 0.6777446989489033, + "grad_norm": 0.15805046260356903, + "learning_rate": 2.3647518723877632e-05, + "loss": 1.11, + "step": 11155 + }, + { + "epoch": 0.6778054559815299, + "grad_norm": 0.17728765308856964, + "learning_rate": 2.3639383934038233e-05, + "loss": 1.0659, + "step": 11156 + }, + { + "epoch": 0.6778662130141564, + "grad_norm": 0.2740498185157776, + "learning_rate": 2.3631250110452312e-05, + "loss": 1.057, + "step": 11157 + }, + { + "epoch": 0.6779269700467829, + "grad_norm": 0.1369834691286087, + "learning_rate": 2.362311725341802e-05, + "loss": 1.0916, + "step": 11158 + }, + { + "epoch": 0.6779877270794095, + "grad_norm": 0.13214300572872162, + "learning_rate": 2.361498536323346e-05, + "loss": 1.0806, + "step": 11159 + }, + { + "epoch": 0.678048484112036, + "grad_norm": 0.13550175726413727, + "learning_rate": 2.3606854440196714e-05, + "loss": 1.0746, + "step": 11160 + }, + { + "epoch": 0.6781092411446625, + "grad_norm": 0.1341206282377243, + "learning_rate": 2.3598724484605815e-05, + "loss": 1.0567, + "step": 11161 + }, + { + "epoch": 0.6781699981772891, + "grad_norm": 0.2170247882604599, + "learning_rate": 2.3590595496758776e-05, + "loss": 1.041, + "step": 11162 + }, + { + "epoch": 0.6782307552099155, + "grad_norm": 0.12049305438995361, + "learning_rate": 2.3582467476953563e-05, + "loss": 1.0234, + "step": 11163 + }, + { + "epoch": 0.678291512242542, + "grad_norm": 0.26586124300956726, + "learning_rate": 2.357434042548809e-05, + "loss": 0.9987, + "step": 11164 + }, + { + "epoch": 0.6783522692751686, + "grad_norm": 0.14460714161396027, + "learning_rate": 2.3566214342660304e-05, + "loss": 1.0291, + "step": 11165 + }, + { + "epoch": 0.6784130263077951, + "grad_norm": 0.2643889784812927, + "learning_rate": 2.3558089228768048e-05, + "loss": 1.2214, + "step": 11166 + }, + { + "epoch": 0.6784737833404216, + "grad_norm": 0.8038681149482727, + "learning_rate": 2.3549965084109132e-05, + "loss": 1.0938, + "step": 11167 + }, + { + "epoch": 0.6785345403730482, + "grad_norm": 0.3597032129764557, + "learning_rate": 2.3541841908981353e-05, + "loss": 1.1543, + "step": 11168 + }, + { + "epoch": 0.6785952974056747, + "grad_norm": 0.14750725030899048, + "learning_rate": 2.353371970368246e-05, + "loss": 1.097, + "step": 11169 + }, + { + "epoch": 0.6786560544383012, + "grad_norm": 0.1521051973104477, + "learning_rate": 2.3525598468510208e-05, + "loss": 1.0609, + "step": 11170 + }, + { + "epoch": 0.6787168114709278, + "grad_norm": 0.14152897894382477, + "learning_rate": 2.3517478203762255e-05, + "loss": 1.0153, + "step": 11171 + }, + { + "epoch": 0.6787775685035543, + "grad_norm": 0.18961024284362793, + "learning_rate": 2.3509358909736256e-05, + "loss": 1.0387, + "step": 11172 + }, + { + "epoch": 0.6788383255361808, + "grad_norm": 0.3233139216899872, + "learning_rate": 2.350124058672985e-05, + "loss": 1.0516, + "step": 11173 + }, + { + "epoch": 0.6788990825688074, + "grad_norm": 0.13173529505729675, + "learning_rate": 2.3493123235040554e-05, + "loss": 1.0767, + "step": 11174 + }, + { + "epoch": 0.6789598396014339, + "grad_norm": 0.1663878709077835, + "learning_rate": 2.348500685496597e-05, + "loss": 1.0475, + "step": 11175 + }, + { + "epoch": 0.6790205966340604, + "grad_norm": 0.24867284297943115, + "learning_rate": 2.347689144680358e-05, + "loss": 1.2727, + "step": 11176 + }, + { + "epoch": 0.6790813536666869, + "grad_norm": 0.13534094393253326, + "learning_rate": 2.3468777010850863e-05, + "loss": 1.0833, + "step": 11177 + }, + { + "epoch": 0.6791421106993134, + "grad_norm": 0.1138947457075119, + "learning_rate": 2.3460663547405253e-05, + "loss": 1.018, + "step": 11178 + }, + { + "epoch": 0.6792028677319399, + "grad_norm": 0.15786230564117432, + "learning_rate": 2.3452551056764156e-05, + "loss": 1.1388, + "step": 11179 + }, + { + "epoch": 0.6792636247645665, + "grad_norm": 0.18799610435962677, + "learning_rate": 2.344443953922492e-05, + "loss": 1.0976, + "step": 11180 + }, + { + "epoch": 0.679324381797193, + "grad_norm": 0.13827615976333618, + "learning_rate": 2.3436328995084895e-05, + "loss": 1.0099, + "step": 11181 + }, + { + "epoch": 0.6793851388298195, + "grad_norm": 0.16614465415477753, + "learning_rate": 2.3428219424641363e-05, + "loss": 1.1709, + "step": 11182 + }, + { + "epoch": 0.6794458958624461, + "grad_norm": 0.12641862034797668, + "learning_rate": 2.3420110828191584e-05, + "loss": 1.0408, + "step": 11183 + }, + { + "epoch": 0.6795066528950726, + "grad_norm": 0.12962312996387482, + "learning_rate": 2.3412003206032786e-05, + "loss": 1.1191, + "step": 11184 + }, + { + "epoch": 0.6795674099276992, + "grad_norm": 0.24748635292053223, + "learning_rate": 2.3403896558462118e-05, + "loss": 1.1064, + "step": 11185 + }, + { + "epoch": 0.6796281669603257, + "grad_norm": 0.19643889367580414, + "learning_rate": 2.3395790885776807e-05, + "loss": 1.1248, + "step": 11186 + }, + { + "epoch": 0.6796889239929522, + "grad_norm": 4.767789363861084, + "learning_rate": 2.338768618827391e-05, + "loss": 1.0578, + "step": 11187 + }, + { + "epoch": 0.6797496810255788, + "grad_norm": 0.1541973203420639, + "learning_rate": 2.3379582466250515e-05, + "loss": 1.0691, + "step": 11188 + }, + { + "epoch": 0.6798104380582053, + "grad_norm": 0.13764290511608124, + "learning_rate": 2.337147972000367e-05, + "loss": 1.0353, + "step": 11189 + }, + { + "epoch": 0.6798711950908317, + "grad_norm": 0.1775774359703064, + "learning_rate": 2.3363377949830368e-05, + "loss": 1.0523, + "step": 11190 + }, + { + "epoch": 0.6799319521234582, + "grad_norm": 0.13941790163516998, + "learning_rate": 2.335527715602761e-05, + "loss": 1.0844, + "step": 11191 + }, + { + "epoch": 0.6799927091560848, + "grad_norm": 0.1713925004005432, + "learning_rate": 2.3347177338892318e-05, + "loss": 1.1128, + "step": 11192 + }, + { + "epoch": 0.6800534661887113, + "grad_norm": 0.5088496208190918, + "learning_rate": 2.3339078498721413e-05, + "loss": 1.1756, + "step": 11193 + }, + { + "epoch": 0.6801142232213379, + "grad_norm": 0.17205387353897095, + "learning_rate": 2.3330980635811722e-05, + "loss": 1.0744, + "step": 11194 + }, + { + "epoch": 0.6801749802539644, + "grad_norm": 0.15832848846912384, + "learning_rate": 2.3322883750460068e-05, + "loss": 1.0926, + "step": 11195 + }, + { + "epoch": 0.6802357372865909, + "grad_norm": 0.19478711485862732, + "learning_rate": 2.3314787842963286e-05, + "loss": 1.1616, + "step": 11196 + }, + { + "epoch": 0.6802964943192175, + "grad_norm": 0.11826007068157196, + "learning_rate": 2.3306692913618116e-05, + "loss": 1.0181, + "step": 11197 + }, + { + "epoch": 0.680357251351844, + "grad_norm": 0.11528857052326202, + "learning_rate": 2.329859896272128e-05, + "loss": 1.0055, + "step": 11198 + }, + { + "epoch": 0.6804180083844705, + "grad_norm": 0.14169877767562866, + "learning_rate": 2.3290505990569457e-05, + "loss": 1.1219, + "step": 11199 + }, + { + "epoch": 0.6804787654170971, + "grad_norm": 0.12776890397071838, + "learning_rate": 2.3282413997459295e-05, + "loss": 1.0739, + "step": 11200 + }, + { + "epoch": 0.6805395224497236, + "grad_norm": 0.1028270497918129, + "learning_rate": 2.3274322983687417e-05, + "loss": 1.0428, + "step": 11201 + }, + { + "epoch": 0.6806002794823501, + "grad_norm": 0.12106592953205109, + "learning_rate": 2.3266232949550397e-05, + "loss": 1.0452, + "step": 11202 + }, + { + "epoch": 0.6806610365149766, + "grad_norm": 0.14473384618759155, + "learning_rate": 2.3258143895344774e-05, + "loss": 1.0717, + "step": 11203 + }, + { + "epoch": 0.6807217935476031, + "grad_norm": 0.2115626037120819, + "learning_rate": 2.3250055821367056e-05, + "loss": 1.1266, + "step": 11204 + }, + { + "epoch": 0.6807825505802296, + "grad_norm": 0.159811332821846, + "learning_rate": 2.3241968727913716e-05, + "loss": 1.0065, + "step": 11205 + }, + { + "epoch": 0.6808433076128562, + "grad_norm": 0.11820317059755325, + "learning_rate": 2.3233882615281184e-05, + "loss": 1.0482, + "step": 11206 + }, + { + "epoch": 0.6809040646454827, + "grad_norm": 0.1523427814245224, + "learning_rate": 2.3225797483765855e-05, + "loss": 1.0416, + "step": 11207 + }, + { + "epoch": 0.6809648216781092, + "grad_norm": 0.15173062682151794, + "learning_rate": 2.321771333366409e-05, + "loss": 1.102, + "step": 11208 + }, + { + "epoch": 0.6810255787107358, + "grad_norm": 0.18065033853054047, + "learning_rate": 2.320963016527223e-05, + "loss": 1.0746, + "step": 11209 + }, + { + "epoch": 0.6810863357433623, + "grad_norm": 0.8758243918418884, + "learning_rate": 2.3201547978886544e-05, + "loss": 1.149, + "step": 11210 + }, + { + "epoch": 0.6811470927759888, + "grad_norm": 0.23307965695858002, + "learning_rate": 2.3193466774803284e-05, + "loss": 1.0976, + "step": 11211 + }, + { + "epoch": 0.6812078498086154, + "grad_norm": 0.12007883936166763, + "learning_rate": 2.31853865533187e-05, + "loss": 1.0643, + "step": 11212 + }, + { + "epoch": 0.6812686068412419, + "grad_norm": 0.12658709287643433, + "learning_rate": 2.3177307314728974e-05, + "loss": 1.059, + "step": 11213 + }, + { + "epoch": 0.6813293638738684, + "grad_norm": 0.17198549211025238, + "learning_rate": 2.3169229059330217e-05, + "loss": 1.0674, + "step": 11214 + }, + { + "epoch": 0.681390120906495, + "grad_norm": 0.2031710147857666, + "learning_rate": 2.316115178741855e-05, + "loss": 1.1218, + "step": 11215 + }, + { + "epoch": 0.6814508779391214, + "grad_norm": 0.2512986361980438, + "learning_rate": 2.3153075499290034e-05, + "loss": 1.0482, + "step": 11216 + }, + { + "epoch": 0.6815116349717479, + "grad_norm": 0.1166347935795784, + "learning_rate": 2.3145000195240745e-05, + "loss": 1.0264, + "step": 11217 + }, + { + "epoch": 0.6815723920043745, + "grad_norm": 0.24622927606105804, + "learning_rate": 2.313692587556667e-05, + "loss": 1.3013, + "step": 11218 + }, + { + "epoch": 0.681633149037001, + "grad_norm": 0.12514033913612366, + "learning_rate": 2.3128852540563762e-05, + "loss": 1.0634, + "step": 11219 + }, + { + "epoch": 0.6816939060696275, + "grad_norm": 0.22220630943775177, + "learning_rate": 2.3120780190527975e-05, + "loss": 1.0936, + "step": 11220 + }, + { + "epoch": 0.6817546631022541, + "grad_norm": 0.1998179405927658, + "learning_rate": 2.3112708825755153e-05, + "loss": 1.0548, + "step": 11221 + }, + { + "epoch": 0.6818154201348806, + "grad_norm": 0.4532238245010376, + "learning_rate": 2.3104638446541204e-05, + "loss": 1.1619, + "step": 11222 + }, + { + "epoch": 0.6818761771675071, + "grad_norm": 0.14723381400108337, + "learning_rate": 2.3096569053181933e-05, + "loss": 1.0154, + "step": 11223 + }, + { + "epoch": 0.6819369342001337, + "grad_norm": 0.12438743561506271, + "learning_rate": 2.308850064597312e-05, + "loss": 1.0344, + "step": 11224 + }, + { + "epoch": 0.6819976912327602, + "grad_norm": 0.11856713891029358, + "learning_rate": 2.3080433225210523e-05, + "loss": 1.0176, + "step": 11225 + }, + { + "epoch": 0.6820584482653868, + "grad_norm": 0.19358035922050476, + "learning_rate": 2.3072366791189847e-05, + "loss": 1.0851, + "step": 11226 + }, + { + "epoch": 0.6821192052980133, + "grad_norm": 0.1101171150803566, + "learning_rate": 2.3064301344206768e-05, + "loss": 1.0453, + "step": 11227 + }, + { + "epoch": 0.6821799623306398, + "grad_norm": 0.16870665550231934, + "learning_rate": 2.3056236884556937e-05, + "loss": 1.0954, + "step": 11228 + }, + { + "epoch": 0.6822407193632662, + "grad_norm": 1.1572208404541016, + "learning_rate": 2.3048173412535945e-05, + "loss": 1.0756, + "step": 11229 + }, + { + "epoch": 0.6823014763958928, + "grad_norm": 0.12642860412597656, + "learning_rate": 2.3040110928439367e-05, + "loss": 1.0724, + "step": 11230 + }, + { + "epoch": 0.6823622334285193, + "grad_norm": 0.2222655713558197, + "learning_rate": 2.3032049432562734e-05, + "loss": 1.0656, + "step": 11231 + }, + { + "epoch": 0.6824229904611459, + "grad_norm": 0.12913857400417328, + "learning_rate": 2.3023988925201527e-05, + "loss": 1.054, + "step": 11232 + }, + { + "epoch": 0.6824837474937724, + "grad_norm": 0.2229100614786148, + "learning_rate": 2.3015929406651255e-05, + "loss": 1.1127, + "step": 11233 + }, + { + "epoch": 0.6825445045263989, + "grad_norm": 0.13377363979816437, + "learning_rate": 2.3007870877207288e-05, + "loss": 1.0141, + "step": 11234 + }, + { + "epoch": 0.6826052615590255, + "grad_norm": 0.22718104720115662, + "learning_rate": 2.2999813337165034e-05, + "loss": 1.1603, + "step": 11235 + }, + { + "epoch": 0.682666018591652, + "grad_norm": 0.1535019725561142, + "learning_rate": 2.2991756786819833e-05, + "loss": 1.0866, + "step": 11236 + }, + { + "epoch": 0.6827267756242785, + "grad_norm": 0.14151044189929962, + "learning_rate": 2.2983701226467e-05, + "loss": 1.006, + "step": 11237 + }, + { + "epoch": 0.6827875326569051, + "grad_norm": 0.12746143341064453, + "learning_rate": 2.297564665640183e-05, + "loss": 1.0689, + "step": 11238 + }, + { + "epoch": 0.6828482896895316, + "grad_norm": 0.1492866575717926, + "learning_rate": 2.2967593076919558e-05, + "loss": 1.135, + "step": 11239 + }, + { + "epoch": 0.6829090467221581, + "grad_norm": 0.16964216530323029, + "learning_rate": 2.2959540488315408e-05, + "loss": 1.022, + "step": 11240 + }, + { + "epoch": 0.6829698037547847, + "grad_norm": 0.1894938200712204, + "learning_rate": 2.295148889088451e-05, + "loss": 1.0979, + "step": 11241 + }, + { + "epoch": 0.6830305607874111, + "grad_norm": 0.17001105844974518, + "learning_rate": 2.2943438284921994e-05, + "loss": 1.1067, + "step": 11242 + }, + { + "epoch": 0.6830913178200376, + "grad_norm": 0.5521331429481506, + "learning_rate": 2.2935388670722996e-05, + "loss": 1.198, + "step": 11243 + }, + { + "epoch": 0.6831520748526642, + "grad_norm": 0.1320745050907135, + "learning_rate": 2.2927340048582563e-05, + "loss": 1.0598, + "step": 11244 + }, + { + "epoch": 0.6832128318852907, + "grad_norm": 0.15190023183822632, + "learning_rate": 2.2919292418795706e-05, + "loss": 1.1067, + "step": 11245 + }, + { + "epoch": 0.6832735889179172, + "grad_norm": 0.12350473552942276, + "learning_rate": 2.2911245781657425e-05, + "loss": 1.0186, + "step": 11246 + }, + { + "epoch": 0.6833343459505438, + "grad_norm": 0.2489331215620041, + "learning_rate": 2.2903200137462668e-05, + "loss": 1.1251, + "step": 11247 + }, + { + "epoch": 0.6833951029831703, + "grad_norm": 0.29204264283180237, + "learning_rate": 2.2895155486506337e-05, + "loss": 0.9742, + "step": 11248 + }, + { + "epoch": 0.6834558600157968, + "grad_norm": 0.36877569556236267, + "learning_rate": 2.288711182908333e-05, + "loss": 1.0788, + "step": 11249 + }, + { + "epoch": 0.6835166170484234, + "grad_norm": 0.2829798758029938, + "learning_rate": 2.2879069165488477e-05, + "loss": 1.176, + "step": 11250 + }, + { + "epoch": 0.6835773740810499, + "grad_norm": 0.8081707954406738, + "learning_rate": 2.287102749601659e-05, + "loss": 1.0698, + "step": 11251 + }, + { + "epoch": 0.6836381311136764, + "grad_norm": 0.18228988349437714, + "learning_rate": 2.2862986820962428e-05, + "loss": 1.0294, + "step": 11252 + }, + { + "epoch": 0.683698888146303, + "grad_norm": 0.11081290990114212, + "learning_rate": 2.285494714062073e-05, + "loss": 1.0573, + "step": 11253 + }, + { + "epoch": 0.6837596451789295, + "grad_norm": 0.16781076788902283, + "learning_rate": 2.2846908455286198e-05, + "loss": 1.1313, + "step": 11254 + }, + { + "epoch": 0.6838204022115559, + "grad_norm": 0.18613655865192413, + "learning_rate": 2.283887076525348e-05, + "loss": 1.1006, + "step": 11255 + }, + { + "epoch": 0.6838811592441825, + "grad_norm": 0.13374072313308716, + "learning_rate": 2.2830834070817204e-05, + "loss": 1.0478, + "step": 11256 + }, + { + "epoch": 0.683941916276809, + "grad_norm": 0.12921583652496338, + "learning_rate": 2.282279837227194e-05, + "loss": 1.0537, + "step": 11257 + }, + { + "epoch": 0.6840026733094355, + "grad_norm": 0.19202256202697754, + "learning_rate": 2.2814763669912276e-05, + "loss": 1.162, + "step": 11258 + }, + { + "epoch": 0.6840634303420621, + "grad_norm": 0.11374222487211227, + "learning_rate": 2.2806729964032696e-05, + "loss": 1.0306, + "step": 11259 + }, + { + "epoch": 0.6841241873746886, + "grad_norm": 0.12950368225574493, + "learning_rate": 2.2798697254927716e-05, + "loss": 1.0692, + "step": 11260 + }, + { + "epoch": 0.6841849444073151, + "grad_norm": 0.17806057631969452, + "learning_rate": 2.2790665542891716e-05, + "loss": 1.0438, + "step": 11261 + }, + { + "epoch": 0.6842457014399417, + "grad_norm": 0.2502976655960083, + "learning_rate": 2.278263482821912e-05, + "loss": 1.1371, + "step": 11262 + }, + { + "epoch": 0.6843064584725682, + "grad_norm": 0.12404952943325043, + "learning_rate": 2.2774605111204317e-05, + "loss": 1.0468, + "step": 11263 + }, + { + "epoch": 0.6843672155051947, + "grad_norm": 0.4306713342666626, + "learning_rate": 2.2766576392141632e-05, + "loss": 1.1541, + "step": 11264 + }, + { + "epoch": 0.6844279725378213, + "grad_norm": 0.21711395680904388, + "learning_rate": 2.2758548671325348e-05, + "loss": 1.0316, + "step": 11265 + }, + { + "epoch": 0.6844887295704478, + "grad_norm": 0.23647603392601013, + "learning_rate": 2.275052194904973e-05, + "loss": 1.1839, + "step": 11266 + }, + { + "epoch": 0.6845494866030744, + "grad_norm": 0.15655535459518433, + "learning_rate": 2.2742496225609e-05, + "loss": 1.0443, + "step": 11267 + }, + { + "epoch": 0.6846102436357008, + "grad_norm": 0.1550855040550232, + "learning_rate": 2.273447150129734e-05, + "loss": 1.1046, + "step": 11268 + }, + { + "epoch": 0.6846710006683273, + "grad_norm": 0.22014521062374115, + "learning_rate": 2.2726447776408894e-05, + "loss": 1.0747, + "step": 11269 + }, + { + "epoch": 0.6847317577009538, + "grad_norm": 0.15115638077259064, + "learning_rate": 2.271842505123778e-05, + "loss": 1.0504, + "step": 11270 + }, + { + "epoch": 0.6847925147335804, + "grad_norm": 0.20195476710796356, + "learning_rate": 2.2710403326078066e-05, + "loss": 1.0528, + "step": 11271 + }, + { + "epoch": 0.6848532717662069, + "grad_norm": 0.15342214703559875, + "learning_rate": 2.270238260122379e-05, + "loss": 1.0804, + "step": 11272 + }, + { + "epoch": 0.6849140287988335, + "grad_norm": 0.2931040823459625, + "learning_rate": 2.2694362876968965e-05, + "loss": 1.051, + "step": 11273 + }, + { + "epoch": 0.68497478583146, + "grad_norm": 0.23116204142570496, + "learning_rate": 2.268634415360754e-05, + "loss": 1.0184, + "step": 11274 + }, + { + "epoch": 0.6850355428640865, + "grad_norm": 0.11853302270174026, + "learning_rate": 2.2678326431433454e-05, + "loss": 1.0903, + "step": 11275 + }, + { + "epoch": 0.6850962998967131, + "grad_norm": 0.59605872631073, + "learning_rate": 2.267030971074059e-05, + "loss": 1.1906, + "step": 11276 + }, + { + "epoch": 0.6851570569293396, + "grad_norm": 0.14413706958293915, + "learning_rate": 2.2662293991822807e-05, + "loss": 1.0237, + "step": 11277 + }, + { + "epoch": 0.6852178139619661, + "grad_norm": 0.21427135169506073, + "learning_rate": 2.2654279274973905e-05, + "loss": 1.0428, + "step": 11278 + }, + { + "epoch": 0.6852785709945927, + "grad_norm": 0.21035341918468475, + "learning_rate": 2.26462655604877e-05, + "loss": 1.1416, + "step": 11279 + }, + { + "epoch": 0.6853393280272192, + "grad_norm": 0.1456451565027237, + "learning_rate": 2.2638252848657938e-05, + "loss": 1.0368, + "step": 11280 + }, + { + "epoch": 0.6854000850598457, + "grad_norm": 0.1785242259502411, + "learning_rate": 2.2630241139778286e-05, + "loss": 1.0541, + "step": 11281 + }, + { + "epoch": 0.6854608420924722, + "grad_norm": 0.2017097920179367, + "learning_rate": 2.2622230434142438e-05, + "loss": 1.1664, + "step": 11282 + }, + { + "epoch": 0.6855215991250987, + "grad_norm": 0.2288617640733719, + "learning_rate": 2.2614220732044006e-05, + "loss": 1.1246, + "step": 11283 + }, + { + "epoch": 0.6855823561577252, + "grad_norm": 0.5452085137367249, + "learning_rate": 2.2606212033776625e-05, + "loss": 1.1486, + "step": 11284 + }, + { + "epoch": 0.6856431131903518, + "grad_norm": 0.24827709794044495, + "learning_rate": 2.2598204339633833e-05, + "loss": 1.1406, + "step": 11285 + }, + { + "epoch": 0.6857038702229783, + "grad_norm": 0.22711576521396637, + "learning_rate": 2.259019764990916e-05, + "loss": 1.0707, + "step": 11286 + }, + { + "epoch": 0.6857646272556048, + "grad_norm": 0.11822300404310226, + "learning_rate": 2.258219196489611e-05, + "loss": 1.0378, + "step": 11287 + }, + { + "epoch": 0.6858253842882314, + "grad_norm": 0.13896025717258453, + "learning_rate": 2.2574187284888076e-05, + "loss": 1.0406, + "step": 11288 + }, + { + "epoch": 0.6858861413208579, + "grad_norm": 0.11160395294427872, + "learning_rate": 2.2566183610178525e-05, + "loss": 1.0201, + "step": 11289 + }, + { + "epoch": 0.6859468983534844, + "grad_norm": 0.5209375619888306, + "learning_rate": 2.2558180941060823e-05, + "loss": 1.1955, + "step": 11290 + }, + { + "epoch": 0.686007655386111, + "grad_norm": 0.11260931938886642, + "learning_rate": 2.25501792778283e-05, + "loss": 1.0778, + "step": 11291 + }, + { + "epoch": 0.6860684124187375, + "grad_norm": 0.21111446619033813, + "learning_rate": 2.2542178620774257e-05, + "loss": 1.0234, + "step": 11292 + }, + { + "epoch": 0.686129169451364, + "grad_norm": 0.12418483197689056, + "learning_rate": 2.2534178970191966e-05, + "loss": 0.9883, + "step": 11293 + }, + { + "epoch": 0.6861899264839906, + "grad_norm": 1.1833758354187012, + "learning_rate": 2.2526180326374656e-05, + "loss": 1.0256, + "step": 11294 + }, + { + "epoch": 0.686250683516617, + "grad_norm": 0.13782691955566406, + "learning_rate": 2.2518182689615518e-05, + "loss": 1.0233, + "step": 11295 + }, + { + "epoch": 0.6863114405492435, + "grad_norm": 0.1393684297800064, + "learning_rate": 2.2510186060207707e-05, + "loss": 1.0351, + "step": 11296 + }, + { + "epoch": 0.6863721975818701, + "grad_norm": 0.11530917882919312, + "learning_rate": 2.250219043844433e-05, + "loss": 1.0398, + "step": 11297 + }, + { + "epoch": 0.6864329546144966, + "grad_norm": 0.13274326920509338, + "learning_rate": 2.2494195824618485e-05, + "loss": 1.048, + "step": 11298 + }, + { + "epoch": 0.6864937116471231, + "grad_norm": 0.1376384198665619, + "learning_rate": 2.2486202219023188e-05, + "loss": 1.0707, + "step": 11299 + }, + { + "epoch": 0.6865544686797497, + "grad_norm": 0.1071726530790329, + "learning_rate": 2.2478209621951508e-05, + "loss": 1.0238, + "step": 11300 + }, + { + "epoch": 0.6866152257123762, + "grad_norm": 0.17224492132663727, + "learning_rate": 2.247021803369635e-05, + "loss": 1.105, + "step": 11301 + }, + { + "epoch": 0.6866759827450027, + "grad_norm": 0.24265849590301514, + "learning_rate": 2.2462227454550677e-05, + "loss": 1.1862, + "step": 11302 + }, + { + "epoch": 0.6867367397776293, + "grad_norm": 0.14050258696079254, + "learning_rate": 2.245423788480737e-05, + "loss": 1.116, + "step": 11303 + }, + { + "epoch": 0.6867974968102558, + "grad_norm": 0.14598074555397034, + "learning_rate": 2.244624932475929e-05, + "loss": 1.0575, + "step": 11304 + }, + { + "epoch": 0.6868582538428823, + "grad_norm": 0.12525883316993713, + "learning_rate": 2.2438261774699283e-05, + "loss": 1.0121, + "step": 11305 + }, + { + "epoch": 0.6869190108755089, + "grad_norm": 0.43297070264816284, + "learning_rate": 2.2430275234920118e-05, + "loss": 1.1337, + "step": 11306 + }, + { + "epoch": 0.6869797679081354, + "grad_norm": 0.8391994833946228, + "learning_rate": 2.242228970571456e-05, + "loss": 1.0812, + "step": 11307 + }, + { + "epoch": 0.6870405249407618, + "grad_norm": 0.1682329922914505, + "learning_rate": 2.2414305187375286e-05, + "loss": 1.1872, + "step": 11308 + }, + { + "epoch": 0.6871012819733884, + "grad_norm": 0.15211036801338196, + "learning_rate": 2.2406321680194965e-05, + "loss": 1.1219, + "step": 11309 + }, + { + "epoch": 0.6871620390060149, + "grad_norm": 0.11000309139490128, + "learning_rate": 2.2398339184466278e-05, + "loss": 1.0118, + "step": 11310 + }, + { + "epoch": 0.6872227960386414, + "grad_norm": 0.16255216300487518, + "learning_rate": 2.2390357700481808e-05, + "loss": 1.1365, + "step": 11311 + }, + { + "epoch": 0.687283553071268, + "grad_norm": 0.2930504381656647, + "learning_rate": 2.238237722853411e-05, + "loss": 1.1252, + "step": 11312 + }, + { + "epoch": 0.6873443101038945, + "grad_norm": 0.12112327665090561, + "learning_rate": 2.237439776891573e-05, + "loss": 1.027, + "step": 11313 + }, + { + "epoch": 0.687405067136521, + "grad_norm": 0.14984898269176483, + "learning_rate": 2.2366419321919102e-05, + "loss": 1.1272, + "step": 11314 + }, + { + "epoch": 0.6874658241691476, + "grad_norm": 0.5324952602386475, + "learning_rate": 2.2358441887836735e-05, + "loss": 1.2852, + "step": 11315 + }, + { + "epoch": 0.6875265812017741, + "grad_norm": 0.21704143285751343, + "learning_rate": 2.2350465466961024e-05, + "loss": 1.1833, + "step": 11316 + }, + { + "epoch": 0.6875873382344007, + "grad_norm": 0.28614747524261475, + "learning_rate": 2.2342490059584347e-05, + "loss": 1.0676, + "step": 11317 + }, + { + "epoch": 0.6876480952670272, + "grad_norm": 0.19358699023723602, + "learning_rate": 2.233451566599904e-05, + "loss": 1.0704, + "step": 11318 + }, + { + "epoch": 0.6877088522996537, + "grad_norm": 0.13374973833560944, + "learning_rate": 2.232654228649741e-05, + "loss": 1.0887, + "step": 11319 + }, + { + "epoch": 0.6877696093322803, + "grad_norm": 0.12314755469560623, + "learning_rate": 2.2318569921371725e-05, + "loss": 1.1698, + "step": 11320 + }, + { + "epoch": 0.6878303663649067, + "grad_norm": 0.12458351999521255, + "learning_rate": 2.2310598570914204e-05, + "loss": 1.0192, + "step": 11321 + }, + { + "epoch": 0.6878911233975332, + "grad_norm": 0.16685117781162262, + "learning_rate": 2.230262823541705e-05, + "loss": 1.1628, + "step": 11322 + }, + { + "epoch": 0.6879518804301598, + "grad_norm": 0.14661340415477753, + "learning_rate": 2.229465891517241e-05, + "loss": 1.1166, + "step": 11323 + }, + { + "epoch": 0.6880126374627863, + "grad_norm": 0.22112315893173218, + "learning_rate": 2.2286690610472404e-05, + "loss": 1.2693, + "step": 11324 + }, + { + "epoch": 0.6880733944954128, + "grad_norm": 9.382180213928223, + "learning_rate": 2.227872332160909e-05, + "loss": 1.1014, + "step": 11325 + }, + { + "epoch": 0.6881341515280394, + "grad_norm": 0.2312825620174408, + "learning_rate": 2.2270757048874553e-05, + "loss": 1.185, + "step": 11326 + }, + { + "epoch": 0.6881949085606659, + "grad_norm": 0.17301885783672333, + "learning_rate": 2.226279179256079e-05, + "loss": 1.0081, + "step": 11327 + }, + { + "epoch": 0.6882556655932924, + "grad_norm": 0.12839139997959137, + "learning_rate": 2.225482755295974e-05, + "loss": 1.0546, + "step": 11328 + }, + { + "epoch": 0.688316422625919, + "grad_norm": 0.18592028319835663, + "learning_rate": 2.2246864330363348e-05, + "loss": 1.0263, + "step": 11329 + }, + { + "epoch": 0.6883771796585455, + "grad_norm": 0.1334301233291626, + "learning_rate": 2.2238902125063487e-05, + "loss": 1.0454, + "step": 11330 + }, + { + "epoch": 0.688437936691172, + "grad_norm": 0.12021974474191666, + "learning_rate": 2.2230940937352057e-05, + "loss": 1.0313, + "step": 11331 + }, + { + "epoch": 0.6884986937237986, + "grad_norm": 0.10884489864110947, + "learning_rate": 2.2222980767520856e-05, + "loss": 1.0278, + "step": 11332 + }, + { + "epoch": 0.6885594507564251, + "grad_norm": 0.1361217349767685, + "learning_rate": 2.2215021615861658e-05, + "loss": 1.017, + "step": 11333 + }, + { + "epoch": 0.6886202077890515, + "grad_norm": 0.17185622453689575, + "learning_rate": 2.220706348266624e-05, + "loss": 1.2327, + "step": 11334 + }, + { + "epoch": 0.6886809648216781, + "grad_norm": 0.3156260848045349, + "learning_rate": 2.219910636822624e-05, + "loss": 1.0775, + "step": 11335 + }, + { + "epoch": 0.6887417218543046, + "grad_norm": 0.4924779534339905, + "learning_rate": 2.219115027283339e-05, + "loss": 1.1998, + "step": 11336 + }, + { + "epoch": 0.6888024788869311, + "grad_norm": 0.167219877243042, + "learning_rate": 2.21831951967793e-05, + "loss": 1.0629, + "step": 11337 + }, + { + "epoch": 0.6888632359195577, + "grad_norm": 0.24007928371429443, + "learning_rate": 2.2175241140355568e-05, + "loss": 1.0389, + "step": 11338 + }, + { + "epoch": 0.6889239929521842, + "grad_norm": 0.18390145897865295, + "learning_rate": 2.216728810385375e-05, + "loss": 1.1026, + "step": 11339 + }, + { + "epoch": 0.6889847499848107, + "grad_norm": 0.12695859372615814, + "learning_rate": 2.215933608756537e-05, + "loss": 1.0668, + "step": 11340 + }, + { + "epoch": 0.6890455070174373, + "grad_norm": 0.12433171272277832, + "learning_rate": 2.2151385091781907e-05, + "loss": 1.0158, + "step": 11341 + }, + { + "epoch": 0.6891062640500638, + "grad_norm": 0.4193268120288849, + "learning_rate": 2.2143435116794803e-05, + "loss": 1.073, + "step": 11342 + }, + { + "epoch": 0.6891670210826903, + "grad_norm": 0.19214269518852234, + "learning_rate": 2.2135486162895468e-05, + "loss": 1.0785, + "step": 11343 + }, + { + "epoch": 0.6892277781153169, + "grad_norm": 0.16940905153751373, + "learning_rate": 2.2127538230375277e-05, + "loss": 1.0851, + "step": 11344 + }, + { + "epoch": 0.6892885351479434, + "grad_norm": 0.15236569941043854, + "learning_rate": 2.2119591319525557e-05, + "loss": 1.0669, + "step": 11345 + }, + { + "epoch": 0.68934929218057, + "grad_norm": 0.28772369027137756, + "learning_rate": 2.211164543063759e-05, + "loss": 1.0773, + "step": 11346 + }, + { + "epoch": 0.6894100492131964, + "grad_norm": 0.2376510053873062, + "learning_rate": 2.2103700564002688e-05, + "loss": 1.0759, + "step": 11347 + }, + { + "epoch": 0.6894708062458229, + "grad_norm": 0.20857170224189758, + "learning_rate": 2.2095756719912014e-05, + "loss": 1.0781, + "step": 11348 + }, + { + "epoch": 0.6895315632784494, + "grad_norm": 0.139313742518425, + "learning_rate": 2.2087813898656774e-05, + "loss": 1.0252, + "step": 11349 + }, + { + "epoch": 0.689592320311076, + "grad_norm": 0.1044449508190155, + "learning_rate": 2.2079872100528108e-05, + "loss": 1.0784, + "step": 11350 + }, + { + "epoch": 0.6896530773437025, + "grad_norm": 0.12288221716880798, + "learning_rate": 2.2071931325817114e-05, + "loss": 1.0203, + "step": 11351 + }, + { + "epoch": 0.689713834376329, + "grad_norm": 0.12695297598838806, + "learning_rate": 2.2063991574814884e-05, + "loss": 1.0102, + "step": 11352 + }, + { + "epoch": 0.6897745914089556, + "grad_norm": 0.18684028089046478, + "learning_rate": 2.2056052847812454e-05, + "loss": 1.0566, + "step": 11353 + }, + { + "epoch": 0.6898353484415821, + "grad_norm": 0.18013936281204224, + "learning_rate": 2.2048115145100817e-05, + "loss": 1.1283, + "step": 11354 + }, + { + "epoch": 0.6898961054742087, + "grad_norm": 0.14498640596866608, + "learning_rate": 2.204017846697089e-05, + "loss": 1.1544, + "step": 11355 + }, + { + "epoch": 0.6899568625068352, + "grad_norm": 0.15685342252254486, + "learning_rate": 2.2032242813713643e-05, + "loss": 1.0622, + "step": 11356 + }, + { + "epoch": 0.6900176195394617, + "grad_norm": 0.1755741387605667, + "learning_rate": 2.2024308185619946e-05, + "loss": 1.0581, + "step": 11357 + }, + { + "epoch": 0.6900783765720883, + "grad_norm": 1.366424560546875, + "learning_rate": 2.2016374582980638e-05, + "loss": 1.2208, + "step": 11358 + }, + { + "epoch": 0.6901391336047148, + "grad_norm": 0.12541456520557404, + "learning_rate": 2.200844200608652e-05, + "loss": 1.0127, + "step": 11359 + }, + { + "epoch": 0.6901998906373412, + "grad_norm": 0.15432308614253998, + "learning_rate": 2.200051045522838e-05, + "loss": 1.0694, + "step": 11360 + }, + { + "epoch": 0.6902606476699678, + "grad_norm": 0.15922018885612488, + "learning_rate": 2.1992579930696945e-05, + "loss": 1.0379, + "step": 11361 + }, + { + "epoch": 0.6903214047025943, + "grad_norm": 0.11118892580270767, + "learning_rate": 2.19846504327829e-05, + "loss": 1.0874, + "step": 11362 + }, + { + "epoch": 0.6903821617352208, + "grad_norm": 0.18039391934871674, + "learning_rate": 2.197672196177691e-05, + "loss": 1.148, + "step": 11363 + }, + { + "epoch": 0.6904429187678474, + "grad_norm": 0.5095503926277161, + "learning_rate": 2.1968794517969588e-05, + "loss": 1.0956, + "step": 11364 + }, + { + "epoch": 0.6905036758004739, + "grad_norm": 0.19518274068832397, + "learning_rate": 2.1960868101651527e-05, + "loss": 1.0276, + "step": 11365 + }, + { + "epoch": 0.6905644328331004, + "grad_norm": 0.11616974323987961, + "learning_rate": 2.1952942713113246e-05, + "loss": 1.0895, + "step": 11366 + }, + { + "epoch": 0.690625189865727, + "grad_norm": 0.13836927711963654, + "learning_rate": 2.1945018352645308e-05, + "loss": 1.035, + "step": 11367 + }, + { + "epoch": 0.6906859468983535, + "grad_norm": 3.9429945945739746, + "learning_rate": 2.193709502053813e-05, + "loss": 1.1059, + "step": 11368 + }, + { + "epoch": 0.69074670393098, + "grad_norm": 0.16126501560211182, + "learning_rate": 2.1929172717082152e-05, + "loss": 1.1234, + "step": 11369 + }, + { + "epoch": 0.6908074609636066, + "grad_norm": 0.2413211315870285, + "learning_rate": 2.1921251442567782e-05, + "loss": 1.1332, + "step": 11370 + }, + { + "epoch": 0.6908682179962331, + "grad_norm": 0.22435836493968964, + "learning_rate": 2.191333119728534e-05, + "loss": 1.0584, + "step": 11371 + }, + { + "epoch": 0.6909289750288596, + "grad_norm": 0.13752995431423187, + "learning_rate": 2.1905411981525197e-05, + "loss": 1.0454, + "step": 11372 + }, + { + "epoch": 0.6909897320614861, + "grad_norm": 0.32286930084228516, + "learning_rate": 2.1897493795577606e-05, + "loss": 1.0526, + "step": 11373 + }, + { + "epoch": 0.6910504890941126, + "grad_norm": 0.3062862157821655, + "learning_rate": 2.1889576639732834e-05, + "loss": 1.1951, + "step": 11374 + }, + { + "epoch": 0.6911112461267391, + "grad_norm": 0.24602198600769043, + "learning_rate": 2.1881660514281043e-05, + "loss": 1.2371, + "step": 11375 + }, + { + "epoch": 0.6911720031593657, + "grad_norm": 0.16420650482177734, + "learning_rate": 2.1873745419512403e-05, + "loss": 1.13, + "step": 11376 + }, + { + "epoch": 0.6912327601919922, + "grad_norm": 0.11032281070947647, + "learning_rate": 2.1865831355717075e-05, + "loss": 1.0498, + "step": 11377 + }, + { + "epoch": 0.6912935172246187, + "grad_norm": 0.2551252841949463, + "learning_rate": 2.1857918323185145e-05, + "loss": 1.1135, + "step": 11378 + }, + { + "epoch": 0.6913542742572453, + "grad_norm": 1.0878547430038452, + "learning_rate": 2.1850006322206656e-05, + "loss": 1.029, + "step": 11379 + }, + { + "epoch": 0.6914150312898718, + "grad_norm": 1.1755034923553467, + "learning_rate": 2.184209535307164e-05, + "loss": 1.287, + "step": 11380 + }, + { + "epoch": 0.6914757883224983, + "grad_norm": 0.17688456177711487, + "learning_rate": 2.183418541607003e-05, + "loss": 1.055, + "step": 11381 + }, + { + "epoch": 0.6915365453551249, + "grad_norm": 0.2281084954738617, + "learning_rate": 2.1826276511491815e-05, + "loss": 1.1393, + "step": 11382 + }, + { + "epoch": 0.6915973023877514, + "grad_norm": 0.20563869178295135, + "learning_rate": 2.181836863962688e-05, + "loss": 1.0698, + "step": 11383 + }, + { + "epoch": 0.691658059420378, + "grad_norm": 0.7641192674636841, + "learning_rate": 2.181046180076509e-05, + "loss": 1.2132, + "step": 11384 + }, + { + "epoch": 0.6917188164530045, + "grad_norm": 0.3450571894645691, + "learning_rate": 2.1802555995196273e-05, + "loss": 0.9568, + "step": 11385 + }, + { + "epoch": 0.691779573485631, + "grad_norm": 0.2267465442419052, + "learning_rate": 2.179465122321021e-05, + "loss": 1.1404, + "step": 11386 + }, + { + "epoch": 0.6918403305182574, + "grad_norm": 0.1363375186920166, + "learning_rate": 2.1786747485096665e-05, + "loss": 1.0125, + "step": 11387 + }, + { + "epoch": 0.691901087550884, + "grad_norm": 0.2029709368944168, + "learning_rate": 2.1778844781145347e-05, + "loss": 1.0236, + "step": 11388 + }, + { + "epoch": 0.6919618445835105, + "grad_norm": 0.15069545805454254, + "learning_rate": 2.1770943111645918e-05, + "loss": 1.0264, + "step": 11389 + }, + { + "epoch": 0.692022601616137, + "grad_norm": 0.163344606757164, + "learning_rate": 2.1763042476888036e-05, + "loss": 1.0488, + "step": 11390 + }, + { + "epoch": 0.6920833586487636, + "grad_norm": 0.13086891174316406, + "learning_rate": 2.1755142877161282e-05, + "loss": 1.0315, + "step": 11391 + }, + { + "epoch": 0.6921441156813901, + "grad_norm": 0.16269154846668243, + "learning_rate": 2.1747244312755218e-05, + "loss": 1.0962, + "step": 11392 + }, + { + "epoch": 0.6922048727140167, + "grad_norm": 0.14945822954177856, + "learning_rate": 2.1739346783959387e-05, + "loss": 1.1243, + "step": 11393 + }, + { + "epoch": 0.6922656297466432, + "grad_norm": 0.1559775173664093, + "learning_rate": 2.1731450291063275e-05, + "loss": 1.106, + "step": 11394 + }, + { + "epoch": 0.6923263867792697, + "grad_norm": 0.16421636939048767, + "learning_rate": 2.1723554834356307e-05, + "loss": 1.1212, + "step": 11395 + }, + { + "epoch": 0.6923871438118963, + "grad_norm": 0.1422232836484909, + "learning_rate": 2.1715660414127896e-05, + "loss": 1.0992, + "step": 11396 + }, + { + "epoch": 0.6924479008445228, + "grad_norm": 0.2752242088317871, + "learning_rate": 2.1707767030667403e-05, + "loss": 1.1142, + "step": 11397 + }, + { + "epoch": 0.6925086578771493, + "grad_norm": 0.11977748572826385, + "learning_rate": 2.16998746842642e-05, + "loss": 1.0349, + "step": 11398 + }, + { + "epoch": 0.6925694149097759, + "grad_norm": 0.13460689783096313, + "learning_rate": 2.1691983375207557e-05, + "loss": 1.0567, + "step": 11399 + }, + { + "epoch": 0.6926301719424023, + "grad_norm": 0.09862243384122849, + "learning_rate": 2.168409310378674e-05, + "loss": 1.0159, + "step": 11400 + }, + { + "epoch": 0.6926909289750288, + "grad_norm": 0.10983823984861374, + "learning_rate": 2.167620387029098e-05, + "loss": 1.0426, + "step": 11401 + }, + { + "epoch": 0.6927516860076554, + "grad_norm": 1.2687128782272339, + "learning_rate": 2.1668315675009404e-05, + "loss": 1.0385, + "step": 11402 + }, + { + "epoch": 0.6928124430402819, + "grad_norm": 0.8386787176132202, + "learning_rate": 2.1660428518231207e-05, + "loss": 1.0136, + "step": 11403 + }, + { + "epoch": 0.6928732000729084, + "grad_norm": 0.14497166872024536, + "learning_rate": 2.165254240024549e-05, + "loss": 1.0494, + "step": 11404 + }, + { + "epoch": 0.692933957105535, + "grad_norm": 0.4801327884197235, + "learning_rate": 2.1644657321341304e-05, + "loss": 1.2055, + "step": 11405 + }, + { + "epoch": 0.6929947141381615, + "grad_norm": 0.6916252374649048, + "learning_rate": 2.1636773281807687e-05, + "loss": 1.253, + "step": 11406 + }, + { + "epoch": 0.693055471170788, + "grad_norm": 0.10596638172864914, + "learning_rate": 2.1628890281933627e-05, + "loss": 1.048, + "step": 11407 + }, + { + "epoch": 0.6931162282034146, + "grad_norm": 0.35621994733810425, + "learning_rate": 2.162100832200808e-05, + "loss": 1.0984, + "step": 11408 + }, + { + "epoch": 0.6931769852360411, + "grad_norm": 0.23045781254768372, + "learning_rate": 2.1613127402319954e-05, + "loss": 1.0596, + "step": 11409 + }, + { + "epoch": 0.6932377422686676, + "grad_norm": 0.20667323470115662, + "learning_rate": 2.1605247523158133e-05, + "loss": 1.1198, + "step": 11410 + }, + { + "epoch": 0.6932984993012942, + "grad_norm": 0.1826099157333374, + "learning_rate": 2.1597368684811454e-05, + "loss": 1.1868, + "step": 11411 + }, + { + "epoch": 0.6933592563339207, + "grad_norm": 0.15626446902751923, + "learning_rate": 2.1589490887568715e-05, + "loss": 1.0739, + "step": 11412 + }, + { + "epoch": 0.6934200133665471, + "grad_norm": 0.12921178340911865, + "learning_rate": 2.1581614131718663e-05, + "loss": 1.0857, + "step": 11413 + }, + { + "epoch": 0.6934807703991737, + "grad_norm": 0.2649064362049103, + "learning_rate": 2.1573738417550072e-05, + "loss": 1.1169, + "step": 11414 + }, + { + "epoch": 0.6935415274318002, + "grad_norm": 0.1484181433916092, + "learning_rate": 2.1565863745351577e-05, + "loss": 1.1053, + "step": 11415 + }, + { + "epoch": 0.6936022844644267, + "grad_norm": 0.4540680944919586, + "learning_rate": 2.1557990115411843e-05, + "loss": 1.1722, + "step": 11416 + }, + { + "epoch": 0.6936630414970533, + "grad_norm": 0.22185561060905457, + "learning_rate": 2.1550117528019474e-05, + "loss": 1.0228, + "step": 11417 + }, + { + "epoch": 0.6937237985296798, + "grad_norm": 0.13270683586597443, + "learning_rate": 2.1542245983463035e-05, + "loss": 1.0491, + "step": 11418 + }, + { + "epoch": 0.6937845555623063, + "grad_norm": 0.349508672952652, + "learning_rate": 2.153437548203108e-05, + "loss": 1.0694, + "step": 11419 + }, + { + "epoch": 0.6938453125949329, + "grad_norm": 0.2577899396419525, + "learning_rate": 2.15265060240121e-05, + "loss": 1.0473, + "step": 11420 + }, + { + "epoch": 0.6939060696275594, + "grad_norm": 0.12111520767211914, + "learning_rate": 2.1518637609694565e-05, + "loss": 1.0436, + "step": 11421 + }, + { + "epoch": 0.6939668266601859, + "grad_norm": 0.1914292424917221, + "learning_rate": 2.151077023936685e-05, + "loss": 1.1916, + "step": 11422 + }, + { + "epoch": 0.6940275836928125, + "grad_norm": 0.1535150110721588, + "learning_rate": 2.1502903913317346e-05, + "loss": 1.0549, + "step": 11423 + }, + { + "epoch": 0.694088340725439, + "grad_norm": 0.30049237608909607, + "learning_rate": 2.1495038631834426e-05, + "loss": 1.0861, + "step": 11424 + }, + { + "epoch": 0.6941490977580655, + "grad_norm": 0.15553316473960876, + "learning_rate": 2.148717439520637e-05, + "loss": 1.0094, + "step": 11425 + }, + { + "epoch": 0.694209854790692, + "grad_norm": 0.1438228189945221, + "learning_rate": 2.1479311203721452e-05, + "loss": 0.9906, + "step": 11426 + }, + { + "epoch": 0.6942706118233185, + "grad_norm": 0.11101258546113968, + "learning_rate": 2.1471449057667907e-05, + "loss": 1.0369, + "step": 11427 + }, + { + "epoch": 0.694331368855945, + "grad_norm": 0.16801387071609497, + "learning_rate": 2.146358795733388e-05, + "loss": 1.189, + "step": 11428 + }, + { + "epoch": 0.6943921258885716, + "grad_norm": 0.15543152391910553, + "learning_rate": 2.145572790300757e-05, + "loss": 1.0902, + "step": 11429 + }, + { + "epoch": 0.6944528829211981, + "grad_norm": 0.36190688610076904, + "learning_rate": 2.1447868894977073e-05, + "loss": 1.2635, + "step": 11430 + }, + { + "epoch": 0.6945136399538246, + "grad_norm": 0.1915113925933838, + "learning_rate": 2.1440010933530456e-05, + "loss": 1.0793, + "step": 11431 + }, + { + "epoch": 0.6945743969864512, + "grad_norm": 0.13725712895393372, + "learning_rate": 2.1432154018955763e-05, + "loss": 1.0968, + "step": 11432 + }, + { + "epoch": 0.6946351540190777, + "grad_norm": 0.17632752656936646, + "learning_rate": 2.1424298151540983e-05, + "loss": 1.122, + "step": 11433 + }, + { + "epoch": 0.6946959110517043, + "grad_norm": 0.16672050952911377, + "learning_rate": 2.141644333157408e-05, + "loss": 1.0549, + "step": 11434 + }, + { + "epoch": 0.6947566680843308, + "grad_norm": 0.9295737743377686, + "learning_rate": 2.1408589559342974e-05, + "loss": 1.117, + "step": 11435 + }, + { + "epoch": 0.6948174251169573, + "grad_norm": 0.15703879296779633, + "learning_rate": 2.1400736835135545e-05, + "loss": 1.0562, + "step": 11436 + }, + { + "epoch": 0.6948781821495839, + "grad_norm": 0.15473277866840363, + "learning_rate": 2.1392885159239628e-05, + "loss": 1.0567, + "step": 11437 + }, + { + "epoch": 0.6949389391822104, + "grad_norm": 0.1298961490392685, + "learning_rate": 2.138503453194304e-05, + "loss": 1.0552, + "step": 11438 + }, + { + "epoch": 0.6949996962148368, + "grad_norm": 0.25411659479141235, + "learning_rate": 2.1377184953533526e-05, + "loss": 1.0615, + "step": 11439 + }, + { + "epoch": 0.6950604532474634, + "grad_norm": 0.12521715462207794, + "learning_rate": 2.136933642429884e-05, + "loss": 1.0257, + "step": 11440 + }, + { + "epoch": 0.6951212102800899, + "grad_norm": 0.11640459299087524, + "learning_rate": 2.1361488944526686e-05, + "loss": 1.0124, + "step": 11441 + }, + { + "epoch": 0.6951819673127164, + "grad_norm": 0.146500825881958, + "learning_rate": 2.1353642514504675e-05, + "loss": 1.0278, + "step": 11442 + }, + { + "epoch": 0.695242724345343, + "grad_norm": 0.12997595965862274, + "learning_rate": 2.134579713452043e-05, + "loss": 1.0052, + "step": 11443 + }, + { + "epoch": 0.6953034813779695, + "grad_norm": 0.22321529686450958, + "learning_rate": 2.1337952804861515e-05, + "loss": 1.2223, + "step": 11444 + }, + { + "epoch": 0.695364238410596, + "grad_norm": 0.9836867451667786, + "learning_rate": 2.1330109525815494e-05, + "loss": 1.1228, + "step": 11445 + }, + { + "epoch": 0.6954249954432226, + "grad_norm": 0.22070465981960297, + "learning_rate": 2.132226729766985e-05, + "loss": 1.1562, + "step": 11446 + }, + { + "epoch": 0.6954857524758491, + "grad_norm": 0.1521066129207611, + "learning_rate": 2.1314426120712066e-05, + "loss": 1.1043, + "step": 11447 + }, + { + "epoch": 0.6955465095084756, + "grad_norm": 0.18961137533187866, + "learning_rate": 2.13065859952295e-05, + "loss": 1.0512, + "step": 11448 + }, + { + "epoch": 0.6956072665411022, + "grad_norm": 0.13366352021694183, + "learning_rate": 2.1298746921509582e-05, + "loss": 1.1128, + "step": 11449 + }, + { + "epoch": 0.6956680235737287, + "grad_norm": 0.10963138937950134, + "learning_rate": 2.1290908899839653e-05, + "loss": 0.979, + "step": 11450 + }, + { + "epoch": 0.6957287806063552, + "grad_norm": 0.18463166058063507, + "learning_rate": 2.1283071930506998e-05, + "loss": 1.0449, + "step": 11451 + }, + { + "epoch": 0.6957895376389817, + "grad_norm": 0.3294398784637451, + "learning_rate": 2.12752360137989e-05, + "loss": 1.1423, + "step": 11452 + }, + { + "epoch": 0.6958502946716082, + "grad_norm": 0.1256033331155777, + "learning_rate": 2.126740115000257e-05, + "loss": 1.0647, + "step": 11453 + }, + { + "epoch": 0.6959110517042347, + "grad_norm": 0.09753338247537613, + "learning_rate": 2.1259567339405213e-05, + "loss": 1.0281, + "step": 11454 + }, + { + "epoch": 0.6959718087368613, + "grad_norm": 0.13088829815387726, + "learning_rate": 2.1251734582293963e-05, + "loss": 1.0716, + "step": 11455 + }, + { + "epoch": 0.6960325657694878, + "grad_norm": 0.15262579917907715, + "learning_rate": 2.1243902878955946e-05, + "loss": 1.0808, + "step": 11456 + }, + { + "epoch": 0.6960933228021143, + "grad_norm": 0.1111823245882988, + "learning_rate": 2.1236072229678223e-05, + "loss": 1.0569, + "step": 11457 + }, + { + "epoch": 0.6961540798347409, + "grad_norm": 0.18385662138462067, + "learning_rate": 2.1228242634747835e-05, + "loss": 1.0541, + "step": 11458 + }, + { + "epoch": 0.6962148368673674, + "grad_norm": 0.18422475457191467, + "learning_rate": 2.1220414094451752e-05, + "loss": 1.0532, + "step": 11459 + }, + { + "epoch": 0.6962755938999939, + "grad_norm": 0.1610908806324005, + "learning_rate": 2.121258660907698e-05, + "loss": 1.1046, + "step": 11460 + }, + { + "epoch": 0.6963363509326205, + "grad_norm": 0.11393465846776962, + "learning_rate": 2.1204760178910422e-05, + "loss": 1.0313, + "step": 11461 + }, + { + "epoch": 0.696397107965247, + "grad_norm": 0.16193285584449768, + "learning_rate": 2.119693480423893e-05, + "loss": 1.0794, + "step": 11462 + }, + { + "epoch": 0.6964578649978735, + "grad_norm": 0.212175652384758, + "learning_rate": 2.1189110485349354e-05, + "loss": 1.138, + "step": 11463 + }, + { + "epoch": 0.6965186220305001, + "grad_norm": 0.11896412819623947, + "learning_rate": 2.118128722252849e-05, + "loss": 1.0222, + "step": 11464 + }, + { + "epoch": 0.6965793790631265, + "grad_norm": 0.1666450798511505, + "learning_rate": 2.1173465016063134e-05, + "loss": 1.1224, + "step": 11465 + }, + { + "epoch": 0.696640136095753, + "grad_norm": 0.15710842609405518, + "learning_rate": 2.1165643866239982e-05, + "loss": 1.089, + "step": 11466 + }, + { + "epoch": 0.6967008931283796, + "grad_norm": 0.14782632887363434, + "learning_rate": 2.1157823773345752e-05, + "loss": 0.9961, + "step": 11467 + }, + { + "epoch": 0.6967616501610061, + "grad_norm": 0.14348682761192322, + "learning_rate": 2.1150004737667045e-05, + "loss": 1.0893, + "step": 11468 + }, + { + "epoch": 0.6968224071936326, + "grad_norm": 0.12337322533130646, + "learning_rate": 2.1142186759490473e-05, + "loss": 1.0436, + "step": 11469 + }, + { + "epoch": 0.6968831642262592, + "grad_norm": 0.1315898895263672, + "learning_rate": 2.1134369839102646e-05, + "loss": 1.0505, + "step": 11470 + }, + { + "epoch": 0.6969439212588857, + "grad_norm": 0.405222624540329, + "learning_rate": 2.112655397679007e-05, + "loss": 1.1287, + "step": 11471 + }, + { + "epoch": 0.6970046782915122, + "grad_norm": 0.2001056671142578, + "learning_rate": 2.1118739172839236e-05, + "loss": 1.1667, + "step": 11472 + }, + { + "epoch": 0.6970654353241388, + "grad_norm": 0.16974933445453644, + "learning_rate": 2.1110925427536604e-05, + "loss": 1.1377, + "step": 11473 + }, + { + "epoch": 0.6971261923567653, + "grad_norm": 0.12256669998168945, + "learning_rate": 2.1103112741168578e-05, + "loss": 1.025, + "step": 11474 + }, + { + "epoch": 0.6971869493893919, + "grad_norm": 0.1347525417804718, + "learning_rate": 2.109530111402155e-05, + "loss": 1.0915, + "step": 11475 + }, + { + "epoch": 0.6972477064220184, + "grad_norm": 0.11152520030736923, + "learning_rate": 2.108749054638184e-05, + "loss": 1.0437, + "step": 11476 + }, + { + "epoch": 0.6973084634546449, + "grad_norm": 0.40980181097984314, + "learning_rate": 2.1079681038535753e-05, + "loss": 1.2565, + "step": 11477 + }, + { + "epoch": 0.6973692204872713, + "grad_norm": 0.1527237445116043, + "learning_rate": 2.1071872590769554e-05, + "loss": 1.0618, + "step": 11478 + }, + { + "epoch": 0.6974299775198979, + "grad_norm": 0.2353675663471222, + "learning_rate": 2.1064065203369448e-05, + "loss": 1.1475, + "step": 11479 + }, + { + "epoch": 0.6974907345525244, + "grad_norm": 0.18684354424476624, + "learning_rate": 2.1056258876621616e-05, + "loss": 1.1217, + "step": 11480 + }, + { + "epoch": 0.697551491585151, + "grad_norm": 0.13205884397029877, + "learning_rate": 2.104845361081224e-05, + "loss": 1.0379, + "step": 11481 + }, + { + "epoch": 0.6976122486177775, + "grad_norm": 0.16660812497138977, + "learning_rate": 2.1040649406227376e-05, + "loss": 1.1084, + "step": 11482 + }, + { + "epoch": 0.697673005650404, + "grad_norm": 0.24975334107875824, + "learning_rate": 2.1032846263153105e-05, + "loss": 1.1076, + "step": 11483 + }, + { + "epoch": 0.6977337626830306, + "grad_norm": 0.20460328459739685, + "learning_rate": 2.1025044181875453e-05, + "loss": 1.0736, + "step": 11484 + }, + { + "epoch": 0.6977945197156571, + "grad_norm": 0.11432195454835892, + "learning_rate": 2.101724316268039e-05, + "loss": 1.0284, + "step": 11485 + }, + { + "epoch": 0.6978552767482836, + "grad_norm": 0.15732809901237488, + "learning_rate": 2.1009443205853902e-05, + "loss": 1.1179, + "step": 11486 + }, + { + "epoch": 0.6979160337809102, + "grad_norm": 0.12171676009893417, + "learning_rate": 2.1001644311681873e-05, + "loss": 1.0094, + "step": 11487 + }, + { + "epoch": 0.6979767908135367, + "grad_norm": 0.14438176155090332, + "learning_rate": 2.0993846480450198e-05, + "loss": 1.0545, + "step": 11488 + }, + { + "epoch": 0.6980375478461632, + "grad_norm": 0.1527024507522583, + "learning_rate": 2.0986049712444666e-05, + "loss": 1.0182, + "step": 11489 + }, + { + "epoch": 0.6980983048787898, + "grad_norm": 0.1665276437997818, + "learning_rate": 2.0978254007951077e-05, + "loss": 1.0243, + "step": 11490 + }, + { + "epoch": 0.6981590619114163, + "grad_norm": 0.11962895095348358, + "learning_rate": 2.0970459367255207e-05, + "loss": 1.1142, + "step": 11491 + }, + { + "epoch": 0.6982198189440427, + "grad_norm": 0.10681907087564468, + "learning_rate": 2.0962665790642765e-05, + "loss": 1.0157, + "step": 11492 + }, + { + "epoch": 0.6982805759766693, + "grad_norm": 0.31860029697418213, + "learning_rate": 2.0954873278399422e-05, + "loss": 1.0798, + "step": 11493 + }, + { + "epoch": 0.6983413330092958, + "grad_norm": 0.7815797328948975, + "learning_rate": 2.0947081830810833e-05, + "loss": 1.1652, + "step": 11494 + }, + { + "epoch": 0.6984020900419223, + "grad_norm": 0.2584242522716522, + "learning_rate": 2.0939291448162534e-05, + "loss": 1.1049, + "step": 11495 + }, + { + "epoch": 0.6984628470745489, + "grad_norm": 0.23324842751026154, + "learning_rate": 2.093150213074015e-05, + "loss": 1.0847, + "step": 11496 + }, + { + "epoch": 0.6985236041071754, + "grad_norm": 0.14944741129875183, + "learning_rate": 2.0923713878829165e-05, + "loss": 1.0428, + "step": 11497 + }, + { + "epoch": 0.6985843611398019, + "grad_norm": 0.260637104511261, + "learning_rate": 2.0915926692715073e-05, + "loss": 1.1506, + "step": 11498 + }, + { + "epoch": 0.6986451181724285, + "grad_norm": 0.13984835147857666, + "learning_rate": 2.090814057268331e-05, + "loss": 1.0728, + "step": 11499 + }, + { + "epoch": 0.698705875205055, + "grad_norm": 0.15960261225700378, + "learning_rate": 2.0900355519019277e-05, + "loss": 1.0212, + "step": 11500 + }, + { + "epoch": 0.6987666322376815, + "grad_norm": 0.10564113408327103, + "learning_rate": 2.0892571532008338e-05, + "loss": 1.019, + "step": 11501 + }, + { + "epoch": 0.6988273892703081, + "grad_norm": 0.19508783519268036, + "learning_rate": 2.0884788611935813e-05, + "loss": 1.1809, + "step": 11502 + }, + { + "epoch": 0.6988881463029346, + "grad_norm": 0.10866780579090118, + "learning_rate": 2.0877006759086993e-05, + "loss": 1.0216, + "step": 11503 + }, + { + "epoch": 0.6989489033355611, + "grad_norm": 0.17517589032649994, + "learning_rate": 2.086922597374712e-05, + "loss": 1.0548, + "step": 11504 + }, + { + "epoch": 0.6990096603681876, + "grad_norm": 0.13531221449375153, + "learning_rate": 2.0861446256201394e-05, + "loss": 1.0234, + "step": 11505 + }, + { + "epoch": 0.6990704174008141, + "grad_norm": 0.14674626290798187, + "learning_rate": 2.085366760673497e-05, + "loss": 1.1378, + "step": 11506 + }, + { + "epoch": 0.6991311744334406, + "grad_norm": 0.63823401927948, + "learning_rate": 2.084589002563302e-05, + "loss": 1.0373, + "step": 11507 + }, + { + "epoch": 0.6991919314660672, + "grad_norm": 0.2077614665031433, + "learning_rate": 2.083811351318062e-05, + "loss": 1.008, + "step": 11508 + }, + { + "epoch": 0.6992526884986937, + "grad_norm": 0.14270423352718353, + "learning_rate": 2.0830338069662785e-05, + "loss": 1.1257, + "step": 11509 + }, + { + "epoch": 0.6993134455313202, + "grad_norm": 0.1445399820804596, + "learning_rate": 2.082256369536455e-05, + "loss": 1.0397, + "step": 11510 + }, + { + "epoch": 0.6993742025639468, + "grad_norm": 0.17797310650348663, + "learning_rate": 2.0814790390570865e-05, + "loss": 1.0898, + "step": 11511 + }, + { + "epoch": 0.6994349595965733, + "grad_norm": 0.1764162927865982, + "learning_rate": 2.08070181555667e-05, + "loss": 1.078, + "step": 11512 + }, + { + "epoch": 0.6994957166291998, + "grad_norm": 0.19250036776065826, + "learning_rate": 2.0799246990636922e-05, + "loss": 1.0716, + "step": 11513 + }, + { + "epoch": 0.6995564736618264, + "grad_norm": 1.214611291885376, + "learning_rate": 2.0791476896066407e-05, + "loss": 1.0709, + "step": 11514 + }, + { + "epoch": 0.6996172306944529, + "grad_norm": 0.1410617232322693, + "learning_rate": 2.078370787213994e-05, + "loss": 1.0417, + "step": 11515 + }, + { + "epoch": 0.6996779877270795, + "grad_norm": 2.257185935974121, + "learning_rate": 2.0775939919142284e-05, + "loss": 1.1272, + "step": 11516 + }, + { + "epoch": 0.699738744759706, + "grad_norm": 0.19316335022449493, + "learning_rate": 2.0768173037358224e-05, + "loss": 1.0308, + "step": 11517 + }, + { + "epoch": 0.6997995017923324, + "grad_norm": 0.27087971568107605, + "learning_rate": 2.0760407227072424e-05, + "loss": 1.1541, + "step": 11518 + }, + { + "epoch": 0.699860258824959, + "grad_norm": 0.10225939005613327, + "learning_rate": 2.0752642488569556e-05, + "loss": 1.0157, + "step": 11519 + }, + { + "epoch": 0.6999210158575855, + "grad_norm": 0.10104874521493912, + "learning_rate": 2.074487882213423e-05, + "loss": 1.047, + "step": 11520 + }, + { + "epoch": 0.699981772890212, + "grad_norm": 0.4039386510848999, + "learning_rate": 2.0737116228051023e-05, + "loss": 1.0645, + "step": 11521 + }, + { + "epoch": 0.7000425299228386, + "grad_norm": 0.11678233742713928, + "learning_rate": 2.0729354706604476e-05, + "loss": 1.0396, + "step": 11522 + }, + { + "epoch": 0.7001032869554651, + "grad_norm": 11.37902545928955, + "learning_rate": 2.0721594258079093e-05, + "loss": 1.0509, + "step": 11523 + }, + { + "epoch": 0.7001640439880916, + "grad_norm": 0.1936226338148117, + "learning_rate": 2.071383488275933e-05, + "loss": 1.0874, + "step": 11524 + }, + { + "epoch": 0.7002248010207182, + "grad_norm": 0.11814224720001221, + "learning_rate": 2.070607658092961e-05, + "loss": 1.0227, + "step": 11525 + }, + { + "epoch": 0.7002855580533447, + "grad_norm": 0.14713111519813538, + "learning_rate": 2.069831935287432e-05, + "loss": 1.0856, + "step": 11526 + }, + { + "epoch": 0.7003463150859712, + "grad_norm": 3.862598180770874, + "learning_rate": 2.0690563198877776e-05, + "loss": 1.0789, + "step": 11527 + }, + { + "epoch": 0.7004070721185978, + "grad_norm": 0.1093360111117363, + "learning_rate": 2.068280811922434e-05, + "loss": 1.0277, + "step": 11528 + }, + { + "epoch": 0.7004678291512243, + "grad_norm": 0.1838693618774414, + "learning_rate": 2.067505411419822e-05, + "loss": 1.0592, + "step": 11529 + }, + { + "epoch": 0.7005285861838508, + "grad_norm": 0.1865403652191162, + "learning_rate": 2.066730118408366e-05, + "loss": 1.0853, + "step": 11530 + }, + { + "epoch": 0.7005893432164773, + "grad_norm": 0.14745014905929565, + "learning_rate": 2.0659549329164845e-05, + "loss": 1.0585, + "step": 11531 + }, + { + "epoch": 0.7006501002491038, + "grad_norm": 0.1056385412812233, + "learning_rate": 2.0651798549725903e-05, + "loss": 1.0336, + "step": 11532 + }, + { + "epoch": 0.7007108572817303, + "grad_norm": 0.11104696989059448, + "learning_rate": 2.0644048846050977e-05, + "loss": 1.0837, + "step": 11533 + }, + { + "epoch": 0.7007716143143569, + "grad_norm": 0.21141260862350464, + "learning_rate": 2.0636300218424136e-05, + "loss": 1.1472, + "step": 11534 + }, + { + "epoch": 0.7008323713469834, + "grad_norm": 6.516228199005127, + "learning_rate": 2.0628552667129363e-05, + "loss": 1.0657, + "step": 11535 + }, + { + "epoch": 0.7008931283796099, + "grad_norm": 0.24817363917827606, + "learning_rate": 2.062080619245067e-05, + "loss": 1.0763, + "step": 11536 + }, + { + "epoch": 0.7009538854122365, + "grad_norm": 0.47558996081352234, + "learning_rate": 2.061306079467199e-05, + "loss": 1.2482, + "step": 11537 + }, + { + "epoch": 0.701014642444863, + "grad_norm": 0.11814045161008835, + "learning_rate": 2.0605316474077264e-05, + "loss": 1.0928, + "step": 11538 + }, + { + "epoch": 0.7010753994774895, + "grad_norm": 0.18713463842868805, + "learning_rate": 2.0597573230950347e-05, + "loss": 1.0827, + "step": 11539 + }, + { + "epoch": 0.7011361565101161, + "grad_norm": 0.2151116132736206, + "learning_rate": 2.058983106557506e-05, + "loss": 1.0757, + "step": 11540 + }, + { + "epoch": 0.7011969135427426, + "grad_norm": 0.12864471971988678, + "learning_rate": 2.0582089978235204e-05, + "loss": 1.037, + "step": 11541 + }, + { + "epoch": 0.7012576705753691, + "grad_norm": 0.14003022015094757, + "learning_rate": 2.0574349969214524e-05, + "loss": 1.0369, + "step": 11542 + }, + { + "epoch": 0.7013184276079957, + "grad_norm": 0.19501356780529022, + "learning_rate": 2.0566611038796735e-05, + "loss": 1.068, + "step": 11543 + }, + { + "epoch": 0.7013791846406221, + "grad_norm": 0.11872688680887222, + "learning_rate": 2.055887318726551e-05, + "loss": 1.0429, + "step": 11544 + }, + { + "epoch": 0.7014399416732486, + "grad_norm": 0.12238573282957077, + "learning_rate": 2.0551136414904475e-05, + "loss": 0.9964, + "step": 11545 + }, + { + "epoch": 0.7015006987058752, + "grad_norm": 0.16445164382457733, + "learning_rate": 2.054340072199723e-05, + "loss": 1.0246, + "step": 11546 + }, + { + "epoch": 0.7015614557385017, + "grad_norm": 0.13491672277450562, + "learning_rate": 2.0535666108827323e-05, + "loss": 1.0629, + "step": 11547 + }, + { + "epoch": 0.7016222127711282, + "grad_norm": 0.20657508075237274, + "learning_rate": 2.052793257567827e-05, + "loss": 1.1592, + "step": 11548 + }, + { + "epoch": 0.7016829698037548, + "grad_norm": 0.1072314977645874, + "learning_rate": 2.052020012283355e-05, + "loss": 1.004, + "step": 11549 + }, + { + "epoch": 0.7017437268363813, + "grad_norm": 0.13779258728027344, + "learning_rate": 2.0512468750576592e-05, + "loss": 1.0574, + "step": 11550 + }, + { + "epoch": 0.7018044838690078, + "grad_norm": 0.16147683560848236, + "learning_rate": 2.0504738459190785e-05, + "loss": 1.1305, + "step": 11551 + }, + { + "epoch": 0.7018652409016344, + "grad_norm": 0.1400379240512848, + "learning_rate": 2.04970092489595e-05, + "loss": 1.0113, + "step": 11552 + }, + { + "epoch": 0.7019259979342609, + "grad_norm": 0.38990890979766846, + "learning_rate": 2.048928112016602e-05, + "loss": 1.1475, + "step": 11553 + }, + { + "epoch": 0.7019867549668874, + "grad_norm": 0.12363315373659134, + "learning_rate": 2.0481554073093666e-05, + "loss": 1.0476, + "step": 11554 + }, + { + "epoch": 0.702047511999514, + "grad_norm": 0.17727401852607727, + "learning_rate": 2.0473828108025666e-05, + "loss": 1.124, + "step": 11555 + }, + { + "epoch": 0.7021082690321405, + "grad_norm": 0.11687791347503662, + "learning_rate": 2.0466103225245192e-05, + "loss": 1.0628, + "step": 11556 + }, + { + "epoch": 0.7021690260647669, + "grad_norm": 0.12113591283559799, + "learning_rate": 2.045837942503539e-05, + "loss": 1.069, + "step": 11557 + }, + { + "epoch": 0.7022297830973935, + "grad_norm": 0.20920348167419434, + "learning_rate": 2.0450656707679422e-05, + "loss": 1.303, + "step": 11558 + }, + { + "epoch": 0.70229054013002, + "grad_norm": 0.10549594461917877, + "learning_rate": 2.0442935073460345e-05, + "loss": 1.0291, + "step": 11559 + }, + { + "epoch": 0.7023512971626465, + "grad_norm": 0.22226223349571228, + "learning_rate": 2.0435214522661194e-05, + "loss": 1.0751, + "step": 11560 + }, + { + "epoch": 0.7024120541952731, + "grad_norm": 0.10856860131025314, + "learning_rate": 2.042749505556499e-05, + "loss": 1.0625, + "step": 11561 + }, + { + "epoch": 0.7024728112278996, + "grad_norm": 0.09366056323051453, + "learning_rate": 2.0419776672454634e-05, + "loss": 1.0127, + "step": 11562 + }, + { + "epoch": 0.7025335682605262, + "grad_norm": 0.15423716604709625, + "learning_rate": 2.0412059373613096e-05, + "loss": 1.1022, + "step": 11563 + }, + { + "epoch": 0.7025943252931527, + "grad_norm": 0.1703069806098938, + "learning_rate": 2.0404343159323243e-05, + "loss": 1.0425, + "step": 11564 + }, + { + "epoch": 0.7026550823257792, + "grad_norm": 0.1194060817360878, + "learning_rate": 2.039662802986791e-05, + "loss": 1.0708, + "step": 11565 + }, + { + "epoch": 0.7027158393584058, + "grad_norm": 0.18426650762557983, + "learning_rate": 2.03889139855299e-05, + "loss": 1.1813, + "step": 11566 + }, + { + "epoch": 0.7027765963910323, + "grad_norm": 0.17231197655200958, + "learning_rate": 2.038120102659198e-05, + "loss": 1.1226, + "step": 11567 + }, + { + "epoch": 0.7028373534236588, + "grad_norm": 2.869856595993042, + "learning_rate": 2.037348915333685e-05, + "loss": 1.1197, + "step": 11568 + }, + { + "epoch": 0.7028981104562854, + "grad_norm": 0.11700932681560516, + "learning_rate": 2.0365778366047206e-05, + "loss": 1.0465, + "step": 11569 + }, + { + "epoch": 0.7029588674889118, + "grad_norm": 0.29313933849334717, + "learning_rate": 2.0358068665005685e-05, + "loss": 1.0747, + "step": 11570 + }, + { + "epoch": 0.7030196245215383, + "grad_norm": 0.12552626430988312, + "learning_rate": 2.0350360050494888e-05, + "loss": 1.0245, + "step": 11571 + }, + { + "epoch": 0.7030803815541649, + "grad_norm": 0.19134995341300964, + "learning_rate": 2.0342652522797374e-05, + "loss": 1.0625, + "step": 11572 + }, + { + "epoch": 0.7031411385867914, + "grad_norm": 0.11859611421823502, + "learning_rate": 2.0334946082195643e-05, + "loss": 1.0288, + "step": 11573 + }, + { + "epoch": 0.7032018956194179, + "grad_norm": 0.1901741623878479, + "learning_rate": 2.0327240728972215e-05, + "loss": 1.1176, + "step": 11574 + }, + { + "epoch": 0.7032626526520445, + "grad_norm": 0.22749671339988708, + "learning_rate": 2.0319536463409533e-05, + "loss": 1.0367, + "step": 11575 + }, + { + "epoch": 0.703323409684671, + "grad_norm": 0.16825591027736664, + "learning_rate": 2.0311833285789968e-05, + "loss": 0.9985, + "step": 11576 + }, + { + "epoch": 0.7033841667172975, + "grad_norm": 0.25161734223365784, + "learning_rate": 2.030413119639588e-05, + "loss": 1.0393, + "step": 11577 + }, + { + "epoch": 0.7034449237499241, + "grad_norm": 0.14071886241436005, + "learning_rate": 2.0296430195509597e-05, + "loss": 1.0845, + "step": 11578 + }, + { + "epoch": 0.7035056807825506, + "grad_norm": 0.35491394996643066, + "learning_rate": 2.0288730283413416e-05, + "loss": 1.153, + "step": 11579 + }, + { + "epoch": 0.7035664378151771, + "grad_norm": 2.2419357299804688, + "learning_rate": 2.0281031460389576e-05, + "loss": 0.9888, + "step": 11580 + }, + { + "epoch": 0.7036271948478037, + "grad_norm": 0.13546213507652283, + "learning_rate": 2.0273333726720284e-05, + "loss": 1.0679, + "step": 11581 + }, + { + "epoch": 0.7036879518804302, + "grad_norm": 0.11944033205509186, + "learning_rate": 2.0265637082687677e-05, + "loss": 1.0736, + "step": 11582 + }, + { + "epoch": 0.7037487089130566, + "grad_norm": 0.2002701759338379, + "learning_rate": 2.0257941528573872e-05, + "loss": 1.2367, + "step": 11583 + }, + { + "epoch": 0.7038094659456832, + "grad_norm": 0.23972612619400024, + "learning_rate": 2.0250247064660987e-05, + "loss": 1.1442, + "step": 11584 + }, + { + "epoch": 0.7038702229783097, + "grad_norm": 0.19439685344696045, + "learning_rate": 2.024255369123104e-05, + "loss": 1.1399, + "step": 11585 + }, + { + "epoch": 0.7039309800109362, + "grad_norm": 0.1960258185863495, + "learning_rate": 2.0234861408566052e-05, + "loss": 1.13, + "step": 11586 + }, + { + "epoch": 0.7039917370435628, + "grad_norm": 2.3999454975128174, + "learning_rate": 2.0227170216947967e-05, + "loss": 1.1277, + "step": 11587 + }, + { + "epoch": 0.7040524940761893, + "grad_norm": 0.16559021174907684, + "learning_rate": 2.0219480116658707e-05, + "loss": 1.0738, + "step": 11588 + }, + { + "epoch": 0.7041132511088158, + "grad_norm": 0.17407269775867462, + "learning_rate": 2.0211791107980167e-05, + "loss": 0.9817, + "step": 11589 + }, + { + "epoch": 0.7041740081414424, + "grad_norm": 0.24084053933620453, + "learning_rate": 2.0204103191194178e-05, + "loss": 1.2063, + "step": 11590 + }, + { + "epoch": 0.7042347651740689, + "grad_norm": 0.14700211584568024, + "learning_rate": 2.0196416366582543e-05, + "loss": 1.0464, + "step": 11591 + }, + { + "epoch": 0.7042955222066954, + "grad_norm": 0.13604646921157837, + "learning_rate": 2.0188730634427035e-05, + "loss": 1.0574, + "step": 11592 + }, + { + "epoch": 0.704356279239322, + "grad_norm": 0.18877767026424408, + "learning_rate": 2.018104599500936e-05, + "loss": 1.0423, + "step": 11593 + }, + { + "epoch": 0.7044170362719485, + "grad_norm": 0.12309647351503372, + "learning_rate": 2.0173362448611195e-05, + "loss": 1.0944, + "step": 11594 + }, + { + "epoch": 0.704477793304575, + "grad_norm": 0.5432168245315552, + "learning_rate": 2.016567999551423e-05, + "loss": 1.2447, + "step": 11595 + }, + { + "epoch": 0.7045385503372016, + "grad_norm": 0.16699868440628052, + "learning_rate": 2.0157998636000015e-05, + "loss": 1.0167, + "step": 11596 + }, + { + "epoch": 0.704599307369828, + "grad_norm": 0.1616109162569046, + "learning_rate": 2.0150318370350125e-05, + "loss": 1.11, + "step": 11597 + }, + { + "epoch": 0.7046600644024545, + "grad_norm": 0.12054099887609482, + "learning_rate": 2.0142639198846086e-05, + "loss": 1.0459, + "step": 11598 + }, + { + "epoch": 0.7047208214350811, + "grad_norm": 0.19042310118675232, + "learning_rate": 2.0134961121769364e-05, + "loss": 1.0666, + "step": 11599 + }, + { + "epoch": 0.7047815784677076, + "grad_norm": 0.11563307791948318, + "learning_rate": 2.0127284139401426e-05, + "loss": 1.003, + "step": 11600 + }, + { + "epoch": 0.7048423355003341, + "grad_norm": 0.16380497813224792, + "learning_rate": 2.0119608252023688e-05, + "loss": 1.047, + "step": 11601 + }, + { + "epoch": 0.7049030925329607, + "grad_norm": 0.15207651257514954, + "learning_rate": 2.011193345991746e-05, + "loss": 1.0427, + "step": 11602 + }, + { + "epoch": 0.7049638495655872, + "grad_norm": 0.3474125266075134, + "learning_rate": 2.010425976336409e-05, + "loss": 1.0842, + "step": 11603 + }, + { + "epoch": 0.7050246065982138, + "grad_norm": 0.1500178575515747, + "learning_rate": 2.0096587162644842e-05, + "loss": 1.0639, + "step": 11604 + }, + { + "epoch": 0.7050853636308403, + "grad_norm": 0.11654957383871078, + "learning_rate": 2.0088915658040992e-05, + "loss": 1.0401, + "step": 11605 + }, + { + "epoch": 0.7051461206634668, + "grad_norm": 0.115399070084095, + "learning_rate": 2.0081245249833713e-05, + "loss": 1.035, + "step": 11606 + }, + { + "epoch": 0.7052068776960934, + "grad_norm": 0.17292408645153046, + "learning_rate": 2.0073575938304178e-05, + "loss": 1.0734, + "step": 11607 + }, + { + "epoch": 0.7052676347287199, + "grad_norm": 0.15123586356639862, + "learning_rate": 2.006590772373352e-05, + "loss": 1.0592, + "step": 11608 + }, + { + "epoch": 0.7053283917613464, + "grad_norm": 0.1322067379951477, + "learning_rate": 2.0058240606402755e-05, + "loss": 1.021, + "step": 11609 + }, + { + "epoch": 0.7053891487939729, + "grad_norm": 0.48658719658851624, + "learning_rate": 2.0050574586592996e-05, + "loss": 1.0216, + "step": 11610 + }, + { + "epoch": 0.7054499058265994, + "grad_norm": 0.2290707677602768, + "learning_rate": 2.0042909664585214e-05, + "loss": 1.1641, + "step": 11611 + }, + { + "epoch": 0.7055106628592259, + "grad_norm": 0.16731087863445282, + "learning_rate": 2.0035245840660367e-05, + "loss": 1.0914, + "step": 11612 + }, + { + "epoch": 0.7055714198918525, + "grad_norm": 0.2058500200510025, + "learning_rate": 2.0027583115099375e-05, + "loss": 1.1184, + "step": 11613 + }, + { + "epoch": 0.705632176924479, + "grad_norm": 0.11693782359361649, + "learning_rate": 2.0019921488183118e-05, + "loss": 1.0551, + "step": 11614 + }, + { + "epoch": 0.7056929339571055, + "grad_norm": 0.12149243801832199, + "learning_rate": 2.001226096019243e-05, + "loss": 1.0696, + "step": 11615 + }, + { + "epoch": 0.7057536909897321, + "grad_norm": 0.1214408278465271, + "learning_rate": 2.000460153140812e-05, + "loss": 1.0698, + "step": 11616 + }, + { + "epoch": 0.7058144480223586, + "grad_norm": 0.10472622513771057, + "learning_rate": 1.9996943202110934e-05, + "loss": 1.0172, + "step": 11617 + }, + { + "epoch": 0.7058752050549851, + "grad_norm": 0.18020954728126526, + "learning_rate": 1.9989285972581595e-05, + "loss": 1.1391, + "step": 11618 + }, + { + "epoch": 0.7059359620876117, + "grad_norm": 0.1960398256778717, + "learning_rate": 1.9981629843100784e-05, + "loss": 1.1194, + "step": 11619 + }, + { + "epoch": 0.7059967191202382, + "grad_norm": 0.19699768722057343, + "learning_rate": 1.9973974813949115e-05, + "loss": 1.0165, + "step": 11620 + }, + { + "epoch": 0.7060574761528647, + "grad_norm": 0.15521733462810516, + "learning_rate": 1.9966320885407213e-05, + "loss": 1.0824, + "step": 11621 + }, + { + "epoch": 0.7061182331854913, + "grad_norm": 0.12691210210323334, + "learning_rate": 1.995866805775565e-05, + "loss": 0.9915, + "step": 11622 + }, + { + "epoch": 0.7061789902181177, + "grad_norm": 0.15325871109962463, + "learning_rate": 1.9951016331274897e-05, + "loss": 1.0658, + "step": 11623 + }, + { + "epoch": 0.7062397472507442, + "grad_norm": 0.13043555617332458, + "learning_rate": 1.9943365706245447e-05, + "loss": 1.0327, + "step": 11624 + }, + { + "epoch": 0.7063005042833708, + "grad_norm": 0.12522801756858826, + "learning_rate": 1.9935716182947716e-05, + "loss": 1.1267, + "step": 11625 + }, + { + "epoch": 0.7063612613159973, + "grad_norm": 1.1309356689453125, + "learning_rate": 1.9928067761662146e-05, + "loss": 1.0665, + "step": 11626 + }, + { + "epoch": 0.7064220183486238, + "grad_norm": 0.3840314745903015, + "learning_rate": 1.992042044266906e-05, + "loss": 1.1726, + "step": 11627 + }, + { + "epoch": 0.7064827753812504, + "grad_norm": 0.11954229325056076, + "learning_rate": 1.991277422624879e-05, + "loss": 1.0611, + "step": 11628 + }, + { + "epoch": 0.7065435324138769, + "grad_norm": 0.24044634401798248, + "learning_rate": 1.9905129112681585e-05, + "loss": 1.0721, + "step": 11629 + }, + { + "epoch": 0.7066042894465034, + "grad_norm": 0.1350989192724228, + "learning_rate": 1.989748510224767e-05, + "loss": 1.1328, + "step": 11630 + }, + { + "epoch": 0.70666504647913, + "grad_norm": 0.11142914742231369, + "learning_rate": 1.9889842195227276e-05, + "loss": 1.0081, + "step": 11631 + }, + { + "epoch": 0.7067258035117565, + "grad_norm": 0.18580536544322968, + "learning_rate": 1.9882200391900536e-05, + "loss": 1.1613, + "step": 11632 + }, + { + "epoch": 0.706786560544383, + "grad_norm": 0.14073748886585236, + "learning_rate": 1.987455969254756e-05, + "loss": 1.0609, + "step": 11633 + }, + { + "epoch": 0.7068473175770096, + "grad_norm": 0.11089940369129181, + "learning_rate": 1.9866920097448426e-05, + "loss": 1.01, + "step": 11634 + }, + { + "epoch": 0.7069080746096361, + "grad_norm": 0.17491337656974792, + "learning_rate": 1.9859281606883158e-05, + "loss": 1.1184, + "step": 11635 + }, + { + "epoch": 0.7069688316422625, + "grad_norm": 0.1741686910390854, + "learning_rate": 1.985164422113175e-05, + "loss": 1.0952, + "step": 11636 + }, + { + "epoch": 0.7070295886748891, + "grad_norm": 1.6490581035614014, + "learning_rate": 1.9844007940474153e-05, + "loss": 1.0455, + "step": 11637 + }, + { + "epoch": 0.7070903457075156, + "grad_norm": 0.17692893743515015, + "learning_rate": 1.983637276519027e-05, + "loss": 1.1266, + "step": 11638 + }, + { + "epoch": 0.7071511027401421, + "grad_norm": 0.16384969651699066, + "learning_rate": 1.9828738695559972e-05, + "loss": 1.0853, + "step": 11639 + }, + { + "epoch": 0.7072118597727687, + "grad_norm": 0.647416889667511, + "learning_rate": 1.9821105731863093e-05, + "loss": 1.0369, + "step": 11640 + }, + { + "epoch": 0.7072726168053952, + "grad_norm": 0.13874442875385284, + "learning_rate": 1.9813473874379395e-05, + "loss": 1.0949, + "step": 11641 + }, + { + "epoch": 0.7073333738380218, + "grad_norm": 0.11145878583192825, + "learning_rate": 1.9805843123388686e-05, + "loss": 1.0891, + "step": 11642 + }, + { + "epoch": 0.7073941308706483, + "grad_norm": 0.12096337229013443, + "learning_rate": 1.9798213479170613e-05, + "loss": 1.0212, + "step": 11643 + }, + { + "epoch": 0.7074548879032748, + "grad_norm": 0.1196274682879448, + "learning_rate": 1.9790584942004862e-05, + "loss": 1.0435, + "step": 11644 + }, + { + "epoch": 0.7075156449359014, + "grad_norm": 0.11231572180986404, + "learning_rate": 1.9782957512171068e-05, + "loss": 1.0428, + "step": 11645 + }, + { + "epoch": 0.7075764019685279, + "grad_norm": 0.755441427230835, + "learning_rate": 1.977533118994878e-05, + "loss": 1.0262, + "step": 11646 + }, + { + "epoch": 0.7076371590011544, + "grad_norm": 0.1362077295780182, + "learning_rate": 1.976770597561759e-05, + "loss": 1.0145, + "step": 11647 + }, + { + "epoch": 0.707697916033781, + "grad_norm": 0.11280715465545654, + "learning_rate": 1.9760081869457005e-05, + "loss": 1.0652, + "step": 11648 + }, + { + "epoch": 0.7077586730664074, + "grad_norm": 0.14085575938224792, + "learning_rate": 1.9752458871746437e-05, + "loss": 1.114, + "step": 11649 + }, + { + "epoch": 0.7078194300990339, + "grad_norm": 0.1524488478899002, + "learning_rate": 1.9744836982765325e-05, + "loss": 1.1108, + "step": 11650 + }, + { + "epoch": 0.7078801871316605, + "grad_norm": 0.15377359092235565, + "learning_rate": 1.9737216202793073e-05, + "loss": 0.9882, + "step": 11651 + }, + { + "epoch": 0.707940944164287, + "grad_norm": 0.11559594422578812, + "learning_rate": 1.972959653210902e-05, + "loss": 1.0137, + "step": 11652 + }, + { + "epoch": 0.7080017011969135, + "grad_norm": 0.1677667200565338, + "learning_rate": 1.972197797099245e-05, + "loss": 1.1743, + "step": 11653 + }, + { + "epoch": 0.7080624582295401, + "grad_norm": 0.13336063921451569, + "learning_rate": 1.971436051972263e-05, + "loss": 1.0211, + "step": 11654 + }, + { + "epoch": 0.7081232152621666, + "grad_norm": 0.10918305814266205, + "learning_rate": 1.9706744178578783e-05, + "loss": 1.0419, + "step": 11655 + }, + { + "epoch": 0.7081839722947931, + "grad_norm": 0.19564831256866455, + "learning_rate": 1.969912894784008e-05, + "loss": 1.0255, + "step": 11656 + }, + { + "epoch": 0.7082447293274197, + "grad_norm": 0.21771742403507233, + "learning_rate": 1.9691514827785663e-05, + "loss": 1.107, + "step": 11657 + }, + { + "epoch": 0.7083054863600462, + "grad_norm": 0.24258442223072052, + "learning_rate": 1.9683901818694632e-05, + "loss": 1.0283, + "step": 11658 + }, + { + "epoch": 0.7083662433926727, + "grad_norm": 0.17653697729110718, + "learning_rate": 1.9676289920846038e-05, + "loss": 1.0946, + "step": 11659 + }, + { + "epoch": 0.7084270004252993, + "grad_norm": 0.12213408946990967, + "learning_rate": 1.9668679134518898e-05, + "loss": 1.0595, + "step": 11660 + }, + { + "epoch": 0.7084877574579258, + "grad_norm": 0.1997223198413849, + "learning_rate": 1.966106945999217e-05, + "loss": 1.1843, + "step": 11661 + }, + { + "epoch": 0.7085485144905522, + "grad_norm": 0.16083401441574097, + "learning_rate": 1.965346089754484e-05, + "loss": 1.0753, + "step": 11662 + }, + { + "epoch": 0.7086092715231788, + "grad_norm": 0.17574000358581543, + "learning_rate": 1.9645853447455748e-05, + "loss": 1.1772, + "step": 11663 + }, + { + "epoch": 0.7086700285558053, + "grad_norm": 0.1727604866027832, + "learning_rate": 1.9638247110003765e-05, + "loss": 1.1427, + "step": 11664 + }, + { + "epoch": 0.7087307855884318, + "grad_norm": 0.10754947364330292, + "learning_rate": 1.9630641885467704e-05, + "loss": 1.0053, + "step": 11665 + }, + { + "epoch": 0.7087915426210584, + "grad_norm": 0.16104000806808472, + "learning_rate": 1.9623037774126317e-05, + "loss": 1.0288, + "step": 11666 + }, + { + "epoch": 0.7088522996536849, + "grad_norm": 0.1376451998949051, + "learning_rate": 1.9615434776258372e-05, + "loss": 1.0181, + "step": 11667 + }, + { + "epoch": 0.7089130566863114, + "grad_norm": 0.14112310111522675, + "learning_rate": 1.9607832892142554e-05, + "loss": 1.1272, + "step": 11668 + }, + { + "epoch": 0.708973813718938, + "grad_norm": 0.12976184487342834, + "learning_rate": 1.9600232122057478e-05, + "loss": 1.1176, + "step": 11669 + }, + { + "epoch": 0.7090345707515645, + "grad_norm": 0.1489325910806656, + "learning_rate": 1.9592632466281775e-05, + "loss": 1.0262, + "step": 11670 + }, + { + "epoch": 0.709095327784191, + "grad_norm": 0.16774222254753113, + "learning_rate": 1.9585033925093982e-05, + "loss": 1.1054, + "step": 11671 + }, + { + "epoch": 0.7091560848168176, + "grad_norm": 0.3156583309173584, + "learning_rate": 1.9577436498772666e-05, + "loss": 1.0832, + "step": 11672 + }, + { + "epoch": 0.7092168418494441, + "grad_norm": 0.13944421708583832, + "learning_rate": 1.95698401875963e-05, + "loss": 1.0354, + "step": 11673 + }, + { + "epoch": 0.7092775988820706, + "grad_norm": 0.22708818316459656, + "learning_rate": 1.9562244991843327e-05, + "loss": 1.0832, + "step": 11674 + }, + { + "epoch": 0.7093383559146971, + "grad_norm": 0.4300410747528076, + "learning_rate": 1.955465091179216e-05, + "loss": 1.2113, + "step": 11675 + }, + { + "epoch": 0.7093991129473236, + "grad_norm": 0.14897175133228302, + "learning_rate": 1.9547057947721116e-05, + "loss": 1.0562, + "step": 11676 + }, + { + "epoch": 0.7094598699799501, + "grad_norm": 0.13382786512374878, + "learning_rate": 1.9539466099908565e-05, + "loss": 1.0461, + "step": 11677 + }, + { + "epoch": 0.7095206270125767, + "grad_norm": 0.1356203854084015, + "learning_rate": 1.9531875368632774e-05, + "loss": 1.0654, + "step": 11678 + }, + { + "epoch": 0.7095813840452032, + "grad_norm": 0.145925372838974, + "learning_rate": 1.9524285754171978e-05, + "loss": 1.0768, + "step": 11679 + }, + { + "epoch": 0.7096421410778297, + "grad_norm": 0.17852981388568878, + "learning_rate": 1.9516697256804378e-05, + "loss": 1.0411, + "step": 11680 + }, + { + "epoch": 0.7097028981104563, + "grad_norm": 0.12320247292518616, + "learning_rate": 1.9509109876808135e-05, + "loss": 1.0199, + "step": 11681 + }, + { + "epoch": 0.7097636551430828, + "grad_norm": 0.11359871923923492, + "learning_rate": 1.9501523614461365e-05, + "loss": 1.0602, + "step": 11682 + }, + { + "epoch": 0.7098244121757094, + "grad_norm": 0.15387961268424988, + "learning_rate": 1.949393847004214e-05, + "loss": 1.0988, + "step": 11683 + }, + { + "epoch": 0.7098851692083359, + "grad_norm": 0.15408551692962646, + "learning_rate": 1.9486354443828498e-05, + "loss": 1.0873, + "step": 11684 + }, + { + "epoch": 0.7099459262409624, + "grad_norm": 0.11309153586626053, + "learning_rate": 1.947877153609843e-05, + "loss": 1.0026, + "step": 11685 + }, + { + "epoch": 0.710006683273589, + "grad_norm": 0.15927071869373322, + "learning_rate": 1.947118974712989e-05, + "loss": 1.0389, + "step": 11686 + }, + { + "epoch": 0.7100674403062155, + "grad_norm": 0.1430070549249649, + "learning_rate": 1.9463609077200774e-05, + "loss": 1.1247, + "step": 11687 + }, + { + "epoch": 0.7101281973388419, + "grad_norm": 0.1528358906507492, + "learning_rate": 1.945602952658901e-05, + "loss": 0.9834, + "step": 11688 + }, + { + "epoch": 0.7101889543714685, + "grad_norm": 0.14306876063346863, + "learning_rate": 1.9448451095572362e-05, + "loss": 1.1184, + "step": 11689 + }, + { + "epoch": 0.710249711404095, + "grad_norm": 1.314124584197998, + "learning_rate": 1.944087378442865e-05, + "loss": 1.0671, + "step": 11690 + }, + { + "epoch": 0.7103104684367215, + "grad_norm": 0.12583403289318085, + "learning_rate": 1.9433297593435617e-05, + "loss": 1.0415, + "step": 11691 + }, + { + "epoch": 0.710371225469348, + "grad_norm": 0.13705484569072723, + "learning_rate": 1.9425722522870948e-05, + "loss": 1.0628, + "step": 11692 + }, + { + "epoch": 0.7104319825019746, + "grad_norm": 0.22917865216732025, + "learning_rate": 1.9418148573012352e-05, + "loss": 1.1318, + "step": 11693 + }, + { + "epoch": 0.7104927395346011, + "grad_norm": 0.21376343071460724, + "learning_rate": 1.9410575744137426e-05, + "loss": 1.043, + "step": 11694 + }, + { + "epoch": 0.7105534965672277, + "grad_norm": 0.12615026533603668, + "learning_rate": 1.940300403652378e-05, + "loss": 1.0492, + "step": 11695 + }, + { + "epoch": 0.7106142535998542, + "grad_norm": 4.3653106689453125, + "learning_rate": 1.9395433450448914e-05, + "loss": 1.2242, + "step": 11696 + }, + { + "epoch": 0.7106750106324807, + "grad_norm": 0.1611001193523407, + "learning_rate": 1.9387863986190337e-05, + "loss": 1.0784, + "step": 11697 + }, + { + "epoch": 0.7107357676651073, + "grad_norm": 0.179573193192482, + "learning_rate": 1.9380295644025538e-05, + "loss": 1.0448, + "step": 11698 + }, + { + "epoch": 0.7107965246977338, + "grad_norm": 0.11063571274280548, + "learning_rate": 1.9372728424231923e-05, + "loss": 1.0488, + "step": 11699 + }, + { + "epoch": 0.7108572817303603, + "grad_norm": 0.16249185800552368, + "learning_rate": 1.9365162327086867e-05, + "loss": 1.0264, + "step": 11700 + }, + { + "epoch": 0.7109180387629869, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.9357597352867706e-05, + "loss": 1.0068, + "step": 11701 + }, + { + "epoch": 0.7109787957956133, + "grad_norm": 0.13428813219070435, + "learning_rate": 1.9350033501851737e-05, + "loss": 1.0645, + "step": 11702 + }, + { + "epoch": 0.7110395528282398, + "grad_norm": 0.12566058337688446, + "learning_rate": 1.9342470774316217e-05, + "loss": 1.0386, + "step": 11703 + }, + { + "epoch": 0.7111003098608664, + "grad_norm": 0.1819542646408081, + "learning_rate": 1.9334909170538356e-05, + "loss": 1.01, + "step": 11704 + }, + { + "epoch": 0.7111610668934929, + "grad_norm": 0.24413032829761505, + "learning_rate": 1.9327348690795326e-05, + "loss": 1.1124, + "step": 11705 + }, + { + "epoch": 0.7112218239261194, + "grad_norm": 0.15328948199748993, + "learning_rate": 1.9319789335364258e-05, + "loss": 1.0426, + "step": 11706 + }, + { + "epoch": 0.711282580958746, + "grad_norm": 0.17072217166423798, + "learning_rate": 1.9312231104522242e-05, + "loss": 1.0202, + "step": 11707 + }, + { + "epoch": 0.7113433379913725, + "grad_norm": 0.1225581243634224, + "learning_rate": 1.9304673998546312e-05, + "loss": 1.0133, + "step": 11708 + }, + { + "epoch": 0.711404095023999, + "grad_norm": 0.25380784273147583, + "learning_rate": 1.9297118017713524e-05, + "loss": 1.225, + "step": 11709 + }, + { + "epoch": 0.7114648520566256, + "grad_norm": 0.1189921498298645, + "learning_rate": 1.9289563162300787e-05, + "loss": 1.0839, + "step": 11710 + }, + { + "epoch": 0.7115256090892521, + "grad_norm": 0.21586130559444427, + "learning_rate": 1.9282009432585056e-05, + "loss": 1.1584, + "step": 11711 + }, + { + "epoch": 0.7115863661218786, + "grad_norm": 0.21023230254650116, + "learning_rate": 1.9274456828843206e-05, + "loss": 1.0591, + "step": 11712 + }, + { + "epoch": 0.7116471231545052, + "grad_norm": 0.10978969186544418, + "learning_rate": 1.926690535135206e-05, + "loss": 0.9849, + "step": 11713 + }, + { + "epoch": 0.7117078801871317, + "grad_norm": 0.17131248116493225, + "learning_rate": 1.9259355000388457e-05, + "loss": 1.1152, + "step": 11714 + }, + { + "epoch": 0.7117686372197581, + "grad_norm": 0.2565372884273529, + "learning_rate": 1.9251805776229153e-05, + "loss": 1.1037, + "step": 11715 + }, + { + "epoch": 0.7118293942523847, + "grad_norm": 0.13198554515838623, + "learning_rate": 1.924425767915084e-05, + "loss": 1.0037, + "step": 11716 + }, + { + "epoch": 0.7118901512850112, + "grad_norm": 0.1634645015001297, + "learning_rate": 1.92367107094302e-05, + "loss": 1.0472, + "step": 11717 + }, + { + "epoch": 0.7119509083176377, + "grad_norm": 0.11387443542480469, + "learning_rate": 1.9229164867343863e-05, + "loss": 1.0396, + "step": 11718 + }, + { + "epoch": 0.7120116653502643, + "grad_norm": 0.11026987433433533, + "learning_rate": 1.922162015316845e-05, + "loss": 1.0756, + "step": 11719 + }, + { + "epoch": 0.7120724223828908, + "grad_norm": 0.10341232270002365, + "learning_rate": 1.9214076567180506e-05, + "loss": 0.9706, + "step": 11720 + }, + { + "epoch": 0.7121331794155173, + "grad_norm": 0.1101595014333725, + "learning_rate": 1.9206534109656532e-05, + "loss": 0.9948, + "step": 11721 + }, + { + "epoch": 0.7121939364481439, + "grad_norm": 0.17904014885425568, + "learning_rate": 1.9198992780873015e-05, + "loss": 1.0907, + "step": 11722 + }, + { + "epoch": 0.7122546934807704, + "grad_norm": 0.12634815275669098, + "learning_rate": 1.9191452581106346e-05, + "loss": 1.0334, + "step": 11723 + }, + { + "epoch": 0.712315450513397, + "grad_norm": 0.19385063648223877, + "learning_rate": 1.9183913510632945e-05, + "loss": 1.0873, + "step": 11724 + }, + { + "epoch": 0.7123762075460235, + "grad_norm": 0.1293027549982071, + "learning_rate": 1.917637556972916e-05, + "loss": 1.022, + "step": 11725 + }, + { + "epoch": 0.71243696457865, + "grad_norm": 0.12360483407974243, + "learning_rate": 1.9168838758671282e-05, + "loss": 1.0981, + "step": 11726 + }, + { + "epoch": 0.7124977216112766, + "grad_norm": 0.17805059254169464, + "learning_rate": 1.9161303077735575e-05, + "loss": 1.1905, + "step": 11727 + }, + { + "epoch": 0.712558478643903, + "grad_norm": 0.12121447920799255, + "learning_rate": 1.9153768527198272e-05, + "loss": 1.0508, + "step": 11728 + }, + { + "epoch": 0.7126192356765295, + "grad_norm": 0.1323792040348053, + "learning_rate": 1.9146235107335543e-05, + "loss": 1.0333, + "step": 11729 + }, + { + "epoch": 0.712679992709156, + "grad_norm": 0.2845796048641205, + "learning_rate": 1.913870281842353e-05, + "loss": 1.1417, + "step": 11730 + }, + { + "epoch": 0.7127407497417826, + "grad_norm": 0.11830468475818634, + "learning_rate": 1.9131171660738335e-05, + "loss": 1.0567, + "step": 11731 + }, + { + "epoch": 0.7128015067744091, + "grad_norm": 0.2766297459602356, + "learning_rate": 1.9123641634556e-05, + "loss": 1.1617, + "step": 11732 + }, + { + "epoch": 0.7128622638070357, + "grad_norm": 0.21680931746959686, + "learning_rate": 1.911611274015256e-05, + "loss": 1.1791, + "step": 11733 + }, + { + "epoch": 0.7129230208396622, + "grad_norm": 0.1302758753299713, + "learning_rate": 1.910858497780395e-05, + "loss": 1.0951, + "step": 11734 + }, + { + "epoch": 0.7129837778722887, + "grad_norm": 0.3375566601753235, + "learning_rate": 1.9101058347786173e-05, + "loss": 1.0107, + "step": 11735 + }, + { + "epoch": 0.7130445349049153, + "grad_norm": 0.1535346359014511, + "learning_rate": 1.909353285037505e-05, + "loss": 1.0656, + "step": 11736 + }, + { + "epoch": 0.7131052919375418, + "grad_norm": 0.13433580100536346, + "learning_rate": 1.908600848584646e-05, + "loss": 1.0545, + "step": 11737 + }, + { + "epoch": 0.7131660489701683, + "grad_norm": 0.12825262546539307, + "learning_rate": 1.9078485254476198e-05, + "loss": 1.0438, + "step": 11738 + }, + { + "epoch": 0.7132268060027949, + "grad_norm": 0.1130940392613411, + "learning_rate": 1.9070963156540023e-05, + "loss": 1.0711, + "step": 11739 + }, + { + "epoch": 0.7132875630354214, + "grad_norm": 0.17816604673862457, + "learning_rate": 1.9063442192313687e-05, + "loss": 1.0674, + "step": 11740 + }, + { + "epoch": 0.7133483200680478, + "grad_norm": 0.16936558485031128, + "learning_rate": 1.9055922362072858e-05, + "loss": 1.1216, + "step": 11741 + }, + { + "epoch": 0.7134090771006744, + "grad_norm": 0.36161476373672485, + "learning_rate": 1.9048403666093195e-05, + "loss": 1.1203, + "step": 11742 + }, + { + "epoch": 0.7134698341333009, + "grad_norm": 0.24623535573482513, + "learning_rate": 1.904088610465024e-05, + "loss": 1.0673, + "step": 11743 + }, + { + "epoch": 0.7135305911659274, + "grad_norm": 0.16006368398666382, + "learning_rate": 1.9033369678019613e-05, + "loss": 1.0809, + "step": 11744 + }, + { + "epoch": 0.713591348198554, + "grad_norm": 0.27312740683555603, + "learning_rate": 1.9025854386476798e-05, + "loss": 1.1195, + "step": 11745 + }, + { + "epoch": 0.7136521052311805, + "grad_norm": 0.1588553935289383, + "learning_rate": 1.9018340230297288e-05, + "loss": 1.1176, + "step": 11746 + }, + { + "epoch": 0.713712862263807, + "grad_norm": 0.1724746823310852, + "learning_rate": 1.90108272097565e-05, + "loss": 1.0724, + "step": 11747 + }, + { + "epoch": 0.7137736192964336, + "grad_norm": 0.18205776810646057, + "learning_rate": 1.900331532512983e-05, + "loss": 1.1882, + "step": 11748 + }, + { + "epoch": 0.7138343763290601, + "grad_norm": 0.11100442707538605, + "learning_rate": 1.8995804576692627e-05, + "loss": 1.066, + "step": 11749 + }, + { + "epoch": 0.7138951333616866, + "grad_norm": 0.12582969665527344, + "learning_rate": 1.89882949647202e-05, + "loss": 1.0308, + "step": 11750 + }, + { + "epoch": 0.7139558903943132, + "grad_norm": 0.366331547498703, + "learning_rate": 1.898078648948782e-05, + "loss": 1.049, + "step": 11751 + }, + { + "epoch": 0.7140166474269397, + "grad_norm": 0.16668683290481567, + "learning_rate": 1.8973279151270702e-05, + "loss": 0.9968, + "step": 11752 + }, + { + "epoch": 0.7140774044595662, + "grad_norm": 0.23617713153362274, + "learning_rate": 1.8965772950344042e-05, + "loss": 1.0512, + "step": 11753 + }, + { + "epoch": 0.7141381614921927, + "grad_norm": 0.12710700929164886, + "learning_rate": 1.8958267886982945e-05, + "loss": 1.0293, + "step": 11754 + }, + { + "epoch": 0.7141989185248192, + "grad_norm": 0.1962948441505432, + "learning_rate": 1.8950763961462582e-05, + "loss": 1.2104, + "step": 11755 + }, + { + "epoch": 0.7142596755574457, + "grad_norm": 0.19061042368412018, + "learning_rate": 1.8943261174057948e-05, + "loss": 1.0444, + "step": 11756 + }, + { + "epoch": 0.7143204325900723, + "grad_norm": 0.18270787596702576, + "learning_rate": 1.893575952504408e-05, + "loss": 1.0478, + "step": 11757 + }, + { + "epoch": 0.7143811896226988, + "grad_norm": 0.1116868332028389, + "learning_rate": 1.8928259014695948e-05, + "loss": 1.0138, + "step": 11758 + }, + { + "epoch": 0.7144419466553253, + "grad_norm": 0.1393412947654724, + "learning_rate": 1.892075964328847e-05, + "loss": 1.1346, + "step": 11759 + }, + { + "epoch": 0.7145027036879519, + "grad_norm": 0.4019707143306732, + "learning_rate": 1.8913261411096572e-05, + "loss": 1.0636, + "step": 11760 + }, + { + "epoch": 0.7145634607205784, + "grad_norm": 0.1373920887708664, + "learning_rate": 1.8905764318395093e-05, + "loss": 1.0183, + "step": 11761 + }, + { + "epoch": 0.714624217753205, + "grad_norm": 0.15592533349990845, + "learning_rate": 1.8898268365458844e-05, + "loss": 1.3152, + "step": 11762 + }, + { + "epoch": 0.7146849747858315, + "grad_norm": 0.1144845113158226, + "learning_rate": 1.889077355256256e-05, + "loss": 1.0361, + "step": 11763 + }, + { + "epoch": 0.714745731818458, + "grad_norm": 0.12774446606636047, + "learning_rate": 1.8883279879980976e-05, + "loss": 1.037, + "step": 11764 + }, + { + "epoch": 0.7148064888510846, + "grad_norm": 0.14454355835914612, + "learning_rate": 1.8875787347988793e-05, + "loss": 1.0525, + "step": 11765 + }, + { + "epoch": 0.7148672458837111, + "grad_norm": 0.19021962583065033, + "learning_rate": 1.8868295956860642e-05, + "loss": 1.1999, + "step": 11766 + }, + { + "epoch": 0.7149280029163375, + "grad_norm": 0.1694939285516739, + "learning_rate": 1.886080570687112e-05, + "loss": 1.0812, + "step": 11767 + }, + { + "epoch": 0.714988759948964, + "grad_norm": 0.19839046895503998, + "learning_rate": 1.885331659829479e-05, + "loss": 1.1567, + "step": 11768 + }, + { + "epoch": 0.7150495169815906, + "grad_norm": 0.142764151096344, + "learning_rate": 1.8845828631406153e-05, + "loss": 1.0673, + "step": 11769 + }, + { + "epoch": 0.7151102740142171, + "grad_norm": 0.14924031496047974, + "learning_rate": 1.883834180647969e-05, + "loss": 1.0916, + "step": 11770 + }, + { + "epoch": 0.7151710310468437, + "grad_norm": 0.18129679560661316, + "learning_rate": 1.8830856123789836e-05, + "loss": 1.0506, + "step": 11771 + }, + { + "epoch": 0.7152317880794702, + "grad_norm": 0.1394844800233841, + "learning_rate": 1.8823371583610977e-05, + "loss": 1.0792, + "step": 11772 + }, + { + "epoch": 0.7152925451120967, + "grad_norm": 0.1567200869321823, + "learning_rate": 1.8815888186217457e-05, + "loss": 1.0068, + "step": 11773 + }, + { + "epoch": 0.7153533021447233, + "grad_norm": 0.17005854845046997, + "learning_rate": 1.8808405931883583e-05, + "loss": 1.0275, + "step": 11774 + }, + { + "epoch": 0.7154140591773498, + "grad_norm": 0.1875138133764267, + "learning_rate": 1.8800924820883597e-05, + "loss": 1.1628, + "step": 11775 + }, + { + "epoch": 0.7154748162099763, + "grad_norm": 0.12211021035909653, + "learning_rate": 1.8793444853491783e-05, + "loss": 1.0867, + "step": 11776 + }, + { + "epoch": 0.7155355732426029, + "grad_norm": 0.15800920128822327, + "learning_rate": 1.878596602998226e-05, + "loss": 1.1003, + "step": 11777 + }, + { + "epoch": 0.7155963302752294, + "grad_norm": 0.15006932616233826, + "learning_rate": 1.8778488350629187e-05, + "loss": 1.0562, + "step": 11778 + }, + { + "epoch": 0.7156570873078559, + "grad_norm": 0.16724249720573425, + "learning_rate": 1.8771011815706657e-05, + "loss": 1.089, + "step": 11779 + }, + { + "epoch": 0.7157178443404824, + "grad_norm": 0.17960305511951447, + "learning_rate": 1.8763536425488704e-05, + "loss": 1.0965, + "step": 11780 + }, + { + "epoch": 0.7157786013731089, + "grad_norm": 0.2078692466020584, + "learning_rate": 1.8756062180249385e-05, + "loss": 1.1419, + "step": 11781 + }, + { + "epoch": 0.7158393584057354, + "grad_norm": 0.16367106139659882, + "learning_rate": 1.8748589080262662e-05, + "loss": 1.0788, + "step": 11782 + }, + { + "epoch": 0.715900115438362, + "grad_norm": 0.4139288067817688, + "learning_rate": 1.8741117125802427e-05, + "loss": 1.1373, + "step": 11783 + }, + { + "epoch": 0.7159608724709885, + "grad_norm": 0.1478399634361267, + "learning_rate": 1.873364631714259e-05, + "loss": 1.095, + "step": 11784 + }, + { + "epoch": 0.716021629503615, + "grad_norm": 0.2829451858997345, + "learning_rate": 1.872617665455697e-05, + "loss": 1.0528, + "step": 11785 + }, + { + "epoch": 0.7160823865362416, + "grad_norm": 0.1564915031194687, + "learning_rate": 1.871870813831941e-05, + "loss": 1.0406, + "step": 11786 + }, + { + "epoch": 0.7161431435688681, + "grad_norm": 0.2062319666147232, + "learning_rate": 1.8711240768703647e-05, + "loss": 1.1469, + "step": 11787 + }, + { + "epoch": 0.7162039006014946, + "grad_norm": 0.1241144984960556, + "learning_rate": 1.8703774545983405e-05, + "loss": 1.036, + "step": 11788 + }, + { + "epoch": 0.7162646576341212, + "grad_norm": 0.10204458236694336, + "learning_rate": 1.8696309470432372e-05, + "loss": 1.0282, + "step": 11789 + }, + { + "epoch": 0.7163254146667477, + "grad_norm": 0.16363711655139923, + "learning_rate": 1.868884554232413e-05, + "loss": 1.1376, + "step": 11790 + }, + { + "epoch": 0.7163861716993742, + "grad_norm": 0.12379411607980728, + "learning_rate": 1.8681382761932324e-05, + "loss": 1.0577, + "step": 11791 + }, + { + "epoch": 0.7164469287320008, + "grad_norm": 0.19269931316375732, + "learning_rate": 1.8673921129530492e-05, + "loss": 1.0754, + "step": 11792 + }, + { + "epoch": 0.7165076857646272, + "grad_norm": 0.14565134048461914, + "learning_rate": 1.866646064539213e-05, + "loss": 1.0789, + "step": 11793 + }, + { + "epoch": 0.7165684427972537, + "grad_norm": 0.1668594628572464, + "learning_rate": 1.8659001309790715e-05, + "loss": 1.2008, + "step": 11794 + }, + { + "epoch": 0.7166291998298803, + "grad_norm": 0.1097547635436058, + "learning_rate": 1.865154312299966e-05, + "loss": 1.0236, + "step": 11795 + }, + { + "epoch": 0.7166899568625068, + "grad_norm": 0.17278924584388733, + "learning_rate": 1.8644086085292357e-05, + "loss": 1.1165, + "step": 11796 + }, + { + "epoch": 0.7167507138951333, + "grad_norm": 0.20955996215343475, + "learning_rate": 1.8636630196942136e-05, + "loss": 1.094, + "step": 11797 + }, + { + "epoch": 0.7168114709277599, + "grad_norm": 0.15809226036071777, + "learning_rate": 1.8629175458222293e-05, + "loss": 1.0972, + "step": 11798 + }, + { + "epoch": 0.7168722279603864, + "grad_norm": 0.22863392531871796, + "learning_rate": 1.8621721869406095e-05, + "loss": 1.1475, + "step": 11799 + }, + { + "epoch": 0.716932984993013, + "grad_norm": 0.12562121450901031, + "learning_rate": 1.861426943076674e-05, + "loss": 1.0516, + "step": 11800 + }, + { + "epoch": 0.7169937420256395, + "grad_norm": 0.1876962035894394, + "learning_rate": 1.8606818142577392e-05, + "loss": 1.1917, + "step": 11801 + }, + { + "epoch": 0.717054499058266, + "grad_norm": 0.27948957681655884, + "learning_rate": 1.8599368005111233e-05, + "loss": 1.2759, + "step": 11802 + }, + { + "epoch": 0.7171152560908925, + "grad_norm": 0.3056049644947052, + "learning_rate": 1.8591919018641284e-05, + "loss": 1.2254, + "step": 11803 + }, + { + "epoch": 0.7171760131235191, + "grad_norm": 1.7359598875045776, + "learning_rate": 1.8584471183440623e-05, + "loss": 1.1066, + "step": 11804 + }, + { + "epoch": 0.7172367701561456, + "grad_norm": 0.12048758566379547, + "learning_rate": 1.8577024499782237e-05, + "loss": 1.0475, + "step": 11805 + }, + { + "epoch": 0.7172975271887722, + "grad_norm": 0.13789333403110504, + "learning_rate": 1.8569578967939073e-05, + "loss": 1.056, + "step": 11806 + }, + { + "epoch": 0.7173582842213986, + "grad_norm": 0.15338440239429474, + "learning_rate": 1.856213458818409e-05, + "loss": 1.1763, + "step": 11807 + }, + { + "epoch": 0.7174190412540251, + "grad_norm": 0.13651174306869507, + "learning_rate": 1.855469136079014e-05, + "loss": 1.122, + "step": 11808 + }, + { + "epoch": 0.7174797982866516, + "grad_norm": 0.14389897882938385, + "learning_rate": 1.8547249286030065e-05, + "loss": 1.0688, + "step": 11809 + }, + { + "epoch": 0.7175405553192782, + "grad_norm": 0.10650651901960373, + "learning_rate": 1.8539808364176635e-05, + "loss": 1.0866, + "step": 11810 + }, + { + "epoch": 0.7176013123519047, + "grad_norm": 0.11491388827562332, + "learning_rate": 1.85323685955026e-05, + "loss": 1.0276, + "step": 11811 + }, + { + "epoch": 0.7176620693845313, + "grad_norm": 0.14369851350784302, + "learning_rate": 1.8524929980280685e-05, + "loss": 1.0363, + "step": 11812 + }, + { + "epoch": 0.7177228264171578, + "grad_norm": 0.17646241188049316, + "learning_rate": 1.851749251878355e-05, + "loss": 1.1339, + "step": 11813 + }, + { + "epoch": 0.7177835834497843, + "grad_norm": 0.1842593103647232, + "learning_rate": 1.851005621128381e-05, + "loss": 1.114, + "step": 11814 + }, + { + "epoch": 0.7178443404824109, + "grad_norm": 0.18943586945533752, + "learning_rate": 1.8502621058054048e-05, + "loss": 1.0658, + "step": 11815 + }, + { + "epoch": 0.7179050975150374, + "grad_norm": 0.11217565834522247, + "learning_rate": 1.8495187059366804e-05, + "loss": 1.0307, + "step": 11816 + }, + { + "epoch": 0.7179658545476639, + "grad_norm": 0.11643880605697632, + "learning_rate": 1.8487754215494564e-05, + "loss": 1.0323, + "step": 11817 + }, + { + "epoch": 0.7180266115802905, + "grad_norm": 0.11993326991796494, + "learning_rate": 1.8480322526709783e-05, + "loss": 0.9903, + "step": 11818 + }, + { + "epoch": 0.718087368612917, + "grad_norm": 0.14355207979679108, + "learning_rate": 1.8472891993284867e-05, + "loss": 1.0061, + "step": 11819 + }, + { + "epoch": 0.7181481256455434, + "grad_norm": 0.23948566615581512, + "learning_rate": 1.8465462615492196e-05, + "loss": 1.187, + "step": 11820 + }, + { + "epoch": 0.71820888267817, + "grad_norm": 0.5445346236228943, + "learning_rate": 1.845803439360409e-05, + "loss": 1.0417, + "step": 11821 + }, + { + "epoch": 0.7182696397107965, + "grad_norm": 0.13110193610191345, + "learning_rate": 1.845060732789283e-05, + "loss": 1.0378, + "step": 11822 + }, + { + "epoch": 0.718330396743423, + "grad_norm": 0.12356985360383987, + "learning_rate": 1.844318141863065e-05, + "loss": 1.0175, + "step": 11823 + }, + { + "epoch": 0.7183911537760496, + "grad_norm": 0.18453015387058258, + "learning_rate": 1.843575666608976e-05, + "loss": 1.0694, + "step": 11824 + }, + { + "epoch": 0.7184519108086761, + "grad_norm": 0.17998893558979034, + "learning_rate": 1.8428333070542308e-05, + "loss": 1.0021, + "step": 11825 + }, + { + "epoch": 0.7185126678413026, + "grad_norm": 0.10383981466293335, + "learning_rate": 1.8420910632260414e-05, + "loss": 0.9852, + "step": 11826 + }, + { + "epoch": 0.7185734248739292, + "grad_norm": 0.17593352496623993, + "learning_rate": 1.8413489351516128e-05, + "loss": 1.0634, + "step": 11827 + }, + { + "epoch": 0.7186341819065557, + "grad_norm": 0.12185440212488174, + "learning_rate": 1.840606922858151e-05, + "loss": 1.0412, + "step": 11828 + }, + { + "epoch": 0.7186949389391822, + "grad_norm": 0.2973095774650574, + "learning_rate": 1.8398650263728547e-05, + "loss": 1.0959, + "step": 11829 + }, + { + "epoch": 0.7187556959718088, + "grad_norm": 1.0236891508102417, + "learning_rate": 1.8391232457229153e-05, + "loss": 1.062, + "step": 11830 + }, + { + "epoch": 0.7188164530044353, + "grad_norm": 0.15837439894676208, + "learning_rate": 1.8383815809355237e-05, + "loss": 0.9918, + "step": 11831 + }, + { + "epoch": 0.7188772100370618, + "grad_norm": 0.1673840433359146, + "learning_rate": 1.8376400320378654e-05, + "loss": 1.1054, + "step": 11832 + }, + { + "epoch": 0.7189379670696883, + "grad_norm": 0.17833562195301056, + "learning_rate": 1.836898599057124e-05, + "loss": 1.118, + "step": 11833 + }, + { + "epoch": 0.7189987241023148, + "grad_norm": 0.5794760584831238, + "learning_rate": 1.8361572820204768e-05, + "loss": 0.9715, + "step": 11834 + }, + { + "epoch": 0.7190594811349413, + "grad_norm": 0.13030590116977692, + "learning_rate": 1.8354160809550952e-05, + "loss": 1.0605, + "step": 11835 + }, + { + "epoch": 0.7191202381675679, + "grad_norm": 28.77001190185547, + "learning_rate": 1.8346749958881492e-05, + "loss": 1.0651, + "step": 11836 + }, + { + "epoch": 0.7191809952001944, + "grad_norm": 29.64858055114746, + "learning_rate": 1.833934026846803e-05, + "loss": 1.0844, + "step": 11837 + }, + { + "epoch": 0.7192417522328209, + "grad_norm": 0.1516307145357132, + "learning_rate": 1.833193173858217e-05, + "loss": 1.0323, + "step": 11838 + }, + { + "epoch": 0.7193025092654475, + "grad_norm": 0.14092974364757538, + "learning_rate": 1.8324524369495476e-05, + "loss": 1.0861, + "step": 11839 + }, + { + "epoch": 0.719363266298074, + "grad_norm": 1.4835987091064453, + "learning_rate": 1.8317118161479463e-05, + "loss": 1.0908, + "step": 11840 + }, + { + "epoch": 0.7194240233307005, + "grad_norm": 0.22931554913520813, + "learning_rate": 1.8309713114805603e-05, + "loss": 1.1115, + "step": 11841 + }, + { + "epoch": 0.7194847803633271, + "grad_norm": 0.11138559132814407, + "learning_rate": 1.830230922974534e-05, + "loss": 0.9981, + "step": 11842 + }, + { + "epoch": 0.7195455373959536, + "grad_norm": 0.15380050241947174, + "learning_rate": 1.829490650657006e-05, + "loss": 1.2166, + "step": 11843 + }, + { + "epoch": 0.7196062944285802, + "grad_norm": 0.14814504981040955, + "learning_rate": 1.8287504945551104e-05, + "loss": 1.0389, + "step": 11844 + }, + { + "epoch": 0.7196670514612067, + "grad_norm": 0.197509765625, + "learning_rate": 1.828010454695979e-05, + "loss": 1.017, + "step": 11845 + }, + { + "epoch": 0.7197278084938331, + "grad_norm": 0.1280500888824463, + "learning_rate": 1.8272705311067374e-05, + "loss": 1.0313, + "step": 11846 + }, + { + "epoch": 0.7197885655264596, + "grad_norm": 0.8988205194473267, + "learning_rate": 1.8265307238145074e-05, + "loss": 1.0784, + "step": 11847 + }, + { + "epoch": 0.7198493225590862, + "grad_norm": 0.3101656436920166, + "learning_rate": 1.8257910328464057e-05, + "loss": 1.137, + "step": 11848 + }, + { + "epoch": 0.7199100795917127, + "grad_norm": 0.24984928965568542, + "learning_rate": 1.825051458229551e-05, + "loss": 1.0211, + "step": 11849 + }, + { + "epoch": 0.7199708366243392, + "grad_norm": 0.3139202296733856, + "learning_rate": 1.824311999991046e-05, + "loss": 1.0751, + "step": 11850 + }, + { + "epoch": 0.7200315936569658, + "grad_norm": 0.1468210220336914, + "learning_rate": 1.8235726581579998e-05, + "loss": 0.9974, + "step": 11851 + }, + { + "epoch": 0.7200923506895923, + "grad_norm": 0.10285634547472, + "learning_rate": 1.8228334327575097e-05, + "loss": 0.9946, + "step": 11852 + }, + { + "epoch": 0.7201531077222189, + "grad_norm": 0.5532729625701904, + "learning_rate": 1.8220943238166754e-05, + "loss": 1.0921, + "step": 11853 + }, + { + "epoch": 0.7202138647548454, + "grad_norm": 0.12387371063232422, + "learning_rate": 1.8213553313625888e-05, + "loss": 1.0732, + "step": 11854 + }, + { + "epoch": 0.7202746217874719, + "grad_norm": 0.1855122447013855, + "learning_rate": 1.8206164554223358e-05, + "loss": 1.1238, + "step": 11855 + }, + { + "epoch": 0.7203353788200985, + "grad_norm": 0.10577202588319778, + "learning_rate": 1.819877696023004e-05, + "loss": 1.0391, + "step": 11856 + }, + { + "epoch": 0.720396135852725, + "grad_norm": 0.23974394798278809, + "learning_rate": 1.8191390531916663e-05, + "loss": 1.0801, + "step": 11857 + }, + { + "epoch": 0.7204568928853515, + "grad_norm": 0.22621716558933258, + "learning_rate": 1.8184005269554027e-05, + "loss": 1.1183, + "step": 11858 + }, + { + "epoch": 0.720517649917978, + "grad_norm": 0.2566772401332855, + "learning_rate": 1.8176621173412834e-05, + "loss": 1.1688, + "step": 11859 + }, + { + "epoch": 0.7205784069506045, + "grad_norm": 0.21181346476078033, + "learning_rate": 1.8169238243763735e-05, + "loss": 1.0364, + "step": 11860 + }, + { + "epoch": 0.720639163983231, + "grad_norm": 0.6864669919013977, + "learning_rate": 1.8161856480877365e-05, + "loss": 1.0518, + "step": 11861 + }, + { + "epoch": 0.7206999210158576, + "grad_norm": 0.14368070662021637, + "learning_rate": 1.81544758850243e-05, + "loss": 1.0893, + "step": 11862 + }, + { + "epoch": 0.7207606780484841, + "grad_norm": 0.19693714380264282, + "learning_rate": 1.814709645647508e-05, + "loss": 1.1898, + "step": 11863 + }, + { + "epoch": 0.7208214350811106, + "grad_norm": 0.23038814961910248, + "learning_rate": 1.8139718195500193e-05, + "loss": 1.1314, + "step": 11864 + }, + { + "epoch": 0.7208821921137372, + "grad_norm": 0.1323247104883194, + "learning_rate": 1.81323411023701e-05, + "loss": 1.0347, + "step": 11865 + }, + { + "epoch": 0.7209429491463637, + "grad_norm": 0.1980310082435608, + "learning_rate": 1.8124965177355195e-05, + "loss": 1.0538, + "step": 11866 + }, + { + "epoch": 0.7210037061789902, + "grad_norm": 0.12161320447921753, + "learning_rate": 1.8117590420725856e-05, + "loss": 1.0722, + "step": 11867 + }, + { + "epoch": 0.7210644632116168, + "grad_norm": 0.11527378112077713, + "learning_rate": 1.8110216832752387e-05, + "loss": 1.0053, + "step": 11868 + }, + { + "epoch": 0.7211252202442433, + "grad_norm": 0.2602807283401489, + "learning_rate": 1.8102844413705117e-05, + "loss": 1.0766, + "step": 11869 + }, + { + "epoch": 0.7211859772768698, + "grad_norm": 0.13532786071300507, + "learning_rate": 1.8095473163854237e-05, + "loss": 1.0631, + "step": 11870 + }, + { + "epoch": 0.7212467343094964, + "grad_norm": 0.10426671802997589, + "learning_rate": 1.808810308346995e-05, + "loss": 1.0461, + "step": 11871 + }, + { + "epoch": 0.7213074913421228, + "grad_norm": 0.1360999047756195, + "learning_rate": 1.8080734172822417e-05, + "loss": 1.0698, + "step": 11872 + }, + { + "epoch": 0.7213682483747493, + "grad_norm": 0.10016483813524246, + "learning_rate": 1.807336643218172e-05, + "loss": 1.0322, + "step": 11873 + }, + { + "epoch": 0.7214290054073759, + "grad_norm": 0.20410671830177307, + "learning_rate": 1.8065999861817972e-05, + "loss": 1.0454, + "step": 11874 + }, + { + "epoch": 0.7214897624400024, + "grad_norm": 0.10459274053573608, + "learning_rate": 1.8058634462001168e-05, + "loss": 1.0299, + "step": 11875 + }, + { + "epoch": 0.7215505194726289, + "grad_norm": 0.19205737113952637, + "learning_rate": 1.805127023300131e-05, + "loss": 1.0933, + "step": 11876 + }, + { + "epoch": 0.7216112765052555, + "grad_norm": 0.12911053001880646, + "learning_rate": 1.8043907175088298e-05, + "loss": 1.0144, + "step": 11877 + }, + { + "epoch": 0.721672033537882, + "grad_norm": 0.4698108434677124, + "learning_rate": 1.803654528853203e-05, + "loss": 1.2417, + "step": 11878 + }, + { + "epoch": 0.7217327905705085, + "grad_norm": 0.1186222955584526, + "learning_rate": 1.8029184573602394e-05, + "loss": 1.0676, + "step": 11879 + }, + { + "epoch": 0.7217935476031351, + "grad_norm": 0.11740090698003769, + "learning_rate": 1.8021825030569168e-05, + "loss": 0.9866, + "step": 11880 + }, + { + "epoch": 0.7218543046357616, + "grad_norm": 0.14112940430641174, + "learning_rate": 1.8014466659702133e-05, + "loss": 1.1117, + "step": 11881 + }, + { + "epoch": 0.7219150616683881, + "grad_norm": 0.2183370590209961, + "learning_rate": 1.8007109461271006e-05, + "loss": 1.0429, + "step": 11882 + }, + { + "epoch": 0.7219758187010147, + "grad_norm": 0.1846650242805481, + "learning_rate": 1.7999753435545464e-05, + "loss": 1.2141, + "step": 11883 + }, + { + "epoch": 0.7220365757336412, + "grad_norm": 0.11909636110067368, + "learning_rate": 1.7992398582795146e-05, + "loss": 1.0356, + "step": 11884 + }, + { + "epoch": 0.7220973327662676, + "grad_norm": 0.10280126333236694, + "learning_rate": 1.7985044903289645e-05, + "loss": 1.0218, + "step": 11885 + }, + { + "epoch": 0.7221580897988942, + "grad_norm": 0.3202227056026459, + "learning_rate": 1.7977692397298507e-05, + "loss": 1.0067, + "step": 11886 + }, + { + "epoch": 0.7222188468315207, + "grad_norm": 0.11542931944131851, + "learning_rate": 1.7970341065091245e-05, + "loss": 1.0532, + "step": 11887 + }, + { + "epoch": 0.7222796038641472, + "grad_norm": 0.2886582612991333, + "learning_rate": 1.7962990906937326e-05, + "loss": 1.1831, + "step": 11888 + }, + { + "epoch": 0.7223403608967738, + "grad_norm": 0.1688026338815689, + "learning_rate": 1.795564192310617e-05, + "loss": 1.0442, + "step": 11889 + }, + { + "epoch": 0.7224011179294003, + "grad_norm": 0.18989261984825134, + "learning_rate": 1.7948294113867146e-05, + "loss": 1.0394, + "step": 11890 + }, + { + "epoch": 0.7224618749620269, + "grad_norm": 0.19108445942401886, + "learning_rate": 1.7940947479489605e-05, + "loss": 1.1426, + "step": 11891 + }, + { + "epoch": 0.7225226319946534, + "grad_norm": 0.17166262865066528, + "learning_rate": 1.7933602020242823e-05, + "loss": 1.0184, + "step": 11892 + }, + { + "epoch": 0.7225833890272799, + "grad_norm": 0.19406643509864807, + "learning_rate": 1.7926257736396062e-05, + "loss": 1.1246, + "step": 11893 + }, + { + "epoch": 0.7226441460599065, + "grad_norm": 0.18000058829784393, + "learning_rate": 1.791891462821851e-05, + "loss": 1.1396, + "step": 11894 + }, + { + "epoch": 0.722704903092533, + "grad_norm": 0.6983065009117126, + "learning_rate": 1.791157269597935e-05, + "loss": 1.0904, + "step": 11895 + }, + { + "epoch": 0.7227656601251595, + "grad_norm": 0.11733144521713257, + "learning_rate": 1.7904231939947724e-05, + "loss": 1.0607, + "step": 11896 + }, + { + "epoch": 0.7228264171577861, + "grad_norm": 27.400362014770508, + "learning_rate": 1.7896892360392654e-05, + "loss": 1.0329, + "step": 11897 + }, + { + "epoch": 0.7228871741904125, + "grad_norm": 0.12632763385772705, + "learning_rate": 1.7889553957583205e-05, + "loss": 1.0182, + "step": 11898 + }, + { + "epoch": 0.722947931223039, + "grad_norm": 0.11717429012060165, + "learning_rate": 1.788221673178834e-05, + "loss": 1.0502, + "step": 11899 + }, + { + "epoch": 0.7230086882556656, + "grad_norm": 0.1840904802083969, + "learning_rate": 1.7874880683277053e-05, + "loss": 1.1037, + "step": 11900 + }, + { + "epoch": 0.7230694452882921, + "grad_norm": 0.22967280447483063, + "learning_rate": 1.786754581231822e-05, + "loss": 1.148, + "step": 11901 + }, + { + "epoch": 0.7231302023209186, + "grad_norm": 0.17893098294734955, + "learning_rate": 1.786021211918071e-05, + "loss": 1.0412, + "step": 11902 + }, + { + "epoch": 0.7231909593535452, + "grad_norm": 0.18129698932170868, + "learning_rate": 1.785287960413335e-05, + "loss": 1.2564, + "step": 11903 + }, + { + "epoch": 0.7232517163861717, + "grad_norm": 0.12053747475147247, + "learning_rate": 1.784554826744487e-05, + "loss": 1.0459, + "step": 11904 + }, + { + "epoch": 0.7233124734187982, + "grad_norm": 0.2647194266319275, + "learning_rate": 1.7838218109384053e-05, + "loss": 1.0365, + "step": 11905 + }, + { + "epoch": 0.7233732304514248, + "grad_norm": 0.12505848705768585, + "learning_rate": 1.7830889130219564e-05, + "loss": 1.0399, + "step": 11906 + }, + { + "epoch": 0.7234339874840513, + "grad_norm": 0.10708886384963989, + "learning_rate": 1.782356133022005e-05, + "loss": 1.043, + "step": 11907 + }, + { + "epoch": 0.7234947445166778, + "grad_norm": 0.10663751512765884, + "learning_rate": 1.7816234709654124e-05, + "loss": 1.0387, + "step": 11908 + }, + { + "epoch": 0.7235555015493044, + "grad_norm": 0.10991241782903671, + "learning_rate": 1.7808909268790323e-05, + "loss": 1.0226, + "step": 11909 + }, + { + "epoch": 0.7236162585819309, + "grad_norm": 0.22962409257888794, + "learning_rate": 1.7801585007897188e-05, + "loss": 1.2356, + "step": 11910 + }, + { + "epoch": 0.7236770156145574, + "grad_norm": 0.14256666600704193, + "learning_rate": 1.7794261927243173e-05, + "loss": 1.0354, + "step": 11911 + }, + { + "epoch": 0.7237377726471839, + "grad_norm": 0.11486692726612091, + "learning_rate": 1.778694002709671e-05, + "loss": 1.0556, + "step": 11912 + }, + { + "epoch": 0.7237985296798104, + "grad_norm": 0.184869185090065, + "learning_rate": 1.7779619307726186e-05, + "loss": 1.0473, + "step": 11913 + }, + { + "epoch": 0.7238592867124369, + "grad_norm": 0.1681230366230011, + "learning_rate": 1.7772299769399948e-05, + "loss": 1.1425, + "step": 11914 + }, + { + "epoch": 0.7239200437450635, + "grad_norm": 0.20325717329978943, + "learning_rate": 1.7764981412386273e-05, + "loss": 1.0678, + "step": 11915 + }, + { + "epoch": 0.72398080077769, + "grad_norm": 0.11284222453832626, + "learning_rate": 1.7757664236953468e-05, + "loss": 1.083, + "step": 11916 + }, + { + "epoch": 0.7240415578103165, + "grad_norm": 0.14161108434200287, + "learning_rate": 1.7750348243369703e-05, + "loss": 1.0956, + "step": 11917 + }, + { + "epoch": 0.7241023148429431, + "grad_norm": 0.18631429970264435, + "learning_rate": 1.774303343190315e-05, + "loss": 1.1746, + "step": 11918 + }, + { + "epoch": 0.7241630718755696, + "grad_norm": 0.11615665256977081, + "learning_rate": 1.7735719802821943e-05, + "loss": 1.0379, + "step": 11919 + }, + { + "epoch": 0.7242238289081961, + "grad_norm": 0.11753161996603012, + "learning_rate": 1.7728407356394144e-05, + "loss": 1.0209, + "step": 11920 + }, + { + "epoch": 0.7242845859408227, + "grad_norm": 0.3985714316368103, + "learning_rate": 1.7721096092887833e-05, + "loss": 1.1739, + "step": 11921 + }, + { + "epoch": 0.7243453429734492, + "grad_norm": 0.30062320828437805, + "learning_rate": 1.7713786012570983e-05, + "loss": 1.0979, + "step": 11922 + }, + { + "epoch": 0.7244061000060757, + "grad_norm": 0.1258896291255951, + "learning_rate": 1.7706477115711563e-05, + "loss": 1.0349, + "step": 11923 + }, + { + "epoch": 0.7244668570387023, + "grad_norm": 0.14100538194179535, + "learning_rate": 1.7699169402577447e-05, + "loss": 1.0864, + "step": 11924 + }, + { + "epoch": 0.7245276140713287, + "grad_norm": 0.17764458060264587, + "learning_rate": 1.7691862873436504e-05, + "loss": 1.0911, + "step": 11925 + }, + { + "epoch": 0.7245883711039552, + "grad_norm": 0.138014018535614, + "learning_rate": 1.7684557528556595e-05, + "loss": 1.0563, + "step": 11926 + }, + { + "epoch": 0.7246491281365818, + "grad_norm": 0.12526796758174896, + "learning_rate": 1.7677253368205472e-05, + "loss": 1.0369, + "step": 11927 + }, + { + "epoch": 0.7247098851692083, + "grad_norm": 0.2356281876564026, + "learning_rate": 1.7669950392650876e-05, + "loss": 1.2178, + "step": 11928 + }, + { + "epoch": 0.7247706422018348, + "grad_norm": 0.30109789967536926, + "learning_rate": 1.7662648602160493e-05, + "loss": 1.1486, + "step": 11929 + }, + { + "epoch": 0.7248313992344614, + "grad_norm": 0.24375095963478088, + "learning_rate": 1.765534799700198e-05, + "loss": 1.2662, + "step": 11930 + }, + { + "epoch": 0.7248921562670879, + "grad_norm": 0.151702880859375, + "learning_rate": 1.764804857744294e-05, + "loss": 1.1219, + "step": 11931 + }, + { + "epoch": 0.7249529132997145, + "grad_norm": 0.15219072997570038, + "learning_rate": 1.764075034375093e-05, + "loss": 1.0062, + "step": 11932 + }, + { + "epoch": 0.725013670332341, + "grad_norm": 0.19406989216804504, + "learning_rate": 1.763345329619347e-05, + "loss": 1.0271, + "step": 11933 + }, + { + "epoch": 0.7250744273649675, + "grad_norm": 0.9423526525497437, + "learning_rate": 1.762615743503804e-05, + "loss": 1.1378, + "step": 11934 + }, + { + "epoch": 0.7251351843975941, + "grad_norm": 0.1921922266483307, + "learning_rate": 1.7618862760552063e-05, + "loss": 1.1375, + "step": 11935 + }, + { + "epoch": 0.7251959414302206, + "grad_norm": 0.24168115854263306, + "learning_rate": 1.761156927300293e-05, + "loss": 1.1208, + "step": 11936 + }, + { + "epoch": 0.7252566984628471, + "grad_norm": 0.13272972404956818, + "learning_rate": 1.760427697265799e-05, + "loss": 1.0707, + "step": 11937 + }, + { + "epoch": 0.7253174554954736, + "grad_norm": 0.11252856999635696, + "learning_rate": 1.7596985859784537e-05, + "loss": 1.0489, + "step": 11938 + }, + { + "epoch": 0.7253782125281001, + "grad_norm": 0.10038076341152191, + "learning_rate": 1.7589695934649837e-05, + "loss": 1.0496, + "step": 11939 + }, + { + "epoch": 0.7254389695607266, + "grad_norm": 0.13965252041816711, + "learning_rate": 1.758240719752109e-05, + "loss": 1.0691, + "step": 11940 + }, + { + "epoch": 0.7254997265933532, + "grad_norm": 0.28648120164871216, + "learning_rate": 1.7575119648665457e-05, + "loss": 1.0717, + "step": 11941 + }, + { + "epoch": 0.7255604836259797, + "grad_norm": 0.12222371995449066, + "learning_rate": 1.75678332883501e-05, + "loss": 1.0412, + "step": 11942 + }, + { + "epoch": 0.7256212406586062, + "grad_norm": 0.12257443368434906, + "learning_rate": 1.7560548116842097e-05, + "loss": 1.0781, + "step": 11943 + }, + { + "epoch": 0.7256819976912328, + "grad_norm": 0.3077135980129242, + "learning_rate": 1.755326413440846e-05, + "loss": 1.1147, + "step": 11944 + }, + { + "epoch": 0.7257427547238593, + "grad_norm": 0.19839894771575928, + "learning_rate": 1.7545981341316174e-05, + "loss": 1.1566, + "step": 11945 + }, + { + "epoch": 0.7258035117564858, + "grad_norm": 0.137166365981102, + "learning_rate": 1.7538699737832236e-05, + "loss": 1.0498, + "step": 11946 + }, + { + "epoch": 0.7258642687891124, + "grad_norm": 0.12378855049610138, + "learning_rate": 1.7531419324223536e-05, + "loss": 1.0502, + "step": 11947 + }, + { + "epoch": 0.7259250258217389, + "grad_norm": 0.13504281640052795, + "learning_rate": 1.7524140100756937e-05, + "loss": 1.0169, + "step": 11948 + }, + { + "epoch": 0.7259857828543654, + "grad_norm": 0.5035527348518372, + "learning_rate": 1.751686206769926e-05, + "loss": 1.1884, + "step": 11949 + }, + { + "epoch": 0.726046539886992, + "grad_norm": 0.20623120665550232, + "learning_rate": 1.750958522531728e-05, + "loss": 1.1622, + "step": 11950 + }, + { + "epoch": 0.7261072969196184, + "grad_norm": 0.177435040473938, + "learning_rate": 1.7502309573877735e-05, + "loss": 1.0072, + "step": 11951 + }, + { + "epoch": 0.7261680539522449, + "grad_norm": 0.11040189117193222, + "learning_rate": 1.7495035113647312e-05, + "loss": 0.9951, + "step": 11952 + }, + { + "epoch": 0.7262288109848715, + "grad_norm": 0.15410874783992767, + "learning_rate": 1.7487761844892663e-05, + "loss": 1.0728, + "step": 11953 + }, + { + "epoch": 0.726289568017498, + "grad_norm": 0.17565593123435974, + "learning_rate": 1.7480489767880388e-05, + "loss": 1.1792, + "step": 11954 + }, + { + "epoch": 0.7263503250501245, + "grad_norm": 0.23485685884952545, + "learning_rate": 1.7473218882877047e-05, + "loss": 1.225, + "step": 11955 + }, + { + "epoch": 0.7264110820827511, + "grad_norm": 0.16741417348384857, + "learning_rate": 1.7465949190149155e-05, + "loss": 1.0828, + "step": 11956 + }, + { + "epoch": 0.7264718391153776, + "grad_norm": 0.4004530608654022, + "learning_rate": 1.745868068996318e-05, + "loss": 1.028, + "step": 11957 + }, + { + "epoch": 0.7265325961480041, + "grad_norm": 0.17415936291217804, + "learning_rate": 1.7451413382585557e-05, + "loss": 1.0247, + "step": 11958 + }, + { + "epoch": 0.7265933531806307, + "grad_norm": 0.1294311285018921, + "learning_rate": 1.7444147268282673e-05, + "loss": 1.083, + "step": 11959 + }, + { + "epoch": 0.7266541102132572, + "grad_norm": 0.12395626306533813, + "learning_rate": 1.7436882347320856e-05, + "loss": 1.0623, + "step": 11960 + }, + { + "epoch": 0.7267148672458837, + "grad_norm": 0.1823682337999344, + "learning_rate": 1.74296186199664e-05, + "loss": 1.058, + "step": 11961 + }, + { + "epoch": 0.7267756242785103, + "grad_norm": 0.10366540402173996, + "learning_rate": 1.742235608648558e-05, + "loss": 1.0377, + "step": 11962 + }, + { + "epoch": 0.7268363813111368, + "grad_norm": 0.15267492830753326, + "learning_rate": 1.7415094747144617e-05, + "loss": 1.1065, + "step": 11963 + }, + { + "epoch": 0.7268971383437632, + "grad_norm": 0.14518509805202484, + "learning_rate": 1.7407834602209633e-05, + "loss": 1.0374, + "step": 11964 + }, + { + "epoch": 0.7269578953763898, + "grad_norm": 0.14894914627075195, + "learning_rate": 1.7400575651946778e-05, + "loss": 1.1389, + "step": 11965 + }, + { + "epoch": 0.7270186524090163, + "grad_norm": 0.21335726976394653, + "learning_rate": 1.7393317896622103e-05, + "loss": 1.1297, + "step": 11966 + }, + { + "epoch": 0.7270794094416428, + "grad_norm": 0.12031121551990509, + "learning_rate": 1.738606133650168e-05, + "loss": 1.0386, + "step": 11967 + }, + { + "epoch": 0.7271401664742694, + "grad_norm": 0.11088176816701889, + "learning_rate": 1.737880597185148e-05, + "loss": 1.0218, + "step": 11968 + }, + { + "epoch": 0.7272009235068959, + "grad_norm": 0.13445907831192017, + "learning_rate": 1.7371551802937456e-05, + "loss": 1.1113, + "step": 11969 + }, + { + "epoch": 0.7272616805395224, + "grad_norm": 0.1174154281616211, + "learning_rate": 1.736429883002553e-05, + "loss": 1.0994, + "step": 11970 + }, + { + "epoch": 0.727322437572149, + "grad_norm": 0.1192723736166954, + "learning_rate": 1.7357047053381492e-05, + "loss": 1.0526, + "step": 11971 + }, + { + "epoch": 0.7273831946047755, + "grad_norm": 0.35053887963294983, + "learning_rate": 1.734979647327123e-05, + "loss": 1.0611, + "step": 11972 + }, + { + "epoch": 0.727443951637402, + "grad_norm": 0.13890041410923004, + "learning_rate": 1.7342547089960487e-05, + "loss": 1.049, + "step": 11973 + }, + { + "epoch": 0.7275047086700286, + "grad_norm": 0.12907762825489044, + "learning_rate": 1.733529890371499e-05, + "loss": 1.1144, + "step": 11974 + }, + { + "epoch": 0.7275654657026551, + "grad_norm": 0.1540406048297882, + "learning_rate": 1.7328051914800432e-05, + "loss": 1.0802, + "step": 11975 + }, + { + "epoch": 0.7276262227352817, + "grad_norm": 0.15978215634822845, + "learning_rate": 1.7320806123482447e-05, + "loss": 1.023, + "step": 11976 + }, + { + "epoch": 0.7276869797679081, + "grad_norm": 0.2384788542985916, + "learning_rate": 1.7313561530026624e-05, + "loss": 1.1058, + "step": 11977 + }, + { + "epoch": 0.7277477368005346, + "grad_norm": 0.16895924508571625, + "learning_rate": 1.7306318134698523e-05, + "loss": 1.0798, + "step": 11978 + }, + { + "epoch": 0.7278084938331612, + "grad_norm": 4.890584468841553, + "learning_rate": 1.7299075937763654e-05, + "loss": 1.0808, + "step": 11979 + }, + { + "epoch": 0.7278692508657877, + "grad_norm": 0.13193364441394806, + "learning_rate": 1.7291834939487473e-05, + "loss": 1.1062, + "step": 11980 + }, + { + "epoch": 0.7279300078984142, + "grad_norm": 0.1554129421710968, + "learning_rate": 1.7284595140135408e-05, + "loss": 1.0449, + "step": 11981 + }, + { + "epoch": 0.7279907649310408, + "grad_norm": 0.2118116170167923, + "learning_rate": 1.7277356539972812e-05, + "loss": 1.0861, + "step": 11982 + }, + { + "epoch": 0.7280515219636673, + "grad_norm": 0.20252224802970886, + "learning_rate": 1.7270119139265072e-05, + "loss": 1.0703, + "step": 11983 + }, + { + "epoch": 0.7281122789962938, + "grad_norm": 0.11053638160228729, + "learning_rate": 1.7262882938277425e-05, + "loss": 1.0638, + "step": 11984 + }, + { + "epoch": 0.7281730360289204, + "grad_norm": 0.11425353586673737, + "learning_rate": 1.7255647937275133e-05, + "loss": 1.0067, + "step": 11985 + }, + { + "epoch": 0.7282337930615469, + "grad_norm": 0.13651826977729797, + "learning_rate": 1.7248414136523396e-05, + "loss": 1.0203, + "step": 11986 + }, + { + "epoch": 0.7282945500941734, + "grad_norm": 0.2547251582145691, + "learning_rate": 1.724118153628735e-05, + "loss": 1.0748, + "step": 11987 + }, + { + "epoch": 0.7283553071268, + "grad_norm": 0.15423952043056488, + "learning_rate": 1.7233950136832143e-05, + "loss": 1.0642, + "step": 11988 + }, + { + "epoch": 0.7284160641594265, + "grad_norm": 0.10580328851938248, + "learning_rate": 1.722671993842283e-05, + "loss": 1.0218, + "step": 11989 + }, + { + "epoch": 0.7284768211920529, + "grad_norm": 0.1980973482131958, + "learning_rate": 1.7219490941324444e-05, + "loss": 1.0959, + "step": 11990 + }, + { + "epoch": 0.7285375782246795, + "grad_norm": 0.1393929123878479, + "learning_rate": 1.7212263145801943e-05, + "loss": 1.0355, + "step": 11991 + }, + { + "epoch": 0.728598335257306, + "grad_norm": 0.5448645949363708, + "learning_rate": 1.7205036552120257e-05, + "loss": 1.059, + "step": 11992 + }, + { + "epoch": 0.7286590922899325, + "grad_norm": 0.21412117779254913, + "learning_rate": 1.7197811160544307e-05, + "loss": 1.1711, + "step": 11993 + }, + { + "epoch": 0.7287198493225591, + "grad_norm": 0.6389798521995544, + "learning_rate": 1.7190586971338935e-05, + "loss": 1.0625, + "step": 11994 + }, + { + "epoch": 0.7287806063551856, + "grad_norm": 0.16428568959236145, + "learning_rate": 1.7183363984768935e-05, + "loss": 1.1916, + "step": 11995 + }, + { + "epoch": 0.7288413633878121, + "grad_norm": 0.13203231990337372, + "learning_rate": 1.7176142201099077e-05, + "loss": 1.0246, + "step": 11996 + }, + { + "epoch": 0.7289021204204387, + "grad_norm": 4.570551872253418, + "learning_rate": 1.716892162059407e-05, + "loss": 1.0759, + "step": 11997 + }, + { + "epoch": 0.7289628774530652, + "grad_norm": 0.7408970594406128, + "learning_rate": 1.7161702243518585e-05, + "loss": 1.0312, + "step": 11998 + }, + { + "epoch": 0.7290236344856917, + "grad_norm": 0.18424023687839508, + "learning_rate": 1.715448407013725e-05, + "loss": 1.064, + "step": 11999 + }, + { + "epoch": 0.7290843915183183, + "grad_norm": 0.12323799729347229, + "learning_rate": 1.7147267100714654e-05, + "loss": 1.0692, + "step": 12000 + }, + { + "epoch": 0.7291451485509448, + "grad_norm": 0.1933857798576355, + "learning_rate": 1.714005133551533e-05, + "loss": 1.1501, + "step": 12001 + }, + { + "epoch": 0.7292059055835713, + "grad_norm": 0.13889583945274353, + "learning_rate": 1.713283677480378e-05, + "loss": 1.0426, + "step": 12002 + }, + { + "epoch": 0.7292666626161978, + "grad_norm": 0.09956985712051392, + "learning_rate": 1.7125623418844444e-05, + "loss": 1.0175, + "step": 12003 + }, + { + "epoch": 0.7293274196488243, + "grad_norm": 0.18919014930725098, + "learning_rate": 1.711841126790174e-05, + "loss": 1.1045, + "step": 12004 + }, + { + "epoch": 0.7293881766814508, + "grad_norm": 0.14285099506378174, + "learning_rate": 1.7111200322240027e-05, + "loss": 1.1028, + "step": 12005 + }, + { + "epoch": 0.7294489337140774, + "grad_norm": 0.3515326976776123, + "learning_rate": 1.7103990582123624e-05, + "loss": 1.0577, + "step": 12006 + }, + { + "epoch": 0.7295096907467039, + "grad_norm": 0.14477676153182983, + "learning_rate": 1.7096782047816805e-05, + "loss": 1.1403, + "step": 12007 + }, + { + "epoch": 0.7295704477793304, + "grad_norm": 0.12925733625888824, + "learning_rate": 1.7089574719583785e-05, + "loss": 1.0734, + "step": 12008 + }, + { + "epoch": 0.729631204811957, + "grad_norm": 0.13059315085411072, + "learning_rate": 1.708236859768878e-05, + "loss": 1.0159, + "step": 12009 + }, + { + "epoch": 0.7296919618445835, + "grad_norm": 0.20773211121559143, + "learning_rate": 1.7075163682395933e-05, + "loss": 1.0933, + "step": 12010 + }, + { + "epoch": 0.72975271887721, + "grad_norm": 0.13940361142158508, + "learning_rate": 1.706795997396931e-05, + "loss": 1.0617, + "step": 12011 + }, + { + "epoch": 0.7298134759098366, + "grad_norm": 0.15254928171634674, + "learning_rate": 1.7060757472672977e-05, + "loss": 1.122, + "step": 12012 + }, + { + "epoch": 0.7298742329424631, + "grad_norm": 0.13257399201393127, + "learning_rate": 1.705355617877093e-05, + "loss": 1.0407, + "step": 12013 + }, + { + "epoch": 0.7299349899750897, + "grad_norm": 0.20717093348503113, + "learning_rate": 1.704635609252717e-05, + "loss": 1.0555, + "step": 12014 + }, + { + "epoch": 0.7299957470077162, + "grad_norm": 0.12712256610393524, + "learning_rate": 1.7039157214205593e-05, + "loss": 1.0881, + "step": 12015 + }, + { + "epoch": 0.7300565040403427, + "grad_norm": 0.12965604662895203, + "learning_rate": 1.7031959544070082e-05, + "loss": 1.0441, + "step": 12016 + }, + { + "epoch": 0.7301172610729691, + "grad_norm": 0.22230522334575653, + "learning_rate": 1.7024763082384483e-05, + "loss": 1.1574, + "step": 12017 + }, + { + "epoch": 0.7301780181055957, + "grad_norm": 0.5337817072868347, + "learning_rate": 1.7017567829412535e-05, + "loss": 1.0982, + "step": 12018 + }, + { + "epoch": 0.7302387751382222, + "grad_norm": 0.09890550374984741, + "learning_rate": 1.7010373785418028e-05, + "loss": 1.0779, + "step": 12019 + }, + { + "epoch": 0.7302995321708488, + "grad_norm": 0.20456703007221222, + "learning_rate": 1.7003180950664642e-05, + "loss": 1.0771, + "step": 12020 + }, + { + "epoch": 0.7303602892034753, + "grad_norm": 0.1896372139453888, + "learning_rate": 1.6995989325416046e-05, + "loss": 0.9824, + "step": 12021 + }, + { + "epoch": 0.7304210462361018, + "grad_norm": 0.09826211631298065, + "learning_rate": 1.6988798909935833e-05, + "loss": 1.0475, + "step": 12022 + }, + { + "epoch": 0.7304818032687284, + "grad_norm": 0.12386680394411087, + "learning_rate": 1.6981609704487583e-05, + "loss": 1.051, + "step": 12023 + }, + { + "epoch": 0.7305425603013549, + "grad_norm": 0.1515650451183319, + "learning_rate": 1.6974421709334808e-05, + "loss": 1.0871, + "step": 12024 + }, + { + "epoch": 0.7306033173339814, + "grad_norm": 0.22790172696113586, + "learning_rate": 1.696723492474099e-05, + "loss": 1.0778, + "step": 12025 + }, + { + "epoch": 0.730664074366608, + "grad_norm": 0.5616945624351501, + "learning_rate": 1.696004935096956e-05, + "loss": 1.207, + "step": 12026 + }, + { + "epoch": 0.7307248313992345, + "grad_norm": 0.1577715426683426, + "learning_rate": 1.6952864988283907e-05, + "loss": 1.1128, + "step": 12027 + }, + { + "epoch": 0.730785588431861, + "grad_norm": 0.12401007115840912, + "learning_rate": 1.6945681836947374e-05, + "loss": 1.0281, + "step": 12028 + }, + { + "epoch": 0.7308463454644876, + "grad_norm": 0.1118738129734993, + "learning_rate": 1.6938499897223252e-05, + "loss": 1.0026, + "step": 12029 + }, + { + "epoch": 0.730907102497114, + "grad_norm": 0.16849000751972198, + "learning_rate": 1.6931319169374836e-05, + "loss": 1.0547, + "step": 12030 + }, + { + "epoch": 0.7309678595297405, + "grad_norm": 0.18020136654376984, + "learning_rate": 1.6924139653665293e-05, + "loss": 1.0322, + "step": 12031 + }, + { + "epoch": 0.7310286165623671, + "grad_norm": 0.1565113067626953, + "learning_rate": 1.6916961350357802e-05, + "loss": 0.9947, + "step": 12032 + }, + { + "epoch": 0.7310893735949936, + "grad_norm": 0.1534905582666397, + "learning_rate": 1.690978425971549e-05, + "loss": 1.0182, + "step": 12033 + }, + { + "epoch": 0.7311501306276201, + "grad_norm": 0.123008593916893, + "learning_rate": 1.6902608382001415e-05, + "loss": 1.0365, + "step": 12034 + }, + { + "epoch": 0.7312108876602467, + "grad_norm": 0.2115435153245926, + "learning_rate": 1.6895433717478643e-05, + "loss": 1.1278, + "step": 12035 + }, + { + "epoch": 0.7312716446928732, + "grad_norm": 0.4212009608745575, + "learning_rate": 1.688826026641015e-05, + "loss": 1.0335, + "step": 12036 + }, + { + "epoch": 0.7313324017254997, + "grad_norm": 5.170857906341553, + "learning_rate": 1.688108802905889e-05, + "loss": 1.0288, + "step": 12037 + }, + { + "epoch": 0.7313931587581263, + "grad_norm": 0.22664779424667358, + "learning_rate": 1.687391700568772e-05, + "loss": 1.1439, + "step": 12038 + }, + { + "epoch": 0.7314539157907528, + "grad_norm": 0.11372818052768707, + "learning_rate": 1.686674719655954e-05, + "loss": 1.0972, + "step": 12039 + }, + { + "epoch": 0.7315146728233793, + "grad_norm": 0.19588282704353333, + "learning_rate": 1.6859578601937144e-05, + "loss": 1.1204, + "step": 12040 + }, + { + "epoch": 0.7315754298560059, + "grad_norm": 0.17658144235610962, + "learning_rate": 1.6852411222083302e-05, + "loss": 1.0986, + "step": 12041 + }, + { + "epoch": 0.7316361868886324, + "grad_norm": 0.12479786574840546, + "learning_rate": 1.6845245057260734e-05, + "loss": 1.0102, + "step": 12042 + }, + { + "epoch": 0.7316969439212588, + "grad_norm": 7.837426662445068, + "learning_rate": 1.6838080107732117e-05, + "loss": 1.0823, + "step": 12043 + }, + { + "epoch": 0.7317577009538854, + "grad_norm": 0.6720539331436157, + "learning_rate": 1.6830916373760087e-05, + "loss": 1.1298, + "step": 12044 + }, + { + "epoch": 0.7318184579865119, + "grad_norm": 0.3161039650440216, + "learning_rate": 1.6823753855607222e-05, + "loss": 1.2, + "step": 12045 + }, + { + "epoch": 0.7318792150191384, + "grad_norm": 0.14072582125663757, + "learning_rate": 1.6816592553536077e-05, + "loss": 1.0535, + "step": 12046 + }, + { + "epoch": 0.731939972051765, + "grad_norm": 0.22343140840530396, + "learning_rate": 1.6809432467809144e-05, + "loss": 1.0222, + "step": 12047 + }, + { + "epoch": 0.7320007290843915, + "grad_norm": 0.12016142904758453, + "learning_rate": 1.6802273598688884e-05, + "loss": 1.0236, + "step": 12048 + }, + { + "epoch": 0.732061486117018, + "grad_norm": 0.1599024087190628, + "learning_rate": 1.67951159464377e-05, + "loss": 1.11, + "step": 12049 + }, + { + "epoch": 0.7321222431496446, + "grad_norm": 0.1445721685886383, + "learning_rate": 1.6787959511317958e-05, + "loss": 1.0586, + "step": 12050 + }, + { + "epoch": 0.7321830001822711, + "grad_norm": 0.10825826227664948, + "learning_rate": 1.678080429359198e-05, + "loss": 1.0494, + "step": 12051 + }, + { + "epoch": 0.7322437572148977, + "grad_norm": 0.12876258790493011, + "learning_rate": 1.677365029352204e-05, + "loss": 1.0637, + "step": 12052 + }, + { + "epoch": 0.7323045142475242, + "grad_norm": 1.277681827545166, + "learning_rate": 1.6766497511370376e-05, + "loss": 1.1165, + "step": 12053 + }, + { + "epoch": 0.7323652712801507, + "grad_norm": 0.16351719200611115, + "learning_rate": 1.6759345947399145e-05, + "loss": 1.0655, + "step": 12054 + }, + { + "epoch": 0.7324260283127773, + "grad_norm": 0.1531379669904709, + "learning_rate": 1.675219560187054e-05, + "loss": 1.0843, + "step": 12055 + }, + { + "epoch": 0.7324867853454037, + "grad_norm": 0.14996446669101715, + "learning_rate": 1.6745046475046627e-05, + "loss": 1.0445, + "step": 12056 + }, + { + "epoch": 0.7325475423780302, + "grad_norm": 0.15850290656089783, + "learning_rate": 1.673789856718948e-05, + "loss": 1.0966, + "step": 12057 + }, + { + "epoch": 0.7326082994106567, + "grad_norm": 1.516983151435852, + "learning_rate": 1.6730751878561074e-05, + "loss": 1.0543, + "step": 12058 + }, + { + "epoch": 0.7326690564432833, + "grad_norm": 0.17107877135276794, + "learning_rate": 1.672360640942337e-05, + "loss": 1.1013, + "step": 12059 + }, + { + "epoch": 0.7327298134759098, + "grad_norm": 0.48979371786117554, + "learning_rate": 1.6716462160038325e-05, + "loss": 1.1381, + "step": 12060 + }, + { + "epoch": 0.7327905705085364, + "grad_norm": 0.13007274270057678, + "learning_rate": 1.6709319130667793e-05, + "loss": 1.0532, + "step": 12061 + }, + { + "epoch": 0.7328513275411629, + "grad_norm": 0.11528556793928146, + "learning_rate": 1.6702177321573603e-05, + "loss": 1.031, + "step": 12062 + }, + { + "epoch": 0.7329120845737894, + "grad_norm": 0.10124536603689194, + "learning_rate": 1.669503673301754e-05, + "loss": 1.0465, + "step": 12063 + }, + { + "epoch": 0.732972841606416, + "grad_norm": 0.18992623686790466, + "learning_rate": 1.668789736526134e-05, + "loss": 1.1641, + "step": 12064 + }, + { + "epoch": 0.7330335986390425, + "grad_norm": 0.1198720708489418, + "learning_rate": 1.6680759218566704e-05, + "loss": 1.0275, + "step": 12065 + }, + { + "epoch": 0.733094355671669, + "grad_norm": 0.1708758920431137, + "learning_rate": 1.667362229319528e-05, + "loss": 1.0434, + "step": 12066 + }, + { + "epoch": 0.7331551127042956, + "grad_norm": 0.1209886223077774, + "learning_rate": 1.666648658940867e-05, + "loss": 1.0277, + "step": 12067 + }, + { + "epoch": 0.7332158697369221, + "grad_norm": 0.15256762504577637, + "learning_rate": 1.665935210746844e-05, + "loss": 1.1144, + "step": 12068 + }, + { + "epoch": 0.7332766267695485, + "grad_norm": 0.11685581505298615, + "learning_rate": 1.6652218847636098e-05, + "loss": 0.9854, + "step": 12069 + }, + { + "epoch": 0.7333373838021751, + "grad_norm": 0.12188849598169327, + "learning_rate": 1.664508681017313e-05, + "loss": 1.0406, + "step": 12070 + }, + { + "epoch": 0.7333981408348016, + "grad_norm": 0.14406493306159973, + "learning_rate": 1.663795599534094e-05, + "loss": 1.0887, + "step": 12071 + }, + { + "epoch": 0.7334588978674281, + "grad_norm": 0.13072574138641357, + "learning_rate": 1.663082640340093e-05, + "loss": 1.0692, + "step": 12072 + }, + { + "epoch": 0.7335196549000547, + "grad_norm": 0.2514955699443817, + "learning_rate": 1.6623698034614428e-05, + "loss": 1.1562, + "step": 12073 + }, + { + "epoch": 0.7335804119326812, + "grad_norm": 0.21190212666988373, + "learning_rate": 1.661657088924272e-05, + "loss": 1.0614, + "step": 12074 + }, + { + "epoch": 0.7336411689653077, + "grad_norm": 0.1246088445186615, + "learning_rate": 1.660944496754705e-05, + "loss": 1.0525, + "step": 12075 + }, + { + "epoch": 0.7337019259979343, + "grad_norm": 0.633964478969574, + "learning_rate": 1.660232026978864e-05, + "loss": 1.041, + "step": 12076 + }, + { + "epoch": 0.7337626830305608, + "grad_norm": 0.18055935204029083, + "learning_rate": 1.6595196796228655e-05, + "loss": 1.0343, + "step": 12077 + }, + { + "epoch": 0.7338234400631873, + "grad_norm": 0.10113296657800674, + "learning_rate": 1.6588074547128173e-05, + "loss": 0.9919, + "step": 12078 + }, + { + "epoch": 0.7338841970958139, + "grad_norm": 0.14399178326129913, + "learning_rate": 1.658095352274827e-05, + "loss": 1.0757, + "step": 12079 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 0.18908679485321045, + "learning_rate": 1.657383372334997e-05, + "loss": 1.1575, + "step": 12080 + }, + { + "epoch": 0.7340057111610669, + "grad_norm": 0.14220589399337769, + "learning_rate": 1.656671514919427e-05, + "loss": 1.0921, + "step": 12081 + }, + { + "epoch": 0.7340664681936934, + "grad_norm": 0.380215585231781, + "learning_rate": 1.6559597800542082e-05, + "loss": 1.1456, + "step": 12082 + }, + { + "epoch": 0.7341272252263199, + "grad_norm": 0.5016540288925171, + "learning_rate": 1.6552481677654302e-05, + "loss": 1.1984, + "step": 12083 + }, + { + "epoch": 0.7341879822589464, + "grad_norm": 0.26983851194381714, + "learning_rate": 1.6545366780791792e-05, + "loss": 1.1842, + "step": 12084 + }, + { + "epoch": 0.734248739291573, + "grad_norm": 0.24236908555030823, + "learning_rate": 1.6538253110215292e-05, + "loss": 1.1977, + "step": 12085 + }, + { + "epoch": 0.7343094963241995, + "grad_norm": 0.14403243362903595, + "learning_rate": 1.6531140666185614e-05, + "loss": 1.0771, + "step": 12086 + }, + { + "epoch": 0.734370253356826, + "grad_norm": 0.18638917803764343, + "learning_rate": 1.652402944896344e-05, + "loss": 1.0281, + "step": 12087 + }, + { + "epoch": 0.7344310103894526, + "grad_norm": 0.1815180778503418, + "learning_rate": 1.651691945880944e-05, + "loss": 1.087, + "step": 12088 + }, + { + "epoch": 0.7344917674220791, + "grad_norm": 0.1412448137998581, + "learning_rate": 1.6509810695984225e-05, + "loss": 0.9898, + "step": 12089 + }, + { + "epoch": 0.7345525244547056, + "grad_norm": 0.13540153205394745, + "learning_rate": 1.650270316074838e-05, + "loss": 1.0673, + "step": 12090 + }, + { + "epoch": 0.7346132814873322, + "grad_norm": 35.782196044921875, + "learning_rate": 1.6495596853362415e-05, + "loss": 1.1115, + "step": 12091 + }, + { + "epoch": 0.7346740385199587, + "grad_norm": 0.1554182916879654, + "learning_rate": 1.6488491774086834e-05, + "loss": 1.0441, + "step": 12092 + }, + { + "epoch": 0.7347347955525853, + "grad_norm": 0.2042216658592224, + "learning_rate": 1.6481387923182055e-05, + "loss": 1.1018, + "step": 12093 + }, + { + "epoch": 0.7347955525852118, + "grad_norm": 0.1271388679742813, + "learning_rate": 1.6474285300908486e-05, + "loss": 1.0561, + "step": 12094 + }, + { + "epoch": 0.7348563096178382, + "grad_norm": 0.1373508721590042, + "learning_rate": 1.6467183907526472e-05, + "loss": 1.0392, + "step": 12095 + }, + { + "epoch": 0.7349170666504647, + "grad_norm": 0.1256594955921173, + "learning_rate": 1.6460083743296296e-05, + "loss": 1.0061, + "step": 12096 + }, + { + "epoch": 0.7349778236830913, + "grad_norm": 0.23491358757019043, + "learning_rate": 1.6452984808478267e-05, + "loss": 1.1557, + "step": 12097 + }, + { + "epoch": 0.7350385807157178, + "grad_norm": 0.13289356231689453, + "learning_rate": 1.644588710333254e-05, + "loss": 1.0213, + "step": 12098 + }, + { + "epoch": 0.7350993377483444, + "grad_norm": 0.21962270140647888, + "learning_rate": 1.6438790628119317e-05, + "loss": 1.1838, + "step": 12099 + }, + { + "epoch": 0.7351600947809709, + "grad_norm": 0.16216178238391876, + "learning_rate": 1.6431695383098704e-05, + "loss": 1.091, + "step": 12100 + }, + { + "epoch": 0.7352208518135974, + "grad_norm": 0.3127570152282715, + "learning_rate": 1.6424601368530772e-05, + "loss": 1.1891, + "step": 12101 + }, + { + "epoch": 0.735281608846224, + "grad_norm": 0.22137241065502167, + "learning_rate": 1.6417508584675583e-05, + "loss": 1.1461, + "step": 12102 + }, + { + "epoch": 0.7353423658788505, + "grad_norm": 4.210534572601318, + "learning_rate": 1.6410417031793112e-05, + "loss": 1.0345, + "step": 12103 + }, + { + "epoch": 0.735403122911477, + "grad_norm": 0.23011243343353271, + "learning_rate": 1.6403326710143306e-05, + "loss": 1.0741, + "step": 12104 + }, + { + "epoch": 0.7354638799441036, + "grad_norm": 0.25292232632637024, + "learning_rate": 1.6396237619986042e-05, + "loss": 1.0699, + "step": 12105 + }, + { + "epoch": 0.7355246369767301, + "grad_norm": 0.13957048952579498, + "learning_rate": 1.638914976158117e-05, + "loss": 1.0243, + "step": 12106 + }, + { + "epoch": 0.7355853940093566, + "grad_norm": 0.12817224860191345, + "learning_rate": 1.6382063135188517e-05, + "loss": 1.0472, + "step": 12107 + }, + { + "epoch": 0.735646151041983, + "grad_norm": 0.12771575152873993, + "learning_rate": 1.6374977741067847e-05, + "loss": 1.0779, + "step": 12108 + }, + { + "epoch": 0.7357069080746096, + "grad_norm": 0.1966123878955841, + "learning_rate": 1.6367893579478865e-05, + "loss": 1.0523, + "step": 12109 + }, + { + "epoch": 0.7357676651072361, + "grad_norm": 0.16574272513389587, + "learning_rate": 1.6360810650681262e-05, + "loss": 1.1028, + "step": 12110 + }, + { + "epoch": 0.7358284221398627, + "grad_norm": 0.12013058364391327, + "learning_rate": 1.635372895493461e-05, + "loss": 1.029, + "step": 12111 + }, + { + "epoch": 0.7358891791724892, + "grad_norm": 0.15217263996601105, + "learning_rate": 1.634664849249855e-05, + "loss": 1.1132, + "step": 12112 + }, + { + "epoch": 0.7359499362051157, + "grad_norm": 2.272892475128174, + "learning_rate": 1.6339569263632588e-05, + "loss": 1.0675, + "step": 12113 + }, + { + "epoch": 0.7360106932377423, + "grad_norm": 0.1886737197637558, + "learning_rate": 1.6332491268596223e-05, + "loss": 1.0251, + "step": 12114 + }, + { + "epoch": 0.7360714502703688, + "grad_norm": 0.15079881250858307, + "learning_rate": 1.63254145076489e-05, + "loss": 1.1271, + "step": 12115 + }, + { + "epoch": 0.7361322073029953, + "grad_norm": 0.16714252531528473, + "learning_rate": 1.631833898105002e-05, + "loss": 1.079, + "step": 12116 + }, + { + "epoch": 0.7361929643356219, + "grad_norm": 0.10738638043403625, + "learning_rate": 1.631126468905893e-05, + "loss": 1.0957, + "step": 12117 + }, + { + "epoch": 0.7362537213682484, + "grad_norm": 0.12305158376693726, + "learning_rate": 1.6304191631934945e-05, + "loss": 1.0236, + "step": 12118 + }, + { + "epoch": 0.7363144784008749, + "grad_norm": 0.18824654817581177, + "learning_rate": 1.6297119809937328e-05, + "loss": 1.1123, + "step": 12119 + }, + { + "epoch": 0.7363752354335015, + "grad_norm": 0.11218249052762985, + "learning_rate": 1.6290049223325304e-05, + "loss": 1.0697, + "step": 12120 + }, + { + "epoch": 0.736435992466128, + "grad_norm": 0.8790396451950073, + "learning_rate": 1.628297987235804e-05, + "loss": 1.0434, + "step": 12121 + }, + { + "epoch": 0.7364967494987544, + "grad_norm": 0.15753483772277832, + "learning_rate": 1.627591175729465e-05, + "loss": 1.141, + "step": 12122 + }, + { + "epoch": 0.736557506531381, + "grad_norm": 6.803406238555908, + "learning_rate": 1.6268844878394247e-05, + "loss": 0.9891, + "step": 12123 + }, + { + "epoch": 0.7366182635640075, + "grad_norm": 0.854327917098999, + "learning_rate": 1.626177923591587e-05, + "loss": 1.1061, + "step": 12124 + }, + { + "epoch": 0.736679020596634, + "grad_norm": 0.13866448402404785, + "learning_rate": 1.6254714830118482e-05, + "loss": 1.0706, + "step": 12125 + }, + { + "epoch": 0.7367397776292606, + "grad_norm": 0.2640054523944855, + "learning_rate": 1.6247651661261047e-05, + "loss": 1.1143, + "step": 12126 + }, + { + "epoch": 0.7368005346618871, + "grad_norm": 0.12271498143672943, + "learning_rate": 1.6240589729602446e-05, + "loss": 1.0639, + "step": 12127 + }, + { + "epoch": 0.7368612916945136, + "grad_norm": 0.18867793679237366, + "learning_rate": 1.6233529035401575e-05, + "loss": 1.1719, + "step": 12128 + }, + { + "epoch": 0.7369220487271402, + "grad_norm": 0.2110293060541153, + "learning_rate": 1.622646957891722e-05, + "loss": 1.0572, + "step": 12129 + }, + { + "epoch": 0.7369828057597667, + "grad_norm": 0.118031807243824, + "learning_rate": 1.6219411360408148e-05, + "loss": 1.0803, + "step": 12130 + }, + { + "epoch": 0.7370435627923932, + "grad_norm": 0.12642765045166016, + "learning_rate": 1.6212354380133088e-05, + "loss": 1.0499, + "step": 12131 + }, + { + "epoch": 0.7371043198250198, + "grad_norm": 0.23367492854595184, + "learning_rate": 1.6205298638350704e-05, + "loss": 1.0253, + "step": 12132 + }, + { + "epoch": 0.7371650768576463, + "grad_norm": 0.28645071387290955, + "learning_rate": 1.6198244135319636e-05, + "loss": 1.1369, + "step": 12133 + }, + { + "epoch": 0.7372258338902729, + "grad_norm": 0.14862053096294403, + "learning_rate": 1.6191190871298457e-05, + "loss": 1.0935, + "step": 12134 + }, + { + "epoch": 0.7372865909228993, + "grad_norm": 0.14767231047153473, + "learning_rate": 1.6184138846545714e-05, + "loss": 1.0481, + "step": 12135 + }, + { + "epoch": 0.7373473479555258, + "grad_norm": 0.11814218759536743, + "learning_rate": 1.6177088061319902e-05, + "loss": 1.0082, + "step": 12136 + }, + { + "epoch": 0.7374081049881523, + "grad_norm": 0.5006747245788574, + "learning_rate": 1.617003851587946e-05, + "loss": 1.0453, + "step": 12137 + }, + { + "epoch": 0.7374688620207789, + "grad_norm": 0.14699113368988037, + "learning_rate": 1.61629902104828e-05, + "loss": 1.0858, + "step": 12138 + }, + { + "epoch": 0.7375296190534054, + "grad_norm": 0.114358089864254, + "learning_rate": 1.615594314538827e-05, + "loss": 1.0256, + "step": 12139 + }, + { + "epoch": 0.737590376086032, + "grad_norm": 0.26095932722091675, + "learning_rate": 1.614889732085419e-05, + "loss": 1.1842, + "step": 12140 + }, + { + "epoch": 0.7376511331186585, + "grad_norm": 0.12283433228731155, + "learning_rate": 1.614185273713882e-05, + "loss": 1.0087, + "step": 12141 + }, + { + "epoch": 0.737711890151285, + "grad_norm": 0.11278723925352097, + "learning_rate": 1.613480939450038e-05, + "loss": 1.076, + "step": 12142 + }, + { + "epoch": 0.7377726471839116, + "grad_norm": 0.1318920999765396, + "learning_rate": 1.6127767293197034e-05, + "loss": 1.0812, + "step": 12143 + }, + { + "epoch": 0.7378334042165381, + "grad_norm": 0.10175958275794983, + "learning_rate": 1.6120726433486953e-05, + "loss": 0.9952, + "step": 12144 + }, + { + "epoch": 0.7378941612491646, + "grad_norm": 0.10769624263048172, + "learning_rate": 1.6113686815628187e-05, + "loss": 1.0417, + "step": 12145 + }, + { + "epoch": 0.7379549182817912, + "grad_norm": 0.1390557587146759, + "learning_rate": 1.6106648439878776e-05, + "loss": 1.109, + "step": 12146 + }, + { + "epoch": 0.7380156753144177, + "grad_norm": 0.451018363237381, + "learning_rate": 1.6099611306496698e-05, + "loss": 1.0775, + "step": 12147 + }, + { + "epoch": 0.7380764323470441, + "grad_norm": 0.14769063889980316, + "learning_rate": 1.6092575415739943e-05, + "loss": 1.0381, + "step": 12148 + }, + { + "epoch": 0.7381371893796707, + "grad_norm": 0.15738914906978607, + "learning_rate": 1.608554076786638e-05, + "loss": 1.0091, + "step": 12149 + }, + { + "epoch": 0.7381979464122972, + "grad_norm": 0.1770702749490738, + "learning_rate": 1.6078507363133888e-05, + "loss": 1.1069, + "step": 12150 + }, + { + "epoch": 0.7382587034449237, + "grad_norm": 0.10331787168979645, + "learning_rate": 1.6071475201800272e-05, + "loss": 1.0161, + "step": 12151 + }, + { + "epoch": 0.7383194604775503, + "grad_norm": 0.11065951734781265, + "learning_rate": 1.6064444284123265e-05, + "loss": 1.0884, + "step": 12152 + }, + { + "epoch": 0.7383802175101768, + "grad_norm": 0.13166925311088562, + "learning_rate": 1.6057414610360627e-05, + "loss": 1.032, + "step": 12153 + }, + { + "epoch": 0.7384409745428033, + "grad_norm": 0.13021034002304077, + "learning_rate": 1.6050386180770017e-05, + "loss": 1.0543, + "step": 12154 + }, + { + "epoch": 0.7385017315754299, + "grad_norm": 0.16593828797340393, + "learning_rate": 1.604335899560906e-05, + "loss": 1.0737, + "step": 12155 + }, + { + "epoch": 0.7385624886080564, + "grad_norm": 0.13188204169273376, + "learning_rate": 1.6036333055135344e-05, + "loss": 1.0397, + "step": 12156 + }, + { + "epoch": 0.7386232456406829, + "grad_norm": 0.09796518087387085, + "learning_rate": 1.602930835960641e-05, + "loss": 0.9523, + "step": 12157 + }, + { + "epoch": 0.7386840026733095, + "grad_norm": 0.1713603436946869, + "learning_rate": 1.602228490927974e-05, + "loss": 1.0674, + "step": 12158 + }, + { + "epoch": 0.738744759705936, + "grad_norm": 0.17747315764427185, + "learning_rate": 1.6015262704412787e-05, + "loss": 1.1334, + "step": 12159 + }, + { + "epoch": 0.7388055167385625, + "grad_norm": 0.15091343224048615, + "learning_rate": 1.6008241745262947e-05, + "loss": 1.1572, + "step": 12160 + }, + { + "epoch": 0.738866273771189, + "grad_norm": 0.14343756437301636, + "learning_rate": 1.6001222032087575e-05, + "loss": 1.074, + "step": 12161 + }, + { + "epoch": 0.7389270308038155, + "grad_norm": 0.1796892285346985, + "learning_rate": 1.5994203565143982e-05, + "loss": 1.0948, + "step": 12162 + }, + { + "epoch": 0.738987787836442, + "grad_norm": 0.12104852497577667, + "learning_rate": 1.598718634468942e-05, + "loss": 1.061, + "step": 12163 + }, + { + "epoch": 0.7390485448690686, + "grad_norm": 0.4887903332710266, + "learning_rate": 1.598017037098114e-05, + "loss": 1.0691, + "step": 12164 + }, + { + "epoch": 0.7391093019016951, + "grad_norm": 0.15365274250507355, + "learning_rate": 1.5973155644276284e-05, + "loss": 1.0465, + "step": 12165 + }, + { + "epoch": 0.7391700589343216, + "grad_norm": 0.17284902930259705, + "learning_rate": 1.5966142164831976e-05, + "loss": 1.0845, + "step": 12166 + }, + { + "epoch": 0.7392308159669482, + "grad_norm": 0.1603914052248001, + "learning_rate": 1.5959129932905314e-05, + "loss": 1.116, + "step": 12167 + }, + { + "epoch": 0.7392915729995747, + "grad_norm": 0.13822023570537567, + "learning_rate": 1.5952118948753302e-05, + "loss": 1.0972, + "step": 12168 + }, + { + "epoch": 0.7393523300322012, + "grad_norm": 0.13838990032672882, + "learning_rate": 1.594510921263297e-05, + "loss": 1.0701, + "step": 12169 + }, + { + "epoch": 0.7394130870648278, + "grad_norm": 0.1430082470178604, + "learning_rate": 1.593810072480124e-05, + "loss": 1.0517, + "step": 12170 + }, + { + "epoch": 0.7394738440974543, + "grad_norm": 0.11661330610513687, + "learning_rate": 1.593109348551503e-05, + "loss": 1.0801, + "step": 12171 + }, + { + "epoch": 0.7395346011300808, + "grad_norm": 0.11382436752319336, + "learning_rate": 1.592408749503116e-05, + "loss": 1.02, + "step": 12172 + }, + { + "epoch": 0.7395953581627074, + "grad_norm": 0.11320145428180695, + "learning_rate": 1.5917082753606434e-05, + "loss": 1.0246, + "step": 12173 + }, + { + "epoch": 0.7396561151953338, + "grad_norm": 0.11681144684553146, + "learning_rate": 1.591007926149764e-05, + "loss": 1.0359, + "step": 12174 + }, + { + "epoch": 0.7397168722279603, + "grad_norm": 0.10601933300495148, + "learning_rate": 1.5903077018961477e-05, + "loss": 1.0226, + "step": 12175 + }, + { + "epoch": 0.7397776292605869, + "grad_norm": 0.10508060455322266, + "learning_rate": 1.5896076026254626e-05, + "loss": 1.0079, + "step": 12176 + }, + { + "epoch": 0.7398383862932134, + "grad_norm": 0.2315458506345749, + "learning_rate": 1.5889076283633707e-05, + "loss": 1.1132, + "step": 12177 + }, + { + "epoch": 0.73989914332584, + "grad_norm": 0.2041950225830078, + "learning_rate": 1.5882077791355265e-05, + "loss": 1.1536, + "step": 12178 + }, + { + "epoch": 0.7399599003584665, + "grad_norm": 0.1524074524641037, + "learning_rate": 1.587508054967587e-05, + "loss": 1.0637, + "step": 12179 + }, + { + "epoch": 0.740020657391093, + "grad_norm": 0.4127258062362671, + "learning_rate": 1.5868084558851994e-05, + "loss": 1.1612, + "step": 12180 + }, + { + "epoch": 0.7400814144237196, + "grad_norm": 3.113245964050293, + "learning_rate": 1.5861089819140074e-05, + "loss": 1.0365, + "step": 12181 + }, + { + "epoch": 0.7401421714563461, + "grad_norm": 0.19555634260177612, + "learning_rate": 1.5854096330796507e-05, + "loss": 1.087, + "step": 12182 + }, + { + "epoch": 0.7402029284889726, + "grad_norm": 0.15480531752109528, + "learning_rate": 1.5847104094077636e-05, + "loss": 1.0608, + "step": 12183 + }, + { + "epoch": 0.7402636855215992, + "grad_norm": 0.20994332432746887, + "learning_rate": 1.5840113109239772e-05, + "loss": 1.1236, + "step": 12184 + }, + { + "epoch": 0.7403244425542257, + "grad_norm": 0.1111488938331604, + "learning_rate": 1.583312337653916e-05, + "loss": 1.01, + "step": 12185 + }, + { + "epoch": 0.7403851995868522, + "grad_norm": 0.17714236676692963, + "learning_rate": 1.582613489623202e-05, + "loss": 1.0383, + "step": 12186 + }, + { + "epoch": 0.7404459566194787, + "grad_norm": 0.27466830611228943, + "learning_rate": 1.5819147668574507e-05, + "loss": 1.1285, + "step": 12187 + }, + { + "epoch": 0.7405067136521052, + "grad_norm": 0.15673334896564484, + "learning_rate": 1.5812161693822747e-05, + "loss": 1.0882, + "step": 12188 + }, + { + "epoch": 0.7405674706847317, + "grad_norm": 12.478750228881836, + "learning_rate": 1.580517697223279e-05, + "loss": 1.1318, + "step": 12189 + }, + { + "epoch": 0.7406282277173583, + "grad_norm": 0.197187140583992, + "learning_rate": 1.5798193504060692e-05, + "loss": 1.0544, + "step": 12190 + }, + { + "epoch": 0.7406889847499848, + "grad_norm": 0.17191274464130402, + "learning_rate": 1.5791211289562447e-05, + "loss": 1.0347, + "step": 12191 + }, + { + "epoch": 0.7407497417826113, + "grad_norm": 0.14423011243343353, + "learning_rate": 1.5784230328993942e-05, + "loss": 1.1482, + "step": 12192 + }, + { + "epoch": 0.7408104988152379, + "grad_norm": 0.11650660634040833, + "learning_rate": 1.5777250622611096e-05, + "loss": 1.0523, + "step": 12193 + }, + { + "epoch": 0.7408712558478644, + "grad_norm": 0.1786743849515915, + "learning_rate": 1.577027217066972e-05, + "loss": 1.1743, + "step": 12194 + }, + { + "epoch": 0.7409320128804909, + "grad_norm": 0.15046550333499908, + "learning_rate": 1.576329497342565e-05, + "loss": 1.0375, + "step": 12195 + }, + { + "epoch": 0.7409927699131175, + "grad_norm": 0.11667760461568832, + "learning_rate": 1.5756319031134626e-05, + "loss": 1.0528, + "step": 12196 + }, + { + "epoch": 0.741053526945744, + "grad_norm": 0.13398396968841553, + "learning_rate": 1.574934434405234e-05, + "loss": 1.0074, + "step": 12197 + }, + { + "epoch": 0.7411142839783705, + "grad_norm": 0.4830251634120941, + "learning_rate": 1.574237091243448e-05, + "loss": 1.2222, + "step": 12198 + }, + { + "epoch": 0.7411750410109971, + "grad_norm": 0.12647002935409546, + "learning_rate": 1.57353987365366e-05, + "loss": 1.0748, + "step": 12199 + }, + { + "epoch": 0.7412357980436235, + "grad_norm": 0.21349944174289703, + "learning_rate": 1.572842781661432e-05, + "loss": 1.1692, + "step": 12200 + }, + { + "epoch": 0.74129655507625, + "grad_norm": 7.244073867797852, + "learning_rate": 1.5721458152923142e-05, + "loss": 1.0532, + "step": 12201 + }, + { + "epoch": 0.7413573121088766, + "grad_norm": 0.11956740915775299, + "learning_rate": 1.571448974571854e-05, + "loss": 1.0122, + "step": 12202 + }, + { + "epoch": 0.7414180691415031, + "grad_norm": 0.13429811596870422, + "learning_rate": 1.5707522595255953e-05, + "loss": 1.0962, + "step": 12203 + }, + { + "epoch": 0.7414788261741296, + "grad_norm": 0.16790010035037994, + "learning_rate": 1.570055670179074e-05, + "loss": 1.1043, + "step": 12204 + }, + { + "epoch": 0.7415395832067562, + "grad_norm": 0.17315034568309784, + "learning_rate": 1.569359206557826e-05, + "loss": 1.1732, + "step": 12205 + }, + { + "epoch": 0.7416003402393827, + "grad_norm": 0.221226766705513, + "learning_rate": 1.5686628686873796e-05, + "loss": 1.0745, + "step": 12206 + }, + { + "epoch": 0.7416610972720092, + "grad_norm": 0.11447381228208542, + "learning_rate": 1.5679666565932584e-05, + "loss": 1.0063, + "step": 12207 + }, + { + "epoch": 0.7417218543046358, + "grad_norm": 0.2961326837539673, + "learning_rate": 1.5672705703009828e-05, + "loss": 1.0339, + "step": 12208 + }, + { + "epoch": 0.7417826113372623, + "grad_norm": 0.572446882724762, + "learning_rate": 1.5665746098360685e-05, + "loss": 1.1093, + "step": 12209 + }, + { + "epoch": 0.7418433683698888, + "grad_norm": 0.7450178861618042, + "learning_rate": 1.5658787752240234e-05, + "loss": 1.1228, + "step": 12210 + }, + { + "epoch": 0.7419041254025154, + "grad_norm": 0.14105729758739471, + "learning_rate": 1.5651830664903588e-05, + "loss": 1.0317, + "step": 12211 + }, + { + "epoch": 0.7419648824351419, + "grad_norm": 0.17359383404254913, + "learning_rate": 1.5644874836605716e-05, + "loss": 1.079, + "step": 12212 + }, + { + "epoch": 0.7420256394677683, + "grad_norm": 0.4730372428894043, + "learning_rate": 1.5637920267601597e-05, + "loss": 1.1325, + "step": 12213 + }, + { + "epoch": 0.7420863965003949, + "grad_norm": 0.1052330955862999, + "learning_rate": 1.5630966958146154e-05, + "loss": 1.0287, + "step": 12214 + }, + { + "epoch": 0.7421471535330214, + "grad_norm": 0.11401963233947754, + "learning_rate": 1.562401490849424e-05, + "loss": 1.0561, + "step": 12215 + }, + { + "epoch": 0.7422079105656479, + "grad_norm": 0.16880464553833008, + "learning_rate": 1.561706411890072e-05, + "loss": 1.1105, + "step": 12216 + }, + { + "epoch": 0.7422686675982745, + "grad_norm": 0.15431378781795502, + "learning_rate": 1.5610114589620366e-05, + "loss": 1.0083, + "step": 12217 + }, + { + "epoch": 0.742329424630901, + "grad_norm": 0.2946162223815918, + "learning_rate": 1.560316632090792e-05, + "loss": 1.2708, + "step": 12218 + }, + { + "epoch": 0.7423901816635275, + "grad_norm": 0.4436761140823364, + "learning_rate": 1.559621931301805e-05, + "loss": 1.2363, + "step": 12219 + }, + { + "epoch": 0.7424509386961541, + "grad_norm": 0.16144710779190063, + "learning_rate": 1.5589273566205394e-05, + "loss": 1.1268, + "step": 12220 + }, + { + "epoch": 0.7425116957287806, + "grad_norm": 0.20668721199035645, + "learning_rate": 1.558232908072458e-05, + "loss": 1.0069, + "step": 12221 + }, + { + "epoch": 0.7425724527614072, + "grad_norm": 0.31884416937828064, + "learning_rate": 1.557538585683015e-05, + "loss": 1.0605, + "step": 12222 + }, + { + "epoch": 0.7426332097940337, + "grad_norm": 0.5737851858139038, + "learning_rate": 1.5568443894776602e-05, + "loss": 1.028, + "step": 12223 + }, + { + "epoch": 0.7426939668266602, + "grad_norm": 0.11045891046524048, + "learning_rate": 1.55615031948184e-05, + "loss": 1.0736, + "step": 12224 + }, + { + "epoch": 0.7427547238592868, + "grad_norm": 0.11405198276042938, + "learning_rate": 1.5554563757209956e-05, + "loss": 1.063, + "step": 12225 + }, + { + "epoch": 0.7428154808919133, + "grad_norm": 0.16240975260734558, + "learning_rate": 1.554762558220563e-05, + "loss": 1.073, + "step": 12226 + }, + { + "epoch": 0.7428762379245397, + "grad_norm": 0.3138273358345032, + "learning_rate": 1.5540688670059754e-05, + "loss": 1.0593, + "step": 12227 + }, + { + "epoch": 0.7429369949571663, + "grad_norm": 0.10800588876008987, + "learning_rate": 1.55337530210266e-05, + "loss": 1.0841, + "step": 12228 + }, + { + "epoch": 0.7429977519897928, + "grad_norm": 4.588499069213867, + "learning_rate": 1.552681863536038e-05, + "loss": 1.0594, + "step": 12229 + }, + { + "epoch": 0.7430585090224193, + "grad_norm": 0.15338660776615143, + "learning_rate": 1.551988551331529e-05, + "loss": 1.0901, + "step": 12230 + }, + { + "epoch": 0.7431192660550459, + "grad_norm": 0.1269010752439499, + "learning_rate": 1.5512953655145467e-05, + "loss": 1.047, + "step": 12231 + }, + { + "epoch": 0.7431800230876724, + "grad_norm": 0.19623033702373505, + "learning_rate": 1.550602306110499e-05, + "loss": 1.0537, + "step": 12232 + }, + { + "epoch": 0.7432407801202989, + "grad_norm": 0.10276135802268982, + "learning_rate": 1.5499093731447905e-05, + "loss": 1.0172, + "step": 12233 + }, + { + "epoch": 0.7433015371529255, + "grad_norm": 0.16195812821388245, + "learning_rate": 1.549216566642821e-05, + "loss": 1.0106, + "step": 12234 + }, + { + "epoch": 0.743362294185552, + "grad_norm": 0.1606951355934143, + "learning_rate": 1.548523886629985e-05, + "loss": 1.1256, + "step": 12235 + }, + { + "epoch": 0.7434230512181785, + "grad_norm": 0.20359116792678833, + "learning_rate": 1.547831333131672e-05, + "loss": 1.0881, + "step": 12236 + }, + { + "epoch": 0.7434838082508051, + "grad_norm": 0.1122780293226242, + "learning_rate": 1.54713890617327e-05, + "loss": 1.0172, + "step": 12237 + }, + { + "epoch": 0.7435445652834316, + "grad_norm": 0.346225768327713, + "learning_rate": 1.54644660578016e-05, + "loss": 1.0837, + "step": 12238 + }, + { + "epoch": 0.7436053223160581, + "grad_norm": 0.2300349622964859, + "learning_rate": 1.5457544319777163e-05, + "loss": 1.15, + "step": 12239 + }, + { + "epoch": 0.7436660793486846, + "grad_norm": 0.11195831745862961, + "learning_rate": 1.54506238479131e-05, + "loss": 1.0275, + "step": 12240 + }, + { + "epoch": 0.7437268363813111, + "grad_norm": 0.17067274451255798, + "learning_rate": 1.544370464246311e-05, + "loss": 1.0225, + "step": 12241 + }, + { + "epoch": 0.7437875934139376, + "grad_norm": 0.15167181193828583, + "learning_rate": 1.5436786703680806e-05, + "loss": 1.0461, + "step": 12242 + }, + { + "epoch": 0.7438483504465642, + "grad_norm": 0.17596226930618286, + "learning_rate": 1.5429870031819766e-05, + "loss": 1.0996, + "step": 12243 + }, + { + "epoch": 0.7439091074791907, + "grad_norm": 0.15617777407169342, + "learning_rate": 1.542295462713354e-05, + "loss": 1.0769, + "step": 12244 + }, + { + "epoch": 0.7439698645118172, + "grad_norm": 0.7419642210006714, + "learning_rate": 1.5416040489875565e-05, + "loss": 1.0329, + "step": 12245 + }, + { + "epoch": 0.7440306215444438, + "grad_norm": 0.15686364471912384, + "learning_rate": 1.5409127620299326e-05, + "loss": 0.9753, + "step": 12246 + }, + { + "epoch": 0.7440913785770703, + "grad_norm": 0.4805592894554138, + "learning_rate": 1.54022160186582e-05, + "loss": 1.0545, + "step": 12247 + }, + { + "epoch": 0.7441521356096968, + "grad_norm": 0.14140735566616058, + "learning_rate": 1.539530568520553e-05, + "loss": 1.0401, + "step": 12248 + }, + { + "epoch": 0.7442128926423234, + "grad_norm": 0.9698283076286316, + "learning_rate": 1.5388396620194623e-05, + "loss": 1.1128, + "step": 12249 + }, + { + "epoch": 0.7442736496749499, + "grad_norm": 0.10790181159973145, + "learning_rate": 1.5381488823878725e-05, + "loss": 1.0545, + "step": 12250 + }, + { + "epoch": 0.7443344067075764, + "grad_norm": 0.21961985528469086, + "learning_rate": 1.5374582296511053e-05, + "loss": 1.1129, + "step": 12251 + }, + { + "epoch": 0.744395163740203, + "grad_norm": 0.11807654798030853, + "learning_rate": 1.5367677038344754e-05, + "loss": 1.016, + "step": 12252 + }, + { + "epoch": 0.7444559207728294, + "grad_norm": 0.20182126760482788, + "learning_rate": 1.536077304963295e-05, + "loss": 1.0361, + "step": 12253 + }, + { + "epoch": 0.7445166778054559, + "grad_norm": 0.12720921635627747, + "learning_rate": 1.53538703306287e-05, + "loss": 0.9927, + "step": 12254 + }, + { + "epoch": 0.7445774348380825, + "grad_norm": 0.18298062682151794, + "learning_rate": 1.5346968881585034e-05, + "loss": 1.212, + "step": 12255 + }, + { + "epoch": 0.744638191870709, + "grad_norm": 0.18976110219955444, + "learning_rate": 1.5340068702754906e-05, + "loss": 1.0713, + "step": 12256 + }, + { + "epoch": 0.7446989489033355, + "grad_norm": 0.16418978571891785, + "learning_rate": 1.533316979439128e-05, + "loss": 1.0432, + "step": 12257 + }, + { + "epoch": 0.7447597059359621, + "grad_norm": 0.10490068793296814, + "learning_rate": 1.5326272156747024e-05, + "loss": 1.0197, + "step": 12258 + }, + { + "epoch": 0.7448204629685886, + "grad_norm": 0.10313988476991653, + "learning_rate": 1.531937579007495e-05, + "loss": 1.0386, + "step": 12259 + }, + { + "epoch": 0.7448812200012151, + "grad_norm": 0.11124314367771149, + "learning_rate": 1.5312480694627862e-05, + "loss": 1.0259, + "step": 12260 + }, + { + "epoch": 0.7449419770338417, + "grad_norm": 0.12163428962230682, + "learning_rate": 1.5305586870658485e-05, + "loss": 1.098, + "step": 12261 + }, + { + "epoch": 0.7450027340664682, + "grad_norm": 5.009799003601074, + "learning_rate": 1.5298694318419536e-05, + "loss": 1.0474, + "step": 12262 + }, + { + "epoch": 0.7450634910990948, + "grad_norm": 0.19056597352027893, + "learning_rate": 1.5291803038163665e-05, + "loss": 1.0466, + "step": 12263 + }, + { + "epoch": 0.7451242481317213, + "grad_norm": 0.11156143248081207, + "learning_rate": 1.5284913030143472e-05, + "loss": 1.0277, + "step": 12264 + }, + { + "epoch": 0.7451850051643478, + "grad_norm": 0.15466099977493286, + "learning_rate": 1.5278024294611487e-05, + "loss": 1.0494, + "step": 12265 + }, + { + "epoch": 0.7452457621969742, + "grad_norm": 0.11629234254360199, + "learning_rate": 1.5271136831820214e-05, + "loss": 1.0384, + "step": 12266 + }, + { + "epoch": 0.7453065192296008, + "grad_norm": 0.14503991603851318, + "learning_rate": 1.5264250642022153e-05, + "loss": 1.0128, + "step": 12267 + }, + { + "epoch": 0.7453672762622273, + "grad_norm": 0.1715267300605774, + "learning_rate": 1.5257365725469703e-05, + "loss": 1.159, + "step": 12268 + }, + { + "epoch": 0.7454280332948539, + "grad_norm": 0.18305064737796783, + "learning_rate": 1.5250482082415223e-05, + "loss": 1.2106, + "step": 12269 + }, + { + "epoch": 0.7454887903274804, + "grad_norm": 0.8006494641304016, + "learning_rate": 1.524359971311104e-05, + "loss": 1.1424, + "step": 12270 + }, + { + "epoch": 0.7455495473601069, + "grad_norm": 0.25118541717529297, + "learning_rate": 1.5236718617809425e-05, + "loss": 1.143, + "step": 12271 + }, + { + "epoch": 0.7456103043927335, + "grad_norm": 0.12858976423740387, + "learning_rate": 1.5229838796762613e-05, + "loss": 1.1199, + "step": 12272 + }, + { + "epoch": 0.74567106142536, + "grad_norm": 0.114949531853199, + "learning_rate": 1.522296025022278e-05, + "loss": 1.0263, + "step": 12273 + }, + { + "epoch": 0.7457318184579865, + "grad_norm": 0.3934478759765625, + "learning_rate": 1.521608297844206e-05, + "loss": 1.0977, + "step": 12274 + }, + { + "epoch": 0.7457925754906131, + "grad_norm": 0.10857676714658737, + "learning_rate": 1.520920698167254e-05, + "loss": 1.0782, + "step": 12275 + }, + { + "epoch": 0.7458533325232396, + "grad_norm": 0.22125573456287384, + "learning_rate": 1.5202332260166269e-05, + "loss": 1.1118, + "step": 12276 + }, + { + "epoch": 0.7459140895558661, + "grad_norm": 0.16291722655296326, + "learning_rate": 1.5195458814175217e-05, + "loss": 1.0884, + "step": 12277 + }, + { + "epoch": 0.7459748465884927, + "grad_norm": 0.17648471891880035, + "learning_rate": 1.518858664395138e-05, + "loss": 1.0698, + "step": 12278 + }, + { + "epoch": 0.7460356036211191, + "grad_norm": 0.12250689417123795, + "learning_rate": 1.518171574974661e-05, + "loss": 1.014, + "step": 12279 + }, + { + "epoch": 0.7460963606537456, + "grad_norm": 0.27059710025787354, + "learning_rate": 1.5174846131812787e-05, + "loss": 1.078, + "step": 12280 + }, + { + "epoch": 0.7461571176863722, + "grad_norm": 0.09177951514720917, + "learning_rate": 1.5167977790401705e-05, + "loss": 0.978, + "step": 12281 + }, + { + "epoch": 0.7462178747189987, + "grad_norm": 0.12109772115945816, + "learning_rate": 1.5161110725765116e-05, + "loss": 1.0156, + "step": 12282 + }, + { + "epoch": 0.7462786317516252, + "grad_norm": 0.11416082084178925, + "learning_rate": 1.515424493815476e-05, + "loss": 1.0002, + "step": 12283 + }, + { + "epoch": 0.7463393887842518, + "grad_norm": 0.12184463441371918, + "learning_rate": 1.5147380427822295e-05, + "loss": 1.0733, + "step": 12284 + }, + { + "epoch": 0.7464001458168783, + "grad_norm": 0.1866767257452011, + "learning_rate": 1.514051719501935e-05, + "loss": 1.0614, + "step": 12285 + }, + { + "epoch": 0.7464609028495048, + "grad_norm": 0.6591934561729431, + "learning_rate": 1.5133655239997474e-05, + "loss": 1.21, + "step": 12286 + }, + { + "epoch": 0.7465216598821314, + "grad_norm": 9.733088493347168, + "learning_rate": 1.5126794563008184e-05, + "loss": 1.1255, + "step": 12287 + }, + { + "epoch": 0.7465824169147579, + "grad_norm": 0.14364604651927948, + "learning_rate": 1.5119935164303e-05, + "loss": 1.0558, + "step": 12288 + }, + { + "epoch": 0.7466431739473844, + "grad_norm": 0.12493626773357391, + "learning_rate": 1.5113077044133334e-05, + "loss": 1.0376, + "step": 12289 + }, + { + "epoch": 0.746703930980011, + "grad_norm": 0.18352729082107544, + "learning_rate": 1.510622020275057e-05, + "loss": 1.1661, + "step": 12290 + }, + { + "epoch": 0.7467646880126375, + "grad_norm": 0.16050124168395996, + "learning_rate": 1.5099364640406066e-05, + "loss": 1.059, + "step": 12291 + }, + { + "epoch": 0.7468254450452639, + "grad_norm": 0.5644650459289551, + "learning_rate": 1.509251035735107e-05, + "loss": 1.2231, + "step": 12292 + }, + { + "epoch": 0.7468862020778905, + "grad_norm": 0.15874862670898438, + "learning_rate": 1.5085657353836863e-05, + "loss": 1.0652, + "step": 12293 + }, + { + "epoch": 0.746946959110517, + "grad_norm": 0.1588876098394394, + "learning_rate": 1.5078805630114639e-05, + "loss": 1.1025, + "step": 12294 + }, + { + "epoch": 0.7470077161431435, + "grad_norm": 0.10968033969402313, + "learning_rate": 1.5071955186435544e-05, + "loss": 1.0647, + "step": 12295 + }, + { + "epoch": 0.7470684731757701, + "grad_norm": 0.19421853125095367, + "learning_rate": 1.506510602305068e-05, + "loss": 1.0627, + "step": 12296 + }, + { + "epoch": 0.7471292302083966, + "grad_norm": 0.1826222538948059, + "learning_rate": 1.5058258140211112e-05, + "loss": 1.1209, + "step": 12297 + }, + { + "epoch": 0.7471899872410231, + "grad_norm": 0.20129136741161346, + "learning_rate": 1.5051411538167843e-05, + "loss": 1.1843, + "step": 12298 + }, + { + "epoch": 0.7472507442736497, + "grad_norm": 0.20420442521572113, + "learning_rate": 1.504456621717184e-05, + "loss": 1.0034, + "step": 12299 + }, + { + "epoch": 0.7473115013062762, + "grad_norm": 0.23053786158561707, + "learning_rate": 1.5037722177474023e-05, + "loss": 1.0401, + "step": 12300 + }, + { + "epoch": 0.7473722583389028, + "grad_norm": 0.16696183383464813, + "learning_rate": 1.503087941932525e-05, + "loss": 1.034, + "step": 12301 + }, + { + "epoch": 0.7474330153715293, + "grad_norm": 0.171365886926651, + "learning_rate": 1.502403794297636e-05, + "loss": 1.0115, + "step": 12302 + }, + { + "epoch": 0.7474937724041558, + "grad_norm": 0.20434030890464783, + "learning_rate": 1.5017197748678096e-05, + "loss": 1.0599, + "step": 12303 + }, + { + "epoch": 0.7475545294367824, + "grad_norm": 0.24190174043178558, + "learning_rate": 1.5010358836681232e-05, + "loss": 1.0744, + "step": 12304 + }, + { + "epoch": 0.7476152864694088, + "grad_norm": 0.13057444989681244, + "learning_rate": 1.5003521207236443e-05, + "loss": 1.0539, + "step": 12305 + }, + { + "epoch": 0.7476760435020353, + "grad_norm": 0.14971034228801727, + "learning_rate": 1.4996684860594329e-05, + "loss": 1.073, + "step": 12306 + }, + { + "epoch": 0.7477368005346618, + "grad_norm": 0.19243723154067993, + "learning_rate": 1.49898497970055e-05, + "loss": 1.0818, + "step": 12307 + }, + { + "epoch": 0.7477975575672884, + "grad_norm": 0.14464719593524933, + "learning_rate": 1.498301601672048e-05, + "loss": 1.0696, + "step": 12308 + }, + { + "epoch": 0.7478583145999149, + "grad_norm": 0.12280137836933136, + "learning_rate": 1.497618351998979e-05, + "loss": 1.0549, + "step": 12309 + }, + { + "epoch": 0.7479190716325415, + "grad_norm": 0.12875477969646454, + "learning_rate": 1.4969352307063866e-05, + "loss": 1.0963, + "step": 12310 + }, + { + "epoch": 0.747979828665168, + "grad_norm": 0.8757300972938538, + "learning_rate": 1.4962522378193118e-05, + "loss": 1.1342, + "step": 12311 + }, + { + "epoch": 0.7480405856977945, + "grad_norm": 0.11182583123445511, + "learning_rate": 1.4955693733627867e-05, + "loss": 1.0095, + "step": 12312 + }, + { + "epoch": 0.7481013427304211, + "grad_norm": 0.9983725547790527, + "learning_rate": 1.4948866373618426e-05, + "loss": 1.0637, + "step": 12313 + }, + { + "epoch": 0.7481620997630476, + "grad_norm": 0.1568031758069992, + "learning_rate": 1.4942040298415072e-05, + "loss": 0.9961, + "step": 12314 + }, + { + "epoch": 0.7482228567956741, + "grad_norm": 0.266242653131485, + "learning_rate": 1.4935215508268014e-05, + "loss": 1.1979, + "step": 12315 + }, + { + "epoch": 0.7482836138283007, + "grad_norm": 0.16601009666919708, + "learning_rate": 1.4928392003427404e-05, + "loss": 1.0398, + "step": 12316 + }, + { + "epoch": 0.7483443708609272, + "grad_norm": 2.6673924922943115, + "learning_rate": 1.4921569784143368e-05, + "loss": 1.037, + "step": 12317 + }, + { + "epoch": 0.7484051278935536, + "grad_norm": 0.12594975531101227, + "learning_rate": 1.491474885066597e-05, + "loss": 1.0705, + "step": 12318 + }, + { + "epoch": 0.7484658849261802, + "grad_norm": 0.17537593841552734, + "learning_rate": 1.4907929203245235e-05, + "loss": 1.1044, + "step": 12319 + }, + { + "epoch": 0.7485266419588067, + "grad_norm": 0.12230952829122543, + "learning_rate": 1.4901110842131138e-05, + "loss": 1.02, + "step": 12320 + }, + { + "epoch": 0.7485873989914332, + "grad_norm": 0.18852370977401733, + "learning_rate": 1.4894293767573613e-05, + "loss": 1.082, + "step": 12321 + }, + { + "epoch": 0.7486481560240598, + "grad_norm": 0.11276037991046906, + "learning_rate": 1.488747797982253e-05, + "loss": 1.093, + "step": 12322 + }, + { + "epoch": 0.7487089130566863, + "grad_norm": 0.11569953709840775, + "learning_rate": 1.4880663479127727e-05, + "loss": 1.0358, + "step": 12323 + }, + { + "epoch": 0.7487696700893128, + "grad_norm": 0.622245728969574, + "learning_rate": 1.487385026573898e-05, + "loss": 1.1218, + "step": 12324 + }, + { + "epoch": 0.7488304271219394, + "grad_norm": 0.21036945283412933, + "learning_rate": 1.4867038339906076e-05, + "loss": 1.1005, + "step": 12325 + }, + { + "epoch": 0.7488911841545659, + "grad_norm": 0.11691872030496597, + "learning_rate": 1.4860227701878654e-05, + "loss": 1.0575, + "step": 12326 + }, + { + "epoch": 0.7489519411871924, + "grad_norm": 0.11479263007640839, + "learning_rate": 1.4853418351906374e-05, + "loss": 1.0512, + "step": 12327 + }, + { + "epoch": 0.749012698219819, + "grad_norm": 0.11864512413740158, + "learning_rate": 1.4846610290238843e-05, + "loss": 1.0454, + "step": 12328 + }, + { + "epoch": 0.7490734552524455, + "grad_norm": 0.18619774281978607, + "learning_rate": 1.483980351712559e-05, + "loss": 1.0688, + "step": 12329 + }, + { + "epoch": 0.749134212285072, + "grad_norm": 0.23992513120174408, + "learning_rate": 1.4832998032816148e-05, + "loss": 1.1711, + "step": 12330 + }, + { + "epoch": 0.7491949693176986, + "grad_norm": 0.24409279227256775, + "learning_rate": 1.4826193837559981e-05, + "loss": 1.2871, + "step": 12331 + }, + { + "epoch": 0.749255726350325, + "grad_norm": 0.14463911950588226, + "learning_rate": 1.4819390931606458e-05, + "loss": 1.0566, + "step": 12332 + }, + { + "epoch": 0.7493164833829515, + "grad_norm": 0.17773549258708954, + "learning_rate": 1.481258931520495e-05, + "loss": 1.1099, + "step": 12333 + }, + { + "epoch": 0.7493772404155781, + "grad_norm": 0.13054946064949036, + "learning_rate": 1.4805788988604791e-05, + "loss": 1.0594, + "step": 12334 + }, + { + "epoch": 0.7494379974482046, + "grad_norm": 0.19657039642333984, + "learning_rate": 1.4798989952055248e-05, + "loss": 1.0853, + "step": 12335 + }, + { + "epoch": 0.7494987544808311, + "grad_norm": 0.11151206493377686, + "learning_rate": 1.4792192205805527e-05, + "loss": 0.9878, + "step": 12336 + }, + { + "epoch": 0.7495595115134577, + "grad_norm": 0.1223205104470253, + "learning_rate": 1.4785395750104808e-05, + "loss": 1.0022, + "step": 12337 + }, + { + "epoch": 0.7496202685460842, + "grad_norm": 1.025681972503662, + "learning_rate": 1.4778600585202213e-05, + "loss": 1.018, + "step": 12338 + }, + { + "epoch": 0.7496810255787107, + "grad_norm": 0.920658528804779, + "learning_rate": 1.477180671134682e-05, + "loss": 1.067, + "step": 12339 + }, + { + "epoch": 0.7497417826113373, + "grad_norm": 0.4540456235408783, + "learning_rate": 1.4765014128787663e-05, + "loss": 1.1926, + "step": 12340 + }, + { + "epoch": 0.7498025396439638, + "grad_norm": 0.19764259457588196, + "learning_rate": 1.4758222837773727e-05, + "loss": 1.0697, + "step": 12341 + }, + { + "epoch": 0.7498632966765904, + "grad_norm": 0.940133810043335, + "learning_rate": 1.475143283855394e-05, + "loss": 1.1281, + "step": 12342 + }, + { + "epoch": 0.7499240537092169, + "grad_norm": 0.1195477619767189, + "learning_rate": 1.47446441313772e-05, + "loss": 1.0252, + "step": 12343 + }, + { + "epoch": 0.7499848107418434, + "grad_norm": 0.19660130143165588, + "learning_rate": 1.4737856716492344e-05, + "loss": 1.0857, + "step": 12344 + }, + { + "epoch": 0.7500455677744698, + "grad_norm": 0.16826272010803223, + "learning_rate": 1.4731070594148166e-05, + "loss": 1.0879, + "step": 12345 + }, + { + "epoch": 0.7501063248070964, + "grad_norm": 0.2910539507865906, + "learning_rate": 1.472428576459341e-05, + "loss": 1.1754, + "step": 12346 + }, + { + "epoch": 0.7501670818397229, + "grad_norm": 0.21095766127109528, + "learning_rate": 1.4717502228076779e-05, + "loss": 1.106, + "step": 12347 + }, + { + "epoch": 0.7502278388723495, + "grad_norm": 0.21968616545200348, + "learning_rate": 1.4710719984846927e-05, + "loss": 1.1062, + "step": 12348 + }, + { + "epoch": 0.750288595904976, + "grad_norm": 0.15479840338230133, + "learning_rate": 1.4703939035152436e-05, + "loss": 1.0531, + "step": 12349 + }, + { + "epoch": 0.7503493529376025, + "grad_norm": 0.14020900428295135, + "learning_rate": 1.4697159379241899e-05, + "loss": 1.05, + "step": 12350 + }, + { + "epoch": 0.7504101099702291, + "grad_norm": 0.15454229712486267, + "learning_rate": 1.4690381017363808e-05, + "loss": 1.0851, + "step": 12351 + }, + { + "epoch": 0.7504708670028556, + "grad_norm": 0.30164051055908203, + "learning_rate": 1.4683603949766639e-05, + "loss": 1.0625, + "step": 12352 + }, + { + "epoch": 0.7505316240354821, + "grad_norm": 0.17074117064476013, + "learning_rate": 1.4676828176698782e-05, + "loss": 1.065, + "step": 12353 + }, + { + "epoch": 0.7505923810681087, + "grad_norm": 0.12957727909088135, + "learning_rate": 1.4670053698408598e-05, + "loss": 1.0886, + "step": 12354 + }, + { + "epoch": 0.7506531381007352, + "grad_norm": 0.21802391111850739, + "learning_rate": 1.4663280515144445e-05, + "loss": 1.2373, + "step": 12355 + }, + { + "epoch": 0.7507138951333617, + "grad_norm": 0.11143873631954193, + "learning_rate": 1.465650862715457e-05, + "loss": 1.028, + "step": 12356 + }, + { + "epoch": 0.7507746521659883, + "grad_norm": 0.22323362529277802, + "learning_rate": 1.4649738034687206e-05, + "loss": 1.1476, + "step": 12357 + }, + { + "epoch": 0.7508354091986147, + "grad_norm": 0.16581158339977264, + "learning_rate": 1.4642968737990543e-05, + "loss": 1.111, + "step": 12358 + }, + { + "epoch": 0.7508961662312412, + "grad_norm": 0.25218355655670166, + "learning_rate": 1.463620073731266e-05, + "loss": 1.0233, + "step": 12359 + }, + { + "epoch": 0.7509569232638678, + "grad_norm": 0.15309728682041168, + "learning_rate": 1.4629434032901696e-05, + "loss": 1.1349, + "step": 12360 + }, + { + "epoch": 0.7510176802964943, + "grad_norm": 0.16351532936096191, + "learning_rate": 1.462266862500566e-05, + "loss": 1.0063, + "step": 12361 + }, + { + "epoch": 0.7510784373291208, + "grad_norm": 0.12894141674041748, + "learning_rate": 1.4615904513872547e-05, + "loss": 1.1062, + "step": 12362 + }, + { + "epoch": 0.7511391943617474, + "grad_norm": 0.16792377829551697, + "learning_rate": 1.4609141699750295e-05, + "loss": 1.0515, + "step": 12363 + }, + { + "epoch": 0.7511999513943739, + "grad_norm": 0.11803977936506271, + "learning_rate": 1.4602380182886788e-05, + "loss": 1.0415, + "step": 12364 + }, + { + "epoch": 0.7512607084270004, + "grad_norm": 0.1203421950340271, + "learning_rate": 1.4595619963529883e-05, + "loss": 1.024, + "step": 12365 + }, + { + "epoch": 0.751321465459627, + "grad_norm": 0.1662909984588623, + "learning_rate": 1.458886104192737e-05, + "loss": 1.0982, + "step": 12366 + }, + { + "epoch": 0.7513822224922535, + "grad_norm": 0.19591106474399567, + "learning_rate": 1.4582103418327003e-05, + "loss": 1.0386, + "step": 12367 + }, + { + "epoch": 0.75144297952488, + "grad_norm": 0.16303405165672302, + "learning_rate": 1.4575347092976477e-05, + "loss": 1.0542, + "step": 12368 + }, + { + "epoch": 0.7515037365575066, + "grad_norm": 0.8739423751831055, + "learning_rate": 1.4568592066123454e-05, + "loss": 1.202, + "step": 12369 + }, + { + "epoch": 0.7515644935901331, + "grad_norm": 0.13482946157455444, + "learning_rate": 1.456183833801552e-05, + "loss": 1.0478, + "step": 12370 + }, + { + "epoch": 0.7516252506227595, + "grad_norm": 0.21555595099925995, + "learning_rate": 1.4555085908900267e-05, + "loss": 1.1375, + "step": 12371 + }, + { + "epoch": 0.7516860076553861, + "grad_norm": 0.15941737592220306, + "learning_rate": 1.4548334779025213e-05, + "loss": 1.097, + "step": 12372 + }, + { + "epoch": 0.7517467646880126, + "grad_norm": 0.11273514479398727, + "learning_rate": 1.4541584948637776e-05, + "loss": 1.0163, + "step": 12373 + }, + { + "epoch": 0.7518075217206391, + "grad_norm": 0.18924646079540253, + "learning_rate": 1.4534836417985403e-05, + "loss": 1.0429, + "step": 12374 + }, + { + "epoch": 0.7518682787532657, + "grad_norm": 0.1637611985206604, + "learning_rate": 1.4528089187315436e-05, + "loss": 1.1046, + "step": 12375 + }, + { + "epoch": 0.7519290357858922, + "grad_norm": 0.1842072606086731, + "learning_rate": 1.4521343256875235e-05, + "loss": 1.087, + "step": 12376 + }, + { + "epoch": 0.7519897928185187, + "grad_norm": 0.38574734330177307, + "learning_rate": 1.4514598626912057e-05, + "loss": 1.1483, + "step": 12377 + }, + { + "epoch": 0.7520505498511453, + "grad_norm": 0.30958425998687744, + "learning_rate": 1.450785529767314e-05, + "loss": 1.1951, + "step": 12378 + }, + { + "epoch": 0.7521113068837718, + "grad_norm": 0.14373669028282166, + "learning_rate": 1.450111326940563e-05, + "loss": 1.0685, + "step": 12379 + }, + { + "epoch": 0.7521720639163983, + "grad_norm": 0.742682158946991, + "learning_rate": 1.449437254235666e-05, + "loss": 1.0204, + "step": 12380 + }, + { + "epoch": 0.7522328209490249, + "grad_norm": 0.21696442365646362, + "learning_rate": 1.4487633116773341e-05, + "loss": 1.0672, + "step": 12381 + }, + { + "epoch": 0.7522935779816514, + "grad_norm": 0.6758202314376831, + "learning_rate": 1.4480894992902694e-05, + "loss": 1.239, + "step": 12382 + }, + { + "epoch": 0.752354335014278, + "grad_norm": 0.16507484018802643, + "learning_rate": 1.4474158170991708e-05, + "loss": 1.007, + "step": 12383 + }, + { + "epoch": 0.7524150920469044, + "grad_norm": 0.2582539916038513, + "learning_rate": 1.4467422651287326e-05, + "loss": 1.1495, + "step": 12384 + }, + { + "epoch": 0.7524758490795309, + "grad_norm": 0.1980871707201004, + "learning_rate": 1.4460688434036429e-05, + "loss": 0.9937, + "step": 12385 + }, + { + "epoch": 0.7525366061121574, + "grad_norm": 0.23131780326366425, + "learning_rate": 1.4453955519485868e-05, + "loss": 1.0921, + "step": 12386 + }, + { + "epoch": 0.752597363144784, + "grad_norm": 0.14088793098926544, + "learning_rate": 1.444722390788244e-05, + "loss": 1.0398, + "step": 12387 + }, + { + "epoch": 0.7526581201774105, + "grad_norm": 0.12949642539024353, + "learning_rate": 1.4440493599472894e-05, + "loss": 1.0913, + "step": 12388 + }, + { + "epoch": 0.752718877210037, + "grad_norm": 0.15121890604496002, + "learning_rate": 1.4433764594503923e-05, + "loss": 1.1299, + "step": 12389 + }, + { + "epoch": 0.7527796342426636, + "grad_norm": 0.5240110158920288, + "learning_rate": 1.442703689322219e-05, + "loss": 1.2855, + "step": 12390 + }, + { + "epoch": 0.7528403912752901, + "grad_norm": 0.17915916442871094, + "learning_rate": 1.4420310495874279e-05, + "loss": 1.1821, + "step": 12391 + }, + { + "epoch": 0.7529011483079167, + "grad_norm": 0.157648965716362, + "learning_rate": 1.4413585402706798e-05, + "loss": 1.0633, + "step": 12392 + }, + { + "epoch": 0.7529619053405432, + "grad_norm": 0.14002954959869385, + "learning_rate": 1.4406861613966205e-05, + "loss": 1.0221, + "step": 12393 + }, + { + "epoch": 0.7530226623731697, + "grad_norm": 0.2535688877105713, + "learning_rate": 1.4400139129898977e-05, + "loss": 1.1571, + "step": 12394 + }, + { + "epoch": 0.7530834194057963, + "grad_norm": 0.11355717480182648, + "learning_rate": 1.439341795075153e-05, + "loss": 1.0427, + "step": 12395 + }, + { + "epoch": 0.7531441764384228, + "grad_norm": 0.20710282027721405, + "learning_rate": 1.4386698076770216e-05, + "loss": 1.0965, + "step": 12396 + }, + { + "epoch": 0.7532049334710492, + "grad_norm": 0.23512859642505646, + "learning_rate": 1.4379979508201385e-05, + "loss": 1.1267, + "step": 12397 + }, + { + "epoch": 0.7532656905036758, + "grad_norm": 0.19728349149227142, + "learning_rate": 1.4373262245291308e-05, + "loss": 1.1702, + "step": 12398 + }, + { + "epoch": 0.7533264475363023, + "grad_norm": 0.10688597708940506, + "learning_rate": 1.4366546288286176e-05, + "loss": 0.9936, + "step": 12399 + }, + { + "epoch": 0.7533872045689288, + "grad_norm": 0.15643784403800964, + "learning_rate": 1.435983163743217e-05, + "loss": 1.1451, + "step": 12400 + }, + { + "epoch": 0.7534479616015554, + "grad_norm": 0.914742648601532, + "learning_rate": 1.4353118292975414e-05, + "loss": 1.1157, + "step": 12401 + }, + { + "epoch": 0.7535087186341819, + "grad_norm": 0.17047327756881714, + "learning_rate": 1.434640625516201e-05, + "loss": 1.0628, + "step": 12402 + }, + { + "epoch": 0.7535694756668084, + "grad_norm": 0.1906994879245758, + "learning_rate": 1.4339695524237979e-05, + "loss": 1.1583, + "step": 12403 + }, + { + "epoch": 0.753630232699435, + "grad_norm": 0.20968379080295563, + "learning_rate": 1.4332986100449303e-05, + "loss": 1.099, + "step": 12404 + }, + { + "epoch": 0.7536909897320615, + "grad_norm": 0.15112251043319702, + "learning_rate": 1.4326277984041925e-05, + "loss": 1.0748, + "step": 12405 + }, + { + "epoch": 0.753751746764688, + "grad_norm": 0.18442794680595398, + "learning_rate": 1.4319571175261698e-05, + "loss": 1.0924, + "step": 12406 + }, + { + "epoch": 0.7538125037973146, + "grad_norm": 0.11638113856315613, + "learning_rate": 1.4312865674354497e-05, + "loss": 1.0333, + "step": 12407 + }, + { + "epoch": 0.7538732608299411, + "grad_norm": 0.21298284828662872, + "learning_rate": 1.4306161481566105e-05, + "loss": 1.0891, + "step": 12408 + }, + { + "epoch": 0.7539340178625676, + "grad_norm": 0.10447728633880615, + "learning_rate": 1.4299458597142267e-05, + "loss": 1.0405, + "step": 12409 + }, + { + "epoch": 0.7539947748951941, + "grad_norm": 0.168282613158226, + "learning_rate": 1.4292757021328673e-05, + "loss": 1.0638, + "step": 12410 + }, + { + "epoch": 0.7540555319278206, + "grad_norm": 0.16812369227409363, + "learning_rate": 1.4286056754370968e-05, + "loss": 1.083, + "step": 12411 + }, + { + "epoch": 0.7541162889604471, + "grad_norm": 0.20447130501270294, + "learning_rate": 1.4279357796514759e-05, + "loss": 1.0735, + "step": 12412 + }, + { + "epoch": 0.7541770459930737, + "grad_norm": 0.12645472586154938, + "learning_rate": 1.4272660148005596e-05, + "loss": 1.0476, + "step": 12413 + }, + { + "epoch": 0.7542378030257002, + "grad_norm": 0.09878091514110565, + "learning_rate": 1.4265963809088977e-05, + "loss": 1.0258, + "step": 12414 + }, + { + "epoch": 0.7542985600583267, + "grad_norm": 0.21717269718647003, + "learning_rate": 1.4259268780010371e-05, + "loss": 1.0834, + "step": 12415 + }, + { + "epoch": 0.7543593170909533, + "grad_norm": 0.09347713738679886, + "learning_rate": 1.4252575061015166e-05, + "loss": 1.0204, + "step": 12416 + }, + { + "epoch": 0.7544200741235798, + "grad_norm": 0.21152333915233612, + "learning_rate": 1.4245882652348724e-05, + "loss": 1.1361, + "step": 12417 + }, + { + "epoch": 0.7544808311562063, + "grad_norm": 0.18669576942920685, + "learning_rate": 1.4239191554256376e-05, + "loss": 1.0662, + "step": 12418 + }, + { + "epoch": 0.7545415881888329, + "grad_norm": 0.11657535284757614, + "learning_rate": 1.4232501766983391e-05, + "loss": 1.0741, + "step": 12419 + }, + { + "epoch": 0.7546023452214594, + "grad_norm": 0.10764314979314804, + "learning_rate": 1.4225813290774948e-05, + "loss": 1.0436, + "step": 12420 + }, + { + "epoch": 0.754663102254086, + "grad_norm": 0.21123827993869781, + "learning_rate": 1.4219126125876237e-05, + "loss": 1.1184, + "step": 12421 + }, + { + "epoch": 0.7547238592867125, + "grad_norm": 0.19128882884979248, + "learning_rate": 1.4212440272532362e-05, + "loss": 1.0029, + "step": 12422 + }, + { + "epoch": 0.754784616319339, + "grad_norm": 0.1671556532382965, + "learning_rate": 1.4205755730988418e-05, + "loss": 1.0467, + "step": 12423 + }, + { + "epoch": 0.7548453733519654, + "grad_norm": 0.22247761487960815, + "learning_rate": 1.4199072501489418e-05, + "loss": 1.176, + "step": 12424 + }, + { + "epoch": 0.754906130384592, + "grad_norm": 0.12096160650253296, + "learning_rate": 1.4192390584280346e-05, + "loss": 1.0593, + "step": 12425 + }, + { + "epoch": 0.7549668874172185, + "grad_norm": 0.17339357733726501, + "learning_rate": 1.4185709979606094e-05, + "loss": 0.9622, + "step": 12426 + }, + { + "epoch": 0.755027644449845, + "grad_norm": 0.1513710767030716, + "learning_rate": 1.417903068771158e-05, + "loss": 1.0831, + "step": 12427 + }, + { + "epoch": 0.7550884014824716, + "grad_norm": 0.3548330068588257, + "learning_rate": 1.4172352708841618e-05, + "loss": 1.1623, + "step": 12428 + }, + { + "epoch": 0.7551491585150981, + "grad_norm": 0.1272754818201065, + "learning_rate": 1.4165676043240995e-05, + "loss": 1.0407, + "step": 12429 + }, + { + "epoch": 0.7552099155477247, + "grad_norm": 0.1449151635169983, + "learning_rate": 1.4159000691154439e-05, + "loss": 1.0723, + "step": 12430 + }, + { + "epoch": 0.7552706725803512, + "grad_norm": 0.11652117967605591, + "learning_rate": 1.4152326652826647e-05, + "loss": 1.0292, + "step": 12431 + }, + { + "epoch": 0.7553314296129777, + "grad_norm": 0.14926502108573914, + "learning_rate": 1.4145653928502245e-05, + "loss": 1.0288, + "step": 12432 + }, + { + "epoch": 0.7553921866456043, + "grad_norm": 0.11315730214118958, + "learning_rate": 1.4138982518425831e-05, + "loss": 1.0105, + "step": 12433 + }, + { + "epoch": 0.7554529436782308, + "grad_norm": 0.1694616824388504, + "learning_rate": 1.4132312422841947e-05, + "loss": 1.1487, + "step": 12434 + }, + { + "epoch": 0.7555137007108573, + "grad_norm": 0.14117653667926788, + "learning_rate": 1.4125643641995078e-05, + "loss": 1.1222, + "step": 12435 + }, + { + "epoch": 0.7555744577434839, + "grad_norm": 0.11897718161344528, + "learning_rate": 1.4118976176129684e-05, + "loss": 1.0341, + "step": 12436 + }, + { + "epoch": 0.7556352147761103, + "grad_norm": 0.12595760822296143, + "learning_rate": 1.411231002549015e-05, + "loss": 1.0318, + "step": 12437 + }, + { + "epoch": 0.7556959718087368, + "grad_norm": 0.11892735213041306, + "learning_rate": 1.410564519032081e-05, + "loss": 1.0856, + "step": 12438 + }, + { + "epoch": 0.7557567288413634, + "grad_norm": 0.12020894885063171, + "learning_rate": 1.4098981670866018e-05, + "loss": 1.0343, + "step": 12439 + }, + { + "epoch": 0.7558174858739899, + "grad_norm": 0.1720348298549652, + "learning_rate": 1.4092319467369974e-05, + "loss": 1.1239, + "step": 12440 + }, + { + "epoch": 0.7558782429066164, + "grad_norm": 0.21336239576339722, + "learning_rate": 1.4085658580076905e-05, + "loss": 1.1394, + "step": 12441 + }, + { + "epoch": 0.755938999939243, + "grad_norm": 0.28349900245666504, + "learning_rate": 1.4078999009230947e-05, + "loss": 1.0413, + "step": 12442 + }, + { + "epoch": 0.7559997569718695, + "grad_norm": 0.15616722404956818, + "learning_rate": 1.4072340755076235e-05, + "loss": 1.0589, + "step": 12443 + }, + { + "epoch": 0.756060514004496, + "grad_norm": 0.15842755138874054, + "learning_rate": 1.4065683817856823e-05, + "loss": 1.1563, + "step": 12444 + }, + { + "epoch": 0.7561212710371226, + "grad_norm": 0.14412358403205872, + "learning_rate": 1.4059028197816725e-05, + "loss": 0.993, + "step": 12445 + }, + { + "epoch": 0.7561820280697491, + "grad_norm": 1.4671894311904907, + "learning_rate": 1.4052373895199888e-05, + "loss": 1.2388, + "step": 12446 + }, + { + "epoch": 0.7562427851023756, + "grad_norm": 0.11971438676118851, + "learning_rate": 1.4045720910250216e-05, + "loss": 1.0773, + "step": 12447 + }, + { + "epoch": 0.7563035421350022, + "grad_norm": 0.14378733932971954, + "learning_rate": 1.4039069243211605e-05, + "loss": 1.0301, + "step": 12448 + }, + { + "epoch": 0.7563642991676287, + "grad_norm": 0.3817189335823059, + "learning_rate": 1.403241889432787e-05, + "loss": 1.1437, + "step": 12449 + }, + { + "epoch": 0.7564250562002551, + "grad_norm": 3.130676507949829, + "learning_rate": 1.4025769863842764e-05, + "loss": 1.0221, + "step": 12450 + }, + { + "epoch": 0.7564858132328817, + "grad_norm": 0.19876748323440552, + "learning_rate": 1.4019122152000025e-05, + "loss": 1.0384, + "step": 12451 + }, + { + "epoch": 0.7565465702655082, + "grad_norm": 0.20972973108291626, + "learning_rate": 1.4012475759043314e-05, + "loss": 1.139, + "step": 12452 + }, + { + "epoch": 0.7566073272981347, + "grad_norm": 0.11403432488441467, + "learning_rate": 1.4005830685216259e-05, + "loss": 1.0368, + "step": 12453 + }, + { + "epoch": 0.7566680843307613, + "grad_norm": 0.13528011739253998, + "learning_rate": 1.3999186930762442e-05, + "loss": 1.0917, + "step": 12454 + }, + { + "epoch": 0.7567288413633878, + "grad_norm": 0.15780939161777496, + "learning_rate": 1.3992544495925386e-05, + "loss": 1.043, + "step": 12455 + }, + { + "epoch": 0.7567895983960143, + "grad_norm": 0.12039822340011597, + "learning_rate": 1.3985903380948573e-05, + "loss": 1.05, + "step": 12456 + }, + { + "epoch": 0.7568503554286409, + "grad_norm": 0.10794486850500107, + "learning_rate": 1.3979263586075426e-05, + "loss": 1.0666, + "step": 12457 + }, + { + "epoch": 0.7569111124612674, + "grad_norm": 0.1303398609161377, + "learning_rate": 1.3972625111549325e-05, + "loss": 1.0089, + "step": 12458 + }, + { + "epoch": 0.756971869493894, + "grad_norm": 0.12871064245700836, + "learning_rate": 1.3965987957613647e-05, + "loss": 1.0701, + "step": 12459 + }, + { + "epoch": 0.7570326265265205, + "grad_norm": 0.17288193106651306, + "learning_rate": 1.3959352124511627e-05, + "loss": 1.096, + "step": 12460 + }, + { + "epoch": 0.757093383559147, + "grad_norm": 0.13613943755626678, + "learning_rate": 1.3952717612486522e-05, + "loss": 1.0485, + "step": 12461 + }, + { + "epoch": 0.7571541405917736, + "grad_norm": 0.15245550870895386, + "learning_rate": 1.394608442178152e-05, + "loss": 1.0997, + "step": 12462 + }, + { + "epoch": 0.7572148976244, + "grad_norm": 0.1725936233997345, + "learning_rate": 1.3939452552639748e-05, + "loss": 1.0967, + "step": 12463 + }, + { + "epoch": 0.7572756546570265, + "grad_norm": 0.1804647296667099, + "learning_rate": 1.3932822005304325e-05, + "loss": 1.0522, + "step": 12464 + }, + { + "epoch": 0.757336411689653, + "grad_norm": 3.0747997760772705, + "learning_rate": 1.3926192780018305e-05, + "loss": 1.1279, + "step": 12465 + }, + { + "epoch": 0.7573971687222796, + "grad_norm": 0.13486158847808838, + "learning_rate": 1.3919564877024644e-05, + "loss": 1.0703, + "step": 12466 + }, + { + "epoch": 0.7574579257549061, + "grad_norm": 0.15360982716083527, + "learning_rate": 1.3912938296566302e-05, + "loss": 1.0675, + "step": 12467 + }, + { + "epoch": 0.7575186827875326, + "grad_norm": 0.17876802384853363, + "learning_rate": 1.3906313038886171e-05, + "loss": 1.1884, + "step": 12468 + }, + { + "epoch": 0.7575794398201592, + "grad_norm": 0.16275517642498016, + "learning_rate": 1.3899689104227126e-05, + "loss": 1.0359, + "step": 12469 + }, + { + "epoch": 0.7576401968527857, + "grad_norm": 0.120885469019413, + "learning_rate": 1.3893066492831957e-05, + "loss": 1.0314, + "step": 12470 + }, + { + "epoch": 0.7577009538854123, + "grad_norm": 0.27184921503067017, + "learning_rate": 1.3886445204943416e-05, + "loss": 1.0179, + "step": 12471 + }, + { + "epoch": 0.7577617109180388, + "grad_norm": 0.141347736120224, + "learning_rate": 1.3879825240804217e-05, + "loss": 1.0039, + "step": 12472 + }, + { + "epoch": 0.7578224679506653, + "grad_norm": 0.13252796232700348, + "learning_rate": 1.387320660065698e-05, + "loss": 1.0423, + "step": 12473 + }, + { + "epoch": 0.7578832249832919, + "grad_norm": 0.18378327786922455, + "learning_rate": 1.3866589284744352e-05, + "loss": 1.0412, + "step": 12474 + }, + { + "epoch": 0.7579439820159184, + "grad_norm": 0.10154604911804199, + "learning_rate": 1.3859973293308876e-05, + "loss": 1.0551, + "step": 12475 + }, + { + "epoch": 0.7580047390485448, + "grad_norm": 0.28289902210235596, + "learning_rate": 1.3853358626593065e-05, + "loss": 1.0512, + "step": 12476 + }, + { + "epoch": 0.7580654960811714, + "grad_norm": 0.9105175137519836, + "learning_rate": 1.3846745284839385e-05, + "loss": 1.0816, + "step": 12477 + }, + { + "epoch": 0.7581262531137979, + "grad_norm": 0.10451336205005646, + "learning_rate": 1.3840133268290239e-05, + "loss": 0.99, + "step": 12478 + }, + { + "epoch": 0.7581870101464244, + "grad_norm": 0.2284788191318512, + "learning_rate": 1.3833522577188002e-05, + "loss": 1.2458, + "step": 12479 + }, + { + "epoch": 0.758247767179051, + "grad_norm": 0.15661358833312988, + "learning_rate": 1.3826913211774983e-05, + "loss": 1.1283, + "step": 12480 + }, + { + "epoch": 0.7583085242116775, + "grad_norm": 0.15281632542610168, + "learning_rate": 1.3820305172293451e-05, + "loss": 1.0337, + "step": 12481 + }, + { + "epoch": 0.758369281244304, + "grad_norm": 0.13122248649597168, + "learning_rate": 1.381369845898563e-05, + "loss": 1.074, + "step": 12482 + }, + { + "epoch": 0.7584300382769306, + "grad_norm": 0.10453058034181595, + "learning_rate": 1.3807093072093686e-05, + "loss": 1.0441, + "step": 12483 + }, + { + "epoch": 0.7584907953095571, + "grad_norm": 0.1655968874692917, + "learning_rate": 1.3800489011859724e-05, + "loss": 1.1208, + "step": 12484 + }, + { + "epoch": 0.7585515523421836, + "grad_norm": 0.1780315637588501, + "learning_rate": 1.3793886278525853e-05, + "loss": 1.0966, + "step": 12485 + }, + { + "epoch": 0.7586123093748102, + "grad_norm": 0.37785041332244873, + "learning_rate": 1.3787284872334095e-05, + "loss": 1.0914, + "step": 12486 + }, + { + "epoch": 0.7586730664074367, + "grad_norm": 0.15251842141151428, + "learning_rate": 1.3780684793526389e-05, + "loss": 1.1238, + "step": 12487 + }, + { + "epoch": 0.7587338234400632, + "grad_norm": 0.2933756113052368, + "learning_rate": 1.3774086042344686e-05, + "loss": 1.1127, + "step": 12488 + }, + { + "epoch": 0.7587945804726897, + "grad_norm": 0.11405391246080399, + "learning_rate": 1.3767488619030843e-05, + "loss": 1.0085, + "step": 12489 + }, + { + "epoch": 0.7588553375053162, + "grad_norm": 0.37702250480651855, + "learning_rate": 1.3760892523826729e-05, + "loss": 1.2371, + "step": 12490 + }, + { + "epoch": 0.7589160945379427, + "grad_norm": 0.14275868237018585, + "learning_rate": 1.3754297756974099e-05, + "loss": 1.1094, + "step": 12491 + }, + { + "epoch": 0.7589768515705693, + "grad_norm": 0.15083137154579163, + "learning_rate": 1.374770431871471e-05, + "loss": 1.1236, + "step": 12492 + }, + { + "epoch": 0.7590376086031958, + "grad_norm": 0.23004627227783203, + "learning_rate": 1.3741112209290214e-05, + "loss": 1.0212, + "step": 12493 + }, + { + "epoch": 0.7590983656358223, + "grad_norm": 0.1742396503686905, + "learning_rate": 1.3734521428942238e-05, + "loss": 1.0842, + "step": 12494 + }, + { + "epoch": 0.7591591226684489, + "grad_norm": 0.10751919448375702, + "learning_rate": 1.3727931977912407e-05, + "loss": 1.0743, + "step": 12495 + }, + { + "epoch": 0.7592198797010754, + "grad_norm": 0.15435214340686798, + "learning_rate": 1.3721343856442238e-05, + "loss": 1.084, + "step": 12496 + }, + { + "epoch": 0.7592806367337019, + "grad_norm": 0.20320147275924683, + "learning_rate": 1.3714757064773226e-05, + "loss": 1.0652, + "step": 12497 + }, + { + "epoch": 0.7593413937663285, + "grad_norm": 0.3919805586338043, + "learning_rate": 1.3708171603146803e-05, + "loss": 1.1089, + "step": 12498 + }, + { + "epoch": 0.759402150798955, + "grad_norm": 0.2511536478996277, + "learning_rate": 1.3701587471804367e-05, + "loss": 0.9924, + "step": 12499 + }, + { + "epoch": 0.7594629078315815, + "grad_norm": 0.1596825122833252, + "learning_rate": 1.369500467098726e-05, + "loss": 1.0995, + "step": 12500 + }, + { + "epoch": 0.7595236648642081, + "grad_norm": 0.42139509320259094, + "learning_rate": 1.3688423200936773e-05, + "loss": 1.1318, + "step": 12501 + }, + { + "epoch": 0.7595844218968345, + "grad_norm": 0.24840416014194489, + "learning_rate": 1.368184306189415e-05, + "loss": 1.083, + "step": 12502 + }, + { + "epoch": 0.759645178929461, + "grad_norm": 0.22676849365234375, + "learning_rate": 1.3675264254100595e-05, + "loss": 1.1176, + "step": 12503 + }, + { + "epoch": 0.7597059359620876, + "grad_norm": 0.1771671622991562, + "learning_rate": 1.3668686777797247e-05, + "loss": 1.0393, + "step": 12504 + }, + { + "epoch": 0.7597666929947141, + "grad_norm": 0.1207723468542099, + "learning_rate": 1.3662110633225196e-05, + "loss": 1.0225, + "step": 12505 + }, + { + "epoch": 0.7598274500273406, + "grad_norm": 0.1388079673051834, + "learning_rate": 1.3655535820625532e-05, + "loss": 1.0324, + "step": 12506 + }, + { + "epoch": 0.7598882070599672, + "grad_norm": 0.3035030961036682, + "learning_rate": 1.3648962340239208e-05, + "loss": 1.0867, + "step": 12507 + }, + { + "epoch": 0.7599489640925937, + "grad_norm": 0.1515635848045349, + "learning_rate": 1.3642390192307204e-05, + "loss": 1.1514, + "step": 12508 + }, + { + "epoch": 0.7600097211252203, + "grad_norm": 0.11775967478752136, + "learning_rate": 1.3635819377070407e-05, + "loss": 1.011, + "step": 12509 + }, + { + "epoch": 0.7600704781578468, + "grad_norm": 0.14208143949508667, + "learning_rate": 1.3629249894769668e-05, + "loss": 1.0877, + "step": 12510 + }, + { + "epoch": 0.7601312351904733, + "grad_norm": 0.14568772912025452, + "learning_rate": 1.3622681745645821e-05, + "loss": 1.0621, + "step": 12511 + }, + { + "epoch": 0.7601919922230999, + "grad_norm": 0.1941111981868744, + "learning_rate": 1.3616114929939622e-05, + "loss": 1.0849, + "step": 12512 + }, + { + "epoch": 0.7602527492557264, + "grad_norm": 0.16677498817443848, + "learning_rate": 1.3609549447891745e-05, + "loss": 1.1111, + "step": 12513 + }, + { + "epoch": 0.7603135062883529, + "grad_norm": 0.1665506213903427, + "learning_rate": 1.360298529974287e-05, + "loss": 1.0657, + "step": 12514 + }, + { + "epoch": 0.7603742633209793, + "grad_norm": 0.1916867196559906, + "learning_rate": 1.3596422485733589e-05, + "loss": 1.1055, + "step": 12515 + }, + { + "epoch": 0.7604350203536059, + "grad_norm": 0.1459992378950119, + "learning_rate": 1.3589861006104494e-05, + "loss": 1.0527, + "step": 12516 + }, + { + "epoch": 0.7604957773862324, + "grad_norm": 0.12731795012950897, + "learning_rate": 1.3583300861096077e-05, + "loss": 1.0137, + "step": 12517 + }, + { + "epoch": 0.760556534418859, + "grad_norm": 0.19834287464618683, + "learning_rate": 1.357674205094881e-05, + "loss": 1.1117, + "step": 12518 + }, + { + "epoch": 0.7606172914514855, + "grad_norm": 0.21660210192203522, + "learning_rate": 1.3570184575903105e-05, + "loss": 1.0666, + "step": 12519 + }, + { + "epoch": 0.760678048484112, + "grad_norm": 0.17140260338783264, + "learning_rate": 1.3563628436199322e-05, + "loss": 1.1116, + "step": 12520 + }, + { + "epoch": 0.7607388055167386, + "grad_norm": 0.5122325420379639, + "learning_rate": 1.3557073632077783e-05, + "loss": 1.0389, + "step": 12521 + }, + { + "epoch": 0.7607995625493651, + "grad_norm": 0.1227804496884346, + "learning_rate": 1.3550520163778757e-05, + "loss": 0.9998, + "step": 12522 + }, + { + "epoch": 0.7608603195819916, + "grad_norm": 0.12985700368881226, + "learning_rate": 1.3543968031542459e-05, + "loss": 1.0405, + "step": 12523 + }, + { + "epoch": 0.7609210766146182, + "grad_norm": 0.1181931421160698, + "learning_rate": 1.3537417235609057e-05, + "loss": 1.0142, + "step": 12524 + }, + { + "epoch": 0.7609818336472447, + "grad_norm": 0.12929067015647888, + "learning_rate": 1.3530867776218675e-05, + "loss": 1.0285, + "step": 12525 + }, + { + "epoch": 0.7610425906798712, + "grad_norm": 0.14773963391780853, + "learning_rate": 1.3524319653611384e-05, + "loss": 1.09, + "step": 12526 + }, + { + "epoch": 0.7611033477124978, + "grad_norm": 0.11514797061681747, + "learning_rate": 1.3517772868027206e-05, + "loss": 1.0791, + "step": 12527 + }, + { + "epoch": 0.7611641047451243, + "grad_norm": 0.10016696900129318, + "learning_rate": 1.3511227419706119e-05, + "loss": 1.0048, + "step": 12528 + }, + { + "epoch": 0.7612248617777507, + "grad_norm": 0.1402485966682434, + "learning_rate": 1.3504683308888043e-05, + "loss": 1.0774, + "step": 12529 + }, + { + "epoch": 0.7612856188103773, + "grad_norm": 0.14354290068149567, + "learning_rate": 1.3498140535812852e-05, + "loss": 1.0617, + "step": 12530 + }, + { + "epoch": 0.7613463758430038, + "grad_norm": 0.25909605622291565, + "learning_rate": 1.3491599100720365e-05, + "loss": 1.14, + "step": 12531 + }, + { + "epoch": 0.7614071328756303, + "grad_norm": 3.9069385528564453, + "learning_rate": 1.3485059003850392e-05, + "loss": 1.0566, + "step": 12532 + }, + { + "epoch": 0.7614678899082569, + "grad_norm": 0.22198474407196045, + "learning_rate": 1.3478520245442633e-05, + "loss": 1.0977, + "step": 12533 + }, + { + "epoch": 0.7615286469408834, + "grad_norm": 0.09995206445455551, + "learning_rate": 1.3471982825736767e-05, + "loss": 1.0127, + "step": 12534 + }, + { + "epoch": 0.7615894039735099, + "grad_norm": 0.2163786143064499, + "learning_rate": 1.346544674497242e-05, + "loss": 1.128, + "step": 12535 + }, + { + "epoch": 0.7616501610061365, + "grad_norm": 0.16006778180599213, + "learning_rate": 1.3458912003389201e-05, + "loss": 1.0914, + "step": 12536 + }, + { + "epoch": 0.761710918038763, + "grad_norm": 0.4079490005970001, + "learning_rate": 1.3452378601226624e-05, + "loss": 1.116, + "step": 12537 + }, + { + "epoch": 0.7617716750713895, + "grad_norm": 0.2563630938529968, + "learning_rate": 1.3445846538724171e-05, + "loss": 1.1518, + "step": 12538 + }, + { + "epoch": 0.7618324321040161, + "grad_norm": 0.11792192608118057, + "learning_rate": 1.3439315816121294e-05, + "loss": 1.0224, + "step": 12539 + }, + { + "epoch": 0.7618931891366426, + "grad_norm": 0.2553277611732483, + "learning_rate": 1.3432786433657335e-05, + "loss": 1.2811, + "step": 12540 + }, + { + "epoch": 0.7619539461692691, + "grad_norm": 0.14214277267456055, + "learning_rate": 1.3426258391571677e-05, + "loss": 1.0724, + "step": 12541 + }, + { + "epoch": 0.7620147032018956, + "grad_norm": 0.33141443133354187, + "learning_rate": 1.3419731690103581e-05, + "loss": 1.0426, + "step": 12542 + }, + { + "epoch": 0.7620754602345221, + "grad_norm": 0.38680481910705566, + "learning_rate": 1.3413206329492295e-05, + "loss": 1.1436, + "step": 12543 + }, + { + "epoch": 0.7621362172671486, + "grad_norm": 0.2522801160812378, + "learning_rate": 1.3406682309976998e-05, + "loss": 1.0829, + "step": 12544 + }, + { + "epoch": 0.7621969742997752, + "grad_norm": 0.16162461042404175, + "learning_rate": 1.3400159631796838e-05, + "loss": 1.0985, + "step": 12545 + }, + { + "epoch": 0.7622577313324017, + "grad_norm": 0.13372021913528442, + "learning_rate": 1.3393638295190896e-05, + "loss": 1.0622, + "step": 12546 + }, + { + "epoch": 0.7623184883650282, + "grad_norm": 0.1485844850540161, + "learning_rate": 1.3387118300398227e-05, + "loss": 1.1156, + "step": 12547 + }, + { + "epoch": 0.7623792453976548, + "grad_norm": 0.1145109161734581, + "learning_rate": 1.338059964765781e-05, + "loss": 1.0838, + "step": 12548 + }, + { + "epoch": 0.7624400024302813, + "grad_norm": 0.1353783905506134, + "learning_rate": 1.3374082337208587e-05, + "loss": 1.008, + "step": 12549 + }, + { + "epoch": 0.7625007594629079, + "grad_norm": 0.17304907739162445, + "learning_rate": 1.3367566369289457e-05, + "loss": 1.1866, + "step": 12550 + }, + { + "epoch": 0.7625615164955344, + "grad_norm": 0.13886840641498566, + "learning_rate": 1.3361051744139253e-05, + "loss": 1.0019, + "step": 12551 + }, + { + "epoch": 0.7626222735281609, + "grad_norm": 0.18054184317588806, + "learning_rate": 1.3354538461996807e-05, + "loss": 1.131, + "step": 12552 + }, + { + "epoch": 0.7626830305607875, + "grad_norm": 1.101049542427063, + "learning_rate": 1.334802652310082e-05, + "loss": 1.0338, + "step": 12553 + }, + { + "epoch": 0.762743787593414, + "grad_norm": 0.1377827525138855, + "learning_rate": 1.3341515927690007e-05, + "loss": 1.0876, + "step": 12554 + }, + { + "epoch": 0.7628045446260404, + "grad_norm": 0.1813245266675949, + "learning_rate": 1.3335006676003014e-05, + "loss": 1.1161, + "step": 12555 + }, + { + "epoch": 0.762865301658667, + "grad_norm": 0.14815883338451385, + "learning_rate": 1.332849876827842e-05, + "loss": 1.0937, + "step": 12556 + }, + { + "epoch": 0.7629260586912935, + "grad_norm": 0.5480836033821106, + "learning_rate": 1.3321992204754813e-05, + "loss": 1.151, + "step": 12557 + }, + { + "epoch": 0.76298681572392, + "grad_norm": 0.3353072702884674, + "learning_rate": 1.3315486985670667e-05, + "loss": 1.0173, + "step": 12558 + }, + { + "epoch": 0.7630475727565466, + "grad_norm": 0.21494805812835693, + "learning_rate": 1.330898311126445e-05, + "loss": 1.1367, + "step": 12559 + }, + { + "epoch": 0.7631083297891731, + "grad_norm": 0.10747019946575165, + "learning_rate": 1.330248058177454e-05, + "loss": 1.0232, + "step": 12560 + }, + { + "epoch": 0.7631690868217996, + "grad_norm": 0.1298971027135849, + "learning_rate": 1.3295979397439279e-05, + "loss": 1.0208, + "step": 12561 + }, + { + "epoch": 0.7632298438544262, + "grad_norm": 0.14917881786823273, + "learning_rate": 1.3289479558497003e-05, + "loss": 1.0154, + "step": 12562 + }, + { + "epoch": 0.7632906008870527, + "grad_norm": 0.14374469220638275, + "learning_rate": 1.328298106518595e-05, + "loss": 1.0469, + "step": 12563 + }, + { + "epoch": 0.7633513579196792, + "grad_norm": 0.1334616094827652, + "learning_rate": 1.3276483917744326e-05, + "loss": 1.0582, + "step": 12564 + }, + { + "epoch": 0.7634121149523058, + "grad_norm": 0.15402697026729584, + "learning_rate": 1.3269988116410276e-05, + "loss": 1.0611, + "step": 12565 + }, + { + "epoch": 0.7634728719849323, + "grad_norm": 0.8438519835472107, + "learning_rate": 1.3263493661421916e-05, + "loss": 1.0733, + "step": 12566 + }, + { + "epoch": 0.7635336290175588, + "grad_norm": 0.12258189916610718, + "learning_rate": 1.3257000553017296e-05, + "loss": 1.0386, + "step": 12567 + }, + { + "epoch": 0.7635943860501853, + "grad_norm": 0.13384394347667694, + "learning_rate": 1.3250508791434424e-05, + "loss": 1.0002, + "step": 12568 + }, + { + "epoch": 0.7636551430828118, + "grad_norm": 0.11661837995052338, + "learning_rate": 1.3244018376911255e-05, + "loss": 1.0488, + "step": 12569 + }, + { + "epoch": 0.7637159001154383, + "grad_norm": 0.10678543895483017, + "learning_rate": 1.3237529309685697e-05, + "loss": 1.0065, + "step": 12570 + }, + { + "epoch": 0.7637766571480649, + "grad_norm": 0.10978089272975922, + "learning_rate": 1.3231041589995607e-05, + "loss": 1.0077, + "step": 12571 + }, + { + "epoch": 0.7638374141806914, + "grad_norm": 0.14187034964561462, + "learning_rate": 1.3224555218078782e-05, + "loss": 1.0413, + "step": 12572 + }, + { + "epoch": 0.7638981712133179, + "grad_norm": 0.12638205289840698, + "learning_rate": 1.3218070194173027e-05, + "loss": 1.0565, + "step": 12573 + }, + { + "epoch": 0.7639589282459445, + "grad_norm": 0.1691051572561264, + "learning_rate": 1.3211586518516e-05, + "loss": 1.1334, + "step": 12574 + }, + { + "epoch": 0.764019685278571, + "grad_norm": 0.15638159215450287, + "learning_rate": 1.320510419134538e-05, + "loss": 1.1161, + "step": 12575 + }, + { + "epoch": 0.7640804423111975, + "grad_norm": 0.15488126873970032, + "learning_rate": 1.3198623212898776e-05, + "loss": 1.0244, + "step": 12576 + }, + { + "epoch": 0.7641411993438241, + "grad_norm": 0.17011305689811707, + "learning_rate": 1.319214358341374e-05, + "loss": 1.1553, + "step": 12577 + }, + { + "epoch": 0.7642019563764506, + "grad_norm": 0.5920494794845581, + "learning_rate": 1.3185665303127809e-05, + "loss": 1.0001, + "step": 12578 + }, + { + "epoch": 0.7642627134090771, + "grad_norm": 0.16712260246276855, + "learning_rate": 1.317918837227845e-05, + "loss": 1.1527, + "step": 12579 + }, + { + "epoch": 0.7643234704417037, + "grad_norm": 0.19923590123653412, + "learning_rate": 1.3172712791103043e-05, + "loss": 1.1439, + "step": 12580 + }, + { + "epoch": 0.7643842274743301, + "grad_norm": 0.13136544823646545, + "learning_rate": 1.316623855983896e-05, + "loss": 1.0158, + "step": 12581 + }, + { + "epoch": 0.7644449845069566, + "grad_norm": 0.13120952248573303, + "learning_rate": 1.3159765678723518e-05, + "loss": 1.0376, + "step": 12582 + }, + { + "epoch": 0.7645057415395832, + "grad_norm": 0.20183120667934418, + "learning_rate": 1.3153294147993995e-05, + "loss": 1.1735, + "step": 12583 + }, + { + "epoch": 0.7645664985722097, + "grad_norm": 0.5586725473403931, + "learning_rate": 1.31468239678876e-05, + "loss": 1.121, + "step": 12584 + }, + { + "epoch": 0.7646272556048362, + "grad_norm": 0.12262866646051407, + "learning_rate": 1.3140355138641496e-05, + "loss": 1.1113, + "step": 12585 + }, + { + "epoch": 0.7646880126374628, + "grad_norm": 0.25252658128738403, + "learning_rate": 1.313388766049281e-05, + "loss": 1.2143, + "step": 12586 + }, + { + "epoch": 0.7647487696700893, + "grad_norm": 0.29822754859924316, + "learning_rate": 1.3127421533678575e-05, + "loss": 1.2904, + "step": 12587 + }, + { + "epoch": 0.7648095267027158, + "grad_norm": 0.11995142698287964, + "learning_rate": 1.3120956758435837e-05, + "loss": 1.0784, + "step": 12588 + }, + { + "epoch": 0.7648702837353424, + "grad_norm": 0.15583375096321106, + "learning_rate": 1.3114493335001565e-05, + "loss": 1.0207, + "step": 12589 + }, + { + "epoch": 0.7649310407679689, + "grad_norm": 0.17384937405586243, + "learning_rate": 1.310803126361267e-05, + "loss": 1.0822, + "step": 12590 + }, + { + "epoch": 0.7649917978005955, + "grad_norm": 0.20129141211509705, + "learning_rate": 1.3101570544506014e-05, + "loss": 1.1052, + "step": 12591 + }, + { + "epoch": 0.765052554833222, + "grad_norm": 0.15278537571430206, + "learning_rate": 1.3095111177918423e-05, + "loss": 1.0464, + "step": 12592 + }, + { + "epoch": 0.7651133118658485, + "grad_norm": 0.1675327718257904, + "learning_rate": 1.3088653164086667e-05, + "loss": 1.1144, + "step": 12593 + }, + { + "epoch": 0.765174068898475, + "grad_norm": 0.56601482629776, + "learning_rate": 1.3082196503247469e-05, + "loss": 1.0663, + "step": 12594 + }, + { + "epoch": 0.7652348259311015, + "grad_norm": 0.14412938058376312, + "learning_rate": 1.3075741195637487e-05, + "loss": 1.0697, + "step": 12595 + }, + { + "epoch": 0.765295582963728, + "grad_norm": 0.1973801851272583, + "learning_rate": 1.306928724149335e-05, + "loss": 1.1855, + "step": 12596 + }, + { + "epoch": 0.7653563399963546, + "grad_norm": 0.1966913342475891, + "learning_rate": 1.306283464105163e-05, + "loss": 1.1904, + "step": 12597 + }, + { + "epoch": 0.7654170970289811, + "grad_norm": 0.1815931648015976, + "learning_rate": 1.3056383394548832e-05, + "loss": 1.0903, + "step": 12598 + }, + { + "epoch": 0.7654778540616076, + "grad_norm": 0.16487367451190948, + "learning_rate": 1.3049933502221468e-05, + "loss": 1.1026, + "step": 12599 + }, + { + "epoch": 0.7655386110942342, + "grad_norm": 0.15805833041667938, + "learning_rate": 1.3043484964305924e-05, + "loss": 1.0216, + "step": 12600 + }, + { + "epoch": 0.7655993681268607, + "grad_norm": 0.10602639615535736, + "learning_rate": 1.303703778103858e-05, + "loss": 1.0302, + "step": 12601 + }, + { + "epoch": 0.7656601251594872, + "grad_norm": 0.1442016214132309, + "learning_rate": 1.3030591952655757e-05, + "loss": 1.0711, + "step": 12602 + }, + { + "epoch": 0.7657208821921138, + "grad_norm": 0.133018359541893, + "learning_rate": 1.3024147479393722e-05, + "loss": 1.0445, + "step": 12603 + }, + { + "epoch": 0.7657816392247403, + "grad_norm": 0.18071715533733368, + "learning_rate": 1.301770436148872e-05, + "loss": 1.0399, + "step": 12604 + }, + { + "epoch": 0.7658423962573668, + "grad_norm": 0.22149837017059326, + "learning_rate": 1.3011262599176915e-05, + "loss": 1.0678, + "step": 12605 + }, + { + "epoch": 0.7659031532899934, + "grad_norm": 0.18498311936855316, + "learning_rate": 1.3004822192694444e-05, + "loss": 1.2245, + "step": 12606 + }, + { + "epoch": 0.7659639103226198, + "grad_norm": 0.12543554604053497, + "learning_rate": 1.2998383142277348e-05, + "loss": 1.0037, + "step": 12607 + }, + { + "epoch": 0.7660246673552463, + "grad_norm": 0.20280525088310242, + "learning_rate": 1.2991945448161658e-05, + "loss": 1.1559, + "step": 12608 + }, + { + "epoch": 0.7660854243878729, + "grad_norm": 0.2181260585784912, + "learning_rate": 1.2985509110583378e-05, + "loss": 1.139, + "step": 12609 + }, + { + "epoch": 0.7661461814204994, + "grad_norm": 0.1568908542394638, + "learning_rate": 1.2979074129778419e-05, + "loss": 1.1372, + "step": 12610 + }, + { + "epoch": 0.7662069384531259, + "grad_norm": 0.23146596550941467, + "learning_rate": 1.2972640505982647e-05, + "loss": 1.1664, + "step": 12611 + }, + { + "epoch": 0.7662676954857525, + "grad_norm": 0.18140651285648346, + "learning_rate": 1.2966208239431899e-05, + "loss": 0.9685, + "step": 12612 + }, + { + "epoch": 0.766328452518379, + "grad_norm": 0.23390540480613708, + "learning_rate": 1.2959777330361938e-05, + "loss": 1.0568, + "step": 12613 + }, + { + "epoch": 0.7663892095510055, + "grad_norm": 0.12980833649635315, + "learning_rate": 1.2953347779008508e-05, + "loss": 1.0603, + "step": 12614 + }, + { + "epoch": 0.7664499665836321, + "grad_norm": 0.17330950498580933, + "learning_rate": 1.2946919585607275e-05, + "loss": 1.0121, + "step": 12615 + }, + { + "epoch": 0.7665107236162586, + "grad_norm": 0.1783524602651596, + "learning_rate": 1.2940492750393863e-05, + "loss": 1.1702, + "step": 12616 + }, + { + "epoch": 0.7665714806488851, + "grad_norm": 0.2138451486825943, + "learning_rate": 1.2934067273603856e-05, + "loss": 1.0531, + "step": 12617 + }, + { + "epoch": 0.7666322376815117, + "grad_norm": 0.11507545411586761, + "learning_rate": 1.2927643155472774e-05, + "loss": 1.0518, + "step": 12618 + }, + { + "epoch": 0.7666929947141382, + "grad_norm": 0.3343908190727234, + "learning_rate": 1.29212203962361e-05, + "loss": 1.237, + "step": 12619 + }, + { + "epoch": 0.7667537517467646, + "grad_norm": 0.14791886508464813, + "learning_rate": 1.2914798996129262e-05, + "loss": 1.026, + "step": 12620 + }, + { + "epoch": 0.7668145087793912, + "grad_norm": 0.178230881690979, + "learning_rate": 1.2908378955387634e-05, + "loss": 1.1533, + "step": 12621 + }, + { + "epoch": 0.7668752658120177, + "grad_norm": 0.13301712274551392, + "learning_rate": 1.2901960274246544e-05, + "loss": 1.0041, + "step": 12622 + }, + { + "epoch": 0.7669360228446442, + "grad_norm": 0.20597591996192932, + "learning_rate": 1.289554295294127e-05, + "loss": 1.1047, + "step": 12623 + }, + { + "epoch": 0.7669967798772708, + "grad_norm": 0.13181617856025696, + "learning_rate": 1.288912699170703e-05, + "loss": 1.0647, + "step": 12624 + }, + { + "epoch": 0.7670575369098973, + "grad_norm": 0.1167430430650711, + "learning_rate": 1.2882712390779022e-05, + "loss": 1.047, + "step": 12625 + }, + { + "epoch": 0.7671182939425238, + "grad_norm": 0.1887039691209793, + "learning_rate": 1.287629915039239e-05, + "loss": 1.0784, + "step": 12626 + }, + { + "epoch": 0.7671790509751504, + "grad_norm": 0.18720541894435883, + "learning_rate": 1.2869887270782165e-05, + "loss": 1.15, + "step": 12627 + }, + { + "epoch": 0.7672398080077769, + "grad_norm": 0.13905468583106995, + "learning_rate": 1.286347675218339e-05, + "loss": 1.0477, + "step": 12628 + }, + { + "epoch": 0.7673005650404034, + "grad_norm": 0.1416422426700592, + "learning_rate": 1.2857067594831063e-05, + "loss": 0.9789, + "step": 12629 + }, + { + "epoch": 0.76736132207303, + "grad_norm": 0.20307201147079468, + "learning_rate": 1.2850659798960107e-05, + "loss": 1.1788, + "step": 12630 + }, + { + "epoch": 0.7674220791056565, + "grad_norm": 0.1398327350616455, + "learning_rate": 1.2844253364805398e-05, + "loss": 1.0557, + "step": 12631 + }, + { + "epoch": 0.767482836138283, + "grad_norm": 0.1993551403284073, + "learning_rate": 1.2837848292601755e-05, + "loss": 1.1008, + "step": 12632 + }, + { + "epoch": 0.7675435931709096, + "grad_norm": 0.13359631597995758, + "learning_rate": 1.2831444582583973e-05, + "loss": 1.039, + "step": 12633 + }, + { + "epoch": 0.767604350203536, + "grad_norm": 0.16576634347438812, + "learning_rate": 1.2825042234986779e-05, + "loss": 1.067, + "step": 12634 + }, + { + "epoch": 0.7676651072361625, + "grad_norm": 0.23622706532478333, + "learning_rate": 1.2818641250044838e-05, + "loss": 1.1062, + "step": 12635 + }, + { + "epoch": 0.7677258642687891, + "grad_norm": 0.18494898080825806, + "learning_rate": 1.2812241627992793e-05, + "loss": 1.1224, + "step": 12636 + }, + { + "epoch": 0.7677866213014156, + "grad_norm": 0.1257193684577942, + "learning_rate": 1.2805843369065223e-05, + "loss": 1.036, + "step": 12637 + }, + { + "epoch": 0.7678473783340422, + "grad_norm": 0.1562509834766388, + "learning_rate": 1.2799446473496651e-05, + "loss": 1.0862, + "step": 12638 + }, + { + "epoch": 0.7679081353666687, + "grad_norm": 0.2027399241924286, + "learning_rate": 1.2793050941521556e-05, + "loss": 1.1339, + "step": 12639 + }, + { + "epoch": 0.7679688923992952, + "grad_norm": 0.15130436420440674, + "learning_rate": 1.2786656773374373e-05, + "loss": 1.2908, + "step": 12640 + }, + { + "epoch": 0.7680296494319218, + "grad_norm": 0.1184222549200058, + "learning_rate": 1.278026396928948e-05, + "loss": 1.0829, + "step": 12641 + }, + { + "epoch": 0.7680904064645483, + "grad_norm": 0.15989989042282104, + "learning_rate": 1.2773872529501207e-05, + "loss": 1.1005, + "step": 12642 + }, + { + "epoch": 0.7681511634971748, + "grad_norm": 0.14799381792545319, + "learning_rate": 1.2767482454243829e-05, + "loss": 1.0421, + "step": 12643 + }, + { + "epoch": 0.7682119205298014, + "grad_norm": 0.10433269292116165, + "learning_rate": 1.2761093743751568e-05, + "loss": 1.014, + "step": 12644 + }, + { + "epoch": 0.7682726775624279, + "grad_norm": 0.19913047552108765, + "learning_rate": 1.2754706398258625e-05, + "loss": 1.112, + "step": 12645 + }, + { + "epoch": 0.7683334345950544, + "grad_norm": 0.11359617114067078, + "learning_rate": 1.2748320417999132e-05, + "loss": 1.012, + "step": 12646 + }, + { + "epoch": 0.7683941916276809, + "grad_norm": 0.1482570320367813, + "learning_rate": 1.2741935803207144e-05, + "loss": 1.1103, + "step": 12647 + }, + { + "epoch": 0.7684549486603074, + "grad_norm": 0.1119273379445076, + "learning_rate": 1.2735552554116698e-05, + "loss": 1.0069, + "step": 12648 + }, + { + "epoch": 0.7685157056929339, + "grad_norm": 0.22167113423347473, + "learning_rate": 1.272917067096176e-05, + "loss": 1.1505, + "step": 12649 + }, + { + "epoch": 0.7685764627255605, + "grad_norm": 0.49580806493759155, + "learning_rate": 1.2722790153976294e-05, + "loss": 1.0523, + "step": 12650 + }, + { + "epoch": 0.768637219758187, + "grad_norm": 0.2841155230998993, + "learning_rate": 1.2716411003394157e-05, + "loss": 1.1503, + "step": 12651 + }, + { + "epoch": 0.7686979767908135, + "grad_norm": 1.1812560558319092, + "learning_rate": 1.2710033219449185e-05, + "loss": 1.0168, + "step": 12652 + }, + { + "epoch": 0.7687587338234401, + "grad_norm": 0.13246190547943115, + "learning_rate": 1.2703656802375164e-05, + "loss": 1.0813, + "step": 12653 + }, + { + "epoch": 0.7688194908560666, + "grad_norm": 0.10599101334810257, + "learning_rate": 1.2697281752405787e-05, + "loss": 1.0439, + "step": 12654 + }, + { + "epoch": 0.7688802478886931, + "grad_norm": 0.2182929664850235, + "learning_rate": 1.269090806977477e-05, + "loss": 1.1778, + "step": 12655 + }, + { + "epoch": 0.7689410049213197, + "grad_norm": 0.184401273727417, + "learning_rate": 1.2684535754715732e-05, + "loss": 0.9632, + "step": 12656 + }, + { + "epoch": 0.7690017619539462, + "grad_norm": 0.12318959832191467, + "learning_rate": 1.2678164807462244e-05, + "loss": 1.022, + "step": 12657 + }, + { + "epoch": 0.7690625189865727, + "grad_norm": 0.206369087100029, + "learning_rate": 1.267179522824784e-05, + "loss": 1.0669, + "step": 12658 + }, + { + "epoch": 0.7691232760191993, + "grad_norm": 0.11718953400850296, + "learning_rate": 1.2665427017306002e-05, + "loss": 1.0868, + "step": 12659 + }, + { + "epoch": 0.7691840330518257, + "grad_norm": 0.3162151277065277, + "learning_rate": 1.2659060174870146e-05, + "loss": 1.197, + "step": 12660 + }, + { + "epoch": 0.7692447900844522, + "grad_norm": 4.24622917175293, + "learning_rate": 1.2652694701173662e-05, + "loss": 1.1749, + "step": 12661 + }, + { + "epoch": 0.7693055471170788, + "grad_norm": 0.15454228222370148, + "learning_rate": 1.2646330596449873e-05, + "loss": 1.0722, + "step": 12662 + }, + { + "epoch": 0.7693663041497053, + "grad_norm": 0.1315198838710785, + "learning_rate": 1.2639967860932056e-05, + "loss": 1.0242, + "step": 12663 + }, + { + "epoch": 0.7694270611823318, + "grad_norm": 0.16658826172351837, + "learning_rate": 1.263360649485344e-05, + "loss": 1.1173, + "step": 12664 + }, + { + "epoch": 0.7694878182149584, + "grad_norm": 0.19234028458595276, + "learning_rate": 1.2627246498447182e-05, + "loss": 1.1459, + "step": 12665 + }, + { + "epoch": 0.7695485752475849, + "grad_norm": 0.18884725868701935, + "learning_rate": 1.2620887871946462e-05, + "loss": 1.0918, + "step": 12666 + }, + { + "epoch": 0.7696093322802114, + "grad_norm": 0.1349470168352127, + "learning_rate": 1.2614530615584303e-05, + "loss": 1.0923, + "step": 12667 + }, + { + "epoch": 0.769670089312838, + "grad_norm": 2.6494998931884766, + "learning_rate": 1.2608174729593753e-05, + "loss": 1.0735, + "step": 12668 + }, + { + "epoch": 0.7697308463454645, + "grad_norm": 0.159627303481102, + "learning_rate": 1.2601820214207783e-05, + "loss": 1.1251, + "step": 12669 + }, + { + "epoch": 0.769791603378091, + "grad_norm": 0.1379135102033615, + "learning_rate": 1.2595467069659312e-05, + "loss": 1.0477, + "step": 12670 + }, + { + "epoch": 0.7698523604107176, + "grad_norm": 0.39774608612060547, + "learning_rate": 1.2589115296181237e-05, + "loss": 1.1864, + "step": 12671 + }, + { + "epoch": 0.7699131174433441, + "grad_norm": 0.15946069359779358, + "learning_rate": 1.2582764894006371e-05, + "loss": 1.1818, + "step": 12672 + }, + { + "epoch": 0.7699738744759705, + "grad_norm": 0.13292591273784637, + "learning_rate": 1.2576415863367508e-05, + "loss": 1.0215, + "step": 12673 + }, + { + "epoch": 0.7700346315085971, + "grad_norm": 0.1330001950263977, + "learning_rate": 1.2570068204497343e-05, + "loss": 1.0559, + "step": 12674 + }, + { + "epoch": 0.7700953885412236, + "grad_norm": 0.29064276814460754, + "learning_rate": 1.2563721917628546e-05, + "loss": 1.0435, + "step": 12675 + }, + { + "epoch": 0.7701561455738501, + "grad_norm": 0.1436695158481598, + "learning_rate": 1.255737700299377e-05, + "loss": 1.0638, + "step": 12676 + }, + { + "epoch": 0.7702169026064767, + "grad_norm": 0.0998484417796135, + "learning_rate": 1.2551033460825579e-05, + "loss": 1.0161, + "step": 12677 + }, + { + "epoch": 0.7702776596391032, + "grad_norm": 0.13796480000019073, + "learning_rate": 1.2544691291356497e-05, + "loss": 1.0462, + "step": 12678 + }, + { + "epoch": 0.7703384166717298, + "grad_norm": 0.18114705383777618, + "learning_rate": 1.2538350494818996e-05, + "loss": 1.094, + "step": 12679 + }, + { + "epoch": 0.7703991737043563, + "grad_norm": 0.1539202630519867, + "learning_rate": 1.25320110714455e-05, + "loss": 1.0909, + "step": 12680 + }, + { + "epoch": 0.7704599307369828, + "grad_norm": 0.10257013142108917, + "learning_rate": 1.2525673021468376e-05, + "loss": 0.9988, + "step": 12681 + }, + { + "epoch": 0.7705206877696094, + "grad_norm": 0.22239956259727478, + "learning_rate": 1.2519336345119953e-05, + "loss": 1.0645, + "step": 12682 + }, + { + "epoch": 0.7705814448022359, + "grad_norm": 0.1461649090051651, + "learning_rate": 1.25130010426325e-05, + "loss": 1.028, + "step": 12683 + }, + { + "epoch": 0.7706422018348624, + "grad_norm": 0.20986881852149963, + "learning_rate": 1.250666711423824e-05, + "loss": 1.1951, + "step": 12684 + }, + { + "epoch": 0.770702958867489, + "grad_norm": 0.1416889876127243, + "learning_rate": 1.2500334560169347e-05, + "loss": 1.0959, + "step": 12685 + }, + { + "epoch": 0.7707637159001154, + "grad_norm": 2.351872205734253, + "learning_rate": 1.2494003380657937e-05, + "loss": 1.0869, + "step": 12686 + }, + { + "epoch": 0.7708244729327419, + "grad_norm": 0.1239066869020462, + "learning_rate": 1.2487673575936082e-05, + "loss": 1.0489, + "step": 12687 + }, + { + "epoch": 0.7708852299653685, + "grad_norm": 0.2513124644756317, + "learning_rate": 1.2481345146235807e-05, + "loss": 1.2966, + "step": 12688 + }, + { + "epoch": 0.770945986997995, + "grad_norm": 0.15611149370670319, + "learning_rate": 1.2475018091789075e-05, + "loss": 1.1034, + "step": 12689 + }, + { + "epoch": 0.7710067440306215, + "grad_norm": 0.1298285275697708, + "learning_rate": 1.246869241282781e-05, + "loss": 1.067, + "step": 12690 + }, + { + "epoch": 0.7710675010632481, + "grad_norm": 1.1889944076538086, + "learning_rate": 1.2462368109583861e-05, + "loss": 1.1691, + "step": 12691 + }, + { + "epoch": 0.7711282580958746, + "grad_norm": 0.14899390935897827, + "learning_rate": 1.2456045182289083e-05, + "loss": 1.1229, + "step": 12692 + }, + { + "epoch": 0.7711890151285011, + "grad_norm": 0.14661161601543427, + "learning_rate": 1.2449723631175237e-05, + "loss": 1.2312, + "step": 12693 + }, + { + "epoch": 0.7712497721611277, + "grad_norm": 0.15550890564918518, + "learning_rate": 1.2443403456474017e-05, + "loss": 1.0361, + "step": 12694 + }, + { + "epoch": 0.7713105291937542, + "grad_norm": 0.1178649514913559, + "learning_rate": 1.2437084658417104e-05, + "loss": 1.0193, + "step": 12695 + }, + { + "epoch": 0.7713712862263807, + "grad_norm": 0.34149467945098877, + "learning_rate": 1.2430767237236096e-05, + "loss": 1.0803, + "step": 12696 + }, + { + "epoch": 0.7714320432590073, + "grad_norm": 0.20721812546253204, + "learning_rate": 1.2424451193162594e-05, + "loss": 1.0325, + "step": 12697 + }, + { + "epoch": 0.7714928002916338, + "grad_norm": 0.3458195924758911, + "learning_rate": 1.2418136526428098e-05, + "loss": 1.1719, + "step": 12698 + }, + { + "epoch": 0.7715535573242602, + "grad_norm": 0.1345955729484558, + "learning_rate": 1.2411823237264069e-05, + "loss": 1.1226, + "step": 12699 + }, + { + "epoch": 0.7716143143568868, + "grad_norm": 0.15379756689071655, + "learning_rate": 1.2405511325901942e-05, + "loss": 1.2648, + "step": 12700 + }, + { + "epoch": 0.7716750713895133, + "grad_norm": 0.10465983301401138, + "learning_rate": 1.239920079257303e-05, + "loss": 0.9921, + "step": 12701 + }, + { + "epoch": 0.7717358284221398, + "grad_norm": 0.3000970780849457, + "learning_rate": 1.2392891637508702e-05, + "loss": 1.1169, + "step": 12702 + }, + { + "epoch": 0.7717965854547664, + "grad_norm": 0.14947441220283508, + "learning_rate": 1.2386583860940198e-05, + "loss": 1.0486, + "step": 12703 + }, + { + "epoch": 0.7718573424873929, + "grad_norm": 0.18780438601970673, + "learning_rate": 1.2380277463098733e-05, + "loss": 1.0876, + "step": 12704 + }, + { + "epoch": 0.7719180995200194, + "grad_norm": 0.8837225437164307, + "learning_rate": 1.2373972444215464e-05, + "loss": 1.1414, + "step": 12705 + }, + { + "epoch": 0.771978856552646, + "grad_norm": 0.30864495038986206, + "learning_rate": 1.2367668804521515e-05, + "loss": 1.2305, + "step": 12706 + }, + { + "epoch": 0.7720396135852725, + "grad_norm": 0.17433524131774902, + "learning_rate": 1.2361366544247937e-05, + "loss": 1.1584, + "step": 12707 + }, + { + "epoch": 0.772100370617899, + "grad_norm": 0.14750021696090698, + "learning_rate": 1.2355065663625747e-05, + "loss": 1.073, + "step": 12708 + }, + { + "epoch": 0.7721611276505256, + "grad_norm": 0.1649530678987503, + "learning_rate": 1.23487661628859e-05, + "loss": 1.09, + "step": 12709 + }, + { + "epoch": 0.7722218846831521, + "grad_norm": 0.18619085848331451, + "learning_rate": 1.2342468042259309e-05, + "loss": 1.1082, + "step": 12710 + }, + { + "epoch": 0.7722826417157787, + "grad_norm": 0.13991715013980865, + "learning_rate": 1.2336171301976824e-05, + "loss": 1.0397, + "step": 12711 + }, + { + "epoch": 0.7723433987484051, + "grad_norm": 0.18284957110881805, + "learning_rate": 1.2329875942269253e-05, + "loss": 1.0869, + "step": 12712 + }, + { + "epoch": 0.7724041557810316, + "grad_norm": 0.1092042550444603, + "learning_rate": 1.2323581963367386e-05, + "loss": 1.0545, + "step": 12713 + }, + { + "epoch": 0.7724649128136581, + "grad_norm": 0.16548630595207214, + "learning_rate": 1.2317289365501894e-05, + "loss": 1.0854, + "step": 12714 + }, + { + "epoch": 0.7725256698462847, + "grad_norm": 0.11044737696647644, + "learning_rate": 1.2310998148903441e-05, + "loss": 1.0844, + "step": 12715 + }, + { + "epoch": 0.7725864268789112, + "grad_norm": 0.13364773988723755, + "learning_rate": 1.2304708313802632e-05, + "loss": 1.0503, + "step": 12716 + }, + { + "epoch": 0.7726471839115377, + "grad_norm": 0.1477249413728714, + "learning_rate": 1.2298419860430016e-05, + "loss": 0.9986, + "step": 12717 + }, + { + "epoch": 0.7727079409441643, + "grad_norm": 0.12467773258686066, + "learning_rate": 1.2292132789016125e-05, + "loss": 1.0257, + "step": 12718 + }, + { + "epoch": 0.7727686979767908, + "grad_norm": 0.13956567645072937, + "learning_rate": 1.2285847099791391e-05, + "loss": 1.1357, + "step": 12719 + }, + { + "epoch": 0.7728294550094174, + "grad_norm": 0.2354406863451004, + "learning_rate": 1.2279562792986238e-05, + "loss": 1.0079, + "step": 12720 + }, + { + "epoch": 0.7728902120420439, + "grad_norm": 0.39486417174339294, + "learning_rate": 1.2273279868830973e-05, + "loss": 1.1166, + "step": 12721 + }, + { + "epoch": 0.7729509690746704, + "grad_norm": 0.18616119027137756, + "learning_rate": 1.2266998327555939e-05, + "loss": 1.0442, + "step": 12722 + }, + { + "epoch": 0.773011726107297, + "grad_norm": 0.2511233389377594, + "learning_rate": 1.2260718169391377e-05, + "loss": 1.1556, + "step": 12723 + }, + { + "epoch": 0.7730724831399235, + "grad_norm": 0.13469520211219788, + "learning_rate": 1.225443939456749e-05, + "loss": 1.2609, + "step": 12724 + }, + { + "epoch": 0.7731332401725499, + "grad_norm": 0.1431630700826645, + "learning_rate": 1.2248162003314418e-05, + "loss": 1.041, + "step": 12725 + }, + { + "epoch": 0.7731939972051765, + "grad_norm": 0.25470998883247375, + "learning_rate": 1.2241885995862262e-05, + "loss": 1.1215, + "step": 12726 + }, + { + "epoch": 0.773254754237803, + "grad_norm": 0.6969568133354187, + "learning_rate": 1.223561137244108e-05, + "loss": 1.1334, + "step": 12727 + }, + { + "epoch": 0.7733155112704295, + "grad_norm": 0.11331703513860703, + "learning_rate": 1.222933813328086e-05, + "loss": 0.9843, + "step": 12728 + }, + { + "epoch": 0.7733762683030561, + "grad_norm": 0.1726769357919693, + "learning_rate": 1.222306627861155e-05, + "loss": 1.0938, + "step": 12729 + }, + { + "epoch": 0.7734370253356826, + "grad_norm": 0.12890057265758514, + "learning_rate": 1.221679580866305e-05, + "loss": 1.0258, + "step": 12730 + }, + { + "epoch": 0.7734977823683091, + "grad_norm": 0.144577294588089, + "learning_rate": 1.22105267236652e-05, + "loss": 1.1019, + "step": 12731 + }, + { + "epoch": 0.7735585394009357, + "grad_norm": 0.1436421275138855, + "learning_rate": 1.2204259023847796e-05, + "loss": 1.102, + "step": 12732 + }, + { + "epoch": 0.7736192964335622, + "grad_norm": 0.21898682415485382, + "learning_rate": 1.2197992709440592e-05, + "loss": 1.124, + "step": 12733 + }, + { + "epoch": 0.7736800534661887, + "grad_norm": 0.28340819478034973, + "learning_rate": 1.2191727780673262e-05, + "loss": 1.1258, + "step": 12734 + }, + { + "epoch": 0.7737408104988153, + "grad_norm": 0.1154094710946083, + "learning_rate": 1.2185464237775467e-05, + "loss": 1.054, + "step": 12735 + }, + { + "epoch": 0.7738015675314418, + "grad_norm": 0.12246297299861908, + "learning_rate": 1.217920208097678e-05, + "loss": 1.0494, + "step": 12736 + }, + { + "epoch": 0.7738623245640683, + "grad_norm": 0.629007875919342, + "learning_rate": 1.2172941310506748e-05, + "loss": 1.0815, + "step": 12737 + }, + { + "epoch": 0.7739230815966949, + "grad_norm": 0.22257009148597717, + "learning_rate": 1.216668192659487e-05, + "loss": 1.0272, + "step": 12738 + }, + { + "epoch": 0.7739838386293213, + "grad_norm": 0.13889539241790771, + "learning_rate": 1.2160423929470582e-05, + "loss": 1.079, + "step": 12739 + }, + { + "epoch": 0.7740445956619478, + "grad_norm": 0.12575124204158783, + "learning_rate": 1.2154167319363286e-05, + "loss": 0.982, + "step": 12740 + }, + { + "epoch": 0.7741053526945744, + "grad_norm": 0.12299622595310211, + "learning_rate": 1.2147912096502285e-05, + "loss": 1.0463, + "step": 12741 + }, + { + "epoch": 0.7741661097272009, + "grad_norm": 0.18291832506656647, + "learning_rate": 1.2141658261116867e-05, + "loss": 1.0606, + "step": 12742 + }, + { + "epoch": 0.7742268667598274, + "grad_norm": 0.16416339576244354, + "learning_rate": 1.2135405813436301e-05, + "loss": 1.0713, + "step": 12743 + }, + { + "epoch": 0.774287623792454, + "grad_norm": 0.1198337972164154, + "learning_rate": 1.2129154753689754e-05, + "loss": 1.1134, + "step": 12744 + }, + { + "epoch": 0.7743483808250805, + "grad_norm": 0.11347414553165436, + "learning_rate": 1.2122905082106357e-05, + "loss": 1.0261, + "step": 12745 + }, + { + "epoch": 0.774409137857707, + "grad_norm": 0.16280025243759155, + "learning_rate": 1.21166567989152e-05, + "loss": 1.2671, + "step": 12746 + }, + { + "epoch": 0.7744698948903336, + "grad_norm": 0.17514172196388245, + "learning_rate": 1.2110409904345305e-05, + "loss": 1.1239, + "step": 12747 + }, + { + "epoch": 0.7745306519229601, + "grad_norm": 0.15606075525283813, + "learning_rate": 1.2104164398625662e-05, + "loss": 1.131, + "step": 12748 + }, + { + "epoch": 0.7745914089555866, + "grad_norm": 0.12669014930725098, + "learning_rate": 1.2097920281985198e-05, + "loss": 1.0605, + "step": 12749 + }, + { + "epoch": 0.7746521659882132, + "grad_norm": 0.7310323119163513, + "learning_rate": 1.2091677554652792e-05, + "loss": 1.119, + "step": 12750 + }, + { + "epoch": 0.7747129230208397, + "grad_norm": 0.10180962085723877, + "learning_rate": 1.2085436216857277e-05, + "loss": 1.0049, + "step": 12751 + }, + { + "epoch": 0.7747736800534661, + "grad_norm": 0.10264743119478226, + "learning_rate": 1.2079196268827419e-05, + "loss": 1.0467, + "step": 12752 + }, + { + "epoch": 0.7748344370860927, + "grad_norm": 0.11282286047935486, + "learning_rate": 1.2072957710791955e-05, + "loss": 1.0158, + "step": 12753 + }, + { + "epoch": 0.7748951941187192, + "grad_norm": 0.18218880891799927, + "learning_rate": 1.206672054297956e-05, + "loss": 1.1653, + "step": 12754 + }, + { + "epoch": 0.7749559511513457, + "grad_norm": 0.27505213022232056, + "learning_rate": 1.2060484765618851e-05, + "loss": 1.2276, + "step": 12755 + }, + { + "epoch": 0.7750167081839723, + "grad_norm": 0.17598998546600342, + "learning_rate": 1.205425037893841e-05, + "loss": 1.0465, + "step": 12756 + }, + { + "epoch": 0.7750774652165988, + "grad_norm": 0.11984530836343765, + "learning_rate": 1.2048017383166753e-05, + "loss": 0.9833, + "step": 12757 + }, + { + "epoch": 0.7751382222492254, + "grad_norm": 0.1310300976037979, + "learning_rate": 1.2041785778532339e-05, + "loss": 1.0269, + "step": 12758 + }, + { + "epoch": 0.7751989792818519, + "grad_norm": 0.23538441956043243, + "learning_rate": 1.2035555565263623e-05, + "loss": 1.1002, + "step": 12759 + }, + { + "epoch": 0.7752597363144784, + "grad_norm": 0.12445208430290222, + "learning_rate": 1.202932674358897e-05, + "loss": 1.0521, + "step": 12760 + }, + { + "epoch": 0.775320493347105, + "grad_norm": 0.1923869252204895, + "learning_rate": 1.202309931373667e-05, + "loss": 1.0318, + "step": 12761 + }, + { + "epoch": 0.7753812503797315, + "grad_norm": 0.3935041129589081, + "learning_rate": 1.2016873275935004e-05, + "loss": 1.0251, + "step": 12762 + }, + { + "epoch": 0.775442007412358, + "grad_norm": 0.13856910169124603, + "learning_rate": 1.2010648630412175e-05, + "loss": 1.0557, + "step": 12763 + }, + { + "epoch": 0.7755027644449846, + "grad_norm": 0.19792990386486053, + "learning_rate": 1.2004425377396383e-05, + "loss": 1.1648, + "step": 12764 + }, + { + "epoch": 0.775563521477611, + "grad_norm": 0.14647440612316132, + "learning_rate": 1.199820351711572e-05, + "loss": 1.0814, + "step": 12765 + }, + { + "epoch": 0.7756242785102375, + "grad_norm": 0.13246290385723114, + "learning_rate": 1.199198304979825e-05, + "loss": 1.0764, + "step": 12766 + }, + { + "epoch": 0.775685035542864, + "grad_norm": 1.2874914407730103, + "learning_rate": 1.1985763975672004e-05, + "loss": 1.003, + "step": 12767 + }, + { + "epoch": 0.7757457925754906, + "grad_norm": 0.24590452015399933, + "learning_rate": 1.1979546294964899e-05, + "loss": 1.2237, + "step": 12768 + }, + { + "epoch": 0.7758065496081171, + "grad_norm": 0.4111698567867279, + "learning_rate": 1.1973330007904892e-05, + "loss": 1.1349, + "step": 12769 + }, + { + "epoch": 0.7758673066407437, + "grad_norm": 0.10011672973632812, + "learning_rate": 1.1967115114719818e-05, + "loss": 1.0022, + "step": 12770 + }, + { + "epoch": 0.7759280636733702, + "grad_norm": 0.26296618580818176, + "learning_rate": 1.1960901615637494e-05, + "loss": 1.1378, + "step": 12771 + }, + { + "epoch": 0.7759888207059967, + "grad_norm": 0.14681220054626465, + "learning_rate": 1.1954689510885675e-05, + "loss": 1.0925, + "step": 12772 + }, + { + "epoch": 0.7760495777386233, + "grad_norm": 0.154121994972229, + "learning_rate": 1.1948478800692065e-05, + "loss": 1.1075, + "step": 12773 + }, + { + "epoch": 0.7761103347712498, + "grad_norm": 0.10315140336751938, + "learning_rate": 1.194226948528432e-05, + "loss": 1.0448, + "step": 12774 + }, + { + "epoch": 0.7761710918038763, + "grad_norm": 0.116829514503479, + "learning_rate": 1.1936061564890044e-05, + "loss": 1.0315, + "step": 12775 + }, + { + "epoch": 0.7762318488365029, + "grad_norm": 0.13943418860435486, + "learning_rate": 1.1929855039736786e-05, + "loss": 1.0536, + "step": 12776 + }, + { + "epoch": 0.7762926058691294, + "grad_norm": 0.18070149421691895, + "learning_rate": 1.1923649910052054e-05, + "loss": 1.1006, + "step": 12777 + }, + { + "epoch": 0.7763533629017558, + "grad_norm": 0.13364963233470917, + "learning_rate": 1.191744617606329e-05, + "loss": 1.0934, + "step": 12778 + }, + { + "epoch": 0.7764141199343824, + "grad_norm": 0.1769493967294693, + "learning_rate": 1.1911243837997887e-05, + "loss": 1.095, + "step": 12779 + }, + { + "epoch": 0.7764748769670089, + "grad_norm": 0.12219464778900146, + "learning_rate": 1.1905042896083235e-05, + "loss": 1.0864, + "step": 12780 + }, + { + "epoch": 0.7765356339996354, + "grad_norm": 0.23633837699890137, + "learning_rate": 1.1898843350546579e-05, + "loss": 1.1657, + "step": 12781 + }, + { + "epoch": 0.776596391032262, + "grad_norm": 0.16362082958221436, + "learning_rate": 1.1892645201615188e-05, + "loss": 1.0872, + "step": 12782 + }, + { + "epoch": 0.7766571480648885, + "grad_norm": 0.1066916361451149, + "learning_rate": 1.1886448449516251e-05, + "loss": 1.0152, + "step": 12783 + }, + { + "epoch": 0.776717905097515, + "grad_norm": 0.11957157403230667, + "learning_rate": 1.1880253094476901e-05, + "loss": 1.0419, + "step": 12784 + }, + { + "epoch": 0.7767786621301416, + "grad_norm": 0.12654800713062286, + "learning_rate": 1.1874059136724253e-05, + "loss": 1.0735, + "step": 12785 + }, + { + "epoch": 0.7768394191627681, + "grad_norm": 0.27567052841186523, + "learning_rate": 1.1867866576485342e-05, + "loss": 1.2144, + "step": 12786 + }, + { + "epoch": 0.7769001761953946, + "grad_norm": 0.14142964780330658, + "learning_rate": 1.1861675413987162e-05, + "loss": 1.0765, + "step": 12787 + }, + { + "epoch": 0.7769609332280212, + "grad_norm": 0.178693950176239, + "learning_rate": 1.1855485649456633e-05, + "loss": 1.1344, + "step": 12788 + }, + { + "epoch": 0.7770216902606477, + "grad_norm": 0.6197378635406494, + "learning_rate": 1.184929728312063e-05, + "loss": 1.0859, + "step": 12789 + }, + { + "epoch": 0.7770824472932742, + "grad_norm": 0.1488838493824005, + "learning_rate": 1.184311031520603e-05, + "loss": 1.0169, + "step": 12790 + }, + { + "epoch": 0.7771432043259007, + "grad_norm": 0.1694076657295227, + "learning_rate": 1.1836924745939593e-05, + "loss": 1.1357, + "step": 12791 + }, + { + "epoch": 0.7772039613585272, + "grad_norm": 0.13343407213687897, + "learning_rate": 1.1830740575548055e-05, + "loss": 1.1007, + "step": 12792 + }, + { + "epoch": 0.7772647183911537, + "grad_norm": 0.21537846326828003, + "learning_rate": 1.1824557804258101e-05, + "loss": 1.1133, + "step": 12793 + }, + { + "epoch": 0.7773254754237803, + "grad_norm": 0.17056311666965485, + "learning_rate": 1.1818376432296358e-05, + "loss": 1.0999, + "step": 12794 + }, + { + "epoch": 0.7773862324564068, + "grad_norm": 0.19689084589481354, + "learning_rate": 1.1812196459889413e-05, + "loss": 1.2193, + "step": 12795 + }, + { + "epoch": 0.7774469894890333, + "grad_norm": 0.22491297125816345, + "learning_rate": 1.1806017887263782e-05, + "loss": 1.0621, + "step": 12796 + }, + { + "epoch": 0.7775077465216599, + "grad_norm": 0.2003372311592102, + "learning_rate": 1.179984071464595e-05, + "loss": 1.0682, + "step": 12797 + }, + { + "epoch": 0.7775685035542864, + "grad_norm": 0.17473378777503967, + "learning_rate": 1.179366494226234e-05, + "loss": 1.0865, + "step": 12798 + }, + { + "epoch": 0.777629260586913, + "grad_norm": 0.3054414391517639, + "learning_rate": 1.1787490570339326e-05, + "loss": 1.196, + "step": 12799 + }, + { + "epoch": 0.7776900176195395, + "grad_norm": 0.10129894316196442, + "learning_rate": 1.1781317599103236e-05, + "loss": 1.0289, + "step": 12800 + }, + { + "epoch": 0.777750774652166, + "grad_norm": 0.311766117811203, + "learning_rate": 1.1775146028780332e-05, + "loss": 1.1187, + "step": 12801 + }, + { + "epoch": 0.7778115316847926, + "grad_norm": 0.13467176258563995, + "learning_rate": 1.1768975859596843e-05, + "loss": 1.0423, + "step": 12802 + }, + { + "epoch": 0.7778722887174191, + "grad_norm": 0.18228508532047272, + "learning_rate": 1.1762807091778933e-05, + "loss": 1.0727, + "step": 12803 + }, + { + "epoch": 0.7779330457500455, + "grad_norm": 0.12224791198968887, + "learning_rate": 1.175663972555272e-05, + "loss": 1.1033, + "step": 12804 + }, + { + "epoch": 0.777993802782672, + "grad_norm": 0.14524850249290466, + "learning_rate": 1.1750473761144254e-05, + "loss": 1.046, + "step": 12805 + }, + { + "epoch": 0.7780545598152986, + "grad_norm": 0.1831561028957367, + "learning_rate": 1.174430919877958e-05, + "loss": 1.0119, + "step": 12806 + }, + { + "epoch": 0.7781153168479251, + "grad_norm": 0.3943927586078644, + "learning_rate": 1.1738146038684662e-05, + "loss": 1.2763, + "step": 12807 + }, + { + "epoch": 0.7781760738805517, + "grad_norm": 0.6518760323524475, + "learning_rate": 1.1731984281085385e-05, + "loss": 1.0511, + "step": 12808 + }, + { + "epoch": 0.7782368309131782, + "grad_norm": 0.12650218605995178, + "learning_rate": 1.1725823926207612e-05, + "loss": 1.0181, + "step": 12809 + }, + { + "epoch": 0.7782975879458047, + "grad_norm": 0.11816632747650146, + "learning_rate": 1.1719664974277156e-05, + "loss": 1.0329, + "step": 12810 + }, + { + "epoch": 0.7783583449784313, + "grad_norm": 0.16062131524085999, + "learning_rate": 1.1713507425519793e-05, + "loss": 1.1158, + "step": 12811 + }, + { + "epoch": 0.7784191020110578, + "grad_norm": 0.17883680760860443, + "learning_rate": 1.1707351280161216e-05, + "loss": 1.1943, + "step": 12812 + }, + { + "epoch": 0.7784798590436843, + "grad_norm": 0.15480759739875793, + "learning_rate": 1.1701196538427079e-05, + "loss": 1.1563, + "step": 12813 + }, + { + "epoch": 0.7785406160763109, + "grad_norm": 0.1633390337228775, + "learning_rate": 1.1695043200542982e-05, + "loss": 1.0152, + "step": 12814 + }, + { + "epoch": 0.7786013731089374, + "grad_norm": 0.16703739762306213, + "learning_rate": 1.1688891266734486e-05, + "loss": 1.047, + "step": 12815 + }, + { + "epoch": 0.7786621301415639, + "grad_norm": 0.15224634110927582, + "learning_rate": 1.1682740737227083e-05, + "loss": 1.1063, + "step": 12816 + }, + { + "epoch": 0.7787228871741904, + "grad_norm": 0.13909094035625458, + "learning_rate": 1.1676591612246224e-05, + "loss": 1.0591, + "step": 12817 + }, + { + "epoch": 0.7787836442068169, + "grad_norm": 0.14900583028793335, + "learning_rate": 1.1670443892017314e-05, + "loss": 1.1542, + "step": 12818 + }, + { + "epoch": 0.7788444012394434, + "grad_norm": 0.22665028274059296, + "learning_rate": 1.1664297576765687e-05, + "loss": 1.1414, + "step": 12819 + }, + { + "epoch": 0.77890515827207, + "grad_norm": 0.15524421632289886, + "learning_rate": 1.1658152666716638e-05, + "loss": 1.0208, + "step": 12820 + }, + { + "epoch": 0.7789659153046965, + "grad_norm": 0.14574138820171356, + "learning_rate": 1.1652009162095424e-05, + "loss": 1.1487, + "step": 12821 + }, + { + "epoch": 0.779026672337323, + "grad_norm": 0.3664519786834717, + "learning_rate": 1.1645867063127219e-05, + "loss": 1.0267, + "step": 12822 + }, + { + "epoch": 0.7790874293699496, + "grad_norm": 0.20768223702907562, + "learning_rate": 1.1639726370037174e-05, + "loss": 1.022, + "step": 12823 + }, + { + "epoch": 0.7791481864025761, + "grad_norm": 0.17619727551937103, + "learning_rate": 1.1633587083050374e-05, + "loss": 1.1451, + "step": 12824 + }, + { + "epoch": 0.7792089434352026, + "grad_norm": 0.13255421817302704, + "learning_rate": 1.1627449202391849e-05, + "loss": 1.0363, + "step": 12825 + }, + { + "epoch": 0.7792697004678292, + "grad_norm": 0.12598589062690735, + "learning_rate": 1.1621312728286581e-05, + "loss": 1.0438, + "step": 12826 + }, + { + "epoch": 0.7793304575004557, + "grad_norm": 0.6745483875274658, + "learning_rate": 1.1615177660959547e-05, + "loss": 1.1095, + "step": 12827 + }, + { + "epoch": 0.7793912145330822, + "grad_norm": 0.18534834682941437, + "learning_rate": 1.1609044000635577e-05, + "loss": 1.0394, + "step": 12828 + }, + { + "epoch": 0.7794519715657088, + "grad_norm": 0.13179436326026917, + "learning_rate": 1.160291174753952e-05, + "loss": 1.0576, + "step": 12829 + }, + { + "epoch": 0.7795127285983352, + "grad_norm": 0.17568884789943695, + "learning_rate": 1.1596780901896138e-05, + "loss": 1.0925, + "step": 12830 + }, + { + "epoch": 0.7795734856309617, + "grad_norm": 0.2002159059047699, + "learning_rate": 1.1590651463930191e-05, + "loss": 1.0955, + "step": 12831 + }, + { + "epoch": 0.7796342426635883, + "grad_norm": 0.1371026635169983, + "learning_rate": 1.158452343386634e-05, + "loss": 0.9927, + "step": 12832 + }, + { + "epoch": 0.7796949996962148, + "grad_norm": 0.6081156730651855, + "learning_rate": 1.157839681192921e-05, + "loss": 1.2386, + "step": 12833 + }, + { + "epoch": 0.7797557567288413, + "grad_norm": 0.1321309506893158, + "learning_rate": 1.1572271598343381e-05, + "loss": 1.0653, + "step": 12834 + }, + { + "epoch": 0.7798165137614679, + "grad_norm": 0.16023296117782593, + "learning_rate": 1.1566147793333343e-05, + "loss": 1.0827, + "step": 12835 + }, + { + "epoch": 0.7798772707940944, + "grad_norm": 0.1974269151687622, + "learning_rate": 1.1560025397123592e-05, + "loss": 1.1422, + "step": 12836 + }, + { + "epoch": 0.779938027826721, + "grad_norm": 0.18258316814899445, + "learning_rate": 1.1553904409938548e-05, + "loss": 1.0663, + "step": 12837 + }, + { + "epoch": 0.7799987848593475, + "grad_norm": 0.15497559309005737, + "learning_rate": 1.1547784832002567e-05, + "loss": 1.0619, + "step": 12838 + }, + { + "epoch": 0.780059541891974, + "grad_norm": 0.09782491624355316, + "learning_rate": 1.1541666663539963e-05, + "loss": 0.9979, + "step": 12839 + }, + { + "epoch": 0.7801202989246006, + "grad_norm": 0.7276683449745178, + "learning_rate": 1.1535549904775006e-05, + "loss": 1.1006, + "step": 12840 + }, + { + "epoch": 0.7801810559572271, + "grad_norm": 7.402686595916748, + "learning_rate": 1.15294345559319e-05, + "loss": 1.0741, + "step": 12841 + }, + { + "epoch": 0.7802418129898536, + "grad_norm": 0.1841738522052765, + "learning_rate": 1.1523320617234806e-05, + "loss": 1.1202, + "step": 12842 + }, + { + "epoch": 0.7803025700224802, + "grad_norm": 0.14523489773273468, + "learning_rate": 1.1517208088907833e-05, + "loss": 1.0366, + "step": 12843 + }, + { + "epoch": 0.7803633270551066, + "grad_norm": 0.12169469147920609, + "learning_rate": 1.1511096971175034e-05, + "loss": 1.0584, + "step": 12844 + }, + { + "epoch": 0.7804240840877331, + "grad_norm": 0.16683024168014526, + "learning_rate": 1.1504987264260414e-05, + "loss": 1.0587, + "step": 12845 + }, + { + "epoch": 0.7804848411203597, + "grad_norm": 0.17324349284172058, + "learning_rate": 1.1498878968387916e-05, + "loss": 1.0863, + "step": 12846 + }, + { + "epoch": 0.7805455981529862, + "grad_norm": 0.27542775869369507, + "learning_rate": 1.1492772083781478e-05, + "loss": 1.022, + "step": 12847 + }, + { + "epoch": 0.7806063551856127, + "grad_norm": 0.13815228641033173, + "learning_rate": 1.1486666610664908e-05, + "loss": 1.1454, + "step": 12848 + }, + { + "epoch": 0.7806671122182393, + "grad_norm": 0.12550212442874908, + "learning_rate": 1.1480562549262019e-05, + "loss": 1.0226, + "step": 12849 + }, + { + "epoch": 0.7807278692508658, + "grad_norm": 0.12101548165082932, + "learning_rate": 1.1474459899796559e-05, + "loss": 1.0132, + "step": 12850 + }, + { + "epoch": 0.7807886262834923, + "grad_norm": 0.1512622833251953, + "learning_rate": 1.1468358662492202e-05, + "loss": 1.0244, + "step": 12851 + }, + { + "epoch": 0.7808493833161189, + "grad_norm": 0.3264085054397583, + "learning_rate": 1.1462258837572615e-05, + "loss": 1.047, + "step": 12852 + }, + { + "epoch": 0.7809101403487454, + "grad_norm": 0.15057966113090515, + "learning_rate": 1.1456160425261386e-05, + "loss": 1.0285, + "step": 12853 + }, + { + "epoch": 0.7809708973813719, + "grad_norm": 0.11836346238851547, + "learning_rate": 1.145006342578206e-05, + "loss": 1.0454, + "step": 12854 + }, + { + "epoch": 0.7810316544139985, + "grad_norm": 0.19074952602386475, + "learning_rate": 1.1443967839358094e-05, + "loss": 1.1281, + "step": 12855 + }, + { + "epoch": 0.781092411446625, + "grad_norm": 0.2930346429347992, + "learning_rate": 1.1437873666212928e-05, + "loss": 1.0946, + "step": 12856 + }, + { + "epoch": 0.7811531684792514, + "grad_norm": 0.13799655437469482, + "learning_rate": 1.1431780906569966e-05, + "loss": 1.0709, + "step": 12857 + }, + { + "epoch": 0.781213925511878, + "grad_norm": 0.12468823045492172, + "learning_rate": 1.1425689560652536e-05, + "loss": 1.0806, + "step": 12858 + }, + { + "epoch": 0.7812746825445045, + "grad_norm": 0.1512451022863388, + "learning_rate": 1.1419599628683909e-05, + "loss": 1.0849, + "step": 12859 + }, + { + "epoch": 0.781335439577131, + "grad_norm": 0.2822954058647156, + "learning_rate": 1.1413511110887314e-05, + "loss": 1.0825, + "step": 12860 + }, + { + "epoch": 0.7813961966097576, + "grad_norm": 0.10462968796491623, + "learning_rate": 1.1407424007485929e-05, + "loss": 0.9911, + "step": 12861 + }, + { + "epoch": 0.7814569536423841, + "grad_norm": 0.12267274409532547, + "learning_rate": 1.1401338318702876e-05, + "loss": 1.0221, + "step": 12862 + }, + { + "epoch": 0.7815177106750106, + "grad_norm": 0.14118769764900208, + "learning_rate": 1.1395254044761227e-05, + "loss": 1.1061, + "step": 12863 + }, + { + "epoch": 0.7815784677076372, + "grad_norm": 0.157334104180336, + "learning_rate": 1.1389171185884006e-05, + "loss": 1.1094, + "step": 12864 + }, + { + "epoch": 0.7816392247402637, + "grad_norm": 0.2385435551404953, + "learning_rate": 1.1383089742294179e-05, + "loss": 1.1388, + "step": 12865 + }, + { + "epoch": 0.7816999817728902, + "grad_norm": 0.2596820890903473, + "learning_rate": 1.1377009714214658e-05, + "loss": 1.11, + "step": 12866 + }, + { + "epoch": 0.7817607388055168, + "grad_norm": 0.2865890562534332, + "learning_rate": 1.1370931101868315e-05, + "loss": 1.0757, + "step": 12867 + }, + { + "epoch": 0.7818214958381433, + "grad_norm": 0.104372538626194, + "learning_rate": 1.1364853905477956e-05, + "loss": 1.0245, + "step": 12868 + }, + { + "epoch": 0.7818822528707698, + "grad_norm": 0.16807779669761658, + "learning_rate": 1.1358778125266344e-05, + "loss": 1.1249, + "step": 12869 + }, + { + "epoch": 0.7819430099033963, + "grad_norm": 0.25906169414520264, + "learning_rate": 1.1352703761456196e-05, + "loss": 1.1488, + "step": 12870 + }, + { + "epoch": 0.7820037669360228, + "grad_norm": 0.3172304928302765, + "learning_rate": 1.1346630814270154e-05, + "loss": 1.0882, + "step": 12871 + }, + { + "epoch": 0.7820645239686493, + "grad_norm": 0.1424168199300766, + "learning_rate": 1.1340559283930823e-05, + "loss": 1.0628, + "step": 12872 + }, + { + "epoch": 0.7821252810012759, + "grad_norm": 0.13839729130268097, + "learning_rate": 1.1334489170660779e-05, + "loss": 1.0628, + "step": 12873 + }, + { + "epoch": 0.7821860380339024, + "grad_norm": 0.26125919818878174, + "learning_rate": 1.1328420474682516e-05, + "loss": 1.1765, + "step": 12874 + }, + { + "epoch": 0.7822467950665289, + "grad_norm": 0.1128741130232811, + "learning_rate": 1.1322353196218465e-05, + "loss": 1.0465, + "step": 12875 + }, + { + "epoch": 0.7823075520991555, + "grad_norm": 0.1515772044658661, + "learning_rate": 1.1316287335491032e-05, + "loss": 1.0886, + "step": 12876 + }, + { + "epoch": 0.782368309131782, + "grad_norm": 0.14717566967010498, + "learning_rate": 1.1310222892722556e-05, + "loss": 1.1337, + "step": 12877 + }, + { + "epoch": 0.7824290661644085, + "grad_norm": 0.19552476704120636, + "learning_rate": 1.1304159868135344e-05, + "loss": 1.1012, + "step": 12878 + }, + { + "epoch": 0.7824898231970351, + "grad_norm": 0.2012212723493576, + "learning_rate": 1.129809826195164e-05, + "loss": 1.1345, + "step": 12879 + }, + { + "epoch": 0.7825505802296616, + "grad_norm": 0.2269507199525833, + "learning_rate": 1.1292038074393618e-05, + "loss": 1.2227, + "step": 12880 + }, + { + "epoch": 0.7826113372622882, + "grad_norm": 0.1639718860387802, + "learning_rate": 1.1285979305683441e-05, + "loss": 1.0904, + "step": 12881 + }, + { + "epoch": 0.7826720942949147, + "grad_norm": 0.9077989459037781, + "learning_rate": 1.1279921956043144e-05, + "loss": 1.0791, + "step": 12882 + }, + { + "epoch": 0.7827328513275411, + "grad_norm": 0.14997003972530365, + "learning_rate": 1.1273866025694813e-05, + "loss": 1.0671, + "step": 12883 + }, + { + "epoch": 0.7827936083601676, + "grad_norm": 0.14225393533706665, + "learning_rate": 1.1267811514860399e-05, + "loss": 1.109, + "step": 12884 + }, + { + "epoch": 0.7828543653927942, + "grad_norm": 0.19047455489635468, + "learning_rate": 1.1261758423761848e-05, + "loss": 1.1537, + "step": 12885 + }, + { + "epoch": 0.7829151224254207, + "grad_norm": 0.15400217473506927, + "learning_rate": 1.1255706752621026e-05, + "loss": 1.0874, + "step": 12886 + }, + { + "epoch": 0.7829758794580473, + "grad_norm": 0.19490458071231842, + "learning_rate": 1.1249656501659767e-05, + "loss": 1.087, + "step": 12887 + }, + { + "epoch": 0.7830366364906738, + "grad_norm": 0.128581240773201, + "learning_rate": 1.1243607671099831e-05, + "loss": 1.0141, + "step": 12888 + }, + { + "epoch": 0.7830973935233003, + "grad_norm": 0.11357045918703079, + "learning_rate": 1.1237560261162954e-05, + "loss": 1.0539, + "step": 12889 + }, + { + "epoch": 0.7831581505559269, + "grad_norm": 0.12881328165531158, + "learning_rate": 1.1231514272070798e-05, + "loss": 1.0236, + "step": 12890 + }, + { + "epoch": 0.7832189075885534, + "grad_norm": 0.1487404853105545, + "learning_rate": 1.1225469704044978e-05, + "loss": 1.0815, + "step": 12891 + }, + { + "epoch": 0.7832796646211799, + "grad_norm": 0.15983174741268158, + "learning_rate": 1.1219426557307061e-05, + "loss": 1.0992, + "step": 12892 + }, + { + "epoch": 0.7833404216538065, + "grad_norm": 0.10644016414880753, + "learning_rate": 1.1213384832078545e-05, + "loss": 1.0345, + "step": 12893 + }, + { + "epoch": 0.783401178686433, + "grad_norm": 0.13520319759845734, + "learning_rate": 1.1207344528580937e-05, + "loss": 1.2728, + "step": 12894 + }, + { + "epoch": 0.7834619357190595, + "grad_norm": 0.11113191395998001, + "learning_rate": 1.1201305647035598e-05, + "loss": 1.0074, + "step": 12895 + }, + { + "epoch": 0.783522692751686, + "grad_norm": 0.4640451967716217, + "learning_rate": 1.1195268187663899e-05, + "loss": 1.0263, + "step": 12896 + }, + { + "epoch": 0.7835834497843125, + "grad_norm": 0.13769207894802094, + "learning_rate": 1.1189232150687145e-05, + "loss": 1.0469, + "step": 12897 + }, + { + "epoch": 0.783644206816939, + "grad_norm": 0.3489821255207062, + "learning_rate": 1.1183197536326573e-05, + "loss": 1.1215, + "step": 12898 + }, + { + "epoch": 0.7837049638495656, + "grad_norm": 0.10755273699760437, + "learning_rate": 1.1177164344803415e-05, + "loss": 1.0593, + "step": 12899 + }, + { + "epoch": 0.7837657208821921, + "grad_norm": 0.13133902847766876, + "learning_rate": 1.11711325763388e-05, + "loss": 1.0788, + "step": 12900 + }, + { + "epoch": 0.7838264779148186, + "grad_norm": 0.1868750900030136, + "learning_rate": 1.1165102231153846e-05, + "loss": 1.0827, + "step": 12901 + }, + { + "epoch": 0.7838872349474452, + "grad_norm": 0.2032591551542282, + "learning_rate": 1.1159073309469553e-05, + "loss": 1.1533, + "step": 12902 + }, + { + "epoch": 0.7839479919800717, + "grad_norm": 0.16748055815696716, + "learning_rate": 1.115304581150693e-05, + "loss": 1.1537, + "step": 12903 + }, + { + "epoch": 0.7840087490126982, + "grad_norm": 0.12188611924648285, + "learning_rate": 1.114701973748693e-05, + "loss": 1.0402, + "step": 12904 + }, + { + "epoch": 0.7840695060453248, + "grad_norm": 0.13133476674556732, + "learning_rate": 1.1140995087630434e-05, + "loss": 1.0876, + "step": 12905 + }, + { + "epoch": 0.7841302630779513, + "grad_norm": 0.13564378023147583, + "learning_rate": 1.1134971862158277e-05, + "loss": 1.0694, + "step": 12906 + }, + { + "epoch": 0.7841910201105778, + "grad_norm": 0.22386950254440308, + "learning_rate": 1.112895006129125e-05, + "loss": 1.1008, + "step": 12907 + }, + { + "epoch": 0.7842517771432044, + "grad_norm": 0.11448849737644196, + "learning_rate": 1.1122929685250045e-05, + "loss": 1.0933, + "step": 12908 + }, + { + "epoch": 0.7843125341758308, + "grad_norm": 0.16264112293720245, + "learning_rate": 1.1116910734255375e-05, + "loss": 1.1221, + "step": 12909 + }, + { + "epoch": 0.7843732912084573, + "grad_norm": 0.11998185515403748, + "learning_rate": 1.1110893208527866e-05, + "loss": 1.0744, + "step": 12910 + }, + { + "epoch": 0.7844340482410839, + "grad_norm": 0.1568371057510376, + "learning_rate": 1.1104877108288082e-05, + "loss": 1.0677, + "step": 12911 + }, + { + "epoch": 0.7844948052737104, + "grad_norm": 0.13577401638031006, + "learning_rate": 1.1098862433756545e-05, + "loss": 1.0711, + "step": 12912 + }, + { + "epoch": 0.7845555623063369, + "grad_norm": 0.11962083727121353, + "learning_rate": 1.1092849185153725e-05, + "loss": 1.0296, + "step": 12913 + }, + { + "epoch": 0.7846163193389635, + "grad_norm": 0.13312940299510956, + "learning_rate": 1.1086837362700036e-05, + "loss": 1.0341, + "step": 12914 + }, + { + "epoch": 0.78467707637159, + "grad_norm": 0.11990492790937424, + "learning_rate": 1.1080826966615848e-05, + "loss": 1.0306, + "step": 12915 + }, + { + "epoch": 0.7847378334042165, + "grad_norm": 0.13836820423603058, + "learning_rate": 1.1074817997121472e-05, + "loss": 1.0391, + "step": 12916 + }, + { + "epoch": 0.7847985904368431, + "grad_norm": 0.1741345226764679, + "learning_rate": 1.1068810454437161e-05, + "loss": 1.1001, + "step": 12917 + }, + { + "epoch": 0.7848593474694696, + "grad_norm": 0.20143209397792816, + "learning_rate": 1.1062804338783134e-05, + "loss": 1.1875, + "step": 12918 + }, + { + "epoch": 0.7849201045020961, + "grad_norm": 0.14273950457572937, + "learning_rate": 1.1056799650379523e-05, + "loss": 1.0625, + "step": 12919 + }, + { + "epoch": 0.7849808615347227, + "grad_norm": 2.2902894020080566, + "learning_rate": 1.1050796389446466e-05, + "loss": 1.0467, + "step": 12920 + }, + { + "epoch": 0.7850416185673492, + "grad_norm": 0.3513869345188141, + "learning_rate": 1.1044794556204009e-05, + "loss": 1.1476, + "step": 12921 + }, + { + "epoch": 0.7851023755999756, + "grad_norm": 0.1931939572095871, + "learning_rate": 1.1038794150872116e-05, + "loss": 1.1242, + "step": 12922 + }, + { + "epoch": 0.7851631326326022, + "grad_norm": 0.8756475448608398, + "learning_rate": 1.1032795173670747e-05, + "loss": 1.0923, + "step": 12923 + }, + { + "epoch": 0.7852238896652287, + "grad_norm": 0.36362606287002563, + "learning_rate": 1.1026797624819818e-05, + "loss": 1.1572, + "step": 12924 + }, + { + "epoch": 0.7852846466978552, + "grad_norm": 0.131103515625, + "learning_rate": 1.1020801504539152e-05, + "loss": 1.086, + "step": 12925 + }, + { + "epoch": 0.7853454037304818, + "grad_norm": 0.12699051201343536, + "learning_rate": 1.1014806813048545e-05, + "loss": 1.0769, + "step": 12926 + }, + { + "epoch": 0.7854061607631083, + "grad_norm": 0.1372574120759964, + "learning_rate": 1.1008813550567725e-05, + "loss": 1.0063, + "step": 12927 + }, + { + "epoch": 0.7854669177957349, + "grad_norm": 0.6051851511001587, + "learning_rate": 1.1002821717316387e-05, + "loss": 1.0447, + "step": 12928 + }, + { + "epoch": 0.7855276748283614, + "grad_norm": 5.440812110900879, + "learning_rate": 1.099683131351415e-05, + "loss": 1.071, + "step": 12929 + }, + { + "epoch": 0.7855884318609879, + "grad_norm": 0.1737460494041443, + "learning_rate": 1.0990842339380608e-05, + "loss": 1.1249, + "step": 12930 + }, + { + "epoch": 0.7856491888936145, + "grad_norm": 0.31205490231513977, + "learning_rate": 1.0984854795135274e-05, + "loss": 1.0677, + "step": 12931 + }, + { + "epoch": 0.785709945926241, + "grad_norm": 0.18849575519561768, + "learning_rate": 1.0978868680997634e-05, + "loss": 1.1083, + "step": 12932 + }, + { + "epoch": 0.7857707029588675, + "grad_norm": 0.18966080248355865, + "learning_rate": 1.09728839971871e-05, + "loss": 1.0447, + "step": 12933 + }, + { + "epoch": 0.7858314599914941, + "grad_norm": 0.12457823008298874, + "learning_rate": 1.0966900743923052e-05, + "loss": 1.0785, + "step": 12934 + }, + { + "epoch": 0.7858922170241205, + "grad_norm": 0.34969809651374817, + "learning_rate": 1.0960918921424801e-05, + "loss": 1.0148, + "step": 12935 + }, + { + "epoch": 0.785952974056747, + "grad_norm": 0.29265135526657104, + "learning_rate": 1.0954938529911608e-05, + "loss": 1.069, + "step": 12936 + }, + { + "epoch": 0.7860137310893736, + "grad_norm": 0.12106429040431976, + "learning_rate": 1.0948959569602695e-05, + "loss": 1.0792, + "step": 12937 + }, + { + "epoch": 0.7860744881220001, + "grad_norm": 0.241487056016922, + "learning_rate": 1.094298204071722e-05, + "loss": 1.1267, + "step": 12938 + }, + { + "epoch": 0.7861352451546266, + "grad_norm": 0.13090172410011292, + "learning_rate": 1.0937005943474266e-05, + "loss": 0.9939, + "step": 12939 + }, + { + "epoch": 0.7861960021872532, + "grad_norm": 0.10063038021326065, + "learning_rate": 1.093103127809293e-05, + "loss": 1.0189, + "step": 12940 + }, + { + "epoch": 0.7862567592198797, + "grad_norm": 0.10935657471418381, + "learning_rate": 1.0925058044792208e-05, + "loss": 1.0644, + "step": 12941 + }, + { + "epoch": 0.7863175162525062, + "grad_norm": 0.2448442280292511, + "learning_rate": 1.091908624379102e-05, + "loss": 1.1166, + "step": 12942 + }, + { + "epoch": 0.7863782732851328, + "grad_norm": 0.7896729707717896, + "learning_rate": 1.0913115875308283e-05, + "loss": 1.1464, + "step": 12943 + }, + { + "epoch": 0.7864390303177593, + "grad_norm": 0.14838793873786926, + "learning_rate": 1.0907146939562818e-05, + "loss": 1.0086, + "step": 12944 + }, + { + "epoch": 0.7864997873503858, + "grad_norm": 0.10829726606607437, + "learning_rate": 1.0901179436773462e-05, + "loss": 1.0336, + "step": 12945 + }, + { + "epoch": 0.7865605443830124, + "grad_norm": 0.3901865482330322, + "learning_rate": 1.0895213367158924e-05, + "loss": 1.0696, + "step": 12946 + }, + { + "epoch": 0.7866213014156389, + "grad_norm": 0.16181686520576477, + "learning_rate": 1.08892487309379e-05, + "loss": 1.0718, + "step": 12947 + }, + { + "epoch": 0.7866820584482654, + "grad_norm": 0.16711625456809998, + "learning_rate": 1.088328552832904e-05, + "loss": 1.0335, + "step": 12948 + }, + { + "epoch": 0.7867428154808919, + "grad_norm": 0.18678444623947144, + "learning_rate": 1.0877323759550884e-05, + "loss": 1.0285, + "step": 12949 + }, + { + "epoch": 0.7868035725135184, + "grad_norm": 0.1567946821451187, + "learning_rate": 1.0871363424821995e-05, + "loss": 0.9879, + "step": 12950 + }, + { + "epoch": 0.7868643295461449, + "grad_norm": 0.17752927541732788, + "learning_rate": 1.086540452436085e-05, + "loss": 1.0294, + "step": 12951 + }, + { + "epoch": 0.7869250865787715, + "grad_norm": 0.42185690999031067, + "learning_rate": 1.0859447058385862e-05, + "loss": 1.1973, + "step": 12952 + }, + { + "epoch": 0.786985843611398, + "grad_norm": 0.13146072626113892, + "learning_rate": 1.085349102711541e-05, + "loss": 1.0571, + "step": 12953 + }, + { + "epoch": 0.7870466006440245, + "grad_norm": 0.10630039870738983, + "learning_rate": 1.0847536430767813e-05, + "loss": 0.9831, + "step": 12954 + }, + { + "epoch": 0.7871073576766511, + "grad_norm": 0.1909441202878952, + "learning_rate": 1.0841583269561334e-05, + "loss": 1.0351, + "step": 12955 + }, + { + "epoch": 0.7871681147092776, + "grad_norm": 0.1109812930226326, + "learning_rate": 1.0835631543714186e-05, + "loss": 0.9941, + "step": 12956 + }, + { + "epoch": 0.7872288717419041, + "grad_norm": 0.1555769443511963, + "learning_rate": 1.082968125344454e-05, + "loss": 1.0479, + "step": 12957 + }, + { + "epoch": 0.7872896287745307, + "grad_norm": 0.16438855230808258, + "learning_rate": 1.0823732398970492e-05, + "loss": 1.0779, + "step": 12958 + }, + { + "epoch": 0.7873503858071572, + "grad_norm": 0.11183220893144608, + "learning_rate": 1.0817784980510115e-05, + "loss": 0.9849, + "step": 12959 + }, + { + "epoch": 0.7874111428397838, + "grad_norm": 0.13389062881469727, + "learning_rate": 1.0811838998281382e-05, + "loss": 1.0351, + "step": 12960 + }, + { + "epoch": 0.7874718998724103, + "grad_norm": 0.30436673760414124, + "learning_rate": 1.0805894452502296e-05, + "loss": 1.2856, + "step": 12961 + }, + { + "epoch": 0.7875326569050367, + "grad_norm": 0.10991024225950241, + "learning_rate": 1.0799951343390714e-05, + "loss": 1.0655, + "step": 12962 + }, + { + "epoch": 0.7875934139376632, + "grad_norm": 0.15333764255046844, + "learning_rate": 1.0794009671164484e-05, + "loss": 1.1108, + "step": 12963 + }, + { + "epoch": 0.7876541709702898, + "grad_norm": 1.4359647035598755, + "learning_rate": 1.0788069436041415e-05, + "loss": 1.0512, + "step": 12964 + }, + { + "epoch": 0.7877149280029163, + "grad_norm": 0.11067744344472885, + "learning_rate": 1.0782130638239224e-05, + "loss": 1.0046, + "step": 12965 + }, + { + "epoch": 0.7877756850355428, + "grad_norm": 0.23311765491962433, + "learning_rate": 1.0776193277975633e-05, + "loss": 1.0825, + "step": 12966 + }, + { + "epoch": 0.7878364420681694, + "grad_norm": 0.12318284064531326, + "learning_rate": 1.0770257355468256e-05, + "loss": 1.028, + "step": 12967 + }, + { + "epoch": 0.7878971991007959, + "grad_norm": 0.16160030663013458, + "learning_rate": 1.076432287093469e-05, + "loss": 1.179, + "step": 12968 + }, + { + "epoch": 0.7879579561334225, + "grad_norm": 0.3871358633041382, + "learning_rate": 1.075838982459244e-05, + "loss": 1.1621, + "step": 12969 + }, + { + "epoch": 0.788018713166049, + "grad_norm": 0.14440901577472687, + "learning_rate": 1.0752458216658984e-05, + "loss": 1.0397, + "step": 12970 + }, + { + "epoch": 0.7880794701986755, + "grad_norm": 0.11363501846790314, + "learning_rate": 1.0746528047351772e-05, + "loss": 0.9929, + "step": 12971 + }, + { + "epoch": 0.7881402272313021, + "grad_norm": 0.09387975931167603, + "learning_rate": 1.0740599316888167e-05, + "loss": 1.074, + "step": 12972 + }, + { + "epoch": 0.7882009842639286, + "grad_norm": 0.4731582999229431, + "learning_rate": 1.0734672025485476e-05, + "loss": 1.0937, + "step": 12973 + }, + { + "epoch": 0.7882617412965551, + "grad_norm": 0.16211146116256714, + "learning_rate": 1.0728746173360987e-05, + "loss": 1.1173, + "step": 12974 + }, + { + "epoch": 0.7883224983291816, + "grad_norm": 0.15314048528671265, + "learning_rate": 1.0722821760731871e-05, + "loss": 1.0428, + "step": 12975 + }, + { + "epoch": 0.7883832553618081, + "grad_norm": 0.1534699648618698, + "learning_rate": 1.0716898787815327e-05, + "loss": 1.0606, + "step": 12976 + }, + { + "epoch": 0.7884440123944346, + "grad_norm": 0.12805882096290588, + "learning_rate": 1.0710977254828458e-05, + "loss": 1.0784, + "step": 12977 + }, + { + "epoch": 0.7885047694270612, + "grad_norm": 0.11373342573642731, + "learning_rate": 1.0705057161988308e-05, + "loss": 1.0482, + "step": 12978 + }, + { + "epoch": 0.7885655264596877, + "grad_norm": 0.10901042819023132, + "learning_rate": 1.069913850951188e-05, + "loss": 1.0653, + "step": 12979 + }, + { + "epoch": 0.7886262834923142, + "grad_norm": 0.18511372804641724, + "learning_rate": 1.069322129761613e-05, + "loss": 1.1228, + "step": 12980 + }, + { + "epoch": 0.7886870405249408, + "grad_norm": 0.20320042967796326, + "learning_rate": 1.0687305526517949e-05, + "loss": 1.1332, + "step": 12981 + }, + { + "epoch": 0.7887477975575673, + "grad_norm": 0.16992968320846558, + "learning_rate": 1.0681391196434187e-05, + "loss": 1.1155, + "step": 12982 + }, + { + "epoch": 0.7888085545901938, + "grad_norm": 0.21405334770679474, + "learning_rate": 1.0675478307581627e-05, + "loss": 1.0134, + "step": 12983 + }, + { + "epoch": 0.7888693116228204, + "grad_norm": 0.09224823117256165, + "learning_rate": 1.066956686017701e-05, + "loss": 0.9915, + "step": 12984 + }, + { + "epoch": 0.7889300686554469, + "grad_norm": 0.18220220506191254, + "learning_rate": 1.0663656854437027e-05, + "loss": 1.1035, + "step": 12985 + }, + { + "epoch": 0.7889908256880734, + "grad_norm": 0.10819485783576965, + "learning_rate": 1.065774829057829e-05, + "loss": 1.0231, + "step": 12986 + }, + { + "epoch": 0.7890515827207, + "grad_norm": 0.12159349024295807, + "learning_rate": 1.0651841168817406e-05, + "loss": 1.0249, + "step": 12987 + }, + { + "epoch": 0.7891123397533264, + "grad_norm": 0.21518905460834503, + "learning_rate": 1.06459354893709e-05, + "loss": 1.1489, + "step": 12988 + }, + { + "epoch": 0.7891730967859529, + "grad_norm": 0.13885504007339478, + "learning_rate": 1.0640031252455229e-05, + "loss": 1.0772, + "step": 12989 + }, + { + "epoch": 0.7892338538185795, + "grad_norm": 0.11387944966554642, + "learning_rate": 1.0634128458286819e-05, + "loss": 0.9961, + "step": 12990 + }, + { + "epoch": 0.789294610851206, + "grad_norm": 0.7089393138885498, + "learning_rate": 1.0628227107082022e-05, + "loss": 1.1075, + "step": 12991 + }, + { + "epoch": 0.7893553678838325, + "grad_norm": 0.13204385340213776, + "learning_rate": 1.0622327199057192e-05, + "loss": 1.0834, + "step": 12992 + }, + { + "epoch": 0.7894161249164591, + "grad_norm": 0.12831248342990875, + "learning_rate": 1.0616428734428562e-05, + "loss": 1.0608, + "step": 12993 + }, + { + "epoch": 0.7894768819490856, + "grad_norm": 0.25789281725883484, + "learning_rate": 1.0610531713412352e-05, + "loss": 1.0449, + "step": 12994 + }, + { + "epoch": 0.7895376389817121, + "grad_norm": 0.10960721224546432, + "learning_rate": 1.0604636136224733e-05, + "loss": 1.0431, + "step": 12995 + }, + { + "epoch": 0.7895983960143387, + "grad_norm": 0.1594148576259613, + "learning_rate": 1.0598742003081762e-05, + "loss": 1.1183, + "step": 12996 + }, + { + "epoch": 0.7896591530469652, + "grad_norm": 5.922727584838867, + "learning_rate": 1.0592849314199532e-05, + "loss": 1.0603, + "step": 12997 + }, + { + "epoch": 0.7897199100795917, + "grad_norm": 0.1328015923500061, + "learning_rate": 1.0586958069794028e-05, + "loss": 1.0169, + "step": 12998 + }, + { + "epoch": 0.7897806671122183, + "grad_norm": 0.16861291229724884, + "learning_rate": 1.058106827008119e-05, + "loss": 1.0451, + "step": 12999 + }, + { + "epoch": 0.7898414241448448, + "grad_norm": 6.649844169616699, + "learning_rate": 1.0575179915276917e-05, + "loss": 1.0618, + "step": 13000 + }, + { + "epoch": 0.7899021811774712, + "grad_norm": 0.15430712699890137, + "learning_rate": 1.0569293005597048e-05, + "loss": 1.1052, + "step": 13001 + }, + { + "epoch": 0.7899629382100978, + "grad_norm": 0.2725692093372345, + "learning_rate": 1.0563407541257364e-05, + "loss": 1.1127, + "step": 13002 + }, + { + "epoch": 0.7900236952427243, + "grad_norm": 0.19224128127098083, + "learning_rate": 1.05575235224736e-05, + "loss": 1.0674, + "step": 13003 + }, + { + "epoch": 0.7900844522753508, + "grad_norm": 0.13214683532714844, + "learning_rate": 1.0551640949461433e-05, + "loss": 1.0691, + "step": 13004 + }, + { + "epoch": 0.7901452093079774, + "grad_norm": 0.15809671580791473, + "learning_rate": 1.0545759822436496e-05, + "loss": 1.1539, + "step": 13005 + }, + { + "epoch": 0.7902059663406039, + "grad_norm": 0.1760806292295456, + "learning_rate": 1.0539880141614355e-05, + "loss": 1.0844, + "step": 13006 + }, + { + "epoch": 0.7902667233732305, + "grad_norm": 0.15000326931476593, + "learning_rate": 1.0534001907210522e-05, + "loss": 1.1025, + "step": 13007 + }, + { + "epoch": 0.790327480405857, + "grad_norm": 0.11090702563524246, + "learning_rate": 1.0528125119440507e-05, + "loss": 1.0588, + "step": 13008 + }, + { + "epoch": 0.7903882374384835, + "grad_norm": 0.14829276502132416, + "learning_rate": 1.0522249778519677e-05, + "loss": 1.092, + "step": 13009 + }, + { + "epoch": 0.7904489944711101, + "grad_norm": 0.10501495748758316, + "learning_rate": 1.051637588466342e-05, + "loss": 1.0122, + "step": 13010 + }, + { + "epoch": 0.7905097515037366, + "grad_norm": 0.11800161004066467, + "learning_rate": 1.0510503438087033e-05, + "loss": 1.0314, + "step": 13011 + }, + { + "epoch": 0.7905705085363631, + "grad_norm": 0.21349094808101654, + "learning_rate": 1.0504632439005762e-05, + "loss": 1.0456, + "step": 13012 + }, + { + "epoch": 0.7906312655689897, + "grad_norm": 0.35080158710479736, + "learning_rate": 1.0498762887634832e-05, + "loss": 1.1781, + "step": 13013 + }, + { + "epoch": 0.7906920226016161, + "grad_norm": 0.14761504530906677, + "learning_rate": 1.0492894784189389e-05, + "loss": 0.9978, + "step": 13014 + }, + { + "epoch": 0.7907527796342426, + "grad_norm": 0.1098819300532341, + "learning_rate": 1.0487028128884541e-05, + "loss": 1.0609, + "step": 13015 + }, + { + "epoch": 0.7908135366668692, + "grad_norm": 1.724346399307251, + "learning_rate": 1.0481162921935283e-05, + "loss": 1.0793, + "step": 13016 + }, + { + "epoch": 0.7908742936994957, + "grad_norm": 0.11826807260513306, + "learning_rate": 1.0475299163556651e-05, + "loss": 1.0929, + "step": 13017 + }, + { + "epoch": 0.7909350507321222, + "grad_norm": 0.12821528315544128, + "learning_rate": 1.0469436853963566e-05, + "loss": 1.0892, + "step": 13018 + }, + { + "epoch": 0.7909958077647488, + "grad_norm": 0.2582976222038269, + "learning_rate": 1.0463575993370917e-05, + "loss": 1.1234, + "step": 13019 + }, + { + "epoch": 0.7910565647973753, + "grad_norm": 0.1912316530942917, + "learning_rate": 1.0457716581993531e-05, + "loss": 1.1253, + "step": 13020 + }, + { + "epoch": 0.7911173218300018, + "grad_norm": 0.10567454993724823, + "learning_rate": 1.045185862004619e-05, + "loss": 1.0588, + "step": 13021 + }, + { + "epoch": 0.7911780788626284, + "grad_norm": 0.17749686539173126, + "learning_rate": 1.0446002107743612e-05, + "loss": 1.0633, + "step": 13022 + }, + { + "epoch": 0.7912388358952549, + "grad_norm": 0.16335220634937286, + "learning_rate": 1.0440147045300474e-05, + "loss": 1.055, + "step": 13023 + }, + { + "epoch": 0.7912995929278814, + "grad_norm": 0.14854291081428528, + "learning_rate": 1.0434293432931398e-05, + "loss": 1.0141, + "step": 13024 + }, + { + "epoch": 0.791360349960508, + "grad_norm": 0.21400520205497742, + "learning_rate": 1.0428441270850935e-05, + "loss": 1.1369, + "step": 13025 + }, + { + "epoch": 0.7914211069931345, + "grad_norm": 0.09609528630971909, + "learning_rate": 1.0422590559273609e-05, + "loss": 1.0386, + "step": 13026 + }, + { + "epoch": 0.7914818640257609, + "grad_norm": 1.0455647706985474, + "learning_rate": 1.0416741298413878e-05, + "loss": 1.1828, + "step": 13027 + }, + { + "epoch": 0.7915426210583875, + "grad_norm": 0.19983692467212677, + "learning_rate": 1.0410893488486145e-05, + "loss": 1.1282, + "step": 13028 + }, + { + "epoch": 0.791603378091014, + "grad_norm": 4.148086071014404, + "learning_rate": 1.0405047129704764e-05, + "loss": 1.2251, + "step": 13029 + }, + { + "epoch": 0.7916641351236405, + "grad_norm": 0.18747232854366302, + "learning_rate": 1.0399202222284033e-05, + "loss": 1.0936, + "step": 13030 + }, + { + "epoch": 0.7917248921562671, + "grad_norm": 0.1455703228712082, + "learning_rate": 1.0393358766438199e-05, + "loss": 1.0376, + "step": 13031 + }, + { + "epoch": 0.7917856491888936, + "grad_norm": 0.14367344975471497, + "learning_rate": 1.0387516762381444e-05, + "loss": 1.0534, + "step": 13032 + }, + { + "epoch": 0.7918464062215201, + "grad_norm": 0.13840726017951965, + "learning_rate": 1.0381676210327934e-05, + "loss": 1.0164, + "step": 13033 + }, + { + "epoch": 0.7919071632541467, + "grad_norm": 0.11749805510044098, + "learning_rate": 1.0375837110491737e-05, + "loss": 1.0384, + "step": 13034 + }, + { + "epoch": 0.7919679202867732, + "grad_norm": 0.1618070900440216, + "learning_rate": 1.0369999463086905e-05, + "loss": 1.0941, + "step": 13035 + }, + { + "epoch": 0.7920286773193997, + "grad_norm": 0.21264484524726868, + "learning_rate": 1.0364163268327387e-05, + "loss": 1.1066, + "step": 13036 + }, + { + "epoch": 0.7920894343520263, + "grad_norm": 0.1712161749601364, + "learning_rate": 1.0358328526427113e-05, + "loss": 1.0919, + "step": 13037 + }, + { + "epoch": 0.7921501913846528, + "grad_norm": 0.10082321614027023, + "learning_rate": 1.0352495237599985e-05, + "loss": 1.0525, + "step": 13038 + }, + { + "epoch": 0.7922109484172793, + "grad_norm": 0.27976882457733154, + "learning_rate": 1.0346663402059809e-05, + "loss": 1.2963, + "step": 13039 + }, + { + "epoch": 0.7922717054499058, + "grad_norm": 0.12554937601089478, + "learning_rate": 1.0340833020020347e-05, + "loss": 1.0778, + "step": 13040 + }, + { + "epoch": 0.7923324624825323, + "grad_norm": 0.2319071739912033, + "learning_rate": 1.0335004091695333e-05, + "loss": 1.0929, + "step": 13041 + }, + { + "epoch": 0.7923932195151588, + "grad_norm": 0.09979699552059174, + "learning_rate": 1.0329176617298381e-05, + "loss": 1.0091, + "step": 13042 + }, + { + "epoch": 0.7924539765477854, + "grad_norm": 0.20330984890460968, + "learning_rate": 1.0323350597043141e-05, + "loss": 1.143, + "step": 13043 + }, + { + "epoch": 0.7925147335804119, + "grad_norm": 0.6742188334465027, + "learning_rate": 1.031752603114316e-05, + "loss": 1.0548, + "step": 13044 + }, + { + "epoch": 0.7925754906130384, + "grad_norm": 1.9805086851119995, + "learning_rate": 1.0311702919811928e-05, + "loss": 1.1302, + "step": 13045 + }, + { + "epoch": 0.792636247645665, + "grad_norm": 0.131484255194664, + "learning_rate": 1.03058812632629e-05, + "loss": 1.0748, + "step": 13046 + }, + { + "epoch": 0.7926970046782915, + "grad_norm": 0.16047367453575134, + "learning_rate": 1.0300061061709465e-05, + "loss": 1.1203, + "step": 13047 + }, + { + "epoch": 0.792757761710918, + "grad_norm": 0.14155474305152893, + "learning_rate": 1.0294242315364966e-05, + "loss": 1.0415, + "step": 13048 + }, + { + "epoch": 0.7928185187435446, + "grad_norm": 0.29167142510414124, + "learning_rate": 1.0288425024442689e-05, + "loss": 0.9946, + "step": 13049 + }, + { + "epoch": 0.7928792757761711, + "grad_norm": 0.10048561543226242, + "learning_rate": 1.0282609189155867e-05, + "loss": 1.0097, + "step": 13050 + }, + { + "epoch": 0.7929400328087977, + "grad_norm": 0.23381289839744568, + "learning_rate": 1.0276794809717688e-05, + "loss": 1.1833, + "step": 13051 + }, + { + "epoch": 0.7930007898414242, + "grad_norm": 0.16975292563438416, + "learning_rate": 1.0270981886341268e-05, + "loss": 1.0282, + "step": 13052 + }, + { + "epoch": 0.7930615468740507, + "grad_norm": 0.12489912658929825, + "learning_rate": 1.0265170419239678e-05, + "loss": 1.0749, + "step": 13053 + }, + { + "epoch": 0.7931223039066772, + "grad_norm": 0.17183059453964233, + "learning_rate": 1.0259360408625957e-05, + "loss": 1.0156, + "step": 13054 + }, + { + "epoch": 0.7931830609393037, + "grad_norm": 0.1626148819923401, + "learning_rate": 1.0253551854713079e-05, + "loss": 1.0509, + "step": 13055 + }, + { + "epoch": 0.7932438179719302, + "grad_norm": 0.1180085688829422, + "learning_rate": 1.0247744757713923e-05, + "loss": 1.0849, + "step": 13056 + }, + { + "epoch": 0.7933045750045568, + "grad_norm": 0.15574371814727783, + "learning_rate": 1.0241939117841371e-05, + "loss": 1.0382, + "step": 13057 + }, + { + "epoch": 0.7933653320371833, + "grad_norm": 0.12374011427164078, + "learning_rate": 1.0236134935308205e-05, + "loss": 1.0393, + "step": 13058 + }, + { + "epoch": 0.7934260890698098, + "grad_norm": 0.15752482414245605, + "learning_rate": 1.023033221032722e-05, + "loss": 1.1205, + "step": 13059 + }, + { + "epoch": 0.7934868461024364, + "grad_norm": 0.12764738500118256, + "learning_rate": 1.0224530943111094e-05, + "loss": 1.0722, + "step": 13060 + }, + { + "epoch": 0.7935476031350629, + "grad_norm": 0.10181684792041779, + "learning_rate": 1.0218731133872484e-05, + "loss": 1.045, + "step": 13061 + }, + { + "epoch": 0.7936083601676894, + "grad_norm": 0.10471001267433167, + "learning_rate": 1.0212932782823965e-05, + "loss": 1.0592, + "step": 13062 + }, + { + "epoch": 0.793669117200316, + "grad_norm": 0.38642391562461853, + "learning_rate": 1.0207135890178066e-05, + "loss": 1.0435, + "step": 13063 + }, + { + "epoch": 0.7937298742329425, + "grad_norm": 0.12425728142261505, + "learning_rate": 1.0201340456147313e-05, + "loss": 1.0719, + "step": 13064 + }, + { + "epoch": 0.793790631265569, + "grad_norm": 0.13001494109630585, + "learning_rate": 1.0195546480944113e-05, + "loss": 1.0305, + "step": 13065 + }, + { + "epoch": 0.7938513882981956, + "grad_norm": 0.12355656176805496, + "learning_rate": 1.0189753964780857e-05, + "loss": 1.0618, + "step": 13066 + }, + { + "epoch": 0.793912145330822, + "grad_norm": 0.12111663073301315, + "learning_rate": 1.018396290786986e-05, + "loss": 1.0489, + "step": 13067 + }, + { + "epoch": 0.7939729023634485, + "grad_norm": 0.11895778030157089, + "learning_rate": 1.01781733104234e-05, + "loss": 1.0285, + "step": 13068 + }, + { + "epoch": 0.7940336593960751, + "grad_norm": 0.09712397307157516, + "learning_rate": 1.0172385172653692e-05, + "loss": 1.0307, + "step": 13069 + }, + { + "epoch": 0.7940944164287016, + "grad_norm": 0.14626674354076385, + "learning_rate": 1.0166598494772905e-05, + "loss": 1.0442, + "step": 13070 + }, + { + "epoch": 0.7941551734613281, + "grad_norm": 0.12935839593410492, + "learning_rate": 1.0160813276993153e-05, + "loss": 1.0446, + "step": 13071 + }, + { + "epoch": 0.7942159304939547, + "grad_norm": 0.181484192609787, + "learning_rate": 1.0155029519526487e-05, + "loss": 1.1431, + "step": 13072 + }, + { + "epoch": 0.7942766875265812, + "grad_norm": 0.13355356454849243, + "learning_rate": 1.0149247222584923e-05, + "loss": 1.0258, + "step": 13073 + }, + { + "epoch": 0.7943374445592077, + "grad_norm": 0.13636019825935364, + "learning_rate": 1.0143466386380385e-05, + "loss": 1.0626, + "step": 13074 + }, + { + "epoch": 0.7943982015918343, + "grad_norm": 0.10556565225124359, + "learning_rate": 1.013768701112482e-05, + "loss": 1.0374, + "step": 13075 + }, + { + "epoch": 0.7944589586244608, + "grad_norm": 0.31504783034324646, + "learning_rate": 1.013190909703003e-05, + "loss": 1.0132, + "step": 13076 + }, + { + "epoch": 0.7945197156570873, + "grad_norm": 0.10009024292230606, + "learning_rate": 1.0126132644307817e-05, + "loss": 1.0124, + "step": 13077 + }, + { + "epoch": 0.7945804726897139, + "grad_norm": 0.1064009815454483, + "learning_rate": 1.0120357653169921e-05, + "loss": 1.0205, + "step": 13078 + }, + { + "epoch": 0.7946412297223404, + "grad_norm": 0.11540800333023071, + "learning_rate": 1.0114584123828008e-05, + "loss": 1.023, + "step": 13079 + }, + { + "epoch": 0.7947019867549668, + "grad_norm": 0.15264244377613068, + "learning_rate": 1.0108812056493738e-05, + "loss": 1.0702, + "step": 13080 + }, + { + "epoch": 0.7947627437875934, + "grad_norm": 0.16647522151470184, + "learning_rate": 1.010304145137867e-05, + "loss": 1.1173, + "step": 13081 + }, + { + "epoch": 0.7948235008202199, + "grad_norm": 0.3319341838359833, + "learning_rate": 1.0097272308694345e-05, + "loss": 1.273, + "step": 13082 + }, + { + "epoch": 0.7948842578528464, + "grad_norm": 0.12956970930099487, + "learning_rate": 1.0091504628652204e-05, + "loss": 1.0449, + "step": 13083 + }, + { + "epoch": 0.794945014885473, + "grad_norm": 0.5735976099967957, + "learning_rate": 1.0085738411463664e-05, + "loss": 0.9946, + "step": 13084 + }, + { + "epoch": 0.7950057719180995, + "grad_norm": 0.10754632204771042, + "learning_rate": 1.0079973657340108e-05, + "loss": 1.0732, + "step": 13085 + }, + { + "epoch": 0.795066528950726, + "grad_norm": 0.31312495470046997, + "learning_rate": 1.0074210366492831e-05, + "loss": 1.1918, + "step": 13086 + }, + { + "epoch": 0.7951272859833526, + "grad_norm": 0.12384498864412308, + "learning_rate": 1.0068448539133096e-05, + "loss": 1.0675, + "step": 13087 + }, + { + "epoch": 0.7951880430159791, + "grad_norm": 0.2554831802845001, + "learning_rate": 1.0062688175472113e-05, + "loss": 1.1105, + "step": 13088 + }, + { + "epoch": 0.7952488000486057, + "grad_norm": 0.15807794034481049, + "learning_rate": 1.0056929275720982e-05, + "loss": 1.0198, + "step": 13089 + }, + { + "epoch": 0.7953095570812322, + "grad_norm": 0.1424776017665863, + "learning_rate": 1.0051171840090845e-05, + "loss": 1.0663, + "step": 13090 + }, + { + "epoch": 0.7953703141138587, + "grad_norm": 0.12099411338567734, + "learning_rate": 1.0045415868792729e-05, + "loss": 1.0547, + "step": 13091 + }, + { + "epoch": 0.7954310711464853, + "grad_norm": 0.1377709060907364, + "learning_rate": 1.003966136203761e-05, + "loss": 1.0447, + "step": 13092 + }, + { + "epoch": 0.7954918281791117, + "grad_norm": 0.1375075876712799, + "learning_rate": 1.0033908320036433e-05, + "loss": 1.0632, + "step": 13093 + }, + { + "epoch": 0.7955525852117382, + "grad_norm": 0.1367979496717453, + "learning_rate": 1.0028156743000072e-05, + "loss": 1.0598, + "step": 13094 + }, + { + "epoch": 0.7956133422443648, + "grad_norm": 0.13092826306819916, + "learning_rate": 1.0022406631139352e-05, + "loss": 1.0403, + "step": 13095 + }, + { + "epoch": 0.7956740992769913, + "grad_norm": 0.15822453796863556, + "learning_rate": 1.0016657984665045e-05, + "loss": 1.112, + "step": 13096 + }, + { + "epoch": 0.7957348563096178, + "grad_norm": 0.17074939608573914, + "learning_rate": 1.0010910803787865e-05, + "loss": 1.1617, + "step": 13097 + }, + { + "epoch": 0.7957956133422444, + "grad_norm": 0.17768992483615875, + "learning_rate": 1.0005165088718483e-05, + "loss": 1.0631, + "step": 13098 + }, + { + "epoch": 0.7958563703748709, + "grad_norm": 0.13596971333026886, + "learning_rate": 9.999420839667505e-06, + "loss": 1.0526, + "step": 13099 + }, + { + "epoch": 0.7959171274074974, + "grad_norm": 0.1868571788072586, + "learning_rate": 9.993678056845474e-06, + "loss": 1.1884, + "step": 13100 + }, + { + "epoch": 0.795977884440124, + "grad_norm": 0.13141293823719025, + "learning_rate": 9.987936740462923e-06, + "loss": 1.0669, + "step": 13101 + }, + { + "epoch": 0.7960386414727505, + "grad_norm": 0.15654785931110382, + "learning_rate": 9.982196890730294e-06, + "loss": 1.1035, + "step": 13102 + }, + { + "epoch": 0.796099398505377, + "grad_norm": 0.15711770951747894, + "learning_rate": 9.97645850785796e-06, + "loss": 1.0229, + "step": 13103 + }, + { + "epoch": 0.7961601555380036, + "grad_norm": 0.3458845913410187, + "learning_rate": 9.970721592056276e-06, + "loss": 1.1981, + "step": 13104 + }, + { + "epoch": 0.7962209125706301, + "grad_norm": 0.2297106832265854, + "learning_rate": 9.964986143535515e-06, + "loss": 1.0576, + "step": 13105 + }, + { + "epoch": 0.7962816696032565, + "grad_norm": 0.14223851263523102, + "learning_rate": 9.959252162505944e-06, + "loss": 1.1083, + "step": 13106 + }, + { + "epoch": 0.7963424266358831, + "grad_norm": 0.2293814718723297, + "learning_rate": 9.953519649177717e-06, + "loss": 1.0875, + "step": 13107 + }, + { + "epoch": 0.7964031836685096, + "grad_norm": 0.13752363622188568, + "learning_rate": 9.947788603760987e-06, + "loss": 1.0701, + "step": 13108 + }, + { + "epoch": 0.7964639407011361, + "grad_norm": 0.12873654067516327, + "learning_rate": 9.942059026465794e-06, + "loss": 1.0256, + "step": 13109 + }, + { + "epoch": 0.7965246977337627, + "grad_norm": 0.13620485365390778, + "learning_rate": 9.936330917502157e-06, + "loss": 1.0144, + "step": 13110 + }, + { + "epoch": 0.7965854547663892, + "grad_norm": 0.1412217766046524, + "learning_rate": 9.93060427708007e-06, + "loss": 1.057, + "step": 13111 + }, + { + "epoch": 0.7966462117990157, + "grad_norm": 0.21335838735103607, + "learning_rate": 9.924879105409423e-06, + "loss": 1.0967, + "step": 13112 + }, + { + "epoch": 0.7967069688316423, + "grad_norm": 0.20370370149612427, + "learning_rate": 9.919155402700082e-06, + "loss": 1.087, + "step": 13113 + }, + { + "epoch": 0.7967677258642688, + "grad_norm": 0.1506512314081192, + "learning_rate": 9.913433169161845e-06, + "loss": 1.0187, + "step": 13114 + }, + { + "epoch": 0.7968284828968953, + "grad_norm": 0.19056439399719238, + "learning_rate": 9.907712405004466e-06, + "loss": 1.1814, + "step": 13115 + }, + { + "epoch": 0.7968892399295219, + "grad_norm": 0.12057289481163025, + "learning_rate": 9.901993110437635e-06, + "loss": 1.0754, + "step": 13116 + }, + { + "epoch": 0.7969499969621484, + "grad_norm": 0.12495239824056625, + "learning_rate": 9.896275285671003e-06, + "loss": 1.0486, + "step": 13117 + }, + { + "epoch": 0.797010753994775, + "grad_norm": 0.1455259621143341, + "learning_rate": 9.890558930914146e-06, + "loss": 1.0627, + "step": 13118 + }, + { + "epoch": 0.7970715110274014, + "grad_norm": 0.10583961755037308, + "learning_rate": 9.884844046376607e-06, + "loss": 1.0394, + "step": 13119 + }, + { + "epoch": 0.7971322680600279, + "grad_norm": 0.12401382625102997, + "learning_rate": 9.879130632267864e-06, + "loss": 1.0273, + "step": 13120 + }, + { + "epoch": 0.7971930250926544, + "grad_norm": 0.30011144280433655, + "learning_rate": 9.873418688797332e-06, + "loss": 1.0685, + "step": 13121 + }, + { + "epoch": 0.797253782125281, + "grad_norm": 0.4915805459022522, + "learning_rate": 9.86770821617442e-06, + "loss": 1.1651, + "step": 13122 + }, + { + "epoch": 0.7973145391579075, + "grad_norm": 0.17420677840709686, + "learning_rate": 9.861999214608403e-06, + "loss": 1.0862, + "step": 13123 + }, + { + "epoch": 0.797375296190534, + "grad_norm": 0.12418276816606522, + "learning_rate": 9.856291684308566e-06, + "loss": 1.0232, + "step": 13124 + }, + { + "epoch": 0.7974360532231606, + "grad_norm": 0.4045256972312927, + "learning_rate": 9.850585625484105e-06, + "loss": 1.1388, + "step": 13125 + }, + { + "epoch": 0.7974968102557871, + "grad_norm": 0.12089885771274567, + "learning_rate": 9.8448810383442e-06, + "loss": 1.0539, + "step": 13126 + }, + { + "epoch": 0.7975575672884136, + "grad_norm": 0.1419459581375122, + "learning_rate": 9.839177923097942e-06, + "loss": 1.1091, + "step": 13127 + }, + { + "epoch": 0.7976183243210402, + "grad_norm": 0.11342260241508484, + "learning_rate": 9.833476279954395e-06, + "loss": 1.0748, + "step": 13128 + }, + { + "epoch": 0.7976790813536667, + "grad_norm": 0.24692946672439575, + "learning_rate": 9.827776109122526e-06, + "loss": 1.1235, + "step": 13129 + }, + { + "epoch": 0.7977398383862933, + "grad_norm": 0.13997912406921387, + "learning_rate": 9.822077410811275e-06, + "loss": 1.0214, + "step": 13130 + }, + { + "epoch": 0.7978005954189198, + "grad_norm": 0.2458542138338089, + "learning_rate": 9.816380185229556e-06, + "loss": 1.2319, + "step": 13131 + }, + { + "epoch": 0.7978613524515462, + "grad_norm": 0.18516439199447632, + "learning_rate": 9.810684432586188e-06, + "loss": 1.115, + "step": 13132 + }, + { + "epoch": 0.7979221094841727, + "grad_norm": 0.18086761236190796, + "learning_rate": 9.80499015308995e-06, + "loss": 1.0498, + "step": 13133 + }, + { + "epoch": 0.7979828665167993, + "grad_norm": 0.16141141951084137, + "learning_rate": 9.79929734694957e-06, + "loss": 1.117, + "step": 13134 + }, + { + "epoch": 0.7980436235494258, + "grad_norm": 0.1359003484249115, + "learning_rate": 9.793606014373712e-06, + "loss": 1.0516, + "step": 13135 + }, + { + "epoch": 0.7981043805820524, + "grad_norm": 0.12042230367660522, + "learning_rate": 9.787916155571003e-06, + "loss": 1.0276, + "step": 13136 + }, + { + "epoch": 0.7981651376146789, + "grad_norm": 0.8082221746444702, + "learning_rate": 9.782227770749997e-06, + "loss": 1.2311, + "step": 13137 + }, + { + "epoch": 0.7982258946473054, + "grad_norm": 0.11634038388729095, + "learning_rate": 9.776540860119205e-06, + "loss": 1.0583, + "step": 13138 + }, + { + "epoch": 0.798286651679932, + "grad_norm": 0.12310948222875595, + "learning_rate": 9.770855423887088e-06, + "loss": 1.0488, + "step": 13139 + }, + { + "epoch": 0.7983474087125585, + "grad_norm": 0.13864214718341827, + "learning_rate": 9.765171462262036e-06, + "loss": 1.0353, + "step": 13140 + }, + { + "epoch": 0.798408165745185, + "grad_norm": 5.499795436859131, + "learning_rate": 9.759488975452386e-06, + "loss": 1.063, + "step": 13141 + }, + { + "epoch": 0.7984689227778116, + "grad_norm": 0.11382286250591278, + "learning_rate": 9.753807963666478e-06, + "loss": 1.0494, + "step": 13142 + }, + { + "epoch": 0.7985296798104381, + "grad_norm": 0.1148020401597023, + "learning_rate": 9.748128427112502e-06, + "loss": 1.0672, + "step": 13143 + }, + { + "epoch": 0.7985904368430646, + "grad_norm": 0.12468332052230835, + "learning_rate": 9.742450365998663e-06, + "loss": 1.0281, + "step": 13144 + }, + { + "epoch": 0.7986511938756911, + "grad_norm": 0.1946880966424942, + "learning_rate": 9.736773780533081e-06, + "loss": 1.1204, + "step": 13145 + }, + { + "epoch": 0.7987119509083176, + "grad_norm": 0.27400967478752136, + "learning_rate": 9.731098670923832e-06, + "loss": 1.1401, + "step": 13146 + }, + { + "epoch": 0.7987727079409441, + "grad_norm": 0.2063048630952835, + "learning_rate": 9.725425037378954e-06, + "loss": 1.0165, + "step": 13147 + }, + { + "epoch": 0.7988334649735707, + "grad_norm": 0.16999012231826782, + "learning_rate": 9.71975288010641e-06, + "loss": 1.1012, + "step": 13148 + }, + { + "epoch": 0.7988942220061972, + "grad_norm": 0.13342589139938354, + "learning_rate": 9.714082199314118e-06, + "loss": 1.0646, + "step": 13149 + }, + { + "epoch": 0.7989549790388237, + "grad_norm": 0.11692260950803757, + "learning_rate": 9.708412995209921e-06, + "loss": 1.051, + "step": 13150 + }, + { + "epoch": 0.7990157360714503, + "grad_norm": 0.3315751552581787, + "learning_rate": 9.702745268001617e-06, + "loss": 1.2028, + "step": 13151 + }, + { + "epoch": 0.7990764931040768, + "grad_norm": 0.1433667093515396, + "learning_rate": 9.697079017896993e-06, + "loss": 1.0102, + "step": 13152 + }, + { + "epoch": 0.7991372501367033, + "grad_norm": 0.1572551429271698, + "learning_rate": 9.691414245103725e-06, + "loss": 1.1013, + "step": 13153 + }, + { + "epoch": 0.7991980071693299, + "grad_norm": 0.340774267911911, + "learning_rate": 9.685750949829458e-06, + "loss": 1.1231, + "step": 13154 + }, + { + "epoch": 0.7992587642019564, + "grad_norm": 0.1548185497522354, + "learning_rate": 9.6800891322818e-06, + "loss": 1.0659, + "step": 13155 + }, + { + "epoch": 0.7993195212345829, + "grad_norm": 0.15976443886756897, + "learning_rate": 9.674428792668238e-06, + "loss": 1.0074, + "step": 13156 + }, + { + "epoch": 0.7993802782672095, + "grad_norm": 0.13568075001239777, + "learning_rate": 9.668769931196297e-06, + "loss": 1.0429, + "step": 13157 + }, + { + "epoch": 0.799441035299836, + "grad_norm": 0.14354725182056427, + "learning_rate": 9.663112548073394e-06, + "loss": 1.0901, + "step": 13158 + }, + { + "epoch": 0.7995017923324624, + "grad_norm": 0.1283273547887802, + "learning_rate": 9.65745664350689e-06, + "loss": 1.0648, + "step": 13159 + }, + { + "epoch": 0.799562549365089, + "grad_norm": 0.2266400009393692, + "learning_rate": 9.65180221770412e-06, + "loss": 1.1014, + "step": 13160 + }, + { + "epoch": 0.7996233063977155, + "grad_norm": 0.11634045839309692, + "learning_rate": 9.646149270872334e-06, + "loss": 1.0607, + "step": 13161 + }, + { + "epoch": 0.799684063430342, + "grad_norm": 0.2146165668964386, + "learning_rate": 9.640497803218745e-06, + "loss": 1.0938, + "step": 13162 + }, + { + "epoch": 0.7997448204629686, + "grad_norm": 0.19099931418895721, + "learning_rate": 9.634847814950509e-06, + "loss": 1.152, + "step": 13163 + }, + { + "epoch": 0.7998055774955951, + "grad_norm": 0.11193753778934479, + "learning_rate": 9.629199306274733e-06, + "loss": 1.0193, + "step": 13164 + }, + { + "epoch": 0.7998663345282216, + "grad_norm": 0.16398470103740692, + "learning_rate": 9.623552277398461e-06, + "loss": 1.0807, + "step": 13165 + }, + { + "epoch": 0.7999270915608482, + "grad_norm": 15.153077125549316, + "learning_rate": 9.61790672852868e-06, + "loss": 1.0093, + "step": 13166 + }, + { + "epoch": 0.7999878485934747, + "grad_norm": 0.11747323721647263, + "learning_rate": 9.612262659872322e-06, + "loss": 1.0322, + "step": 13167 + }, + { + "epoch": 0.8000486056261013, + "grad_norm": 0.1298796683549881, + "learning_rate": 9.606620071636302e-06, + "loss": 1.0672, + "step": 13168 + }, + { + "epoch": 0.8001093626587278, + "grad_norm": 2.933661937713623, + "learning_rate": 9.600978964027435e-06, + "loss": 0.9854, + "step": 13169 + }, + { + "epoch": 0.8001701196913543, + "grad_norm": 5.170802116394043, + "learning_rate": 9.595339337252485e-06, + "loss": 1.0516, + "step": 13170 + }, + { + "epoch": 0.8002308767239809, + "grad_norm": 0.5858650803565979, + "learning_rate": 9.589701191518186e-06, + "loss": 1.4054, + "step": 13171 + }, + { + "epoch": 0.8002916337566073, + "grad_norm": 0.18030177056789398, + "learning_rate": 9.584064527031178e-06, + "loss": 1.0384, + "step": 13172 + }, + { + "epoch": 0.8003523907892338, + "grad_norm": 0.555079996585846, + "learning_rate": 9.578429343998119e-06, + "loss": 1.4273, + "step": 13173 + }, + { + "epoch": 0.8004131478218603, + "grad_norm": 0.1230132132768631, + "learning_rate": 9.572795642625537e-06, + "loss": 1.0499, + "step": 13174 + }, + { + "epoch": 0.8004739048544869, + "grad_norm": 0.1732676774263382, + "learning_rate": 9.567163423119968e-06, + "loss": 1.1516, + "step": 13175 + }, + { + "epoch": 0.8005346618871134, + "grad_norm": 0.2401278018951416, + "learning_rate": 9.561532685687818e-06, + "loss": 1.0172, + "step": 13176 + }, + { + "epoch": 0.80059541891974, + "grad_norm": 0.9617950320243835, + "learning_rate": 9.555903430535496e-06, + "loss": 1.1345, + "step": 13177 + }, + { + "epoch": 0.8006561759523665, + "grad_norm": 0.14101830124855042, + "learning_rate": 9.550275657869362e-06, + "loss": 1.0538, + "step": 13178 + }, + { + "epoch": 0.800716932984993, + "grad_norm": 0.12207241356372833, + "learning_rate": 9.544649367895692e-06, + "loss": 1.0581, + "step": 13179 + }, + { + "epoch": 0.8007776900176196, + "grad_norm": 0.19040468335151672, + "learning_rate": 9.53902456082072e-06, + "loss": 1.0202, + "step": 13180 + }, + { + "epoch": 0.8008384470502461, + "grad_norm": 1.8374100923538208, + "learning_rate": 9.533401236850626e-06, + "loss": 1.0007, + "step": 13181 + }, + { + "epoch": 0.8008992040828726, + "grad_norm": 0.1839904934167862, + "learning_rate": 9.527779396191528e-06, + "loss": 0.9872, + "step": 13182 + }, + { + "epoch": 0.8009599611154992, + "grad_norm": 0.20559729635715485, + "learning_rate": 9.5221590390495e-06, + "loss": 1.1505, + "step": 13183 + }, + { + "epoch": 0.8010207181481257, + "grad_norm": 0.15975527465343475, + "learning_rate": 9.516540165630556e-06, + "loss": 1.0522, + "step": 13184 + }, + { + "epoch": 0.8010814751807521, + "grad_norm": 1.3011716604232788, + "learning_rate": 9.510922776140663e-06, + "loss": 1.0767, + "step": 13185 + }, + { + "epoch": 0.8011422322133787, + "grad_norm": 0.29181694984436035, + "learning_rate": 9.505306870785713e-06, + "loss": 1.1833, + "step": 13186 + }, + { + "epoch": 0.8012029892460052, + "grad_norm": 0.13684216141700745, + "learning_rate": 9.499692449771574e-06, + "loss": 1.0307, + "step": 13187 + }, + { + "epoch": 0.8012637462786317, + "grad_norm": 1.0310965776443481, + "learning_rate": 9.494079513304021e-06, + "loss": 1.1975, + "step": 13188 + }, + { + "epoch": 0.8013245033112583, + "grad_norm": 0.11666496843099594, + "learning_rate": 9.48846806158884e-06, + "loss": 1.0423, + "step": 13189 + }, + { + "epoch": 0.8013852603438848, + "grad_norm": 0.10867491364479065, + "learning_rate": 9.482858094831681e-06, + "loss": 1.0596, + "step": 13190 + }, + { + "epoch": 0.8014460173765113, + "grad_norm": 0.24226537346839905, + "learning_rate": 9.477249613238186e-06, + "loss": 1.0954, + "step": 13191 + }, + { + "epoch": 0.8015067744091379, + "grad_norm": 0.32712113857269287, + "learning_rate": 9.471642617013948e-06, + "loss": 1.2661, + "step": 13192 + }, + { + "epoch": 0.8015675314417644, + "grad_norm": 0.14403632283210754, + "learning_rate": 9.466037106364462e-06, + "loss": 1.134, + "step": 13193 + }, + { + "epoch": 0.8016282884743909, + "grad_norm": 0.160531684756279, + "learning_rate": 9.46043308149524e-06, + "loss": 1.0509, + "step": 13194 + }, + { + "epoch": 0.8016890455070175, + "grad_norm": 0.15243718028068542, + "learning_rate": 9.454830542611686e-06, + "loss": 1.0402, + "step": 13195 + }, + { + "epoch": 0.801749802539644, + "grad_norm": 0.17986595630645752, + "learning_rate": 9.449229489919147e-06, + "loss": 1.0973, + "step": 13196 + }, + { + "epoch": 0.8018105595722705, + "grad_norm": 0.22678661346435547, + "learning_rate": 9.443629923622937e-06, + "loss": 1.0546, + "step": 13197 + }, + { + "epoch": 0.801871316604897, + "grad_norm": 0.22991134226322174, + "learning_rate": 9.438031843928302e-06, + "loss": 1.2182, + "step": 13198 + }, + { + "epoch": 0.8019320736375235, + "grad_norm": 0.1474834829568863, + "learning_rate": 9.43243525104046e-06, + "loss": 1.0439, + "step": 13199 + }, + { + "epoch": 0.80199283067015, + "grad_norm": 0.20641137659549713, + "learning_rate": 9.426840145164551e-06, + "loss": 1.1445, + "step": 13200 + }, + { + "epoch": 0.8020535877027766, + "grad_norm": 0.14994990825653076, + "learning_rate": 9.421246526505656e-06, + "loss": 1.0733, + "step": 13201 + }, + { + "epoch": 0.8021143447354031, + "grad_norm": 0.09064609557390213, + "learning_rate": 9.415654395268826e-06, + "loss": 1.0211, + "step": 13202 + }, + { + "epoch": 0.8021751017680296, + "grad_norm": 0.12877589464187622, + "learning_rate": 9.410063751658998e-06, + "loss": 1.0603, + "step": 13203 + }, + { + "epoch": 0.8022358588006562, + "grad_norm": 0.1695820689201355, + "learning_rate": 9.404474595881146e-06, + "loss": 1.0732, + "step": 13204 + }, + { + "epoch": 0.8022966158332827, + "grad_norm": 0.10930179059505463, + "learning_rate": 9.398886928140127e-06, + "loss": 1.0257, + "step": 13205 + }, + { + "epoch": 0.8023573728659092, + "grad_norm": 0.2766391634941101, + "learning_rate": 9.393300748640755e-06, + "loss": 1.0819, + "step": 13206 + }, + { + "epoch": 0.8024181298985358, + "grad_norm": 0.4591292440891266, + "learning_rate": 9.387716057587792e-06, + "loss": 1.2201, + "step": 13207 + }, + { + "epoch": 0.8024788869311623, + "grad_norm": 0.4646061062812805, + "learning_rate": 9.382132855185943e-06, + "loss": 1.245, + "step": 13208 + }, + { + "epoch": 0.8025396439637889, + "grad_norm": 0.13472218811511993, + "learning_rate": 9.376551141639872e-06, + "loss": 1.0377, + "step": 13209 + }, + { + "epoch": 0.8026004009964154, + "grad_norm": 0.11996764689683914, + "learning_rate": 9.370970917154165e-06, + "loss": 1.0544, + "step": 13210 + }, + { + "epoch": 0.8026611580290418, + "grad_norm": 0.17723794281482697, + "learning_rate": 9.36539218193338e-06, + "loss": 1.0522, + "step": 13211 + }, + { + "epoch": 0.8027219150616683, + "grad_norm": 0.18431003391742706, + "learning_rate": 9.359814936181992e-06, + "loss": 1.2172, + "step": 13212 + }, + { + "epoch": 0.8027826720942949, + "grad_norm": 0.188750758767128, + "learning_rate": 9.354239180104445e-06, + "loss": 1.0233, + "step": 13213 + }, + { + "epoch": 0.8028434291269214, + "grad_norm": 0.12524496018886566, + "learning_rate": 9.348664913905108e-06, + "loss": 1.0335, + "step": 13214 + }, + { + "epoch": 0.802904186159548, + "grad_norm": 0.12777726352214813, + "learning_rate": 9.343092137788329e-06, + "loss": 1.0841, + "step": 13215 + }, + { + "epoch": 0.8029649431921745, + "grad_norm": 0.09639745205640793, + "learning_rate": 9.337520851958376e-06, + "loss": 1.0334, + "step": 13216 + }, + { + "epoch": 0.803025700224801, + "grad_norm": 0.2233753651380539, + "learning_rate": 9.331951056619442e-06, + "loss": 1.1472, + "step": 13217 + }, + { + "epoch": 0.8030864572574276, + "grad_norm": 0.16895906627178192, + "learning_rate": 9.326382751975693e-06, + "loss": 1.0756, + "step": 13218 + }, + { + "epoch": 0.8031472142900541, + "grad_norm": 0.16784971952438354, + "learning_rate": 9.320815938231258e-06, + "loss": 1.076, + "step": 13219 + }, + { + "epoch": 0.8032079713226806, + "grad_norm": 4.107651710510254, + "learning_rate": 9.315250615590182e-06, + "loss": 1.1153, + "step": 13220 + }, + { + "epoch": 0.8032687283553072, + "grad_norm": 0.10829100012779236, + "learning_rate": 9.309686784256456e-06, + "loss": 1.049, + "step": 13221 + }, + { + "epoch": 0.8033294853879337, + "grad_norm": 0.10990607738494873, + "learning_rate": 9.304124444434037e-06, + "loss": 1.0091, + "step": 13222 + }, + { + "epoch": 0.8033902424205602, + "grad_norm": 0.1258353441953659, + "learning_rate": 9.298563596326777e-06, + "loss": 1.0488, + "step": 13223 + }, + { + "epoch": 0.8034509994531867, + "grad_norm": 0.1442100703716278, + "learning_rate": 9.293004240138548e-06, + "loss": 1.082, + "step": 13224 + }, + { + "epoch": 0.8035117564858132, + "grad_norm": 0.160639688372612, + "learning_rate": 9.287446376073123e-06, + "loss": 1.1263, + "step": 13225 + }, + { + "epoch": 0.8035725135184397, + "grad_norm": 0.32805612683296204, + "learning_rate": 9.281890004334214e-06, + "loss": 1.1557, + "step": 13226 + }, + { + "epoch": 0.8036332705510663, + "grad_norm": 0.1491694450378418, + "learning_rate": 9.276335125125501e-06, + "loss": 1.123, + "step": 13227 + }, + { + "epoch": 0.8036940275836928, + "grad_norm": 0.11665769666433334, + "learning_rate": 9.270781738650597e-06, + "loss": 1.0358, + "step": 13228 + }, + { + "epoch": 0.8037547846163193, + "grad_norm": 0.1171853318810463, + "learning_rate": 9.26522984511306e-06, + "loss": 1.0452, + "step": 13229 + }, + { + "epoch": 0.8038155416489459, + "grad_norm": 0.15451809763908386, + "learning_rate": 9.259679444716397e-06, + "loss": 1.0003, + "step": 13230 + }, + { + "epoch": 0.8038762986815724, + "grad_norm": 0.10360805690288544, + "learning_rate": 9.254130537664057e-06, + "loss": 1.0297, + "step": 13231 + }, + { + "epoch": 0.8039370557141989, + "grad_norm": 0.15688200294971466, + "learning_rate": 9.248583124159438e-06, + "loss": 1.0976, + "step": 13232 + }, + { + "epoch": 0.8039978127468255, + "grad_norm": 0.6644073128700256, + "learning_rate": 9.24303720440588e-06, + "loss": 1.0816, + "step": 13233 + }, + { + "epoch": 0.804058569779452, + "grad_norm": 0.10312240570783615, + "learning_rate": 9.237492778606665e-06, + "loss": 1.0075, + "step": 13234 + }, + { + "epoch": 0.8041193268120785, + "grad_norm": 0.12448840588331223, + "learning_rate": 9.231949846965038e-06, + "loss": 1.0182, + "step": 13235 + }, + { + "epoch": 0.8041800838447051, + "grad_norm": 0.12514646351337433, + "learning_rate": 9.226408409684189e-06, + "loss": 1.0028, + "step": 13236 + }, + { + "epoch": 0.8042408408773315, + "grad_norm": 18.818635940551758, + "learning_rate": 9.220868466967203e-06, + "loss": 1.1145, + "step": 13237 + }, + { + "epoch": 0.804301597909958, + "grad_norm": 0.1440465897321701, + "learning_rate": 9.215330019017165e-06, + "loss": 1.0281, + "step": 13238 + }, + { + "epoch": 0.8043623549425846, + "grad_norm": 0.4489496648311615, + "learning_rate": 9.209793066037081e-06, + "loss": 1.2157, + "step": 13239 + }, + { + "epoch": 0.8044231119752111, + "grad_norm": 0.17046141624450684, + "learning_rate": 9.204257608229928e-06, + "loss": 1.0976, + "step": 13240 + }, + { + "epoch": 0.8044838690078376, + "grad_norm": 2.879633665084839, + "learning_rate": 9.198723645798595e-06, + "loss": 1.0512, + "step": 13241 + }, + { + "epoch": 0.8045446260404642, + "grad_norm": 0.11650007218122482, + "learning_rate": 9.193191178945948e-06, + "loss": 0.9942, + "step": 13242 + }, + { + "epoch": 0.8046053830730907, + "grad_norm": 0.14756543934345245, + "learning_rate": 9.187660207874754e-06, + "loss": 1.1114, + "step": 13243 + }, + { + "epoch": 0.8046661401057172, + "grad_norm": 0.19879499077796936, + "learning_rate": 9.182130732787747e-06, + "loss": 0.9994, + "step": 13244 + }, + { + "epoch": 0.8047268971383438, + "grad_norm": 6.629814147949219, + "learning_rate": 9.176602753887648e-06, + "loss": 1.0806, + "step": 13245 + }, + { + "epoch": 0.8047876541709703, + "grad_norm": 0.2768288254737854, + "learning_rate": 9.171076271377061e-06, + "loss": 1.1867, + "step": 13246 + }, + { + "epoch": 0.8048484112035968, + "grad_norm": 0.1433205008506775, + "learning_rate": 9.165551285458562e-06, + "loss": 1.0882, + "step": 13247 + }, + { + "epoch": 0.8049091682362234, + "grad_norm": 0.133346289396286, + "learning_rate": 9.16002779633468e-06, + "loss": 1.0438, + "step": 13248 + }, + { + "epoch": 0.8049699252688499, + "grad_norm": 0.1338362991809845, + "learning_rate": 9.154505804207875e-06, + "loss": 1.0284, + "step": 13249 + }, + { + "epoch": 0.8050306823014763, + "grad_norm": 0.15715482831001282, + "learning_rate": 9.14898530928055e-06, + "loss": 1.0422, + "step": 13250 + }, + { + "epoch": 0.8050914393341029, + "grad_norm": 0.13264602422714233, + "learning_rate": 9.143466311755067e-06, + "loss": 0.984, + "step": 13251 + }, + { + "epoch": 0.8051521963667294, + "grad_norm": 0.1752057522535324, + "learning_rate": 9.137948811833718e-06, + "loss": 1.1425, + "step": 13252 + }, + { + "epoch": 0.805212953399356, + "grad_norm": 0.14663147926330566, + "learning_rate": 9.13243280971876e-06, + "loss": 1.0541, + "step": 13253 + }, + { + "epoch": 0.8052737104319825, + "grad_norm": 0.1315969079732895, + "learning_rate": 9.126918305612376e-06, + "loss": 1.0325, + "step": 13254 + }, + { + "epoch": 0.805334467464609, + "grad_norm": 0.18285177648067474, + "learning_rate": 9.12140529971669e-06, + "loss": 1.1071, + "step": 13255 + }, + { + "epoch": 0.8053952244972356, + "grad_norm": 0.13508103787899017, + "learning_rate": 9.11589379223382e-06, + "loss": 1.0691, + "step": 13256 + }, + { + "epoch": 0.8054559815298621, + "grad_norm": 0.10481157153844833, + "learning_rate": 9.11038378336575e-06, + "loss": 1.0188, + "step": 13257 + }, + { + "epoch": 0.8055167385624886, + "grad_norm": 0.15566766262054443, + "learning_rate": 9.104875273314472e-06, + "loss": 1.0761, + "step": 13258 + }, + { + "epoch": 0.8055774955951152, + "grad_norm": 0.1830064058303833, + "learning_rate": 9.099368262281887e-06, + "loss": 1.0461, + "step": 13259 + }, + { + "epoch": 0.8056382526277417, + "grad_norm": 0.21882294118404388, + "learning_rate": 9.093862750469862e-06, + "loss": 1.0922, + "step": 13260 + }, + { + "epoch": 0.8056990096603682, + "grad_norm": 0.14830783009529114, + "learning_rate": 9.08835873808021e-06, + "loss": 1.052, + "step": 13261 + }, + { + "epoch": 0.8057597666929948, + "grad_norm": 0.09598112851381302, + "learning_rate": 9.08285622531469e-06, + "loss": 1.0082, + "step": 13262 + }, + { + "epoch": 0.8058205237256213, + "grad_norm": 0.14668449759483337, + "learning_rate": 9.077355212374972e-06, + "loss": 1.1288, + "step": 13263 + }, + { + "epoch": 0.8058812807582477, + "grad_norm": 0.18051615357398987, + "learning_rate": 9.071855699462712e-06, + "loss": 1.0731, + "step": 13264 + }, + { + "epoch": 0.8059420377908743, + "grad_norm": 0.1578369289636612, + "learning_rate": 9.066357686779475e-06, + "loss": 1.1266, + "step": 13265 + }, + { + "epoch": 0.8060027948235008, + "grad_norm": 0.13977807760238647, + "learning_rate": 9.060861174526824e-06, + "loss": 1.0327, + "step": 13266 + }, + { + "epoch": 0.8060635518561273, + "grad_norm": 0.21295364201068878, + "learning_rate": 9.05536616290622e-06, + "loss": 1.1553, + "step": 13267 + }, + { + "epoch": 0.8061243088887539, + "grad_norm": 0.13446098566055298, + "learning_rate": 9.049872652119085e-06, + "loss": 1.0259, + "step": 13268 + }, + { + "epoch": 0.8061850659213804, + "grad_norm": 0.41465988755226135, + "learning_rate": 9.044380642366795e-06, + "loss": 1.139, + "step": 13269 + }, + { + "epoch": 0.8062458229540069, + "grad_norm": 0.16579413414001465, + "learning_rate": 9.038890133850625e-06, + "loss": 1.0413, + "step": 13270 + }, + { + "epoch": 0.8063065799866335, + "grad_norm": 0.20012642443180084, + "learning_rate": 9.033401126771867e-06, + "loss": 1.0729, + "step": 13271 + }, + { + "epoch": 0.80636733701926, + "grad_norm": 0.1496468335390091, + "learning_rate": 9.027913621331707e-06, + "loss": 1.1078, + "step": 13272 + }, + { + "epoch": 0.8064280940518865, + "grad_norm": 0.13976065814495087, + "learning_rate": 9.022427617731293e-06, + "loss": 1.0792, + "step": 13273 + }, + { + "epoch": 0.8064888510845131, + "grad_norm": 0.1137533038854599, + "learning_rate": 9.016943116171722e-06, + "loss": 1.0368, + "step": 13274 + }, + { + "epoch": 0.8065496081171396, + "grad_norm": 0.11362741887569427, + "learning_rate": 9.011460116854015e-06, + "loss": 1.0346, + "step": 13275 + }, + { + "epoch": 0.8066103651497661, + "grad_norm": 0.14698021113872528, + "learning_rate": 9.005978619979167e-06, + "loss": 1.0043, + "step": 13276 + }, + { + "epoch": 0.8066711221823926, + "grad_norm": 0.1776665300130844, + "learning_rate": 9.000498625748088e-06, + "loss": 1.0662, + "step": 13277 + }, + { + "epoch": 0.8067318792150191, + "grad_norm": 0.17688356339931488, + "learning_rate": 8.995020134361664e-06, + "loss": 1.1296, + "step": 13278 + }, + { + "epoch": 0.8067926362476456, + "grad_norm": 0.14069749414920807, + "learning_rate": 8.989543146020701e-06, + "loss": 1.0873, + "step": 13279 + }, + { + "epoch": 0.8068533932802722, + "grad_norm": 0.38233307003974915, + "learning_rate": 8.98406766092596e-06, + "loss": 1.2014, + "step": 13280 + }, + { + "epoch": 0.8069141503128987, + "grad_norm": 0.13473199307918549, + "learning_rate": 8.978593679278136e-06, + "loss": 1.0326, + "step": 13281 + }, + { + "epoch": 0.8069749073455252, + "grad_norm": 0.1711856573820114, + "learning_rate": 8.973121201277901e-06, + "loss": 1.1841, + "step": 13282 + }, + { + "epoch": 0.8070356643781518, + "grad_norm": 0.09955224394798279, + "learning_rate": 8.967650227125856e-06, + "loss": 1.0142, + "step": 13283 + }, + { + "epoch": 0.8070964214107783, + "grad_norm": 0.2441820651292801, + "learning_rate": 8.962180757022504e-06, + "loss": 1.1502, + "step": 13284 + }, + { + "epoch": 0.8071571784434048, + "grad_norm": 0.13988153636455536, + "learning_rate": 8.956712791168354e-06, + "loss": 1.0682, + "step": 13285 + }, + { + "epoch": 0.8072179354760314, + "grad_norm": 0.11172053217887878, + "learning_rate": 8.951246329763813e-06, + "loss": 1.0601, + "step": 13286 + }, + { + "epoch": 0.8072786925086579, + "grad_norm": 0.12162996828556061, + "learning_rate": 8.945781373009288e-06, + "loss": 1.0532, + "step": 13287 + }, + { + "epoch": 0.8073394495412844, + "grad_norm": 0.1264953315258026, + "learning_rate": 8.940317921105085e-06, + "loss": 1.1064, + "step": 13288 + }, + { + "epoch": 0.807400206573911, + "grad_norm": 0.21767602860927582, + "learning_rate": 8.934855974251472e-06, + "loss": 1.0879, + "step": 13289 + }, + { + "epoch": 0.8074609636065374, + "grad_norm": 0.1758887618780136, + "learning_rate": 8.929395532648643e-06, + "loss": 1.1053, + "step": 13290 + }, + { + "epoch": 0.8075217206391639, + "grad_norm": 0.13397012650966644, + "learning_rate": 8.923936596496745e-06, + "loss": 1.0486, + "step": 13291 + }, + { + "epoch": 0.8075824776717905, + "grad_norm": 0.21539229154586792, + "learning_rate": 8.918479165995901e-06, + "loss": 1.0518, + "step": 13292 + }, + { + "epoch": 0.807643234704417, + "grad_norm": 0.13243921101093292, + "learning_rate": 8.913023241346147e-06, + "loss": 1.155, + "step": 13293 + }, + { + "epoch": 0.8077039917370435, + "grad_norm": 0.18805481493473053, + "learning_rate": 8.907568822747469e-06, + "loss": 1.0687, + "step": 13294 + }, + { + "epoch": 0.8077647487696701, + "grad_norm": 0.1260208636522293, + "learning_rate": 8.902115910399794e-06, + "loss": 1.1178, + "step": 13295 + }, + { + "epoch": 0.8078255058022966, + "grad_norm": 0.1716604381799698, + "learning_rate": 8.89666450450301e-06, + "loss": 1.0875, + "step": 13296 + }, + { + "epoch": 0.8078862628349232, + "grad_norm": 0.1804407685995102, + "learning_rate": 8.89121460525693e-06, + "loss": 1.0911, + "step": 13297 + }, + { + "epoch": 0.8079470198675497, + "grad_norm": 0.18404801189899445, + "learning_rate": 8.885766212861329e-06, + "loss": 1.0872, + "step": 13298 + }, + { + "epoch": 0.8080077769001762, + "grad_norm": 0.1195506826043129, + "learning_rate": 8.880319327515913e-06, + "loss": 1.0159, + "step": 13299 + }, + { + "epoch": 0.8080685339328028, + "grad_norm": 0.13083086907863617, + "learning_rate": 8.874873949420336e-06, + "loss": 1.0683, + "step": 13300 + }, + { + "epoch": 0.8081292909654293, + "grad_norm": 0.45422470569610596, + "learning_rate": 8.869430078774205e-06, + "loss": 1.1042, + "step": 13301 + }, + { + "epoch": 0.8081900479980558, + "grad_norm": 0.0996997058391571, + "learning_rate": 8.863987715777055e-06, + "loss": 1.0408, + "step": 13302 + }, + { + "epoch": 0.8082508050306823, + "grad_norm": 0.1602700650691986, + "learning_rate": 8.858546860628415e-06, + "loss": 1.0789, + "step": 13303 + }, + { + "epoch": 0.8083115620633088, + "grad_norm": 0.7398463487625122, + "learning_rate": 8.853107513527676e-06, + "loss": 1.0957, + "step": 13304 + }, + { + "epoch": 0.8083723190959353, + "grad_norm": 0.18652676045894623, + "learning_rate": 8.84766967467423e-06, + "loss": 1.073, + "step": 13305 + }, + { + "epoch": 0.8084330761285619, + "grad_norm": 0.13182881474494934, + "learning_rate": 8.842233344267414e-06, + "loss": 1.092, + "step": 13306 + }, + { + "epoch": 0.8084938331611884, + "grad_norm": 0.09149140119552612, + "learning_rate": 8.836798522506473e-06, + "loss": 1.035, + "step": 13307 + }, + { + "epoch": 0.8085545901938149, + "grad_norm": 0.1158941239118576, + "learning_rate": 8.831365209590653e-06, + "loss": 1.019, + "step": 13308 + }, + { + "epoch": 0.8086153472264415, + "grad_norm": 0.16232800483703613, + "learning_rate": 8.825933405719112e-06, + "loss": 1.0799, + "step": 13309 + }, + { + "epoch": 0.808676104259068, + "grad_norm": 0.3569292426109314, + "learning_rate": 8.820503111090927e-06, + "loss": 1.0566, + "step": 13310 + }, + { + "epoch": 0.8087368612916945, + "grad_norm": 0.2154925912618637, + "learning_rate": 8.815074325905147e-06, + "loss": 1.034, + "step": 13311 + }, + { + "epoch": 0.8087976183243211, + "grad_norm": 0.12039048224687576, + "learning_rate": 8.809647050360793e-06, + "loss": 1.0018, + "step": 13312 + }, + { + "epoch": 0.8088583753569476, + "grad_norm": 0.1591581404209137, + "learning_rate": 8.804221284656788e-06, + "loss": 1.0799, + "step": 13313 + }, + { + "epoch": 0.8089191323895741, + "grad_norm": 0.13909871876239777, + "learning_rate": 8.798797028992018e-06, + "loss": 1.0332, + "step": 13314 + }, + { + "epoch": 0.8089798894222007, + "grad_norm": 0.13049043715000153, + "learning_rate": 8.793374283565303e-06, + "loss": 1.0218, + "step": 13315 + }, + { + "epoch": 0.8090406464548271, + "grad_norm": 0.20883135497570038, + "learning_rate": 8.787953048575414e-06, + "loss": 1.1019, + "step": 13316 + }, + { + "epoch": 0.8091014034874536, + "grad_norm": 0.18288029730319977, + "learning_rate": 8.782533324221081e-06, + "loss": 1.2101, + "step": 13317 + }, + { + "epoch": 0.8091621605200802, + "grad_norm": 0.1941082775592804, + "learning_rate": 8.77711511070095e-06, + "loss": 1.1775, + "step": 13318 + }, + { + "epoch": 0.8092229175527067, + "grad_norm": 0.11101412773132324, + "learning_rate": 8.771698408213635e-06, + "loss": 1.0599, + "step": 13319 + }, + { + "epoch": 0.8092836745853332, + "grad_norm": 0.11275576055049896, + "learning_rate": 8.766283216957689e-06, + "loss": 1.0599, + "step": 13320 + }, + { + "epoch": 0.8093444316179598, + "grad_norm": 0.18482142686843872, + "learning_rate": 8.760869537131594e-06, + "loss": 1.033, + "step": 13321 + }, + { + "epoch": 0.8094051886505863, + "grad_norm": 0.2128552347421646, + "learning_rate": 8.755457368933801e-06, + "loss": 1.1556, + "step": 13322 + }, + { + "epoch": 0.8094659456832128, + "grad_norm": 0.15571439266204834, + "learning_rate": 8.750046712562693e-06, + "loss": 1.1257, + "step": 13323 + }, + { + "epoch": 0.8095267027158394, + "grad_norm": 0.10723083466291428, + "learning_rate": 8.74463756821659e-06, + "loss": 1.0042, + "step": 13324 + }, + { + "epoch": 0.8095874597484659, + "grad_norm": 0.2939596176147461, + "learning_rate": 8.73922993609378e-06, + "loss": 1.1041, + "step": 13325 + }, + { + "epoch": 0.8096482167810924, + "grad_norm": 0.14527249336242676, + "learning_rate": 8.733823816392466e-06, + "loss": 1.0108, + "step": 13326 + }, + { + "epoch": 0.809708973813719, + "grad_norm": 0.09700893610715866, + "learning_rate": 8.7284192093108e-06, + "loss": 1.0229, + "step": 13327 + }, + { + "epoch": 0.8097697308463455, + "grad_norm": 0.10777320712804794, + "learning_rate": 8.723016115046928e-06, + "loss": 1.025, + "step": 13328 + }, + { + "epoch": 0.8098304878789719, + "grad_norm": 0.15057148039340973, + "learning_rate": 8.717614533798885e-06, + "loss": 1.1323, + "step": 13329 + }, + { + "epoch": 0.8098912449115985, + "grad_norm": 0.09742692112922668, + "learning_rate": 8.71221446576465e-06, + "loss": 1.0654, + "step": 13330 + }, + { + "epoch": 0.809952001944225, + "grad_norm": 0.16451206803321838, + "learning_rate": 8.706815911142175e-06, + "loss": 1.0946, + "step": 13331 + }, + { + "epoch": 0.8100127589768515, + "grad_norm": 0.14277468621730804, + "learning_rate": 8.70141887012933e-06, + "loss": 1.0022, + "step": 13332 + }, + { + "epoch": 0.8100735160094781, + "grad_norm": 0.11534839868545532, + "learning_rate": 8.696023342923976e-06, + "loss": 0.9816, + "step": 13333 + }, + { + "epoch": 0.8101342730421046, + "grad_norm": 0.13074979186058044, + "learning_rate": 8.690629329723865e-06, + "loss": 1.073, + "step": 13334 + }, + { + "epoch": 0.8101950300747311, + "grad_norm": 0.1668655276298523, + "learning_rate": 8.685236830726729e-06, + "loss": 1.1067, + "step": 13335 + }, + { + "epoch": 0.8102557871073577, + "grad_norm": 0.27570295333862305, + "learning_rate": 8.679845846130225e-06, + "loss": 1.0884, + "step": 13336 + }, + { + "epoch": 0.8103165441399842, + "grad_norm": 0.1755102276802063, + "learning_rate": 8.67445637613194e-06, + "loss": 1.1081, + "step": 13337 + }, + { + "epoch": 0.8103773011726108, + "grad_norm": 0.14132289588451385, + "learning_rate": 8.669068420929454e-06, + "loss": 1.087, + "step": 13338 + }, + { + "epoch": 0.8104380582052373, + "grad_norm": 0.1467886120080948, + "learning_rate": 8.663681980720257e-06, + "loss": 1.0114, + "step": 13339 + }, + { + "epoch": 0.8104988152378638, + "grad_norm": 0.10594131797552109, + "learning_rate": 8.658297055701786e-06, + "loss": 1.0128, + "step": 13340 + }, + { + "epoch": 0.8105595722704904, + "grad_norm": 0.11226150393486023, + "learning_rate": 8.652913646071425e-06, + "loss": 1.0077, + "step": 13341 + }, + { + "epoch": 0.8106203293031168, + "grad_norm": 0.17683349549770355, + "learning_rate": 8.647531752026511e-06, + "loss": 1.1241, + "step": 13342 + }, + { + "epoch": 0.8106810863357433, + "grad_norm": 0.1497465819120407, + "learning_rate": 8.642151373764312e-06, + "loss": 1.1421, + "step": 13343 + }, + { + "epoch": 0.8107418433683699, + "grad_norm": 0.11562196165323257, + "learning_rate": 8.63677251148205e-06, + "loss": 1.0636, + "step": 13344 + }, + { + "epoch": 0.8108026004009964, + "grad_norm": 0.1849144995212555, + "learning_rate": 8.631395165376888e-06, + "loss": 1.1321, + "step": 13345 + }, + { + "epoch": 0.8108633574336229, + "grad_norm": 0.21299001574516296, + "learning_rate": 8.62601933564593e-06, + "loss": 1.2577, + "step": 13346 + }, + { + "epoch": 0.8109241144662495, + "grad_norm": 0.17349374294281006, + "learning_rate": 8.62064502248623e-06, + "loss": 1.1222, + "step": 13347 + }, + { + "epoch": 0.810984871498876, + "grad_norm": 0.565649688243866, + "learning_rate": 8.615272226094773e-06, + "loss": 1.0232, + "step": 13348 + }, + { + "epoch": 0.8110456285315025, + "grad_norm": 0.5099070072174072, + "learning_rate": 8.609900946668537e-06, + "loss": 1.1207, + "step": 13349 + }, + { + "epoch": 0.8111063855641291, + "grad_norm": 0.11480657011270523, + "learning_rate": 8.604531184404368e-06, + "loss": 1.0016, + "step": 13350 + }, + { + "epoch": 0.8111671425967556, + "grad_norm": 0.14679767191410065, + "learning_rate": 8.599162939499111e-06, + "loss": 1.002, + "step": 13351 + }, + { + "epoch": 0.8112278996293821, + "grad_norm": 0.16588066518306732, + "learning_rate": 8.59379621214954e-06, + "loss": 1.0356, + "step": 13352 + }, + { + "epoch": 0.8112886566620087, + "grad_norm": 0.1046774610877037, + "learning_rate": 8.588431002552361e-06, + "loss": 1.0527, + "step": 13353 + }, + { + "epoch": 0.8113494136946352, + "grad_norm": 0.1544579565525055, + "learning_rate": 8.583067310904264e-06, + "loss": 1.1128, + "step": 13354 + }, + { + "epoch": 0.8114101707272616, + "grad_norm": 0.10491519421339035, + "learning_rate": 8.577705137401837e-06, + "loss": 1.0483, + "step": 13355 + }, + { + "epoch": 0.8114709277598882, + "grad_norm": 0.1678817868232727, + "learning_rate": 8.572344482241645e-06, + "loss": 1.0977, + "step": 13356 + }, + { + "epoch": 0.8115316847925147, + "grad_norm": 0.13530850410461426, + "learning_rate": 8.56698534562016e-06, + "loss": 1.0349, + "step": 13357 + }, + { + "epoch": 0.8115924418251412, + "grad_norm": 0.4289415776729584, + "learning_rate": 8.561627727733828e-06, + "loss": 1.1378, + "step": 13358 + }, + { + "epoch": 0.8116531988577678, + "grad_norm": 0.19829007983207703, + "learning_rate": 8.556271628779056e-06, + "loss": 1.0593, + "step": 13359 + }, + { + "epoch": 0.8117139558903943, + "grad_norm": 0.23053620755672455, + "learning_rate": 8.550917048952151e-06, + "loss": 1.202, + "step": 13360 + }, + { + "epoch": 0.8117747129230208, + "grad_norm": 0.2767316401004791, + "learning_rate": 8.545563988449401e-06, + "loss": 1.1134, + "step": 13361 + }, + { + "epoch": 0.8118354699556474, + "grad_norm": 0.14083430171012878, + "learning_rate": 8.54021244746701e-06, + "loss": 1.2361, + "step": 13362 + }, + { + "epoch": 0.8118962269882739, + "grad_norm": 0.14965178072452545, + "learning_rate": 8.534862426201151e-06, + "loss": 1.0433, + "step": 13363 + }, + { + "epoch": 0.8119569840209004, + "grad_norm": 0.12796726822853088, + "learning_rate": 8.52951392484792e-06, + "loss": 1.0932, + "step": 13364 + }, + { + "epoch": 0.812017741053527, + "grad_norm": 0.11587242782115936, + "learning_rate": 8.524166943603373e-06, + "loss": 1.046, + "step": 13365 + }, + { + "epoch": 0.8120784980861535, + "grad_norm": 0.1596328616142273, + "learning_rate": 8.518821482663508e-06, + "loss": 1.0971, + "step": 13366 + }, + { + "epoch": 0.81213925511878, + "grad_norm": 0.11747420579195023, + "learning_rate": 8.513477542224258e-06, + "loss": 0.9845, + "step": 13367 + }, + { + "epoch": 0.8122000121514066, + "grad_norm": 0.2287248969078064, + "learning_rate": 8.508135122481503e-06, + "loss": 1.073, + "step": 13368 + }, + { + "epoch": 0.812260769184033, + "grad_norm": 0.20100581645965576, + "learning_rate": 8.502794223631067e-06, + "loss": 1.0521, + "step": 13369 + }, + { + "epoch": 0.8123215262166595, + "grad_norm": 0.13511161506175995, + "learning_rate": 8.497454845868752e-06, + "loss": 1.0252, + "step": 13370 + }, + { + "epoch": 0.8123822832492861, + "grad_norm": 0.1565893143415451, + "learning_rate": 8.492116989390243e-06, + "loss": 1.042, + "step": 13371 + }, + { + "epoch": 0.8124430402819126, + "grad_norm": 11.986188888549805, + "learning_rate": 8.486780654391207e-06, + "loss": 1.0436, + "step": 13372 + }, + { + "epoch": 0.8125037973145391, + "grad_norm": 0.310553640127182, + "learning_rate": 8.481445841067253e-06, + "loss": 1.0624, + "step": 13373 + }, + { + "epoch": 0.8125645543471657, + "grad_norm": 0.15974737703800201, + "learning_rate": 8.476112549613912e-06, + "loss": 1.0767, + "step": 13374 + }, + { + "epoch": 0.8126253113797922, + "grad_norm": 0.11792033910751343, + "learning_rate": 8.470780780226705e-06, + "loss": 1.0487, + "step": 13375 + }, + { + "epoch": 0.8126860684124187, + "grad_norm": 0.114162877202034, + "learning_rate": 8.465450533101072e-06, + "loss": 1.1031, + "step": 13376 + }, + { + "epoch": 0.8127468254450453, + "grad_norm": 0.5604024529457092, + "learning_rate": 8.460121808432358e-06, + "loss": 1.0376, + "step": 13377 + }, + { + "epoch": 0.8128075824776718, + "grad_norm": 2.486450433731079, + "learning_rate": 8.454794606415911e-06, + "loss": 1.0815, + "step": 13378 + }, + { + "epoch": 0.8128683395102984, + "grad_norm": 0.48364776372909546, + "learning_rate": 8.44946892724699e-06, + "loss": 1.0377, + "step": 13379 + }, + { + "epoch": 0.8129290965429249, + "grad_norm": 0.177617147564888, + "learning_rate": 8.444144771120826e-06, + "loss": 1.1836, + "step": 13380 + }, + { + "epoch": 0.8129898535755514, + "grad_norm": 0.2059275209903717, + "learning_rate": 8.438822138232572e-06, + "loss": 1.1152, + "step": 13381 + }, + { + "epoch": 0.8130506106081778, + "grad_norm": 0.11329291760921478, + "learning_rate": 8.433501028777318e-06, + "loss": 1.0223, + "step": 13382 + }, + { + "epoch": 0.8131113676408044, + "grad_norm": 0.11364471912384033, + "learning_rate": 8.42818144295014e-06, + "loss": 1.0489, + "step": 13383 + }, + { + "epoch": 0.8131721246734309, + "grad_norm": 0.10933076590299606, + "learning_rate": 8.422863380945977e-06, + "loss": 1.0082, + "step": 13384 + }, + { + "epoch": 0.8132328817060575, + "grad_norm": 0.20043650269508362, + "learning_rate": 8.417546842959806e-06, + "loss": 1.1137, + "step": 13385 + }, + { + "epoch": 0.813293638738684, + "grad_norm": 0.2203894853591919, + "learning_rate": 8.412231829186495e-06, + "loss": 1.1604, + "step": 13386 + }, + { + "epoch": 0.8133543957713105, + "grad_norm": 0.11832256615161896, + "learning_rate": 8.406918339820864e-06, + "loss": 1.0116, + "step": 13387 + }, + { + "epoch": 0.8134151528039371, + "grad_norm": 0.1803184449672699, + "learning_rate": 8.40160637505768e-06, + "loss": 1.1081, + "step": 13388 + }, + { + "epoch": 0.8134759098365636, + "grad_norm": 0.5034940838813782, + "learning_rate": 8.396295935091659e-06, + "loss": 0.9652, + "step": 13389 + }, + { + "epoch": 0.8135366668691901, + "grad_norm": 0.12402615696191788, + "learning_rate": 8.390987020117447e-06, + "loss": 1.0652, + "step": 13390 + }, + { + "epoch": 0.8135974239018167, + "grad_norm": 0.1698220670223236, + "learning_rate": 8.385679630329652e-06, + "loss": 0.9991, + "step": 13391 + }, + { + "epoch": 0.8136581809344432, + "grad_norm": 0.09587369859218597, + "learning_rate": 8.380373765922812e-06, + "loss": 1.0545, + "step": 13392 + }, + { + "epoch": 0.8137189379670697, + "grad_norm": 0.11561412364244461, + "learning_rate": 8.37506942709142e-06, + "loss": 1.0178, + "step": 13393 + }, + { + "epoch": 0.8137796949996963, + "grad_norm": 0.36636975407600403, + "learning_rate": 8.3697666140299e-06, + "loss": 1.1586, + "step": 13394 + }, + { + "epoch": 0.8138404520323227, + "grad_norm": 0.2472551465034485, + "learning_rate": 8.364465326932619e-06, + "loss": 1.0352, + "step": 13395 + }, + { + "epoch": 0.8139012090649492, + "grad_norm": 0.11036510765552521, + "learning_rate": 8.359165565993938e-06, + "loss": 1.0487, + "step": 13396 + }, + { + "epoch": 0.8139619660975758, + "grad_norm": 0.19025571644306183, + "learning_rate": 8.353867331408078e-06, + "loss": 1.0521, + "step": 13397 + }, + { + "epoch": 0.8140227231302023, + "grad_norm": 0.2817869484424591, + "learning_rate": 8.348570623369262e-06, + "loss": 1.1052, + "step": 13398 + }, + { + "epoch": 0.8140834801628288, + "grad_norm": 0.17201749980449677, + "learning_rate": 8.34327544207164e-06, + "loss": 1.072, + "step": 13399 + }, + { + "epoch": 0.8141442371954554, + "grad_norm": 0.12416765838861465, + "learning_rate": 8.337981787709297e-06, + "loss": 1.0651, + "step": 13400 + }, + { + "epoch": 0.8142049942280819, + "grad_norm": 0.1194167211651802, + "learning_rate": 8.332689660476294e-06, + "loss": 1.0725, + "step": 13401 + }, + { + "epoch": 0.8142657512607084, + "grad_norm": 0.16337068378925323, + "learning_rate": 8.32739906056661e-06, + "loss": 1.144, + "step": 13402 + }, + { + "epoch": 0.814326508293335, + "grad_norm": 0.14371144771575928, + "learning_rate": 8.322109988174177e-06, + "loss": 1.0232, + "step": 13403 + }, + { + "epoch": 0.8143872653259615, + "grad_norm": 0.14053042232990265, + "learning_rate": 8.31682244349285e-06, + "loss": 0.9842, + "step": 13404 + }, + { + "epoch": 0.814448022358588, + "grad_norm": 0.2046688050031662, + "learning_rate": 8.311536426716437e-06, + "loss": 1.0372, + "step": 13405 + }, + { + "epoch": 0.8145087793912146, + "grad_norm": 0.1138419359922409, + "learning_rate": 8.306251938038728e-06, + "loss": 1.1145, + "step": 13406 + }, + { + "epoch": 0.8145695364238411, + "grad_norm": 0.12448260933160782, + "learning_rate": 8.300968977653417e-06, + "loss": 1.0261, + "step": 13407 + }, + { + "epoch": 0.8146302934564675, + "grad_norm": 0.10281818360090256, + "learning_rate": 8.295687545754143e-06, + "loss": 1.0405, + "step": 13408 + }, + { + "epoch": 0.8146910504890941, + "grad_norm": 0.13087216019630432, + "learning_rate": 8.29040764253451e-06, + "loss": 1.0052, + "step": 13409 + }, + { + "epoch": 0.8147518075217206, + "grad_norm": 0.18908251821994781, + "learning_rate": 8.285129268188042e-06, + "loss": 1.1077, + "step": 13410 + }, + { + "epoch": 0.8148125645543471, + "grad_norm": 0.24766699969768524, + "learning_rate": 8.279852422908225e-06, + "loss": 1.0524, + "step": 13411 + }, + { + "epoch": 0.8148733215869737, + "grad_norm": 0.32727837562561035, + "learning_rate": 8.274577106888481e-06, + "loss": 1.2578, + "step": 13412 + }, + { + "epoch": 0.8149340786196002, + "grad_norm": 0.11442811787128448, + "learning_rate": 8.26930332032218e-06, + "loss": 1.0316, + "step": 13413 + }, + { + "epoch": 0.8149948356522267, + "grad_norm": 0.20276670157909393, + "learning_rate": 8.264031063402627e-06, + "loss": 1.0174, + "step": 13414 + }, + { + "epoch": 0.8150555926848533, + "grad_norm": 0.15397687256336212, + "learning_rate": 8.25876033632309e-06, + "loss": 1.0995, + "step": 13415 + }, + { + "epoch": 0.8151163497174798, + "grad_norm": 0.1613105684518814, + "learning_rate": 8.253491139276759e-06, + "loss": 1.0569, + "step": 13416 + }, + { + "epoch": 0.8151771067501064, + "grad_norm": 0.09520558267831802, + "learning_rate": 8.248223472456778e-06, + "loss": 1.0463, + "step": 13417 + }, + { + "epoch": 0.8152378637827329, + "grad_norm": 0.3520737290382385, + "learning_rate": 8.24295733605624e-06, + "loss": 1.2388, + "step": 13418 + }, + { + "epoch": 0.8152986208153594, + "grad_norm": 0.2062932848930359, + "learning_rate": 8.23769273026817e-06, + "loss": 1.2183, + "step": 13419 + }, + { + "epoch": 0.815359377847986, + "grad_norm": 0.1090056300163269, + "learning_rate": 8.232429655285529e-06, + "loss": 1.0164, + "step": 13420 + }, + { + "epoch": 0.8154201348806124, + "grad_norm": 0.11583462357521057, + "learning_rate": 8.22716811130127e-06, + "loss": 1.0591, + "step": 13421 + }, + { + "epoch": 0.8154808919132389, + "grad_norm": 0.31722715497016907, + "learning_rate": 8.221908098508235e-06, + "loss": 1.0835, + "step": 13422 + }, + { + "epoch": 0.8155416489458654, + "grad_norm": 0.15766093134880066, + "learning_rate": 8.216649617099248e-06, + "loss": 1.0525, + "step": 13423 + }, + { + "epoch": 0.815602405978492, + "grad_norm": 0.137874573469162, + "learning_rate": 8.211392667267037e-06, + "loss": 1.0473, + "step": 13424 + }, + { + "epoch": 0.8156631630111185, + "grad_norm": 0.11918137222528458, + "learning_rate": 8.206137249204288e-06, + "loss": 1.065, + "step": 13425 + }, + { + "epoch": 0.815723920043745, + "grad_norm": 0.20145457983016968, + "learning_rate": 8.200883363103673e-06, + "loss": 1.0876, + "step": 13426 + }, + { + "epoch": 0.8157846770763716, + "grad_norm": 1.0978052616119385, + "learning_rate": 8.195631009157756e-06, + "loss": 1.0407, + "step": 13427 + }, + { + "epoch": 0.8158454341089981, + "grad_norm": 0.1237892359495163, + "learning_rate": 8.190380187559065e-06, + "loss": 1.0636, + "step": 13428 + }, + { + "epoch": 0.8159061911416247, + "grad_norm": 0.12328382581472397, + "learning_rate": 8.185130898500071e-06, + "loss": 1.0438, + "step": 13429 + }, + { + "epoch": 0.8159669481742512, + "grad_norm": 0.14856725931167603, + "learning_rate": 8.179883142173184e-06, + "loss": 1.2112, + "step": 13430 + }, + { + "epoch": 0.8160277052068777, + "grad_norm": 0.1367659866809845, + "learning_rate": 8.174636918770768e-06, + "loss": 0.9977, + "step": 13431 + }, + { + "epoch": 0.8160884622395043, + "grad_norm": 0.25983452796936035, + "learning_rate": 8.169392228485111e-06, + "loss": 1.2778, + "step": 13432 + }, + { + "epoch": 0.8161492192721308, + "grad_norm": 0.18509969115257263, + "learning_rate": 8.164149071508476e-06, + "loss": 1.1337, + "step": 13433 + }, + { + "epoch": 0.8162099763047572, + "grad_norm": 0.2002178430557251, + "learning_rate": 8.158907448033043e-06, + "loss": 1.2526, + "step": 13434 + }, + { + "epoch": 0.8162707333373838, + "grad_norm": 0.15770648419857025, + "learning_rate": 8.153667358250944e-06, + "loss": 1.082, + "step": 13435 + }, + { + "epoch": 0.8163314903700103, + "grad_norm": 0.11276135593652725, + "learning_rate": 8.148428802354246e-06, + "loss": 1.0172, + "step": 13436 + }, + { + "epoch": 0.8163922474026368, + "grad_norm": 0.1375294029712677, + "learning_rate": 8.143191780535003e-06, + "loss": 1.0377, + "step": 13437 + }, + { + "epoch": 0.8164530044352634, + "grad_norm": 0.13543906807899475, + "learning_rate": 8.137956292985144e-06, + "loss": 1.0538, + "step": 13438 + }, + { + "epoch": 0.8165137614678899, + "grad_norm": 0.09280017763376236, + "learning_rate": 8.132722339896588e-06, + "loss": 1.0167, + "step": 13439 + }, + { + "epoch": 0.8165745185005164, + "grad_norm": 0.15531599521636963, + "learning_rate": 8.127489921461185e-06, + "loss": 1.0726, + "step": 13440 + }, + { + "epoch": 0.816635275533143, + "grad_norm": 0.19414614140987396, + "learning_rate": 8.122259037870728e-06, + "loss": 1.1199, + "step": 13441 + }, + { + "epoch": 0.8166960325657695, + "grad_norm": 0.15643206238746643, + "learning_rate": 8.117029689316962e-06, + "loss": 1.0448, + "step": 13442 + }, + { + "epoch": 0.816756789598396, + "grad_norm": 0.14813822507858276, + "learning_rate": 8.111801875991593e-06, + "loss": 1.0673, + "step": 13443 + }, + { + "epoch": 0.8168175466310226, + "grad_norm": 0.1387462615966797, + "learning_rate": 8.106575598086203e-06, + "loss": 1.0505, + "step": 13444 + }, + { + "epoch": 0.8168783036636491, + "grad_norm": 0.21125616133213043, + "learning_rate": 8.101350855792383e-06, + "loss": 1.1712, + "step": 13445 + }, + { + "epoch": 0.8169390606962756, + "grad_norm": 2.0209147930145264, + "learning_rate": 8.096127649301638e-06, + "loss": 1.1521, + "step": 13446 + }, + { + "epoch": 0.8169998177289021, + "grad_norm": 0.14098483324050903, + "learning_rate": 8.09090597880544e-06, + "loss": 1.0115, + "step": 13447 + }, + { + "epoch": 0.8170605747615286, + "grad_norm": 0.1621711254119873, + "learning_rate": 8.08568584449519e-06, + "loss": 1.1544, + "step": 13448 + }, + { + "epoch": 0.8171213317941551, + "grad_norm": 0.14340583980083466, + "learning_rate": 8.080467246562224e-06, + "loss": 1.0983, + "step": 13449 + }, + { + "epoch": 0.8171820888267817, + "grad_norm": 0.15204134583473206, + "learning_rate": 8.075250185197847e-06, + "loss": 1.0353, + "step": 13450 + }, + { + "epoch": 0.8172428458594082, + "grad_norm": 3.1373207569122314, + "learning_rate": 8.07003466059325e-06, + "loss": 1.0434, + "step": 13451 + }, + { + "epoch": 0.8173036028920347, + "grad_norm": 0.3946131765842438, + "learning_rate": 8.064820672939654e-06, + "loss": 1.1012, + "step": 13452 + }, + { + "epoch": 0.8173643599246613, + "grad_norm": 0.19499781727790833, + "learning_rate": 8.059608222428162e-06, + "loss": 1.1419, + "step": 13453 + }, + { + "epoch": 0.8174251169572878, + "grad_norm": 0.33928677439689636, + "learning_rate": 8.054397309249839e-06, + "loss": 1.177, + "step": 13454 + }, + { + "epoch": 0.8174858739899143, + "grad_norm": 0.11727161705493927, + "learning_rate": 8.049187933595686e-06, + "loss": 1.0469, + "step": 13455 + }, + { + "epoch": 0.8175466310225409, + "grad_norm": 0.27368858456611633, + "learning_rate": 8.043980095656662e-06, + "loss": 1.1589, + "step": 13456 + }, + { + "epoch": 0.8176073880551674, + "grad_norm": 0.17845216393470764, + "learning_rate": 8.038773795623656e-06, + "loss": 1.0977, + "step": 13457 + }, + { + "epoch": 0.817668145087794, + "grad_norm": 0.19985872507095337, + "learning_rate": 8.03356903368751e-06, + "loss": 1.1126, + "step": 13458 + }, + { + "epoch": 0.8177289021204205, + "grad_norm": 0.1213662400841713, + "learning_rate": 8.028365810039002e-06, + "loss": 1.0387, + "step": 13459 + }, + { + "epoch": 0.8177896591530469, + "grad_norm": 0.16757020354270935, + "learning_rate": 8.023164124868855e-06, + "loss": 1.1035, + "step": 13460 + }, + { + "epoch": 0.8178504161856734, + "grad_norm": 0.12357483059167862, + "learning_rate": 8.017963978367742e-06, + "loss": 1.0775, + "step": 13461 + }, + { + "epoch": 0.8179111732183, + "grad_norm": 0.1485072374343872, + "learning_rate": 8.012765370726261e-06, + "loss": 1.0058, + "step": 13462 + }, + { + "epoch": 0.8179719302509265, + "grad_norm": 0.17385391891002655, + "learning_rate": 8.007568302135011e-06, + "loss": 1.0988, + "step": 13463 + }, + { + "epoch": 0.818032687283553, + "grad_norm": 0.13299430906772614, + "learning_rate": 8.002372772784445e-06, + "loss": 1.0398, + "step": 13464 + }, + { + "epoch": 0.8180934443161796, + "grad_norm": 0.4909656345844269, + "learning_rate": 7.997178782865023e-06, + "loss": 1.1298, + "step": 13465 + }, + { + "epoch": 0.8181542013488061, + "grad_norm": 0.14939147233963013, + "learning_rate": 7.99198633256713e-06, + "loss": 1.105, + "step": 13466 + }, + { + "epoch": 0.8182149583814327, + "grad_norm": 0.16109956800937653, + "learning_rate": 7.986795422081089e-06, + "loss": 1.129, + "step": 13467 + }, + { + "epoch": 0.8182757154140592, + "grad_norm": 0.10842107981443405, + "learning_rate": 7.981606051597196e-06, + "loss": 0.9968, + "step": 13468 + }, + { + "epoch": 0.8183364724466857, + "grad_norm": 0.5431641340255737, + "learning_rate": 7.976418221305653e-06, + "loss": 1.152, + "step": 13469 + }, + { + "epoch": 0.8183972294793123, + "grad_norm": 0.16564422845840454, + "learning_rate": 7.971231931396638e-06, + "loss": 1.027, + "step": 13470 + }, + { + "epoch": 0.8184579865119388, + "grad_norm": 0.18083353340625763, + "learning_rate": 7.966047182060226e-06, + "loss": 1.0913, + "step": 13471 + }, + { + "epoch": 0.8185187435445653, + "grad_norm": 0.13449324667453766, + "learning_rate": 7.960863973486466e-06, + "loss": 1.0507, + "step": 13472 + }, + { + "epoch": 0.8185795005771919, + "grad_norm": 0.1310717612504959, + "learning_rate": 7.955682305865381e-06, + "loss": 1.0563, + "step": 13473 + }, + { + "epoch": 0.8186402576098183, + "grad_norm": 0.20050659775733948, + "learning_rate": 7.950502179386882e-06, + "loss": 1.1739, + "step": 13474 + }, + { + "epoch": 0.8187010146424448, + "grad_norm": 0.16730952262878418, + "learning_rate": 7.945323594240856e-06, + "loss": 1.0201, + "step": 13475 + }, + { + "epoch": 0.8187617716750714, + "grad_norm": 0.10560943186283112, + "learning_rate": 7.940146550617123e-06, + "loss": 1.0624, + "step": 13476 + }, + { + "epoch": 0.8188225287076979, + "grad_norm": 0.1055358275771141, + "learning_rate": 7.934971048705447e-06, + "loss": 1.0492, + "step": 13477 + }, + { + "epoch": 0.8188832857403244, + "grad_norm": 0.11885208636522293, + "learning_rate": 7.929797088695539e-06, + "loss": 1.0096, + "step": 13478 + }, + { + "epoch": 0.818944042772951, + "grad_norm": 0.12103947252035141, + "learning_rate": 7.924624670777047e-06, + "loss": 1.0342, + "step": 13479 + }, + { + "epoch": 0.8190047998055775, + "grad_norm": 0.14237910509109497, + "learning_rate": 7.919453795139576e-06, + "loss": 1.0502, + "step": 13480 + }, + { + "epoch": 0.819065556838204, + "grad_norm": 0.09899695962667465, + "learning_rate": 7.91428446197266e-06, + "loss": 0.9941, + "step": 13481 + }, + { + "epoch": 0.8191263138708306, + "grad_norm": 0.3681263327598572, + "learning_rate": 7.909116671465778e-06, + "loss": 1.093, + "step": 13482 + }, + { + "epoch": 0.8191870709034571, + "grad_norm": 0.3283785581588745, + "learning_rate": 7.903950423808365e-06, + "loss": 1.1385, + "step": 13483 + }, + { + "epoch": 0.8192478279360836, + "grad_norm": 0.1327342689037323, + "learning_rate": 7.898785719189784e-06, + "loss": 1.0461, + "step": 13484 + }, + { + "epoch": 0.8193085849687102, + "grad_norm": 0.12868739664554596, + "learning_rate": 7.893622557799346e-06, + "loss": 1.0096, + "step": 13485 + }, + { + "epoch": 0.8193693420013367, + "grad_norm": 0.3868829905986786, + "learning_rate": 7.88846093982632e-06, + "loss": 1.0969, + "step": 13486 + }, + { + "epoch": 0.8194300990339631, + "grad_norm": 6.048282623291016, + "learning_rate": 7.883300865459891e-06, + "loss": 1.2119, + "step": 13487 + }, + { + "epoch": 0.8194908560665897, + "grad_norm": 0.23110385239124298, + "learning_rate": 7.878142334889205e-06, + "loss": 1.1701, + "step": 13488 + }, + { + "epoch": 0.8195516130992162, + "grad_norm": 0.228868767619133, + "learning_rate": 7.872985348303358e-06, + "loss": 1.1009, + "step": 13489 + }, + { + "epoch": 0.8196123701318427, + "grad_norm": 0.11409421265125275, + "learning_rate": 7.867829905891394e-06, + "loss": 1.0083, + "step": 13490 + }, + { + "epoch": 0.8196731271644693, + "grad_norm": 0.11872085183858871, + "learning_rate": 7.862676007842257e-06, + "loss": 1.0471, + "step": 13491 + }, + { + "epoch": 0.8197338841970958, + "grad_norm": 0.8929527997970581, + "learning_rate": 7.857523654344873e-06, + "loss": 1.0941, + "step": 13492 + }, + { + "epoch": 0.8197946412297223, + "grad_norm": 0.22200007736682892, + "learning_rate": 7.85237284558809e-06, + "loss": 1.0776, + "step": 13493 + }, + { + "epoch": 0.8198553982623489, + "grad_norm": 0.11394982039928436, + "learning_rate": 7.847223581760744e-06, + "loss": 1.0413, + "step": 13494 + }, + { + "epoch": 0.8199161552949754, + "grad_norm": 0.09477784484624863, + "learning_rate": 7.842075863051567e-06, + "loss": 1.036, + "step": 13495 + }, + { + "epoch": 0.819976912327602, + "grad_norm": 0.17983530461788177, + "learning_rate": 7.836929689649247e-06, + "loss": 1.112, + "step": 13496 + }, + { + "epoch": 0.8200376693602285, + "grad_norm": 0.10790320485830307, + "learning_rate": 7.831785061742431e-06, + "loss": 1.0104, + "step": 13497 + }, + { + "epoch": 0.820098426392855, + "grad_norm": 0.6867464184761047, + "learning_rate": 7.826641979519655e-06, + "loss": 1.0825, + "step": 13498 + }, + { + "epoch": 0.8201591834254816, + "grad_norm": 0.19029100239276886, + "learning_rate": 7.821500443169487e-06, + "loss": 0.9832, + "step": 13499 + }, + { + "epoch": 0.820219940458108, + "grad_norm": 0.6250835657119751, + "learning_rate": 7.816360452880373e-06, + "loss": 1.0707, + "step": 13500 + }, + { + "epoch": 0.8202806974907345, + "grad_norm": 0.12136763334274292, + "learning_rate": 7.81122200884072e-06, + "loss": 1.1161, + "step": 13501 + }, + { + "epoch": 0.820341454523361, + "grad_norm": 0.33576223254203796, + "learning_rate": 7.80608511123888e-06, + "loss": 1.2448, + "step": 13502 + }, + { + "epoch": 0.8204022115559876, + "grad_norm": 0.7326078414916992, + "learning_rate": 7.800949760263144e-06, + "loss": 1.2602, + "step": 13503 + }, + { + "epoch": 0.8204629685886141, + "grad_norm": 0.16755549609661102, + "learning_rate": 7.795815956101755e-06, + "loss": 1.1486, + "step": 13504 + }, + { + "epoch": 0.8205237256212407, + "grad_norm": 0.10186265408992767, + "learning_rate": 7.790683698942885e-06, + "loss": 1.0032, + "step": 13505 + }, + { + "epoch": 0.8205844826538672, + "grad_norm": 0.14835381507873535, + "learning_rate": 7.785552988974665e-06, + "loss": 1.1042, + "step": 13506 + }, + { + "epoch": 0.8206452396864937, + "grad_norm": 0.1081235483288765, + "learning_rate": 7.780423826385158e-06, + "loss": 1.0296, + "step": 13507 + }, + { + "epoch": 0.8207059967191203, + "grad_norm": 0.2186422199010849, + "learning_rate": 7.775296211362382e-06, + "loss": 1.0677, + "step": 13508 + }, + { + "epoch": 0.8207667537517468, + "grad_norm": 0.1254095733165741, + "learning_rate": 7.770170144094269e-06, + "loss": 1.0395, + "step": 13509 + }, + { + "epoch": 0.8208275107843733, + "grad_norm": 0.36395975947380066, + "learning_rate": 7.765045624768758e-06, + "loss": 1.0946, + "step": 13510 + }, + { + "epoch": 0.8208882678169999, + "grad_norm": 0.15172605216503143, + "learning_rate": 7.759922653573648e-06, + "loss": 1.0255, + "step": 13511 + }, + { + "epoch": 0.8209490248496264, + "grad_norm": 0.14489217102527618, + "learning_rate": 7.754801230696745e-06, + "loss": 1.0556, + "step": 13512 + }, + { + "epoch": 0.8210097818822528, + "grad_norm": 0.20732025802135468, + "learning_rate": 7.74968135632575e-06, + "loss": 1.1075, + "step": 13513 + }, + { + "epoch": 0.8210705389148794, + "grad_norm": 0.2300223410129547, + "learning_rate": 7.744563030648367e-06, + "loss": 1.2089, + "step": 13514 + }, + { + "epoch": 0.8211312959475059, + "grad_norm": 0.15074793994426727, + "learning_rate": 7.739446253852195e-06, + "loss": 1.0471, + "step": 13515 + }, + { + "epoch": 0.8211920529801324, + "grad_norm": 0.1597336381673813, + "learning_rate": 7.734331026124786e-06, + "loss": 1.0606, + "step": 13516 + }, + { + "epoch": 0.821252810012759, + "grad_norm": 0.14109216630458832, + "learning_rate": 7.729217347653655e-06, + "loss": 0.9996, + "step": 13517 + }, + { + "epoch": 0.8213135670453855, + "grad_norm": 0.10818999260663986, + "learning_rate": 7.72410521862621e-06, + "loss": 1.0427, + "step": 13518 + }, + { + "epoch": 0.821374324078012, + "grad_norm": 0.4802270531654358, + "learning_rate": 7.71899463922987e-06, + "loss": 1.0507, + "step": 13519 + }, + { + "epoch": 0.8214350811106386, + "grad_norm": 0.10484717041254044, + "learning_rate": 7.71388560965196e-06, + "loss": 0.9752, + "step": 13520 + }, + { + "epoch": 0.8214958381432651, + "grad_norm": 0.1392170637845993, + "learning_rate": 7.708778130079742e-06, + "loss": 1.1028, + "step": 13521 + }, + { + "epoch": 0.8215565951758916, + "grad_norm": 0.12246867269277573, + "learning_rate": 7.70367220070044e-06, + "loss": 1.0444, + "step": 13522 + }, + { + "epoch": 0.8216173522085182, + "grad_norm": 0.2541763484477997, + "learning_rate": 7.698567821701203e-06, + "loss": 1.1154, + "step": 13523 + }, + { + "epoch": 0.8216781092411447, + "grad_norm": 0.17111891508102417, + "learning_rate": 7.693464993269145e-06, + "loss": 1.0929, + "step": 13524 + }, + { + "epoch": 0.8217388662737712, + "grad_norm": 0.11510283499956131, + "learning_rate": 7.6883637155913e-06, + "loss": 1.0316, + "step": 13525 + }, + { + "epoch": 0.8217996233063977, + "grad_norm": 0.1308744251728058, + "learning_rate": 7.683263988854667e-06, + "loss": 1.0596, + "step": 13526 + }, + { + "epoch": 0.8218603803390242, + "grad_norm": 0.10676344484090805, + "learning_rate": 7.678165813246168e-06, + "loss": 1.0318, + "step": 13527 + }, + { + "epoch": 0.8219211373716507, + "grad_norm": 0.12430138885974884, + "learning_rate": 7.673069188952686e-06, + "loss": 1.0768, + "step": 13528 + }, + { + "epoch": 0.8219818944042773, + "grad_norm": 0.1866612583398819, + "learning_rate": 7.66797411616102e-06, + "loss": 1.0705, + "step": 13529 + }, + { + "epoch": 0.8220426514369038, + "grad_norm": 0.1155172660946846, + "learning_rate": 7.66288059505797e-06, + "loss": 1.0221, + "step": 13530 + }, + { + "epoch": 0.8221034084695303, + "grad_norm": 0.143506720662117, + "learning_rate": 7.657788625830204e-06, + "loss": 1.0939, + "step": 13531 + }, + { + "epoch": 0.8221641655021569, + "grad_norm": 0.45693644881248474, + "learning_rate": 7.652698208664378e-06, + "loss": 1.047, + "step": 13532 + }, + { + "epoch": 0.8222249225347834, + "grad_norm": 0.16410203278064728, + "learning_rate": 7.647609343747086e-06, + "loss": 1.1678, + "step": 13533 + }, + { + "epoch": 0.8222856795674099, + "grad_norm": 0.1787978559732437, + "learning_rate": 7.642522031264849e-06, + "loss": 1.0448, + "step": 13534 + }, + { + "epoch": 0.8223464366000365, + "grad_norm": 0.13374283909797668, + "learning_rate": 7.637436271404168e-06, + "loss": 1.0733, + "step": 13535 + }, + { + "epoch": 0.822407193632663, + "grad_norm": 0.10483134537935257, + "learning_rate": 7.632352064351445e-06, + "loss": 1.0516, + "step": 13536 + }, + { + "epoch": 0.8224679506652895, + "grad_norm": 0.10286300629377365, + "learning_rate": 7.627269410293059e-06, + "loss": 1.0518, + "step": 13537 + }, + { + "epoch": 0.8225287076979161, + "grad_norm": 0.18806321918964386, + "learning_rate": 7.6221883094152955e-06, + "loss": 1.216, + "step": 13538 + }, + { + "epoch": 0.8225894647305425, + "grad_norm": 0.9146344065666199, + "learning_rate": 7.617108761904396e-06, + "loss": 1.2358, + "step": 13539 + }, + { + "epoch": 0.822650221763169, + "grad_norm": 0.1602848917245865, + "learning_rate": 7.612030767946577e-06, + "loss": 1.0769, + "step": 13540 + }, + { + "epoch": 0.8227109787957956, + "grad_norm": 0.20304131507873535, + "learning_rate": 7.606954327727967e-06, + "loss": 1.0745, + "step": 13541 + }, + { + "epoch": 0.8227717358284221, + "grad_norm": 0.1013583168387413, + "learning_rate": 7.601879441434639e-06, + "loss": 1.0125, + "step": 13542 + }, + { + "epoch": 0.8228324928610486, + "grad_norm": 0.14118294417858124, + "learning_rate": 7.596806109252613e-06, + "loss": 1.0837, + "step": 13543 + }, + { + "epoch": 0.8228932498936752, + "grad_norm": 0.7267357707023621, + "learning_rate": 7.591734331367861e-06, + "loss": 1.2208, + "step": 13544 + }, + { + "epoch": 0.8229540069263017, + "grad_norm": 0.18252035975456238, + "learning_rate": 7.586664107966279e-06, + "loss": 1.1129, + "step": 13545 + }, + { + "epoch": 0.8230147639589283, + "grad_norm": 0.18453197181224823, + "learning_rate": 7.58159543923373e-06, + "loss": 1.12, + "step": 13546 + }, + { + "epoch": 0.8230755209915548, + "grad_norm": 0.24387039244174957, + "learning_rate": 7.576528325355991e-06, + "loss": 1.0128, + "step": 13547 + }, + { + "epoch": 0.8231362780241813, + "grad_norm": 0.3113418221473694, + "learning_rate": 7.571462766518811e-06, + "loss": 1.0681, + "step": 13548 + }, + { + "epoch": 0.8231970350568079, + "grad_norm": 0.4675925672054291, + "learning_rate": 7.566398762907861e-06, + "loss": 1.1145, + "step": 13549 + }, + { + "epoch": 0.8232577920894344, + "grad_norm": 0.16404248774051666, + "learning_rate": 7.561336314708773e-06, + "loss": 1.039, + "step": 13550 + }, + { + "epoch": 0.8233185491220609, + "grad_norm": 0.10805781930685043, + "learning_rate": 7.5562754221071005e-06, + "loss": 1.0403, + "step": 13551 + }, + { + "epoch": 0.8233793061546874, + "grad_norm": 0.257723331451416, + "learning_rate": 7.551216085288359e-06, + "loss": 1.145, + "step": 13552 + }, + { + "epoch": 0.8234400631873139, + "grad_norm": 0.17606984078884125, + "learning_rate": 7.546158304437995e-06, + "loss": 1.1894, + "step": 13553 + }, + { + "epoch": 0.8235008202199404, + "grad_norm": 0.12058721482753754, + "learning_rate": 7.541102079741402e-06, + "loss": 0.975, + "step": 13554 + }, + { + "epoch": 0.823561577252567, + "grad_norm": 0.09002473205327988, + "learning_rate": 7.536047411383912e-06, + "loss": 0.9875, + "step": 13555 + }, + { + "epoch": 0.8236223342851935, + "grad_norm": 0.12132207304239273, + "learning_rate": 7.530994299550826e-06, + "loss": 1.0886, + "step": 13556 + }, + { + "epoch": 0.82368309131782, + "grad_norm": 0.19114543497562408, + "learning_rate": 7.5259427444273614e-06, + "loss": 1.0621, + "step": 13557 + }, + { + "epoch": 0.8237438483504466, + "grad_norm": 1.147022008895874, + "learning_rate": 7.520892746198666e-06, + "loss": 1.0021, + "step": 13558 + }, + { + "epoch": 0.8238046053830731, + "grad_norm": 0.16448919475078583, + "learning_rate": 7.5158443050498515e-06, + "loss": 1.0586, + "step": 13559 + }, + { + "epoch": 0.8238653624156996, + "grad_norm": 0.14355164766311646, + "learning_rate": 7.510797421165972e-06, + "loss": 1.2218, + "step": 13560 + }, + { + "epoch": 0.8239261194483262, + "grad_norm": 0.11638607829809189, + "learning_rate": 7.505752094732032e-06, + "loss": 1.0586, + "step": 13561 + }, + { + "epoch": 0.8239868764809527, + "grad_norm": 0.18874981999397278, + "learning_rate": 7.500708325932965e-06, + "loss": 1.1447, + "step": 13562 + }, + { + "epoch": 0.8240476335135792, + "grad_norm": 0.16098664700984955, + "learning_rate": 7.495666114953653e-06, + "loss": 1.1337, + "step": 13563 + }, + { + "epoch": 0.8241083905462058, + "grad_norm": 0.1381879299879074, + "learning_rate": 7.490625461978923e-06, + "loss": 1.0181, + "step": 13564 + }, + { + "epoch": 0.8241691475788322, + "grad_norm": 0.12066946178674698, + "learning_rate": 7.485586367193509e-06, + "loss": 1.1, + "step": 13565 + }, + { + "epoch": 0.8242299046114587, + "grad_norm": 0.22286900877952576, + "learning_rate": 7.480548830782164e-06, + "loss": 1.1605, + "step": 13566 + }, + { + "epoch": 0.8242906616440853, + "grad_norm": 0.2938409149646759, + "learning_rate": 7.475512852929511e-06, + "loss": 1.0216, + "step": 13567 + }, + { + "epoch": 0.8243514186767118, + "grad_norm": 0.11672308295965195, + "learning_rate": 7.470478433820161e-06, + "loss": 1.0389, + "step": 13568 + }, + { + "epoch": 0.8244121757093383, + "grad_norm": 0.27402544021606445, + "learning_rate": 7.4654455736386406e-06, + "loss": 1.1318, + "step": 13569 + }, + { + "epoch": 0.8244729327419649, + "grad_norm": 21.82569122314453, + "learning_rate": 7.460414272569433e-06, + "loss": 1.0366, + "step": 13570 + }, + { + "epoch": 0.8245336897745914, + "grad_norm": 0.12488722056150436, + "learning_rate": 7.455384530796966e-06, + "loss": 1.0117, + "step": 13571 + }, + { + "epoch": 0.8245944468072179, + "grad_norm": 0.17637819051742554, + "learning_rate": 7.450356348505605e-06, + "loss": 1.0513, + "step": 13572 + }, + { + "epoch": 0.8246552038398445, + "grad_norm": 0.2478485256433487, + "learning_rate": 7.445329725879652e-06, + "loss": 1.1749, + "step": 13573 + }, + { + "epoch": 0.824715960872471, + "grad_norm": 0.1008739322423935, + "learning_rate": 7.4403046631033645e-06, + "loss": 1.0179, + "step": 13574 + }, + { + "epoch": 0.8247767179050975, + "grad_norm": 0.12400525063276291, + "learning_rate": 7.435281160360941e-06, + "loss": 1.0189, + "step": 13575 + }, + { + "epoch": 0.8248374749377241, + "grad_norm": 0.11206913739442825, + "learning_rate": 7.4302592178365e-06, + "loss": 1.0422, + "step": 13576 + }, + { + "epoch": 0.8248982319703506, + "grad_norm": 0.13070032000541687, + "learning_rate": 7.425238835714155e-06, + "loss": 1.0435, + "step": 13577 + }, + { + "epoch": 0.8249589890029771, + "grad_norm": 0.12553338706493378, + "learning_rate": 7.420220014177903e-06, + "loss": 1.0368, + "step": 13578 + }, + { + "epoch": 0.8250197460356036, + "grad_norm": 0.180640310049057, + "learning_rate": 7.4152027534117155e-06, + "loss": 1.1231, + "step": 13579 + }, + { + "epoch": 0.8250805030682301, + "grad_norm": 0.2928134799003601, + "learning_rate": 7.410187053599499e-06, + "loss": 1.1799, + "step": 13580 + }, + { + "epoch": 0.8251412601008566, + "grad_norm": 0.1468118131160736, + "learning_rate": 7.405172914925101e-06, + "loss": 1.0684, + "step": 13581 + }, + { + "epoch": 0.8252020171334832, + "grad_norm": 0.16805818676948547, + "learning_rate": 7.400160337572337e-06, + "loss": 1.0013, + "step": 13582 + }, + { + "epoch": 0.8252627741661097, + "grad_norm": 0.2454906702041626, + "learning_rate": 7.395149321724926e-06, + "loss": 1.1418, + "step": 13583 + }, + { + "epoch": 0.8253235311987362, + "grad_norm": 0.14615805447101593, + "learning_rate": 7.390139867566565e-06, + "loss": 1.026, + "step": 13584 + }, + { + "epoch": 0.8253842882313628, + "grad_norm": 0.10640106350183487, + "learning_rate": 7.385131975280851e-06, + "loss": 1.0046, + "step": 13585 + }, + { + "epoch": 0.8254450452639893, + "grad_norm": 1.4889495372772217, + "learning_rate": 7.38012564505135e-06, + "loss": 1.0464, + "step": 13586 + }, + { + "epoch": 0.8255058022966159, + "grad_norm": 1.630642056465149, + "learning_rate": 7.375120877061592e-06, + "loss": 1.0621, + "step": 13587 + }, + { + "epoch": 0.8255665593292424, + "grad_norm": 0.16819359362125397, + "learning_rate": 7.370117671495019e-06, + "loss": 1.1247, + "step": 13588 + }, + { + "epoch": 0.8256273163618689, + "grad_norm": 0.33381471037864685, + "learning_rate": 7.365116028535024e-06, + "loss": 1.07, + "step": 13589 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 0.12128517776727676, + "learning_rate": 7.360115948364943e-06, + "loss": 1.0396, + "step": 13590 + }, + { + "epoch": 0.825748830427122, + "grad_norm": 0.12619876861572266, + "learning_rate": 7.355117431168052e-06, + "loss": 1.0541, + "step": 13591 + }, + { + "epoch": 0.8258095874597484, + "grad_norm": 0.09805592894554138, + "learning_rate": 7.350120477127576e-06, + "loss": 1.0304, + "step": 13592 + }, + { + "epoch": 0.825870344492375, + "grad_norm": 0.10195305943489075, + "learning_rate": 7.345125086426674e-06, + "loss": 1.0454, + "step": 13593 + }, + { + "epoch": 0.8259311015250015, + "grad_norm": 0.10392196476459503, + "learning_rate": 7.340131259248461e-06, + "loss": 1.0472, + "step": 13594 + }, + { + "epoch": 0.825991858557628, + "grad_norm": 0.13752086460590363, + "learning_rate": 7.335138995775981e-06, + "loss": 1.1008, + "step": 13595 + }, + { + "epoch": 0.8260526155902546, + "grad_norm": 0.11507494747638702, + "learning_rate": 7.330148296192229e-06, + "loss": 1.004, + "step": 13596 + }, + { + "epoch": 0.8261133726228811, + "grad_norm": 0.15243516862392426, + "learning_rate": 7.325159160680139e-06, + "loss": 1.0588, + "step": 13597 + }, + { + "epoch": 0.8261741296555076, + "grad_norm": 0.11210864037275314, + "learning_rate": 7.320171589422592e-06, + "loss": 1.0571, + "step": 13598 + }, + { + "epoch": 0.8262348866881342, + "grad_norm": 0.1640331745147705, + "learning_rate": 7.315185582602402e-06, + "loss": 1.1551, + "step": 13599 + }, + { + "epoch": 0.8262956437207607, + "grad_norm": 0.1828596144914627, + "learning_rate": 7.310201140402334e-06, + "loss": 1.1405, + "step": 13600 + }, + { + "epoch": 0.8263564007533872, + "grad_norm": 0.15175187587738037, + "learning_rate": 7.305218263005098e-06, + "loss": 1.0813, + "step": 13601 + }, + { + "epoch": 0.8264171577860138, + "grad_norm": 0.953859269618988, + "learning_rate": 7.300236950593326e-06, + "loss": 1.019, + "step": 13602 + }, + { + "epoch": 0.8264779148186403, + "grad_norm": 0.14325015246868134, + "learning_rate": 7.295257203349632e-06, + "loss": 1.0881, + "step": 13603 + }, + { + "epoch": 0.8265386718512668, + "grad_norm": 0.15648460388183594, + "learning_rate": 7.290279021456553e-06, + "loss": 1.0504, + "step": 13604 + }, + { + "epoch": 0.8265994288838933, + "grad_norm": 0.19723069667816162, + "learning_rate": 7.285302405096539e-06, + "loss": 1.1383, + "step": 13605 + }, + { + "epoch": 0.8266601859165198, + "grad_norm": 0.13852082192897797, + "learning_rate": 7.280327354452027e-06, + "loss": 1.051, + "step": 13606 + }, + { + "epoch": 0.8267209429491463, + "grad_norm": 0.32211220264434814, + "learning_rate": 7.275353869705353e-06, + "loss": 1.2097, + "step": 13607 + }, + { + "epoch": 0.8267816999817729, + "grad_norm": 0.09656880795955658, + "learning_rate": 7.270381951038863e-06, + "loss": 1.0389, + "step": 13608 + }, + { + "epoch": 0.8268424570143994, + "grad_norm": 0.16907167434692383, + "learning_rate": 7.265411598634775e-06, + "loss": 1.1242, + "step": 13609 + }, + { + "epoch": 0.8269032140470259, + "grad_norm": 1.975132703781128, + "learning_rate": 7.260442812675288e-06, + "loss": 1.062, + "step": 13610 + }, + { + "epoch": 0.8269639710796525, + "grad_norm": 0.24325786530971527, + "learning_rate": 7.255475593342531e-06, + "loss": 1.2642, + "step": 13611 + }, + { + "epoch": 0.827024728112279, + "grad_norm": 0.19007137417793274, + "learning_rate": 7.250509940818578e-06, + "loss": 1.0715, + "step": 13612 + }, + { + "epoch": 0.8270854851449055, + "grad_norm": 0.1458977311849594, + "learning_rate": 7.245545855285446e-06, + "loss": 1.0858, + "step": 13613 + }, + { + "epoch": 0.8271462421775321, + "grad_norm": 0.15047360956668854, + "learning_rate": 7.240583336925094e-06, + "loss": 1.0298, + "step": 13614 + }, + { + "epoch": 0.8272069992101586, + "grad_norm": 0.12451022863388062, + "learning_rate": 7.235622385919427e-06, + "loss": 1.0188, + "step": 13615 + }, + { + "epoch": 0.8272677562427851, + "grad_norm": 0.2222929149866104, + "learning_rate": 7.230663002450289e-06, + "loss": 1.079, + "step": 13616 + }, + { + "epoch": 0.8273285132754117, + "grad_norm": 0.5067652463912964, + "learning_rate": 7.22570518669946e-06, + "loss": 1.0343, + "step": 13617 + }, + { + "epoch": 0.8273892703080381, + "grad_norm": 0.12848037481307983, + "learning_rate": 7.220748938848682e-06, + "loss": 1.0676, + "step": 13618 + }, + { + "epoch": 0.8274500273406646, + "grad_norm": 0.19044119119644165, + "learning_rate": 7.215794259079617e-06, + "loss": 1.136, + "step": 13619 + }, + { + "epoch": 0.8275107843732912, + "grad_norm": 0.1902448534965515, + "learning_rate": 7.2108411475738825e-06, + "loss": 1.0809, + "step": 13620 + }, + { + "epoch": 0.8275715414059177, + "grad_norm": 0.15893591940402985, + "learning_rate": 7.2058896045130384e-06, + "loss": 1.0257, + "step": 13621 + }, + { + "epoch": 0.8276322984385442, + "grad_norm": 0.10953414440155029, + "learning_rate": 7.200939630078573e-06, + "loss": 1.014, + "step": 13622 + }, + { + "epoch": 0.8276930554711708, + "grad_norm": 0.14103396236896515, + "learning_rate": 7.195991224451948e-06, + "loss": 1.0724, + "step": 13623 + }, + { + "epoch": 0.8277538125037973, + "grad_norm": 0.1850893795490265, + "learning_rate": 7.19104438781455e-06, + "loss": 1.019, + "step": 13624 + }, + { + "epoch": 0.8278145695364238, + "grad_norm": 0.21538183093070984, + "learning_rate": 7.186099120347683e-06, + "loss": 1.0432, + "step": 13625 + }, + { + "epoch": 0.8278753265690504, + "grad_norm": 0.2647867202758789, + "learning_rate": 7.181155422232633e-06, + "loss": 1.2063, + "step": 13626 + }, + { + "epoch": 0.8279360836016769, + "grad_norm": 0.11590036749839783, + "learning_rate": 7.1762132936505914e-06, + "loss": 1.0521, + "step": 13627 + }, + { + "epoch": 0.8279968406343035, + "grad_norm": 0.1335471123456955, + "learning_rate": 7.171272734782741e-06, + "loss": 1.066, + "step": 13628 + }, + { + "epoch": 0.82805759766693, + "grad_norm": 0.1113278940320015, + "learning_rate": 7.166333745810166e-06, + "loss": 1.0212, + "step": 13629 + }, + { + "epoch": 0.8281183546995565, + "grad_norm": 0.22371621429920197, + "learning_rate": 7.161396326913911e-06, + "loss": 1.1098, + "step": 13630 + }, + { + "epoch": 0.828179111732183, + "grad_norm": 0.19300615787506104, + "learning_rate": 7.156460478274962e-06, + "loss": 1.0784, + "step": 13631 + }, + { + "epoch": 0.8282398687648095, + "grad_norm": 0.2759637236595154, + "learning_rate": 7.1515262000742156e-06, + "loss": 1.1484, + "step": 13632 + }, + { + "epoch": 0.828300625797436, + "grad_norm": 0.12149547040462494, + "learning_rate": 7.146593492492565e-06, + "loss": 0.9509, + "step": 13633 + }, + { + "epoch": 0.8283613828300626, + "grad_norm": 0.15606677532196045, + "learning_rate": 7.141662355710821e-06, + "loss": 1.0416, + "step": 13634 + }, + { + "epoch": 0.8284221398626891, + "grad_norm": 0.19518235325813293, + "learning_rate": 7.136732789909723e-06, + "loss": 1.1857, + "step": 13635 + }, + { + "epoch": 0.8284828968953156, + "grad_norm": 0.1401486098766327, + "learning_rate": 7.1318047952699695e-06, + "loss": 1.0899, + "step": 13636 + }, + { + "epoch": 0.8285436539279422, + "grad_norm": 0.20847487449645996, + "learning_rate": 7.1268783719721955e-06, + "loss": 1.1556, + "step": 13637 + }, + { + "epoch": 0.8286044109605687, + "grad_norm": 0.6007376909255981, + "learning_rate": 7.121953520196989e-06, + "loss": 1.2849, + "step": 13638 + }, + { + "epoch": 0.8286651679931952, + "grad_norm": 0.18197812139987946, + "learning_rate": 7.117030240124855e-06, + "loss": 1.2203, + "step": 13639 + }, + { + "epoch": 0.8287259250258218, + "grad_norm": 0.13139967620372772, + "learning_rate": 7.1121085319362736e-06, + "loss": 1.0126, + "step": 13640 + }, + { + "epoch": 0.8287866820584483, + "grad_norm": 0.5589107275009155, + "learning_rate": 7.107188395811643e-06, + "loss": 1.0957, + "step": 13641 + }, + { + "epoch": 0.8288474390910748, + "grad_norm": 0.16172543168067932, + "learning_rate": 7.102269831931313e-06, + "loss": 1.1093, + "step": 13642 + }, + { + "epoch": 0.8289081961237014, + "grad_norm": 0.09160090237855911, + "learning_rate": 7.097352840475563e-06, + "loss": 1.0263, + "step": 13643 + }, + { + "epoch": 0.8289689531563278, + "grad_norm": 0.1511775255203247, + "learning_rate": 7.092437421624659e-06, + "loss": 1.1145, + "step": 13644 + }, + { + "epoch": 0.8290297101889543, + "grad_norm": 0.0995846539735794, + "learning_rate": 7.087523575558741e-06, + "loss": 1.0743, + "step": 13645 + }, + { + "epoch": 0.8290904672215809, + "grad_norm": 0.12825095653533936, + "learning_rate": 7.082611302457948e-06, + "loss": 0.9914, + "step": 13646 + }, + { + "epoch": 0.8291512242542074, + "grad_norm": 0.19835519790649414, + "learning_rate": 7.07770060250233e-06, + "loss": 1.1042, + "step": 13647 + }, + { + "epoch": 0.8292119812868339, + "grad_norm": 0.19569192826747894, + "learning_rate": 7.0727914758718815e-06, + "loss": 1.1402, + "step": 13648 + }, + { + "epoch": 0.8292727383194605, + "grad_norm": 0.19548919796943665, + "learning_rate": 7.067883922746565e-06, + "loss": 1.1665, + "step": 13649 + }, + { + "epoch": 0.829333495352087, + "grad_norm": 0.1786556839942932, + "learning_rate": 7.06297794330627e-06, + "loss": 1.2982, + "step": 13650 + }, + { + "epoch": 0.8293942523847135, + "grad_norm": 0.13037210702896118, + "learning_rate": 7.0580735377308245e-06, + "loss": 1.055, + "step": 13651 + }, + { + "epoch": 0.8294550094173401, + "grad_norm": 0.10718393325805664, + "learning_rate": 7.053170706199985e-06, + "loss": 1.039, + "step": 13652 + }, + { + "epoch": 0.8295157664499666, + "grad_norm": 0.1495639830827713, + "learning_rate": 7.048269448893463e-06, + "loss": 1.0966, + "step": 13653 + }, + { + "epoch": 0.8295765234825931, + "grad_norm": 0.15754443407058716, + "learning_rate": 7.043369765990943e-06, + "loss": 1.1105, + "step": 13654 + }, + { + "epoch": 0.8296372805152197, + "grad_norm": 0.12724776566028595, + "learning_rate": 7.0384716576720025e-06, + "loss": 1.0565, + "step": 13655 + }, + { + "epoch": 0.8296980375478462, + "grad_norm": 0.17613442242145538, + "learning_rate": 7.033575124116187e-06, + "loss": 1.0838, + "step": 13656 + }, + { + "epoch": 0.8297587945804726, + "grad_norm": 0.6000800132751465, + "learning_rate": 7.028680165502982e-06, + "loss": 1.0815, + "step": 13657 + }, + { + "epoch": 0.8298195516130992, + "grad_norm": 0.2269129902124405, + "learning_rate": 7.0237867820118094e-06, + "loss": 1.2047, + "step": 13658 + }, + { + "epoch": 0.8298803086457257, + "grad_norm": 0.11749386787414551, + "learning_rate": 7.018894973822043e-06, + "loss": 1.0573, + "step": 13659 + }, + { + "epoch": 0.8299410656783522, + "grad_norm": 0.12844781577587128, + "learning_rate": 7.014004741112984e-06, + "loss": 1.0787, + "step": 13660 + }, + { + "epoch": 0.8300018227109788, + "grad_norm": 0.30096015334129333, + "learning_rate": 7.009116084063893e-06, + "loss": 1.0929, + "step": 13661 + }, + { + "epoch": 0.8300625797436053, + "grad_norm": 1.499553918838501, + "learning_rate": 7.004229002853962e-06, + "loss": 1.0759, + "step": 13662 + }, + { + "epoch": 0.8301233367762318, + "grad_norm": 0.12127718329429626, + "learning_rate": 6.99934349766233e-06, + "loss": 1.0543, + "step": 13663 + }, + { + "epoch": 0.8301840938088584, + "grad_norm": 0.18528977036476135, + "learning_rate": 6.994459568668071e-06, + "loss": 1.0711, + "step": 13664 + }, + { + "epoch": 0.8302448508414849, + "grad_norm": 0.14939551055431366, + "learning_rate": 6.989577216050214e-06, + "loss": 1.0764, + "step": 13665 + }, + { + "epoch": 0.8303056078741115, + "grad_norm": 0.2261781245470047, + "learning_rate": 6.984696439987715e-06, + "loss": 1.1361, + "step": 13666 + }, + { + "epoch": 0.830366364906738, + "grad_norm": 0.12320876121520996, + "learning_rate": 6.979817240659481e-06, + "loss": 1.0552, + "step": 13667 + }, + { + "epoch": 0.8304271219393645, + "grad_norm": 3.8952367305755615, + "learning_rate": 6.974939618244364e-06, + "loss": 1.0586, + "step": 13668 + }, + { + "epoch": 0.8304878789719911, + "grad_norm": 0.10868556797504425, + "learning_rate": 6.970063572921137e-06, + "loss": 1.0734, + "step": 13669 + }, + { + "epoch": 0.8305486360046175, + "grad_norm": 0.11588618904352188, + "learning_rate": 6.965189104868564e-06, + "loss": 1.0109, + "step": 13670 + }, + { + "epoch": 0.830609393037244, + "grad_norm": 0.10129258036613464, + "learning_rate": 6.960316214265311e-06, + "loss": 1.0545, + "step": 13671 + }, + { + "epoch": 0.8306701500698705, + "grad_norm": 0.3364144563674927, + "learning_rate": 6.9554449012899755e-06, + "loss": 1.1406, + "step": 13672 + }, + { + "epoch": 0.8307309071024971, + "grad_norm": 0.19322070479393005, + "learning_rate": 6.950575166121126e-06, + "loss": 1.1751, + "step": 13673 + }, + { + "epoch": 0.8307916641351236, + "grad_norm": 0.10311407595872879, + "learning_rate": 6.9457070089372575e-06, + "loss": 1.0058, + "step": 13674 + }, + { + "epoch": 0.8308524211677502, + "grad_norm": 0.1127227172255516, + "learning_rate": 6.940840429916828e-06, + "loss": 0.9993, + "step": 13675 + }, + { + "epoch": 0.8309131782003767, + "grad_norm": 0.2369873970746994, + "learning_rate": 6.935975429238218e-06, + "loss": 1.129, + "step": 13676 + }, + { + "epoch": 0.8309739352330032, + "grad_norm": 0.13883095979690552, + "learning_rate": 6.931112007079749e-06, + "loss": 1.0715, + "step": 13677 + }, + { + "epoch": 0.8310346922656298, + "grad_norm": 0.22597213089466095, + "learning_rate": 6.926250163619713e-06, + "loss": 1.0718, + "step": 13678 + }, + { + "epoch": 0.8310954492982563, + "grad_norm": 0.1657181680202484, + "learning_rate": 6.9213898990362736e-06, + "loss": 1.0405, + "step": 13679 + }, + { + "epoch": 0.8311562063308828, + "grad_norm": 0.3855338990688324, + "learning_rate": 6.916531213507626e-06, + "loss": 1.1183, + "step": 13680 + }, + { + "epoch": 0.8312169633635094, + "grad_norm": 0.12398028373718262, + "learning_rate": 6.911674107211858e-06, + "loss": 1.0507, + "step": 13681 + }, + { + "epoch": 0.8312777203961359, + "grad_norm": 0.16409063339233398, + "learning_rate": 6.906818580327007e-06, + "loss": 1.0393, + "step": 13682 + }, + { + "epoch": 0.8313384774287624, + "grad_norm": 0.18331725895404816, + "learning_rate": 6.901964633031044e-06, + "loss": 1.02, + "step": 13683 + }, + { + "epoch": 0.8313992344613889, + "grad_norm": 0.15774329006671906, + "learning_rate": 6.897112265501904e-06, + "loss": 1.0643, + "step": 13684 + }, + { + "epoch": 0.8314599914940154, + "grad_norm": 0.9368786215782166, + "learning_rate": 6.892261477917444e-06, + "loss": 1.204, + "step": 13685 + }, + { + "epoch": 0.8315207485266419, + "grad_norm": 0.26125773787498474, + "learning_rate": 6.8874122704554676e-06, + "loss": 1.1126, + "step": 13686 + }, + { + "epoch": 0.8315815055592685, + "grad_norm": 0.4213768541812897, + "learning_rate": 6.882564643293732e-06, + "loss": 1.107, + "step": 13687 + }, + { + "epoch": 0.831642262591895, + "grad_norm": 0.1398756057024002, + "learning_rate": 6.8777185966099225e-06, + "loss": 1.0566, + "step": 13688 + }, + { + "epoch": 0.8317030196245215, + "grad_norm": 0.14308799803256989, + "learning_rate": 6.872874130581674e-06, + "loss": 1.101, + "step": 13689 + }, + { + "epoch": 0.8317637766571481, + "grad_norm": 0.21111813187599182, + "learning_rate": 6.868031245386547e-06, + "loss": 1.1003, + "step": 13690 + }, + { + "epoch": 0.8318245336897746, + "grad_norm": 0.2287217676639557, + "learning_rate": 6.8631899412020974e-06, + "loss": 1.0466, + "step": 13691 + }, + { + "epoch": 0.8318852907224011, + "grad_norm": 0.19451963901519775, + "learning_rate": 6.858350218205745e-06, + "loss": 1.0643, + "step": 13692 + }, + { + "epoch": 0.8319460477550277, + "grad_norm": 0.157549649477005, + "learning_rate": 6.853512076574908e-06, + "loss": 1.0906, + "step": 13693 + }, + { + "epoch": 0.8320068047876542, + "grad_norm": 0.28237295150756836, + "learning_rate": 6.848675516486924e-06, + "loss": 1.1226, + "step": 13694 + }, + { + "epoch": 0.8320675618202807, + "grad_norm": 0.12846878170967102, + "learning_rate": 6.843840538119067e-06, + "loss": 1.0677, + "step": 13695 + }, + { + "epoch": 0.8321283188529073, + "grad_norm": 0.09501422941684723, + "learning_rate": 6.8390071416485905e-06, + "loss": 1.0524, + "step": 13696 + }, + { + "epoch": 0.8321890758855337, + "grad_norm": 0.17965899407863617, + "learning_rate": 6.834175327252651e-06, + "loss": 1.0786, + "step": 13697 + }, + { + "epoch": 0.8322498329181602, + "grad_norm": 0.1301775574684143, + "learning_rate": 6.829345095108364e-06, + "loss": 1.0571, + "step": 13698 + }, + { + "epoch": 0.8323105899507868, + "grad_norm": 0.23039108514785767, + "learning_rate": 6.82451644539277e-06, + "loss": 1.1818, + "step": 13699 + }, + { + "epoch": 0.8323713469834133, + "grad_norm": 0.2390698790550232, + "learning_rate": 6.819689378282862e-06, + "loss": 1.0777, + "step": 13700 + }, + { + "epoch": 0.8324321040160398, + "grad_norm": 0.16512732207775116, + "learning_rate": 6.814863893955598e-06, + "loss": 1.0842, + "step": 13701 + }, + { + "epoch": 0.8324928610486664, + "grad_norm": 0.4279487729072571, + "learning_rate": 6.810039992587841e-06, + "loss": 1.0516, + "step": 13702 + }, + { + "epoch": 0.8325536180812929, + "grad_norm": 0.23650000989437103, + "learning_rate": 6.805217674356418e-06, + "loss": 1.1109, + "step": 13703 + }, + { + "epoch": 0.8326143751139194, + "grad_norm": 0.12996117770671844, + "learning_rate": 6.8003969394380915e-06, + "loss": 0.9953, + "step": 13704 + }, + { + "epoch": 0.832675132146546, + "grad_norm": 1.8573832511901855, + "learning_rate": 6.7955777880095665e-06, + "loss": 1.0391, + "step": 13705 + }, + { + "epoch": 0.8327358891791725, + "grad_norm": 0.10993792116641998, + "learning_rate": 6.790760220247488e-06, + "loss": 1.0667, + "step": 13706 + }, + { + "epoch": 0.832796646211799, + "grad_norm": 0.1834808737039566, + "learning_rate": 6.785944236328445e-06, + "loss": 1.1014, + "step": 13707 + }, + { + "epoch": 0.8328574032444256, + "grad_norm": 0.11124822497367859, + "learning_rate": 6.7811298364289675e-06, + "loss": 1.0246, + "step": 13708 + }, + { + "epoch": 0.8329181602770521, + "grad_norm": 0.16302169859409332, + "learning_rate": 6.776317020725531e-06, + "loss": 1.0169, + "step": 13709 + }, + { + "epoch": 0.8329789173096785, + "grad_norm": 0.6890310645103455, + "learning_rate": 6.77150578939455e-06, + "loss": 1.0296, + "step": 13710 + }, + { + "epoch": 0.8330396743423051, + "grad_norm": 0.11164145916700363, + "learning_rate": 6.766696142612383e-06, + "loss": 1.0556, + "step": 13711 + }, + { + "epoch": 0.8331004313749316, + "grad_norm": 0.12510815262794495, + "learning_rate": 6.761888080555323e-06, + "loss": 1.0287, + "step": 13712 + }, + { + "epoch": 0.8331611884075582, + "grad_norm": 0.31896454095840454, + "learning_rate": 6.757081603399612e-06, + "loss": 1.0798, + "step": 13713 + }, + { + "epoch": 0.8332219454401847, + "grad_norm": 0.2016814947128296, + "learning_rate": 6.752276711321432e-06, + "loss": 1.2079, + "step": 13714 + }, + { + "epoch": 0.8332827024728112, + "grad_norm": 0.2229379564523697, + "learning_rate": 6.7474734044969016e-06, + "loss": 1.1087, + "step": 13715 + }, + { + "epoch": 0.8333434595054378, + "grad_norm": 0.2151334434747696, + "learning_rate": 6.742671683102103e-06, + "loss": 1.1717, + "step": 13716 + }, + { + "epoch": 0.8334042165380643, + "grad_norm": 0.19906583428382874, + "learning_rate": 6.737871547313035e-06, + "loss": 1.0985, + "step": 13717 + }, + { + "epoch": 0.8334649735706908, + "grad_norm": 0.11293461918830872, + "learning_rate": 6.733072997305661e-06, + "loss": 0.9888, + "step": 13718 + }, + { + "epoch": 0.8335257306033174, + "grad_norm": 0.09920281916856766, + "learning_rate": 6.728276033255848e-06, + "loss": 1.0293, + "step": 13719 + }, + { + "epoch": 0.8335864876359439, + "grad_norm": 0.1459367573261261, + "learning_rate": 6.723480655339431e-06, + "loss": 1.0148, + "step": 13720 + }, + { + "epoch": 0.8336472446685704, + "grad_norm": 0.23297899961471558, + "learning_rate": 6.718686863732204e-06, + "loss": 1.128, + "step": 13721 + }, + { + "epoch": 0.833708001701197, + "grad_norm": 0.17642824351787567, + "learning_rate": 6.713894658609876e-06, + "loss": 1.0808, + "step": 13722 + }, + { + "epoch": 0.8337687587338234, + "grad_norm": 0.12494087219238281, + "learning_rate": 6.709104040148106e-06, + "loss": 0.9638, + "step": 13723 + }, + { + "epoch": 0.8338295157664499, + "grad_norm": 0.22081853449344635, + "learning_rate": 6.7043150085225e-06, + "loss": 1.1811, + "step": 13724 + }, + { + "epoch": 0.8338902727990765, + "grad_norm": 0.24359643459320068, + "learning_rate": 6.699527563908592e-06, + "loss": 1.2602, + "step": 13725 + }, + { + "epoch": 0.833951029831703, + "grad_norm": 0.10297096520662308, + "learning_rate": 6.694741706481872e-06, + "loss": 1.0032, + "step": 13726 + }, + { + "epoch": 0.8340117868643295, + "grad_norm": 0.0980038195848465, + "learning_rate": 6.689957436417766e-06, + "loss": 1.0307, + "step": 13727 + }, + { + "epoch": 0.8340725438969561, + "grad_norm": 0.14997591078281403, + "learning_rate": 6.685174753891643e-06, + "loss": 1.0007, + "step": 13728 + }, + { + "epoch": 0.8341333009295826, + "grad_norm": 0.20200970768928528, + "learning_rate": 6.68039365907881e-06, + "loss": 1.1032, + "step": 13729 + }, + { + "epoch": 0.8341940579622091, + "grad_norm": 0.2090281993150711, + "learning_rate": 6.675614152154519e-06, + "loss": 1.1025, + "step": 13730 + }, + { + "epoch": 0.8342548149948357, + "grad_norm": 0.19217103719711304, + "learning_rate": 6.67083623329397e-06, + "loss": 1.0885, + "step": 13731 + }, + { + "epoch": 0.8343155720274622, + "grad_norm": 0.10333271324634552, + "learning_rate": 6.666059902672295e-06, + "loss": 1.0105, + "step": 13732 + }, + { + "epoch": 0.8343763290600887, + "grad_norm": 0.15675486624240875, + "learning_rate": 6.661285160464564e-06, + "loss": 1.0837, + "step": 13733 + }, + { + "epoch": 0.8344370860927153, + "grad_norm": 0.10371602326631546, + "learning_rate": 6.656512006845805e-06, + "loss": 1.0317, + "step": 13734 + }, + { + "epoch": 0.8344978431253418, + "grad_norm": 0.24126096069812775, + "learning_rate": 6.6517404419909835e-06, + "loss": 1.108, + "step": 13735 + }, + { + "epoch": 0.8345586001579682, + "grad_norm": 0.1097838431596756, + "learning_rate": 6.646970466074975e-06, + "loss": 0.9996, + "step": 13736 + }, + { + "epoch": 0.8346193571905948, + "grad_norm": 0.1136871725320816, + "learning_rate": 6.642202079272658e-06, + "loss": 1.0573, + "step": 13737 + }, + { + "epoch": 0.8346801142232213, + "grad_norm": 0.17664246261119843, + "learning_rate": 6.637435281758819e-06, + "loss": 1.1226, + "step": 13738 + }, + { + "epoch": 0.8347408712558478, + "grad_norm": 0.19927391409873962, + "learning_rate": 6.632670073708158e-06, + "loss": 1.0615, + "step": 13739 + }, + { + "epoch": 0.8348016282884744, + "grad_norm": 0.17190268635749817, + "learning_rate": 6.627906455295358e-06, + "loss": 1.0155, + "step": 13740 + }, + { + "epoch": 0.8348623853211009, + "grad_norm": 0.10104475915431976, + "learning_rate": 6.623144426695016e-06, + "loss": 1.001, + "step": 13741 + }, + { + "epoch": 0.8349231423537274, + "grad_norm": 0.11691311001777649, + "learning_rate": 6.618383988081717e-06, + "loss": 1.0219, + "step": 13742 + }, + { + "epoch": 0.834983899386354, + "grad_norm": 0.16260352730751038, + "learning_rate": 6.613625139629936e-06, + "loss": 1.0935, + "step": 13743 + }, + { + "epoch": 0.8350446564189805, + "grad_norm": 0.16674138605594635, + "learning_rate": 6.608867881514108e-06, + "loss": 1.0991, + "step": 13744 + }, + { + "epoch": 0.835105413451607, + "grad_norm": 0.10440758615732193, + "learning_rate": 6.604112213908631e-06, + "loss": 1.0249, + "step": 13745 + }, + { + "epoch": 0.8351661704842336, + "grad_norm": 0.11826813966035843, + "learning_rate": 6.5993581369877874e-06, + "loss": 0.995, + "step": 13746 + }, + { + "epoch": 0.8352269275168601, + "grad_norm": 0.18261979520320892, + "learning_rate": 6.594605650925867e-06, + "loss": 0.9989, + "step": 13747 + }, + { + "epoch": 0.8352876845494867, + "grad_norm": 0.12629005312919617, + "learning_rate": 6.589854755897068e-06, + "loss": 1.0658, + "step": 13748 + }, + { + "epoch": 0.8353484415821131, + "grad_norm": 0.10424727201461792, + "learning_rate": 6.585105452075535e-06, + "loss": 1.0041, + "step": 13749 + }, + { + "epoch": 0.8354091986147396, + "grad_norm": 0.22772297263145447, + "learning_rate": 6.580357739635357e-06, + "loss": 1.3067, + "step": 13750 + }, + { + "epoch": 0.8354699556473661, + "grad_norm": 0.1549176573753357, + "learning_rate": 6.575611618750555e-06, + "loss": 1.0807, + "step": 13751 + }, + { + "epoch": 0.8355307126799927, + "grad_norm": 0.11761268973350525, + "learning_rate": 6.5708670895951e-06, + "loss": 1.0924, + "step": 13752 + }, + { + "epoch": 0.8355914697126192, + "grad_norm": 0.10721135884523392, + "learning_rate": 6.566124152342912e-06, + "loss": 1.0559, + "step": 13753 + }, + { + "epoch": 0.8356522267452458, + "grad_norm": 0.12835678458213806, + "learning_rate": 6.5613828071678374e-06, + "loss": 1.0451, + "step": 13754 + }, + { + "epoch": 0.8357129837778723, + "grad_norm": 0.18677636981010437, + "learning_rate": 6.556643054243672e-06, + "loss": 1.0333, + "step": 13755 + }, + { + "epoch": 0.8357737408104988, + "grad_norm": 0.10661299526691437, + "learning_rate": 6.551904893744154e-06, + "loss": 1.0408, + "step": 13756 + }, + { + "epoch": 0.8358344978431254, + "grad_norm": 0.6153198480606079, + "learning_rate": 6.547168325842945e-06, + "loss": 1.0523, + "step": 13757 + }, + { + "epoch": 0.8358952548757519, + "grad_norm": 0.12669655680656433, + "learning_rate": 6.542433350713706e-06, + "loss": 1.026, + "step": 13758 + }, + { + "epoch": 0.8359560119083784, + "grad_norm": 0.14585493505001068, + "learning_rate": 6.537699968529965e-06, + "loss": 1.0532, + "step": 13759 + }, + { + "epoch": 0.836016768941005, + "grad_norm": 0.18196462094783783, + "learning_rate": 6.532968179465227e-06, + "loss": 1.1514, + "step": 13760 + }, + { + "epoch": 0.8360775259736315, + "grad_norm": 0.15091489255428314, + "learning_rate": 6.528237983692942e-06, + "loss": 1.0346, + "step": 13761 + }, + { + "epoch": 0.8361382830062579, + "grad_norm": 0.20028628408908844, + "learning_rate": 6.523509381386489e-06, + "loss": 1.0668, + "step": 13762 + }, + { + "epoch": 0.8361990400388845, + "grad_norm": 0.1909007877111435, + "learning_rate": 6.5187823727192125e-06, + "loss": 0.994, + "step": 13763 + }, + { + "epoch": 0.836259797071511, + "grad_norm": 9.37720012664795, + "learning_rate": 6.514056957864373e-06, + "loss": 1.314, + "step": 13764 + }, + { + "epoch": 0.8363205541041375, + "grad_norm": 0.13656282424926758, + "learning_rate": 6.509333136995194e-06, + "loss": 0.9967, + "step": 13765 + }, + { + "epoch": 0.8363813111367641, + "grad_norm": 0.13284167647361755, + "learning_rate": 6.504610910284803e-06, + "loss": 1.0325, + "step": 13766 + }, + { + "epoch": 0.8364420681693906, + "grad_norm": 0.13866139948368073, + "learning_rate": 6.499890277906295e-06, + "loss": 1.0621, + "step": 13767 + }, + { + "epoch": 0.8365028252020171, + "grad_norm": 0.10912806540727615, + "learning_rate": 6.495171240032733e-06, + "loss": 1.0505, + "step": 13768 + }, + { + "epoch": 0.8365635822346437, + "grad_norm": 0.14385144412517548, + "learning_rate": 6.490453796837071e-06, + "loss": 1.0154, + "step": 13769 + }, + { + "epoch": 0.8366243392672702, + "grad_norm": 0.19134403765201569, + "learning_rate": 6.4857379484922375e-06, + "loss": 1.1225, + "step": 13770 + }, + { + "epoch": 0.8366850962998967, + "grad_norm": 0.4064330458641052, + "learning_rate": 6.481023695171107e-06, + "loss": 1.0581, + "step": 13771 + }, + { + "epoch": 0.8367458533325233, + "grad_norm": 0.12208230793476105, + "learning_rate": 6.47631103704644e-06, + "loss": 1.0183, + "step": 13772 + }, + { + "epoch": 0.8368066103651498, + "grad_norm": 0.09965897351503372, + "learning_rate": 6.471599974291009e-06, + "loss": 1.0188, + "step": 13773 + }, + { + "epoch": 0.8368673673977763, + "grad_norm": 0.2437160164117813, + "learning_rate": 6.4668905070775055e-06, + "loss": 1.125, + "step": 13774 + }, + { + "epoch": 0.8369281244304028, + "grad_norm": 0.10612661391496658, + "learning_rate": 6.46218263557854e-06, + "loss": 1.022, + "step": 13775 + }, + { + "epoch": 0.8369888814630293, + "grad_norm": 0.12046092748641968, + "learning_rate": 6.4574763599666856e-06, + "loss": 1.0311, + "step": 13776 + }, + { + "epoch": 0.8370496384956558, + "grad_norm": 0.16663911938667297, + "learning_rate": 6.452771680414449e-06, + "loss": 1.1517, + "step": 13777 + }, + { + "epoch": 0.8371103955282824, + "grad_norm": 0.2208757847547531, + "learning_rate": 6.448068597094292e-06, + "loss": 1.1361, + "step": 13778 + }, + { + "epoch": 0.8371711525609089, + "grad_norm": 0.1213504746556282, + "learning_rate": 6.443367110178594e-06, + "loss": 1.064, + "step": 13779 + }, + { + "epoch": 0.8372319095935354, + "grad_norm": 0.16529811918735504, + "learning_rate": 6.438667219839689e-06, + "loss": 1.0732, + "step": 13780 + }, + { + "epoch": 0.837292666626162, + "grad_norm": 0.18973691761493683, + "learning_rate": 6.433968926249867e-06, + "loss": 1.0192, + "step": 13781 + }, + { + "epoch": 0.8373534236587885, + "grad_norm": 2.0302512645721436, + "learning_rate": 6.429272229581329e-06, + "loss": 1.091, + "step": 13782 + }, + { + "epoch": 0.837414180691415, + "grad_norm": 0.1460111439228058, + "learning_rate": 6.424577130006232e-06, + "loss": 1.0026, + "step": 13783 + }, + { + "epoch": 0.8374749377240416, + "grad_norm": 0.177422896027565, + "learning_rate": 6.419883627696688e-06, + "loss": 1.1193, + "step": 13784 + }, + { + "epoch": 0.8375356947566681, + "grad_norm": 0.11280520260334015, + "learning_rate": 6.415191722824749e-06, + "loss": 1.028, + "step": 13785 + }, + { + "epoch": 0.8375964517892946, + "grad_norm": 0.12574908137321472, + "learning_rate": 6.41050141556237e-06, + "loss": 1.0779, + "step": 13786 + }, + { + "epoch": 0.8376572088219212, + "grad_norm": 0.13504955172538757, + "learning_rate": 6.405812706081488e-06, + "loss": 1.0827, + "step": 13787 + }, + { + "epoch": 0.8377179658545477, + "grad_norm": 0.11524248868227005, + "learning_rate": 6.401125594553959e-06, + "loss": 1.0086, + "step": 13788 + }, + { + "epoch": 0.8377787228871741, + "grad_norm": 0.22904406487941742, + "learning_rate": 6.396440081151606e-06, + "loss": 1.161, + "step": 13789 + }, + { + "epoch": 0.8378394799198007, + "grad_norm": 0.12628456950187683, + "learning_rate": 6.391756166046176e-06, + "loss": 1.0432, + "step": 13790 + }, + { + "epoch": 0.8379002369524272, + "grad_norm": 0.17692239582538605, + "learning_rate": 6.387073849409347e-06, + "loss": 1.0568, + "step": 13791 + }, + { + "epoch": 0.8379609939850537, + "grad_norm": 0.13144269585609436, + "learning_rate": 6.3823931314127675e-06, + "loss": 1.0635, + "step": 13792 + }, + { + "epoch": 0.8380217510176803, + "grad_norm": 0.17531993985176086, + "learning_rate": 6.377714012227981e-06, + "loss": 1.0814, + "step": 13793 + }, + { + "epoch": 0.8380825080503068, + "grad_norm": 0.2876814901828766, + "learning_rate": 6.3730364920265295e-06, + "loss": 1.1806, + "step": 13794 + }, + { + "epoch": 0.8381432650829334, + "grad_norm": 0.953540563583374, + "learning_rate": 6.3683605709798534e-06, + "loss": 1.0599, + "step": 13795 + }, + { + "epoch": 0.8382040221155599, + "grad_norm": 0.14210133254528046, + "learning_rate": 6.36368624925936e-06, + "loss": 1.0592, + "step": 13796 + }, + { + "epoch": 0.8382647791481864, + "grad_norm": 0.10907606780529022, + "learning_rate": 6.3590135270363784e-06, + "loss": 1.0734, + "step": 13797 + }, + { + "epoch": 0.838325536180813, + "grad_norm": 0.15380249917507172, + "learning_rate": 6.354342404482194e-06, + "loss": 1.0379, + "step": 13798 + }, + { + "epoch": 0.8383862932134395, + "grad_norm": 0.490646630525589, + "learning_rate": 6.349672881768021e-06, + "loss": 1.1746, + "step": 13799 + }, + { + "epoch": 0.838447050246066, + "grad_norm": 0.24855506420135498, + "learning_rate": 6.345004959065032e-06, + "loss": 1.0777, + "step": 13800 + }, + { + "epoch": 0.8385078072786926, + "grad_norm": 0.14284101128578186, + "learning_rate": 6.340338636544319e-06, + "loss": 1.0026, + "step": 13801 + }, + { + "epoch": 0.838568564311319, + "grad_norm": 0.09635406732559204, + "learning_rate": 6.335673914376938e-06, + "loss": 1.0234, + "step": 13802 + }, + { + "epoch": 0.8386293213439455, + "grad_norm": 0.19218318164348602, + "learning_rate": 6.331010792733866e-06, + "loss": 1.1586, + "step": 13803 + }, + { + "epoch": 0.8386900783765721, + "grad_norm": 0.1273650974035263, + "learning_rate": 6.326349271786025e-06, + "loss": 1.0265, + "step": 13804 + }, + { + "epoch": 0.8387508354091986, + "grad_norm": 0.15145963430404663, + "learning_rate": 6.321689351704313e-06, + "loss": 1.017, + "step": 13805 + }, + { + "epoch": 0.8388115924418251, + "grad_norm": 0.19609273970127106, + "learning_rate": 6.31703103265951e-06, + "loss": 1.1041, + "step": 13806 + }, + { + "epoch": 0.8388723494744517, + "grad_norm": 0.21271000802516937, + "learning_rate": 6.312374314822378e-06, + "loss": 1.2021, + "step": 13807 + }, + { + "epoch": 0.8389331065070782, + "grad_norm": 0.11968686431646347, + "learning_rate": 6.307719198363615e-06, + "loss": 1.0439, + "step": 13808 + }, + { + "epoch": 0.8389938635397047, + "grad_norm": 0.12191746383905411, + "learning_rate": 6.303065683453835e-06, + "loss": 1.0517, + "step": 13809 + }, + { + "epoch": 0.8390546205723313, + "grad_norm": 0.1229800134897232, + "learning_rate": 6.298413770263639e-06, + "loss": 1.0184, + "step": 13810 + }, + { + "epoch": 0.8391153776049578, + "grad_norm": 0.15104541182518005, + "learning_rate": 6.293763458963536e-06, + "loss": 1.0986, + "step": 13811 + }, + { + "epoch": 0.8391761346375843, + "grad_norm": 0.11182752996683121, + "learning_rate": 6.28911474972399e-06, + "loss": 1.0656, + "step": 13812 + }, + { + "epoch": 0.8392368916702109, + "grad_norm": 0.11685499548912048, + "learning_rate": 6.284467642715375e-06, + "loss": 1.0433, + "step": 13813 + }, + { + "epoch": 0.8392976487028374, + "grad_norm": 0.1364884078502655, + "learning_rate": 6.279822138108054e-06, + "loss": 1.0881, + "step": 13814 + }, + { + "epoch": 0.8393584057354638, + "grad_norm": 0.17139606177806854, + "learning_rate": 6.27517823607231e-06, + "loss": 1.1082, + "step": 13815 + }, + { + "epoch": 0.8394191627680904, + "grad_norm": 0.19314952194690704, + "learning_rate": 6.270535936778354e-06, + "loss": 1.113, + "step": 13816 + }, + { + "epoch": 0.8394799198007169, + "grad_norm": 0.2756255269050598, + "learning_rate": 6.265895240396358e-06, + "loss": 1.1312, + "step": 13817 + }, + { + "epoch": 0.8395406768333434, + "grad_norm": 0.1790992170572281, + "learning_rate": 6.261256147096423e-06, + "loss": 1.0781, + "step": 13818 + }, + { + "epoch": 0.83960143386597, + "grad_norm": 0.12227267771959305, + "learning_rate": 6.256618657048602e-06, + "loss": 1.0565, + "step": 13819 + }, + { + "epoch": 0.8396621908985965, + "grad_norm": 0.14711852371692657, + "learning_rate": 6.2519827704228765e-06, + "loss": 1.0691, + "step": 13820 + }, + { + "epoch": 0.839722947931223, + "grad_norm": 1.5554476976394653, + "learning_rate": 6.247348487389182e-06, + "loss": 1.0783, + "step": 13821 + }, + { + "epoch": 0.8397837049638496, + "grad_norm": 0.19400683045387268, + "learning_rate": 6.242715808117383e-06, + "loss": 1.1859, + "step": 13822 + }, + { + "epoch": 0.8398444619964761, + "grad_norm": 0.17753420770168304, + "learning_rate": 6.238084732777294e-06, + "loss": 1.1051, + "step": 13823 + }, + { + "epoch": 0.8399052190291026, + "grad_norm": 0.17847858369350433, + "learning_rate": 6.233455261538662e-06, + "loss": 1.1659, + "step": 13824 + }, + { + "epoch": 0.8399659760617292, + "grad_norm": 0.10870621353387833, + "learning_rate": 6.228827394571201e-06, + "loss": 1.001, + "step": 13825 + }, + { + "epoch": 0.8400267330943557, + "grad_norm": 0.12984980642795563, + "learning_rate": 6.224201132044527e-06, + "loss": 1.0062, + "step": 13826 + }, + { + "epoch": 0.8400874901269823, + "grad_norm": 0.12406597286462784, + "learning_rate": 6.219576474128224e-06, + "loss": 1.0093, + "step": 13827 + }, + { + "epoch": 0.8401482471596087, + "grad_norm": 0.09741330146789551, + "learning_rate": 6.214953420991803e-06, + "loss": 1.0695, + "step": 13828 + }, + { + "epoch": 0.8402090041922352, + "grad_norm": 0.11878557503223419, + "learning_rate": 6.210331972804723e-06, + "loss": 1.1275, + "step": 13829 + }, + { + "epoch": 0.8402697612248617, + "grad_norm": 0.25833576917648315, + "learning_rate": 6.205712129736396e-06, + "loss": 1.3048, + "step": 13830 + }, + { + "epoch": 0.8403305182574883, + "grad_norm": 0.13255338370800018, + "learning_rate": 6.201093891956156e-06, + "loss": 1.012, + "step": 13831 + }, + { + "epoch": 0.8403912752901148, + "grad_norm": 0.13217753171920776, + "learning_rate": 6.196477259633299e-06, + "loss": 1.0539, + "step": 13832 + }, + { + "epoch": 0.8404520323227413, + "grad_norm": 0.7342213988304138, + "learning_rate": 6.191862232937018e-06, + "loss": 1.1737, + "step": 13833 + }, + { + "epoch": 0.8405127893553679, + "grad_norm": 0.39364326000213623, + "learning_rate": 6.187248812036489e-06, + "loss": 1.1532, + "step": 13834 + }, + { + "epoch": 0.8405735463879944, + "grad_norm": 0.09241849929094315, + "learning_rate": 6.182636997100833e-06, + "loss": 0.9985, + "step": 13835 + }, + { + "epoch": 0.840634303420621, + "grad_norm": 0.22844815254211426, + "learning_rate": 6.178026788299091e-06, + "loss": 1.0934, + "step": 13836 + }, + { + "epoch": 0.8406950604532475, + "grad_norm": 0.16626201570034027, + "learning_rate": 6.17341818580024e-06, + "loss": 1.0266, + "step": 13837 + }, + { + "epoch": 0.840755817485874, + "grad_norm": 0.10953666269779205, + "learning_rate": 6.168811189773233e-06, + "loss": 1.0366, + "step": 13838 + }, + { + "epoch": 0.8408165745185006, + "grad_norm": 0.12286390364170074, + "learning_rate": 6.164205800386902e-06, + "loss": 1.0569, + "step": 13839 + }, + { + "epoch": 0.8408773315511271, + "grad_norm": 0.14571483433246613, + "learning_rate": 6.15960201781009e-06, + "loss": 1.0828, + "step": 13840 + }, + { + "epoch": 0.8409380885837535, + "grad_norm": 0.12175819277763367, + "learning_rate": 6.154999842211545e-06, + "loss": 0.9593, + "step": 13841 + }, + { + "epoch": 0.84099884561638, + "grad_norm": 0.20199422538280487, + "learning_rate": 6.150399273759949e-06, + "loss": 1.068, + "step": 13842 + }, + { + "epoch": 0.8410596026490066, + "grad_norm": 0.10887544602155685, + "learning_rate": 6.145800312623951e-06, + "loss": 1.0704, + "step": 13843 + }, + { + "epoch": 0.8411203596816331, + "grad_norm": 0.179683119058609, + "learning_rate": 6.141202958972114e-06, + "loss": 0.9995, + "step": 13844 + }, + { + "epoch": 0.8411811167142597, + "grad_norm": 0.16286370158195496, + "learning_rate": 6.1366072129729615e-06, + "loss": 0.9997, + "step": 13845 + }, + { + "epoch": 0.8412418737468862, + "grad_norm": 0.3153090178966522, + "learning_rate": 6.132013074794951e-06, + "loss": 1.2551, + "step": 13846 + }, + { + "epoch": 0.8413026307795127, + "grad_norm": 0.3104003667831421, + "learning_rate": 6.127420544606477e-06, + "loss": 1.1072, + "step": 13847 + }, + { + "epoch": 0.8413633878121393, + "grad_norm": 0.2739587724208832, + "learning_rate": 6.122829622575888e-06, + "loss": 1.1687, + "step": 13848 + }, + { + "epoch": 0.8414241448447658, + "grad_norm": 0.6147429347038269, + "learning_rate": 6.118240308871459e-06, + "loss": 0.9993, + "step": 13849 + }, + { + "epoch": 0.8414849018773923, + "grad_norm": 0.15113450586795807, + "learning_rate": 6.113652603661402e-06, + "loss": 1.0322, + "step": 13850 + }, + { + "epoch": 0.8415456589100189, + "grad_norm": 0.23021289706230164, + "learning_rate": 6.109066507113903e-06, + "loss": 1.1207, + "step": 13851 + }, + { + "epoch": 0.8416064159426454, + "grad_norm": 0.6288475394248962, + "learning_rate": 6.104482019397068e-06, + "loss": 1.1348, + "step": 13852 + }, + { + "epoch": 0.8416671729752719, + "grad_norm": 0.480734258890152, + "learning_rate": 6.099899140678916e-06, + "loss": 1.1913, + "step": 13853 + }, + { + "epoch": 0.8417279300078984, + "grad_norm": 0.1646154522895813, + "learning_rate": 6.095317871127448e-06, + "loss": 1.1624, + "step": 13854 + }, + { + "epoch": 0.8417886870405249, + "grad_norm": 0.11706171184778214, + "learning_rate": 6.090738210910579e-06, + "loss": 1.0541, + "step": 13855 + }, + { + "epoch": 0.8418494440731514, + "grad_norm": 0.1765625923871994, + "learning_rate": 6.086160160196197e-06, + "loss": 1.0522, + "step": 13856 + }, + { + "epoch": 0.841910201105778, + "grad_norm": 0.19297125935554504, + "learning_rate": 6.081583719152106e-06, + "loss": 1.162, + "step": 13857 + }, + { + "epoch": 0.8419709581384045, + "grad_norm": 0.738193690776825, + "learning_rate": 6.07700888794605e-06, + "loss": 1.3687, + "step": 13858 + }, + { + "epoch": 0.842031715171031, + "grad_norm": 0.16662824153900146, + "learning_rate": 6.072435666745735e-06, + "loss": 1.1661, + "step": 13859 + }, + { + "epoch": 0.8420924722036576, + "grad_norm": 0.12282543629407883, + "learning_rate": 6.067864055718764e-06, + "loss": 1.0225, + "step": 13860 + }, + { + "epoch": 0.8421532292362841, + "grad_norm": 0.11731891334056854, + "learning_rate": 6.063294055032731e-06, + "loss": 1.0572, + "step": 13861 + }, + { + "epoch": 0.8422139862689106, + "grad_norm": 0.15579040348529816, + "learning_rate": 6.058725664855153e-06, + "loss": 1.1617, + "step": 13862 + }, + { + "epoch": 0.8422747433015372, + "grad_norm": 0.11521507054567337, + "learning_rate": 6.054158885353478e-06, + "loss": 1.037, + "step": 13863 + }, + { + "epoch": 0.8423355003341637, + "grad_norm": 0.15756145119667053, + "learning_rate": 6.049593716695107e-06, + "loss": 1.096, + "step": 13864 + }, + { + "epoch": 0.8423962573667902, + "grad_norm": 0.355928510427475, + "learning_rate": 6.045030159047366e-06, + "loss": 1.019, + "step": 13865 + }, + { + "epoch": 0.8424570143994168, + "grad_norm": 0.19757333397865295, + "learning_rate": 6.040468212577549e-06, + "loss": 1.2141, + "step": 13866 + }, + { + "epoch": 0.8425177714320432, + "grad_norm": 0.15494762361049652, + "learning_rate": 6.0359078774528625e-06, + "loss": 1.0597, + "step": 13867 + }, + { + "epoch": 0.8425785284646697, + "grad_norm": 0.11002039909362793, + "learning_rate": 6.031349153840471e-06, + "loss": 1.0286, + "step": 13868 + }, + { + "epoch": 0.8426392854972963, + "grad_norm": 0.15614134073257446, + "learning_rate": 6.026792041907475e-06, + "loss": 0.9997, + "step": 13869 + }, + { + "epoch": 0.8427000425299228, + "grad_norm": 0.1367620974779129, + "learning_rate": 6.022236541820919e-06, + "loss": 1.0647, + "step": 13870 + }, + { + "epoch": 0.8427607995625493, + "grad_norm": 0.1541340947151184, + "learning_rate": 6.01768265374777e-06, + "loss": 1.0453, + "step": 13871 + }, + { + "epoch": 0.8428215565951759, + "grad_norm": 0.10972689092159271, + "learning_rate": 6.013130377854986e-06, + "loss": 1.0448, + "step": 13872 + }, + { + "epoch": 0.8428823136278024, + "grad_norm": 0.15365009009838104, + "learning_rate": 6.008579714309398e-06, + "loss": 1.1324, + "step": 13873 + }, + { + "epoch": 0.842943070660429, + "grad_norm": 0.40709957480430603, + "learning_rate": 6.004030663277832e-06, + "loss": 1.4185, + "step": 13874 + }, + { + "epoch": 0.8430038276930555, + "grad_norm": 0.13938423991203308, + "learning_rate": 5.999483224927016e-06, + "loss": 1.0445, + "step": 13875 + }, + { + "epoch": 0.843064584725682, + "grad_norm": 0.16435745358467102, + "learning_rate": 5.994937399423645e-06, + "loss": 1.0489, + "step": 13876 + }, + { + "epoch": 0.8431253417583086, + "grad_norm": 0.15925735235214233, + "learning_rate": 5.990393186934357e-06, + "loss": 1.0384, + "step": 13877 + }, + { + "epoch": 0.8431860987909351, + "grad_norm": 0.11503589153289795, + "learning_rate": 5.9858505876257136e-06, + "loss": 1.0924, + "step": 13878 + }, + { + "epoch": 0.8432468558235616, + "grad_norm": 0.1875385046005249, + "learning_rate": 5.981309601664237e-06, + "loss": 1.1136, + "step": 13879 + }, + { + "epoch": 0.843307612856188, + "grad_norm": 0.14069220423698425, + "learning_rate": 5.97677022921635e-06, + "loss": 1.096, + "step": 13880 + }, + { + "epoch": 0.8433683698888146, + "grad_norm": 0.10320793092250824, + "learning_rate": 5.972232470448452e-06, + "loss": 0.972, + "step": 13881 + }, + { + "epoch": 0.8434291269214411, + "grad_norm": 0.2010670155286789, + "learning_rate": 5.967696325526895e-06, + "loss": 0.9984, + "step": 13882 + }, + { + "epoch": 0.8434898839540677, + "grad_norm": 0.13203686475753784, + "learning_rate": 5.9631617946179385e-06, + "loss": 1.0238, + "step": 13883 + }, + { + "epoch": 0.8435506409866942, + "grad_norm": 1.4844870567321777, + "learning_rate": 5.958628877887801e-06, + "loss": 1.0436, + "step": 13884 + }, + { + "epoch": 0.8436113980193207, + "grad_norm": 0.16063804924488068, + "learning_rate": 5.9540975755026426e-06, + "loss": 1.1197, + "step": 13885 + }, + { + "epoch": 0.8436721550519473, + "grad_norm": 0.09038233011960983, + "learning_rate": 5.949567887628532e-06, + "loss": 1.0234, + "step": 13886 + }, + { + "epoch": 0.8437329120845738, + "grad_norm": 0.18728040158748627, + "learning_rate": 5.945039814431541e-06, + "loss": 1.1137, + "step": 13887 + }, + { + "epoch": 0.8437936691172003, + "grad_norm": 0.19329407811164856, + "learning_rate": 5.940513356077626e-06, + "loss": 1.0977, + "step": 13888 + }, + { + "epoch": 0.8438544261498269, + "grad_norm": 0.231111079454422, + "learning_rate": 5.93598851273271e-06, + "loss": 1.0863, + "step": 13889 + }, + { + "epoch": 0.8439151831824534, + "grad_norm": 0.25511226058006287, + "learning_rate": 5.931465284562648e-06, + "loss": 1.1529, + "step": 13890 + }, + { + "epoch": 0.8439759402150799, + "grad_norm": 0.12302915006875992, + "learning_rate": 5.926943671733254e-06, + "loss": 1.0417, + "step": 13891 + }, + { + "epoch": 0.8440366972477065, + "grad_norm": 0.139537513256073, + "learning_rate": 5.922423674410255e-06, + "loss": 1.0403, + "step": 13892 + }, + { + "epoch": 0.844097454280333, + "grad_norm": 0.13094432651996613, + "learning_rate": 5.917905292759334e-06, + "loss": 1.0463, + "step": 13893 + }, + { + "epoch": 0.8441582113129594, + "grad_norm": 0.2075701355934143, + "learning_rate": 5.913388526946117e-06, + "loss": 1.0801, + "step": 13894 + }, + { + "epoch": 0.844218968345586, + "grad_norm": 0.18712620437145233, + "learning_rate": 5.9088733771361645e-06, + "loss": 1.1441, + "step": 13895 + }, + { + "epoch": 0.8442797253782125, + "grad_norm": 0.19830457866191864, + "learning_rate": 5.904359843494978e-06, + "loss": 1.1013, + "step": 13896 + }, + { + "epoch": 0.844340482410839, + "grad_norm": 1.6655857563018799, + "learning_rate": 5.899847926188001e-06, + "loss": 1.0438, + "step": 13897 + }, + { + "epoch": 0.8444012394434656, + "grad_norm": 0.11538496613502502, + "learning_rate": 5.895337625380632e-06, + "loss": 0.9785, + "step": 13898 + }, + { + "epoch": 0.8444619964760921, + "grad_norm": 0.1874082088470459, + "learning_rate": 5.890828941238191e-06, + "loss": 1.0697, + "step": 13899 + }, + { + "epoch": 0.8445227535087186, + "grad_norm": 0.18893639743328094, + "learning_rate": 5.886321873925937e-06, + "loss": 1.0719, + "step": 13900 + }, + { + "epoch": 0.8445835105413452, + "grad_norm": 0.19268958270549774, + "learning_rate": 5.881816423609082e-06, + "loss": 1.1023, + "step": 13901 + }, + { + "epoch": 0.8446442675739717, + "grad_norm": 0.14129790663719177, + "learning_rate": 5.87731259045276e-06, + "loss": 1.0181, + "step": 13902 + }, + { + "epoch": 0.8447050246065982, + "grad_norm": 0.11569572240114212, + "learning_rate": 5.8728103746220894e-06, + "loss": 1.1015, + "step": 13903 + }, + { + "epoch": 0.8447657816392248, + "grad_norm": 0.2612714469432831, + "learning_rate": 5.868309776282077e-06, + "loss": 1.0468, + "step": 13904 + }, + { + "epoch": 0.8448265386718513, + "grad_norm": 0.16901415586471558, + "learning_rate": 5.863810795597718e-06, + "loss": 1.1631, + "step": 13905 + }, + { + "epoch": 0.8448872957044778, + "grad_norm": 0.20452101528644562, + "learning_rate": 5.859313432733882e-06, + "loss": 1.062, + "step": 13906 + }, + { + "epoch": 0.8449480527371043, + "grad_norm": 0.20321491360664368, + "learning_rate": 5.854817687855451e-06, + "loss": 1.059, + "step": 13907 + }, + { + "epoch": 0.8450088097697308, + "grad_norm": 0.1561172753572464, + "learning_rate": 5.850323561127213e-06, + "loss": 1.1638, + "step": 13908 + }, + { + "epoch": 0.8450695668023573, + "grad_norm": 0.14651024341583252, + "learning_rate": 5.845831052713902e-06, + "loss": 1.0839, + "step": 13909 + }, + { + "epoch": 0.8451303238349839, + "grad_norm": 0.11238010972738266, + "learning_rate": 5.841340162780184e-06, + "loss": 1.0364, + "step": 13910 + }, + { + "epoch": 0.8451910808676104, + "grad_norm": 0.0998220443725586, + "learning_rate": 5.836850891490681e-06, + "loss": 0.9538, + "step": 13911 + }, + { + "epoch": 0.845251837900237, + "grad_norm": 3.9812381267547607, + "learning_rate": 5.832363239009947e-06, + "loss": 1.0316, + "step": 13912 + }, + { + "epoch": 0.8453125949328635, + "grad_norm": 0.1369580626487732, + "learning_rate": 5.827877205502469e-06, + "loss": 1.0209, + "step": 13913 + }, + { + "epoch": 0.84537335196549, + "grad_norm": 1.0016809701919556, + "learning_rate": 5.823392791132698e-06, + "loss": 1.1488, + "step": 13914 + }, + { + "epoch": 0.8454341089981166, + "grad_norm": 0.12096266448497772, + "learning_rate": 5.818909996064998e-06, + "loss": 1.0222, + "step": 13915 + }, + { + "epoch": 0.8454948660307431, + "grad_norm": 0.24976760149002075, + "learning_rate": 5.814428820463691e-06, + "loss": 1.1279, + "step": 13916 + }, + { + "epoch": 0.8455556230633696, + "grad_norm": 0.7143173217773438, + "learning_rate": 5.809949264493031e-06, + "loss": 1.0125, + "step": 13917 + }, + { + "epoch": 0.8456163800959962, + "grad_norm": 0.4793556332588196, + "learning_rate": 5.80547132831723e-06, + "loss": 1.0407, + "step": 13918 + }, + { + "epoch": 0.8456771371286227, + "grad_norm": 0.16486790776252747, + "learning_rate": 5.800995012100435e-06, + "loss": 1.1637, + "step": 13919 + }, + { + "epoch": 0.8457378941612491, + "grad_norm": 0.19586597383022308, + "learning_rate": 5.796520316006693e-06, + "loss": 1.1231, + "step": 13920 + }, + { + "epoch": 0.8457986511938757, + "grad_norm": 0.21166226267814636, + "learning_rate": 5.792047240200049e-06, + "loss": 1.0789, + "step": 13921 + }, + { + "epoch": 0.8458594082265022, + "grad_norm": 0.11669517308473587, + "learning_rate": 5.787575784844446e-06, + "loss": 1.0335, + "step": 13922 + }, + { + "epoch": 0.8459201652591287, + "grad_norm": 0.20192952454090118, + "learning_rate": 5.783105950103812e-06, + "loss": 1.0376, + "step": 13923 + }, + { + "epoch": 0.8459809222917553, + "grad_norm": 0.10634389519691467, + "learning_rate": 5.778637736141973e-06, + "loss": 0.9854, + "step": 13924 + }, + { + "epoch": 0.8460416793243818, + "grad_norm": 1.1997287273406982, + "learning_rate": 5.7741711431227255e-06, + "loss": 1.1657, + "step": 13925 + }, + { + "epoch": 0.8461024363570083, + "grad_norm": 0.23737677931785583, + "learning_rate": 5.769706171209777e-06, + "loss": 1.1759, + "step": 13926 + }, + { + "epoch": 0.8461631933896349, + "grad_norm": 0.13504615426063538, + "learning_rate": 5.7652428205667855e-06, + "loss": 1.0699, + "step": 13927 + }, + { + "epoch": 0.8462239504222614, + "grad_norm": 0.14231258630752563, + "learning_rate": 5.760781091357376e-06, + "loss": 1.0612, + "step": 13928 + }, + { + "epoch": 0.8462847074548879, + "grad_norm": 0.1106368899345398, + "learning_rate": 5.756320983745089e-06, + "loss": 1.0621, + "step": 13929 + }, + { + "epoch": 0.8463454644875145, + "grad_norm": 0.21288204193115234, + "learning_rate": 5.751862497893412e-06, + "loss": 1.1791, + "step": 13930 + }, + { + "epoch": 0.846406221520141, + "grad_norm": 0.16346928477287292, + "learning_rate": 5.747405633965763e-06, + "loss": 1.0293, + "step": 13931 + }, + { + "epoch": 0.8464669785527675, + "grad_norm": 0.21459725499153137, + "learning_rate": 5.742950392125518e-06, + "loss": 0.9784, + "step": 13932 + }, + { + "epoch": 0.846527735585394, + "grad_norm": 0.30261167883872986, + "learning_rate": 5.738496772535984e-06, + "loss": 1.2409, + "step": 13933 + }, + { + "epoch": 0.8465884926180205, + "grad_norm": 0.14750361442565918, + "learning_rate": 5.734044775360397e-06, + "loss": 1.083, + "step": 13934 + }, + { + "epoch": 0.846649249650647, + "grad_norm": 0.15585088729858398, + "learning_rate": 5.729594400761961e-06, + "loss": 1.0613, + "step": 13935 + }, + { + "epoch": 0.8467100066832736, + "grad_norm": 0.12576434016227722, + "learning_rate": 5.725145648903801e-06, + "loss": 1.0309, + "step": 13936 + }, + { + "epoch": 0.8467707637159001, + "grad_norm": 0.13810418546199799, + "learning_rate": 5.720698519948986e-06, + "loss": 1.0646, + "step": 13937 + }, + { + "epoch": 0.8468315207485266, + "grad_norm": 0.21547050774097443, + "learning_rate": 5.716253014060513e-06, + "loss": 1.132, + "step": 13938 + }, + { + "epoch": 0.8468922777811532, + "grad_norm": 0.16368132829666138, + "learning_rate": 5.7118091314013635e-06, + "loss": 1.0583, + "step": 13939 + }, + { + "epoch": 0.8469530348137797, + "grad_norm": 0.15535663068294525, + "learning_rate": 5.707366872134401e-06, + "loss": 1.0338, + "step": 13940 + }, + { + "epoch": 0.8470137918464062, + "grad_norm": 0.13681825995445251, + "learning_rate": 5.702926236422474e-06, + "loss": 0.9994, + "step": 13941 + }, + { + "epoch": 0.8470745488790328, + "grad_norm": 0.14484292268753052, + "learning_rate": 5.6984872244283395e-06, + "loss": 1.1008, + "step": 13942 + }, + { + "epoch": 0.8471353059116593, + "grad_norm": 0.2158062756061554, + "learning_rate": 5.694049836314713e-06, + "loss": 1.1981, + "step": 13943 + }, + { + "epoch": 0.8471960629442858, + "grad_norm": 0.1107899472117424, + "learning_rate": 5.689614072244265e-06, + "loss": 1.0519, + "step": 13944 + }, + { + "epoch": 0.8472568199769124, + "grad_norm": 0.12145136296749115, + "learning_rate": 5.685179932379581e-06, + "loss": 1.0633, + "step": 13945 + }, + { + "epoch": 0.8473175770095388, + "grad_norm": 0.11620216071605682, + "learning_rate": 5.680747416883198e-06, + "loss": 1.0644, + "step": 13946 + }, + { + "epoch": 0.8473783340421653, + "grad_norm": 0.1601884812116623, + "learning_rate": 5.676316525917575e-06, + "loss": 1.132, + "step": 13947 + }, + { + "epoch": 0.8474390910747919, + "grad_norm": 0.10534972697496414, + "learning_rate": 5.671887259645131e-06, + "loss": 0.9992, + "step": 13948 + }, + { + "epoch": 0.8474998481074184, + "grad_norm": 0.17490743100643158, + "learning_rate": 5.667459618228232e-06, + "loss": 1.2029, + "step": 13949 + }, + { + "epoch": 0.8475606051400449, + "grad_norm": 0.16057506203651428, + "learning_rate": 5.663033601829177e-06, + "loss": 1.0507, + "step": 13950 + }, + { + "epoch": 0.8476213621726715, + "grad_norm": 0.3475726246833801, + "learning_rate": 5.658609210610189e-06, + "loss": 1.1973, + "step": 13951 + }, + { + "epoch": 0.847682119205298, + "grad_norm": 0.22239616513252258, + "learning_rate": 5.6541864447334665e-06, + "loss": 1.1561, + "step": 13952 + }, + { + "epoch": 0.8477428762379245, + "grad_norm": 0.10262919962406158, + "learning_rate": 5.649765304361088e-06, + "loss": 1.0142, + "step": 13953 + }, + { + "epoch": 0.8478036332705511, + "grad_norm": 0.19047772884368896, + "learning_rate": 5.645345789655149e-06, + "loss": 1.0935, + "step": 13954 + }, + { + "epoch": 0.8478643903031776, + "grad_norm": 0.37129777669906616, + "learning_rate": 5.640927900777626e-06, + "loss": 1.1662, + "step": 13955 + }, + { + "epoch": 0.8479251473358042, + "grad_norm": 0.12124883383512497, + "learning_rate": 5.636511637890474e-06, + "loss": 1.0394, + "step": 13956 + }, + { + "epoch": 0.8479859043684307, + "grad_norm": 0.15603722631931305, + "learning_rate": 5.632097001155556e-06, + "loss": 1.1198, + "step": 13957 + }, + { + "epoch": 0.8480466614010572, + "grad_norm": 0.1487198919057846, + "learning_rate": 5.6276839907347054e-06, + "loss": 1.1079, + "step": 13958 + }, + { + "epoch": 0.8481074184336836, + "grad_norm": 0.1818862110376358, + "learning_rate": 5.62327260678967e-06, + "loss": 1.1105, + "step": 13959 + }, + { + "epoch": 0.8481681754663102, + "grad_norm": 0.13143955171108246, + "learning_rate": 5.618862849482159e-06, + "loss": 1.0979, + "step": 13960 + }, + { + "epoch": 0.8482289324989367, + "grad_norm": 0.11102840304374695, + "learning_rate": 5.6144547189738055e-06, + "loss": 1.0216, + "step": 13961 + }, + { + "epoch": 0.8482896895315633, + "grad_norm": 0.7428750991821289, + "learning_rate": 5.6100482154261945e-06, + "loss": 1.0789, + "step": 13962 + }, + { + "epoch": 0.8483504465641898, + "grad_norm": 0.185042142868042, + "learning_rate": 5.605643339000849e-06, + "loss": 1.1868, + "step": 13963 + }, + { + "epoch": 0.8484112035968163, + "grad_norm": 0.1131417453289032, + "learning_rate": 5.6012400898592166e-06, + "loss": 1.0566, + "step": 13964 + }, + { + "epoch": 0.8484719606294429, + "grad_norm": 0.12369358539581299, + "learning_rate": 5.596838468162718e-06, + "loss": 1.0471, + "step": 13965 + }, + { + "epoch": 0.8485327176620694, + "grad_norm": 0.15037371218204498, + "learning_rate": 5.592438474072703e-06, + "loss": 1.1091, + "step": 13966 + }, + { + "epoch": 0.8485934746946959, + "grad_norm": 0.11376750469207764, + "learning_rate": 5.588040107750425e-06, + "loss": 1.0585, + "step": 13967 + }, + { + "epoch": 0.8486542317273225, + "grad_norm": 0.13548539578914642, + "learning_rate": 5.583643369357122e-06, + "loss": 1.0781, + "step": 13968 + }, + { + "epoch": 0.848714988759949, + "grad_norm": 0.13752390444278717, + "learning_rate": 5.579248259053949e-06, + "loss": 1.0263, + "step": 13969 + }, + { + "epoch": 0.8487757457925755, + "grad_norm": 0.1105789989233017, + "learning_rate": 5.574854777002026e-06, + "loss": 1.0754, + "step": 13970 + }, + { + "epoch": 0.8488365028252021, + "grad_norm": 0.1519423872232437, + "learning_rate": 5.570462923362391e-06, + "loss": 1.1217, + "step": 13971 + }, + { + "epoch": 0.8488972598578285, + "grad_norm": 2.2291924953460693, + "learning_rate": 5.566072698296032e-06, + "loss": 1.1915, + "step": 13972 + }, + { + "epoch": 0.848958016890455, + "grad_norm": 0.09913677722215652, + "learning_rate": 5.56168410196386e-06, + "loss": 1.0571, + "step": 13973 + }, + { + "epoch": 0.8490187739230816, + "grad_norm": 0.10555507242679596, + "learning_rate": 5.557297134526734e-06, + "loss": 1.0546, + "step": 13974 + }, + { + "epoch": 0.8490795309557081, + "grad_norm": 0.20103463530540466, + "learning_rate": 5.552911796145488e-06, + "loss": 1.143, + "step": 13975 + }, + { + "epoch": 0.8491402879883346, + "grad_norm": 0.09908825159072876, + "learning_rate": 5.548528086980848e-06, + "loss": 1.0404, + "step": 13976 + }, + { + "epoch": 0.8492010450209612, + "grad_norm": 0.4844468832015991, + "learning_rate": 5.544146007193501e-06, + "loss": 1.2248, + "step": 13977 + }, + { + "epoch": 0.8492618020535877, + "grad_norm": 0.13323774933815002, + "learning_rate": 5.5397655569440765e-06, + "loss": 1.0883, + "step": 13978 + }, + { + "epoch": 0.8493225590862142, + "grad_norm": 0.1800452023744583, + "learning_rate": 5.535386736393139e-06, + "loss": 1.1608, + "step": 13979 + }, + { + "epoch": 0.8493833161188408, + "grad_norm": 0.18974387645721436, + "learning_rate": 5.531009545701199e-06, + "loss": 1.0439, + "step": 13980 + }, + { + "epoch": 0.8494440731514673, + "grad_norm": 0.14465534687042236, + "learning_rate": 5.5266339850286955e-06, + "loss": 1.1212, + "step": 13981 + }, + { + "epoch": 0.8495048301840938, + "grad_norm": 0.19995470345020294, + "learning_rate": 5.522260054536022e-06, + "loss": 1.1382, + "step": 13982 + }, + { + "epoch": 0.8495655872167204, + "grad_norm": 0.11444170773029327, + "learning_rate": 5.517887754383505e-06, + "loss": 1.0484, + "step": 13983 + }, + { + "epoch": 0.8496263442493469, + "grad_norm": 0.20048007369041443, + "learning_rate": 5.5135170847314065e-06, + "loss": 1.0072, + "step": 13984 + }, + { + "epoch": 0.8496871012819733, + "grad_norm": 0.12816888093948364, + "learning_rate": 5.509148045739932e-06, + "loss": 1.0688, + "step": 13985 + }, + { + "epoch": 0.8497478583145999, + "grad_norm": 0.16064612567424774, + "learning_rate": 5.504780637569251e-06, + "loss": 1.067, + "step": 13986 + }, + { + "epoch": 0.8498086153472264, + "grad_norm": 0.15055491030216217, + "learning_rate": 5.500414860379433e-06, + "loss": 1.0457, + "step": 13987 + }, + { + "epoch": 0.8498693723798529, + "grad_norm": 0.17457495629787445, + "learning_rate": 5.496050714330508e-06, + "loss": 1.086, + "step": 13988 + }, + { + "epoch": 0.8499301294124795, + "grad_norm": 0.11255787312984467, + "learning_rate": 5.491688199582446e-06, + "loss": 1.0399, + "step": 13989 + }, + { + "epoch": 0.849990886445106, + "grad_norm": 0.2116011530160904, + "learning_rate": 5.487327316295143e-06, + "loss": 1.1662, + "step": 13990 + }, + { + "epoch": 0.8500516434777325, + "grad_norm": 0.15451011061668396, + "learning_rate": 5.482968064628474e-06, + "loss": 1.0324, + "step": 13991 + }, + { + "epoch": 0.8501124005103591, + "grad_norm": 0.13074050843715668, + "learning_rate": 5.478610444742227e-06, + "loss": 0.9919, + "step": 13992 + }, + { + "epoch": 0.8501731575429856, + "grad_norm": 0.10223536938428879, + "learning_rate": 5.474254456796107e-06, + "loss": 1.0534, + "step": 13993 + }, + { + "epoch": 0.8502339145756121, + "grad_norm": 0.09384096413850784, + "learning_rate": 5.469900100949798e-06, + "loss": 1.0026, + "step": 13994 + }, + { + "epoch": 0.8502946716082387, + "grad_norm": 0.14617595076560974, + "learning_rate": 5.465547377362895e-06, + "loss": 1.3121, + "step": 13995 + }, + { + "epoch": 0.8503554286408652, + "grad_norm": 0.12430252134799957, + "learning_rate": 5.461196286194975e-06, + "loss": 1.0583, + "step": 13996 + }, + { + "epoch": 0.8504161856734918, + "grad_norm": 0.18041782081127167, + "learning_rate": 5.4568468276055165e-06, + "loss": 1.1298, + "step": 13997 + }, + { + "epoch": 0.8504769427061183, + "grad_norm": 0.13031385838985443, + "learning_rate": 5.4524990017539415e-06, + "loss": 1.075, + "step": 13998 + }, + { + "epoch": 0.8505376997387447, + "grad_norm": 0.20246997475624084, + "learning_rate": 5.4481528087996284e-06, + "loss": 1.1979, + "step": 13999 + }, + { + "epoch": 0.8505984567713712, + "grad_norm": 0.2741469740867615, + "learning_rate": 5.443808248901888e-06, + "loss": 1.0297, + "step": 14000 + }, + { + "epoch": 0.8506592138039978, + "grad_norm": 0.12367981672286987, + "learning_rate": 5.439465322219966e-06, + "loss": 1.033, + "step": 14001 + }, + { + "epoch": 0.8507199708366243, + "grad_norm": 0.15394064784049988, + "learning_rate": 5.435124028913058e-06, + "loss": 1.0034, + "step": 14002 + }, + { + "epoch": 0.8507807278692509, + "grad_norm": 0.20726700127124786, + "learning_rate": 5.430784369140296e-06, + "loss": 1.199, + "step": 14003 + }, + { + "epoch": 0.8508414849018774, + "grad_norm": 0.13614840805530548, + "learning_rate": 5.426446343060743e-06, + "loss": 1.0427, + "step": 14004 + }, + { + "epoch": 0.8509022419345039, + "grad_norm": 0.10408087074756622, + "learning_rate": 5.422109950833415e-06, + "loss": 1.0179, + "step": 14005 + }, + { + "epoch": 0.8509629989671305, + "grad_norm": 0.0921889990568161, + "learning_rate": 5.417775192617269e-06, + "loss": 0.9978, + "step": 14006 + }, + { + "epoch": 0.851023755999757, + "grad_norm": 0.14945179224014282, + "learning_rate": 5.413442068571184e-06, + "loss": 1.0741, + "step": 14007 + }, + { + "epoch": 0.8510845130323835, + "grad_norm": 0.16615328192710876, + "learning_rate": 5.4091105788540045e-06, + "loss": 1.088, + "step": 14008 + }, + { + "epoch": 0.8511452700650101, + "grad_norm": 0.29710111021995544, + "learning_rate": 5.404780723624492e-06, + "loss": 1.0532, + "step": 14009 + }, + { + "epoch": 0.8512060270976366, + "grad_norm": 0.13664542138576508, + "learning_rate": 5.400452503041353e-06, + "loss": 1.0783, + "step": 14010 + }, + { + "epoch": 0.8512667841302631, + "grad_norm": 0.12013693898916245, + "learning_rate": 5.396125917263256e-06, + "loss": 1.0257, + "step": 14011 + }, + { + "epoch": 0.8513275411628896, + "grad_norm": 0.1466490477323532, + "learning_rate": 5.39180096644879e-06, + "loss": 1.0319, + "step": 14012 + }, + { + "epoch": 0.8513882981955161, + "grad_norm": 0.1422511637210846, + "learning_rate": 5.387477650756489e-06, + "loss": 1.0595, + "step": 14013 + }, + { + "epoch": 0.8514490552281426, + "grad_norm": 0.10382051020860672, + "learning_rate": 5.383155970344806e-06, + "loss": 1.0395, + "step": 14014 + }, + { + "epoch": 0.8515098122607692, + "grad_norm": 0.28484755754470825, + "learning_rate": 5.378835925372161e-06, + "loss": 1.0452, + "step": 14015 + }, + { + "epoch": 0.8515705692933957, + "grad_norm": 0.12596751749515533, + "learning_rate": 5.374517515996919e-06, + "loss": 1.0188, + "step": 14016 + }, + { + "epoch": 0.8516313263260222, + "grad_norm": 0.19154946506023407, + "learning_rate": 5.370200742377357e-06, + "loss": 1.0316, + "step": 14017 + }, + { + "epoch": 0.8516920833586488, + "grad_norm": 0.3829948902130127, + "learning_rate": 5.365885604671722e-06, + "loss": 1.0672, + "step": 14018 + }, + { + "epoch": 0.8517528403912753, + "grad_norm": 0.15549182891845703, + "learning_rate": 5.3615721030381815e-06, + "loss": 1.0702, + "step": 14019 + }, + { + "epoch": 0.8518135974239018, + "grad_norm": 0.15565907955169678, + "learning_rate": 5.357260237634826e-06, + "loss": 1.0909, + "step": 14020 + }, + { + "epoch": 0.8518743544565284, + "grad_norm": 0.4052349627017975, + "learning_rate": 5.3529500086197394e-06, + "loss": 1.0723, + "step": 14021 + }, + { + "epoch": 0.8519351114891549, + "grad_norm": 0.30731961131095886, + "learning_rate": 5.348641416150896e-06, + "loss": 1.0973, + "step": 14022 + }, + { + "epoch": 0.8519958685217814, + "grad_norm": 0.1267758160829544, + "learning_rate": 5.34433446038623e-06, + "loss": 1.0609, + "step": 14023 + }, + { + "epoch": 0.852056625554408, + "grad_norm": 0.16898858547210693, + "learning_rate": 5.340029141483621e-06, + "loss": 1.0255, + "step": 14024 + }, + { + "epoch": 0.8521173825870344, + "grad_norm": 0.3386203944683075, + "learning_rate": 5.335725459600876e-06, + "loss": 1.1653, + "step": 14025 + }, + { + "epoch": 0.8521781396196609, + "grad_norm": 0.24327914416790009, + "learning_rate": 5.331423414895747e-06, + "loss": 1.1276, + "step": 14026 + }, + { + "epoch": 0.8522388966522875, + "grad_norm": 0.4767051041126251, + "learning_rate": 5.327123007525925e-06, + "loss": 1.1859, + "step": 14027 + }, + { + "epoch": 0.852299653684914, + "grad_norm": 0.14266517758369446, + "learning_rate": 5.322824237649043e-06, + "loss": 1.0888, + "step": 14028 + }, + { + "epoch": 0.8523604107175405, + "grad_norm": 0.1028783991932869, + "learning_rate": 5.3185271054226825e-06, + "loss": 1.034, + "step": 14029 + }, + { + "epoch": 0.8524211677501671, + "grad_norm": 0.19003506004810333, + "learning_rate": 5.314231611004339e-06, + "loss": 1.0175, + "step": 14030 + }, + { + "epoch": 0.8524819247827936, + "grad_norm": 0.16263505816459656, + "learning_rate": 5.309937754551469e-06, + "loss": 1.0863, + "step": 14031 + }, + { + "epoch": 0.8525426818154201, + "grad_norm": 0.20552152395248413, + "learning_rate": 5.305645536221476e-06, + "loss": 1.1712, + "step": 14032 + }, + { + "epoch": 0.8526034388480467, + "grad_norm": 0.19740070402622223, + "learning_rate": 5.301354956171695e-06, + "loss": 1.1318, + "step": 14033 + }, + { + "epoch": 0.8526641958806732, + "grad_norm": 0.1874537169933319, + "learning_rate": 5.297066014559382e-06, + "loss": 1.0039, + "step": 14034 + }, + { + "epoch": 0.8527249529132997, + "grad_norm": 0.24981433153152466, + "learning_rate": 5.292778711541751e-06, + "loss": 1.0573, + "step": 14035 + }, + { + "epoch": 0.8527857099459263, + "grad_norm": 0.10621805489063263, + "learning_rate": 5.28849304727595e-06, + "loss": 1.0049, + "step": 14036 + }, + { + "epoch": 0.8528464669785528, + "grad_norm": 0.1408226042985916, + "learning_rate": 5.284209021919085e-06, + "loss": 1.0894, + "step": 14037 + }, + { + "epoch": 0.8529072240111792, + "grad_norm": 0.1325346976518631, + "learning_rate": 5.279926635628185e-06, + "loss": 1.0021, + "step": 14038 + }, + { + "epoch": 0.8529679810438058, + "grad_norm": 0.13675685226917267, + "learning_rate": 5.275645888560232e-06, + "loss": 1.1116, + "step": 14039 + }, + { + "epoch": 0.8530287380764323, + "grad_norm": 0.1163097470998764, + "learning_rate": 5.271366780872105e-06, + "loss": 1.0652, + "step": 14040 + }, + { + "epoch": 0.8530894951090588, + "grad_norm": 0.1079884022474289, + "learning_rate": 5.267089312720674e-06, + "loss": 1.0231, + "step": 14041 + }, + { + "epoch": 0.8531502521416854, + "grad_norm": 0.2544534206390381, + "learning_rate": 5.262813484262735e-06, + "loss": 1.1633, + "step": 14042 + }, + { + "epoch": 0.8532110091743119, + "grad_norm": 2.2221500873565674, + "learning_rate": 5.258539295655019e-06, + "loss": 1.0751, + "step": 14043 + }, + { + "epoch": 0.8532717662069385, + "grad_norm": 0.15971939265727997, + "learning_rate": 5.254266747054188e-06, + "loss": 1.0725, + "step": 14044 + }, + { + "epoch": 0.853332523239565, + "grad_norm": 0.14076870679855347, + "learning_rate": 5.249995838616861e-06, + "loss": 1.0241, + "step": 14045 + }, + { + "epoch": 0.8533932802721915, + "grad_norm": 0.22558552026748657, + "learning_rate": 5.245726570499587e-06, + "loss": 1.0576, + "step": 14046 + }, + { + "epoch": 0.8534540373048181, + "grad_norm": 0.10706805437803268, + "learning_rate": 5.241458942858857e-06, + "loss": 1.0184, + "step": 14047 + }, + { + "epoch": 0.8535147943374446, + "grad_norm": 0.1199878379702568, + "learning_rate": 5.2371929558511e-06, + "loss": 1.028, + "step": 14048 + }, + { + "epoch": 0.8535755513700711, + "grad_norm": 0.32045525312423706, + "learning_rate": 5.232928609632692e-06, + "loss": 1.1553, + "step": 14049 + }, + { + "epoch": 0.8536363084026977, + "grad_norm": 0.15111345052719116, + "learning_rate": 5.228665904359936e-06, + "loss": 1.1162, + "step": 14050 + }, + { + "epoch": 0.8536970654353241, + "grad_norm": 0.13601776957511902, + "learning_rate": 5.224404840189084e-06, + "loss": 1.0531, + "step": 14051 + }, + { + "epoch": 0.8537578224679506, + "grad_norm": 0.1124998927116394, + "learning_rate": 5.2201454172763175e-06, + "loss": 1.0374, + "step": 14052 + }, + { + "epoch": 0.8538185795005772, + "grad_norm": 0.14750927686691284, + "learning_rate": 5.2158876357777995e-06, + "loss": 1.0862, + "step": 14053 + }, + { + "epoch": 0.8538793365332037, + "grad_norm": 1.0044686794281006, + "learning_rate": 5.211631495849562e-06, + "loss": 1.112, + "step": 14054 + }, + { + "epoch": 0.8539400935658302, + "grad_norm": 0.1462719887495041, + "learning_rate": 5.2073769976476335e-06, + "loss": 1.0509, + "step": 14055 + }, + { + "epoch": 0.8540008505984568, + "grad_norm": 0.39019280672073364, + "learning_rate": 5.203124141327959e-06, + "loss": 1.1632, + "step": 14056 + }, + { + "epoch": 0.8540616076310833, + "grad_norm": 0.14390425384044647, + "learning_rate": 5.198872927046417e-06, + "loss": 1.0763, + "step": 14057 + }, + { + "epoch": 0.8541223646637098, + "grad_norm": 0.1774798184633255, + "learning_rate": 5.194623354958855e-06, + "loss": 1.0547, + "step": 14058 + }, + { + "epoch": 0.8541831216963364, + "grad_norm": 0.2587502896785736, + "learning_rate": 5.19037542522105e-06, + "loss": 1.1674, + "step": 14059 + }, + { + "epoch": 0.8542438787289629, + "grad_norm": 0.14740493893623352, + "learning_rate": 5.186129137988683e-06, + "loss": 1.0952, + "step": 14060 + }, + { + "epoch": 0.8543046357615894, + "grad_norm": 0.1673775166273117, + "learning_rate": 5.181884493417416e-06, + "loss": 1.0447, + "step": 14061 + }, + { + "epoch": 0.854365392794216, + "grad_norm": 0.4900735020637512, + "learning_rate": 5.177641491662821e-06, + "loss": 1.2021, + "step": 14062 + }, + { + "epoch": 0.8544261498268425, + "grad_norm": 0.15155020356178284, + "learning_rate": 5.173400132880457e-06, + "loss": 1.0754, + "step": 14063 + }, + { + "epoch": 0.8544869068594689, + "grad_norm": 0.10192351043224335, + "learning_rate": 5.169160417225771e-06, + "loss": 1.0477, + "step": 14064 + }, + { + "epoch": 0.8545476638920955, + "grad_norm": 0.1388743370771408, + "learning_rate": 5.1649223448541786e-06, + "loss": 1.053, + "step": 14065 + }, + { + "epoch": 0.854608420924722, + "grad_norm": 0.26271912455558777, + "learning_rate": 5.160685915921032e-06, + "loss": 1.2382, + "step": 14066 + }, + { + "epoch": 0.8546691779573485, + "grad_norm": 0.261271595954895, + "learning_rate": 5.156451130581597e-06, + "loss": 1.069, + "step": 14067 + }, + { + "epoch": 0.8547299349899751, + "grad_norm": 0.1277359127998352, + "learning_rate": 5.152217988991115e-06, + "loss": 1.0757, + "step": 14068 + }, + { + "epoch": 0.8547906920226016, + "grad_norm": 0.2207491248846054, + "learning_rate": 5.147986491304757e-06, + "loss": 1.0602, + "step": 14069 + }, + { + "epoch": 0.8548514490552281, + "grad_norm": 0.10567124933004379, + "learning_rate": 5.143756637677621e-06, + "loss": 1.072, + "step": 14070 + }, + { + "epoch": 0.8549122060878547, + "grad_norm": 6.4839253425598145, + "learning_rate": 5.139528428264756e-06, + "loss": 1.0505, + "step": 14071 + }, + { + "epoch": 0.8549729631204812, + "grad_norm": 0.1824016273021698, + "learning_rate": 5.135301863221154e-06, + "loss": 1.0083, + "step": 14072 + }, + { + "epoch": 0.8550337201531077, + "grad_norm": 0.12151354551315308, + "learning_rate": 5.131076942701729e-06, + "loss": 1.0784, + "step": 14073 + }, + { + "epoch": 0.8550944771857343, + "grad_norm": 0.1690412163734436, + "learning_rate": 5.126853666861347e-06, + "loss": 1.0106, + "step": 14074 + }, + { + "epoch": 0.8551552342183608, + "grad_norm": 0.2130904495716095, + "learning_rate": 5.122632035854824e-06, + "loss": 1.0766, + "step": 14075 + }, + { + "epoch": 0.8552159912509874, + "grad_norm": 0.12926946580410004, + "learning_rate": 5.118412049836896e-06, + "loss": 1.0916, + "step": 14076 + }, + { + "epoch": 0.8552767482836138, + "grad_norm": 0.1278305947780609, + "learning_rate": 5.11419370896225e-06, + "loss": 1.0319, + "step": 14077 + }, + { + "epoch": 0.8553375053162403, + "grad_norm": 0.48680371046066284, + "learning_rate": 5.109977013385503e-06, + "loss": 1.115, + "step": 14078 + }, + { + "epoch": 0.8553982623488668, + "grad_norm": 0.13173919916152954, + "learning_rate": 5.105761963261236e-06, + "loss": 1.0611, + "step": 14079 + }, + { + "epoch": 0.8554590193814934, + "grad_norm": 0.11212275177240372, + "learning_rate": 5.101548558743946e-06, + "loss": 1.0894, + "step": 14080 + }, + { + "epoch": 0.8555197764141199, + "grad_norm": 9.608139038085938, + "learning_rate": 5.097336799988067e-06, + "loss": 1.0463, + "step": 14081 + }, + { + "epoch": 0.8555805334467464, + "grad_norm": 0.11307689547538757, + "learning_rate": 5.093126687147986e-06, + "loss": 1.0792, + "step": 14082 + }, + { + "epoch": 0.855641290479373, + "grad_norm": 0.13400432467460632, + "learning_rate": 5.088918220378014e-06, + "loss": 1.0328, + "step": 14083 + }, + { + "epoch": 0.8557020475119995, + "grad_norm": 0.11246509850025177, + "learning_rate": 5.084711399832437e-06, + "loss": 0.9947, + "step": 14084 + }, + { + "epoch": 0.855762804544626, + "grad_norm": 0.18977652490139008, + "learning_rate": 5.080506225665449e-06, + "loss": 1.0919, + "step": 14085 + }, + { + "epoch": 0.8558235615772526, + "grad_norm": 0.1288769692182541, + "learning_rate": 5.076302698031193e-06, + "loss": 1.0324, + "step": 14086 + }, + { + "epoch": 0.8558843186098791, + "grad_norm": 0.15988439321517944, + "learning_rate": 5.072100817083741e-06, + "loss": 1.1172, + "step": 14087 + }, + { + "epoch": 0.8559450756425057, + "grad_norm": 0.2972189784049988, + "learning_rate": 5.067900582977103e-06, + "loss": 1.0873, + "step": 14088 + }, + { + "epoch": 0.8560058326751322, + "grad_norm": 0.17799590528011322, + "learning_rate": 5.063701995865266e-06, + "loss": 1.0474, + "step": 14089 + }, + { + "epoch": 0.8560665897077586, + "grad_norm": 0.12099135667085648, + "learning_rate": 5.0595050559021196e-06, + "loss": 1.0697, + "step": 14090 + }, + { + "epoch": 0.8561273467403852, + "grad_norm": 0.10432138293981552, + "learning_rate": 5.055309763241501e-06, + "loss": 0.9967, + "step": 14091 + }, + { + "epoch": 0.8561881037730117, + "grad_norm": 0.10693902522325516, + "learning_rate": 5.0511161180371916e-06, + "loss": 1.0283, + "step": 14092 + }, + { + "epoch": 0.8562488608056382, + "grad_norm": 0.1719645857810974, + "learning_rate": 5.046924120442903e-06, + "loss": 1.1292, + "step": 14093 + }, + { + "epoch": 0.8563096178382648, + "grad_norm": 0.2054222822189331, + "learning_rate": 5.042733770612307e-06, + "loss": 1.1198, + "step": 14094 + }, + { + "epoch": 0.8563703748708913, + "grad_norm": 0.1370614618062973, + "learning_rate": 5.038545068698991e-06, + "loss": 1.036, + "step": 14095 + }, + { + "epoch": 0.8564311319035178, + "grad_norm": 0.16334545612335205, + "learning_rate": 5.034358014856499e-06, + "loss": 1.1287, + "step": 14096 + }, + { + "epoch": 0.8564918889361444, + "grad_norm": 0.33314672112464905, + "learning_rate": 5.030172609238304e-06, + "loss": 1.1162, + "step": 14097 + }, + { + "epoch": 0.8565526459687709, + "grad_norm": 0.2050005942583084, + "learning_rate": 5.025988851997826e-06, + "loss": 1.0288, + "step": 14098 + }, + { + "epoch": 0.8566134030013974, + "grad_norm": 0.18852275609970093, + "learning_rate": 5.02180674328841e-06, + "loss": 1.0533, + "step": 14099 + }, + { + "epoch": 0.856674160034024, + "grad_norm": 0.2342219352722168, + "learning_rate": 5.017626283263382e-06, + "loss": 1.1181, + "step": 14100 + }, + { + "epoch": 0.8567349170666505, + "grad_norm": 0.1488930583000183, + "learning_rate": 5.01344747207595e-06, + "loss": 1.0883, + "step": 14101 + }, + { + "epoch": 0.856795674099277, + "grad_norm": 0.1642390340566635, + "learning_rate": 5.009270309879288e-06, + "loss": 1.025, + "step": 14102 + }, + { + "epoch": 0.8568564311319036, + "grad_norm": 0.14734713733196259, + "learning_rate": 5.005094796826526e-06, + "loss": 1.0894, + "step": 14103 + }, + { + "epoch": 0.85691718816453, + "grad_norm": 0.16555152833461761, + "learning_rate": 5.000920933070702e-06, + "loss": 1.093, + "step": 14104 + }, + { + "epoch": 0.8569779451971565, + "grad_norm": 0.30410251021385193, + "learning_rate": 4.9967487187648266e-06, + "loss": 1.263, + "step": 14105 + }, + { + "epoch": 0.8570387022297831, + "grad_norm": 0.09450112283229828, + "learning_rate": 4.992578154061839e-06, + "loss": 1.0099, + "step": 14106 + }, + { + "epoch": 0.8570994592624096, + "grad_norm": 0.11336357891559601, + "learning_rate": 4.988409239114583e-06, + "loss": 1.0278, + "step": 14107 + }, + { + "epoch": 0.8571602162950361, + "grad_norm": 0.25878921151161194, + "learning_rate": 4.984241974075882e-06, + "loss": 1.1545, + "step": 14108 + }, + { + "epoch": 0.8572209733276627, + "grad_norm": 0.16125600039958954, + "learning_rate": 4.980076359098501e-06, + "loss": 1.0565, + "step": 14109 + }, + { + "epoch": 0.8572817303602892, + "grad_norm": 0.12710823118686676, + "learning_rate": 4.975912394335125e-06, + "loss": 1.0849, + "step": 14110 + }, + { + "epoch": 0.8573424873929157, + "grad_norm": 0.1700657159090042, + "learning_rate": 4.971750079938381e-06, + "loss": 1.2611, + "step": 14111 + }, + { + "epoch": 0.8574032444255423, + "grad_norm": 0.12558159232139587, + "learning_rate": 4.967589416060842e-06, + "loss": 1.0416, + "step": 14112 + }, + { + "epoch": 0.8574640014581688, + "grad_norm": 0.1135503426194191, + "learning_rate": 4.9634304028550185e-06, + "loss": 1.0282, + "step": 14113 + }, + { + "epoch": 0.8575247584907953, + "grad_norm": 0.15996211767196655, + "learning_rate": 4.959273040473356e-06, + "loss": 1.1139, + "step": 14114 + }, + { + "epoch": 0.8575855155234219, + "grad_norm": 0.20730850100517273, + "learning_rate": 4.955117329068249e-06, + "loss": 1.1364, + "step": 14115 + }, + { + "epoch": 0.8576462725560484, + "grad_norm": 0.28644946217536926, + "learning_rate": 4.950963268792025e-06, + "loss": 1.1264, + "step": 14116 + }, + { + "epoch": 0.8577070295886748, + "grad_norm": 0.21113575994968414, + "learning_rate": 4.9468108597969465e-06, + "loss": 1.0931, + "step": 14117 + }, + { + "epoch": 0.8577677866213014, + "grad_norm": 0.9280003309249878, + "learning_rate": 4.942660102235225e-06, + "loss": 1.1053, + "step": 14118 + }, + { + "epoch": 0.8578285436539279, + "grad_norm": 0.2833453118801117, + "learning_rate": 4.938510996258999e-06, + "loss": 1.0873, + "step": 14119 + }, + { + "epoch": 0.8578893006865544, + "grad_norm": 0.1464274525642395, + "learning_rate": 4.934363542020381e-06, + "loss": 1.008, + "step": 14120 + }, + { + "epoch": 0.857950057719181, + "grad_norm": 0.174157977104187, + "learning_rate": 4.930217739671367e-06, + "loss": 1.1519, + "step": 14121 + }, + { + "epoch": 0.8580108147518075, + "grad_norm": 0.14351730048656464, + "learning_rate": 4.926073589363939e-06, + "loss": 1.0704, + "step": 14122 + }, + { + "epoch": 0.858071571784434, + "grad_norm": 0.21605785191059113, + "learning_rate": 4.921931091249993e-06, + "loss": 1.121, + "step": 14123 + }, + { + "epoch": 0.8581323288170606, + "grad_norm": 0.2234364002943039, + "learning_rate": 4.917790245481363e-06, + "loss": 1.0369, + "step": 14124 + }, + { + "epoch": 0.8581930858496871, + "grad_norm": 0.12953612208366394, + "learning_rate": 4.913651052209861e-06, + "loss": 1.1133, + "step": 14125 + }, + { + "epoch": 0.8582538428823137, + "grad_norm": 0.10417594760656357, + "learning_rate": 4.909513511587199e-06, + "loss": 1.0122, + "step": 14126 + }, + { + "epoch": 0.8583145999149402, + "grad_norm": 0.18923239409923553, + "learning_rate": 4.905377623765029e-06, + "loss": 1.1166, + "step": 14127 + }, + { + "epoch": 0.8583753569475667, + "grad_norm": 0.28426477313041687, + "learning_rate": 4.901243388894961e-06, + "loss": 1.0946, + "step": 14128 + }, + { + "epoch": 0.8584361139801933, + "grad_norm": 0.14293935894966125, + "learning_rate": 4.897110807128519e-06, + "loss": 1.0768, + "step": 14129 + }, + { + "epoch": 0.8584968710128197, + "grad_norm": 4.603726863861084, + "learning_rate": 4.89297987861721e-06, + "loss": 1.0716, + "step": 14130 + }, + { + "epoch": 0.8585576280454462, + "grad_norm": 0.15873724222183228, + "learning_rate": 4.888850603512446e-06, + "loss": 1.0715, + "step": 14131 + }, + { + "epoch": 0.8586183850780728, + "grad_norm": 0.16651402413845062, + "learning_rate": 4.884722981965578e-06, + "loss": 1.0712, + "step": 14132 + }, + { + "epoch": 0.8586791421106993, + "grad_norm": 0.1254180669784546, + "learning_rate": 4.880597014127925e-06, + "loss": 1.0405, + "step": 14133 + }, + { + "epoch": 0.8587398991433258, + "grad_norm": 4.309706211090088, + "learning_rate": 4.876472700150686e-06, + "loss": 1.0467, + "step": 14134 + }, + { + "epoch": 0.8588006561759524, + "grad_norm": 0.9488537311553955, + "learning_rate": 4.87235004018507e-06, + "loss": 1.0888, + "step": 14135 + }, + { + "epoch": 0.8588614132085789, + "grad_norm": 0.26401764154434204, + "learning_rate": 4.8682290343821875e-06, + "loss": 1.2698, + "step": 14136 + }, + { + "epoch": 0.8589221702412054, + "grad_norm": 0.1238195076584816, + "learning_rate": 4.864109682893098e-06, + "loss": 1.0337, + "step": 14137 + }, + { + "epoch": 0.858982927273832, + "grad_norm": 0.42624789476394653, + "learning_rate": 4.859991985868789e-06, + "loss": 1.1827, + "step": 14138 + }, + { + "epoch": 0.8590436843064585, + "grad_norm": 0.2221917361021042, + "learning_rate": 4.855875943460203e-06, + "loss": 1.1298, + "step": 14139 + }, + { + "epoch": 0.859104441339085, + "grad_norm": 0.25979480147361755, + "learning_rate": 4.851761555818207e-06, + "loss": 1.1176, + "step": 14140 + }, + { + "epoch": 0.8591651983717116, + "grad_norm": 0.20306643843650818, + "learning_rate": 4.847648823093615e-06, + "loss": 1.122, + "step": 14141 + }, + { + "epoch": 0.8592259554043381, + "grad_norm": 0.10130464285612106, + "learning_rate": 4.843537745437188e-06, + "loss": 0.9902, + "step": 14142 + }, + { + "epoch": 0.8592867124369645, + "grad_norm": 0.1563415229320526, + "learning_rate": 4.839428322999612e-06, + "loss": 1.1351, + "step": 14143 + }, + { + "epoch": 0.8593474694695911, + "grad_norm": 0.2717992067337036, + "learning_rate": 4.835320555931522e-06, + "loss": 1.043, + "step": 14144 + }, + { + "epoch": 0.8594082265022176, + "grad_norm": 0.12131844460964203, + "learning_rate": 4.831214444383475e-06, + "loss": 1.0318, + "step": 14145 + }, + { + "epoch": 0.8594689835348441, + "grad_norm": 0.30003729462623596, + "learning_rate": 4.8271099885060125e-06, + "loss": 1.0986, + "step": 14146 + }, + { + "epoch": 0.8595297405674707, + "grad_norm": 0.09912194311618805, + "learning_rate": 4.823007188449557e-06, + "loss": 1.0117, + "step": 14147 + }, + { + "epoch": 0.8595904976000972, + "grad_norm": 1.0891952514648438, + "learning_rate": 4.818906044364507e-06, + "loss": 1.2641, + "step": 14148 + }, + { + "epoch": 0.8596512546327237, + "grad_norm": 0.17120946943759918, + "learning_rate": 4.814806556401186e-06, + "loss": 1.1154, + "step": 14149 + }, + { + "epoch": 0.8597120116653503, + "grad_norm": 1.8278672695159912, + "learning_rate": 4.810708724709856e-06, + "loss": 1.0324, + "step": 14150 + }, + { + "epoch": 0.8597727686979768, + "grad_norm": 0.11879687756299973, + "learning_rate": 4.806612549440742e-06, + "loss": 1.0133, + "step": 14151 + }, + { + "epoch": 0.8598335257306033, + "grad_norm": 0.18011632561683655, + "learning_rate": 4.802518030743985e-06, + "loss": 1.0537, + "step": 14152 + }, + { + "epoch": 0.8598942827632299, + "grad_norm": 0.1315927356481552, + "learning_rate": 4.798425168769671e-06, + "loss": 1.0653, + "step": 14153 + }, + { + "epoch": 0.8599550397958564, + "grad_norm": 0.19348208606243134, + "learning_rate": 4.794333963667813e-06, + "loss": 1.1729, + "step": 14154 + }, + { + "epoch": 0.860015796828483, + "grad_norm": 0.24430322647094727, + "learning_rate": 4.790244415588369e-06, + "loss": 1.0538, + "step": 14155 + }, + { + "epoch": 0.8600765538611094, + "grad_norm": 0.27006277441978455, + "learning_rate": 4.786156524681268e-06, + "loss": 1.2019, + "step": 14156 + }, + { + "epoch": 0.8601373108937359, + "grad_norm": 0.22173461318016052, + "learning_rate": 4.782070291096336e-06, + "loss": 1.0858, + "step": 14157 + }, + { + "epoch": 0.8601980679263624, + "grad_norm": 0.10452301800251007, + "learning_rate": 4.777985714983357e-06, + "loss": 1.0287, + "step": 14158 + }, + { + "epoch": 0.860258824958989, + "grad_norm": 0.12470521032810211, + "learning_rate": 4.773902796492052e-06, + "loss": 1.0533, + "step": 14159 + }, + { + "epoch": 0.8603195819916155, + "grad_norm": 0.26975512504577637, + "learning_rate": 4.7698215357720895e-06, + "loss": 1.0738, + "step": 14160 + }, + { + "epoch": 0.860380339024242, + "grad_norm": 0.19559358060359955, + "learning_rate": 4.765741932973055e-06, + "loss": 1.1401, + "step": 14161 + }, + { + "epoch": 0.8604410960568686, + "grad_norm": 0.10784042626619339, + "learning_rate": 4.761663988244497e-06, + "loss": 1.0178, + "step": 14162 + }, + { + "epoch": 0.8605018530894951, + "grad_norm": 0.15287598967552185, + "learning_rate": 4.757587701735883e-06, + "loss": 1.0631, + "step": 14163 + }, + { + "epoch": 0.8605626101221217, + "grad_norm": 0.1741325855255127, + "learning_rate": 4.753513073596644e-06, + "loss": 1.1007, + "step": 14164 + }, + { + "epoch": 0.8606233671547482, + "grad_norm": 0.807770848274231, + "learning_rate": 4.749440103976127e-06, + "loss": 1.0559, + "step": 14165 + }, + { + "epoch": 0.8606841241873747, + "grad_norm": 0.352888822555542, + "learning_rate": 4.745368793023619e-06, + "loss": 1.2144, + "step": 14166 + }, + { + "epoch": 0.8607448812200013, + "grad_norm": 45.675228118896484, + "learning_rate": 4.741299140888389e-06, + "loss": 1.0253, + "step": 14167 + }, + { + "epoch": 0.8608056382526278, + "grad_norm": 15.36818790435791, + "learning_rate": 4.737231147719573e-06, + "loss": 1.0231, + "step": 14168 + }, + { + "epoch": 0.8608663952852542, + "grad_norm": 0.1279137283563614, + "learning_rate": 4.733164813666302e-06, + "loss": 1.052, + "step": 14169 + }, + { + "epoch": 0.8609271523178808, + "grad_norm": 0.16352872550487518, + "learning_rate": 4.729100138877623e-06, + "loss": 1.0183, + "step": 14170 + }, + { + "epoch": 0.8609879093505073, + "grad_norm": 0.14424246549606323, + "learning_rate": 4.725037123502518e-06, + "loss": 1.0729, + "step": 14171 + }, + { + "epoch": 0.8610486663831338, + "grad_norm": 0.12785501778125763, + "learning_rate": 4.720975767689939e-06, + "loss": 1.0509, + "step": 14172 + }, + { + "epoch": 0.8611094234157604, + "grad_norm": 0.1404189169406891, + "learning_rate": 4.7169160715887566e-06, + "loss": 1.0793, + "step": 14173 + }, + { + "epoch": 0.8611701804483869, + "grad_norm": 0.21100686490535736, + "learning_rate": 4.712858035347756e-06, + "loss": 1.0954, + "step": 14174 + }, + { + "epoch": 0.8612309374810134, + "grad_norm": 6.07974910736084, + "learning_rate": 4.708801659115702e-06, + "loss": 0.9877, + "step": 14175 + }, + { + "epoch": 0.86129169451364, + "grad_norm": 0.17326590418815613, + "learning_rate": 4.704746943041266e-06, + "loss": 1.0594, + "step": 14176 + }, + { + "epoch": 0.8613524515462665, + "grad_norm": 0.11902457475662231, + "learning_rate": 4.7006938872730945e-06, + "loss": 1.082, + "step": 14177 + }, + { + "epoch": 0.861413208578893, + "grad_norm": 0.09663273394107819, + "learning_rate": 4.696642491959746e-06, + "loss": 1.0214, + "step": 14178 + }, + { + "epoch": 0.8614739656115196, + "grad_norm": 0.1515512764453888, + "learning_rate": 4.692592757249725e-06, + "loss": 1.075, + "step": 14179 + }, + { + "epoch": 0.8615347226441461, + "grad_norm": 0.13148732483386993, + "learning_rate": 4.688544683291479e-06, + "loss": 1.109, + "step": 14180 + }, + { + "epoch": 0.8615954796767726, + "grad_norm": 0.17569436132907867, + "learning_rate": 4.6844982702333725e-06, + "loss": 1.0373, + "step": 14181 + }, + { + "epoch": 0.8616562367093991, + "grad_norm": 0.12792392075061798, + "learning_rate": 4.6804535182237485e-06, + "loss": 1.0156, + "step": 14182 + }, + { + "epoch": 0.8617169937420256, + "grad_norm": 0.16717316210269928, + "learning_rate": 4.67641042741086e-06, + "loss": 1.1186, + "step": 14183 + }, + { + "epoch": 0.8617777507746521, + "grad_norm": 0.15593482553958893, + "learning_rate": 4.672368997942911e-06, + "loss": 1.1887, + "step": 14184 + }, + { + "epoch": 0.8618385078072787, + "grad_norm": 0.25899389386177063, + "learning_rate": 4.668329229968038e-06, + "loss": 1.0453, + "step": 14185 + }, + { + "epoch": 0.8618992648399052, + "grad_norm": 0.15924355387687683, + "learning_rate": 4.664291123634318e-06, + "loss": 1.0993, + "step": 14186 + }, + { + "epoch": 0.8619600218725317, + "grad_norm": 0.12288684397935867, + "learning_rate": 4.660254679089771e-06, + "loss": 1.053, + "step": 14187 + }, + { + "epoch": 0.8620207789051583, + "grad_norm": 0.11554786562919617, + "learning_rate": 4.65621989648235e-06, + "loss": 1.0203, + "step": 14188 + }, + { + "epoch": 0.8620815359377848, + "grad_norm": 0.1808425486087799, + "learning_rate": 4.652186775959954e-06, + "loss": 1.1128, + "step": 14189 + }, + { + "epoch": 0.8621422929704113, + "grad_norm": 0.20926572382450104, + "learning_rate": 4.648155317670422e-06, + "loss": 1.1271, + "step": 14190 + }, + { + "epoch": 0.8622030500030379, + "grad_norm": 0.23795105516910553, + "learning_rate": 4.644125521761516e-06, + "loss": 1.0368, + "step": 14191 + }, + { + "epoch": 0.8622638070356644, + "grad_norm": 0.1222681999206543, + "learning_rate": 4.640097388380954e-06, + "loss": 0.9786, + "step": 14192 + }, + { + "epoch": 0.8623245640682909, + "grad_norm": 0.16433022916316986, + "learning_rate": 4.636070917676405e-06, + "loss": 1.0514, + "step": 14193 + }, + { + "epoch": 0.8623853211009175, + "grad_norm": 0.12489213049411774, + "learning_rate": 4.632046109795435e-06, + "loss": 1.014, + "step": 14194 + }, + { + "epoch": 0.8624460781335439, + "grad_norm": 0.2033950239419937, + "learning_rate": 4.628022964885587e-06, + "loss": 1.1627, + "step": 14195 + }, + { + "epoch": 0.8625068351661704, + "grad_norm": 0.22645170986652374, + "learning_rate": 4.62400148309432e-06, + "loss": 1.1643, + "step": 14196 + }, + { + "epoch": 0.862567592198797, + "grad_norm": 0.10457313805818558, + "learning_rate": 4.6199816645690505e-06, + "loss": 1.0445, + "step": 14197 + }, + { + "epoch": 0.8626283492314235, + "grad_norm": 0.16166149079799652, + "learning_rate": 4.615963509457128e-06, + "loss": 1.1237, + "step": 14198 + }, + { + "epoch": 0.86268910626405, + "grad_norm": 1.4195982217788696, + "learning_rate": 4.611947017905838e-06, + "loss": 1.0219, + "step": 14199 + }, + { + "epoch": 0.8627498632966766, + "grad_norm": 0.25008589029312134, + "learning_rate": 4.607932190062408e-06, + "loss": 1.181, + "step": 14200 + }, + { + "epoch": 0.8628106203293031, + "grad_norm": 0.19807790219783783, + "learning_rate": 4.603919026073989e-06, + "loss": 1.1015, + "step": 14201 + }, + { + "epoch": 0.8628713773619296, + "grad_norm": 0.21317186951637268, + "learning_rate": 4.5999075260876935e-06, + "loss": 1.0791, + "step": 14202 + }, + { + "epoch": 0.8629321343945562, + "grad_norm": 0.13090331852436066, + "learning_rate": 4.595897690250567e-06, + "loss": 1.0227, + "step": 14203 + }, + { + "epoch": 0.8629928914271827, + "grad_norm": 0.2109454721212387, + "learning_rate": 4.591889518709585e-06, + "loss": 1.2276, + "step": 14204 + }, + { + "epoch": 0.8630536484598093, + "grad_norm": 0.10929658263921738, + "learning_rate": 4.587883011611671e-06, + "loss": 1.0149, + "step": 14205 + }, + { + "epoch": 0.8631144054924358, + "grad_norm": 0.6618485450744629, + "learning_rate": 4.583878169103684e-06, + "loss": 1.0616, + "step": 14206 + }, + { + "epoch": 0.8631751625250623, + "grad_norm": 0.1818619817495346, + "learning_rate": 4.579874991332423e-06, + "loss": 1.1181, + "step": 14207 + }, + { + "epoch": 0.8632359195576889, + "grad_norm": 0.17064742743968964, + "learning_rate": 4.57587347844462e-06, + "loss": 1.0393, + "step": 14208 + }, + { + "epoch": 0.8632966765903153, + "grad_norm": 0.20591960847377777, + "learning_rate": 4.571873630586954e-06, + "loss": 1.1793, + "step": 14209 + }, + { + "epoch": 0.8633574336229418, + "grad_norm": 0.2499299794435501, + "learning_rate": 4.567875447906045e-06, + "loss": 1.0405, + "step": 14210 + }, + { + "epoch": 0.8634181906555684, + "grad_norm": 0.14137180149555206, + "learning_rate": 4.563878930548443e-06, + "loss": 1.0978, + "step": 14211 + }, + { + "epoch": 0.8634789476881949, + "grad_norm": 0.14389704167842865, + "learning_rate": 4.559884078660626e-06, + "loss": 1.145, + "step": 14212 + }, + { + "epoch": 0.8635397047208214, + "grad_norm": 0.2118283361196518, + "learning_rate": 4.55589089238907e-06, + "loss": 1.0046, + "step": 14213 + }, + { + "epoch": 0.863600461753448, + "grad_norm": 1.3960285186767578, + "learning_rate": 4.551899371880103e-06, + "loss": 1.0388, + "step": 14214 + }, + { + "epoch": 0.8636612187860745, + "grad_norm": 0.1342649757862091, + "learning_rate": 4.547909517280047e-06, + "loss": 1.0483, + "step": 14215 + }, + { + "epoch": 0.863721975818701, + "grad_norm": 0.12921537458896637, + "learning_rate": 4.543921328735157e-06, + "loss": 1.0534, + "step": 14216 + }, + { + "epoch": 0.8637827328513276, + "grad_norm": 0.12537027895450592, + "learning_rate": 4.539934806391605e-06, + "loss": 1.0032, + "step": 14217 + }, + { + "epoch": 0.8638434898839541, + "grad_norm": 0.11934049427509308, + "learning_rate": 4.535949950395541e-06, + "loss": 1.055, + "step": 14218 + }, + { + "epoch": 0.8639042469165806, + "grad_norm": 0.13322697579860687, + "learning_rate": 4.53196676089302e-06, + "loss": 1.0497, + "step": 14219 + }, + { + "epoch": 0.8639650039492072, + "grad_norm": 0.18889231979846954, + "learning_rate": 4.5279852380300524e-06, + "loss": 1.0634, + "step": 14220 + }, + { + "epoch": 0.8640257609818337, + "grad_norm": 0.17164137959480286, + "learning_rate": 4.5240053819525726e-06, + "loss": 1.0604, + "step": 14221 + }, + { + "epoch": 0.8640865180144601, + "grad_norm": 0.14856575429439545, + "learning_rate": 4.520027192806453e-06, + "loss": 1.1066, + "step": 14222 + }, + { + "epoch": 0.8641472750470867, + "grad_norm": 0.38683027029037476, + "learning_rate": 4.516050670737537e-06, + "loss": 1.061, + "step": 14223 + }, + { + "epoch": 0.8642080320797132, + "grad_norm": 0.42218300700187683, + "learning_rate": 4.5120758158915755e-06, + "loss": 1.0734, + "step": 14224 + }, + { + "epoch": 0.8642687891123397, + "grad_norm": 0.23691798746585846, + "learning_rate": 4.5081026284142625e-06, + "loss": 1.062, + "step": 14225 + }, + { + "epoch": 0.8643295461449663, + "grad_norm": 0.10736768692731857, + "learning_rate": 4.504131108451248e-06, + "loss": 1.0433, + "step": 14226 + }, + { + "epoch": 0.8643903031775928, + "grad_norm": 0.1270807534456253, + "learning_rate": 4.5001612561481e-06, + "loss": 1.0299, + "step": 14227 + }, + { + "epoch": 0.8644510602102193, + "grad_norm": 0.13499632477760315, + "learning_rate": 4.496193071650334e-06, + "loss": 1.0845, + "step": 14228 + }, + { + "epoch": 0.8645118172428459, + "grad_norm": 0.15015171468257904, + "learning_rate": 4.492226555103407e-06, + "loss": 1.0697, + "step": 14229 + }, + { + "epoch": 0.8645725742754724, + "grad_norm": 0.1274319738149643, + "learning_rate": 4.488261706652713e-06, + "loss": 1.041, + "step": 14230 + }, + { + "epoch": 0.8646333313080989, + "grad_norm": 0.1719062626361847, + "learning_rate": 4.4842985264435864e-06, + "loss": 1.153, + "step": 14231 + }, + { + "epoch": 0.8646940883407255, + "grad_norm": 0.23702573776245117, + "learning_rate": 4.480337014621294e-06, + "loss": 1.1169, + "step": 14232 + }, + { + "epoch": 0.864754845373352, + "grad_norm": 0.16583359241485596, + "learning_rate": 4.476377171331037e-06, + "loss": 1.0569, + "step": 14233 + }, + { + "epoch": 0.8648156024059785, + "grad_norm": 0.15656907856464386, + "learning_rate": 4.472418996717992e-06, + "loss": 1.0844, + "step": 14234 + }, + { + "epoch": 0.864876359438605, + "grad_norm": 0.22169530391693115, + "learning_rate": 4.468462490927216e-06, + "loss": 1.0993, + "step": 14235 + }, + { + "epoch": 0.8649371164712315, + "grad_norm": 0.1542995125055313, + "learning_rate": 4.464507654103756e-06, + "loss": 1.0316, + "step": 14236 + }, + { + "epoch": 0.864997873503858, + "grad_norm": 0.10396695137023926, + "learning_rate": 4.4605544863925654e-06, + "loss": 1.0052, + "step": 14237 + }, + { + "epoch": 0.8650586305364846, + "grad_norm": 0.29740509390830994, + "learning_rate": 4.456602987938546e-06, + "loss": 1.1668, + "step": 14238 + }, + { + "epoch": 0.8651193875691111, + "grad_norm": 0.14284315705299377, + "learning_rate": 4.452653158886555e-06, + "loss": 1.083, + "step": 14239 + }, + { + "epoch": 0.8651801446017376, + "grad_norm": 0.11881954222917557, + "learning_rate": 4.448704999381376e-06, + "loss": 1.0358, + "step": 14240 + }, + { + "epoch": 0.8652409016343642, + "grad_norm": 0.11248598247766495, + "learning_rate": 4.444758509567709e-06, + "loss": 1.1083, + "step": 14241 + }, + { + "epoch": 0.8653016586669907, + "grad_norm": 0.18575093150138855, + "learning_rate": 4.4408136895902286e-06, + "loss": 1.0914, + "step": 14242 + }, + { + "epoch": 0.8653624156996172, + "grad_norm": 2.2854063510894775, + "learning_rate": 4.436870539593513e-06, + "loss": 1.0581, + "step": 14243 + }, + { + "epoch": 0.8654231727322438, + "grad_norm": 0.15236227214336395, + "learning_rate": 4.4329290597221295e-06, + "loss": 1.1288, + "step": 14244 + }, + { + "epoch": 0.8654839297648703, + "grad_norm": 0.5712029933929443, + "learning_rate": 4.428989250120541e-06, + "loss": 1.2311, + "step": 14245 + }, + { + "epoch": 0.8655446867974969, + "grad_norm": 0.14029696583747864, + "learning_rate": 4.4250511109331534e-06, + "loss": 1.0724, + "step": 14246 + }, + { + "epoch": 0.8656054438301234, + "grad_norm": 0.12488316744565964, + "learning_rate": 4.421114642304342e-06, + "loss": 1.0861, + "step": 14247 + }, + { + "epoch": 0.8656662008627498, + "grad_norm": 0.20525529980659485, + "learning_rate": 4.417179844378361e-06, + "loss": 1.1568, + "step": 14248 + }, + { + "epoch": 0.8657269578953763, + "grad_norm": 0.13475671410560608, + "learning_rate": 4.413246717299474e-06, + "loss": 1.0144, + "step": 14249 + }, + { + "epoch": 0.8657877149280029, + "grad_norm": 0.11134091019630432, + "learning_rate": 4.4093152612118434e-06, + "loss": 1.0229, + "step": 14250 + }, + { + "epoch": 0.8658484719606294, + "grad_norm": 0.1131577268242836, + "learning_rate": 4.405385476259571e-06, + "loss": 1.0198, + "step": 14251 + }, + { + "epoch": 0.865909228993256, + "grad_norm": 0.10383344441652298, + "learning_rate": 4.401457362586709e-06, + "loss": 1.0528, + "step": 14252 + }, + { + "epoch": 0.8659699860258825, + "grad_norm": 0.17939306795597076, + "learning_rate": 4.3975309203372396e-06, + "loss": 1.2157, + "step": 14253 + }, + { + "epoch": 0.866030743058509, + "grad_norm": 0.18818438053131104, + "learning_rate": 4.393606149655094e-06, + "loss": 0.9997, + "step": 14254 + }, + { + "epoch": 0.8660915000911356, + "grad_norm": 0.31205496191978455, + "learning_rate": 4.389683050684124e-06, + "loss": 1.0796, + "step": 14255 + }, + { + "epoch": 0.8661522571237621, + "grad_norm": 0.17409072816371918, + "learning_rate": 4.385761623568141e-06, + "loss": 1.2496, + "step": 14256 + }, + { + "epoch": 0.8662130141563886, + "grad_norm": 0.33971136808395386, + "learning_rate": 4.381841868450881e-06, + "loss": 1.0332, + "step": 14257 + }, + { + "epoch": 0.8662737711890152, + "grad_norm": 0.13563384115695953, + "learning_rate": 4.377923785476029e-06, + "loss": 1.0465, + "step": 14258 + }, + { + "epoch": 0.8663345282216417, + "grad_norm": 0.2389136403799057, + "learning_rate": 4.3740073747871865e-06, + "loss": 1.0715, + "step": 14259 + }, + { + "epoch": 0.8663952852542682, + "grad_norm": 0.6087927222251892, + "learning_rate": 4.3700926365279445e-06, + "loss": 1.1517, + "step": 14260 + }, + { + "epoch": 0.8664560422868947, + "grad_norm": 0.1144602820277214, + "learning_rate": 4.366179570841766e-06, + "loss": 0.9902, + "step": 14261 + }, + { + "epoch": 0.8665167993195212, + "grad_norm": 0.12300249189138412, + "learning_rate": 4.362268177872097e-06, + "loss": 1.024, + "step": 14262 + }, + { + "epoch": 0.8665775563521477, + "grad_norm": 0.14014755189418793, + "learning_rate": 4.358358457762307e-06, + "loss": 1.0035, + "step": 14263 + }, + { + "epoch": 0.8666383133847743, + "grad_norm": 0.2034769058227539, + "learning_rate": 4.3544504106557025e-06, + "loss": 1.2037, + "step": 14264 + }, + { + "epoch": 0.8666990704174008, + "grad_norm": 0.583436131477356, + "learning_rate": 4.350544036695553e-06, + "loss": 1.1972, + "step": 14265 + }, + { + "epoch": 0.8667598274500273, + "grad_norm": 0.16248038411140442, + "learning_rate": 4.346639336025033e-06, + "loss": 1.0567, + "step": 14266 + }, + { + "epoch": 0.8668205844826539, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.3427363087872815e-06, + "loss": 1.1516, + "step": 14267 + }, + { + "epoch": 0.8668813415152804, + "grad_norm": 0.206801176071167, + "learning_rate": 4.338834955125343e-06, + "loss": 1.0645, + "step": 14268 + }, + { + "epoch": 0.8669420985479069, + "grad_norm": 0.18729735910892487, + "learning_rate": 4.334935275182234e-06, + "loss": 1.0654, + "step": 14269 + }, + { + "epoch": 0.8670028555805335, + "grad_norm": 0.6838131546974182, + "learning_rate": 4.331037269100901e-06, + "loss": 1.2174, + "step": 14270 + }, + { + "epoch": 0.86706361261316, + "grad_norm": 0.14661040902137756, + "learning_rate": 4.327140937024232e-06, + "loss": 1.0889, + "step": 14271 + }, + { + "epoch": 0.8671243696457865, + "grad_norm": 0.15410764515399933, + "learning_rate": 4.323246279095033e-06, + "loss": 1.0795, + "step": 14272 + }, + { + "epoch": 0.8671851266784131, + "grad_norm": 0.17606182396411896, + "learning_rate": 4.319353295456074e-06, + "loss": 1.1158, + "step": 14273 + }, + { + "epoch": 0.8672458837110395, + "grad_norm": 0.2107379138469696, + "learning_rate": 4.315461986250052e-06, + "loss": 1.0633, + "step": 14274 + }, + { + "epoch": 0.867306640743666, + "grad_norm": 0.19956208765506744, + "learning_rate": 4.311572351619603e-06, + "loss": 1.058, + "step": 14275 + }, + { + "epoch": 0.8673673977762926, + "grad_norm": 0.4064960181713104, + "learning_rate": 4.3076843917072955e-06, + "loss": 1.1979, + "step": 14276 + }, + { + "epoch": 0.8674281548089191, + "grad_norm": 0.21979095041751862, + "learning_rate": 4.303798106655649e-06, + "loss": 1.0984, + "step": 14277 + }, + { + "epoch": 0.8674889118415456, + "grad_norm": 0.14662480354309082, + "learning_rate": 4.299913496607122e-06, + "loss": 1.028, + "step": 14278 + }, + { + "epoch": 0.8675496688741722, + "grad_norm": 0.21457675099372864, + "learning_rate": 4.2960305617040955e-06, + "loss": 1.1901, + "step": 14279 + }, + { + "epoch": 0.8676104259067987, + "grad_norm": 0.10797221213579178, + "learning_rate": 4.292149302088899e-06, + "loss": 1.0355, + "step": 14280 + }, + { + "epoch": 0.8676711829394252, + "grad_norm": 0.14204855263233185, + "learning_rate": 4.288269717903809e-06, + "loss": 1.0063, + "step": 14281 + }, + { + "epoch": 0.8677319399720518, + "grad_norm": 0.11679262667894363, + "learning_rate": 4.284391809291027e-06, + "loss": 1.0634, + "step": 14282 + }, + { + "epoch": 0.8677926970046783, + "grad_norm": 0.1416119486093521, + "learning_rate": 4.280515576392702e-06, + "loss": 1.0696, + "step": 14283 + }, + { + "epoch": 0.8678534540373049, + "grad_norm": 0.5395480990409851, + "learning_rate": 4.2766410193509136e-06, + "loss": 1.1584, + "step": 14284 + }, + { + "epoch": 0.8679142110699314, + "grad_norm": 0.13430988788604736, + "learning_rate": 4.2727681383076816e-06, + "loss": 1.0678, + "step": 14285 + }, + { + "epoch": 0.8679749681025579, + "grad_norm": 4.478382587432861, + "learning_rate": 4.268896933404981e-06, + "loss": 1.1018, + "step": 14286 + }, + { + "epoch": 0.8680357251351843, + "grad_norm": 0.11822658032178879, + "learning_rate": 4.265027404784705e-06, + "loss": 1.0135, + "step": 14287 + }, + { + "epoch": 0.8680964821678109, + "grad_norm": 0.20759336650371552, + "learning_rate": 4.261159552588689e-06, + "loss": 1.0894, + "step": 14288 + }, + { + "epoch": 0.8681572392004374, + "grad_norm": 0.11896127462387085, + "learning_rate": 4.2572933769587045e-06, + "loss": 1.043, + "step": 14289 + }, + { + "epoch": 0.868217996233064, + "grad_norm": 0.10811272263526917, + "learning_rate": 4.25342887803647e-06, + "loss": 1.0165, + "step": 14290 + }, + { + "epoch": 0.8682787532656905, + "grad_norm": 0.1244196742773056, + "learning_rate": 4.24956605596365e-06, + "loss": 1.0522, + "step": 14291 + }, + { + "epoch": 0.868339510298317, + "grad_norm": 0.19802682101726532, + "learning_rate": 4.245704910881826e-06, + "loss": 1.0674, + "step": 14292 + }, + { + "epoch": 0.8684002673309436, + "grad_norm": 0.3971807062625885, + "learning_rate": 4.241845442932535e-06, + "loss": 1.1721, + "step": 14293 + }, + { + "epoch": 0.8684610243635701, + "grad_norm": 0.5288057327270508, + "learning_rate": 4.237987652257242e-06, + "loss": 1.1029, + "step": 14294 + }, + { + "epoch": 0.8685217813961966, + "grad_norm": 0.13204018771648407, + "learning_rate": 4.23413153899736e-06, + "loss": 1.0733, + "step": 14295 + }, + { + "epoch": 0.8685825384288232, + "grad_norm": 0.10832633078098297, + "learning_rate": 4.2302771032942334e-06, + "loss": 1.0083, + "step": 14296 + }, + { + "epoch": 0.8686432954614497, + "grad_norm": 0.10118073970079422, + "learning_rate": 4.226424345289143e-06, + "loss": 1.0782, + "step": 14297 + }, + { + "epoch": 0.8687040524940762, + "grad_norm": 0.19373410940170288, + "learning_rate": 4.2225732651233195e-06, + "loss": 1.0414, + "step": 14298 + }, + { + "epoch": 0.8687648095267028, + "grad_norm": 0.20592525601387024, + "learning_rate": 4.218723862937917e-06, + "loss": 1.1401, + "step": 14299 + }, + { + "epoch": 0.8688255665593292, + "grad_norm": 0.5029820799827576, + "learning_rate": 4.214876138874046e-06, + "loss": 1.1679, + "step": 14300 + }, + { + "epoch": 0.8688863235919557, + "grad_norm": 0.130966916680336, + "learning_rate": 4.211030093072738e-06, + "loss": 1.0334, + "step": 14301 + }, + { + "epoch": 0.8689470806245823, + "grad_norm": 0.13666383922100067, + "learning_rate": 4.2071857256749725e-06, + "loss": 1.0496, + "step": 14302 + }, + { + "epoch": 0.8690078376572088, + "grad_norm": 0.09910415858030319, + "learning_rate": 4.203343036821667e-06, + "loss": 1.0108, + "step": 14303 + }, + { + "epoch": 0.8690685946898353, + "grad_norm": 0.2526373267173767, + "learning_rate": 4.199502026653674e-06, + "loss": 1.1438, + "step": 14304 + }, + { + "epoch": 0.8691293517224619, + "grad_norm": 0.5141761898994446, + "learning_rate": 4.195662695311786e-06, + "loss": 1.223, + "step": 14305 + }, + { + "epoch": 0.8691901087550884, + "grad_norm": 0.16585469245910645, + "learning_rate": 4.1918250429367255e-06, + "loss": 1.1055, + "step": 14306 + }, + { + "epoch": 0.8692508657877149, + "grad_norm": 0.13301222026348114, + "learning_rate": 4.187989069669196e-06, + "loss": 1.065, + "step": 14307 + }, + { + "epoch": 0.8693116228203415, + "grad_norm": 0.18069688975811005, + "learning_rate": 4.184154775649768e-06, + "loss": 1.0944, + "step": 14308 + }, + { + "epoch": 0.869372379852968, + "grad_norm": 0.14474423229694366, + "learning_rate": 4.180322161019007e-06, + "loss": 1.0705, + "step": 14309 + }, + { + "epoch": 0.8694331368855945, + "grad_norm": 0.1404387652873993, + "learning_rate": 4.176491225917384e-06, + "loss": 1.0815, + "step": 14310 + }, + { + "epoch": 0.8694938939182211, + "grad_norm": 0.12162946909666061, + "learning_rate": 4.172661970485336e-06, + "loss": 1.0218, + "step": 14311 + }, + { + "epoch": 0.8695546509508476, + "grad_norm": 0.12406567484140396, + "learning_rate": 4.168834394863231e-06, + "loss": 1.0487, + "step": 14312 + }, + { + "epoch": 0.8696154079834741, + "grad_norm": 0.19263829290866852, + "learning_rate": 4.165008499191353e-06, + "loss": 1.1119, + "step": 14313 + }, + { + "epoch": 0.8696761650161006, + "grad_norm": 0.1953824758529663, + "learning_rate": 4.161184283609959e-06, + "loss": 1.0972, + "step": 14314 + }, + { + "epoch": 0.8697369220487271, + "grad_norm": 0.1113588958978653, + "learning_rate": 4.157361748259198e-06, + "loss": 1.0449, + "step": 14315 + }, + { + "epoch": 0.8697976790813536, + "grad_norm": 0.12395302951335907, + "learning_rate": 4.153540893279212e-06, + "loss": 1.0492, + "step": 14316 + }, + { + "epoch": 0.8698584361139802, + "grad_norm": 0.17302490770816803, + "learning_rate": 4.149721718810051e-06, + "loss": 1.0272, + "step": 14317 + }, + { + "epoch": 0.8699191931466067, + "grad_norm": 0.14489449560642242, + "learning_rate": 4.145904224991698e-06, + "loss": 1.0945, + "step": 14318 + }, + { + "epoch": 0.8699799501792332, + "grad_norm": 0.16393961012363434, + "learning_rate": 4.1420884119640905e-06, + "loss": 1.094, + "step": 14319 + }, + { + "epoch": 0.8700407072118598, + "grad_norm": 0.10775593668222427, + "learning_rate": 4.1382742798670935e-06, + "loss": 1.0414, + "step": 14320 + }, + { + "epoch": 0.8701014642444863, + "grad_norm": 0.23243103921413422, + "learning_rate": 4.134461828840519e-06, + "loss": 1.2255, + "step": 14321 + }, + { + "epoch": 0.8701622212771128, + "grad_norm": 0.15937195718288422, + "learning_rate": 4.130651059024116e-06, + "loss": 1.0756, + "step": 14322 + }, + { + "epoch": 0.8702229783097394, + "grad_norm": 3.0323517322540283, + "learning_rate": 4.12684197055756e-06, + "loss": 1.0131, + "step": 14323 + }, + { + "epoch": 0.8702837353423659, + "grad_norm": 0.19789010286331177, + "learning_rate": 4.1230345635804804e-06, + "loss": 1.1573, + "step": 14324 + }, + { + "epoch": 0.8703444923749925, + "grad_norm": 0.16213564574718475, + "learning_rate": 4.119228838232436e-06, + "loss": 1.1004, + "step": 14325 + }, + { + "epoch": 0.870405249407619, + "grad_norm": 0.22013337910175323, + "learning_rate": 4.115424794652917e-06, + "loss": 1.0789, + "step": 14326 + }, + { + "epoch": 0.8704660064402454, + "grad_norm": 0.11992393434047699, + "learning_rate": 4.111622432981393e-06, + "loss": 1.0526, + "step": 14327 + }, + { + "epoch": 0.870526763472872, + "grad_norm": 0.15205895900726318, + "learning_rate": 4.10782175335721e-06, + "loss": 1.1187, + "step": 14328 + }, + { + "epoch": 0.8705875205054985, + "grad_norm": 0.1337515264749527, + "learning_rate": 4.104022755919689e-06, + "loss": 0.983, + "step": 14329 + }, + { + "epoch": 0.870648277538125, + "grad_norm": 3.0589065551757812, + "learning_rate": 4.1002254408080865e-06, + "loss": 1.0804, + "step": 14330 + }, + { + "epoch": 0.8707090345707515, + "grad_norm": 0.0997358188033104, + "learning_rate": 4.096429808161578e-06, + "loss": 1.0412, + "step": 14331 + }, + { + "epoch": 0.8707697916033781, + "grad_norm": 0.1250249147415161, + "learning_rate": 4.092635858119326e-06, + "loss": 1.0676, + "step": 14332 + }, + { + "epoch": 0.8708305486360046, + "grad_norm": 0.23462393879890442, + "learning_rate": 4.088843590820374e-06, + "loss": 1.1005, + "step": 14333 + }, + { + "epoch": 0.8708913056686312, + "grad_norm": 0.2286270260810852, + "learning_rate": 4.085053006403744e-06, + "loss": 1.0357, + "step": 14334 + }, + { + "epoch": 0.8709520627012577, + "grad_norm": 0.19235652685165405, + "learning_rate": 4.081264105008364e-06, + "loss": 1.056, + "step": 14335 + }, + { + "epoch": 0.8710128197338842, + "grad_norm": 0.12831154465675354, + "learning_rate": 4.077476886773118e-06, + "loss": 1.0069, + "step": 14336 + }, + { + "epoch": 0.8710735767665108, + "grad_norm": 0.196796253323555, + "learning_rate": 4.073691351836839e-06, + "loss": 1.0868, + "step": 14337 + }, + { + "epoch": 0.8711343337991373, + "grad_norm": 0.37093472480773926, + "learning_rate": 4.069907500338277e-06, + "loss": 1.1283, + "step": 14338 + }, + { + "epoch": 0.8711950908317638, + "grad_norm": 0.20871317386627197, + "learning_rate": 4.066125332416138e-06, + "loss": 1.1847, + "step": 14339 + }, + { + "epoch": 0.8712558478643903, + "grad_norm": 1.4458787441253662, + "learning_rate": 4.062344848209055e-06, + "loss": 1.0755, + "step": 14340 + }, + { + "epoch": 0.8713166048970168, + "grad_norm": 0.11043219268321991, + "learning_rate": 4.0585660478556e-06, + "loss": 1.0243, + "step": 14341 + }, + { + "epoch": 0.8713773619296433, + "grad_norm": 0.11328072100877762, + "learning_rate": 4.0547889314942855e-06, + "loss": 0.9849, + "step": 14342 + }, + { + "epoch": 0.8714381189622699, + "grad_norm": 0.16323398053646088, + "learning_rate": 4.051013499263568e-06, + "loss": 1.1197, + "step": 14343 + }, + { + "epoch": 0.8714988759948964, + "grad_norm": 0.16723348200321198, + "learning_rate": 4.047239751301829e-06, + "loss": 1.0409, + "step": 14344 + }, + { + "epoch": 0.8715596330275229, + "grad_norm": 0.15389733016490936, + "learning_rate": 4.043467687747399e-06, + "loss": 1.1401, + "step": 14345 + }, + { + "epoch": 0.8716203900601495, + "grad_norm": 0.12652705609798431, + "learning_rate": 4.039697308738544e-06, + "loss": 1.0526, + "step": 14346 + }, + { + "epoch": 0.871681147092776, + "grad_norm": 1.9126840829849243, + "learning_rate": 4.03592861441347e-06, + "loss": 1.1255, + "step": 14347 + }, + { + "epoch": 0.8717419041254025, + "grad_norm": 0.1312454640865326, + "learning_rate": 4.0321616049103116e-06, + "loss": 1.0382, + "step": 14348 + }, + { + "epoch": 0.8718026611580291, + "grad_norm": 0.16049225628376007, + "learning_rate": 4.028396280367164e-06, + "loss": 1.1389, + "step": 14349 + }, + { + "epoch": 0.8718634181906556, + "grad_norm": 0.2220696061849594, + "learning_rate": 4.024632640922027e-06, + "loss": 1.0807, + "step": 14350 + }, + { + "epoch": 0.8719241752232821, + "grad_norm": 0.1569354385137558, + "learning_rate": 4.0208706867128744e-06, + "loss": 1.0815, + "step": 14351 + }, + { + "epoch": 0.8719849322559087, + "grad_norm": 0.12963464856147766, + "learning_rate": 4.017110417877585e-06, + "loss": 1.0417, + "step": 14352 + }, + { + "epoch": 0.8720456892885351, + "grad_norm": 0.10035611689090729, + "learning_rate": 4.013351834554008e-06, + "loss": 1.0455, + "step": 14353 + }, + { + "epoch": 0.8721064463211616, + "grad_norm": 0.11115585267543793, + "learning_rate": 4.009594936879918e-06, + "loss": 1.0292, + "step": 14354 + }, + { + "epoch": 0.8721672033537882, + "grad_norm": 0.10728547722101212, + "learning_rate": 4.00583972499301e-06, + "loss": 1.0573, + "step": 14355 + }, + { + "epoch": 0.8722279603864147, + "grad_norm": 5.861358165740967, + "learning_rate": 4.00208619903093e-06, + "loss": 1.02, + "step": 14356 + }, + { + "epoch": 0.8722887174190412, + "grad_norm": 0.1270265430212021, + "learning_rate": 3.998334359131267e-06, + "loss": 1.0581, + "step": 14357 + }, + { + "epoch": 0.8723494744516678, + "grad_norm": 0.08880908042192459, + "learning_rate": 3.994584205431562e-06, + "loss": 1.0108, + "step": 14358 + }, + { + "epoch": 0.8724102314842943, + "grad_norm": 0.4161021113395691, + "learning_rate": 3.99083573806926e-06, + "loss": 1.2731, + "step": 14359 + }, + { + "epoch": 0.8724709885169208, + "grad_norm": 0.1339392364025116, + "learning_rate": 3.987088957181772e-06, + "loss": 1.0502, + "step": 14360 + }, + { + "epoch": 0.8725317455495474, + "grad_norm": 0.6967560052871704, + "learning_rate": 3.983343862906436e-06, + "loss": 1.0221, + "step": 14361 + }, + { + "epoch": 0.8725925025821739, + "grad_norm": 0.15927958488464355, + "learning_rate": 3.979600455380511e-06, + "loss": 1.0628, + "step": 14362 + }, + { + "epoch": 0.8726532596148004, + "grad_norm": 0.11636935919523239, + "learning_rate": 3.975858734741239e-06, + "loss": 1.035, + "step": 14363 + }, + { + "epoch": 0.872714016647427, + "grad_norm": 0.25674161314964294, + "learning_rate": 3.972118701125754e-06, + "loss": 1.1213, + "step": 14364 + }, + { + "epoch": 0.8727747736800535, + "grad_norm": 0.14380165934562683, + "learning_rate": 3.968380354671164e-06, + "loss": 1.0911, + "step": 14365 + }, + { + "epoch": 0.8728355307126799, + "grad_norm": 0.09789226949214935, + "learning_rate": 3.964643695514481e-06, + "loss": 1.0164, + "step": 14366 + }, + { + "epoch": 0.8728962877453065, + "grad_norm": 0.10349620878696442, + "learning_rate": 3.960908723792689e-06, + "loss": 0.9742, + "step": 14367 + }, + { + "epoch": 0.872957044777933, + "grad_norm": 0.11412148177623749, + "learning_rate": 3.95717543964268e-06, + "loss": 1.0696, + "step": 14368 + }, + { + "epoch": 0.8730178018105595, + "grad_norm": 2.68796443939209, + "learning_rate": 3.95344384320131e-06, + "loss": 0.991, + "step": 14369 + }, + { + "epoch": 0.8730785588431861, + "grad_norm": 0.16747133433818817, + "learning_rate": 3.949713934605354e-06, + "loss": 1.0614, + "step": 14370 + }, + { + "epoch": 0.8731393158758126, + "grad_norm": 0.09575141966342926, + "learning_rate": 3.945985713991534e-06, + "loss": 1.0033, + "step": 14371 + }, + { + "epoch": 0.8732000729084392, + "grad_norm": 0.2550472021102905, + "learning_rate": 3.942259181496516e-06, + "loss": 1.1353, + "step": 14372 + }, + { + "epoch": 0.8732608299410657, + "grad_norm": 0.10565902292728424, + "learning_rate": 3.938534337256877e-06, + "loss": 1.0307, + "step": 14373 + }, + { + "epoch": 0.8733215869736922, + "grad_norm": 0.3183503746986389, + "learning_rate": 3.934811181409181e-06, + "loss": 1.1327, + "step": 14374 + }, + { + "epoch": 0.8733823440063188, + "grad_norm": 0.19271215796470642, + "learning_rate": 3.9310897140898805e-06, + "loss": 1.1201, + "step": 14375 + }, + { + "epoch": 0.8734431010389453, + "grad_norm": 0.24655552208423615, + "learning_rate": 3.927369935435387e-06, + "loss": 1.1384, + "step": 14376 + }, + { + "epoch": 0.8735038580715718, + "grad_norm": 0.16251324117183685, + "learning_rate": 3.923651845582061e-06, + "loss": 1.1044, + "step": 14377 + }, + { + "epoch": 0.8735646151041984, + "grad_norm": 0.09870925545692444, + "learning_rate": 3.9199354446661675e-06, + "loss": 1.0222, + "step": 14378 + }, + { + "epoch": 0.8736253721368248, + "grad_norm": 0.17102311551570892, + "learning_rate": 3.916220732823961e-06, + "loss": 1.078, + "step": 14379 + }, + { + "epoch": 0.8736861291694513, + "grad_norm": 0.111696757376194, + "learning_rate": 3.9125077101915875e-06, + "loss": 1.0412, + "step": 14380 + }, + { + "epoch": 0.8737468862020779, + "grad_norm": 0.1547119915485382, + "learning_rate": 3.908796376905166e-06, + "loss": 1.047, + "step": 14381 + }, + { + "epoch": 0.8738076432347044, + "grad_norm": 0.3426506519317627, + "learning_rate": 3.90508673310071e-06, + "loss": 1.1596, + "step": 14382 + }, + { + "epoch": 0.8738684002673309, + "grad_norm": 0.09747691452503204, + "learning_rate": 3.9013787789142e-06, + "loss": 1.0123, + "step": 14383 + }, + { + "epoch": 0.8739291572999575, + "grad_norm": 0.09974610805511475, + "learning_rate": 3.89767251448157e-06, + "loss": 1.0447, + "step": 14384 + }, + { + "epoch": 0.873989914332584, + "grad_norm": 0.10408081859350204, + "learning_rate": 3.893967939938669e-06, + "loss": 1.0935, + "step": 14385 + }, + { + "epoch": 0.8740506713652105, + "grad_norm": 0.16880090534687042, + "learning_rate": 3.890265055421283e-06, + "loss": 1.0666, + "step": 14386 + }, + { + "epoch": 0.8741114283978371, + "grad_norm": 0.19213543832302094, + "learning_rate": 3.886563861065146e-06, + "loss": 1.1783, + "step": 14387 + }, + { + "epoch": 0.8741721854304636, + "grad_norm": 0.172330841422081, + "learning_rate": 3.882864357005922e-06, + "loss": 1.0489, + "step": 14388 + }, + { + "epoch": 0.8742329424630901, + "grad_norm": 0.11979315429925919, + "learning_rate": 3.87916654337922e-06, + "loss": 1.0029, + "step": 14389 + }, + { + "epoch": 0.8742936994957167, + "grad_norm": 0.18948282301425934, + "learning_rate": 3.875470420320582e-06, + "loss": 1.1096, + "step": 14390 + }, + { + "epoch": 0.8743544565283432, + "grad_norm": 0.3669520914554596, + "learning_rate": 3.871775987965498e-06, + "loss": 1.1074, + "step": 14391 + }, + { + "epoch": 0.8744152135609696, + "grad_norm": 0.14308518171310425, + "learning_rate": 3.868083246449372e-06, + "loss": 1.1311, + "step": 14392 + }, + { + "epoch": 0.8744759705935962, + "grad_norm": 0.16750699281692505, + "learning_rate": 3.864392195907579e-06, + "loss": 1.0572, + "step": 14393 + }, + { + "epoch": 0.8745367276262227, + "grad_norm": 0.14526700973510742, + "learning_rate": 3.860702836475405e-06, + "loss": 1.096, + "step": 14394 + }, + { + "epoch": 0.8745974846588492, + "grad_norm": 0.26115116477012634, + "learning_rate": 3.8570151682880865e-06, + "loss": 1.0689, + "step": 14395 + }, + { + "epoch": 0.8746582416914758, + "grad_norm": 0.17449040710926056, + "learning_rate": 3.853329191480798e-06, + "loss": 1.1736, + "step": 14396 + }, + { + "epoch": 0.8747189987241023, + "grad_norm": 0.10472922772169113, + "learning_rate": 3.849644906188643e-06, + "loss": 0.9713, + "step": 14397 + }, + { + "epoch": 0.8747797557567288, + "grad_norm": 0.455191969871521, + "learning_rate": 3.845962312546681e-06, + "loss": 1.2272, + "step": 14398 + }, + { + "epoch": 0.8748405127893554, + "grad_norm": 0.16802021861076355, + "learning_rate": 3.842281410689879e-06, + "loss": 1.0846, + "step": 14399 + }, + { + "epoch": 0.8749012698219819, + "grad_norm": 0.1144322007894516, + "learning_rate": 3.838602200753183e-06, + "loss": 1.0231, + "step": 14400 + }, + { + "epoch": 0.8749620268546084, + "grad_norm": 0.1819830983877182, + "learning_rate": 3.83492468287146e-06, + "loss": 1.0766, + "step": 14401 + }, + { + "epoch": 0.875022783887235, + "grad_norm": 0.1960277408361435, + "learning_rate": 3.8312488571794816e-06, + "loss": 1.0489, + "step": 14402 + }, + { + "epoch": 0.8750835409198615, + "grad_norm": 0.2229665070772171, + "learning_rate": 3.827574723811994e-06, + "loss": 1.2662, + "step": 14403 + }, + { + "epoch": 0.875144297952488, + "grad_norm": 0.16860918700695038, + "learning_rate": 3.8239022829036905e-06, + "loss": 1.1115, + "step": 14404 + }, + { + "epoch": 0.8752050549851145, + "grad_norm": 0.13054722547531128, + "learning_rate": 3.8202315345891696e-06, + "loss": 1.0217, + "step": 14405 + }, + { + "epoch": 0.875265812017741, + "grad_norm": 0.14450567960739136, + "learning_rate": 3.816562479002994e-06, + "loss": 1.0582, + "step": 14406 + }, + { + "epoch": 0.8753265690503675, + "grad_norm": 0.10299817472696304, + "learning_rate": 3.812895116279641e-06, + "loss": 1.0167, + "step": 14407 + }, + { + "epoch": 0.8753873260829941, + "grad_norm": 0.14837372303009033, + "learning_rate": 3.8092294465535527e-06, + "loss": 1.0695, + "step": 14408 + }, + { + "epoch": 0.8754480831156206, + "grad_norm": 0.11886508762836456, + "learning_rate": 3.8055654699590827e-06, + "loss": 1.049, + "step": 14409 + }, + { + "epoch": 0.8755088401482471, + "grad_norm": 0.15067318081855774, + "learning_rate": 3.80190318663054e-06, + "loss": 1.1297, + "step": 14410 + }, + { + "epoch": 0.8755695971808737, + "grad_norm": 0.164082869887352, + "learning_rate": 3.7982425967021673e-06, + "loss": 1.112, + "step": 14411 + }, + { + "epoch": 0.8756303542135002, + "grad_norm": 0.2626175880432129, + "learning_rate": 3.7945837003081406e-06, + "loss": 1.0367, + "step": 14412 + }, + { + "epoch": 0.8756911112461268, + "grad_norm": 0.16389034688472748, + "learning_rate": 3.790926497582581e-06, + "loss": 1.135, + "step": 14413 + }, + { + "epoch": 0.8757518682787533, + "grad_norm": 0.7456648349761963, + "learning_rate": 3.7872709886595413e-06, + "loss": 1.0796, + "step": 14414 + }, + { + "epoch": 0.8758126253113798, + "grad_norm": 0.12602953612804413, + "learning_rate": 3.7836171736730206e-06, + "loss": 1.0384, + "step": 14415 + }, + { + "epoch": 0.8758733823440064, + "grad_norm": 0.11357144266366959, + "learning_rate": 3.7799650527569396e-06, + "loss": 1.0601, + "step": 14416 + }, + { + "epoch": 0.8759341393766329, + "grad_norm": 0.10738505423069, + "learning_rate": 3.77631462604518e-06, + "loss": 1.0487, + "step": 14417 + }, + { + "epoch": 0.8759948964092594, + "grad_norm": 2.844250440597534, + "learning_rate": 3.7726658936715397e-06, + "loss": 1.0078, + "step": 14418 + }, + { + "epoch": 0.8760556534418859, + "grad_norm": 0.12740908563137054, + "learning_rate": 3.769018855769757e-06, + "loss": 1.0331, + "step": 14419 + }, + { + "epoch": 0.8761164104745124, + "grad_norm": 0.10843446105718613, + "learning_rate": 3.7653735124735355e-06, + "loss": 1.087, + "step": 14420 + }, + { + "epoch": 0.8761771675071389, + "grad_norm": 0.11844402551651001, + "learning_rate": 3.7617298639164913e-06, + "loss": 1.0338, + "step": 14421 + }, + { + "epoch": 0.8762379245397655, + "grad_norm": 0.6083289980888367, + "learning_rate": 3.7580879102321663e-06, + "loss": 1.0459, + "step": 14422 + }, + { + "epoch": 0.876298681572392, + "grad_norm": 0.16263823211193085, + "learning_rate": 3.7544476515540715e-06, + "loss": 1.158, + "step": 14423 + }, + { + "epoch": 0.8763594386050185, + "grad_norm": 0.15546706318855286, + "learning_rate": 3.750809088015628e-06, + "loss": 1.0721, + "step": 14424 + }, + { + "epoch": 0.8764201956376451, + "grad_norm": 22.740657806396484, + "learning_rate": 3.747172219750222e-06, + "loss": 1.0248, + "step": 14425 + }, + { + "epoch": 0.8764809526702716, + "grad_norm": 0.298325777053833, + "learning_rate": 3.74353704689116e-06, + "loss": 1.0601, + "step": 14426 + }, + { + "epoch": 0.8765417097028981, + "grad_norm": 2.110259771347046, + "learning_rate": 3.73990356957169e-06, + "loss": 1.0454, + "step": 14427 + }, + { + "epoch": 0.8766024667355247, + "grad_norm": 0.1646614968776703, + "learning_rate": 3.7362717879250053e-06, + "loss": 1.1097, + "step": 14428 + }, + { + "epoch": 0.8766632237681512, + "grad_norm": 0.278971403837204, + "learning_rate": 3.732641702084205e-06, + "loss": 1.1391, + "step": 14429 + }, + { + "epoch": 0.8767239808007777, + "grad_norm": 0.1312994360923767, + "learning_rate": 3.7290133121823723e-06, + "loss": 1.011, + "step": 14430 + }, + { + "epoch": 0.8767847378334043, + "grad_norm": 0.2943628132343292, + "learning_rate": 3.7253866183525e-06, + "loss": 1.0538, + "step": 14431 + }, + { + "epoch": 0.8768454948660307, + "grad_norm": 0.15289996564388275, + "learning_rate": 3.721761620727532e-06, + "loss": 1.0203, + "step": 14432 + }, + { + "epoch": 0.8769062518986572, + "grad_norm": 0.18115881085395813, + "learning_rate": 3.7181383194403286e-06, + "loss": 1.0979, + "step": 14433 + }, + { + "epoch": 0.8769670089312838, + "grad_norm": 0.2012121081352234, + "learning_rate": 3.714516714623717e-06, + "loss": 1.1157, + "step": 14434 + }, + { + "epoch": 0.8770277659639103, + "grad_norm": 0.1832466423511505, + "learning_rate": 3.7108968064104412e-06, + "loss": 1.0592, + "step": 14435 + }, + { + "epoch": 0.8770885229965368, + "grad_norm": 0.13826370239257812, + "learning_rate": 3.707278594933189e-06, + "loss": 1.0607, + "step": 14436 + }, + { + "epoch": 0.8771492800291634, + "grad_norm": 0.190177783370018, + "learning_rate": 3.703662080324588e-06, + "loss": 1.1455, + "step": 14437 + }, + { + "epoch": 0.8772100370617899, + "grad_norm": 0.14002488553524017, + "learning_rate": 3.7000472627172047e-06, + "loss": 1.0244, + "step": 14438 + }, + { + "epoch": 0.8772707940944164, + "grad_norm": 0.17376451194286346, + "learning_rate": 3.696434142243538e-06, + "loss": 1.1515, + "step": 14439 + }, + { + "epoch": 0.877331551127043, + "grad_norm": 0.15191587805747986, + "learning_rate": 3.6928227190360154e-06, + "loss": 1.0563, + "step": 14440 + }, + { + "epoch": 0.8773923081596695, + "grad_norm": 0.5717953443527222, + "learning_rate": 3.6892129932270482e-06, + "loss": 1.062, + "step": 14441 + }, + { + "epoch": 0.877453065192296, + "grad_norm": 0.09690621495246887, + "learning_rate": 3.685604964948919e-06, + "loss": 1.0183, + "step": 14442 + }, + { + "epoch": 0.8775138222249226, + "grad_norm": 0.1264795958995819, + "learning_rate": 3.681998634333894e-06, + "loss": 1.0749, + "step": 14443 + }, + { + "epoch": 0.8775745792575491, + "grad_norm": 0.12863373756408691, + "learning_rate": 3.6783940015141625e-06, + "loss": 1.0854, + "step": 14444 + }, + { + "epoch": 0.8776353362901755, + "grad_norm": 1.2896924018859863, + "learning_rate": 3.6747910666218466e-06, + "loss": 1.06, + "step": 14445 + }, + { + "epoch": 0.8776960933228021, + "grad_norm": 0.12301401793956757, + "learning_rate": 3.6711898297890234e-06, + "loss": 1.0481, + "step": 14446 + }, + { + "epoch": 0.8777568503554286, + "grad_norm": 0.15882372856140137, + "learning_rate": 3.667590291147693e-06, + "loss": 1.0705, + "step": 14447 + }, + { + "epoch": 0.8778176073880551, + "grad_norm": 0.12979115545749664, + "learning_rate": 3.663992450829801e-06, + "loss": 1.0566, + "step": 14448 + }, + { + "epoch": 0.8778783644206817, + "grad_norm": 9.955571174621582, + "learning_rate": 3.660396308967218e-06, + "loss": 0.9969, + "step": 14449 + }, + { + "epoch": 0.8779391214533082, + "grad_norm": 0.11079701036214828, + "learning_rate": 3.6568018656917614e-06, + "loss": 1.0505, + "step": 14450 + }, + { + "epoch": 0.8779998784859347, + "grad_norm": 0.1300920695066452, + "learning_rate": 3.6532091211351928e-06, + "loss": 0.9929, + "step": 14451 + }, + { + "epoch": 0.8780606355185613, + "grad_norm": 0.18141959607601166, + "learning_rate": 3.6496180754292007e-06, + "loss": 1.1408, + "step": 14452 + }, + { + "epoch": 0.8781213925511878, + "grad_norm": 1.4117289781570435, + "learning_rate": 3.6460287287054196e-06, + "loss": 1.0695, + "step": 14453 + }, + { + "epoch": 0.8781821495838144, + "grad_norm": 0.1022375300526619, + "learning_rate": 3.642441081095421e-06, + "loss": 1.0178, + "step": 14454 + }, + { + "epoch": 0.8782429066164409, + "grad_norm": 0.17482461035251617, + "learning_rate": 3.638855132730701e-06, + "loss": 1.0692, + "step": 14455 + }, + { + "epoch": 0.8783036636490674, + "grad_norm": 0.1493391990661621, + "learning_rate": 3.635270883742703e-06, + "loss": 1.0183, + "step": 14456 + }, + { + "epoch": 0.878364420681694, + "grad_norm": 0.3150024712085724, + "learning_rate": 3.631688334262817e-06, + "loss": 1.0442, + "step": 14457 + }, + { + "epoch": 0.8784251777143204, + "grad_norm": 0.1289951056241989, + "learning_rate": 3.6281074844223608e-06, + "loss": 1.0338, + "step": 14458 + }, + { + "epoch": 0.8784859347469469, + "grad_norm": 0.40785345435142517, + "learning_rate": 3.6245283343525837e-06, + "loss": 1.1256, + "step": 14459 + }, + { + "epoch": 0.8785466917795735, + "grad_norm": 0.1404150277376175, + "learning_rate": 3.6209508841846873e-06, + "loss": 1.0624, + "step": 14460 + }, + { + "epoch": 0.8786074488122, + "grad_norm": 0.12771236896514893, + "learning_rate": 3.6173751340497997e-06, + "loss": 1.045, + "step": 14461 + }, + { + "epoch": 0.8786682058448265, + "grad_norm": 0.140315443277359, + "learning_rate": 3.6138010840789936e-06, + "loss": 0.9968, + "step": 14462 + }, + { + "epoch": 0.8787289628774531, + "grad_norm": 0.18975098431110382, + "learning_rate": 3.6102287344032703e-06, + "loss": 1.102, + "step": 14463 + }, + { + "epoch": 0.8787897199100796, + "grad_norm": 0.1953096240758896, + "learning_rate": 3.6066580851535857e-06, + "loss": 1.1883, + "step": 14464 + }, + { + "epoch": 0.8788504769427061, + "grad_norm": 0.1363508254289627, + "learning_rate": 3.6030891364608133e-06, + "loss": 1.0215, + "step": 14465 + }, + { + "epoch": 0.8789112339753327, + "grad_norm": 0.14448469877243042, + "learning_rate": 3.59952188845577e-06, + "loss": 1.0609, + "step": 14466 + }, + { + "epoch": 0.8789719910079592, + "grad_norm": 0.5873912572860718, + "learning_rate": 3.5959563412692297e-06, + "loss": 1.2934, + "step": 14467 + }, + { + "epoch": 0.8790327480405857, + "grad_norm": 0.2090008556842804, + "learning_rate": 3.5923924950318875e-06, + "loss": 1.1541, + "step": 14468 + }, + { + "epoch": 0.8790935050732123, + "grad_norm": 0.13231422007083893, + "learning_rate": 3.588830349874356e-06, + "loss": 1.0369, + "step": 14469 + }, + { + "epoch": 0.8791542621058388, + "grad_norm": 0.15206478536128998, + "learning_rate": 3.5852699059272243e-06, + "loss": 1.0779, + "step": 14470 + }, + { + "epoch": 0.8792150191384652, + "grad_norm": 0.1617000550031662, + "learning_rate": 3.581711163320989e-06, + "loss": 1.0624, + "step": 14471 + }, + { + "epoch": 0.8792757761710918, + "grad_norm": 0.12221449613571167, + "learning_rate": 3.578154122186106e-06, + "loss": 1.0475, + "step": 14472 + }, + { + "epoch": 0.8793365332037183, + "grad_norm": 0.11573433130979538, + "learning_rate": 3.5745987826529604e-06, + "loss": 1.0414, + "step": 14473 + }, + { + "epoch": 0.8793972902363448, + "grad_norm": 0.2252984195947647, + "learning_rate": 3.57104514485187e-06, + "loss": 1.1763, + "step": 14474 + }, + { + "epoch": 0.8794580472689714, + "grad_norm": 0.1389123648405075, + "learning_rate": 3.567493208913103e-06, + "loss": 1.0547, + "step": 14475 + }, + { + "epoch": 0.8795188043015979, + "grad_norm": 0.09723228961229324, + "learning_rate": 3.5639429749668272e-06, + "loss": 0.9914, + "step": 14476 + }, + { + "epoch": 0.8795795613342244, + "grad_norm": 0.16194170713424683, + "learning_rate": 3.5603944431432114e-06, + "loss": 1.0777, + "step": 14477 + }, + { + "epoch": 0.879640318366851, + "grad_norm": 0.12059863656759262, + "learning_rate": 3.5568476135723116e-06, + "loss": 1.0466, + "step": 14478 + }, + { + "epoch": 0.8797010753994775, + "grad_norm": 0.1424207240343094, + "learning_rate": 3.553302486384141e-06, + "loss": 1.055, + "step": 14479 + }, + { + "epoch": 0.879761832432104, + "grad_norm": 0.13578937947750092, + "learning_rate": 3.549759061708641e-06, + "loss": 1.1214, + "step": 14480 + }, + { + "epoch": 0.8798225894647306, + "grad_norm": 0.13324271142482758, + "learning_rate": 3.546217339675706e-06, + "loss": 1.1089, + "step": 14481 + }, + { + "epoch": 0.8798833464973571, + "grad_norm": 0.10492102056741714, + "learning_rate": 3.5426773204151498e-06, + "loss": 1.0897, + "step": 14482 + }, + { + "epoch": 0.8799441035299836, + "grad_norm": 0.3962917625904083, + "learning_rate": 3.5391390040567408e-06, + "loss": 1.2425, + "step": 14483 + }, + { + "epoch": 0.8800048605626101, + "grad_norm": 0.10516023635864258, + "learning_rate": 3.5356023907301696e-06, + "loss": 1.0185, + "step": 14484 + }, + { + "epoch": 0.8800656175952366, + "grad_norm": 0.14894473552703857, + "learning_rate": 3.5320674805650767e-06, + "loss": 1.0935, + "step": 14485 + }, + { + "epoch": 0.8801263746278631, + "grad_norm": 0.16137200593948364, + "learning_rate": 3.528534273691031e-06, + "loss": 1.0546, + "step": 14486 + }, + { + "epoch": 0.8801871316604897, + "grad_norm": 0.10996224731206894, + "learning_rate": 3.5250027702375345e-06, + "loss": 1.0507, + "step": 14487 + }, + { + "epoch": 0.8802478886931162, + "grad_norm": 0.15636777877807617, + "learning_rate": 3.521472970334061e-06, + "loss": 1.1398, + "step": 14488 + }, + { + "epoch": 0.8803086457257427, + "grad_norm": 0.22474385797977448, + "learning_rate": 3.5179448741099743e-06, + "loss": 1.1368, + "step": 14489 + }, + { + "epoch": 0.8803694027583693, + "grad_norm": 0.1425997018814087, + "learning_rate": 3.5144184816946036e-06, + "loss": 0.9917, + "step": 14490 + }, + { + "epoch": 0.8804301597909958, + "grad_norm": 0.236920565366745, + "learning_rate": 3.510893793217207e-06, + "loss": 1.1124, + "step": 14491 + }, + { + "epoch": 0.8804909168236223, + "grad_norm": 0.11593442410230637, + "learning_rate": 3.5073708088069756e-06, + "loss": 1.0283, + "step": 14492 + }, + { + "epoch": 0.8805516738562489, + "grad_norm": 0.19685669243335724, + "learning_rate": 3.503849528593062e-06, + "loss": 1.0703, + "step": 14493 + }, + { + "epoch": 0.8806124308888754, + "grad_norm": 0.15274019539356232, + "learning_rate": 3.500329952704534e-06, + "loss": 1.0925, + "step": 14494 + }, + { + "epoch": 0.880673187921502, + "grad_norm": 0.1302100419998169, + "learning_rate": 3.496812081270406e-06, + "loss": 1.065, + "step": 14495 + }, + { + "epoch": 0.8807339449541285, + "grad_norm": 0.14908307790756226, + "learning_rate": 3.493295914419603e-06, + "loss": 1.0319, + "step": 14496 + }, + { + "epoch": 0.8807947019867549, + "grad_norm": 0.1167093962430954, + "learning_rate": 3.489781452281038e-06, + "loss": 1.0376, + "step": 14497 + }, + { + "epoch": 0.8808554590193814, + "grad_norm": 0.24106687307357788, + "learning_rate": 3.4862686949835244e-06, + "loss": 1.0563, + "step": 14498 + }, + { + "epoch": 0.880916216052008, + "grad_norm": 0.11067598313093185, + "learning_rate": 3.4827576426558206e-06, + "loss": 1.0273, + "step": 14499 + }, + { + "epoch": 0.8809769730846345, + "grad_norm": 0.23668546974658966, + "learning_rate": 3.479248295426624e-06, + "loss": 1.1028, + "step": 14500 + }, + { + "epoch": 0.881037730117261, + "grad_norm": 0.3210560977458954, + "learning_rate": 3.4757406534245808e-06, + "loss": 1.0954, + "step": 14501 + }, + { + "epoch": 0.8810984871498876, + "grad_norm": 0.39827099442481995, + "learning_rate": 3.4722347167782498e-06, + "loss": 1.0256, + "step": 14502 + }, + { + "epoch": 0.8811592441825141, + "grad_norm": 1.8480387926101685, + "learning_rate": 3.468730485616156e-06, + "loss": 1.1882, + "step": 14503 + }, + { + "epoch": 0.8812200012151407, + "grad_norm": 0.14183840155601501, + "learning_rate": 3.4652279600667357e-06, + "loss": 1.0937, + "step": 14504 + }, + { + "epoch": 0.8812807582477672, + "grad_norm": 0.2680705487728119, + "learning_rate": 3.4617271402583806e-06, + "loss": 1.1024, + "step": 14505 + }, + { + "epoch": 0.8813415152803937, + "grad_norm": 0.12620899081230164, + "learning_rate": 3.4582280263194157e-06, + "loss": 1.0671, + "step": 14506 + }, + { + "epoch": 0.8814022723130203, + "grad_norm": 0.14713971316814423, + "learning_rate": 3.4547306183780825e-06, + "loss": 1.0373, + "step": 14507 + }, + { + "epoch": 0.8814630293456468, + "grad_norm": 0.12060145288705826, + "learning_rate": 3.451234916562618e-06, + "loss": 1.0737, + "step": 14508 + }, + { + "epoch": 0.8815237863782733, + "grad_norm": 0.11234474182128906, + "learning_rate": 3.44774092100113e-06, + "loss": 1.0373, + "step": 14509 + }, + { + "epoch": 0.8815845434108998, + "grad_norm": 0.18484778702259064, + "learning_rate": 3.444248631821689e-06, + "loss": 1.1451, + "step": 14510 + }, + { + "epoch": 0.8816453004435263, + "grad_norm": 0.1472494900226593, + "learning_rate": 3.4407580491523206e-06, + "loss": 1.0421, + "step": 14511 + }, + { + "epoch": 0.8817060574761528, + "grad_norm": 0.10259804129600525, + "learning_rate": 3.437269173120955e-06, + "loss": 0.9976, + "step": 14512 + }, + { + "epoch": 0.8817668145087794, + "grad_norm": 0.14788947999477386, + "learning_rate": 3.4337820038554958e-06, + "loss": 1.0323, + "step": 14513 + }, + { + "epoch": 0.8818275715414059, + "grad_norm": 0.14503687620162964, + "learning_rate": 3.4302965414837575e-06, + "loss": 1.0969, + "step": 14514 + }, + { + "epoch": 0.8818883285740324, + "grad_norm": 0.10066328942775726, + "learning_rate": 3.4268127861335097e-06, + "loss": 1.017, + "step": 14515 + }, + { + "epoch": 0.881949085606659, + "grad_norm": 0.15164169669151306, + "learning_rate": 3.423330737932434e-06, + "loss": 1.1951, + "step": 14516 + }, + { + "epoch": 0.8820098426392855, + "grad_norm": 1.1713110208511353, + "learning_rate": 3.4198503970081675e-06, + "loss": 1.0925, + "step": 14517 + }, + { + "epoch": 0.882070599671912, + "grad_norm": 0.1136820986866951, + "learning_rate": 3.416371763488291e-06, + "loss": 1.0381, + "step": 14518 + }, + { + "epoch": 0.8821313567045386, + "grad_norm": 0.15022777020931244, + "learning_rate": 3.412894837500319e-06, + "loss": 1.0667, + "step": 14519 + }, + { + "epoch": 0.8821921137371651, + "grad_norm": 0.3239344358444214, + "learning_rate": 3.409419619171683e-06, + "loss": 1.073, + "step": 14520 + }, + { + "epoch": 0.8822528707697916, + "grad_norm": 0.11347474157810211, + "learning_rate": 3.4059461086297815e-06, + "loss": 1.0118, + "step": 14521 + }, + { + "epoch": 0.8823136278024182, + "grad_norm": 0.14050550758838654, + "learning_rate": 3.402474306001935e-06, + "loss": 1.0807, + "step": 14522 + }, + { + "epoch": 0.8823743848350447, + "grad_norm": 0.12560464441776276, + "learning_rate": 3.3990042114153965e-06, + "loss": 1.0569, + "step": 14523 + }, + { + "epoch": 0.8824351418676711, + "grad_norm": 0.09681057929992676, + "learning_rate": 3.3955358249973646e-06, + "loss": 1.0486, + "step": 14524 + }, + { + "epoch": 0.8824958989002977, + "grad_norm": 0.48717978596687317, + "learning_rate": 3.3920691468749768e-06, + "loss": 1.0973, + "step": 14525 + }, + { + "epoch": 0.8825566559329242, + "grad_norm": 0.11208033561706543, + "learning_rate": 3.3886041771753086e-06, + "loss": 1.0222, + "step": 14526 + }, + { + "epoch": 0.8826174129655507, + "grad_norm": 0.1361130326986313, + "learning_rate": 3.385140916025359e-06, + "loss": 1.1048, + "step": 14527 + }, + { + "epoch": 0.8826781699981773, + "grad_norm": 0.10553893446922302, + "learning_rate": 3.3816793635520816e-06, + "loss": 1.0176, + "step": 14528 + }, + { + "epoch": 0.8827389270308038, + "grad_norm": 0.1062655821442604, + "learning_rate": 3.378219519882353e-06, + "loss": 1.0626, + "step": 14529 + }, + { + "epoch": 0.8827996840634303, + "grad_norm": 0.522912323474884, + "learning_rate": 3.374761385143005e-06, + "loss": 1.18, + "step": 14530 + }, + { + "epoch": 0.8828604410960569, + "grad_norm": 0.148276224732399, + "learning_rate": 3.3713049594607915e-06, + "loss": 1.0676, + "step": 14531 + }, + { + "epoch": 0.8829211981286834, + "grad_norm": 0.23223872482776642, + "learning_rate": 3.3678502429624057e-06, + "loss": 1.1133, + "step": 14532 + }, + { + "epoch": 0.88298195516131, + "grad_norm": 0.11547403037548065, + "learning_rate": 3.3643972357744746e-06, + "loss": 1.0592, + "step": 14533 + }, + { + "epoch": 0.8830427121939365, + "grad_norm": 0.1985643059015274, + "learning_rate": 3.360945938023585e-06, + "loss": 1.2399, + "step": 14534 + }, + { + "epoch": 0.883103469226563, + "grad_norm": 0.386805921792984, + "learning_rate": 3.357496349836248e-06, + "loss": 1.2557, + "step": 14535 + }, + { + "epoch": 0.8831642262591896, + "grad_norm": 1.045297622680664, + "learning_rate": 3.3540484713388832e-06, + "loss": 1.1679, + "step": 14536 + }, + { + "epoch": 0.883224983291816, + "grad_norm": 0.21725356578826904, + "learning_rate": 3.350602302657896e-06, + "loss": 1.1921, + "step": 14537 + }, + { + "epoch": 0.8832857403244425, + "grad_norm": 0.10408329963684082, + "learning_rate": 3.347157843919585e-06, + "loss": 1.0146, + "step": 14538 + }, + { + "epoch": 0.883346497357069, + "grad_norm": 0.181331604719162, + "learning_rate": 3.3437150952502327e-06, + "loss": 1.1562, + "step": 14539 + }, + { + "epoch": 0.8834072543896956, + "grad_norm": 0.11471977829933167, + "learning_rate": 3.3402740567760214e-06, + "loss": 1.0624, + "step": 14540 + }, + { + "epoch": 0.8834680114223221, + "grad_norm": 0.6379608511924744, + "learning_rate": 3.336834728623084e-06, + "loss": 1.123, + "step": 14541 + }, + { + "epoch": 0.8835287684549487, + "grad_norm": 0.1922662854194641, + "learning_rate": 3.333397110917497e-06, + "loss": 1.1686, + "step": 14542 + }, + { + "epoch": 0.8835895254875752, + "grad_norm": 0.14018617570400238, + "learning_rate": 3.329961203785242e-06, + "loss": 1.066, + "step": 14543 + }, + { + "epoch": 0.8836502825202017, + "grad_norm": 0.14638058841228485, + "learning_rate": 3.3265270073522925e-06, + "loss": 0.9903, + "step": 14544 + }, + { + "epoch": 0.8837110395528283, + "grad_norm": 0.14833728969097137, + "learning_rate": 3.323094521744513e-06, + "loss": 1.1354, + "step": 14545 + }, + { + "epoch": 0.8837717965854548, + "grad_norm": 0.4308561682701111, + "learning_rate": 3.3196637470877256e-06, + "loss": 1.2485, + "step": 14546 + }, + { + "epoch": 0.8838325536180813, + "grad_norm": 2.2455477714538574, + "learning_rate": 3.3162346835076906e-06, + "loss": 1.1876, + "step": 14547 + }, + { + "epoch": 0.8838933106507079, + "grad_norm": 0.23969553411006927, + "learning_rate": 3.312807331130091e-06, + "loss": 1.1379, + "step": 14548 + }, + { + "epoch": 0.8839540676833344, + "grad_norm": 0.7818536758422852, + "learning_rate": 3.309381690080571e-06, + "loss": 1.1782, + "step": 14549 + }, + { + "epoch": 0.8840148247159608, + "grad_norm": 0.14767079055309296, + "learning_rate": 3.305957760484685e-06, + "loss": 1.1263, + "step": 14550 + }, + { + "epoch": 0.8840755817485874, + "grad_norm": 1.5664955377578735, + "learning_rate": 3.302535542467944e-06, + "loss": 1.0775, + "step": 14551 + }, + { + "epoch": 0.8841363387812139, + "grad_norm": 0.23392058908939362, + "learning_rate": 3.2991150361557865e-06, + "loss": 1.0968, + "step": 14552 + }, + { + "epoch": 0.8841970958138404, + "grad_norm": 0.19733375310897827, + "learning_rate": 3.2956962416736015e-06, + "loss": 1.0642, + "step": 14553 + }, + { + "epoch": 0.884257852846467, + "grad_norm": 4.169165134429932, + "learning_rate": 3.292279159146683e-06, + "loss": 1.0619, + "step": 14554 + }, + { + "epoch": 0.8843186098790935, + "grad_norm": 0.1237090602517128, + "learning_rate": 3.288863788700319e-06, + "loss": 1.0435, + "step": 14555 + }, + { + "epoch": 0.88437936691172, + "grad_norm": 0.12797720730304718, + "learning_rate": 3.2854501304596774e-06, + "loss": 1.0792, + "step": 14556 + }, + { + "epoch": 0.8844401239443466, + "grad_norm": 0.164870023727417, + "learning_rate": 3.282038184549885e-06, + "loss": 1.0304, + "step": 14557 + }, + { + "epoch": 0.8845008809769731, + "grad_norm": 0.12417539209127426, + "learning_rate": 3.2786279510960193e-06, + "loss": 1.0287, + "step": 14558 + }, + { + "epoch": 0.8845616380095996, + "grad_norm": 0.11996292322874069, + "learning_rate": 3.2752194302230644e-06, + "loss": 1.0658, + "step": 14559 + }, + { + "epoch": 0.8846223950422262, + "grad_norm": 0.11469022184610367, + "learning_rate": 3.271812622055981e-06, + "loss": 1.0098, + "step": 14560 + }, + { + "epoch": 0.8846831520748527, + "grad_norm": 0.10035540908575058, + "learning_rate": 3.268407526719641e-06, + "loss": 1.0082, + "step": 14561 + }, + { + "epoch": 0.8847439091074792, + "grad_norm": 0.177669957280159, + "learning_rate": 3.265004144338862e-06, + "loss": 1.1048, + "step": 14562 + }, + { + "epoch": 0.8848046661401057, + "grad_norm": 0.16027499735355377, + "learning_rate": 3.2616024750383833e-06, + "loss": 1.0394, + "step": 14563 + }, + { + "epoch": 0.8848654231727322, + "grad_norm": 0.6399933695793152, + "learning_rate": 3.2582025189428934e-06, + "loss": 1.1645, + "step": 14564 + }, + { + "epoch": 0.8849261802053587, + "grad_norm": 0.11340584605932236, + "learning_rate": 3.2548042761770316e-06, + "loss": 1.0378, + "step": 14565 + }, + { + "epoch": 0.8849869372379853, + "grad_norm": 0.24245789647102356, + "learning_rate": 3.2514077468653537e-06, + "loss": 1.1092, + "step": 14566 + }, + { + "epoch": 0.8850476942706118, + "grad_norm": 0.17457427084445953, + "learning_rate": 3.248012931132366e-06, + "loss": 1.1266, + "step": 14567 + }, + { + "epoch": 0.8851084513032383, + "grad_norm": 0.1674877405166626, + "learning_rate": 3.244619829102502e-06, + "loss": 1.0469, + "step": 14568 + }, + { + "epoch": 0.8851692083358649, + "grad_norm": 0.09634416550397873, + "learning_rate": 3.2412284409001235e-06, + "loss": 1.0049, + "step": 14569 + }, + { + "epoch": 0.8852299653684914, + "grad_norm": 0.3651910722255707, + "learning_rate": 3.2378387666495645e-06, + "loss": 1.1461, + "step": 14570 + }, + { + "epoch": 0.885290722401118, + "grad_norm": 0.17827612161636353, + "learning_rate": 3.234450806475059e-06, + "loss": 1.1395, + "step": 14571 + }, + { + "epoch": 0.8853514794337445, + "grad_norm": 0.1108350157737732, + "learning_rate": 3.2310645605008015e-06, + "loss": 1.0429, + "step": 14572 + }, + { + "epoch": 0.885412236466371, + "grad_norm": 0.2824110984802246, + "learning_rate": 3.2276800288509157e-06, + "loss": 1.2115, + "step": 14573 + }, + { + "epoch": 0.8854729934989976, + "grad_norm": 0.17498032748699188, + "learning_rate": 3.2242972116494574e-06, + "loss": 1.0318, + "step": 14574 + }, + { + "epoch": 0.8855337505316241, + "grad_norm": 0.10410551726818085, + "learning_rate": 3.2209161090204277e-06, + "loss": 1.0404, + "step": 14575 + }, + { + "epoch": 0.8855945075642505, + "grad_norm": 0.1333116739988327, + "learning_rate": 3.217536721087755e-06, + "loss": 1.0476, + "step": 14576 + }, + { + "epoch": 0.885655264596877, + "grad_norm": 0.24690653383731842, + "learning_rate": 3.2141590479753236e-06, + "loss": 1.0213, + "step": 14577 + }, + { + "epoch": 0.8857160216295036, + "grad_norm": 0.14949177205562592, + "learning_rate": 3.210783089806929e-06, + "loss": 1.1178, + "step": 14578 + }, + { + "epoch": 0.8857767786621301, + "grad_norm": 0.22718439996242523, + "learning_rate": 3.2074088467063277e-06, + "loss": 1.0418, + "step": 14579 + }, + { + "epoch": 0.8858375356947567, + "grad_norm": 0.16216522455215454, + "learning_rate": 3.2040363187971935e-06, + "loss": 1.1333, + "step": 14580 + }, + { + "epoch": 0.8858982927273832, + "grad_norm": 0.11710798740386963, + "learning_rate": 3.2006655062031597e-06, + "loss": 1.0534, + "step": 14581 + }, + { + "epoch": 0.8859590497600097, + "grad_norm": 0.14996717870235443, + "learning_rate": 3.197296409047784e-06, + "loss": 1.2264, + "step": 14582 + }, + { + "epoch": 0.8860198067926363, + "grad_norm": 0.14383438229560852, + "learning_rate": 3.193929027454551e-06, + "loss": 1.056, + "step": 14583 + }, + { + "epoch": 0.8860805638252628, + "grad_norm": 0.10204288363456726, + "learning_rate": 3.1905633615468944e-06, + "loss": 1.0455, + "step": 14584 + }, + { + "epoch": 0.8861413208578893, + "grad_norm": 0.16288653016090393, + "learning_rate": 3.1871994114481772e-06, + "loss": 1.1167, + "step": 14585 + }, + { + "epoch": 0.8862020778905159, + "grad_norm": 0.1325506567955017, + "learning_rate": 3.1838371772817286e-06, + "loss": 1.0647, + "step": 14586 + }, + { + "epoch": 0.8862628349231424, + "grad_norm": 0.14301139116287231, + "learning_rate": 3.1804766591707724e-06, + "loss": 1.0752, + "step": 14587 + }, + { + "epoch": 0.8863235919557689, + "grad_norm": 0.22541333734989166, + "learning_rate": 3.177117857238493e-06, + "loss": 1.1439, + "step": 14588 + }, + { + "epoch": 0.8863843489883954, + "grad_norm": 0.14628830552101135, + "learning_rate": 3.173760771608009e-06, + "loss": 1.1141, + "step": 14589 + }, + { + "epoch": 0.8864451060210219, + "grad_norm": 0.17259405553340912, + "learning_rate": 3.1704054024023767e-06, + "loss": 1.0724, + "step": 14590 + }, + { + "epoch": 0.8865058630536484, + "grad_norm": 0.1687692552804947, + "learning_rate": 3.167051749744587e-06, + "loss": 1.0829, + "step": 14591 + }, + { + "epoch": 0.886566620086275, + "grad_norm": 0.1520577371120453, + "learning_rate": 3.16369981375757e-06, + "loss": 1.0366, + "step": 14592 + }, + { + "epoch": 0.8866273771189015, + "grad_norm": 0.17253956198692322, + "learning_rate": 3.160349594564188e-06, + "loss": 1.0822, + "step": 14593 + }, + { + "epoch": 0.886688134151528, + "grad_norm": 8.534601211547852, + "learning_rate": 3.157001092287243e-06, + "loss": 1.1007, + "step": 14594 + }, + { + "epoch": 0.8867488911841546, + "grad_norm": 0.13734091818332672, + "learning_rate": 3.1536543070494806e-06, + "loss": 1.0672, + "step": 14595 + }, + { + "epoch": 0.8868096482167811, + "grad_norm": 0.2312757670879364, + "learning_rate": 3.1503092389735754e-06, + "loss": 1.051, + "step": 14596 + }, + { + "epoch": 0.8868704052494076, + "grad_norm": 1.7190401554107666, + "learning_rate": 3.146965888182135e-06, + "loss": 1.1829, + "step": 14597 + }, + { + "epoch": 0.8869311622820342, + "grad_norm": 0.31289583444595337, + "learning_rate": 3.1436242547977225e-06, + "loss": 1.2083, + "step": 14598 + }, + { + "epoch": 0.8869919193146607, + "grad_norm": 0.1963265836238861, + "learning_rate": 3.140284338942817e-06, + "loss": 1.1283, + "step": 14599 + }, + { + "epoch": 0.8870526763472872, + "grad_norm": 0.17142769694328308, + "learning_rate": 3.1369461407398495e-06, + "loss": 1.0933, + "step": 14600 + }, + { + "epoch": 0.8871134333799138, + "grad_norm": 0.10682990401983261, + "learning_rate": 3.133609660311171e-06, + "loss": 0.9715, + "step": 14601 + }, + { + "epoch": 0.8871741904125402, + "grad_norm": 0.10956233739852905, + "learning_rate": 3.1302748977791007e-06, + "loss": 1.0109, + "step": 14602 + }, + { + "epoch": 0.8872349474451667, + "grad_norm": 0.1335013508796692, + "learning_rate": 3.1269418532658635e-06, + "loss": 1.1286, + "step": 14603 + }, + { + "epoch": 0.8872957044777933, + "grad_norm": 0.1755015254020691, + "learning_rate": 3.1236105268936277e-06, + "loss": 1.1408, + "step": 14604 + }, + { + "epoch": 0.8873564615104198, + "grad_norm": 0.1585562825202942, + "learning_rate": 3.1202809187845016e-06, + "loss": 1.0053, + "step": 14605 + }, + { + "epoch": 0.8874172185430463, + "grad_norm": 0.22049635648727417, + "learning_rate": 3.1169530290605486e-06, + "loss": 1.2608, + "step": 14606 + }, + { + "epoch": 0.8874779755756729, + "grad_norm": 0.11492518335580826, + "learning_rate": 3.113626857843743e-06, + "loss": 1.023, + "step": 14607 + }, + { + "epoch": 0.8875387326082994, + "grad_norm": 0.19224266707897186, + "learning_rate": 3.1103024052560105e-06, + "loss": 1.1204, + "step": 14608 + }, + { + "epoch": 0.8875994896409259, + "grad_norm": 0.32298731803894043, + "learning_rate": 3.1069796714192132e-06, + "loss": 1.0594, + "step": 14609 + }, + { + "epoch": 0.8876602466735525, + "grad_norm": 0.11298403888940811, + "learning_rate": 3.1036586564551273e-06, + "loss": 0.9929, + "step": 14610 + }, + { + "epoch": 0.887721003706179, + "grad_norm": 0.1807352751493454, + "learning_rate": 3.1003393604855047e-06, + "loss": 1.0969, + "step": 14611 + }, + { + "epoch": 0.8877817607388055, + "grad_norm": 0.14948639273643494, + "learning_rate": 3.097021783632009e-06, + "loss": 1.029, + "step": 14612 + }, + { + "epoch": 0.8878425177714321, + "grad_norm": 0.14266057312488556, + "learning_rate": 3.0937059260162438e-06, + "loss": 1.0347, + "step": 14613 + }, + { + "epoch": 0.8879032748040586, + "grad_norm": 0.13356633484363556, + "learning_rate": 3.0903917877597556e-06, + "loss": 1.0597, + "step": 14614 + }, + { + "epoch": 0.887964031836685, + "grad_norm": 0.23884007334709167, + "learning_rate": 3.087079368984025e-06, + "loss": 1.118, + "step": 14615 + }, + { + "epoch": 0.8880247888693116, + "grad_norm": 0.2663938105106354, + "learning_rate": 3.0837686698104663e-06, + "loss": 1.1783, + "step": 14616 + }, + { + "epoch": 0.8880855459019381, + "grad_norm": 0.2507220506668091, + "learning_rate": 3.0804596903604376e-06, + "loss": 1.0454, + "step": 14617 + }, + { + "epoch": 0.8881463029345646, + "grad_norm": 0.3995007276535034, + "learning_rate": 3.0771524307552256e-06, + "loss": 1.0603, + "step": 14618 + }, + { + "epoch": 0.8882070599671912, + "grad_norm": 0.10384316742420197, + "learning_rate": 3.0738468911160614e-06, + "loss": 1.0473, + "step": 14619 + }, + { + "epoch": 0.8882678169998177, + "grad_norm": 0.11711938679218292, + "learning_rate": 3.0705430715641135e-06, + "loss": 1.0116, + "step": 14620 + }, + { + "epoch": 0.8883285740324443, + "grad_norm": 0.12873300909996033, + "learning_rate": 3.06724097222047e-06, + "loss": 1.0806, + "step": 14621 + }, + { + "epoch": 0.8883893310650708, + "grad_norm": 0.129280686378479, + "learning_rate": 3.0639405932061994e-06, + "loss": 1.0045, + "step": 14622 + }, + { + "epoch": 0.8884500880976973, + "grad_norm": 0.12430091947317123, + "learning_rate": 3.0606419346422444e-06, + "loss": 1.0395, + "step": 14623 + }, + { + "epoch": 0.8885108451303239, + "grad_norm": 0.19456730782985687, + "learning_rate": 3.057344996649536e-06, + "loss": 1.1176, + "step": 14624 + }, + { + "epoch": 0.8885716021629504, + "grad_norm": 0.1140284612774849, + "learning_rate": 3.054049779348922e-06, + "loss": 1.03, + "step": 14625 + }, + { + "epoch": 0.8886323591955769, + "grad_norm": 0.1789393126964569, + "learning_rate": 3.050756282861178e-06, + "loss": 1.0705, + "step": 14626 + }, + { + "epoch": 0.8886931162282035, + "grad_norm": 0.25484660267829895, + "learning_rate": 3.0474645073070517e-06, + "loss": 1.1812, + "step": 14627 + }, + { + "epoch": 0.88875387326083, + "grad_norm": 0.139180988073349, + "learning_rate": 3.04417445280718e-06, + "loss": 1.0934, + "step": 14628 + }, + { + "epoch": 0.8888146302934564, + "grad_norm": 0.15103241801261902, + "learning_rate": 3.040886119482184e-06, + "loss": 0.9761, + "step": 14629 + }, + { + "epoch": 0.888875387326083, + "grad_norm": 0.17447814345359802, + "learning_rate": 3.0375995074525764e-06, + "loss": 1.0903, + "step": 14630 + }, + { + "epoch": 0.8889361443587095, + "grad_norm": 0.12317290902137756, + "learning_rate": 3.0343146168388235e-06, + "loss": 1.064, + "step": 14631 + }, + { + "epoch": 0.888996901391336, + "grad_norm": 0.13857927918434143, + "learning_rate": 3.031031447761362e-06, + "loss": 1.0435, + "step": 14632 + }, + { + "epoch": 0.8890576584239626, + "grad_norm": 0.19820040464401245, + "learning_rate": 3.0277500003405178e-06, + "loss": 1.1088, + "step": 14633 + }, + { + "epoch": 0.8891184154565891, + "grad_norm": 0.1622518002986908, + "learning_rate": 3.0244702746965723e-06, + "loss": 1.0283, + "step": 14634 + }, + { + "epoch": 0.8891791724892156, + "grad_norm": 0.16865570843219757, + "learning_rate": 3.021192270949763e-06, + "loss": 0.992, + "step": 14635 + }, + { + "epoch": 0.8892399295218422, + "grad_norm": 0.16449861228466034, + "learning_rate": 3.0179159892202157e-06, + "loss": 1.0331, + "step": 14636 + }, + { + "epoch": 0.8893006865544687, + "grad_norm": 0.20679821074008942, + "learning_rate": 3.0146414296280454e-06, + "loss": 1.1515, + "step": 14637 + }, + { + "epoch": 0.8893614435870952, + "grad_norm": 0.10066144168376923, + "learning_rate": 3.0113685922932734e-06, + "loss": 1.03, + "step": 14638 + }, + { + "epoch": 0.8894222006197218, + "grad_norm": 0.12101301550865173, + "learning_rate": 3.008097477335875e-06, + "loss": 1.0367, + "step": 14639 + }, + { + "epoch": 0.8894829576523483, + "grad_norm": 0.10414587706327438, + "learning_rate": 3.004828084875744e-06, + "loss": 1.0178, + "step": 14640 + }, + { + "epoch": 0.8895437146849748, + "grad_norm": 0.24158069491386414, + "learning_rate": 3.001560415032717e-06, + "loss": 1.1149, + "step": 14641 + }, + { + "epoch": 0.8896044717176013, + "grad_norm": 1.1494296789169312, + "learning_rate": 2.9982944679265877e-06, + "loss": 1.1292, + "step": 14642 + }, + { + "epoch": 0.8896652287502278, + "grad_norm": 0.2280639261007309, + "learning_rate": 2.995030243677055e-06, + "loss": 1.2318, + "step": 14643 + }, + { + "epoch": 0.8897259857828543, + "grad_norm": 0.167388454079628, + "learning_rate": 2.9917677424037725e-06, + "loss": 1.0778, + "step": 14644 + }, + { + "epoch": 0.8897867428154809, + "grad_norm": 0.11865873634815216, + "learning_rate": 2.988506964226334e-06, + "loss": 1.0208, + "step": 14645 + }, + { + "epoch": 0.8898474998481074, + "grad_norm": 0.1320948302745819, + "learning_rate": 2.985247909264255e-06, + "loss": 1.0266, + "step": 14646 + }, + { + "epoch": 0.8899082568807339, + "grad_norm": 0.2526165246963501, + "learning_rate": 2.9819905776370006e-06, + "loss": 1.1069, + "step": 14647 + }, + { + "epoch": 0.8899690139133605, + "grad_norm": 0.10250655561685562, + "learning_rate": 2.97873496946397e-06, + "loss": 0.9924, + "step": 14648 + }, + { + "epoch": 0.890029770945987, + "grad_norm": 0.1981375366449356, + "learning_rate": 2.975481084864512e-06, + "loss": 1.0848, + "step": 14649 + }, + { + "epoch": 0.8900905279786135, + "grad_norm": 0.1285632848739624, + "learning_rate": 2.972228923957876e-06, + "loss": 0.9948, + "step": 14650 + }, + { + "epoch": 0.8901512850112401, + "grad_norm": 0.2181164026260376, + "learning_rate": 2.9689784868632773e-06, + "loss": 1.0004, + "step": 14651 + }, + { + "epoch": 0.8902120420438666, + "grad_norm": 0.13612231612205505, + "learning_rate": 2.9657297736998548e-06, + "loss": 1.0731, + "step": 14652 + }, + { + "epoch": 0.8902727990764931, + "grad_norm": 0.1471223533153534, + "learning_rate": 2.9624827845867065e-06, + "loss": 1.1116, + "step": 14653 + }, + { + "epoch": 0.8903335561091197, + "grad_norm": 0.2809170186519623, + "learning_rate": 2.9592375196428436e-06, + "loss": 1.1078, + "step": 14654 + }, + { + "epoch": 0.8903943131417461, + "grad_norm": 0.12725882232189178, + "learning_rate": 2.955993978987226e-06, + "loss": 1.0368, + "step": 14655 + }, + { + "epoch": 0.8904550701743726, + "grad_norm": 0.21599172055721283, + "learning_rate": 2.9527521627387476e-06, + "loss": 1.1954, + "step": 14656 + }, + { + "epoch": 0.8905158272069992, + "grad_norm": 0.18019451200962067, + "learning_rate": 2.949512071016214e-06, + "loss": 1.1317, + "step": 14657 + }, + { + "epoch": 0.8905765842396257, + "grad_norm": 0.2100551873445511, + "learning_rate": 2.9462737039384236e-06, + "loss": 1.0517, + "step": 14658 + }, + { + "epoch": 0.8906373412722522, + "grad_norm": 0.14522022008895874, + "learning_rate": 2.9430370616240656e-06, + "loss": 1.0404, + "step": 14659 + }, + { + "epoch": 0.8906980983048788, + "grad_norm": 0.24829906225204468, + "learning_rate": 2.9398021441917724e-06, + "loss": 1.1041, + "step": 14660 + }, + { + "epoch": 0.8907588553375053, + "grad_norm": 0.1998528391122818, + "learning_rate": 2.936568951760138e-06, + "loss": 1.0044, + "step": 14661 + }, + { + "epoch": 0.8908196123701319, + "grad_norm": 0.17631661891937256, + "learning_rate": 2.933337484447657e-06, + "loss": 1.1724, + "step": 14662 + }, + { + "epoch": 0.8908803694027584, + "grad_norm": 0.09878667443990707, + "learning_rate": 2.9301077423727953e-06, + "loss": 1.0421, + "step": 14663 + }, + { + "epoch": 0.8909411264353849, + "grad_norm": 0.13521592319011688, + "learning_rate": 2.9268797256539305e-06, + "loss": 1.0611, + "step": 14664 + }, + { + "epoch": 0.8910018834680115, + "grad_norm": 0.14376071095466614, + "learning_rate": 2.9236534344093904e-06, + "loss": 1.0151, + "step": 14665 + }, + { + "epoch": 0.891062640500638, + "grad_norm": 0.10521792620420456, + "learning_rate": 2.92042886875743e-06, + "loss": 1.0532, + "step": 14666 + }, + { + "epoch": 0.8911233975332645, + "grad_norm": 0.20890606939792633, + "learning_rate": 2.9172060288162493e-06, + "loss": 1.1579, + "step": 14667 + }, + { + "epoch": 0.891184154565891, + "grad_norm": 0.2172417789697647, + "learning_rate": 2.913984914703977e-06, + "loss": 1.1392, + "step": 14668 + }, + { + "epoch": 0.8912449115985175, + "grad_norm": 0.468802273273468, + "learning_rate": 2.9107655265387e-06, + "loss": 1.1962, + "step": 14669 + }, + { + "epoch": 0.891305668631144, + "grad_norm": 0.15653954446315765, + "learning_rate": 2.9075478644384146e-06, + "loss": 1.1079, + "step": 14670 + }, + { + "epoch": 0.8913664256637706, + "grad_norm": 0.2025233507156372, + "learning_rate": 2.904331928521059e-06, + "loss": 1.0116, + "step": 14671 + }, + { + "epoch": 0.8914271826963971, + "grad_norm": 0.14733392000198364, + "learning_rate": 2.9011177189045225e-06, + "loss": 1.0793, + "step": 14672 + }, + { + "epoch": 0.8914879397290236, + "grad_norm": 0.11526741832494736, + "learning_rate": 2.8979052357066104e-06, + "loss": 1.0578, + "step": 14673 + }, + { + "epoch": 0.8915486967616502, + "grad_norm": 0.1686958372592926, + "learning_rate": 2.894694479045096e-06, + "loss": 1.0547, + "step": 14674 + }, + { + "epoch": 0.8916094537942767, + "grad_norm": 0.10652989149093628, + "learning_rate": 2.891485449037662e-06, + "loss": 1.0395, + "step": 14675 + }, + { + "epoch": 0.8916702108269032, + "grad_norm": 0.1593276858329773, + "learning_rate": 2.8882781458019426e-06, + "loss": 1.0542, + "step": 14676 + }, + { + "epoch": 0.8917309678595298, + "grad_norm": 0.10352182388305664, + "learning_rate": 2.8850725694554827e-06, + "loss": 1.0187, + "step": 14677 + }, + { + "epoch": 0.8917917248921563, + "grad_norm": 0.13450925052165985, + "learning_rate": 2.881868720115788e-06, + "loss": 1.1116, + "step": 14678 + }, + { + "epoch": 0.8918524819247828, + "grad_norm": 0.5086029767990112, + "learning_rate": 2.8786665979003146e-06, + "loss": 1.0086, + "step": 14679 + }, + { + "epoch": 0.8919132389574094, + "grad_norm": 0.109037846326828, + "learning_rate": 2.8754662029264248e-06, + "loss": 1.0607, + "step": 14680 + }, + { + "epoch": 0.8919739959900358, + "grad_norm": 0.2711449861526489, + "learning_rate": 2.8722675353114236e-06, + "loss": 1.1371, + "step": 14681 + }, + { + "epoch": 0.8920347530226623, + "grad_norm": 0.08838333189487457, + "learning_rate": 2.869070595172574e-06, + "loss": 0.9772, + "step": 14682 + }, + { + "epoch": 0.8920955100552889, + "grad_norm": 0.13673768937587738, + "learning_rate": 2.8658753826270477e-06, + "loss": 1.0905, + "step": 14683 + }, + { + "epoch": 0.8921562670879154, + "grad_norm": 0.15699419379234314, + "learning_rate": 2.862681897791969e-06, + "loss": 1.1289, + "step": 14684 + }, + { + "epoch": 0.8922170241205419, + "grad_norm": 0.6075234413146973, + "learning_rate": 2.859490140784399e-06, + "loss": 1.0319, + "step": 14685 + }, + { + "epoch": 0.8922777811531685, + "grad_norm": 0.12360430508852005, + "learning_rate": 2.856300111721327e-06, + "loss": 1.1443, + "step": 14686 + }, + { + "epoch": 0.892338538185795, + "grad_norm": 0.29364779591560364, + "learning_rate": 2.853111810719694e-06, + "loss": 1.0884, + "step": 14687 + }, + { + "epoch": 0.8923992952184215, + "grad_norm": 0.3638782799243927, + "learning_rate": 2.849925237896356e-06, + "loss": 1.0868, + "step": 14688 + }, + { + "epoch": 0.8924600522510481, + "grad_norm": 0.14094452559947968, + "learning_rate": 2.8467403933681193e-06, + "loss": 1.0028, + "step": 14689 + }, + { + "epoch": 0.8925208092836746, + "grad_norm": 0.09357637912034988, + "learning_rate": 2.843557277251735e-06, + "loss": 1.0289, + "step": 14690 + }, + { + "epoch": 0.8925815663163011, + "grad_norm": 0.1741989254951477, + "learning_rate": 2.840375889663871e-06, + "loss": 1.0886, + "step": 14691 + }, + { + "epoch": 0.8926423233489277, + "grad_norm": 0.18354468047618866, + "learning_rate": 2.8371962307211395e-06, + "loss": 1.0822, + "step": 14692 + }, + { + "epoch": 0.8927030803815542, + "grad_norm": 0.14537058770656586, + "learning_rate": 2.834018300540103e-06, + "loss": 1.0258, + "step": 14693 + }, + { + "epoch": 0.8927638374141806, + "grad_norm": 0.11739664524793625, + "learning_rate": 2.8308420992372298e-06, + "loss": 1.0519, + "step": 14694 + }, + { + "epoch": 0.8928245944468072, + "grad_norm": 0.17187419533729553, + "learning_rate": 2.8276676269289704e-06, + "loss": 1.0675, + "step": 14695 + }, + { + "epoch": 0.8928853514794337, + "grad_norm": 0.17267253994941711, + "learning_rate": 2.8244948837316764e-06, + "loss": 1.0788, + "step": 14696 + }, + { + "epoch": 0.8929461085120602, + "grad_norm": 0.16254602372646332, + "learning_rate": 2.821323869761633e-06, + "loss": 1.0853, + "step": 14697 + }, + { + "epoch": 0.8930068655446868, + "grad_norm": 0.1279696673154831, + "learning_rate": 2.81815458513508e-06, + "loss": 1.0458, + "step": 14698 + }, + { + "epoch": 0.8930676225773133, + "grad_norm": 0.16091276705265045, + "learning_rate": 2.8149870299681913e-06, + "loss": 1.1893, + "step": 14699 + }, + { + "epoch": 0.8931283796099398, + "grad_norm": 0.1802794635295868, + "learning_rate": 2.8118212043770798e-06, + "loss": 1.067, + "step": 14700 + }, + { + "epoch": 0.8931891366425664, + "grad_norm": 0.1672443151473999, + "learning_rate": 2.80865710847778e-06, + "loss": 1.105, + "step": 14701 + }, + { + "epoch": 0.8932498936751929, + "grad_norm": 0.48433154821395874, + "learning_rate": 2.8054947423862833e-06, + "loss": 1.0651, + "step": 14702 + }, + { + "epoch": 0.8933106507078195, + "grad_norm": 0.1625070869922638, + "learning_rate": 2.802334106218485e-06, + "loss": 1.09, + "step": 14703 + }, + { + "epoch": 0.893371407740446, + "grad_norm": 0.16016674041748047, + "learning_rate": 2.7991752000902595e-06, + "loss": 1.0444, + "step": 14704 + }, + { + "epoch": 0.8934321647730725, + "grad_norm": 0.16798751056194305, + "learning_rate": 2.7960180241173917e-06, + "loss": 1.0838, + "step": 14705 + }, + { + "epoch": 0.8934929218056991, + "grad_norm": 0.11957212537527084, + "learning_rate": 2.7928625784156115e-06, + "loss": 1.0446, + "step": 14706 + }, + { + "epoch": 0.8935536788383255, + "grad_norm": 0.1715918779373169, + "learning_rate": 2.789708863100571e-06, + "loss": 1.0617, + "step": 14707 + }, + { + "epoch": 0.893614435870952, + "grad_norm": 0.1891513615846634, + "learning_rate": 2.7865568782878826e-06, + "loss": 1.0494, + "step": 14708 + }, + { + "epoch": 0.8936751929035786, + "grad_norm": 0.14633367955684662, + "learning_rate": 2.783406624093077e-06, + "loss": 1.0881, + "step": 14709 + }, + { + "epoch": 0.8937359499362051, + "grad_norm": 0.319070428609848, + "learning_rate": 2.7802581006316273e-06, + "loss": 1.0443, + "step": 14710 + }, + { + "epoch": 0.8937967069688316, + "grad_norm": 0.1441330760717392, + "learning_rate": 2.777111308018948e-06, + "loss": 1.0995, + "step": 14711 + }, + { + "epoch": 0.8938574640014582, + "grad_norm": 0.20820242166519165, + "learning_rate": 2.7739662463703743e-06, + "loss": 1.1511, + "step": 14712 + }, + { + "epoch": 0.8939182210340847, + "grad_norm": 0.1259198635816574, + "learning_rate": 2.770822915801202e-06, + "loss": 1.0475, + "step": 14713 + }, + { + "epoch": 0.8939789780667112, + "grad_norm": 0.39245837926864624, + "learning_rate": 2.76768131642664e-06, + "loss": 1.0277, + "step": 14714 + }, + { + "epoch": 0.8940397350993378, + "grad_norm": 0.13661637902259827, + "learning_rate": 2.7645414483618516e-06, + "loss": 1.0478, + "step": 14715 + }, + { + "epoch": 0.8941004921319643, + "grad_norm": 0.16632981598377228, + "learning_rate": 2.7614033117219327e-06, + "loss": 1.0753, + "step": 14716 + }, + { + "epoch": 0.8941612491645908, + "grad_norm": 0.11878080666065216, + "learning_rate": 2.7582669066219034e-06, + "loss": 1.01, + "step": 14717 + }, + { + "epoch": 0.8942220061972174, + "grad_norm": 0.1428791731595993, + "learning_rate": 2.7551322331767326e-06, + "loss": 1.0535, + "step": 14718 + }, + { + "epoch": 0.8942827632298439, + "grad_norm": 0.2595476806163788, + "learning_rate": 2.7519992915013115e-06, + "loss": 0.9776, + "step": 14719 + }, + { + "epoch": 0.8943435202624703, + "grad_norm": 0.4360501170158386, + "learning_rate": 2.7488680817104982e-06, + "loss": 1.0733, + "step": 14720 + }, + { + "epoch": 0.8944042772950969, + "grad_norm": 0.20499996840953827, + "learning_rate": 2.7457386039190568e-06, + "loss": 1.0537, + "step": 14721 + }, + { + "epoch": 0.8944650343277234, + "grad_norm": 0.15135543048381805, + "learning_rate": 2.742610858241712e-06, + "loss": 1.019, + "step": 14722 + }, + { + "epoch": 0.8945257913603499, + "grad_norm": 0.13484498858451843, + "learning_rate": 2.7394848447930887e-06, + "loss": 1.211, + "step": 14723 + }, + { + "epoch": 0.8945865483929765, + "grad_norm": 0.19778922200202942, + "learning_rate": 2.736360563687779e-06, + "loss": 1.1677, + "step": 14724 + }, + { + "epoch": 0.894647305425603, + "grad_norm": 0.17957265675067902, + "learning_rate": 2.7332380150403182e-06, + "loss": 1.1081, + "step": 14725 + }, + { + "epoch": 0.8947080624582295, + "grad_norm": 8.465873718261719, + "learning_rate": 2.730117198965154e-06, + "loss": 1.0996, + "step": 14726 + }, + { + "epoch": 0.8947688194908561, + "grad_norm": 0.6284109950065613, + "learning_rate": 2.726998115576679e-06, + "loss": 1.2421, + "step": 14727 + }, + { + "epoch": 0.8948295765234826, + "grad_norm": 0.15040907263755798, + "learning_rate": 2.7238807649892226e-06, + "loss": 1.0695, + "step": 14728 + }, + { + "epoch": 0.8948903335561091, + "grad_norm": 0.16046516597270966, + "learning_rate": 2.7207651473170615e-06, + "loss": 1.077, + "step": 14729 + }, + { + "epoch": 0.8949510905887357, + "grad_norm": 0.41832560300827026, + "learning_rate": 2.7176512626743867e-06, + "loss": 1.0307, + "step": 14730 + }, + { + "epoch": 0.8950118476213622, + "grad_norm": 0.1138959676027298, + "learning_rate": 2.714539111175346e-06, + "loss": 1.036, + "step": 14731 + }, + { + "epoch": 0.8950726046539887, + "grad_norm": 0.16120284795761108, + "learning_rate": 2.7114286929340206e-06, + "loss": 1.1776, + "step": 14732 + }, + { + "epoch": 0.8951333616866153, + "grad_norm": 0.2626010775566101, + "learning_rate": 2.7083200080644077e-06, + "loss": 1.111, + "step": 14733 + }, + { + "epoch": 0.8951941187192417, + "grad_norm": 0.1747281551361084, + "learning_rate": 2.7052130566804723e-06, + "loss": 1.0399, + "step": 14734 + }, + { + "epoch": 0.8952548757518682, + "grad_norm": 0.12358351051807404, + "learning_rate": 2.702107838896084e-06, + "loss": 1.0774, + "step": 14735 + }, + { + "epoch": 0.8953156327844948, + "grad_norm": 0.1457117795944214, + "learning_rate": 2.699004354825091e-06, + "loss": 1.0574, + "step": 14736 + }, + { + "epoch": 0.8953763898171213, + "grad_norm": 0.1838354617357254, + "learning_rate": 2.69590260458123e-06, + "loss": 1.1001, + "step": 14737 + }, + { + "epoch": 0.8954371468497478, + "grad_norm": 0.16018275916576385, + "learning_rate": 2.6928025882782037e-06, + "loss": 1.0407, + "step": 14738 + }, + { + "epoch": 0.8954979038823744, + "grad_norm": 0.7069351673126221, + "learning_rate": 2.689704306029639e-06, + "loss": 1.2702, + "step": 14739 + }, + { + "epoch": 0.8955586609150009, + "grad_norm": 0.12320427596569061, + "learning_rate": 2.6866077579491054e-06, + "loss": 1.0563, + "step": 14740 + }, + { + "epoch": 0.8956194179476274, + "grad_norm": 0.1000717282295227, + "learning_rate": 2.6835129441501126e-06, + "loss": 1.0168, + "step": 14741 + }, + { + "epoch": 0.895680174980254, + "grad_norm": 0.6606243252754211, + "learning_rate": 2.6804198647461033e-06, + "loss": 1.0581, + "step": 14742 + }, + { + "epoch": 0.8957409320128805, + "grad_norm": 0.1815868616104126, + "learning_rate": 2.677328519850453e-06, + "loss": 1.1081, + "step": 14743 + }, + { + "epoch": 0.895801689045507, + "grad_norm": 0.09678613394498825, + "learning_rate": 2.674238909576471e-06, + "loss": 1.0217, + "step": 14744 + }, + { + "epoch": 0.8958624460781336, + "grad_norm": 0.12373851239681244, + "learning_rate": 2.671151034037395e-06, + "loss": 1.0852, + "step": 14745 + }, + { + "epoch": 0.8959232031107601, + "grad_norm": 0.3140968680381775, + "learning_rate": 2.668064893346439e-06, + "loss": 1.188, + "step": 14746 + }, + { + "epoch": 0.8959839601433865, + "grad_norm": 0.17102232575416565, + "learning_rate": 2.664980487616714e-06, + "loss": 1.1088, + "step": 14747 + }, + { + "epoch": 0.8960447171760131, + "grad_norm": 0.14022096991539001, + "learning_rate": 2.6618978169612786e-06, + "loss": 1.0149, + "step": 14748 + }, + { + "epoch": 0.8961054742086396, + "grad_norm": 0.10651004314422607, + "learning_rate": 2.658816881493131e-06, + "loss": 1.0699, + "step": 14749 + }, + { + "epoch": 0.8961662312412662, + "grad_norm": 0.10498160868883133, + "learning_rate": 2.6557376813251934e-06, + "loss": 1.0357, + "step": 14750 + }, + { + "epoch": 0.8962269882738927, + "grad_norm": 0.11480096727609634, + "learning_rate": 2.6526602165703464e-06, + "loss": 1.0319, + "step": 14751 + }, + { + "epoch": 0.8962877453065192, + "grad_norm": 0.12573085725307465, + "learning_rate": 2.6495844873413943e-06, + "loss": 1.0453, + "step": 14752 + }, + { + "epoch": 0.8963485023391458, + "grad_norm": 0.17667414247989655, + "learning_rate": 2.64651049375107e-06, + "loss": 1.1554, + "step": 14753 + }, + { + "epoch": 0.8964092593717723, + "grad_norm": 0.11018501967191696, + "learning_rate": 2.6434382359120604e-06, + "loss": 1.0749, + "step": 14754 + }, + { + "epoch": 0.8964700164043988, + "grad_norm": 0.1279340535402298, + "learning_rate": 2.6403677139369754e-06, + "loss": 1.0774, + "step": 14755 + }, + { + "epoch": 0.8965307734370254, + "grad_norm": 0.149647518992424, + "learning_rate": 2.63729892793837e-06, + "loss": 1.0736, + "step": 14756 + }, + { + "epoch": 0.8965915304696519, + "grad_norm": 0.16375941038131714, + "learning_rate": 2.6342318780287203e-06, + "loss": 1.1356, + "step": 14757 + }, + { + "epoch": 0.8966522875022784, + "grad_norm": 0.136698916554451, + "learning_rate": 2.6311665643204586e-06, + "loss": 1.0437, + "step": 14758 + }, + { + "epoch": 0.896713044534905, + "grad_norm": 0.18355245888233185, + "learning_rate": 2.6281029869259454e-06, + "loss": 1.0928, + "step": 14759 + }, + { + "epoch": 0.8967738015675314, + "grad_norm": 0.0999312549829483, + "learning_rate": 2.625041145957474e-06, + "loss": 1.0195, + "step": 14760 + }, + { + "epoch": 0.8968345586001579, + "grad_norm": 0.15324987471103668, + "learning_rate": 2.6219810415272714e-06, + "loss": 1.0849, + "step": 14761 + }, + { + "epoch": 0.8968953156327845, + "grad_norm": 0.10880114883184433, + "learning_rate": 2.6189226737475146e-06, + "loss": 1.0414, + "step": 14762 + }, + { + "epoch": 0.896956072665411, + "grad_norm": 0.11195948719978333, + "learning_rate": 2.615866042730314e-06, + "loss": 1.0144, + "step": 14763 + }, + { + "epoch": 0.8970168296980375, + "grad_norm": 0.635172963142395, + "learning_rate": 2.612811148587696e-06, + "loss": 1.065, + "step": 14764 + }, + { + "epoch": 0.8970775867306641, + "grad_norm": 0.09788442403078079, + "learning_rate": 2.609757991431644e-06, + "loss": 0.9867, + "step": 14765 + }, + { + "epoch": 0.8971383437632906, + "grad_norm": 0.1582690328359604, + "learning_rate": 2.6067065713740677e-06, + "loss": 1.0674, + "step": 14766 + }, + { + "epoch": 0.8971991007959171, + "grad_norm": 0.25265640020370483, + "learning_rate": 2.6036568885268287e-06, + "loss": 1.1109, + "step": 14767 + }, + { + "epoch": 0.8972598578285437, + "grad_norm": 0.15409798920154572, + "learning_rate": 2.600608943001709e-06, + "loss": 1.1001, + "step": 14768 + }, + { + "epoch": 0.8973206148611702, + "grad_norm": 0.11112014204263687, + "learning_rate": 2.597562734910436e-06, + "loss": 1.0156, + "step": 14769 + }, + { + "epoch": 0.8973813718937967, + "grad_norm": 0.23407863080501556, + "learning_rate": 2.594518264364659e-06, + "loss": 1.0893, + "step": 14770 + }, + { + "epoch": 0.8974421289264233, + "grad_norm": 0.27085840702056885, + "learning_rate": 2.591475531475973e-06, + "loss": 1.2096, + "step": 14771 + }, + { + "epoch": 0.8975028859590498, + "grad_norm": 0.4738995134830475, + "learning_rate": 2.588434536355916e-06, + "loss": 1.1933, + "step": 14772 + }, + { + "epoch": 0.8975636429916762, + "grad_norm": 0.18575914204120636, + "learning_rate": 2.58539527911596e-06, + "loss": 1.1266, + "step": 14773 + }, + { + "epoch": 0.8976244000243028, + "grad_norm": 0.29799142479896545, + "learning_rate": 2.5823577598675043e-06, + "loss": 1.2722, + "step": 14774 + }, + { + "epoch": 0.8976851570569293, + "grad_norm": 0.15743301808834076, + "learning_rate": 2.579321978721888e-06, + "loss": 1.1032, + "step": 14775 + }, + { + "epoch": 0.8977459140895558, + "grad_norm": 0.17843139171600342, + "learning_rate": 2.5762879357903946e-06, + "loss": 1.0774, + "step": 14776 + }, + { + "epoch": 0.8978066711221824, + "grad_norm": 0.10879513621330261, + "learning_rate": 2.573255631184235e-06, + "loss": 1.0216, + "step": 14777 + }, + { + "epoch": 0.8978674281548089, + "grad_norm": 0.09962224215269089, + "learning_rate": 2.570225065014553e-06, + "loss": 1.0091, + "step": 14778 + }, + { + "epoch": 0.8979281851874354, + "grad_norm": 0.3224875032901764, + "learning_rate": 2.5671962373924384e-06, + "loss": 1.1435, + "step": 14779 + }, + { + "epoch": 0.897988942220062, + "grad_norm": 0.2196751981973648, + "learning_rate": 2.5641691484289187e-06, + "loss": 1.0831, + "step": 14780 + }, + { + "epoch": 0.8980496992526885, + "grad_norm": 0.1339867115020752, + "learning_rate": 2.5611437982349493e-06, + "loss": 1.0189, + "step": 14781 + }, + { + "epoch": 0.898110456285315, + "grad_norm": 0.32741713523864746, + "learning_rate": 2.5581201869214087e-06, + "loss": 1.0599, + "step": 14782 + }, + { + "epoch": 0.8981712133179416, + "grad_norm": 0.11647716909646988, + "learning_rate": 2.5550983145991635e-06, + "loss": 1.0442, + "step": 14783 + }, + { + "epoch": 0.8982319703505681, + "grad_norm": 0.19201484322547913, + "learning_rate": 2.552078181378953e-06, + "loss": 1.0715, + "step": 14784 + }, + { + "epoch": 0.8982927273831947, + "grad_norm": 0.1456119567155838, + "learning_rate": 2.549059787371488e-06, + "loss": 1.0855, + "step": 14785 + }, + { + "epoch": 0.8983534844158211, + "grad_norm": 0.20046569406986237, + "learning_rate": 2.546043132687409e-06, + "loss": 1.006, + "step": 14786 + }, + { + "epoch": 0.8984142414484476, + "grad_norm": 0.09116701036691666, + "learning_rate": 2.5430282174372823e-06, + "loss": 1.0245, + "step": 14787 + }, + { + "epoch": 0.8984749984810741, + "grad_norm": 0.11472343653440475, + "learning_rate": 2.5400150417316358e-06, + "loss": 1.059, + "step": 14788 + }, + { + "epoch": 0.8985357555137007, + "grad_norm": 0.11024674028158188, + "learning_rate": 2.537003605680921e-06, + "loss": 1.0503, + "step": 14789 + }, + { + "epoch": 0.8985965125463272, + "grad_norm": 0.11182813346385956, + "learning_rate": 2.5339939093955047e-06, + "loss": 1.0394, + "step": 14790 + }, + { + "epoch": 0.8986572695789538, + "grad_norm": 0.1158798485994339, + "learning_rate": 2.5309859529857095e-06, + "loss": 1.0521, + "step": 14791 + }, + { + "epoch": 0.8987180266115803, + "grad_norm": 0.11270305514335632, + "learning_rate": 2.5279797365618085e-06, + "loss": 1.0086, + "step": 14792 + }, + { + "epoch": 0.8987787836442068, + "grad_norm": 0.2143092006444931, + "learning_rate": 2.524975260233986e-06, + "loss": 1.0624, + "step": 14793 + }, + { + "epoch": 0.8988395406768334, + "grad_norm": 0.14864306151866913, + "learning_rate": 2.521972524112376e-06, + "loss": 1.1003, + "step": 14794 + }, + { + "epoch": 0.8989002977094599, + "grad_norm": 0.26273003220558167, + "learning_rate": 2.5189715283070346e-06, + "loss": 1.1184, + "step": 14795 + }, + { + "epoch": 0.8989610547420864, + "grad_norm": 0.1566545069217682, + "learning_rate": 2.515972272927969e-06, + "loss": 1.0751, + "step": 14796 + }, + { + "epoch": 0.899021811774713, + "grad_norm": 0.19338296353816986, + "learning_rate": 2.512974758085124e-06, + "loss": 1.0438, + "step": 14797 + }, + { + "epoch": 0.8990825688073395, + "grad_norm": 0.10696569085121155, + "learning_rate": 2.509978983888367e-06, + "loss": 1.0386, + "step": 14798 + }, + { + "epoch": 0.8991433258399659, + "grad_norm": 0.1505601853132248, + "learning_rate": 2.506984950447505e-06, + "loss": 1.0853, + "step": 14799 + }, + { + "epoch": 0.8992040828725925, + "grad_norm": 0.15337173640727997, + "learning_rate": 2.503992657872295e-06, + "loss": 1.0246, + "step": 14800 + }, + { + "epoch": 0.899264839905219, + "grad_norm": 0.31898120045661926, + "learning_rate": 2.5010021062724097e-06, + "loss": 1.1141, + "step": 14801 + }, + { + "epoch": 0.8993255969378455, + "grad_norm": 0.21222637593746185, + "learning_rate": 2.498013295757479e-06, + "loss": 1.1675, + "step": 14802 + }, + { + "epoch": 0.8993863539704721, + "grad_norm": 0.2428138107061386, + "learning_rate": 2.4950262264370473e-06, + "loss": 1.2234, + "step": 14803 + }, + { + "epoch": 0.8994471110030986, + "grad_norm": 0.13729248940944672, + "learning_rate": 2.4920408984206166e-06, + "loss": 1.0823, + "step": 14804 + }, + { + "epoch": 0.8995078680357251, + "grad_norm": 0.14754195511341095, + "learning_rate": 2.4890573118176107e-06, + "loss": 1.1047, + "step": 14805 + }, + { + "epoch": 0.8995686250683517, + "grad_norm": 1.1289266347885132, + "learning_rate": 2.4860754667373866e-06, + "loss": 1.1209, + "step": 14806 + }, + { + "epoch": 0.8996293821009782, + "grad_norm": 0.31131842732429504, + "learning_rate": 2.4830953632892505e-06, + "loss": 1.288, + "step": 14807 + }, + { + "epoch": 0.8996901391336047, + "grad_norm": 0.10699814558029175, + "learning_rate": 2.4801170015824384e-06, + "loss": 1.0468, + "step": 14808 + }, + { + "epoch": 0.8997508961662313, + "grad_norm": 0.42613834142684937, + "learning_rate": 2.4771403817261285e-06, + "loss": 1.0187, + "step": 14809 + }, + { + "epoch": 0.8998116531988578, + "grad_norm": 0.13182315230369568, + "learning_rate": 2.4741655038294285e-06, + "loss": 1.0689, + "step": 14810 + }, + { + "epoch": 0.8998724102314843, + "grad_norm": 0.13865163922309875, + "learning_rate": 2.471192368001374e-06, + "loss": 1.0806, + "step": 14811 + }, + { + "epoch": 0.8999331672641108, + "grad_norm": 0.1953926384449005, + "learning_rate": 2.4682209743509376e-06, + "loss": 1.0799, + "step": 14812 + }, + { + "epoch": 0.8999939242967373, + "grad_norm": 2.2017624378204346, + "learning_rate": 2.4652513229870612e-06, + "loss": 1.0499, + "step": 14813 + }, + { + "epoch": 0.9000546813293638, + "grad_norm": 0.17436793446540833, + "learning_rate": 2.4622834140185846e-06, + "loss": 1.0822, + "step": 14814 + }, + { + "epoch": 0.9001154383619904, + "grad_norm": 0.3115310072898865, + "learning_rate": 2.459317247554299e-06, + "loss": 1.1325, + "step": 14815 + }, + { + "epoch": 0.9001761953946169, + "grad_norm": 0.13118942081928253, + "learning_rate": 2.4563528237029344e-06, + "loss": 1.0601, + "step": 14816 + }, + { + "epoch": 0.9002369524272434, + "grad_norm": 0.9630343317985535, + "learning_rate": 2.4533901425731365e-06, + "loss": 1.0786, + "step": 14817 + }, + { + "epoch": 0.90029770945987, + "grad_norm": 0.2966160476207733, + "learning_rate": 2.4504292042735187e-06, + "loss": 1.0761, + "step": 14818 + }, + { + "epoch": 0.9003584664924965, + "grad_norm": 0.12345825135707855, + "learning_rate": 2.4474700089126055e-06, + "loss": 1.0625, + "step": 14819 + }, + { + "epoch": 0.900419223525123, + "grad_norm": 0.239149272441864, + "learning_rate": 2.4445125565988767e-06, + "loss": 1.2008, + "step": 14820 + }, + { + "epoch": 0.9004799805577496, + "grad_norm": 0.19035352766513824, + "learning_rate": 2.441556847440729e-06, + "loss": 1.095, + "step": 14821 + }, + { + "epoch": 0.9005407375903761, + "grad_norm": 0.4163861870765686, + "learning_rate": 2.438602881546509e-06, + "loss": 1.0479, + "step": 14822 + }, + { + "epoch": 0.9006014946230027, + "grad_norm": 0.09577146172523499, + "learning_rate": 2.4356506590244967e-06, + "loss": 1.0097, + "step": 14823 + }, + { + "epoch": 0.9006622516556292, + "grad_norm": 0.15774449706077576, + "learning_rate": 2.4327001799828995e-06, + "loss": 1.0381, + "step": 14824 + }, + { + "epoch": 0.9007230086882556, + "grad_norm": 0.22189322113990784, + "learning_rate": 2.429751444529871e-06, + "loss": 1.1019, + "step": 14825 + }, + { + "epoch": 0.9007837657208821, + "grad_norm": 0.1437055468559265, + "learning_rate": 2.426804452773501e-06, + "loss": 1.0409, + "step": 14826 + }, + { + "epoch": 0.9008445227535087, + "grad_norm": 0.1471676081418991, + "learning_rate": 2.42385920482181e-06, + "loss": 1.0899, + "step": 14827 + }, + { + "epoch": 0.9009052797861352, + "grad_norm": 0.2153516709804535, + "learning_rate": 2.420915700782744e-06, + "loss": 1.2429, + "step": 14828 + }, + { + "epoch": 0.9009660368187618, + "grad_norm": 0.17334984242916107, + "learning_rate": 2.417973940764223e-06, + "loss": 1.1008, + "step": 14829 + }, + { + "epoch": 0.9010267938513883, + "grad_norm": 0.18309518694877625, + "learning_rate": 2.415033924874066e-06, + "loss": 1.0947, + "step": 14830 + }, + { + "epoch": 0.9010875508840148, + "grad_norm": 0.18143358826637268, + "learning_rate": 2.4120956532200367e-06, + "loss": 1.0275, + "step": 14831 + }, + { + "epoch": 0.9011483079166414, + "grad_norm": 0.1206001490354538, + "learning_rate": 2.409159125909838e-06, + "loss": 1.0806, + "step": 14832 + }, + { + "epoch": 0.9012090649492679, + "grad_norm": 1.0380929708480835, + "learning_rate": 2.4062243430511056e-06, + "loss": 1.0939, + "step": 14833 + }, + { + "epoch": 0.9012698219818944, + "grad_norm": 0.14141760766506195, + "learning_rate": 2.4032913047514204e-06, + "loss": 1.0364, + "step": 14834 + }, + { + "epoch": 0.901330579014521, + "grad_norm": 0.13730286061763763, + "learning_rate": 2.400360011118297e-06, + "loss": 1.0002, + "step": 14835 + }, + { + "epoch": 0.9013913360471475, + "grad_norm": 0.12602296471595764, + "learning_rate": 2.397430462259187e-06, + "loss": 1.0192, + "step": 14836 + }, + { + "epoch": 0.901452093079774, + "grad_norm": 0.2573122978210449, + "learning_rate": 2.3945026582814557e-06, + "loss": 1.1678, + "step": 14837 + }, + { + "epoch": 0.9015128501124006, + "grad_norm": 0.1748371571302414, + "learning_rate": 2.391576599292422e-06, + "loss": 1.1519, + "step": 14838 + }, + { + "epoch": 0.901573607145027, + "grad_norm": 0.18285953998565674, + "learning_rate": 2.388652285399362e-06, + "loss": 1.0914, + "step": 14839 + }, + { + "epoch": 0.9016343641776535, + "grad_norm": 0.15655595064163208, + "learning_rate": 2.3857297167094508e-06, + "loss": 1.0505, + "step": 14840 + }, + { + "epoch": 0.9016951212102801, + "grad_norm": 0.18352477252483368, + "learning_rate": 2.3828088933298243e-06, + "loss": 1.049, + "step": 14841 + }, + { + "epoch": 0.9017558782429066, + "grad_norm": 0.22545355558395386, + "learning_rate": 2.379889815367542e-06, + "loss": 1.2236, + "step": 14842 + }, + { + "epoch": 0.9018166352755331, + "grad_norm": 0.11439885944128036, + "learning_rate": 2.3769724829296015e-06, + "loss": 1.0496, + "step": 14843 + }, + { + "epoch": 0.9018773923081597, + "grad_norm": 0.15851031243801117, + "learning_rate": 2.3740568961229394e-06, + "loss": 1.0856, + "step": 14844 + }, + { + "epoch": 0.9019381493407862, + "grad_norm": 0.14722847938537598, + "learning_rate": 2.3711430550544257e-06, + "loss": 1.0487, + "step": 14845 + }, + { + "epoch": 0.9019989063734127, + "grad_norm": 0.2087150514125824, + "learning_rate": 2.3682309598308747e-06, + "loss": 1.1448, + "step": 14846 + }, + { + "epoch": 0.9020596634060393, + "grad_norm": 0.17388825118541718, + "learning_rate": 2.365320610559024e-06, + "loss": 1.0812, + "step": 14847 + }, + { + "epoch": 0.9021204204386658, + "grad_norm": 0.10240533202886581, + "learning_rate": 2.3624120073455548e-06, + "loss": 1.0514, + "step": 14848 + }, + { + "epoch": 0.9021811774712923, + "grad_norm": 0.1334536075592041, + "learning_rate": 2.35950515029707e-06, + "loss": 1.0451, + "step": 14849 + }, + { + "epoch": 0.9022419345039189, + "grad_norm": 0.12250686436891556, + "learning_rate": 2.3566000395201513e-06, + "loss": 1.0514, + "step": 14850 + }, + { + "epoch": 0.9023026915365454, + "grad_norm": 0.19700130820274353, + "learning_rate": 2.3536966751212586e-06, + "loss": 1.0974, + "step": 14851 + }, + { + "epoch": 0.9023634485691718, + "grad_norm": 0.10672856867313385, + "learning_rate": 2.3507950572068228e-06, + "loss": 1.1092, + "step": 14852 + }, + { + "epoch": 0.9024242056017984, + "grad_norm": 0.28611481189727783, + "learning_rate": 2.347895185883203e-06, + "loss": 1.0191, + "step": 14853 + }, + { + "epoch": 0.9024849626344249, + "grad_norm": 0.18546275794506073, + "learning_rate": 2.344997061256693e-06, + "loss": 1.0589, + "step": 14854 + }, + { + "epoch": 0.9025457196670514, + "grad_norm": 0.14217063784599304, + "learning_rate": 2.3421006834335348e-06, + "loss": 1.0837, + "step": 14855 + }, + { + "epoch": 0.902606476699678, + "grad_norm": 0.18248297274112701, + "learning_rate": 2.3392060525198935e-06, + "loss": 1.0928, + "step": 14856 + }, + { + "epoch": 0.9026672337323045, + "grad_norm": 0.10684803873300552, + "learning_rate": 2.3363131686218563e-06, + "loss": 1.0422, + "step": 14857 + }, + { + "epoch": 0.902727990764931, + "grad_norm": 0.09788000583648682, + "learning_rate": 2.333422031845478e-06, + "loss": 0.9968, + "step": 14858 + }, + { + "epoch": 0.9027887477975576, + "grad_norm": 0.10721543431282043, + "learning_rate": 2.3305326422967178e-06, + "loss": 1.0339, + "step": 14859 + }, + { + "epoch": 0.9028495048301841, + "grad_norm": 0.1177595779299736, + "learning_rate": 2.3276450000815076e-06, + "loss": 1.025, + "step": 14860 + }, + { + "epoch": 0.9029102618628106, + "grad_norm": 0.27633893489837646, + "learning_rate": 2.324759105305685e-06, + "loss": 1.1643, + "step": 14861 + }, + { + "epoch": 0.9029710188954372, + "grad_norm": 0.10858333855867386, + "learning_rate": 2.321874958075032e-06, + "loss": 1.0431, + "step": 14862 + }, + { + "epoch": 0.9030317759280637, + "grad_norm": 0.15864817798137665, + "learning_rate": 2.318992558495281e-06, + "loss": 1.1289, + "step": 14863 + }, + { + "epoch": 0.9030925329606903, + "grad_norm": 0.09833846986293793, + "learning_rate": 2.316111906672053e-06, + "loss": 1.0525, + "step": 14864 + }, + { + "epoch": 0.9031532899933167, + "grad_norm": 0.14392895996570587, + "learning_rate": 2.3132330027109683e-06, + "loss": 1.0411, + "step": 14865 + }, + { + "epoch": 0.9032140470259432, + "grad_norm": 0.1342892348766327, + "learning_rate": 2.310355846717549e-06, + "loss": 1.0434, + "step": 14866 + }, + { + "epoch": 0.9032748040585697, + "grad_norm": 0.213627889752388, + "learning_rate": 2.3074804387972493e-06, + "loss": 1.0886, + "step": 14867 + }, + { + "epoch": 0.9033355610911963, + "grad_norm": 0.15513767302036285, + "learning_rate": 2.3046067790554794e-06, + "loss": 1.1355, + "step": 14868 + }, + { + "epoch": 0.9033963181238228, + "grad_norm": 0.30452588200569153, + "learning_rate": 2.3017348675975604e-06, + "loss": 1.2791, + "step": 14869 + }, + { + "epoch": 0.9034570751564494, + "grad_norm": 19.968107223510742, + "learning_rate": 2.2988647045287747e-06, + "loss": 1.105, + "step": 14870 + }, + { + "epoch": 0.9035178321890759, + "grad_norm": 0.13805511593818665, + "learning_rate": 2.295996289954322e-06, + "loss": 1.06, + "step": 14871 + }, + { + "epoch": 0.9035785892217024, + "grad_norm": 0.16649939119815826, + "learning_rate": 2.2931296239793455e-06, + "loss": 1.094, + "step": 14872 + }, + { + "epoch": 0.903639346254329, + "grad_norm": 0.10299520939588547, + "learning_rate": 2.290264706708922e-06, + "loss": 1.0535, + "step": 14873 + }, + { + "epoch": 0.9037001032869555, + "grad_norm": 0.1757773607969284, + "learning_rate": 2.287401538248074e-06, + "loss": 1.0288, + "step": 14874 + }, + { + "epoch": 0.903760860319582, + "grad_norm": 0.22568610310554504, + "learning_rate": 2.2845401187017333e-06, + "loss": 1.2472, + "step": 14875 + }, + { + "epoch": 0.9038216173522086, + "grad_norm": 0.2091546654701233, + "learning_rate": 2.2816804481748055e-06, + "loss": 1.1392, + "step": 14876 + }, + { + "epoch": 0.9038823743848351, + "grad_norm": 0.20075146853923798, + "learning_rate": 2.278822526772112e-06, + "loss": 1.0969, + "step": 14877 + }, + { + "epoch": 0.9039431314174615, + "grad_norm": 0.12191522121429443, + "learning_rate": 2.2759663545983966e-06, + "loss": 1.0571, + "step": 14878 + }, + { + "epoch": 0.9040038884500881, + "grad_norm": 0.09095070511102676, + "learning_rate": 2.2731119317583593e-06, + "loss": 0.9987, + "step": 14879 + }, + { + "epoch": 0.9040646454827146, + "grad_norm": 0.1410599797964096, + "learning_rate": 2.270259258356622e-06, + "loss": 1.0615, + "step": 14880 + }, + { + "epoch": 0.9041254025153411, + "grad_norm": 0.18395306169986725, + "learning_rate": 2.2674083344977615e-06, + "loss": 1.1135, + "step": 14881 + }, + { + "epoch": 0.9041861595479677, + "grad_norm": 0.5182232856750488, + "learning_rate": 2.2645591602862836e-06, + "loss": 1.0553, + "step": 14882 + }, + { + "epoch": 0.9042469165805942, + "grad_norm": 0.25021278858184814, + "learning_rate": 2.261711735826616e-06, + "loss": 1.1731, + "step": 14883 + }, + { + "epoch": 0.9043076736132207, + "grad_norm": 0.09032687544822693, + "learning_rate": 2.258866061223119e-06, + "loss": 1.0013, + "step": 14884 + }, + { + "epoch": 0.9043684306458473, + "grad_norm": 0.11903296411037445, + "learning_rate": 2.2560221365801273e-06, + "loss": 0.9914, + "step": 14885 + }, + { + "epoch": 0.9044291876784738, + "grad_norm": 0.13379590213298798, + "learning_rate": 2.2531799620018667e-06, + "loss": 1.0353, + "step": 14886 + }, + { + "epoch": 0.9044899447111003, + "grad_norm": 0.12906059622764587, + "learning_rate": 2.2503395375925275e-06, + "loss": 0.9949, + "step": 14887 + }, + { + "epoch": 0.9045507017437269, + "grad_norm": 0.18384777009487152, + "learning_rate": 2.24750086345622e-06, + "loss": 1.085, + "step": 14888 + }, + { + "epoch": 0.9046114587763534, + "grad_norm": 0.3484676480293274, + "learning_rate": 2.2446639396970006e-06, + "loss": 1.1427, + "step": 14889 + }, + { + "epoch": 0.9046722158089799, + "grad_norm": 0.17215284705162048, + "learning_rate": 2.241828766418852e-06, + "loss": 1.1103, + "step": 14890 + }, + { + "epoch": 0.9047329728416064, + "grad_norm": 0.20290076732635498, + "learning_rate": 2.238995343725703e-06, + "loss": 1.0607, + "step": 14891 + }, + { + "epoch": 0.9047937298742329, + "grad_norm": 0.10693349689245224, + "learning_rate": 2.236163671721414e-06, + "loss": 1.068, + "step": 14892 + }, + { + "epoch": 0.9048544869068594, + "grad_norm": 0.1770641803741455, + "learning_rate": 2.233333750509775e-06, + "loss": 1.0573, + "step": 14893 + }, + { + "epoch": 0.904915243939486, + "grad_norm": 0.11270394921302795, + "learning_rate": 2.2305055801945253e-06, + "loss": 0.9978, + "step": 14894 + }, + { + "epoch": 0.9049760009721125, + "grad_norm": 0.17094773054122925, + "learning_rate": 2.2276791608793257e-06, + "loss": 1.1332, + "step": 14895 + }, + { + "epoch": 0.905036758004739, + "grad_norm": 0.16758529841899872, + "learning_rate": 2.2248544926677717e-06, + "loss": 1.0376, + "step": 14896 + }, + { + "epoch": 0.9050975150373656, + "grad_norm": 0.15046589076519012, + "learning_rate": 2.2220315756634245e-06, + "loss": 1.0281, + "step": 14897 + }, + { + "epoch": 0.9051582720699921, + "grad_norm": 0.1624196469783783, + "learning_rate": 2.2192104099697407e-06, + "loss": 1.0697, + "step": 14898 + }, + { + "epoch": 0.9052190291026186, + "grad_norm": 0.16985276341438293, + "learning_rate": 2.216390995690132e-06, + "loss": 1.0238, + "step": 14899 + }, + { + "epoch": 0.9052797861352452, + "grad_norm": 0.1171732023358345, + "learning_rate": 2.213573332927943e-06, + "loss": 1.0048, + "step": 14900 + }, + { + "epoch": 0.9053405431678717, + "grad_norm": 0.17110712826251984, + "learning_rate": 2.2107574217864634e-06, + "loss": 1.0858, + "step": 14901 + }, + { + "epoch": 0.9054013002004982, + "grad_norm": 1.0572800636291504, + "learning_rate": 2.2079432623689113e-06, + "loss": 1.0112, + "step": 14902 + }, + { + "epoch": 0.9054620572331248, + "grad_norm": 1.7373898029327393, + "learning_rate": 2.205130854778442e-06, + "loss": 1.0129, + "step": 14903 + }, + { + "epoch": 0.9055228142657512, + "grad_norm": 0.08647758513689041, + "learning_rate": 2.2023201991181354e-06, + "loss": 1.0087, + "step": 14904 + }, + { + "epoch": 0.9055835712983777, + "grad_norm": 0.13178007304668427, + "learning_rate": 2.1995112954910134e-06, + "loss": 1.0393, + "step": 14905 + }, + { + "epoch": 0.9056443283310043, + "grad_norm": 0.15629468858242035, + "learning_rate": 2.1967041440000558e-06, + "loss": 1.0491, + "step": 14906 + }, + { + "epoch": 0.9057050853636308, + "grad_norm": 0.11591527611017227, + "learning_rate": 2.1938987447481405e-06, + "loss": 1.0753, + "step": 14907 + }, + { + "epoch": 0.9057658423962573, + "grad_norm": 0.27414581179618835, + "learning_rate": 2.1910950978381086e-06, + "loss": 1.1585, + "step": 14908 + }, + { + "epoch": 0.9058265994288839, + "grad_norm": 0.14085903763771057, + "learning_rate": 2.188293203372732e-06, + "loss": 1.1002, + "step": 14909 + }, + { + "epoch": 0.9058873564615104, + "grad_norm": 0.0992974266409874, + "learning_rate": 2.1854930614547074e-06, + "loss": 1.0517, + "step": 14910 + }, + { + "epoch": 0.905948113494137, + "grad_norm": 0.12857897579669952, + "learning_rate": 2.1826946721866747e-06, + "loss": 1.0681, + "step": 14911 + }, + { + "epoch": 0.9060088705267635, + "grad_norm": 0.17564108967781067, + "learning_rate": 2.179898035671213e-06, + "loss": 1.0624, + "step": 14912 + }, + { + "epoch": 0.90606962755939, + "grad_norm": 0.08895411342382431, + "learning_rate": 2.1771031520108343e-06, + "loss": 1.0147, + "step": 14913 + }, + { + "epoch": 0.9061303845920166, + "grad_norm": 0.20652857422828674, + "learning_rate": 2.1743100213079794e-06, + "loss": 1.1615, + "step": 14914 + }, + { + "epoch": 0.9061911416246431, + "grad_norm": 0.24078267812728882, + "learning_rate": 2.171518643665038e-06, + "loss": 1.0412, + "step": 14915 + }, + { + "epoch": 0.9062518986572696, + "grad_norm": 0.17702774703502655, + "learning_rate": 2.1687290191843178e-06, + "loss": 1.0171, + "step": 14916 + }, + { + "epoch": 0.906312655689896, + "grad_norm": 0.13435055315494537, + "learning_rate": 2.165941147968098e-06, + "loss": 1.0695, + "step": 14917 + }, + { + "epoch": 0.9063734127225226, + "grad_norm": 0.11238982528448105, + "learning_rate": 2.163155030118541e-06, + "loss": 1.016, + "step": 14918 + }, + { + "epoch": 0.9064341697551491, + "grad_norm": 0.11210320889949799, + "learning_rate": 2.160370665737782e-06, + "loss": 1.0352, + "step": 14919 + }, + { + "epoch": 0.9064949267877757, + "grad_norm": 0.13768525421619415, + "learning_rate": 2.157588054927884e-06, + "loss": 1.0964, + "step": 14920 + }, + { + "epoch": 0.9065556838204022, + "grad_norm": 0.11701498925685883, + "learning_rate": 2.1548071977908423e-06, + "loss": 1.0417, + "step": 14921 + }, + { + "epoch": 0.9066164408530287, + "grad_norm": 0.10916713625192642, + "learning_rate": 2.152028094428593e-06, + "loss": 1.0241, + "step": 14922 + }, + { + "epoch": 0.9066771978856553, + "grad_norm": 1.759683609008789, + "learning_rate": 2.1492507449430098e-06, + "loss": 1.0213, + "step": 14923 + }, + { + "epoch": 0.9067379549182818, + "grad_norm": 0.19697317481040955, + "learning_rate": 2.1464751494358837e-06, + "loss": 1.1385, + "step": 14924 + }, + { + "epoch": 0.9067987119509083, + "grad_norm": 0.11534097790718079, + "learning_rate": 2.1437013080089608e-06, + "loss": 1.0203, + "step": 14925 + }, + { + "epoch": 0.9068594689835349, + "grad_norm": 0.12873831391334534, + "learning_rate": 2.1409292207639096e-06, + "loss": 1.0734, + "step": 14926 + }, + { + "epoch": 0.9069202260161614, + "grad_norm": 0.1048264354467392, + "learning_rate": 2.138158887802355e-06, + "loss": 1.0258, + "step": 14927 + }, + { + "epoch": 0.9069809830487879, + "grad_norm": 0.16935966908931732, + "learning_rate": 2.1353903092258374e-06, + "loss": 1.1621, + "step": 14928 + }, + { + "epoch": 0.9070417400814145, + "grad_norm": 0.13281705975532532, + "learning_rate": 2.132623485135843e-06, + "loss": 1.0636, + "step": 14929 + }, + { + "epoch": 0.9071024971140409, + "grad_norm": 0.15051071345806122, + "learning_rate": 2.1298584156337896e-06, + "loss": 1.0907, + "step": 14930 + }, + { + "epoch": 0.9071632541466674, + "grad_norm": 0.16882653534412384, + "learning_rate": 2.1270951008210192e-06, + "loss": 1.1023, + "step": 14931 + }, + { + "epoch": 0.907224011179294, + "grad_norm": 0.1358702927827835, + "learning_rate": 2.1243335407988396e-06, + "loss": 1.038, + "step": 14932 + }, + { + "epoch": 0.9072847682119205, + "grad_norm": 0.13505752384662628, + "learning_rate": 2.121573735668464e-06, + "loss": 1.0085, + "step": 14933 + }, + { + "epoch": 0.907345525244547, + "grad_norm": 0.1285618245601654, + "learning_rate": 2.1188156855310613e-06, + "loss": 1.0184, + "step": 14934 + }, + { + "epoch": 0.9074062822771736, + "grad_norm": 0.15828286111354828, + "learning_rate": 2.1160593904877235e-06, + "loss": 1.1131, + "step": 14935 + }, + { + "epoch": 0.9074670393098001, + "grad_norm": 0.12114787101745605, + "learning_rate": 2.113304850639486e-06, + "loss": 1.0311, + "step": 14936 + }, + { + "epoch": 0.9075277963424266, + "grad_norm": 1.705637812614441, + "learning_rate": 2.1105520660873123e-06, + "loss": 1.0907, + "step": 14937 + }, + { + "epoch": 0.9075885533750532, + "grad_norm": 0.11159954220056534, + "learning_rate": 2.107801036932111e-06, + "loss": 1.0591, + "step": 14938 + }, + { + "epoch": 0.9076493104076797, + "grad_norm": 0.13896577060222626, + "learning_rate": 2.1050517632747178e-06, + "loss": 1.0834, + "step": 14939 + }, + { + "epoch": 0.9077100674403062, + "grad_norm": 0.16339844465255737, + "learning_rate": 2.1023042452159136e-06, + "loss": 1.1279, + "step": 14940 + }, + { + "epoch": 0.9077708244729328, + "grad_norm": 0.14502331614494324, + "learning_rate": 2.0995584828564064e-06, + "loss": 1.0893, + "step": 14941 + }, + { + "epoch": 0.9078315815055593, + "grad_norm": 0.11531216651201248, + "learning_rate": 2.096814476296832e-06, + "loss": 1.0076, + "step": 14942 + }, + { + "epoch": 0.9078923385381859, + "grad_norm": 0.17742054164409637, + "learning_rate": 2.094072225637794e-06, + "loss": 1.165, + "step": 14943 + }, + { + "epoch": 0.9079530955708123, + "grad_norm": 0.15709777176380157, + "learning_rate": 2.091331730979801e-06, + "loss": 1.0494, + "step": 14944 + }, + { + "epoch": 0.9080138526034388, + "grad_norm": 0.20858605206012726, + "learning_rate": 2.0885929924232994e-06, + "loss": 1.0875, + "step": 14945 + }, + { + "epoch": 0.9080746096360653, + "grad_norm": 0.14397738873958588, + "learning_rate": 2.0858560100686763e-06, + "loss": 1.067, + "step": 14946 + }, + { + "epoch": 0.9081353666686919, + "grad_norm": 0.12032930552959442, + "learning_rate": 2.0831207840162626e-06, + "loss": 1.0613, + "step": 14947 + }, + { + "epoch": 0.9081961237013184, + "grad_norm": 0.14941547811031342, + "learning_rate": 2.0803873143663277e-06, + "loss": 1.0364, + "step": 14948 + }, + { + "epoch": 0.908256880733945, + "grad_norm": 0.10667446255683899, + "learning_rate": 2.0776556012190527e-06, + "loss": 1.0121, + "step": 14949 + }, + { + "epoch": 0.9083176377665715, + "grad_norm": 0.3162420988082886, + "learning_rate": 2.07492564467458e-06, + "loss": 1.1724, + "step": 14950 + }, + { + "epoch": 0.908378394799198, + "grad_norm": 0.1320541799068451, + "learning_rate": 2.0721974448329683e-06, + "loss": 1.0384, + "step": 14951 + }, + { + "epoch": 0.9084391518318246, + "grad_norm": 0.27250608801841736, + "learning_rate": 2.069471001794221e-06, + "loss": 1.1995, + "step": 14952 + }, + { + "epoch": 0.9084999088644511, + "grad_norm": 0.09595417976379395, + "learning_rate": 2.06674631565828e-06, + "loss": 1.0296, + "step": 14953 + }, + { + "epoch": 0.9085606658970776, + "grad_norm": 0.1230664774775505, + "learning_rate": 2.0640233865250215e-06, + "loss": 1.044, + "step": 14954 + }, + { + "epoch": 0.9086214229297042, + "grad_norm": 0.10095830261707306, + "learning_rate": 2.0613022144942482e-06, + "loss": 1.0377, + "step": 14955 + }, + { + "epoch": 0.9086821799623307, + "grad_norm": 1.5195826292037964, + "learning_rate": 2.058582799665709e-06, + "loss": 1.0577, + "step": 14956 + }, + { + "epoch": 0.9087429369949571, + "grad_norm": 0.12075446546077728, + "learning_rate": 2.055865142139085e-06, + "loss": 1.0343, + "step": 14957 + }, + { + "epoch": 0.9088036940275837, + "grad_norm": 0.10185153037309647, + "learning_rate": 2.053149242013991e-06, + "loss": 0.9888, + "step": 14958 + }, + { + "epoch": 0.9088644510602102, + "grad_norm": 0.1275072693824768, + "learning_rate": 2.05043509938998e-06, + "loss": 1.0409, + "step": 14959 + }, + { + "epoch": 0.9089252080928367, + "grad_norm": 0.19518202543258667, + "learning_rate": 2.04772271436654e-06, + "loss": 1.0735, + "step": 14960 + }, + { + "epoch": 0.9089859651254633, + "grad_norm": 0.10764231532812119, + "learning_rate": 2.045012087043091e-06, + "loss": 1.0423, + "step": 14961 + }, + { + "epoch": 0.9090467221580898, + "grad_norm": 0.2531062960624695, + "learning_rate": 2.0423032175189983e-06, + "loss": 1.1535, + "step": 14962 + }, + { + "epoch": 0.9091074791907163, + "grad_norm": 0.25562238693237305, + "learning_rate": 2.0395961058935377e-06, + "loss": 1.0257, + "step": 14963 + }, + { + "epoch": 0.9091682362233429, + "grad_norm": 0.12102354317903519, + "learning_rate": 2.0368907522659686e-06, + "loss": 1.0686, + "step": 14964 + }, + { + "epoch": 0.9092289932559694, + "grad_norm": 0.13641558587551117, + "learning_rate": 2.034187156735434e-06, + "loss": 1.0498, + "step": 14965 + }, + { + "epoch": 0.9092897502885959, + "grad_norm": 0.1522539108991623, + "learning_rate": 2.0314853194010385e-06, + "loss": 1.1173, + "step": 14966 + }, + { + "epoch": 0.9093505073212225, + "grad_norm": 0.14811113476753235, + "learning_rate": 2.0287852403618247e-06, + "loss": 1.0094, + "step": 14967 + }, + { + "epoch": 0.909411264353849, + "grad_norm": 0.1169791966676712, + "learning_rate": 2.026086919716752e-06, + "loss": 1.0588, + "step": 14968 + }, + { + "epoch": 0.9094720213864755, + "grad_norm": 0.11501649022102356, + "learning_rate": 2.023390357564742e-06, + "loss": 1.0412, + "step": 14969 + }, + { + "epoch": 0.909532778419102, + "grad_norm": 0.12850110232830048, + "learning_rate": 2.020695554004637e-06, + "loss": 1.0739, + "step": 14970 + }, + { + "epoch": 0.9095935354517285, + "grad_norm": 0.1282275766134262, + "learning_rate": 2.0180025091352086e-06, + "loss": 1.0616, + "step": 14971 + }, + { + "epoch": 0.909654292484355, + "grad_norm": 0.28371360898017883, + "learning_rate": 2.0153112230551717e-06, + "loss": 1.1005, + "step": 14972 + }, + { + "epoch": 0.9097150495169816, + "grad_norm": 0.14681534469127655, + "learning_rate": 2.0126216958631707e-06, + "loss": 1.0195, + "step": 14973 + }, + { + "epoch": 0.9097758065496081, + "grad_norm": 0.13920894265174866, + "learning_rate": 2.009933927657803e-06, + "loss": 1.0926, + "step": 14974 + }, + { + "epoch": 0.9098365635822346, + "grad_norm": 0.09868458658456802, + "learning_rate": 2.0072479185375805e-06, + "loss": 1.0385, + "step": 14975 + }, + { + "epoch": 0.9098973206148612, + "grad_norm": 0.13641227781772614, + "learning_rate": 2.004563668600967e-06, + "loss": 1.0784, + "step": 14976 + }, + { + "epoch": 0.9099580776474877, + "grad_norm": 0.19605377316474915, + "learning_rate": 2.0018811779463464e-06, + "loss": 1.0987, + "step": 14977 + }, + { + "epoch": 0.9100188346801142, + "grad_norm": 0.10820509493350983, + "learning_rate": 1.9992004466720503e-06, + "loss": 1.0334, + "step": 14978 + }, + { + "epoch": 0.9100795917127408, + "grad_norm": 0.09274052083492279, + "learning_rate": 1.9965214748763395e-06, + "loss": 1.0015, + "step": 14979 + }, + { + "epoch": 0.9101403487453673, + "grad_norm": 0.22717702388763428, + "learning_rate": 1.993844262657413e-06, + "loss": 1.0885, + "step": 14980 + }, + { + "epoch": 0.9102011057779938, + "grad_norm": 0.13725969195365906, + "learning_rate": 1.991168810113403e-06, + "loss": 1.038, + "step": 14981 + }, + { + "epoch": 0.9102618628106204, + "grad_norm": 0.13086746633052826, + "learning_rate": 1.9884951173423814e-06, + "loss": 1.0778, + "step": 14982 + }, + { + "epoch": 0.9103226198432468, + "grad_norm": 0.09827271103858948, + "learning_rate": 1.9858231844423536e-06, + "loss": 1.0303, + "step": 14983 + }, + { + "epoch": 0.9103833768758733, + "grad_norm": 0.10377005487680435, + "learning_rate": 1.9831530115112575e-06, + "loss": 1.0548, + "step": 14984 + }, + { + "epoch": 0.9104441339084999, + "grad_norm": 0.1241227388381958, + "learning_rate": 1.98048459864697e-06, + "loss": 1.0995, + "step": 14985 + }, + { + "epoch": 0.9105048909411264, + "grad_norm": 0.11876312643289566, + "learning_rate": 1.9778179459472968e-06, + "loss": 1.0295, + "step": 14986 + }, + { + "epoch": 0.910565647973753, + "grad_norm": 0.17811179161071777, + "learning_rate": 1.9751530535099926e-06, + "loss": 1.1226, + "step": 14987 + }, + { + "epoch": 0.9106264050063795, + "grad_norm": 0.11316623538732529, + "learning_rate": 1.972489921432735e-06, + "loss": 1.0308, + "step": 14988 + }, + { + "epoch": 0.910687162039006, + "grad_norm": 0.24598747491836548, + "learning_rate": 1.969828549813141e-06, + "loss": 1.0775, + "step": 14989 + }, + { + "epoch": 0.9107479190716326, + "grad_norm": 0.6849556565284729, + "learning_rate": 1.9671689387487756e-06, + "loss": 1.1127, + "step": 14990 + }, + { + "epoch": 0.9108086761042591, + "grad_norm": 2.2096712589263916, + "learning_rate": 1.9645110883371064e-06, + "loss": 1.0574, + "step": 14991 + }, + { + "epoch": 0.9108694331368856, + "grad_norm": 0.14541193842887878, + "learning_rate": 1.9618549986755717e-06, + "loss": 1.0415, + "step": 14992 + }, + { + "epoch": 0.9109301901695122, + "grad_norm": 0.1036265566945076, + "learning_rate": 1.959200669861522e-06, + "loss": 1.0633, + "step": 14993 + }, + { + "epoch": 0.9109909472021387, + "grad_norm": 0.17369700968265533, + "learning_rate": 1.956548101992267e-06, + "loss": 1.0696, + "step": 14994 + }, + { + "epoch": 0.9110517042347652, + "grad_norm": 0.13502176105976105, + "learning_rate": 1.9538972951650245e-06, + "loss": 1.0682, + "step": 14995 + }, + { + "epoch": 0.9111124612673916, + "grad_norm": 0.1685733199119568, + "learning_rate": 1.9512482494769613e-06, + "loss": 1.0532, + "step": 14996 + }, + { + "epoch": 0.9111732183000182, + "grad_norm": 0.501579999923706, + "learning_rate": 1.948600965025188e-06, + "loss": 1.0255, + "step": 14997 + }, + { + "epoch": 0.9112339753326447, + "grad_norm": 0.20074892044067383, + "learning_rate": 1.9459554419067273e-06, + "loss": 1.1113, + "step": 14998 + }, + { + "epoch": 0.9112947323652713, + "grad_norm": 0.13688123226165771, + "learning_rate": 1.9433116802185626e-06, + "loss": 1.0556, + "step": 14999 + }, + { + "epoch": 0.9113554893978978, + "grad_norm": 0.21024824678897858, + "learning_rate": 1.940669680057594e-06, + "loss": 1.0449, + "step": 15000 + }, + { + "epoch": 0.9114162464305243, + "grad_norm": 0.13234519958496094, + "learning_rate": 1.9380294415206723e-06, + "loss": 1.0707, + "step": 15001 + }, + { + "epoch": 0.9114770034631509, + "grad_norm": 0.10887365788221359, + "learning_rate": 1.935390964704564e-06, + "loss": 1.0696, + "step": 15002 + }, + { + "epoch": 0.9115377604957774, + "grad_norm": 0.1296595185995102, + "learning_rate": 1.932754249705998e-06, + "loss": 1.0096, + "step": 15003 + }, + { + "epoch": 0.9115985175284039, + "grad_norm": 0.12822416424751282, + "learning_rate": 1.930119296621613e-06, + "loss": 1.0501, + "step": 15004 + }, + { + "epoch": 0.9116592745610305, + "grad_norm": 0.27920305728912354, + "learning_rate": 1.9274861055479986e-06, + "loss": 1.1158, + "step": 15005 + }, + { + "epoch": 0.911720031593657, + "grad_norm": 0.1370418816804886, + "learning_rate": 1.9248546765816722e-06, + "loss": 1.0468, + "step": 15006 + }, + { + "epoch": 0.9117807886262835, + "grad_norm": 0.13695718348026276, + "learning_rate": 1.9222250098190897e-06, + "loss": 1.0816, + "step": 15007 + }, + { + "epoch": 0.9118415456589101, + "grad_norm": 0.1371576189994812, + "learning_rate": 1.9195971053566464e-06, + "loss": 1.115, + "step": 15008 + }, + { + "epoch": 0.9119023026915365, + "grad_norm": 0.1741715967655182, + "learning_rate": 1.916970963290654e-06, + "loss": 1.0969, + "step": 15009 + }, + { + "epoch": 0.911963059724163, + "grad_norm": 0.13772989809513092, + "learning_rate": 1.9143465837174023e-06, + "loss": 1.0289, + "step": 15010 + }, + { + "epoch": 0.9120238167567896, + "grad_norm": 0.1589927226305008, + "learning_rate": 1.911723966733059e-06, + "loss": 1.0327, + "step": 15011 + }, + { + "epoch": 0.9120845737894161, + "grad_norm": 0.0933261290192604, + "learning_rate": 1.9091031124337744e-06, + "loss": 1.0767, + "step": 15012 + }, + { + "epoch": 0.9121453308220426, + "grad_norm": 0.19839192926883698, + "learning_rate": 1.906484020915611e-06, + "loss": 0.9879, + "step": 15013 + }, + { + "epoch": 0.9122060878546692, + "grad_norm": 0.1780577450990677, + "learning_rate": 1.903866692274564e-06, + "loss": 1.1104, + "step": 15014 + }, + { + "epoch": 0.9122668448872957, + "grad_norm": 0.1848917007446289, + "learning_rate": 1.9012511266065847e-06, + "loss": 1.0603, + "step": 15015 + }, + { + "epoch": 0.9123276019199222, + "grad_norm": 0.1053280457854271, + "learning_rate": 1.8986373240075462e-06, + "loss": 1.041, + "step": 15016 + }, + { + "epoch": 0.9123883589525488, + "grad_norm": 0.10170558094978333, + "learning_rate": 1.8960252845732551e-06, + "loss": 1.0641, + "step": 15017 + }, + { + "epoch": 0.9124491159851753, + "grad_norm": 0.1257053166627884, + "learning_rate": 1.8934150083994518e-06, + "loss": 1.0007, + "step": 15018 + }, + { + "epoch": 0.9125098730178018, + "grad_norm": 0.22070614993572235, + "learning_rate": 1.890806495581815e-06, + "loss": 1.0581, + "step": 15019 + }, + { + "epoch": 0.9125706300504284, + "grad_norm": 0.1336812525987625, + "learning_rate": 1.888199746215974e-06, + "loss": 1.0686, + "step": 15020 + }, + { + "epoch": 0.9126313870830549, + "grad_norm": 0.1422586888074875, + "learning_rate": 1.8855947603974688e-06, + "loss": 1.0763, + "step": 15021 + }, + { + "epoch": 0.9126921441156813, + "grad_norm": 0.12930350005626678, + "learning_rate": 1.8829915382217899e-06, + "loss": 1.0757, + "step": 15022 + }, + { + "epoch": 0.9127529011483079, + "grad_norm": 0.21522289514541626, + "learning_rate": 1.88039007978435e-06, + "loss": 1.1454, + "step": 15023 + }, + { + "epoch": 0.9128136581809344, + "grad_norm": 0.1586683988571167, + "learning_rate": 1.8777903851805222e-06, + "loss": 1.1496, + "step": 15024 + }, + { + "epoch": 0.9128744152135609, + "grad_norm": 0.16037622094154358, + "learning_rate": 1.875192454505581e-06, + "loss": 1.0363, + "step": 15025 + }, + { + "epoch": 0.9129351722461875, + "grad_norm": 0.17235533893108368, + "learning_rate": 1.8725962878547721e-06, + "loss": 1.0429, + "step": 15026 + }, + { + "epoch": 0.912995929278814, + "grad_norm": 0.2739880084991455, + "learning_rate": 1.8700018853232416e-06, + "loss": 1.1995, + "step": 15027 + }, + { + "epoch": 0.9130566863114405, + "grad_norm": 0.10850818455219269, + "learning_rate": 1.8674092470060966e-06, + "loss": 1.0333, + "step": 15028 + }, + { + "epoch": 0.9131174433440671, + "grad_norm": 0.13918934762477875, + "learning_rate": 1.8648183729983726e-06, + "loss": 1.0514, + "step": 15029 + }, + { + "epoch": 0.9131782003766936, + "grad_norm": 0.20980829000473022, + "learning_rate": 1.8622292633950266e-06, + "loss": 1.1437, + "step": 15030 + }, + { + "epoch": 0.9132389574093202, + "grad_norm": 0.15558724105358124, + "learning_rate": 1.859641918290983e-06, + "loss": 1.08, + "step": 15031 + }, + { + "epoch": 0.9132997144419467, + "grad_norm": 0.13440930843353271, + "learning_rate": 1.8570563377810657e-06, + "loss": 1.073, + "step": 15032 + }, + { + "epoch": 0.9133604714745732, + "grad_norm": 0.12515634298324585, + "learning_rate": 1.8544725219600544e-06, + "loss": 1.0554, + "step": 15033 + }, + { + "epoch": 0.9134212285071998, + "grad_norm": 0.13739170134067535, + "learning_rate": 1.8518904709226626e-06, + "loss": 1.1265, + "step": 15034 + }, + { + "epoch": 0.9134819855398262, + "grad_norm": 0.11433909088373184, + "learning_rate": 1.8493101847635196e-06, + "loss": 1.0616, + "step": 15035 + }, + { + "epoch": 0.9135427425724527, + "grad_norm": 0.12163405865430832, + "learning_rate": 1.846731663577228e-06, + "loss": 1.1028, + "step": 15036 + }, + { + "epoch": 0.9136034996050793, + "grad_norm": 0.1588326394557953, + "learning_rate": 1.8441549074583008e-06, + "loss": 1.0424, + "step": 15037 + }, + { + "epoch": 0.9136642566377058, + "grad_norm": 0.10011840611696243, + "learning_rate": 1.8415799165011793e-06, + "loss": 1.0136, + "step": 15038 + }, + { + "epoch": 0.9137250136703323, + "grad_norm": 0.1284933090209961, + "learning_rate": 1.8390066908002545e-06, + "loss": 1.0365, + "step": 15039 + }, + { + "epoch": 0.9137857707029589, + "grad_norm": 0.3652207851409912, + "learning_rate": 1.83643523044984e-06, + "loss": 1.1992, + "step": 15040 + }, + { + "epoch": 0.9138465277355854, + "grad_norm": 0.11399808526039124, + "learning_rate": 1.8338655355442104e-06, + "loss": 1.0214, + "step": 15041 + }, + { + "epoch": 0.9139072847682119, + "grad_norm": 0.1427631676197052, + "learning_rate": 1.8312976061775512e-06, + "loss": 1.1073, + "step": 15042 + }, + { + "epoch": 0.9139680418008385, + "grad_norm": 0.18603037297725677, + "learning_rate": 1.8287314424439872e-06, + "loss": 1.1403, + "step": 15043 + }, + { + "epoch": 0.914028798833465, + "grad_norm": 0.2763344645500183, + "learning_rate": 1.8261670444375934e-06, + "loss": 1.2671, + "step": 15044 + }, + { + "epoch": 0.9140895558660915, + "grad_norm": 0.18840569257736206, + "learning_rate": 1.8236044122523389e-06, + "loss": 1.0861, + "step": 15045 + }, + { + "epoch": 0.9141503128987181, + "grad_norm": 0.19211970269680023, + "learning_rate": 1.8210435459821873e-06, + "loss": 1.0524, + "step": 15046 + }, + { + "epoch": 0.9142110699313446, + "grad_norm": 0.11699900031089783, + "learning_rate": 1.818484445720997e-06, + "loss": 1.0632, + "step": 15047 + }, + { + "epoch": 0.9142718269639711, + "grad_norm": 0.17466813325881958, + "learning_rate": 1.815927111562571e-06, + "loss": 1.1245, + "step": 15048 + }, + { + "epoch": 0.9143325839965976, + "grad_norm": 0.11343840509653091, + "learning_rate": 1.813371543600656e-06, + "loss": 1.0551, + "step": 15049 + }, + { + "epoch": 0.9143933410292241, + "grad_norm": 0.10502791404724121, + "learning_rate": 1.8108177419289162e-06, + "loss": 1.0623, + "step": 15050 + }, + { + "epoch": 0.9144540980618506, + "grad_norm": 0.2084721028804779, + "learning_rate": 1.808265706640966e-06, + "loss": 1.1051, + "step": 15051 + }, + { + "epoch": 0.9145148550944772, + "grad_norm": 0.1667078286409378, + "learning_rate": 1.805715437830352e-06, + "loss": 1.0962, + "step": 15052 + }, + { + "epoch": 0.9145756121271037, + "grad_norm": 0.1975841522216797, + "learning_rate": 1.8031669355905556e-06, + "loss": 1.0915, + "step": 15053 + }, + { + "epoch": 0.9146363691597302, + "grad_norm": 0.18476633727550507, + "learning_rate": 1.8006202000149963e-06, + "loss": 1.0782, + "step": 15054 + }, + { + "epoch": 0.9146971261923568, + "grad_norm": 0.5259315371513367, + "learning_rate": 1.7980752311970106e-06, + "loss": 1.0697, + "step": 15055 + }, + { + "epoch": 0.9147578832249833, + "grad_norm": 0.8986549377441406, + "learning_rate": 1.795532029229896e-06, + "loss": 1.1258, + "step": 15056 + }, + { + "epoch": 0.9148186402576098, + "grad_norm": 0.11824187636375427, + "learning_rate": 1.7929905942068836e-06, + "loss": 1.0232, + "step": 15057 + }, + { + "epoch": 0.9148793972902364, + "grad_norm": 0.14921750128269196, + "learning_rate": 1.79045092622111e-06, + "loss": 1.0841, + "step": 15058 + }, + { + "epoch": 0.9149401543228629, + "grad_norm": 0.31009456515312195, + "learning_rate": 1.7879130253656783e-06, + "loss": 1.2014, + "step": 15059 + }, + { + "epoch": 0.9150009113554894, + "grad_norm": 0.09869231283664703, + "learning_rate": 1.7853768917336144e-06, + "loss": 0.998, + "step": 15060 + }, + { + "epoch": 0.915061668388116, + "grad_norm": 0.17841672897338867, + "learning_rate": 1.7828425254178715e-06, + "loss": 1.159, + "step": 15061 + }, + { + "epoch": 0.9151224254207424, + "grad_norm": 0.20998667180538177, + "learning_rate": 1.7803099265113643e-06, + "loss": 1.0843, + "step": 15062 + }, + { + "epoch": 0.9151831824533689, + "grad_norm": 0.10971344262361526, + "learning_rate": 1.7777790951069185e-06, + "loss": 1.0163, + "step": 15063 + }, + { + "epoch": 0.9152439394859955, + "grad_norm": 0.4894367754459381, + "learning_rate": 1.7752500312973042e-06, + "loss": 1.1286, + "step": 15064 + }, + { + "epoch": 0.915304696518622, + "grad_norm": 11.082429885864258, + "learning_rate": 1.7727227351752195e-06, + "loss": 1.0615, + "step": 15065 + }, + { + "epoch": 0.9153654535512485, + "grad_norm": 0.2113294005393982, + "learning_rate": 1.7701972068332962e-06, + "loss": 1.0273, + "step": 15066 + }, + { + "epoch": 0.9154262105838751, + "grad_norm": 0.21624793112277985, + "learning_rate": 1.7676734463641209e-06, + "loss": 1.1947, + "step": 15067 + }, + { + "epoch": 0.9154869676165016, + "grad_norm": 0.12739229202270508, + "learning_rate": 1.7651514538602033e-06, + "loss": 1.0036, + "step": 15068 + }, + { + "epoch": 0.9155477246491281, + "grad_norm": 0.47212859988212585, + "learning_rate": 1.7626312294139801e-06, + "loss": 1.293, + "step": 15069 + }, + { + "epoch": 0.9156084816817547, + "grad_norm": 0.14549385011196136, + "learning_rate": 1.7601127731178335e-06, + "loss": 1.068, + "step": 15070 + }, + { + "epoch": 0.9156692387143812, + "grad_norm": 0.09311727434396744, + "learning_rate": 1.7575960850640783e-06, + "loss": 1.0015, + "step": 15071 + }, + { + "epoch": 0.9157299957470078, + "grad_norm": 0.12194705754518509, + "learning_rate": 1.7550811653449628e-06, + "loss": 1.0511, + "step": 15072 + }, + { + "epoch": 0.9157907527796343, + "grad_norm": 1.0025709867477417, + "learning_rate": 1.7525680140526746e-06, + "loss": 1.0395, + "step": 15073 + }, + { + "epoch": 0.9158515098122608, + "grad_norm": 0.13267354667186737, + "learning_rate": 1.7500566312793287e-06, + "loss": 1.1499, + "step": 15074 + }, + { + "epoch": 0.9159122668448872, + "grad_norm": 0.30322837829589844, + "learning_rate": 1.747547017116985e-06, + "loss": 1.1591, + "step": 15075 + }, + { + "epoch": 0.9159730238775138, + "grad_norm": 0.15233691036701202, + "learning_rate": 1.7450391716576308e-06, + "loss": 1.0302, + "step": 15076 + }, + { + "epoch": 0.9160337809101403, + "grad_norm": 0.8492075204849243, + "learning_rate": 1.7425330949931985e-06, + "loss": 1.0863, + "step": 15077 + }, + { + "epoch": 0.9160945379427669, + "grad_norm": 0.4539567828178406, + "learning_rate": 1.7400287872155362e-06, + "loss": 1.1518, + "step": 15078 + }, + { + "epoch": 0.9161552949753934, + "grad_norm": 0.16213977336883545, + "learning_rate": 1.7375262484164546e-06, + "loss": 1.1028, + "step": 15079 + }, + { + "epoch": 0.9162160520080199, + "grad_norm": 0.9916358590126038, + "learning_rate": 1.735025478687674e-06, + "loss": 1.0673, + "step": 15080 + }, + { + "epoch": 0.9162768090406465, + "grad_norm": 0.1475456804037094, + "learning_rate": 1.7325264781208606e-06, + "loss": 1.0185, + "step": 15081 + }, + { + "epoch": 0.916337566073273, + "grad_norm": 0.2631661593914032, + "learning_rate": 1.7300292468076186e-06, + "loss": 1.0192, + "step": 15082 + }, + { + "epoch": 0.9163983231058995, + "grad_norm": 0.8500600457191467, + "learning_rate": 1.7275337848394856e-06, + "loss": 1.0965, + "step": 15083 + }, + { + "epoch": 0.9164590801385261, + "grad_norm": 0.17283283174037933, + "learning_rate": 1.7250400923079447e-06, + "loss": 1.1046, + "step": 15084 + }, + { + "epoch": 0.9165198371711526, + "grad_norm": 0.2416338324546814, + "learning_rate": 1.7225481693043776e-06, + "loss": 1.0715, + "step": 15085 + }, + { + "epoch": 0.9165805942037791, + "grad_norm": 0.1734868586063385, + "learning_rate": 1.7200580159201396e-06, + "loss": 1.0368, + "step": 15086 + }, + { + "epoch": 0.9166413512364057, + "grad_norm": 0.1387193202972412, + "learning_rate": 1.7175696322465129e-06, + "loss": 1.1174, + "step": 15087 + }, + { + "epoch": 0.9167021082690321, + "grad_norm": 0.22944627702236176, + "learning_rate": 1.7150830183746968e-06, + "loss": 1.1043, + "step": 15088 + }, + { + "epoch": 0.9167628653016586, + "grad_norm": 0.25378623604774475, + "learning_rate": 1.7125981743958518e-06, + "loss": 1.0573, + "step": 15089 + }, + { + "epoch": 0.9168236223342852, + "grad_norm": 0.226588174700737, + "learning_rate": 1.7101151004010551e-06, + "loss": 1.0743, + "step": 15090 + }, + { + "epoch": 0.9168843793669117, + "grad_norm": 0.11758585274219513, + "learning_rate": 1.7076337964813226e-06, + "loss": 1.0468, + "step": 15091 + }, + { + "epoch": 0.9169451363995382, + "grad_norm": 0.1250493824481964, + "learning_rate": 1.7051542627276097e-06, + "loss": 1.1005, + "step": 15092 + }, + { + "epoch": 0.9170058934321648, + "grad_norm": 0.10729379206895828, + "learning_rate": 1.7026764992307987e-06, + "loss": 1.0467, + "step": 15093 + }, + { + "epoch": 0.9170666504647913, + "grad_norm": 0.1321740299463272, + "learning_rate": 1.7002005060817173e-06, + "loss": 1.0143, + "step": 15094 + }, + { + "epoch": 0.9171274074974178, + "grad_norm": 0.13206522166728973, + "learning_rate": 1.6977262833711205e-06, + "loss": 1.1253, + "step": 15095 + }, + { + "epoch": 0.9171881645300444, + "grad_norm": 0.11639541387557983, + "learning_rate": 1.695253831189708e-06, + "loss": 1.0193, + "step": 15096 + }, + { + "epoch": 0.9172489215626709, + "grad_norm": 0.20463216304779053, + "learning_rate": 1.6927831496281021e-06, + "loss": 1.2974, + "step": 15097 + }, + { + "epoch": 0.9173096785952974, + "grad_norm": 0.14341233670711517, + "learning_rate": 1.6903142387768634e-06, + "loss": 1.0719, + "step": 15098 + }, + { + "epoch": 0.917370435627924, + "grad_norm": 0.11572027951478958, + "learning_rate": 1.6878470987264971e-06, + "loss": 1.0303, + "step": 15099 + }, + { + "epoch": 0.9174311926605505, + "grad_norm": 0.1278015822172165, + "learning_rate": 1.6853817295674368e-06, + "loss": 1.066, + "step": 15100 + }, + { + "epoch": 0.9174919496931769, + "grad_norm": 0.1656336933374405, + "learning_rate": 1.6829181313900488e-06, + "loss": 1.167, + "step": 15101 + }, + { + "epoch": 0.9175527067258035, + "grad_norm": 0.13989870250225067, + "learning_rate": 1.6804563042846223e-06, + "loss": 1.1451, + "step": 15102 + }, + { + "epoch": 0.91761346375843, + "grad_norm": 0.09838593751192093, + "learning_rate": 1.6779962483414235e-06, + "loss": 1.0545, + "step": 15103 + }, + { + "epoch": 0.9176742207910565, + "grad_norm": 0.2986197769641876, + "learning_rate": 1.6755379636506141e-06, + "loss": 1.122, + "step": 15104 + }, + { + "epoch": 0.9177349778236831, + "grad_norm": 0.12212865054607391, + "learning_rate": 1.6730814503022997e-06, + "loss": 1.0263, + "step": 15105 + }, + { + "epoch": 0.9177957348563096, + "grad_norm": 0.17864882946014404, + "learning_rate": 1.6706267083865245e-06, + "loss": 1.1397, + "step": 15106 + }, + { + "epoch": 0.9178564918889361, + "grad_norm": 0.15472503006458282, + "learning_rate": 1.6681737379932616e-06, + "loss": 1.0232, + "step": 15107 + }, + { + "epoch": 0.9179172489215627, + "grad_norm": 0.11831034719944, + "learning_rate": 1.6657225392124387e-06, + "loss": 1.0462, + "step": 15108 + }, + { + "epoch": 0.9179780059541892, + "grad_norm": 0.16926069557666779, + "learning_rate": 1.6632731121339008e-06, + "loss": 1.1335, + "step": 15109 + }, + { + "epoch": 0.9180387629868157, + "grad_norm": 0.11622993648052216, + "learning_rate": 1.6608254568474313e-06, + "loss": 1.0223, + "step": 15110 + }, + { + "epoch": 0.9180995200194423, + "grad_norm": 0.1877569556236267, + "learning_rate": 1.6583795734427477e-06, + "loss": 1.0648, + "step": 15111 + }, + { + "epoch": 0.9181602770520688, + "grad_norm": 0.1548006385564804, + "learning_rate": 1.6559354620095003e-06, + "loss": 1.0551, + "step": 15112 + }, + { + "epoch": 0.9182210340846954, + "grad_norm": 5.163941383361816, + "learning_rate": 1.6534931226372896e-06, + "loss": 1.0314, + "step": 15113 + }, + { + "epoch": 0.9182817911173218, + "grad_norm": 13.354634284973145, + "learning_rate": 1.6510525554156275e-06, + "loss": 1.0812, + "step": 15114 + }, + { + "epoch": 0.9183425481499483, + "grad_norm": 0.21274954080581665, + "learning_rate": 1.6486137604339813e-06, + "loss": 1.101, + "step": 15115 + }, + { + "epoch": 0.9184033051825748, + "grad_norm": 0.14278192818164825, + "learning_rate": 1.646176737781746e-06, + "loss": 1.1057, + "step": 15116 + }, + { + "epoch": 0.9184640622152014, + "grad_norm": 0.1299707293510437, + "learning_rate": 1.6437414875482504e-06, + "loss": 1.0963, + "step": 15117 + }, + { + "epoch": 0.9185248192478279, + "grad_norm": 0.1664724498987198, + "learning_rate": 1.6413080098227561e-06, + "loss": 1.0759, + "step": 15118 + }, + { + "epoch": 0.9185855762804545, + "grad_norm": 0.17971952259540558, + "learning_rate": 1.6388763046944644e-06, + "loss": 1.2105, + "step": 15119 + }, + { + "epoch": 0.918646333313081, + "grad_norm": 0.13144651055335999, + "learning_rate": 1.6364463722525092e-06, + "loss": 1.0894, + "step": 15120 + }, + { + "epoch": 0.9187070903457075, + "grad_norm": 0.1569184511899948, + "learning_rate": 1.6340182125859583e-06, + "loss": 1.111, + "step": 15121 + }, + { + "epoch": 0.9187678473783341, + "grad_norm": 0.11527897417545319, + "learning_rate": 1.6315918257838236e-06, + "loss": 1.0228, + "step": 15122 + }, + { + "epoch": 0.9188286044109606, + "grad_norm": 7.612890243530273, + "learning_rate": 1.6291672119350288e-06, + "loss": 1.0654, + "step": 15123 + }, + { + "epoch": 0.9188893614435871, + "grad_norm": 0.11632709950208664, + "learning_rate": 1.6267443711284747e-06, + "loss": 1.0385, + "step": 15124 + }, + { + "epoch": 0.9189501184762137, + "grad_norm": 0.18938694894313812, + "learning_rate": 1.6243233034529515e-06, + "loss": 1.2582, + "step": 15125 + }, + { + "epoch": 0.9190108755088402, + "grad_norm": 0.332832008600235, + "learning_rate": 1.621904008997205e-06, + "loss": 1.1757, + "step": 15126 + }, + { + "epoch": 0.9190716325414666, + "grad_norm": 0.18409976363182068, + "learning_rate": 1.6194864878499193e-06, + "loss": 1.0493, + "step": 15127 + }, + { + "epoch": 0.9191323895740932, + "grad_norm": 0.13636156916618347, + "learning_rate": 1.617070740099702e-06, + "loss": 0.9828, + "step": 15128 + }, + { + "epoch": 0.9191931466067197, + "grad_norm": 0.2805340588092804, + "learning_rate": 1.6146567658351154e-06, + "loss": 1.154, + "step": 15129 + }, + { + "epoch": 0.9192539036393462, + "grad_norm": 0.11665313690900803, + "learning_rate": 1.6122445651446383e-06, + "loss": 1.0406, + "step": 15130 + }, + { + "epoch": 0.9193146606719728, + "grad_norm": 0.14811690151691437, + "learning_rate": 1.6098341381166949e-06, + "loss": 1.0485, + "step": 15131 + }, + { + "epoch": 0.9193754177045993, + "grad_norm": 0.08609217405319214, + "learning_rate": 1.607425484839631e-06, + "loss": 1.0399, + "step": 15132 + }, + { + "epoch": 0.9194361747372258, + "grad_norm": 0.2471848428249359, + "learning_rate": 1.6050186054017313e-06, + "loss": 1.1401, + "step": 15133 + }, + { + "epoch": 0.9194969317698524, + "grad_norm": 0.13908128440380096, + "learning_rate": 1.6026134998912367e-06, + "loss": 1.0849, + "step": 15134 + }, + { + "epoch": 0.9195576888024789, + "grad_norm": 0.1085067093372345, + "learning_rate": 1.600210168396299e-06, + "loss": 1.0475, + "step": 15135 + }, + { + "epoch": 0.9196184458351054, + "grad_norm": 0.19212636351585388, + "learning_rate": 1.597808611005014e-06, + "loss": 1.1038, + "step": 15136 + }, + { + "epoch": 0.919679202867732, + "grad_norm": 0.11599438637495041, + "learning_rate": 1.5954088278054114e-06, + "loss": 1.0308, + "step": 15137 + }, + { + "epoch": 0.9197399599003585, + "grad_norm": 0.10855875164270401, + "learning_rate": 1.5930108188854487e-06, + "loss": 1.0726, + "step": 15138 + }, + { + "epoch": 0.919800716932985, + "grad_norm": 0.1338091939687729, + "learning_rate": 1.590614584333039e-06, + "loss": 1.0135, + "step": 15139 + }, + { + "epoch": 0.9198614739656115, + "grad_norm": 0.27087607979774475, + "learning_rate": 1.5882201242360006e-06, + "loss": 1.1385, + "step": 15140 + }, + { + "epoch": 0.919922230998238, + "grad_norm": 0.15763407945632935, + "learning_rate": 1.5858274386821192e-06, + "loss": 1.1272, + "step": 15141 + }, + { + "epoch": 0.9199829880308645, + "grad_norm": 0.265626460313797, + "learning_rate": 1.5834365277590857e-06, + "loss": 1.0558, + "step": 15142 + }, + { + "epoch": 0.9200437450634911, + "grad_norm": 0.15396176278591156, + "learning_rate": 1.5810473915545466e-06, + "loss": 1.0715, + "step": 15143 + }, + { + "epoch": 0.9201045020961176, + "grad_norm": 0.1626933515071869, + "learning_rate": 1.578660030156076e-06, + "loss": 1.1258, + "step": 15144 + }, + { + "epoch": 0.9201652591287441, + "grad_norm": 0.16092416644096375, + "learning_rate": 1.5762744436511822e-06, + "loss": 1.0809, + "step": 15145 + }, + { + "epoch": 0.9202260161613707, + "grad_norm": 0.3362584412097931, + "learning_rate": 1.573890632127306e-06, + "loss": 1.2333, + "step": 15146 + }, + { + "epoch": 0.9202867731939972, + "grad_norm": 0.13535462319850922, + "learning_rate": 1.5715085956718278e-06, + "loss": 1.0837, + "step": 15147 + }, + { + "epoch": 0.9203475302266237, + "grad_norm": 0.11573890596628189, + "learning_rate": 1.5691283343720664e-06, + "loss": 1.0326, + "step": 15148 + }, + { + "epoch": 0.9204082872592503, + "grad_norm": 0.2805766463279724, + "learning_rate": 1.566749848315263e-06, + "loss": 1.0142, + "step": 15149 + }, + { + "epoch": 0.9204690442918768, + "grad_norm": 0.12610600888729095, + "learning_rate": 1.5643731375886094e-06, + "loss": 1.0439, + "step": 15150 + }, + { + "epoch": 0.9205298013245033, + "grad_norm": 0.1797090768814087, + "learning_rate": 1.5619982022792245e-06, + "loss": 1.1453, + "step": 15151 + }, + { + "epoch": 0.9205905583571299, + "grad_norm": 0.15865083038806915, + "learning_rate": 1.5596250424741553e-06, + "loss": 1.1232, + "step": 15152 + }, + { + "epoch": 0.9206513153897564, + "grad_norm": 0.12154293060302734, + "learning_rate": 1.5572536582603935e-06, + "loss": 1.047, + "step": 15153 + }, + { + "epoch": 0.9207120724223828, + "grad_norm": 0.25497084856033325, + "learning_rate": 1.5548840497248584e-06, + "loss": 1.1329, + "step": 15154 + }, + { + "epoch": 0.9207728294550094, + "grad_norm": 0.10914847999811172, + "learning_rate": 1.5525162169544138e-06, + "loss": 1.0099, + "step": 15155 + }, + { + "epoch": 0.9208335864876359, + "grad_norm": 0.12534913420677185, + "learning_rate": 1.550150160035857e-06, + "loss": 1.0252, + "step": 15156 + }, + { + "epoch": 0.9208943435202624, + "grad_norm": 0.12034306675195694, + "learning_rate": 1.5477858790559075e-06, + "loss": 1.0439, + "step": 15157 + }, + { + "epoch": 0.920955100552889, + "grad_norm": 0.13736814260482788, + "learning_rate": 1.5454233741012403e-06, + "loss": 1.059, + "step": 15158 + }, + { + "epoch": 0.9210158575855155, + "grad_norm": 0.6928398013114929, + "learning_rate": 1.5430626452584306e-06, + "loss": 1.0643, + "step": 15159 + }, + { + "epoch": 0.921076614618142, + "grad_norm": 0.12043861299753189, + "learning_rate": 1.5407036926140372e-06, + "loss": 0.984, + "step": 15160 + }, + { + "epoch": 0.9211373716507686, + "grad_norm": 0.15041232109069824, + "learning_rate": 1.538346516254513e-06, + "loss": 1.0959, + "step": 15161 + }, + { + "epoch": 0.9211981286833951, + "grad_norm": 26.66671371459961, + "learning_rate": 1.5359911162662667e-06, + "loss": 1.0281, + "step": 15162 + }, + { + "epoch": 0.9212588857160217, + "grad_norm": 0.11382798850536346, + "learning_rate": 1.5336374927356344e-06, + "loss": 1.0151, + "step": 15163 + }, + { + "epoch": 0.9213196427486482, + "grad_norm": 0.247624009847641, + "learning_rate": 1.5312856457488866e-06, + "loss": 1.149, + "step": 15164 + }, + { + "epoch": 0.9213803997812747, + "grad_norm": 0.4614062011241913, + "learning_rate": 1.5289355753922318e-06, + "loss": 1.2893, + "step": 15165 + }, + { + "epoch": 0.9214411568139013, + "grad_norm": 0.1196788027882576, + "learning_rate": 1.5265872817518123e-06, + "loss": 0.992, + "step": 15166 + }, + { + "epoch": 0.9215019138465277, + "grad_norm": 0.2365286499261856, + "learning_rate": 1.5242407649137036e-06, + "loss": 1.0595, + "step": 15167 + }, + { + "epoch": 0.9215626708791542, + "grad_norm": 0.1954217255115509, + "learning_rate": 1.5218960249639258e-06, + "loss": 1.0842, + "step": 15168 + }, + { + "epoch": 0.9216234279117808, + "grad_norm": 0.16624826192855835, + "learning_rate": 1.5195530619884158e-06, + "loss": 1.0679, + "step": 15169 + }, + { + "epoch": 0.9216841849444073, + "grad_norm": 0.11108061671257019, + "learning_rate": 1.5172118760730546e-06, + "loss": 1.0124, + "step": 15170 + }, + { + "epoch": 0.9217449419770338, + "grad_norm": 0.13399088382720947, + "learning_rate": 1.514872467303674e-06, + "loss": 1.0822, + "step": 15171 + }, + { + "epoch": 0.9218056990096604, + "grad_norm": 0.1440088152885437, + "learning_rate": 1.5125348357660108e-06, + "loss": 1.0829, + "step": 15172 + }, + { + "epoch": 0.9218664560422869, + "grad_norm": 0.17107906937599182, + "learning_rate": 1.5101989815457573e-06, + "loss": 1.0695, + "step": 15173 + }, + { + "epoch": 0.9219272130749134, + "grad_norm": 0.3680818974971771, + "learning_rate": 1.5078649047285288e-06, + "loss": 1.0326, + "step": 15174 + }, + { + "epoch": 0.92198797010754, + "grad_norm": 0.14400503039360046, + "learning_rate": 1.505532605399884e-06, + "loss": 1.1473, + "step": 15175 + }, + { + "epoch": 0.9220487271401665, + "grad_norm": 0.11141887307167053, + "learning_rate": 1.5032020836453164e-06, + "loss": 1.0741, + "step": 15176 + }, + { + "epoch": 0.922109484172793, + "grad_norm": 0.15904878079891205, + "learning_rate": 1.5008733395502517e-06, + "loss": 1.0601, + "step": 15177 + }, + { + "epoch": 0.9221702412054196, + "grad_norm": 0.11423798650503159, + "learning_rate": 1.4985463732000604e-06, + "loss": 1.056, + "step": 15178 + }, + { + "epoch": 0.9222309982380461, + "grad_norm": 0.11866994947195053, + "learning_rate": 1.4962211846800077e-06, + "loss": 1.0049, + "step": 15179 + }, + { + "epoch": 0.9222917552706725, + "grad_norm": 0.29792261123657227, + "learning_rate": 1.4938977740753535e-06, + "loss": 1.0376, + "step": 15180 + }, + { + "epoch": 0.9223525123032991, + "grad_norm": 0.12448982894420624, + "learning_rate": 1.4915761414712514e-06, + "loss": 0.9684, + "step": 15181 + }, + { + "epoch": 0.9224132693359256, + "grad_norm": 0.24090053141117096, + "learning_rate": 1.4892562869528059e-06, + "loss": 1.1001, + "step": 15182 + }, + { + "epoch": 0.9224740263685521, + "grad_norm": 0.1544858068227768, + "learning_rate": 1.4869382106050433e-06, + "loss": 1.0554, + "step": 15183 + }, + { + "epoch": 0.9225347834011787, + "grad_norm": 0.12713445723056793, + "learning_rate": 1.48462191251294e-06, + "loss": 1.0475, + "step": 15184 + }, + { + "epoch": 0.9225955404338052, + "grad_norm": 0.7496867179870605, + "learning_rate": 1.4823073927614006e-06, + "loss": 1.1893, + "step": 15185 + }, + { + "epoch": 0.9226562974664317, + "grad_norm": 0.1444765031337738, + "learning_rate": 1.479994651435257e-06, + "loss": 1.1527, + "step": 15186 + }, + { + "epoch": 0.9227170544990583, + "grad_norm": 0.1176825538277626, + "learning_rate": 1.4776836886192914e-06, + "loss": 1.0687, + "step": 15187 + }, + { + "epoch": 0.9227778115316848, + "grad_norm": 0.11566071212291718, + "learning_rate": 1.4753745043982081e-06, + "loss": 1.0357, + "step": 15188 + }, + { + "epoch": 0.9228385685643113, + "grad_norm": 0.14826469123363495, + "learning_rate": 1.4730670988566508e-06, + "loss": 1.0522, + "step": 15189 + }, + { + "epoch": 0.9228993255969379, + "grad_norm": 0.13102050125598907, + "learning_rate": 1.4707614720792018e-06, + "loss": 1.0462, + "step": 15190 + }, + { + "epoch": 0.9229600826295644, + "grad_norm": 0.3773258328437805, + "learning_rate": 1.4684576241503712e-06, + "loss": 1.1564, + "step": 15191 + }, + { + "epoch": 0.923020839662191, + "grad_norm": 0.16633334755897522, + "learning_rate": 1.4661555551546024e-06, + "loss": 1.0712, + "step": 15192 + }, + { + "epoch": 0.9230815966948174, + "grad_norm": 0.1298397034406662, + "learning_rate": 1.4638552651762893e-06, + "loss": 1.0648, + "step": 15193 + }, + { + "epoch": 0.9231423537274439, + "grad_norm": 0.11218418926000595, + "learning_rate": 1.4615567542997422e-06, + "loss": 1.0078, + "step": 15194 + }, + { + "epoch": 0.9232031107600704, + "grad_norm": 0.2105770856142044, + "learning_rate": 1.4592600226092045e-06, + "loss": 1.0128, + "step": 15195 + }, + { + "epoch": 0.923263867792697, + "grad_norm": 2.036931276321411, + "learning_rate": 1.4569650701888815e-06, + "loss": 1.0732, + "step": 15196 + }, + { + "epoch": 0.9233246248253235, + "grad_norm": 0.11545518040657043, + "learning_rate": 1.454671897122889e-06, + "loss": 1.0226, + "step": 15197 + }, + { + "epoch": 0.92338538185795, + "grad_norm": 0.5267545580863953, + "learning_rate": 1.4523805034952874e-06, + "loss": 1.4146, + "step": 15198 + }, + { + "epoch": 0.9234461388905766, + "grad_norm": 0.10423264652490616, + "learning_rate": 1.4500908893900544e-06, + "loss": 1.0388, + "step": 15199 + }, + { + "epoch": 0.9235068959232031, + "grad_norm": 0.1641158014535904, + "learning_rate": 1.447803054891117e-06, + "loss": 1.0857, + "step": 15200 + }, + { + "epoch": 0.9235676529558297, + "grad_norm": 0.17542433738708496, + "learning_rate": 1.4455170000823526e-06, + "loss": 1.0079, + "step": 15201 + }, + { + "epoch": 0.9236284099884562, + "grad_norm": 0.1571839153766632, + "learning_rate": 1.4432327250475497e-06, + "loss": 1.0978, + "step": 15202 + }, + { + "epoch": 0.9236891670210827, + "grad_norm": 0.15233772993087769, + "learning_rate": 1.4409502298704359e-06, + "loss": 1.0884, + "step": 15203 + }, + { + "epoch": 0.9237499240537093, + "grad_norm": 0.1004517450928688, + "learning_rate": 1.4386695146346774e-06, + "loss": 0.9797, + "step": 15204 + }, + { + "epoch": 0.9238106810863358, + "grad_norm": 0.1002735048532486, + "learning_rate": 1.4363905794238686e-06, + "loss": 1.0276, + "step": 15205 + }, + { + "epoch": 0.9238714381189622, + "grad_norm": 0.1188020408153534, + "learning_rate": 1.4341134243215593e-06, + "loss": 1.0357, + "step": 15206 + }, + { + "epoch": 0.9239321951515888, + "grad_norm": 0.11223286390304565, + "learning_rate": 1.431838049411205e-06, + "loss": 1.0715, + "step": 15207 + }, + { + "epoch": 0.9239929521842153, + "grad_norm": 7.940769195556641, + "learning_rate": 1.4295644547762166e-06, + "loss": 1.0322, + "step": 15208 + }, + { + "epoch": 0.9240537092168418, + "grad_norm": 0.2006407082080841, + "learning_rate": 1.4272926404999276e-06, + "loss": 1.0644, + "step": 15209 + }, + { + "epoch": 0.9241144662494684, + "grad_norm": 0.18006841838359833, + "learning_rate": 1.4250226066656213e-06, + "loss": 0.9936, + "step": 15210 + }, + { + "epoch": 0.9241752232820949, + "grad_norm": 0.13213543593883514, + "learning_rate": 1.422754353356498e-06, + "loss": 1.0732, + "step": 15211 + }, + { + "epoch": 0.9242359803147214, + "grad_norm": 0.10727523267269135, + "learning_rate": 1.420487880655702e-06, + "loss": 1.0815, + "step": 15212 + }, + { + "epoch": 0.924296737347348, + "grad_norm": 0.13576774299144745, + "learning_rate": 1.4182231886463171e-06, + "loss": 1.0189, + "step": 15213 + }, + { + "epoch": 0.9243574943799745, + "grad_norm": 0.6041025519371033, + "learning_rate": 1.415960277411349e-06, + "loss": 1.129, + "step": 15214 + }, + { + "epoch": 0.924418251412601, + "grad_norm": 0.1084340512752533, + "learning_rate": 1.413699147033748e-06, + "loss": 1.0276, + "step": 15215 + }, + { + "epoch": 0.9244790084452276, + "grad_norm": 0.5081831216812134, + "learning_rate": 1.4114397975963923e-06, + "loss": 1.0542, + "step": 15216 + }, + { + "epoch": 0.9245397654778541, + "grad_norm": 0.19882328808307648, + "learning_rate": 1.40918222918211e-06, + "loss": 1.0765, + "step": 15217 + }, + { + "epoch": 0.9246005225104806, + "grad_norm": 0.0956098809838295, + "learning_rate": 1.406926441873646e-06, + "loss": 1.0108, + "step": 15218 + }, + { + "epoch": 0.9246612795431071, + "grad_norm": 0.32133370637893677, + "learning_rate": 1.4046724357536846e-06, + "loss": 1.1457, + "step": 15219 + }, + { + "epoch": 0.9247220365757336, + "grad_norm": 0.11757557094097137, + "learning_rate": 1.402420210904848e-06, + "loss": 1.0127, + "step": 15220 + }, + { + "epoch": 0.9247827936083601, + "grad_norm": 0.1253536343574524, + "learning_rate": 1.400169767409687e-06, + "loss": 1.0741, + "step": 15221 + }, + { + "epoch": 0.9248435506409867, + "grad_norm": 0.10463772714138031, + "learning_rate": 1.3979211053506968e-06, + "loss": 0.9783, + "step": 15222 + }, + { + "epoch": 0.9249043076736132, + "grad_norm": 0.13048705458641052, + "learning_rate": 1.3956742248103117e-06, + "loss": 1.0796, + "step": 15223 + }, + { + "epoch": 0.9249650647062397, + "grad_norm": 2.9244537353515625, + "learning_rate": 1.3934291258708765e-06, + "loss": 1.2361, + "step": 15224 + }, + { + "epoch": 0.9250258217388663, + "grad_norm": 0.15533210337162018, + "learning_rate": 1.3911858086147033e-06, + "loss": 1.1036, + "step": 15225 + }, + { + "epoch": 0.9250865787714928, + "grad_norm": 0.2609361708164215, + "learning_rate": 1.3889442731239933e-06, + "loss": 1.1349, + "step": 15226 + }, + { + "epoch": 0.9251473358041193, + "grad_norm": 0.19171033799648285, + "learning_rate": 1.3867045194809358e-06, + "loss": 1.0601, + "step": 15227 + }, + { + "epoch": 0.9252080928367459, + "grad_norm": 0.1614377349615097, + "learning_rate": 1.384466547767621e-06, + "loss": 1.0036, + "step": 15228 + }, + { + "epoch": 0.9252688498693724, + "grad_norm": 0.12945087254047394, + "learning_rate": 1.3822303580660833e-06, + "loss": 1.0721, + "step": 15229 + }, + { + "epoch": 0.925329606901999, + "grad_norm": 0.13309133052825928, + "learning_rate": 1.379995950458285e-06, + "loss": 1.1972, + "step": 15230 + }, + { + "epoch": 0.9253903639346255, + "grad_norm": 0.1279413104057312, + "learning_rate": 1.377763325026138e-06, + "loss": 1.0477, + "step": 15231 + }, + { + "epoch": 0.9254511209672519, + "grad_norm": 0.1321583092212677, + "learning_rate": 1.3755324818514714e-06, + "loss": 1.1255, + "step": 15232 + }, + { + "epoch": 0.9255118779998784, + "grad_norm": 0.19626502692699432, + "learning_rate": 1.373303421016059e-06, + "loss": 1.0662, + "step": 15233 + }, + { + "epoch": 0.925572635032505, + "grad_norm": 0.13864190876483917, + "learning_rate": 1.3710761426016127e-06, + "loss": 1.0627, + "step": 15234 + }, + { + "epoch": 0.9256333920651315, + "grad_norm": 0.20498597621917725, + "learning_rate": 1.3688506466897733e-06, + "loss": 1.0984, + "step": 15235 + }, + { + "epoch": 0.925694149097758, + "grad_norm": 0.17506982386112213, + "learning_rate": 1.3666269333621084e-06, + "loss": 1.1093, + "step": 15236 + }, + { + "epoch": 0.9257549061303846, + "grad_norm": 0.12914767861366272, + "learning_rate": 1.364405002700131e-06, + "loss": 1.033, + "step": 15237 + }, + { + "epoch": 0.9258156631630111, + "grad_norm": 0.11264797300100327, + "learning_rate": 1.3621848547852978e-06, + "loss": 1.0989, + "step": 15238 + }, + { + "epoch": 0.9258764201956377, + "grad_norm": 0.20568512380123138, + "learning_rate": 1.3599664896989773e-06, + "loss": 1.0369, + "step": 15239 + }, + { + "epoch": 0.9259371772282642, + "grad_norm": 0.11404545605182648, + "learning_rate": 1.357749907522482e-06, + "loss": 1.0098, + "step": 15240 + }, + { + "epoch": 0.9259979342608907, + "grad_norm": 0.46397948265075684, + "learning_rate": 1.3555351083370694e-06, + "loss": 1.0878, + "step": 15241 + }, + { + "epoch": 0.9260586912935173, + "grad_norm": 0.3841884136199951, + "learning_rate": 1.3533220922239188e-06, + "loss": 1.0557, + "step": 15242 + }, + { + "epoch": 0.9261194483261438, + "grad_norm": 0.2260499745607376, + "learning_rate": 1.3511108592641487e-06, + "loss": 1.2119, + "step": 15243 + }, + { + "epoch": 0.9261802053587703, + "grad_norm": 17.858728408813477, + "learning_rate": 1.3489014095388163e-06, + "loss": 1.0558, + "step": 15244 + }, + { + "epoch": 0.9262409623913967, + "grad_norm": 0.3694033920764923, + "learning_rate": 1.346693743128913e-06, + "loss": 1.0222, + "step": 15245 + }, + { + "epoch": 0.9263017194240233, + "grad_norm": 1.4111883640289307, + "learning_rate": 1.344487860115351e-06, + "loss": 1.0564, + "step": 15246 + }, + { + "epoch": 0.9263624764566498, + "grad_norm": 0.1914466917514801, + "learning_rate": 1.3422837605789828e-06, + "loss": 1.1455, + "step": 15247 + }, + { + "epoch": 0.9264232334892764, + "grad_norm": 0.20472809672355652, + "learning_rate": 1.340081444600616e-06, + "loss": 1.0565, + "step": 15248 + }, + { + "epoch": 0.9264839905219029, + "grad_norm": 0.17095518112182617, + "learning_rate": 1.3378809122609693e-06, + "loss": 1.077, + "step": 15249 + }, + { + "epoch": 0.9265447475545294, + "grad_norm": 0.5156824588775635, + "learning_rate": 1.3356821636407059e-06, + "loss": 1.0825, + "step": 15250 + }, + { + "epoch": 0.926605504587156, + "grad_norm": 0.25496575236320496, + "learning_rate": 1.3334851988204167e-06, + "loss": 1.142, + "step": 15251 + }, + { + "epoch": 0.9266662616197825, + "grad_norm": 0.13751541078090668, + "learning_rate": 1.331290017880632e-06, + "loss": 1.0444, + "step": 15252 + }, + { + "epoch": 0.926727018652409, + "grad_norm": 0.11381416767835617, + "learning_rate": 1.3290966209018264e-06, + "loss": 1.0318, + "step": 15253 + }, + { + "epoch": 0.9267877756850356, + "grad_norm": 0.13319475948810577, + "learning_rate": 1.3269050079643851e-06, + "loss": 1.0916, + "step": 15254 + }, + { + "epoch": 0.9268485327176621, + "grad_norm": 0.11884255707263947, + "learning_rate": 1.3247151791486557e-06, + "loss": 1.0822, + "step": 15255 + }, + { + "epoch": 0.9269092897502886, + "grad_norm": 0.21788297593593597, + "learning_rate": 1.3225271345348956e-06, + "loss": 1.1315, + "step": 15256 + }, + { + "epoch": 0.9269700467829152, + "grad_norm": 0.10422495752573013, + "learning_rate": 1.320340874203313e-06, + "loss": 1.007, + "step": 15257 + }, + { + "epoch": 0.9270308038155417, + "grad_norm": 0.14656057953834534, + "learning_rate": 1.3181563982340438e-06, + "loss": 1.1066, + "step": 15258 + }, + { + "epoch": 0.9270915608481681, + "grad_norm": 0.22052647173404694, + "learning_rate": 1.3159737067071631e-06, + "loss": 1.0927, + "step": 15259 + }, + { + "epoch": 0.9271523178807947, + "grad_norm": 0.1543327420949936, + "learning_rate": 1.313792799702679e-06, + "loss": 1.0855, + "step": 15260 + }, + { + "epoch": 0.9272130749134212, + "grad_norm": 0.2059401124715805, + "learning_rate": 1.311613677300527e-06, + "loss": 1.1389, + "step": 15261 + }, + { + "epoch": 0.9272738319460477, + "grad_norm": 0.18106365203857422, + "learning_rate": 1.3094363395805887e-06, + "loss": 1.0509, + "step": 15262 + }, + { + "epoch": 0.9273345889786743, + "grad_norm": 0.11952624469995499, + "learning_rate": 1.307260786622666e-06, + "loss": 1.0256, + "step": 15263 + }, + { + "epoch": 0.9273953460113008, + "grad_norm": 0.1100388839840889, + "learning_rate": 1.3050870185065178e-06, + "loss": 1.0106, + "step": 15264 + }, + { + "epoch": 0.9274561030439273, + "grad_norm": 0.17127452790737152, + "learning_rate": 1.3029150353118247e-06, + "loss": 1.0707, + "step": 15265 + }, + { + "epoch": 0.9275168600765539, + "grad_norm": 0.12455254793167114, + "learning_rate": 1.3007448371181897e-06, + "loss": 1.0833, + "step": 15266 + }, + { + "epoch": 0.9275776171091804, + "grad_norm": 0.11031661182641983, + "learning_rate": 1.29857642400516e-06, + "loss": 1.0643, + "step": 15267 + }, + { + "epoch": 0.9276383741418069, + "grad_norm": 0.13668157160282135, + "learning_rate": 1.2964097960522225e-06, + "loss": 1.0638, + "step": 15268 + }, + { + "epoch": 0.9276991311744335, + "grad_norm": 0.1880815476179123, + "learning_rate": 1.2942449533388078e-06, + "loss": 1.0867, + "step": 15269 + }, + { + "epoch": 0.92775988820706, + "grad_norm": 0.13995423913002014, + "learning_rate": 1.2920818959442526e-06, + "loss": 1.056, + "step": 15270 + }, + { + "epoch": 0.9278206452396865, + "grad_norm": 0.17679435014724731, + "learning_rate": 1.2899206239478545e-06, + "loss": 1.0798, + "step": 15271 + }, + { + "epoch": 0.927881402272313, + "grad_norm": 0.11338049918413162, + "learning_rate": 1.287761137428828e-06, + "loss": 1.031, + "step": 15272 + }, + { + "epoch": 0.9279421593049395, + "grad_norm": 0.13410165905952454, + "learning_rate": 1.2856034364663372e-06, + "loss": 1.0715, + "step": 15273 + }, + { + "epoch": 0.928002916337566, + "grad_norm": 0.1318570226430893, + "learning_rate": 1.2834475211394692e-06, + "loss": 1.0474, + "step": 15274 + }, + { + "epoch": 0.9280636733701926, + "grad_norm": 0.11561030149459839, + "learning_rate": 1.281293391527244e-06, + "loss": 1.0798, + "step": 15275 + }, + { + "epoch": 0.9281244304028191, + "grad_norm": 0.14250938594341278, + "learning_rate": 1.2791410477086319e-06, + "loss": 1.0462, + "step": 15276 + }, + { + "epoch": 0.9281851874354456, + "grad_norm": 0.18573889136314392, + "learning_rate": 1.276990489762514e-06, + "loss": 1.0046, + "step": 15277 + }, + { + "epoch": 0.9282459444680722, + "grad_norm": 1.8279820680618286, + "learning_rate": 1.274841717767733e-06, + "loss": 1.0845, + "step": 15278 + }, + { + "epoch": 0.9283067015006987, + "grad_norm": 0.8750011920928955, + "learning_rate": 1.272694731803048e-06, + "loss": 1.1081, + "step": 15279 + }, + { + "epoch": 0.9283674585333253, + "grad_norm": 0.2108527570962906, + "learning_rate": 1.2705495319471573e-06, + "loss": 0.9953, + "step": 15280 + }, + { + "epoch": 0.9284282155659518, + "grad_norm": 0.2060713768005371, + "learning_rate": 1.2684061182786922e-06, + "loss": 1.0865, + "step": 15281 + }, + { + "epoch": 0.9284889725985783, + "grad_norm": 0.1269395649433136, + "learning_rate": 1.2662644908762179e-06, + "loss": 1.069, + "step": 15282 + }, + { + "epoch": 0.9285497296312049, + "grad_norm": 0.14588280022144318, + "learning_rate": 1.264124649818238e-06, + "loss": 1.0895, + "step": 15283 + }, + { + "epoch": 0.9286104866638314, + "grad_norm": 0.1509881317615509, + "learning_rate": 1.26198659518319e-06, + "loss": 1.1111, + "step": 15284 + }, + { + "epoch": 0.9286712436964578, + "grad_norm": 0.11926669627428055, + "learning_rate": 1.25985032704945e-06, + "loss": 0.9683, + "step": 15285 + }, + { + "epoch": 0.9287320007290844, + "grad_norm": 0.49089598655700684, + "learning_rate": 1.2577158454953108e-06, + "loss": 1.0998, + "step": 15286 + }, + { + "epoch": 0.9287927577617109, + "grad_norm": 0.1705758273601532, + "learning_rate": 1.2555831505990267e-06, + "loss": 1.0665, + "step": 15287 + }, + { + "epoch": 0.9288535147943374, + "grad_norm": 0.13350285589694977, + "learning_rate": 1.2534522424387518e-06, + "loss": 1.0435, + "step": 15288 + }, + { + "epoch": 0.928914271826964, + "grad_norm": 0.12272652238607407, + "learning_rate": 1.2513231210926124e-06, + "loss": 1.0346, + "step": 15289 + }, + { + "epoch": 0.9289750288595905, + "grad_norm": 0.22617629170417786, + "learning_rate": 1.2491957866386516e-06, + "loss": 1.1836, + "step": 15290 + }, + { + "epoch": 0.929035785892217, + "grad_norm": 0.14095793664455414, + "learning_rate": 1.2470702391548406e-06, + "loss": 1.1044, + "step": 15291 + }, + { + "epoch": 0.9290965429248436, + "grad_norm": 0.18492470681667328, + "learning_rate": 1.2449464787191e-06, + "loss": 1.2188, + "step": 15292 + }, + { + "epoch": 0.9291572999574701, + "grad_norm": 0.2547471225261688, + "learning_rate": 1.242824505409257e-06, + "loss": 1.2218, + "step": 15293 + }, + { + "epoch": 0.9292180569900966, + "grad_norm": 0.22714532911777496, + "learning_rate": 1.2407043193031098e-06, + "loss": 1.0555, + "step": 15294 + }, + { + "epoch": 0.9292788140227232, + "grad_norm": 0.3288191556930542, + "learning_rate": 1.2385859204783746e-06, + "loss": 1.0663, + "step": 15295 + }, + { + "epoch": 0.9293395710553497, + "grad_norm": 0.13069987297058105, + "learning_rate": 1.2364693090126999e-06, + "loss": 1.0489, + "step": 15296 + }, + { + "epoch": 0.9294003280879762, + "grad_norm": 0.10086257755756378, + "learning_rate": 1.234354484983663e-06, + "loss": 1.047, + "step": 15297 + }, + { + "epoch": 0.9294610851206027, + "grad_norm": 0.0976250097155571, + "learning_rate": 1.2322414484687904e-06, + "loss": 1.0175, + "step": 15298 + }, + { + "epoch": 0.9295218421532292, + "grad_norm": 0.13630108535289764, + "learning_rate": 1.2301301995455371e-06, + "loss": 1.0772, + "step": 15299 + }, + { + "epoch": 0.9295825991858557, + "grad_norm": 0.22394976019859314, + "learning_rate": 1.2280207382912855e-06, + "loss": 1.0533, + "step": 15300 + }, + { + "epoch": 0.9296433562184823, + "grad_norm": 0.1796313226222992, + "learning_rate": 1.2259130647833627e-06, + "loss": 1.1555, + "step": 15301 + }, + { + "epoch": 0.9297041132511088, + "grad_norm": 0.13305111229419708, + "learning_rate": 1.2238071790990236e-06, + "loss": 1.0099, + "step": 15302 + }, + { + "epoch": 0.9297648702837353, + "grad_norm": 0.15687556564807892, + "learning_rate": 1.2217030813154617e-06, + "loss": 1.1247, + "step": 15303 + }, + { + "epoch": 0.9298256273163619, + "grad_norm": 0.12067729979753494, + "learning_rate": 1.2196007715097934e-06, + "loss": 1.0598, + "step": 15304 + }, + { + "epoch": 0.9298863843489884, + "grad_norm": 0.12093468010425568, + "learning_rate": 1.2175002497591016e-06, + "loss": 1.0716, + "step": 15305 + }, + { + "epoch": 0.9299471413816149, + "grad_norm": 1.464052677154541, + "learning_rate": 1.2154015161403631e-06, + "loss": 0.9718, + "step": 15306 + }, + { + "epoch": 0.9300078984142415, + "grad_norm": 0.20744918286800385, + "learning_rate": 1.2133045707305112e-06, + "loss": 1.1986, + "step": 15307 + }, + { + "epoch": 0.930068655446868, + "grad_norm": 0.13889753818511963, + "learning_rate": 1.2112094136064067e-06, + "loss": 1.1535, + "step": 15308 + }, + { + "epoch": 0.9301294124794945, + "grad_norm": 0.2360927164554596, + "learning_rate": 1.209116044844849e-06, + "loss": 1.0099, + "step": 15309 + }, + { + "epoch": 0.9301901695121211, + "grad_norm": 0.45913657546043396, + "learning_rate": 1.2070244645225825e-06, + "loss": 1.2555, + "step": 15310 + }, + { + "epoch": 0.9302509265447475, + "grad_norm": 0.09064749628305435, + "learning_rate": 1.2049346727162626e-06, + "loss": 0.9981, + "step": 15311 + }, + { + "epoch": 0.930311683577374, + "grad_norm": 0.17532262206077576, + "learning_rate": 1.2028466695024942e-06, + "loss": 1.1039, + "step": 15312 + }, + { + "epoch": 0.9303724406100006, + "grad_norm": 0.1494452804327011, + "learning_rate": 1.2007604549578167e-06, + "loss": 1.076, + "step": 15313 + }, + { + "epoch": 0.9304331976426271, + "grad_norm": 0.21924225986003876, + "learning_rate": 1.1986760291586908e-06, + "loss": 1.142, + "step": 15314 + }, + { + "epoch": 0.9304939546752536, + "grad_norm": 0.09638996422290802, + "learning_rate": 1.1965933921815275e-06, + "loss": 1.0379, + "step": 15315 + }, + { + "epoch": 0.9305547117078802, + "grad_norm": 0.11173415929079056, + "learning_rate": 1.1945125441026718e-06, + "loss": 1.0251, + "step": 15316 + }, + { + "epoch": 0.9306154687405067, + "grad_norm": 0.12360332906246185, + "learning_rate": 1.1924334849983898e-06, + "loss": 1.0203, + "step": 15317 + }, + { + "epoch": 0.9306762257731332, + "grad_norm": 0.10610894858837128, + "learning_rate": 1.1903562149448988e-06, + "loss": 1.0322, + "step": 15318 + }, + { + "epoch": 0.9307369828057598, + "grad_norm": 0.10047803074121475, + "learning_rate": 1.1882807340183265e-06, + "loss": 0.9895, + "step": 15319 + }, + { + "epoch": 0.9307977398383863, + "grad_norm": 0.11373595893383026, + "learning_rate": 1.186207042294768e-06, + "loss": 1.0652, + "step": 15320 + }, + { + "epoch": 0.9308584968710129, + "grad_norm": 0.1170571967959404, + "learning_rate": 1.1841351398502177e-06, + "loss": 1.0474, + "step": 15321 + }, + { + "epoch": 0.9309192539036394, + "grad_norm": 0.16566219925880432, + "learning_rate": 1.182065026760637e-06, + "loss": 1.0283, + "step": 15322 + }, + { + "epoch": 0.9309800109362659, + "grad_norm": 0.9753913879394531, + "learning_rate": 1.1799967031018933e-06, + "loss": 1.2524, + "step": 15323 + }, + { + "epoch": 0.9310407679688923, + "grad_norm": 0.19697614014148712, + "learning_rate": 1.1779301689498091e-06, + "loss": 1.1466, + "step": 15324 + }, + { + "epoch": 0.9311015250015189, + "grad_norm": 0.09586262702941895, + "learning_rate": 1.1758654243801348e-06, + "loss": 1.0151, + "step": 15325 + }, + { + "epoch": 0.9311622820341454, + "grad_norm": 0.10872571170330048, + "learning_rate": 1.1738024694685435e-06, + "loss": 1.0008, + "step": 15326 + }, + { + "epoch": 0.931223039066772, + "grad_norm": 0.8605412840843201, + "learning_rate": 1.171741304290669e-06, + "loss": 1.1964, + "step": 15327 + }, + { + "epoch": 0.9312837960993985, + "grad_norm": 0.10982364416122437, + "learning_rate": 1.1696819289220507e-06, + "loss": 1.0272, + "step": 15328 + }, + { + "epoch": 0.931344553132025, + "grad_norm": 0.14758697152137756, + "learning_rate": 1.1676243434381783e-06, + "loss": 1.0833, + "step": 15329 + }, + { + "epoch": 0.9314053101646516, + "grad_norm": 0.1348530799150467, + "learning_rate": 1.165568547914475e-06, + "loss": 1.0688, + "step": 15330 + }, + { + "epoch": 0.9314660671972781, + "grad_norm": 0.16634732484817505, + "learning_rate": 1.1635145424262973e-06, + "loss": 1.1161, + "step": 15331 + }, + { + "epoch": 0.9315268242299046, + "grad_norm": 0.30785343050956726, + "learning_rate": 1.16146232704894e-06, + "loss": 1.1312, + "step": 15332 + }, + { + "epoch": 0.9315875812625312, + "grad_norm": 0.15048499405384064, + "learning_rate": 1.1594119018576155e-06, + "loss": 1.0915, + "step": 15333 + }, + { + "epoch": 0.9316483382951577, + "grad_norm": 0.7665331363677979, + "learning_rate": 1.157363266927486e-06, + "loss": 1.0658, + "step": 15334 + }, + { + "epoch": 0.9317090953277842, + "grad_norm": 0.10684623569250107, + "learning_rate": 1.1553164223336465e-06, + "loss": 1.0454, + "step": 15335 + }, + { + "epoch": 0.9317698523604108, + "grad_norm": 0.10057730972766876, + "learning_rate": 1.153271368151132e-06, + "loss": 1.028, + "step": 15336 + }, + { + "epoch": 0.9318306093930372, + "grad_norm": 0.21029414236545563, + "learning_rate": 1.1512281044548933e-06, + "loss": 1.1419, + "step": 15337 + }, + { + "epoch": 0.9318913664256637, + "grad_norm": 0.19378623366355896, + "learning_rate": 1.1491866313198318e-06, + "loss": 1.0342, + "step": 15338 + }, + { + "epoch": 0.9319521234582903, + "grad_norm": 0.19587653875350952, + "learning_rate": 1.1471469488207765e-06, + "loss": 1.0876, + "step": 15339 + }, + { + "epoch": 0.9320128804909168, + "grad_norm": 0.2780442535877228, + "learning_rate": 1.1451090570324896e-06, + "loss": 1.0296, + "step": 15340 + }, + { + "epoch": 0.9320736375235433, + "grad_norm": 0.33275866508483887, + "learning_rate": 1.1430729560296782e-06, + "loss": 1.1214, + "step": 15341 + }, + { + "epoch": 0.9321343945561699, + "grad_norm": 0.1182708591222763, + "learning_rate": 1.1410386458869715e-06, + "loss": 1.0326, + "step": 15342 + }, + { + "epoch": 0.9321951515887964, + "grad_norm": 0.16430284082889557, + "learning_rate": 1.139006126678932e-06, + "loss": 1.0288, + "step": 15343 + }, + { + "epoch": 0.9322559086214229, + "grad_norm": 2.208350896835327, + "learning_rate": 1.1369753984800724e-06, + "loss": 1.1012, + "step": 15344 + }, + { + "epoch": 0.9323166656540495, + "grad_norm": 0.17178237438201904, + "learning_rate": 1.1349464613648275e-06, + "loss": 1.0325, + "step": 15345 + }, + { + "epoch": 0.932377422686676, + "grad_norm": 0.12913119792938232, + "learning_rate": 1.1329193154075601e-06, + "loss": 1.0632, + "step": 15346 + }, + { + "epoch": 0.9324381797193025, + "grad_norm": 0.14151978492736816, + "learning_rate": 1.130893960682583e-06, + "loss": 1.0634, + "step": 15347 + }, + { + "epoch": 0.9324989367519291, + "grad_norm": 0.10698746889829636, + "learning_rate": 1.1288703972641313e-06, + "loss": 1.037, + "step": 15348 + }, + { + "epoch": 0.9325596937845556, + "grad_norm": 0.12261773645877838, + "learning_rate": 1.1268486252263844e-06, + "loss": 0.9918, + "step": 15349 + }, + { + "epoch": 0.932620450817182, + "grad_norm": 0.19454126060009003, + "learning_rate": 1.1248286446434498e-06, + "loss": 1.058, + "step": 15350 + }, + { + "epoch": 0.9326812078498086, + "grad_norm": 2.1978588104248047, + "learning_rate": 1.1228104555893625e-06, + "loss": 1.1101, + "step": 15351 + }, + { + "epoch": 0.9327419648824351, + "grad_norm": 0.16989226639270782, + "learning_rate": 1.1207940581381138e-06, + "loss": 1.1179, + "step": 15352 + }, + { + "epoch": 0.9328027219150616, + "grad_norm": 0.10813550651073456, + "learning_rate": 1.1187794523635997e-06, + "loss": 1.0238, + "step": 15353 + }, + { + "epoch": 0.9328634789476882, + "grad_norm": 0.10100190341472626, + "learning_rate": 1.116766638339678e-06, + "loss": 1.0579, + "step": 15354 + }, + { + "epoch": 0.9329242359803147, + "grad_norm": 0.10340183973312378, + "learning_rate": 1.114755616140123e-06, + "loss": 1.0037, + "step": 15355 + }, + { + "epoch": 0.9329849930129412, + "grad_norm": 0.13751733303070068, + "learning_rate": 1.1127463858386477e-06, + "loss": 1.0692, + "step": 15356 + }, + { + "epoch": 0.9330457500455678, + "grad_norm": 0.151703879237175, + "learning_rate": 1.1107389475089048e-06, + "loss": 1.0709, + "step": 15357 + }, + { + "epoch": 0.9331065070781943, + "grad_norm": 0.1182345524430275, + "learning_rate": 1.1087333012244737e-06, + "loss": 1.0159, + "step": 15358 + }, + { + "epoch": 0.9331672641108208, + "grad_norm": 0.1722113937139511, + "learning_rate": 1.1067294470588852e-06, + "loss": 1.0687, + "step": 15359 + }, + { + "epoch": 0.9332280211434474, + "grad_norm": 0.13528689742088318, + "learning_rate": 1.1047273850855689e-06, + "loss": 1.0531, + "step": 15360 + }, + { + "epoch": 0.9332887781760739, + "grad_norm": 0.10084126889705658, + "learning_rate": 1.1027271153779218e-06, + "loss": 0.9678, + "step": 15361 + }, + { + "epoch": 0.9333495352087005, + "grad_norm": 0.09768477827310562, + "learning_rate": 1.100728638009263e-06, + "loss": 1.0103, + "step": 15362 + }, + { + "epoch": 0.933410292241327, + "grad_norm": 0.11217644810676575, + "learning_rate": 1.0987319530528506e-06, + "loss": 1.0143, + "step": 15363 + }, + { + "epoch": 0.9334710492739534, + "grad_norm": 0.38295334577560425, + "learning_rate": 1.0967370605818705e-06, + "loss": 1.0166, + "step": 15364 + }, + { + "epoch": 0.93353180630658, + "grad_norm": 0.18921901285648346, + "learning_rate": 1.0947439606694532e-06, + "loss": 1.1728, + "step": 15365 + }, + { + "epoch": 0.9335925633392065, + "grad_norm": 1.356097936630249, + "learning_rate": 1.0927526533886401e-06, + "loss": 1.1471, + "step": 15366 + }, + { + "epoch": 0.933653320371833, + "grad_norm": 0.10401421785354614, + "learning_rate": 1.0907631388124339e-06, + "loss": 1.0296, + "step": 15367 + }, + { + "epoch": 0.9337140774044596, + "grad_norm": 0.1892024278640747, + "learning_rate": 1.0887754170137598e-06, + "loss": 1.1636, + "step": 15368 + }, + { + "epoch": 0.9337748344370861, + "grad_norm": 0.42177772521972656, + "learning_rate": 1.0867894880654762e-06, + "loss": 1.0814, + "step": 15369 + }, + { + "epoch": 0.9338355914697126, + "grad_norm": 0.15873512625694275, + "learning_rate": 1.08480535204038e-06, + "loss": 1.1007, + "step": 15370 + }, + { + "epoch": 0.9338963485023392, + "grad_norm": 1.713375210762024, + "learning_rate": 1.0828230090111969e-06, + "loss": 1.0644, + "step": 15371 + }, + { + "epoch": 0.9339571055349657, + "grad_norm": 0.14453130960464478, + "learning_rate": 1.080842459050596e-06, + "loss": 1.0471, + "step": 15372 + }, + { + "epoch": 0.9340178625675922, + "grad_norm": 0.32694312930107117, + "learning_rate": 1.0788637022311642e-06, + "loss": 1.1044, + "step": 15373 + }, + { + "epoch": 0.9340786196002188, + "grad_norm": 0.11188525706529617, + "learning_rate": 1.0768867386254434e-06, + "loss": 1.0735, + "step": 15374 + }, + { + "epoch": 0.9341393766328453, + "grad_norm": 0.3183089792728424, + "learning_rate": 1.0749115683058974e-06, + "loss": 1.2225, + "step": 15375 + }, + { + "epoch": 0.9342001336654718, + "grad_norm": 0.15144848823547363, + "learning_rate": 1.0729381913449244e-06, + "loss": 0.9922, + "step": 15376 + }, + { + "epoch": 0.9342608906980983, + "grad_norm": 0.21125288307666779, + "learning_rate": 1.0709666078148551e-06, + "loss": 1.0736, + "step": 15377 + }, + { + "epoch": 0.9343216477307248, + "grad_norm": 0.22160610556602478, + "learning_rate": 1.068996817787965e-06, + "loss": 1.2534, + "step": 15378 + }, + { + "epoch": 0.9343824047633513, + "grad_norm": 0.10727578401565552, + "learning_rate": 1.0670288213364577e-06, + "loss": 1.0171, + "step": 15379 + }, + { + "epoch": 0.9344431617959779, + "grad_norm": 0.1001729741692543, + "learning_rate": 1.065062618532464e-06, + "loss": 0.9914, + "step": 15380 + }, + { + "epoch": 0.9345039188286044, + "grad_norm": 0.09848897904157639, + "learning_rate": 1.06309820944806e-06, + "loss": 1.0295, + "step": 15381 + }, + { + "epoch": 0.9345646758612309, + "grad_norm": 0.1327124387025833, + "learning_rate": 1.0611355941552491e-06, + "loss": 1.0136, + "step": 15382 + }, + { + "epoch": 0.9346254328938575, + "grad_norm": 0.15134766697883606, + "learning_rate": 1.0591747727259739e-06, + "loss": 1.0681, + "step": 15383 + }, + { + "epoch": 0.934686189926484, + "grad_norm": 1.4509658813476562, + "learning_rate": 1.0572157452321097e-06, + "loss": 1.0549, + "step": 15384 + }, + { + "epoch": 0.9347469469591105, + "grad_norm": 0.11305056512355804, + "learning_rate": 1.0552585117454606e-06, + "loss": 1.0669, + "step": 15385 + }, + { + "epoch": 0.9348077039917371, + "grad_norm": 0.13011938333511353, + "learning_rate": 1.053303072337769e-06, + "loss": 1.0947, + "step": 15386 + }, + { + "epoch": 0.9348684610243636, + "grad_norm": 0.17125792801380157, + "learning_rate": 1.0513494270807168e-06, + "loss": 1.1389, + "step": 15387 + }, + { + "epoch": 0.9349292180569901, + "grad_norm": 0.11153468489646912, + "learning_rate": 1.049397576045913e-06, + "loss": 0.9935, + "step": 15388 + }, + { + "epoch": 0.9349899750896167, + "grad_norm": 0.16348513960838318, + "learning_rate": 1.047447519304906e-06, + "loss": 1.1284, + "step": 15389 + }, + { + "epoch": 0.9350507321222431, + "grad_norm": 0.29993122816085815, + "learning_rate": 1.0454992569291666e-06, + "loss": 1.1022, + "step": 15390 + }, + { + "epoch": 0.9351114891548696, + "grad_norm": 0.12200228124856949, + "learning_rate": 1.0435527889901208e-06, + "loss": 1.0583, + "step": 15391 + }, + { + "epoch": 0.9351722461874962, + "grad_norm": 0.21584726870059967, + "learning_rate": 1.0416081155591061e-06, + "loss": 1.1769, + "step": 15392 + }, + { + "epoch": 0.9352330032201227, + "grad_norm": 0.18006779253482819, + "learning_rate": 1.03966523670741e-06, + "loss": 1.1844, + "step": 15393 + }, + { + "epoch": 0.9352937602527492, + "grad_norm": 0.13245415687561035, + "learning_rate": 1.0377241525062531e-06, + "loss": 1.0496, + "step": 15394 + }, + { + "epoch": 0.9353545172853758, + "grad_norm": 0.25103774666786194, + "learning_rate": 1.035784863026773e-06, + "loss": 1.1934, + "step": 15395 + }, + { + "epoch": 0.9354152743180023, + "grad_norm": 0.16308242082595825, + "learning_rate": 1.0338473683400685e-06, + "loss": 1.1009, + "step": 15396 + }, + { + "epoch": 0.9354760313506288, + "grad_norm": 0.19510650634765625, + "learning_rate": 1.0319116685171492e-06, + "loss": 1.108, + "step": 15397 + }, + { + "epoch": 0.9355367883832554, + "grad_norm": 0.20332162082195282, + "learning_rate": 1.0299777636289808e-06, + "loss": 1.1157, + "step": 15398 + }, + { + "epoch": 0.9355975454158819, + "grad_norm": 0.14856691658496857, + "learning_rate": 1.02804565374644e-06, + "loss": 1.0897, + "step": 15399 + }, + { + "epoch": 0.9356583024485084, + "grad_norm": 0.09579699486494064, + "learning_rate": 1.0261153389403533e-06, + "loss": 0.9748, + "step": 15400 + }, + { + "epoch": 0.935719059481135, + "grad_norm": 0.15310057997703552, + "learning_rate": 1.0241868192814752e-06, + "loss": 1.2507, + "step": 15401 + }, + { + "epoch": 0.9357798165137615, + "grad_norm": 0.11506348848342896, + "learning_rate": 1.0222600948404882e-06, + "loss": 1.0618, + "step": 15402 + }, + { + "epoch": 0.9358405735463879, + "grad_norm": 0.0961810052394867, + "learning_rate": 1.02033516568803e-06, + "loss": 1.086, + "step": 15403 + }, + { + "epoch": 0.9359013305790145, + "grad_norm": 0.10804151743650436, + "learning_rate": 1.0184120318946554e-06, + "loss": 1.0123, + "step": 15404 + }, + { + "epoch": 0.935962087611641, + "grad_norm": 0.31057628989219666, + "learning_rate": 1.0164906935308526e-06, + "loss": 1.0942, + "step": 15405 + }, + { + "epoch": 0.9360228446442675, + "grad_norm": 0.12351847440004349, + "learning_rate": 1.0145711506670596e-06, + "loss": 1.003, + "step": 15406 + }, + { + "epoch": 0.9360836016768941, + "grad_norm": 0.1282394677400589, + "learning_rate": 1.0126534033736202e-06, + "loss": 1.1411, + "step": 15407 + }, + { + "epoch": 0.9361443587095206, + "grad_norm": 0.19097188115119934, + "learning_rate": 1.0107374517208447e-06, + "loss": 1.0953, + "step": 15408 + }, + { + "epoch": 0.9362051157421472, + "grad_norm": 0.20312348008155823, + "learning_rate": 1.0088232957789545e-06, + "loss": 1.0992, + "step": 15409 + }, + { + "epoch": 0.9362658727747737, + "grad_norm": 0.13771170377731323, + "learning_rate": 1.0069109356181162e-06, + "loss": 1.1084, + "step": 15410 + }, + { + "epoch": 0.9363266298074002, + "grad_norm": 0.10973066091537476, + "learning_rate": 1.005000371308429e-06, + "loss": 1.0339, + "step": 15411 + }, + { + "epoch": 0.9363873868400268, + "grad_norm": 0.12341954559087753, + "learning_rate": 1.0030916029199256e-06, + "loss": 1.0248, + "step": 15412 + }, + { + "epoch": 0.9364481438726533, + "grad_norm": 0.11936056613922119, + "learning_rate": 1.001184630522567e-06, + "loss": 1.0348, + "step": 15413 + }, + { + "epoch": 0.9365089009052798, + "grad_norm": 0.17877939343452454, + "learning_rate": 9.992794541862582e-07, + "loss": 1.0628, + "step": 15414 + }, + { + "epoch": 0.9365696579379064, + "grad_norm": 0.16074557602405548, + "learning_rate": 9.973760739808325e-07, + "loss": 1.1226, + "step": 15415 + }, + { + "epoch": 0.9366304149705328, + "grad_norm": 0.29493623971939087, + "learning_rate": 9.954744899760616e-07, + "loss": 1.2371, + "step": 15416 + }, + { + "epoch": 0.9366911720031593, + "grad_norm": 1.5070557594299316, + "learning_rate": 9.935747022416397e-07, + "loss": 1.0696, + "step": 15417 + }, + { + "epoch": 0.9367519290357859, + "grad_norm": 0.12263424694538116, + "learning_rate": 9.916767108472114e-07, + "loss": 1.0877, + "step": 15418 + }, + { + "epoch": 0.9368126860684124, + "grad_norm": 0.11993416398763657, + "learning_rate": 9.89780515862354e-07, + "loss": 1.0615, + "step": 15419 + }, + { + "epoch": 0.9368734431010389, + "grad_norm": 0.12146300077438354, + "learning_rate": 9.878861173565569e-07, + "loss": 1.0379, + "step": 15420 + }, + { + "epoch": 0.9369342001336655, + "grad_norm": 0.11891285330057144, + "learning_rate": 9.859935153992694e-07, + "loss": 1.0215, + "step": 15421 + }, + { + "epoch": 0.936994957166292, + "grad_norm": 0.14614197611808777, + "learning_rate": 9.841027100598699e-07, + "loss": 1.1099, + "step": 15422 + }, + { + "epoch": 0.9370557141989185, + "grad_norm": 0.20706407725811005, + "learning_rate": 9.82213701407647e-07, + "loss": 1.1478, + "step": 15423 + }, + { + "epoch": 0.9371164712315451, + "grad_norm": 0.19839854538440704, + "learning_rate": 9.80326489511868e-07, + "loss": 1.2426, + "step": 15424 + }, + { + "epoch": 0.9371772282641716, + "grad_norm": 0.158394455909729, + "learning_rate": 9.784410744416882e-07, + "loss": 1.1307, + "step": 15425 + }, + { + "epoch": 0.9372379852967981, + "grad_norm": 0.468840092420578, + "learning_rate": 9.765574562662362e-07, + "loss": 1.0582, + "step": 15426 + }, + { + "epoch": 0.9372987423294247, + "grad_norm": 0.25128600001335144, + "learning_rate": 9.746756350545449e-07, + "loss": 1.1505, + "step": 15427 + }, + { + "epoch": 0.9373594993620512, + "grad_norm": 0.10685040801763535, + "learning_rate": 9.727956108755875e-07, + "loss": 1.0198, + "step": 15428 + }, + { + "epoch": 0.9374202563946776, + "grad_norm": 0.09054891765117645, + "learning_rate": 9.709173837982866e-07, + "loss": 1.039, + "step": 15429 + }, + { + "epoch": 0.9374810134273042, + "grad_norm": 0.12794844806194305, + "learning_rate": 9.690409538914869e-07, + "loss": 1.0365, + "step": 15430 + }, + { + "epoch": 0.9375417704599307, + "grad_norm": 0.2145228534936905, + "learning_rate": 9.671663212239667e-07, + "loss": 1.0805, + "step": 15431 + }, + { + "epoch": 0.9376025274925572, + "grad_norm": 0.13872674107551575, + "learning_rate": 9.65293485864449e-07, + "loss": 1.093, + "step": 15432 + }, + { + "epoch": 0.9376632845251838, + "grad_norm": 0.09915322065353394, + "learning_rate": 9.63422447881568e-07, + "loss": 1.0321, + "step": 15433 + }, + { + "epoch": 0.9377240415578103, + "grad_norm": 0.19035974144935608, + "learning_rate": 9.615532073439238e-07, + "loss": 1.1623, + "step": 15434 + }, + { + "epoch": 0.9377847985904368, + "grad_norm": 0.10336155444383621, + "learning_rate": 9.59685764320023e-07, + "loss": 1.0581, + "step": 15435 + }, + { + "epoch": 0.9378455556230634, + "grad_norm": 3.234569549560547, + "learning_rate": 9.578201188783165e-07, + "loss": 1.1341, + "step": 15436 + }, + { + "epoch": 0.9379063126556899, + "grad_norm": 3.544027805328369, + "learning_rate": 9.55956271087194e-07, + "loss": 1.0675, + "step": 15437 + }, + { + "epoch": 0.9379670696883164, + "grad_norm": 0.14697876572608948, + "learning_rate": 9.540942210149783e-07, + "loss": 1.0665, + "step": 15438 + }, + { + "epoch": 0.938027826720943, + "grad_norm": 0.3609788119792938, + "learning_rate": 9.522339687299153e-07, + "loss": 1.1431, + "step": 15439 + }, + { + "epoch": 0.9380885837535695, + "grad_norm": 0.11817152053117752, + "learning_rate": 9.503755143002002e-07, + "loss": 1.0406, + "step": 15440 + }, + { + "epoch": 0.938149340786196, + "grad_norm": 0.16668759286403656, + "learning_rate": 9.485188577939508e-07, + "loss": 1.1371, + "step": 15441 + }, + { + "epoch": 0.9382100978188225, + "grad_norm": 2.793811321258545, + "learning_rate": 9.466639992792237e-07, + "loss": 1.0374, + "step": 15442 + }, + { + "epoch": 0.938270854851449, + "grad_norm": 0.1596498191356659, + "learning_rate": 9.448109388240089e-07, + "loss": 1.0866, + "step": 15443 + }, + { + "epoch": 0.9383316118840755, + "grad_norm": 0.12353494763374329, + "learning_rate": 9.429596764962245e-07, + "loss": 1.0382, + "step": 15444 + }, + { + "epoch": 0.9383923689167021, + "grad_norm": 0.11589280515909195, + "learning_rate": 9.411102123637439e-07, + "loss": 1.0551, + "step": 15445 + }, + { + "epoch": 0.9384531259493286, + "grad_norm": 42.043296813964844, + "learning_rate": 9.392625464943572e-07, + "loss": 1.0201, + "step": 15446 + }, + { + "epoch": 0.9385138829819551, + "grad_norm": 0.0957220047712326, + "learning_rate": 9.37416678955777e-07, + "loss": 1.1029, + "step": 15447 + }, + { + "epoch": 0.9385746400145817, + "grad_norm": 0.253751665353775, + "learning_rate": 9.355726098156714e-07, + "loss": 1.3224, + "step": 15448 + }, + { + "epoch": 0.9386353970472082, + "grad_norm": 0.11063911765813828, + "learning_rate": 9.337303391416363e-07, + "loss": 1.0602, + "step": 15449 + }, + { + "epoch": 0.9386961540798348, + "grad_norm": 0.1451398879289627, + "learning_rate": 9.318898670012011e-07, + "loss": 1.0168, + "step": 15450 + }, + { + "epoch": 0.9387569111124613, + "grad_norm": 0.09249144047498703, + "learning_rate": 9.300511934618284e-07, + "loss": 1.0056, + "step": 15451 + }, + { + "epoch": 0.9388176681450878, + "grad_norm": 0.17374804615974426, + "learning_rate": 9.282143185909087e-07, + "loss": 1.1407, + "step": 15452 + }, + { + "epoch": 0.9388784251777144, + "grad_norm": 0.19422796368598938, + "learning_rate": 9.263792424557882e-07, + "loss": 1.1103, + "step": 15453 + }, + { + "epoch": 0.9389391822103409, + "grad_norm": 0.09566016495227814, + "learning_rate": 9.245459651237132e-07, + "loss": 1.0108, + "step": 15454 + }, + { + "epoch": 0.9389999392429673, + "grad_norm": 0.10348668694496155, + "learning_rate": 9.227144866618964e-07, + "loss": 1.0474, + "step": 15455 + }, + { + "epoch": 0.9390606962755939, + "grad_norm": 0.16968800127506256, + "learning_rate": 9.208848071374676e-07, + "loss": 1.0663, + "step": 15456 + }, + { + "epoch": 0.9391214533082204, + "grad_norm": 0.1156466007232666, + "learning_rate": 9.190569266174898e-07, + "loss": 1.0593, + "step": 15457 + }, + { + "epoch": 0.9391822103408469, + "grad_norm": 0.1115623265504837, + "learning_rate": 9.172308451689704e-07, + "loss": 1.021, + "step": 15458 + }, + { + "epoch": 0.9392429673734735, + "grad_norm": 0.09910662472248077, + "learning_rate": 9.154065628588392e-07, + "loss": 1.0201, + "step": 15459 + }, + { + "epoch": 0.9393037244061, + "grad_norm": 0.13908688724040985, + "learning_rate": 9.135840797539708e-07, + "loss": 1.0597, + "step": 15460 + }, + { + "epoch": 0.9393644814387265, + "grad_norm": 0.10848038643598557, + "learning_rate": 9.117633959211668e-07, + "loss": 1.0248, + "step": 15461 + }, + { + "epoch": 0.9394252384713531, + "grad_norm": 0.12842628359794617, + "learning_rate": 9.099445114271632e-07, + "loss": 1.053, + "step": 15462 + }, + { + "epoch": 0.9394859955039796, + "grad_norm": 0.20164434611797333, + "learning_rate": 9.081274263386285e-07, + "loss": 1.1054, + "step": 15463 + }, + { + "epoch": 0.9395467525366061, + "grad_norm": 0.1131063848733902, + "learning_rate": 9.063121407221764e-07, + "loss": 1.0463, + "step": 15464 + }, + { + "epoch": 0.9396075095692327, + "grad_norm": 0.14073964953422546, + "learning_rate": 9.044986546443368e-07, + "loss": 1.0738, + "step": 15465 + }, + { + "epoch": 0.9396682666018592, + "grad_norm": 0.14815548062324524, + "learning_rate": 9.026869681715954e-07, + "loss": 1.0619, + "step": 15466 + }, + { + "epoch": 0.9397290236344857, + "grad_norm": 0.13954861462116241, + "learning_rate": 9.008770813703549e-07, + "loss": 1.0497, + "step": 15467 + }, + { + "epoch": 0.9397897806671123, + "grad_norm": 0.14102184772491455, + "learning_rate": 8.990689943069508e-07, + "loss": 1.1041, + "step": 15468 + }, + { + "epoch": 0.9398505376997387, + "grad_norm": 0.23908531665802002, + "learning_rate": 8.972627070476636e-07, + "loss": 1.1724, + "step": 15469 + }, + { + "epoch": 0.9399112947323652, + "grad_norm": 0.15082105994224548, + "learning_rate": 8.954582196587013e-07, + "loss": 1.0835, + "step": 15470 + }, + { + "epoch": 0.9399720517649918, + "grad_norm": 0.09962394833564758, + "learning_rate": 8.936555322062168e-07, + "loss": 1.0273, + "step": 15471 + }, + { + "epoch": 0.9400328087976183, + "grad_norm": 0.32642045617103577, + "learning_rate": 8.918546447562737e-07, + "loss": 1.1048, + "step": 15472 + }, + { + "epoch": 0.9400935658302448, + "grad_norm": 0.13679812848567963, + "learning_rate": 8.90055557374897e-07, + "loss": 1.0399, + "step": 15473 + }, + { + "epoch": 0.9401543228628714, + "grad_norm": 0.12505611777305603, + "learning_rate": 8.882582701280228e-07, + "loss": 1.036, + "step": 15474 + }, + { + "epoch": 0.9402150798954979, + "grad_norm": 0.14543262124061584, + "learning_rate": 8.864627830815375e-07, + "loss": 1.0111, + "step": 15475 + }, + { + "epoch": 0.9402758369281244, + "grad_norm": 0.10226338356733322, + "learning_rate": 8.846690963012494e-07, + "loss": 1.0437, + "step": 15476 + }, + { + "epoch": 0.940336593960751, + "grad_norm": 0.11086393892765045, + "learning_rate": 8.828772098529114e-07, + "loss": 1.0519, + "step": 15477 + }, + { + "epoch": 0.9403973509933775, + "grad_norm": 0.1930517852306366, + "learning_rate": 8.81087123802199e-07, + "loss": 1.1278, + "step": 15478 + }, + { + "epoch": 0.940458108026004, + "grad_norm": 0.1946292221546173, + "learning_rate": 8.792988382147372e-07, + "loss": 1.0244, + "step": 15479 + }, + { + "epoch": 0.9405188650586306, + "grad_norm": 0.11169382929801941, + "learning_rate": 8.775123531560736e-07, + "loss": 1.0799, + "step": 15480 + }, + { + "epoch": 0.9405796220912571, + "grad_norm": 0.12485785037279129, + "learning_rate": 8.757276686916837e-07, + "loss": 1.0749, + "step": 15481 + }, + { + "epoch": 0.9406403791238835, + "grad_norm": 0.119688481092453, + "learning_rate": 8.739447848869987e-07, + "loss": 1.0125, + "step": 15482 + }, + { + "epoch": 0.9407011361565101, + "grad_norm": 0.13595597445964813, + "learning_rate": 8.721637018073548e-07, + "loss": 1.0779, + "step": 15483 + }, + { + "epoch": 0.9407618931891366, + "grad_norm": 0.2306956648826599, + "learning_rate": 8.703844195180555e-07, + "loss": 1.0528, + "step": 15484 + }, + { + "epoch": 0.9408226502217631, + "grad_norm": 0.27294921875, + "learning_rate": 8.686069380843042e-07, + "loss": 1.1304, + "step": 15485 + }, + { + "epoch": 0.9408834072543897, + "grad_norm": 0.26540976762771606, + "learning_rate": 8.668312575712656e-07, + "loss": 1.0042, + "step": 15486 + }, + { + "epoch": 0.9409441642870162, + "grad_norm": 0.18844637274742126, + "learning_rate": 8.650573780440263e-07, + "loss": 1.0722, + "step": 15487 + }, + { + "epoch": 0.9410049213196428, + "grad_norm": 0.16316348314285278, + "learning_rate": 8.632852995676066e-07, + "loss": 1.0507, + "step": 15488 + }, + { + "epoch": 0.9410656783522693, + "grad_norm": 0.15009582042694092, + "learning_rate": 8.615150222069601e-07, + "loss": 1.1169, + "step": 15489 + }, + { + "epoch": 0.9411264353848958, + "grad_norm": 0.1508311778306961, + "learning_rate": 8.597465460269738e-07, + "loss": 1.0106, + "step": 15490 + }, + { + "epoch": 0.9411871924175224, + "grad_norm": 0.21480034291744232, + "learning_rate": 8.579798710924791e-07, + "loss": 1.1273, + "step": 15491 + }, + { + "epoch": 0.9412479494501489, + "grad_norm": 0.11603716760873795, + "learning_rate": 8.562149974682354e-07, + "loss": 1.081, + "step": 15492 + }, + { + "epoch": 0.9413087064827754, + "grad_norm": 0.1249760165810585, + "learning_rate": 8.544519252189354e-07, + "loss": 1.0765, + "step": 15493 + }, + { + "epoch": 0.941369463515402, + "grad_norm": 0.15420284867286682, + "learning_rate": 8.526906544091884e-07, + "loss": 1.1285, + "step": 15494 + }, + { + "epoch": 0.9414302205480284, + "grad_norm": 0.24884967505931854, + "learning_rate": 8.50931185103565e-07, + "loss": 1.0758, + "step": 15495 + }, + { + "epoch": 0.9414909775806549, + "grad_norm": 0.20638661086559296, + "learning_rate": 8.491735173665693e-07, + "loss": 1.1614, + "step": 15496 + }, + { + "epoch": 0.9415517346132815, + "grad_norm": 9.35626220703125, + "learning_rate": 8.474176512626108e-07, + "loss": 1.0402, + "step": 15497 + }, + { + "epoch": 0.941612491645908, + "grad_norm": 0.13855469226837158, + "learning_rate": 8.456635868560603e-07, + "loss": 1.1637, + "step": 15498 + }, + { + "epoch": 0.9416732486785345, + "grad_norm": 0.1717992126941681, + "learning_rate": 8.439113242112217e-07, + "loss": 1.116, + "step": 15499 + }, + { + "epoch": 0.9417340057111611, + "grad_norm": 0.26355358958244324, + "learning_rate": 8.421608633923051e-07, + "loss": 1.0636, + "step": 15500 + }, + { + "epoch": 0.9417947627437876, + "grad_norm": 0.12528984248638153, + "learning_rate": 8.404122044634865e-07, + "loss": 0.9809, + "step": 15501 + }, + { + "epoch": 0.9418555197764141, + "grad_norm": 0.184946209192276, + "learning_rate": 8.386653474888595e-07, + "loss": 1.1069, + "step": 15502 + }, + { + "epoch": 0.9419162768090407, + "grad_norm": 0.12299303710460663, + "learning_rate": 8.369202925324559e-07, + "loss": 1.0393, + "step": 15503 + }, + { + "epoch": 0.9419770338416672, + "grad_norm": 0.14128755033016205, + "learning_rate": 8.35177039658247e-07, + "loss": 1.001, + "step": 15504 + }, + { + "epoch": 0.9420377908742937, + "grad_norm": 0.35349786281585693, + "learning_rate": 8.33435588930126e-07, + "loss": 1.0806, + "step": 15505 + }, + { + "epoch": 0.9420985479069203, + "grad_norm": 0.2799130380153656, + "learning_rate": 8.316959404119252e-07, + "loss": 1.2533, + "step": 15506 + }, + { + "epoch": 0.9421593049395468, + "grad_norm": 0.254946231842041, + "learning_rate": 8.299580941674157e-07, + "loss": 1.0968, + "step": 15507 + }, + { + "epoch": 0.9422200619721732, + "grad_norm": 0.10920147597789764, + "learning_rate": 8.282220502602911e-07, + "loss": 1.051, + "step": 15508 + }, + { + "epoch": 0.9422808190047998, + "grad_norm": 1.0087721347808838, + "learning_rate": 8.264878087542005e-07, + "loss": 1.0609, + "step": 15509 + }, + { + "epoch": 0.9423415760374263, + "grad_norm": 0.15644152462482452, + "learning_rate": 8.247553697126986e-07, + "loss": 1.1413, + "step": 15510 + }, + { + "epoch": 0.9424023330700528, + "grad_norm": 0.08863753825426102, + "learning_rate": 8.230247331992902e-07, + "loss": 1.0224, + "step": 15511 + }, + { + "epoch": 0.9424630901026794, + "grad_norm": 0.17508651316165924, + "learning_rate": 8.212958992774189e-07, + "loss": 1.0603, + "step": 15512 + }, + { + "epoch": 0.9425238471353059, + "grad_norm": 0.17421424388885498, + "learning_rate": 8.195688680104618e-07, + "loss": 1.1268, + "step": 15513 + }, + { + "epoch": 0.9425846041679324, + "grad_norm": 0.3248521685600281, + "learning_rate": 8.178436394617073e-07, + "loss": 1.1632, + "step": 15514 + }, + { + "epoch": 0.942645361200559, + "grad_norm": 0.14074921607971191, + "learning_rate": 8.16120213694399e-07, + "loss": 1.0927, + "step": 15515 + }, + { + "epoch": 0.9427061182331855, + "grad_norm": 0.12300248444080353, + "learning_rate": 8.143985907717089e-07, + "loss": 1.0098, + "step": 15516 + }, + { + "epoch": 0.942766875265812, + "grad_norm": 0.6366365551948547, + "learning_rate": 8.126787707567473e-07, + "loss": 1.0995, + "step": 15517 + }, + { + "epoch": 0.9428276322984386, + "grad_norm": 0.17644186317920685, + "learning_rate": 8.109607537125585e-07, + "loss": 1.1225, + "step": 15518 + }, + { + "epoch": 0.9428883893310651, + "grad_norm": 0.14260059595108032, + "learning_rate": 8.092445397021142e-07, + "loss": 1.0811, + "step": 15519 + }, + { + "epoch": 0.9429491463636916, + "grad_norm": 0.14571037888526917, + "learning_rate": 8.075301287883141e-07, + "loss": 1.0701, + "step": 15520 + }, + { + "epoch": 0.9430099033963181, + "grad_norm": 0.14896683394908905, + "learning_rate": 8.058175210339968e-07, + "loss": 1.0904, + "step": 15521 + }, + { + "epoch": 0.9430706604289446, + "grad_norm": 0.09953810274600983, + "learning_rate": 8.041067165019567e-07, + "loss": 0.9973, + "step": 15522 + }, + { + "epoch": 0.9431314174615711, + "grad_norm": 0.10686226934194565, + "learning_rate": 8.023977152548934e-07, + "loss": 1.0325, + "step": 15523 + }, + { + "epoch": 0.9431921744941977, + "grad_norm": 0.1334177553653717, + "learning_rate": 8.006905173554513e-07, + "loss": 0.9956, + "step": 15524 + }, + { + "epoch": 0.9432529315268242, + "grad_norm": 0.15446703135967255, + "learning_rate": 7.989851228662082e-07, + "loss": 1.0661, + "step": 15525 + }, + { + "epoch": 0.9433136885594507, + "grad_norm": 0.13371652364730835, + "learning_rate": 7.972815318496807e-07, + "loss": 1.0973, + "step": 15526 + }, + { + "epoch": 0.9433744455920773, + "grad_norm": 0.12622101604938507, + "learning_rate": 7.955797443683022e-07, + "loss": 1.0525, + "step": 15527 + }, + { + "epoch": 0.9434352026247038, + "grad_norm": 0.08999328315258026, + "learning_rate": 7.938797604844616e-07, + "loss": 1.0099, + "step": 15528 + }, + { + "epoch": 0.9434959596573304, + "grad_norm": 0.12138425558805466, + "learning_rate": 7.921815802604704e-07, + "loss": 1.0302, + "step": 15529 + }, + { + "epoch": 0.9435567166899569, + "grad_norm": 0.11151651293039322, + "learning_rate": 7.904852037585731e-07, + "loss": 1.065, + "step": 15530 + }, + { + "epoch": 0.9436174737225834, + "grad_norm": 0.12263260781764984, + "learning_rate": 7.887906310409588e-07, + "loss": 1.0348, + "step": 15531 + }, + { + "epoch": 0.94367823075521, + "grad_norm": 0.11933889985084534, + "learning_rate": 7.870978621697222e-07, + "loss": 1.0526, + "step": 15532 + }, + { + "epoch": 0.9437389877878365, + "grad_norm": 0.1343223750591278, + "learning_rate": 7.854068972069418e-07, + "loss": 1.2137, + "step": 15533 + }, + { + "epoch": 0.9437997448204629, + "grad_norm": 0.25458455085754395, + "learning_rate": 7.837177362145786e-07, + "loss": 1.0393, + "step": 15534 + }, + { + "epoch": 0.9438605018530895, + "grad_norm": 0.11736898869276047, + "learning_rate": 7.820303792545558e-07, + "loss": 1.0824, + "step": 15535 + }, + { + "epoch": 0.943921258885716, + "grad_norm": 0.09872827678918839, + "learning_rate": 7.803448263887237e-07, + "loss": 1.0255, + "step": 15536 + }, + { + "epoch": 0.9439820159183425, + "grad_norm": 0.15799172222614288, + "learning_rate": 7.78661077678855e-07, + "loss": 1.1307, + "step": 15537 + }, + { + "epoch": 0.9440427729509691, + "grad_norm": 0.24274155497550964, + "learning_rate": 7.769791331866894e-07, + "loss": 1.1223, + "step": 15538 + }, + { + "epoch": 0.9441035299835956, + "grad_norm": 0.13727737963199615, + "learning_rate": 7.752989929738663e-07, + "loss": 1.0714, + "step": 15539 + }, + { + "epoch": 0.9441642870162221, + "grad_norm": 0.1504889577627182, + "learning_rate": 7.73620657101981e-07, + "loss": 1.0696, + "step": 15540 + }, + { + "epoch": 0.9442250440488487, + "grad_norm": 0.2271743267774582, + "learning_rate": 7.719441256325399e-07, + "loss": 1.1468, + "step": 15541 + }, + { + "epoch": 0.9442858010814752, + "grad_norm": 0.16605541110038757, + "learning_rate": 7.702693986269938e-07, + "loss": 1.1537, + "step": 15542 + }, + { + "epoch": 0.9443465581141017, + "grad_norm": 0.18032360076904297, + "learning_rate": 7.685964761467491e-07, + "loss": 1.0823, + "step": 15543 + }, + { + "epoch": 0.9444073151467283, + "grad_norm": 0.1634989231824875, + "learning_rate": 7.669253582531122e-07, + "loss": 1.0584, + "step": 15544 + }, + { + "epoch": 0.9444680721793548, + "grad_norm": 0.2044805884361267, + "learning_rate": 7.652560450073454e-07, + "loss": 1.1238, + "step": 15545 + }, + { + "epoch": 0.9445288292119813, + "grad_norm": 0.11974194645881653, + "learning_rate": 7.635885364706385e-07, + "loss": 1.0715, + "step": 15546 + }, + { + "epoch": 0.9445895862446078, + "grad_norm": 0.17538370192050934, + "learning_rate": 7.61922832704104e-07, + "loss": 1.0661, + "step": 15547 + }, + { + "epoch": 0.9446503432772343, + "grad_norm": 0.10671769827604294, + "learning_rate": 7.602589337688093e-07, + "loss": 1.0213, + "step": 15548 + }, + { + "epoch": 0.9447111003098608, + "grad_norm": 0.1608729511499405, + "learning_rate": 7.585968397257392e-07, + "loss": 1.1058, + "step": 15549 + }, + { + "epoch": 0.9447718573424874, + "grad_norm": 0.39861640334129333, + "learning_rate": 7.569365506358228e-07, + "loss": 1.0483, + "step": 15550 + }, + { + "epoch": 0.9448326143751139, + "grad_norm": 0.18744775652885437, + "learning_rate": 7.552780665599113e-07, + "loss": 0.9952, + "step": 15551 + }, + { + "epoch": 0.9448933714077404, + "grad_norm": 0.19755451381206512, + "learning_rate": 7.536213875588061e-07, + "loss": 1.0674, + "step": 15552 + }, + { + "epoch": 0.944954128440367, + "grad_norm": 0.140263631939888, + "learning_rate": 7.519665136932252e-07, + "loss": 1.0613, + "step": 15553 + }, + { + "epoch": 0.9450148854729935, + "grad_norm": 0.28901341557502747, + "learning_rate": 7.503134450238314e-07, + "loss": 1.3098, + "step": 15554 + }, + { + "epoch": 0.94507564250562, + "grad_norm": 0.13824376463890076, + "learning_rate": 7.486621816112149e-07, + "loss": 1.0958, + "step": 15555 + }, + { + "epoch": 0.9451363995382466, + "grad_norm": 0.16486625373363495, + "learning_rate": 7.470127235159108e-07, + "loss": 1.1009, + "step": 15556 + }, + { + "epoch": 0.9451971565708731, + "grad_norm": 0.1943669468164444, + "learning_rate": 7.453650707983761e-07, + "loss": 1.0475, + "step": 15557 + }, + { + "epoch": 0.9452579136034996, + "grad_norm": 0.17605499923229218, + "learning_rate": 7.437192235190016e-07, + "loss": 1.0456, + "step": 15558 + }, + { + "epoch": 0.9453186706361262, + "grad_norm": 0.19593696296215057, + "learning_rate": 7.420751817381222e-07, + "loss": 1.1297, + "step": 15559 + }, + { + "epoch": 0.9453794276687526, + "grad_norm": 0.19700486958026886, + "learning_rate": 7.404329455160008e-07, + "loss": 1.0603, + "step": 15560 + }, + { + "epoch": 0.9454401847013791, + "grad_norm": 0.14606669545173645, + "learning_rate": 7.387925149128283e-07, + "loss": 1.0968, + "step": 15561 + }, + { + "epoch": 0.9455009417340057, + "grad_norm": 0.17591744661331177, + "learning_rate": 7.371538899887398e-07, + "loss": 1.1222, + "step": 15562 + }, + { + "epoch": 0.9455616987666322, + "grad_norm": 0.20410175621509552, + "learning_rate": 7.355170708037928e-07, + "loss": 1.1199, + "step": 15563 + }, + { + "epoch": 0.9456224557992587, + "grad_norm": 0.18681199848651886, + "learning_rate": 7.33882057417995e-07, + "loss": 1.2142, + "step": 15564 + }, + { + "epoch": 0.9456832128318853, + "grad_norm": 0.1307850480079651, + "learning_rate": 7.322488498912705e-07, + "loss": 1.0561, + "step": 15565 + }, + { + "epoch": 0.9457439698645118, + "grad_norm": 0.1756134331226349, + "learning_rate": 7.306174482834938e-07, + "loss": 1.1029, + "step": 15566 + }, + { + "epoch": 0.9458047268971383, + "grad_norm": 0.21926729381084442, + "learning_rate": 7.289878526544503e-07, + "loss": 1.215, + "step": 15567 + }, + { + "epoch": 0.9458654839297649, + "grad_norm": 0.18988537788391113, + "learning_rate": 7.273600630638866e-07, + "loss": 1.3811, + "step": 15568 + }, + { + "epoch": 0.9459262409623914, + "grad_norm": 0.18372716009616852, + "learning_rate": 7.257340795714607e-07, + "loss": 1.0801, + "step": 15569 + }, + { + "epoch": 0.945986997995018, + "grad_norm": 0.13982757925987244, + "learning_rate": 7.241099022367803e-07, + "loss": 1.1232, + "step": 15570 + }, + { + "epoch": 0.9460477550276445, + "grad_norm": 0.19893155992031097, + "learning_rate": 7.224875311193757e-07, + "loss": 1.0736, + "step": 15571 + }, + { + "epoch": 0.946108512060271, + "grad_norm": 0.12165235728025436, + "learning_rate": 7.208669662787104e-07, + "loss": 1.0941, + "step": 15572 + }, + { + "epoch": 0.9461692690928976, + "grad_norm": 0.14123670756816864, + "learning_rate": 7.19248207774198e-07, + "loss": 1.046, + "step": 15573 + }, + { + "epoch": 0.946230026125524, + "grad_norm": 0.1697043925523758, + "learning_rate": 7.176312556651632e-07, + "loss": 1.0971, + "step": 15574 + }, + { + "epoch": 0.9462907831581505, + "grad_norm": 0.12591105699539185, + "learning_rate": 7.160161100108864e-07, + "loss": 1.1016, + "step": 15575 + }, + { + "epoch": 0.946351540190777, + "grad_norm": 0.14257919788360596, + "learning_rate": 7.144027708705592e-07, + "loss": 1.0616, + "step": 15576 + }, + { + "epoch": 0.9464122972234036, + "grad_norm": 0.10719078779220581, + "learning_rate": 7.127912383033286e-07, + "loss": 1.0762, + "step": 15577 + }, + { + "epoch": 0.9464730542560301, + "grad_norm": 0.1505705863237381, + "learning_rate": 7.111815123682641e-07, + "loss": 1.0204, + "step": 15578 + }, + { + "epoch": 0.9465338112886567, + "grad_norm": 0.10594255477190018, + "learning_rate": 7.09573593124363e-07, + "loss": 1.0437, + "step": 15579 + }, + { + "epoch": 0.9465945683212832, + "grad_norm": 0.16246521472930908, + "learning_rate": 7.07967480630578e-07, + "loss": 1.0342, + "step": 15580 + }, + { + "epoch": 0.9466553253539097, + "grad_norm": 0.11460207402706146, + "learning_rate": 7.063631749457677e-07, + "loss": 1.0481, + "step": 15581 + }, + { + "epoch": 0.9467160823865363, + "grad_norm": 0.1805431842803955, + "learning_rate": 7.047606761287406e-07, + "loss": 1.1141, + "step": 15582 + }, + { + "epoch": 0.9467768394191628, + "grad_norm": 0.26799359917640686, + "learning_rate": 7.031599842382386e-07, + "loss": 1.0467, + "step": 15583 + }, + { + "epoch": 0.9468375964517893, + "grad_norm": 0.32107555866241455, + "learning_rate": 7.015610993329424e-07, + "loss": 1.122, + "step": 15584 + }, + { + "epoch": 0.9468983534844159, + "grad_norm": 0.11507256329059601, + "learning_rate": 6.999640214714498e-07, + "loss": 1.0745, + "step": 15585 + }, + { + "epoch": 0.9469591105170424, + "grad_norm": 0.38523489236831665, + "learning_rate": 6.983687507123138e-07, + "loss": 1.2181, + "step": 15586 + }, + { + "epoch": 0.9470198675496688, + "grad_norm": 0.14577315747737885, + "learning_rate": 6.967752871139932e-07, + "loss": 1.0399, + "step": 15587 + }, + { + "epoch": 0.9470806245822954, + "grad_norm": 0.12074267119169235, + "learning_rate": 6.951836307349025e-07, + "loss": 1.0143, + "step": 15588 + }, + { + "epoch": 0.9471413816149219, + "grad_norm": 0.11242710798978806, + "learning_rate": 6.935937816333837e-07, + "loss": 1.0237, + "step": 15589 + }, + { + "epoch": 0.9472021386475484, + "grad_norm": 4.724917411804199, + "learning_rate": 6.920057398677238e-07, + "loss": 1.1851, + "step": 15590 + }, + { + "epoch": 0.947262895680175, + "grad_norm": 0.20853085815906525, + "learning_rate": 6.904195054961204e-07, + "loss": 1.1195, + "step": 15591 + }, + { + "epoch": 0.9473236527128015, + "grad_norm": 0.23358775675296783, + "learning_rate": 6.888350785767217e-07, + "loss": 1.1955, + "step": 15592 + }, + { + "epoch": 0.947384409745428, + "grad_norm": 0.13040676712989807, + "learning_rate": 6.872524591676089e-07, + "loss": 1.0172, + "step": 15593 + }, + { + "epoch": 0.9474451667780546, + "grad_norm": 0.15266183018684387, + "learning_rate": 6.856716473267854e-07, + "loss": 1.0712, + "step": 15594 + }, + { + "epoch": 0.9475059238106811, + "grad_norm": 0.3191109597682953, + "learning_rate": 6.840926431121997e-07, + "loss": 1.1593, + "step": 15595 + }, + { + "epoch": 0.9475666808433076, + "grad_norm": 0.1220596432685852, + "learning_rate": 6.82515446581733e-07, + "loss": 1.0495, + "step": 15596 + }, + { + "epoch": 0.9476274378759342, + "grad_norm": 0.7531089186668396, + "learning_rate": 6.809400577931946e-07, + "loss": 1.1295, + "step": 15597 + }, + { + "epoch": 0.9476881949085607, + "grad_norm": 0.44466760754585266, + "learning_rate": 6.793664768043273e-07, + "loss": 1.0922, + "step": 15598 + }, + { + "epoch": 0.9477489519411872, + "grad_norm": 0.5898516178131104, + "learning_rate": 6.777947036728182e-07, + "loss": 1.0571, + "step": 15599 + }, + { + "epoch": 0.9478097089738137, + "grad_norm": 0.11698181927204132, + "learning_rate": 6.762247384562825e-07, + "loss": 1.0672, + "step": 15600 + }, + { + "epoch": 0.9478704660064402, + "grad_norm": 0.9985892176628113, + "learning_rate": 6.746565812122574e-07, + "loss": 1.1468, + "step": 15601 + }, + { + "epoch": 0.9479312230390667, + "grad_norm": 1.2169983386993408, + "learning_rate": 6.730902319982302e-07, + "loss": 1.1957, + "step": 15602 + }, + { + "epoch": 0.9479919800716933, + "grad_norm": 0.6467977166175842, + "learning_rate": 6.715256908716161e-07, + "loss": 1.0532, + "step": 15603 + }, + { + "epoch": 0.9480527371043198, + "grad_norm": 1.040322184562683, + "learning_rate": 6.699629578897637e-07, + "loss": 0.9915, + "step": 15604 + }, + { + "epoch": 0.9481134941369463, + "grad_norm": 0.3417030870914459, + "learning_rate": 6.684020331099494e-07, + "loss": 1.205, + "step": 15605 + }, + { + "epoch": 0.9481742511695729, + "grad_norm": 0.1402425467967987, + "learning_rate": 6.668429165893997e-07, + "loss": 1.0995, + "step": 15606 + }, + { + "epoch": 0.9482350082021994, + "grad_norm": 0.17089398205280304, + "learning_rate": 6.652856083852632e-07, + "loss": 1.0777, + "step": 15607 + }, + { + "epoch": 0.948295765234826, + "grad_norm": 0.10627352446317673, + "learning_rate": 6.637301085546111e-07, + "loss": 1.0384, + "step": 15608 + }, + { + "epoch": 0.9483565222674525, + "grad_norm": 0.34206822514533997, + "learning_rate": 6.621764171544698e-07, + "loss": 1.0198, + "step": 15609 + }, + { + "epoch": 0.948417279300079, + "grad_norm": 0.10728035867214203, + "learning_rate": 6.606245342417882e-07, + "loss": 1.0321, + "step": 15610 + }, + { + "epoch": 0.9484780363327056, + "grad_norm": 0.10981914401054382, + "learning_rate": 6.590744598734544e-07, + "loss": 1.0015, + "step": 15611 + }, + { + "epoch": 0.9485387933653321, + "grad_norm": 0.7554893493652344, + "learning_rate": 6.575261941062838e-07, + "loss": 1.1793, + "step": 15612 + }, + { + "epoch": 0.9485995503979585, + "grad_norm": 0.1202106773853302, + "learning_rate": 6.559797369970256e-07, + "loss": 0.971, + "step": 15613 + }, + { + "epoch": 0.948660307430585, + "grad_norm": 0.1026490181684494, + "learning_rate": 6.544350886023675e-07, + "loss": 1.0654, + "step": 15614 + }, + { + "epoch": 0.9487210644632116, + "grad_norm": 0.19423659145832062, + "learning_rate": 6.528922489789313e-07, + "loss": 1.1146, + "step": 15615 + }, + { + "epoch": 0.9487818214958381, + "grad_norm": 0.16467399895191193, + "learning_rate": 6.513512181832659e-07, + "loss": 1.1322, + "step": 15616 + }, + { + "epoch": 0.9488425785284647, + "grad_norm": 0.13733038306236267, + "learning_rate": 6.498119962718652e-07, + "loss": 1.054, + "step": 15617 + }, + { + "epoch": 0.9489033355610912, + "grad_norm": 25.943130493164062, + "learning_rate": 6.482745833011394e-07, + "loss": 1.0367, + "step": 15618 + }, + { + "epoch": 0.9489640925937177, + "grad_norm": 0.12427359819412231, + "learning_rate": 6.467389793274547e-07, + "loss": 1.0664, + "step": 15619 + }, + { + "epoch": 0.9490248496263443, + "grad_norm": 0.1817326694726944, + "learning_rate": 6.452051844070883e-07, + "loss": 1.0023, + "step": 15620 + }, + { + "epoch": 0.9490856066589708, + "grad_norm": 0.14965835213661194, + "learning_rate": 6.436731985962674e-07, + "loss": 1.0867, + "step": 15621 + }, + { + "epoch": 0.9491463636915973, + "grad_norm": 0.21469591557979584, + "learning_rate": 6.421430219511415e-07, + "loss": 1.0936, + "step": 15622 + }, + { + "epoch": 0.9492071207242239, + "grad_norm": 0.20590290427207947, + "learning_rate": 6.406146545278102e-07, + "loss": 1.0878, + "step": 15623 + }, + { + "epoch": 0.9492678777568504, + "grad_norm": 0.21590803563594818, + "learning_rate": 6.390880963822842e-07, + "loss": 1.1184, + "step": 15624 + }, + { + "epoch": 0.9493286347894769, + "grad_norm": 0.09580983221530914, + "learning_rate": 6.375633475705245e-07, + "loss": 1.0025, + "step": 15625 + }, + { + "epoch": 0.9493893918221034, + "grad_norm": 0.19283616542816162, + "learning_rate": 6.36040408148425e-07, + "loss": 1.1458, + "step": 15626 + }, + { + "epoch": 0.9494501488547299, + "grad_norm": 0.14228251576423645, + "learning_rate": 6.345192781718134e-07, + "loss": 1.0288, + "step": 15627 + }, + { + "epoch": 0.9495109058873564, + "grad_norm": 0.10137403756380081, + "learning_rate": 6.329999576964341e-07, + "loss": 1.0268, + "step": 15628 + }, + { + "epoch": 0.949571662919983, + "grad_norm": 0.11061901599168777, + "learning_rate": 6.314824467779812e-07, + "loss": 1.0246, + "step": 15629 + }, + { + "epoch": 0.9496324199526095, + "grad_norm": 0.1095118448138237, + "learning_rate": 6.29966745472077e-07, + "loss": 1.0235, + "step": 15630 + }, + { + "epoch": 0.949693176985236, + "grad_norm": 0.11759944260120392, + "learning_rate": 6.284528538342937e-07, + "loss": 1.0645, + "step": 15631 + }, + { + "epoch": 0.9497539340178626, + "grad_norm": 0.4917198419570923, + "learning_rate": 6.269407719201092e-07, + "loss": 1.0552, + "step": 15632 + }, + { + "epoch": 0.9498146910504891, + "grad_norm": 0.11569838225841522, + "learning_rate": 6.254304997849624e-07, + "loss": 1.0497, + "step": 15633 + }, + { + "epoch": 0.9498754480831156, + "grad_norm": 0.17323362827301025, + "learning_rate": 6.239220374841981e-07, + "loss": 1.1177, + "step": 15634 + }, + { + "epoch": 0.9499362051157422, + "grad_norm": 0.10534697026014328, + "learning_rate": 6.224153850731107e-07, + "loss": 1.0024, + "step": 15635 + }, + { + "epoch": 0.9499969621483687, + "grad_norm": 0.3628701865673065, + "learning_rate": 6.209105426069395e-07, + "loss": 1.2407, + "step": 15636 + }, + { + "epoch": 0.9500577191809952, + "grad_norm": 0.12392721325159073, + "learning_rate": 6.194075101408348e-07, + "loss": 1.0336, + "step": 15637 + }, + { + "epoch": 0.9501184762136218, + "grad_norm": 0.12260594218969345, + "learning_rate": 6.179062877298969e-07, + "loss": 1.0222, + "step": 15638 + }, + { + "epoch": 0.9501792332462482, + "grad_norm": 0.17222723364830017, + "learning_rate": 6.16406875429143e-07, + "loss": 1.0944, + "step": 15639 + }, + { + "epoch": 0.9502399902788747, + "grad_norm": 0.13815884292125702, + "learning_rate": 6.149092732935457e-07, + "loss": 1.0336, + "step": 15640 + }, + { + "epoch": 0.9503007473115013, + "grad_norm": 0.22213789820671082, + "learning_rate": 6.13413481377989e-07, + "loss": 1.0869, + "step": 15641 + }, + { + "epoch": 0.9503615043441278, + "grad_norm": 0.14781610667705536, + "learning_rate": 6.119194997373123e-07, + "loss": 1.0634, + "step": 15642 + }, + { + "epoch": 0.9504222613767543, + "grad_norm": 0.11637907475233078, + "learning_rate": 6.104273284262718e-07, + "loss": 1.0912, + "step": 15643 + }, + { + "epoch": 0.9504830184093809, + "grad_norm": 0.11060317605733871, + "learning_rate": 6.089369674995626e-07, + "loss": 1.0476, + "step": 15644 + }, + { + "epoch": 0.9505437754420074, + "grad_norm": 0.14210262894630432, + "learning_rate": 6.074484170118188e-07, + "loss": 1.0488, + "step": 15645 + }, + { + "epoch": 0.950604532474634, + "grad_norm": 0.1596514880657196, + "learning_rate": 6.059616770175969e-07, + "loss": 1.0141, + "step": 15646 + }, + { + "epoch": 0.9506652895072605, + "grad_norm": 0.13412600755691528, + "learning_rate": 6.044767475714031e-07, + "loss": 1.1651, + "step": 15647 + }, + { + "epoch": 0.950726046539887, + "grad_norm": 0.14134304225444794, + "learning_rate": 6.029936287276605e-07, + "loss": 0.9804, + "step": 15648 + }, + { + "epoch": 0.9507868035725136, + "grad_norm": 0.1634296029806137, + "learning_rate": 6.015123205407313e-07, + "loss": 1.0712, + "step": 15649 + }, + { + "epoch": 0.9508475606051401, + "grad_norm": 0.10759787261486053, + "learning_rate": 6.000328230649166e-07, + "loss": 1.0393, + "step": 15650 + }, + { + "epoch": 0.9509083176377666, + "grad_norm": 0.09372863173484802, + "learning_rate": 5.98555136354445e-07, + "loss": 1.0099, + "step": 15651 + }, + { + "epoch": 0.950969074670393, + "grad_norm": 0.13221114873886108, + "learning_rate": 5.970792604634901e-07, + "loss": 1.0567, + "step": 15652 + }, + { + "epoch": 0.9510298317030196, + "grad_norm": 0.10997379571199417, + "learning_rate": 5.956051954461472e-07, + "loss": 1.057, + "step": 15653 + }, + { + "epoch": 0.9510905887356461, + "grad_norm": 0.15794794261455536, + "learning_rate": 5.941329413564401e-07, + "loss": 1.1131, + "step": 15654 + }, + { + "epoch": 0.9511513457682726, + "grad_norm": 0.11565219610929489, + "learning_rate": 5.926624982483364e-07, + "loss": 1.04, + "step": 15655 + }, + { + "epoch": 0.9512121028008992, + "grad_norm": 0.1022125706076622, + "learning_rate": 5.911938661757433e-07, + "loss": 0.9985, + "step": 15656 + }, + { + "epoch": 0.9512728598335257, + "grad_norm": 0.11516375094652176, + "learning_rate": 5.897270451924897e-07, + "loss": 1.0435, + "step": 15657 + }, + { + "epoch": 0.9513336168661523, + "grad_norm": 0.10048330575227737, + "learning_rate": 5.882620353523383e-07, + "loss": 0.977, + "step": 15658 + }, + { + "epoch": 0.9513943738987788, + "grad_norm": 0.143539696931839, + "learning_rate": 5.867988367089961e-07, + "loss": 1.0442, + "step": 15659 + }, + { + "epoch": 0.9514551309314053, + "grad_norm": 0.12086950242519379, + "learning_rate": 5.853374493160979e-07, + "loss": 1.0932, + "step": 15660 + }, + { + "epoch": 0.9515158879640319, + "grad_norm": 0.2632455825805664, + "learning_rate": 5.83877873227201e-07, + "loss": 1.0284, + "step": 15661 + }, + { + "epoch": 0.9515766449966584, + "grad_norm": 0.3406867980957031, + "learning_rate": 5.824201084958181e-07, + "loss": 1.0898, + "step": 15662 + }, + { + "epoch": 0.9516374020292849, + "grad_norm": 0.11601199954748154, + "learning_rate": 5.809641551753731e-07, + "loss": 1.0777, + "step": 15663 + }, + { + "epoch": 0.9516981590619115, + "grad_norm": 0.11150176078081131, + "learning_rate": 5.7951001331924e-07, + "loss": 1.0282, + "step": 15664 + }, + { + "epoch": 0.9517589160945379, + "grad_norm": 0.15512247383594513, + "learning_rate": 5.78057682980726e-07, + "loss": 1.1055, + "step": 15665 + }, + { + "epoch": 0.9518196731271644, + "grad_norm": 0.11280364543199539, + "learning_rate": 5.766071642130555e-07, + "loss": 0.9957, + "step": 15666 + }, + { + "epoch": 0.951880430159791, + "grad_norm": 0.11917781084775925, + "learning_rate": 5.751584570694024e-07, + "loss": 1.0478, + "step": 15667 + }, + { + "epoch": 0.9519411871924175, + "grad_norm": 0.10429950058460236, + "learning_rate": 5.737115616028687e-07, + "loss": 1.0165, + "step": 15668 + }, + { + "epoch": 0.952001944225044, + "grad_norm": 0.14690320193767548, + "learning_rate": 5.722664778664955e-07, + "loss": 1.0667, + "step": 15669 + }, + { + "epoch": 0.9520627012576706, + "grad_norm": 0.11742889136075974, + "learning_rate": 5.708232059132456e-07, + "loss": 1.0384, + "step": 15670 + }, + { + "epoch": 0.9521234582902971, + "grad_norm": 0.1140822321176529, + "learning_rate": 5.693817457960271e-07, + "loss": 1.0401, + "step": 15671 + }, + { + "epoch": 0.9521842153229236, + "grad_norm": 0.09306786954402924, + "learning_rate": 5.679420975676753e-07, + "loss": 1.031, + "step": 15672 + }, + { + "epoch": 0.9522449723555502, + "grad_norm": 0.1260322630405426, + "learning_rate": 5.66504261280959e-07, + "loss": 1.0305, + "step": 15673 + }, + { + "epoch": 0.9523057293881767, + "grad_norm": 0.273558109998703, + "learning_rate": 5.650682369885918e-07, + "loss": 1.1694, + "step": 15674 + }, + { + "epoch": 0.9523664864208032, + "grad_norm": 1.9666259288787842, + "learning_rate": 5.636340247431981e-07, + "loss": 1.0251, + "step": 15675 + }, + { + "epoch": 0.9524272434534298, + "grad_norm": 0.17853352427482605, + "learning_rate": 5.622016245973527e-07, + "loss": 1.1639, + "step": 15676 + }, + { + "epoch": 0.9524880004860563, + "grad_norm": 0.16596582531929016, + "learning_rate": 5.607710366035634e-07, + "loss": 1.1504, + "step": 15677 + }, + { + "epoch": 0.9525487575186828, + "grad_norm": 0.20105420053005219, + "learning_rate": 5.593422608142718e-07, + "loss": 1.1802, + "step": 15678 + }, + { + "epoch": 0.9526095145513093, + "grad_norm": 0.15446464717388153, + "learning_rate": 5.579152972818469e-07, + "loss": 1.0664, + "step": 15679 + }, + { + "epoch": 0.9526702715839358, + "grad_norm": 0.17659232020378113, + "learning_rate": 5.564901460585914e-07, + "loss": 1.1251, + "step": 15680 + }, + { + "epoch": 0.9527310286165623, + "grad_norm": 0.1474592089653015, + "learning_rate": 5.550668071967469e-07, + "loss": 1.0833, + "step": 15681 + }, + { + "epoch": 0.9527917856491889, + "grad_norm": 0.17089638113975525, + "learning_rate": 5.536452807484827e-07, + "loss": 1.0889, + "step": 15682 + }, + { + "epoch": 0.9528525426818154, + "grad_norm": 0.13901163637638092, + "learning_rate": 5.522255667659127e-07, + "loss": 1.018, + "step": 15683 + }, + { + "epoch": 0.9529132997144419, + "grad_norm": 0.19612663984298706, + "learning_rate": 5.508076653010729e-07, + "loss": 1.1671, + "step": 15684 + }, + { + "epoch": 0.9529740567470685, + "grad_norm": 0.14554022252559662, + "learning_rate": 5.493915764059332e-07, + "loss": 1.0425, + "step": 15685 + }, + { + "epoch": 0.953034813779695, + "grad_norm": 0.18441662192344666, + "learning_rate": 5.479773001324074e-07, + "loss": 1.2528, + "step": 15686 + }, + { + "epoch": 0.9530955708123215, + "grad_norm": 0.08343720436096191, + "learning_rate": 5.465648365323317e-07, + "loss": 0.9894, + "step": 15687 + }, + { + "epoch": 0.9531563278449481, + "grad_norm": 0.2077893167734146, + "learning_rate": 5.451541856574815e-07, + "loss": 1.1351, + "step": 15688 + }, + { + "epoch": 0.9532170848775746, + "grad_norm": 0.18338990211486816, + "learning_rate": 5.4374534755956e-07, + "loss": 1.1574, + "step": 15689 + }, + { + "epoch": 0.9532778419102012, + "grad_norm": 5.325088977813721, + "learning_rate": 5.4233832229022e-07, + "loss": 1.2211, + "step": 15690 + }, + { + "epoch": 0.9533385989428277, + "grad_norm": 0.15941472351551056, + "learning_rate": 5.409331099010206e-07, + "loss": 1.1056, + "step": 15691 + }, + { + "epoch": 0.9533993559754541, + "grad_norm": 0.21715497970581055, + "learning_rate": 5.395297104434815e-07, + "loss": 1.2403, + "step": 15692 + }, + { + "epoch": 0.9534601130080806, + "grad_norm": 0.19840890169143677, + "learning_rate": 5.381281239690394e-07, + "loss": 1.1974, + "step": 15693 + }, + { + "epoch": 0.9535208700407072, + "grad_norm": 0.16425780951976776, + "learning_rate": 5.36728350529081e-07, + "loss": 1.0497, + "step": 15694 + }, + { + "epoch": 0.9535816270733337, + "grad_norm": 0.1002884954214096, + "learning_rate": 5.353303901748985e-07, + "loss": 0.9873, + "step": 15695 + }, + { + "epoch": 0.9536423841059603, + "grad_norm": 0.15883687138557434, + "learning_rate": 5.339342429577398e-07, + "loss": 1.084, + "step": 15696 + }, + { + "epoch": 0.9537031411385868, + "grad_norm": 0.15514416992664337, + "learning_rate": 5.325399089287809e-07, + "loss": 1.084, + "step": 15697 + }, + { + "epoch": 0.9537638981712133, + "grad_norm": 0.1579744517803192, + "learning_rate": 5.311473881391415e-07, + "loss": 1.1398, + "step": 15698 + }, + { + "epoch": 0.9538246552038399, + "grad_norm": 0.10681905597448349, + "learning_rate": 5.297566806398535e-07, + "loss": 1.0223, + "step": 15699 + }, + { + "epoch": 0.9538854122364664, + "grad_norm": 0.9505147933959961, + "learning_rate": 5.283677864818981e-07, + "loss": 1.0676, + "step": 15700 + }, + { + "epoch": 0.9539461692690929, + "grad_norm": 0.12085028737783432, + "learning_rate": 5.269807057161847e-07, + "loss": 1.0438, + "step": 15701 + }, + { + "epoch": 0.9540069263017195, + "grad_norm": 0.15130169689655304, + "learning_rate": 5.255954383935502e-07, + "loss": 1.0802, + "step": 15702 + }, + { + "epoch": 0.954067683334346, + "grad_norm": 0.28984054923057556, + "learning_rate": 5.242119845647819e-07, + "loss": 1.0393, + "step": 15703 + }, + { + "epoch": 0.9541284403669725, + "grad_norm": 0.12057067453861237, + "learning_rate": 5.228303442805893e-07, + "loss": 1.0825, + "step": 15704 + }, + { + "epoch": 0.954189197399599, + "grad_norm": 0.1457018107175827, + "learning_rate": 5.214505175916096e-07, + "loss": 1.081, + "step": 15705 + }, + { + "epoch": 0.9542499544322255, + "grad_norm": 0.747618556022644, + "learning_rate": 5.2007250454843e-07, + "loss": 1.0604, + "step": 15706 + }, + { + "epoch": 0.954310711464852, + "grad_norm": 0.09438946098089218, + "learning_rate": 5.186963052015547e-07, + "loss": 1.0137, + "step": 15707 + }, + { + "epoch": 0.9543714684974786, + "grad_norm": 0.15344533324241638, + "learning_rate": 5.173219196014267e-07, + "loss": 1.0576, + "step": 15708 + }, + { + "epoch": 0.9544322255301051, + "grad_norm": 0.17992642521858215, + "learning_rate": 5.159493477984335e-07, + "loss": 1.0509, + "step": 15709 + }, + { + "epoch": 0.9544929825627316, + "grad_norm": 0.9288654327392578, + "learning_rate": 5.145785898428846e-07, + "loss": 1.1897, + "step": 15710 + }, + { + "epoch": 0.9545537395953582, + "grad_norm": 0.2120366394519806, + "learning_rate": 5.132096457850177e-07, + "loss": 1.0823, + "step": 15711 + }, + { + "epoch": 0.9546144966279847, + "grad_norm": 0.13646164536476135, + "learning_rate": 5.118425156750206e-07, + "loss": 1.0284, + "step": 15712 + }, + { + "epoch": 0.9546752536606112, + "grad_norm": 0.10246030241250992, + "learning_rate": 5.10477199562992e-07, + "loss": 1.0369, + "step": 15713 + }, + { + "epoch": 0.9547360106932378, + "grad_norm": 0.18350543081760406, + "learning_rate": 5.091136974990029e-07, + "loss": 1.1862, + "step": 15714 + }, + { + "epoch": 0.9547967677258643, + "grad_norm": 0.7761063575744629, + "learning_rate": 5.07752009533008e-07, + "loss": 1.0339, + "step": 15715 + }, + { + "epoch": 0.9548575247584908, + "grad_norm": 0.10499266535043716, + "learning_rate": 5.063921357149337e-07, + "loss": 1.0231, + "step": 15716 + }, + { + "epoch": 0.9549182817911174, + "grad_norm": 0.11632519960403442, + "learning_rate": 5.050340760946182e-07, + "loss": 1.0214, + "step": 15717 + }, + { + "epoch": 0.9549790388237438, + "grad_norm": 0.10571163892745972, + "learning_rate": 5.036778307218493e-07, + "loss": 1.009, + "step": 15718 + }, + { + "epoch": 0.9550397958563703, + "grad_norm": 0.11057620495557785, + "learning_rate": 5.023233996463317e-07, + "loss": 1.0243, + "step": 15719 + }, + { + "epoch": 0.9551005528889969, + "grad_norm": 0.19193731248378754, + "learning_rate": 5.009707829177257e-07, + "loss": 0.9708, + "step": 15720 + }, + { + "epoch": 0.9551613099216234, + "grad_norm": 0.18102526664733887, + "learning_rate": 4.996199805856028e-07, + "loss": 1.0966, + "step": 15721 + }, + { + "epoch": 0.9552220669542499, + "grad_norm": 0.15442179143428802, + "learning_rate": 4.982709926994733e-07, + "loss": 1.0796, + "step": 15722 + }, + { + "epoch": 0.9552828239868765, + "grad_norm": 0.2329256236553192, + "learning_rate": 4.969238193087867e-07, + "loss": 1.1888, + "step": 15723 + }, + { + "epoch": 0.955343581019503, + "grad_norm": 0.2275102138519287, + "learning_rate": 4.95578460462931e-07, + "loss": 1.1093, + "step": 15724 + }, + { + "epoch": 0.9554043380521295, + "grad_norm": 0.36220794916152954, + "learning_rate": 4.94234916211217e-07, + "loss": 1.034, + "step": 15725 + }, + { + "epoch": 0.9554650950847561, + "grad_norm": 0.10626080632209778, + "learning_rate": 4.928931866028885e-07, + "loss": 1.04, + "step": 15726 + }, + { + "epoch": 0.9555258521173826, + "grad_norm": 0.18155361711978912, + "learning_rate": 4.915532716871341e-07, + "loss": 1.1162, + "step": 15727 + }, + { + "epoch": 0.9555866091500091, + "grad_norm": 0.20560337603092194, + "learning_rate": 4.902151715130587e-07, + "loss": 1.133, + "step": 15728 + }, + { + "epoch": 0.9556473661826357, + "grad_norm": 0.15575452148914337, + "learning_rate": 4.888788861297178e-07, + "loss": 1.0354, + "step": 15729 + }, + { + "epoch": 0.9557081232152622, + "grad_norm": 0.12168926745653152, + "learning_rate": 4.875444155860886e-07, + "loss": 0.9853, + "step": 15730 + }, + { + "epoch": 0.9557688802478886, + "grad_norm": 0.1620974838733673, + "learning_rate": 4.862117599310933e-07, + "loss": 1.0908, + "step": 15731 + }, + { + "epoch": 0.9558296372805152, + "grad_norm": 0.10455036163330078, + "learning_rate": 4.848809192135761e-07, + "loss": 1.0511, + "step": 15732 + }, + { + "epoch": 0.9558903943131417, + "grad_norm": 0.16169939935207367, + "learning_rate": 4.835518934823203e-07, + "loss": 1.0938, + "step": 15733 + }, + { + "epoch": 0.9559511513457682, + "grad_norm": 0.12686842679977417, + "learning_rate": 4.822246827860422e-07, + "loss": 0.9902, + "step": 15734 + }, + { + "epoch": 0.9560119083783948, + "grad_norm": 1.6132800579071045, + "learning_rate": 4.80899287173392e-07, + "loss": 1.0305, + "step": 15735 + }, + { + "epoch": 0.9560726654110213, + "grad_norm": 0.44043534994125366, + "learning_rate": 4.795757066929419e-07, + "loss": 1.3478, + "step": 15736 + }, + { + "epoch": 0.9561334224436479, + "grad_norm": 0.36503586173057556, + "learning_rate": 4.782539413932252e-07, + "loss": 1.129, + "step": 15737 + }, + { + "epoch": 0.9561941794762744, + "grad_norm": 0.15852870047092438, + "learning_rate": 4.769339913226756e-07, + "loss": 1.0866, + "step": 15738 + }, + { + "epoch": 0.9562549365089009, + "grad_norm": 4.603458404541016, + "learning_rate": 4.756158565296875e-07, + "loss": 1.0553, + "step": 15739 + }, + { + "epoch": 0.9563156935415275, + "grad_norm": 0.9752864837646484, + "learning_rate": 4.7429953706257246e-07, + "loss": 1.0211, + "step": 15740 + }, + { + "epoch": 0.956376450574154, + "grad_norm": 0.12347756326198578, + "learning_rate": 4.7298503296958065e-07, + "loss": 1.0389, + "step": 15741 + }, + { + "epoch": 0.9564372076067805, + "grad_norm": 0.1566256731748581, + "learning_rate": 4.716723442988957e-07, + "loss": 1.1258, + "step": 15742 + }, + { + "epoch": 0.9564979646394071, + "grad_norm": 0.42548832297325134, + "learning_rate": 4.7036147109863484e-07, + "loss": 1.0828, + "step": 15743 + }, + { + "epoch": 0.9565587216720335, + "grad_norm": 0.4719665050506592, + "learning_rate": 4.690524134168428e-07, + "loss": 1.1058, + "step": 15744 + }, + { + "epoch": 0.95661947870466, + "grad_norm": 0.09681335836648941, + "learning_rate": 4.677451713015146e-07, + "loss": 1.0263, + "step": 15745 + }, + { + "epoch": 0.9566802357372866, + "grad_norm": 0.12808607518672943, + "learning_rate": 4.66439744800562e-07, + "loss": 1.0149, + "step": 15746 + }, + { + "epoch": 0.9567409927699131, + "grad_norm": 0.13159635663032532, + "learning_rate": 4.651361339618354e-07, + "loss": 1.0513, + "step": 15747 + }, + { + "epoch": 0.9568017498025396, + "grad_norm": 0.48745205998420715, + "learning_rate": 4.63834338833119e-07, + "loss": 1.0448, + "step": 15748 + }, + { + "epoch": 0.9568625068351662, + "grad_norm": 0.1470884084701538, + "learning_rate": 4.6253435946212454e-07, + "loss": 1.1253, + "step": 15749 + }, + { + "epoch": 0.9569232638677927, + "grad_norm": 1.147767186164856, + "learning_rate": 4.612361958965139e-07, + "loss": 1.0394, + "step": 15750 + }, + { + "epoch": 0.9569840209004192, + "grad_norm": 0.1831953525543213, + "learning_rate": 4.5993984818386015e-07, + "loss": 1.078, + "step": 15751 + }, + { + "epoch": 0.9570447779330458, + "grad_norm": 0.24117903411388397, + "learning_rate": 4.586453163716919e-07, + "loss": 1.2209, + "step": 15752 + }, + { + "epoch": 0.9571055349656723, + "grad_norm": 0.19036448001861572, + "learning_rate": 4.5735260050745464e-07, + "loss": 1.018, + "step": 15753 + }, + { + "epoch": 0.9571662919982988, + "grad_norm": 0.09763475507497787, + "learning_rate": 4.5606170063853816e-07, + "loss": 0.9756, + "step": 15754 + }, + { + "epoch": 0.9572270490309254, + "grad_norm": 0.2130412757396698, + "learning_rate": 4.54772616812249e-07, + "loss": 1.143, + "step": 15755 + }, + { + "epoch": 0.9572878060635519, + "grad_norm": 0.11960317939519882, + "learning_rate": 4.5348534907585507e-07, + "loss": 1.0223, + "step": 15756 + }, + { + "epoch": 0.9573485630961783, + "grad_norm": 0.47025609016418457, + "learning_rate": 4.5219989747652404e-07, + "loss": 1.0692, + "step": 15757 + }, + { + "epoch": 0.9574093201288049, + "grad_norm": 0.1826414167881012, + "learning_rate": 4.5091626206139047e-07, + "loss": 1.1241, + "step": 15758 + }, + { + "epoch": 0.9574700771614314, + "grad_norm": 0.1296650618314743, + "learning_rate": 4.496344428774946e-07, + "loss": 1.0579, + "step": 15759 + }, + { + "epoch": 0.9575308341940579, + "grad_norm": 0.35346952080726624, + "learning_rate": 4.483544399718265e-07, + "loss": 1.0649, + "step": 15760 + }, + { + "epoch": 0.9575915912266845, + "grad_norm": 0.08745156228542328, + "learning_rate": 4.4707625339130976e-07, + "loss": 1.06, + "step": 15761 + }, + { + "epoch": 0.957652348259311, + "grad_norm": 0.10423334687948227, + "learning_rate": 4.4579988318278477e-07, + "loss": 1.0251, + "step": 15762 + }, + { + "epoch": 0.9577131052919375, + "grad_norm": 0.12803082168102264, + "learning_rate": 4.4452532939304734e-07, + "loss": 1.0227, + "step": 15763 + }, + { + "epoch": 0.9577738623245641, + "grad_norm": 0.1084895208477974, + "learning_rate": 4.432525920688102e-07, + "loss": 1.0396, + "step": 15764 + }, + { + "epoch": 0.9578346193571906, + "grad_norm": 0.19573159515857697, + "learning_rate": 4.4198167125672485e-07, + "loss": 1.0564, + "step": 15765 + }, + { + "epoch": 0.9578953763898171, + "grad_norm": 1.038779854774475, + "learning_rate": 4.4071256700338737e-07, + "loss": 1.087, + "step": 15766 + }, + { + "epoch": 0.9579561334224437, + "grad_norm": 0.0839330330491066, + "learning_rate": 4.3944527935531053e-07, + "loss": 1.0192, + "step": 15767 + }, + { + "epoch": 0.9580168904550702, + "grad_norm": 0.2146690934896469, + "learning_rate": 4.381798083589461e-07, + "loss": 1.1401, + "step": 15768 + }, + { + "epoch": 0.9580776474876967, + "grad_norm": 0.14495867490768433, + "learning_rate": 4.3691615406067365e-07, + "loss": 1.0922, + "step": 15769 + }, + { + "epoch": 0.9581384045203232, + "grad_norm": 0.13484595715999603, + "learning_rate": 4.356543165068283e-07, + "loss": 1.0681, + "step": 15770 + }, + { + "epoch": 0.9581991615529497, + "grad_norm": 0.11026013642549515, + "learning_rate": 4.3439429574365086e-07, + "loss": 1.0064, + "step": 15771 + }, + { + "epoch": 0.9582599185855762, + "grad_norm": 0.1724824160337448, + "learning_rate": 4.331360918173322e-07, + "loss": 1.1284, + "step": 15772 + }, + { + "epoch": 0.9583206756182028, + "grad_norm": 0.10396581888198853, + "learning_rate": 4.3187970477399087e-07, + "loss": 1.0039, + "step": 15773 + }, + { + "epoch": 0.9583814326508293, + "grad_norm": 0.11033745855093002, + "learning_rate": 4.306251346596846e-07, + "loss": 1.0926, + "step": 15774 + }, + { + "epoch": 0.9584421896834558, + "grad_norm": 0.09618641436100006, + "learning_rate": 4.293723815203876e-07, + "loss": 1.0383, + "step": 15775 + }, + { + "epoch": 0.9585029467160824, + "grad_norm": 0.10459344834089279, + "learning_rate": 4.2812144540203545e-07, + "loss": 1.0, + "step": 15776 + }, + { + "epoch": 0.9585637037487089, + "grad_norm": 0.13060784339904785, + "learning_rate": 4.2687232635046923e-07, + "loss": 1.0797, + "step": 15777 + }, + { + "epoch": 0.9586244607813355, + "grad_norm": 0.16173116862773895, + "learning_rate": 4.256250244114801e-07, + "loss": 1.1165, + "step": 15778 + }, + { + "epoch": 0.958685217813962, + "grad_norm": 0.2411125749349594, + "learning_rate": 4.2437953963078704e-07, + "loss": 1.1622, + "step": 15779 + }, + { + "epoch": 0.9587459748465885, + "grad_norm": 0.11841461062431335, + "learning_rate": 4.231358720540479e-07, + "loss": 1.0184, + "step": 15780 + }, + { + "epoch": 0.9588067318792151, + "grad_norm": 0.2478577345609665, + "learning_rate": 4.2189402172684856e-07, + "loss": 1.3158, + "step": 15781 + }, + { + "epoch": 0.9588674889118416, + "grad_norm": 0.24367450177669525, + "learning_rate": 4.206539886947025e-07, + "loss": 1.0433, + "step": 15782 + }, + { + "epoch": 0.9589282459444681, + "grad_norm": 0.110954150557518, + "learning_rate": 4.194157730030679e-07, + "loss": 1.0466, + "step": 15783 + }, + { + "epoch": 0.9589890029770946, + "grad_norm": 0.10849586874246597, + "learning_rate": 4.1817937469733613e-07, + "loss": 1.0486, + "step": 15784 + }, + { + "epoch": 0.9590497600097211, + "grad_norm": 0.13907769322395325, + "learning_rate": 4.169447938228155e-07, + "loss": 1.0129, + "step": 15785 + }, + { + "epoch": 0.9591105170423476, + "grad_norm": 0.10487202554941177, + "learning_rate": 4.1571203042477523e-07, + "loss": 1.0686, + "step": 15786 + }, + { + "epoch": 0.9591712740749742, + "grad_norm": 0.15854674577713013, + "learning_rate": 4.144810845483904e-07, + "loss": 1.0877, + "step": 15787 + }, + { + "epoch": 0.9592320311076007, + "grad_norm": 0.11448860168457031, + "learning_rate": 4.1325195623879153e-07, + "loss": 1.0534, + "step": 15788 + }, + { + "epoch": 0.9592927881402272, + "grad_norm": 0.1065724790096283, + "learning_rate": 4.120246455410204e-07, + "loss": 1.0341, + "step": 15789 + }, + { + "epoch": 0.9593535451728538, + "grad_norm": 0.14623962342739105, + "learning_rate": 4.1079915250007427e-07, + "loss": 1.0399, + "step": 15790 + }, + { + "epoch": 0.9594143022054803, + "grad_norm": 0.12341058254241943, + "learning_rate": 4.0957547716086733e-07, + "loss": 1.0453, + "step": 15791 + }, + { + "epoch": 0.9594750592381068, + "grad_norm": 0.1551622897386551, + "learning_rate": 4.083536195682636e-07, + "loss": 1.0949, + "step": 15792 + }, + { + "epoch": 0.9595358162707334, + "grad_norm": 0.24149124324321747, + "learning_rate": 4.0713357976703834e-07, + "loss": 1.067, + "step": 15793 + }, + { + "epoch": 0.9595965733033599, + "grad_norm": 0.1642986536026001, + "learning_rate": 4.059153578019226e-07, + "loss": 1.098, + "step": 15794 + }, + { + "epoch": 0.9596573303359864, + "grad_norm": 0.09819808602333069, + "learning_rate": 4.0469895371756383e-07, + "loss": 0.9867, + "step": 15795 + }, + { + "epoch": 0.959718087368613, + "grad_norm": 0.17577776312828064, + "learning_rate": 4.034843675585487e-07, + "loss": 1.0885, + "step": 15796 + }, + { + "epoch": 0.9597788444012394, + "grad_norm": 0.13872236013412476, + "learning_rate": 4.022715993694026e-07, + "loss": 1.0661, + "step": 15797 + }, + { + "epoch": 0.9598396014338659, + "grad_norm": 0.1984054297208786, + "learning_rate": 4.0106064919457897e-07, + "loss": 1.0258, + "step": 15798 + }, + { + "epoch": 0.9599003584664925, + "grad_norm": 0.09982796758413315, + "learning_rate": 3.9985151707846444e-07, + "loss": 0.9987, + "step": 15799 + }, + { + "epoch": 0.959961115499119, + "grad_norm": 0.17996053397655487, + "learning_rate": 3.986442030653792e-07, + "loss": 1.1298, + "step": 15800 + }, + { + "epoch": 0.9600218725317455, + "grad_norm": 0.10261844843626022, + "learning_rate": 3.9743870719957664e-07, + "loss": 1.0492, + "step": 15801 + }, + { + "epoch": 0.9600826295643721, + "grad_norm": 0.11543924361467361, + "learning_rate": 3.962350295252493e-07, + "loss": 1.0396, + "step": 15802 + }, + { + "epoch": 0.9601433865969986, + "grad_norm": 0.22956831753253937, + "learning_rate": 3.9503317008651173e-07, + "loss": 1.1858, + "step": 15803 + }, + { + "epoch": 0.9602041436296251, + "grad_norm": 0.18623453378677368, + "learning_rate": 3.938331289274233e-07, + "loss": 1.112, + "step": 15804 + }, + { + "epoch": 0.9602649006622517, + "grad_norm": 0.17458507418632507, + "learning_rate": 3.9263490609197094e-07, + "loss": 1.1401, + "step": 15805 + }, + { + "epoch": 0.9603256576948782, + "grad_norm": 0.3694145679473877, + "learning_rate": 3.914385016240751e-07, + "loss": 1.0779, + "step": 15806 + }, + { + "epoch": 0.9603864147275047, + "grad_norm": 0.13405080139636993, + "learning_rate": 3.902439155675897e-07, + "loss": 1.0505, + "step": 15807 + }, + { + "epoch": 0.9604471717601313, + "grad_norm": 0.23359820246696472, + "learning_rate": 3.8905114796630724e-07, + "loss": 1.0955, + "step": 15808 + }, + { + "epoch": 0.9605079287927578, + "grad_norm": 0.10692767798900604, + "learning_rate": 3.878601988639374e-07, + "loss": 1.0553, + "step": 15809 + }, + { + "epoch": 0.9605686858253842, + "grad_norm": 0.47428059577941895, + "learning_rate": 3.8667106830414525e-07, + "loss": 1.2993, + "step": 15810 + }, + { + "epoch": 0.9606294428580108, + "grad_norm": 0.25792521238327026, + "learning_rate": 3.8548375633051247e-07, + "loss": 1.1253, + "step": 15811 + }, + { + "epoch": 0.9606901998906373, + "grad_norm": 0.1352255940437317, + "learning_rate": 3.8429826298656544e-07, + "loss": 1.0429, + "step": 15812 + }, + { + "epoch": 0.9607509569232638, + "grad_norm": 0.09801478683948517, + "learning_rate": 3.831145883157528e-07, + "loss": 1.0848, + "step": 15813 + }, + { + "epoch": 0.9608117139558904, + "grad_norm": 0.11662235110998154, + "learning_rate": 3.81932732361473e-07, + "loss": 1.0356, + "step": 15814 + }, + { + "epoch": 0.9608724709885169, + "grad_norm": 0.13122665882110596, + "learning_rate": 3.8075269516703596e-07, + "loss": 1.0, + "step": 15815 + }, + { + "epoch": 0.9609332280211434, + "grad_norm": 0.11306820809841156, + "learning_rate": 3.795744767756959e-07, + "loss": 1.0648, + "step": 15816 + }, + { + "epoch": 0.96099398505377, + "grad_norm": 0.22960704565048218, + "learning_rate": 3.783980772306517e-07, + "loss": 1.1373, + "step": 15817 + }, + { + "epoch": 0.9610547420863965, + "grad_norm": 0.1593761444091797, + "learning_rate": 3.772234965750132e-07, + "loss": 1.1185, + "step": 15818 + }, + { + "epoch": 0.961115499119023, + "grad_norm": 0.09501218795776367, + "learning_rate": 3.760507348518405e-07, + "loss": 1.0277, + "step": 15819 + }, + { + "epoch": 0.9611762561516496, + "grad_norm": 0.13905061781406403, + "learning_rate": 3.748797921041214e-07, + "loss": 1.0464, + "step": 15820 + }, + { + "epoch": 0.9612370131842761, + "grad_norm": 1.2092803716659546, + "learning_rate": 3.7371066837477154e-07, + "loss": 1.0348, + "step": 15821 + }, + { + "epoch": 0.9612977702169027, + "grad_norm": 0.21236439049243927, + "learning_rate": 3.7254336370665666e-07, + "loss": 1.0205, + "step": 15822 + }, + { + "epoch": 0.9613585272495291, + "grad_norm": 0.11326601356267929, + "learning_rate": 3.7137787814255365e-07, + "loss": 1.0609, + "step": 15823 + }, + { + "epoch": 0.9614192842821556, + "grad_norm": 0.1466970294713974, + "learning_rate": 3.702142117251839e-07, + "loss": 1.0726, + "step": 15824 + }, + { + "epoch": 0.9614800413147822, + "grad_norm": 0.18312968313694, + "learning_rate": 3.690523644972077e-07, + "loss": 1.1707, + "step": 15825 + }, + { + "epoch": 0.9615407983474087, + "grad_norm": 0.10370973497629166, + "learning_rate": 3.678923365012132e-07, + "loss": 0.987, + "step": 15826 + }, + { + "epoch": 0.9616015553800352, + "grad_norm": 0.12054158747196198, + "learning_rate": 3.667341277797165e-07, + "loss": 1.0349, + "step": 15827 + }, + { + "epoch": 0.9616623124126618, + "grad_norm": 0.09524430334568024, + "learning_rate": 3.6557773837517797e-07, + "loss": 1.0522, + "step": 15828 + }, + { + "epoch": 0.9617230694452883, + "grad_norm": 0.12918563187122345, + "learning_rate": 3.6442316832998035e-07, + "loss": 1.0663, + "step": 15829 + }, + { + "epoch": 0.9617838264779148, + "grad_norm": 0.13953079283237457, + "learning_rate": 3.6327041768643987e-07, + "loss": 1.0688, + "step": 15830 + }, + { + "epoch": 0.9618445835105414, + "grad_norm": 0.1647307574748993, + "learning_rate": 3.621194864868227e-07, + "loss": 1.0776, + "step": 15831 + }, + { + "epoch": 0.9619053405431679, + "grad_norm": 0.11573868989944458, + "learning_rate": 3.609703747733062e-07, + "loss": 1.0396, + "step": 15832 + }, + { + "epoch": 0.9619660975757944, + "grad_norm": 0.22192254662513733, + "learning_rate": 3.598230825880178e-07, + "loss": 1.0959, + "step": 15833 + }, + { + "epoch": 0.962026854608421, + "grad_norm": 0.10462997108697891, + "learning_rate": 3.586776099730127e-07, + "loss": 0.9694, + "step": 15834 + }, + { + "epoch": 0.9620876116410475, + "grad_norm": 0.13647164404392242, + "learning_rate": 3.57533956970274e-07, + "loss": 1.0748, + "step": 15835 + }, + { + "epoch": 0.9621483686736739, + "grad_norm": 0.10433114320039749, + "learning_rate": 3.5639212362172383e-07, + "loss": 1.0215, + "step": 15836 + }, + { + "epoch": 0.9622091257063005, + "grad_norm": 0.2004692107439041, + "learning_rate": 3.5525210996921187e-07, + "loss": 1.1962, + "step": 15837 + }, + { + "epoch": 0.962269882738927, + "grad_norm": 1.9959053993225098, + "learning_rate": 3.5411391605453257e-07, + "loss": 1.101, + "step": 15838 + }, + { + "epoch": 0.9623306397715535, + "grad_norm": 0.23181526362895966, + "learning_rate": 3.529775419194081e-07, + "loss": 1.0963, + "step": 15839 + }, + { + "epoch": 0.9623913968041801, + "grad_norm": 0.1325407177209854, + "learning_rate": 3.5184298760548296e-07, + "loss": 1.0725, + "step": 15840 + }, + { + "epoch": 0.9624521538368066, + "grad_norm": 0.19533170759677887, + "learning_rate": 3.507102531543516e-07, + "loss": 1.1787, + "step": 15841 + }, + { + "epoch": 0.9625129108694331, + "grad_norm": 31.75570297241211, + "learning_rate": 3.495793386075308e-07, + "loss": 1.091, + "step": 15842 + }, + { + "epoch": 0.9625736679020597, + "grad_norm": 0.13796232640743256, + "learning_rate": 3.484502440064818e-07, + "loss": 0.9974, + "step": 15843 + }, + { + "epoch": 0.9626344249346862, + "grad_norm": 0.08848493546247482, + "learning_rate": 3.473229693925828e-07, + "loss": 1.0555, + "step": 15844 + }, + { + "epoch": 0.9626951819673127, + "grad_norm": 0.09830200672149658, + "learning_rate": 3.461975148071561e-07, + "loss": 0.99, + "step": 15845 + }, + { + "epoch": 0.9627559389999393, + "grad_norm": 0.16677135229110718, + "learning_rate": 3.4507388029146324e-07, + "loss": 1.0793, + "step": 15846 + }, + { + "epoch": 0.9628166960325658, + "grad_norm": 0.10981838405132294, + "learning_rate": 3.439520658866824e-07, + "loss": 1.0254, + "step": 15847 + }, + { + "epoch": 0.9628774530651923, + "grad_norm": 0.14670252799987793, + "learning_rate": 3.4283207163393614e-07, + "loss": 1.0497, + "step": 15848 + }, + { + "epoch": 0.9629382100978188, + "grad_norm": 0.1654321849346161, + "learning_rate": 3.417138975742806e-07, + "loss": 1.0519, + "step": 15849 + }, + { + "epoch": 0.9629989671304453, + "grad_norm": 0.1254972219467163, + "learning_rate": 3.405975437486997e-07, + "loss": 0.9463, + "step": 15850 + }, + { + "epoch": 0.9630597241630718, + "grad_norm": 0.1553478091955185, + "learning_rate": 3.3948301019811056e-07, + "loss": 1.1441, + "step": 15851 + }, + { + "epoch": 0.9631204811956984, + "grad_norm": 0.14452053606510162, + "learning_rate": 3.383702969633751e-07, + "loss": 1.0105, + "step": 15852 + }, + { + "epoch": 0.9631812382283249, + "grad_norm": 0.17748470604419708, + "learning_rate": 3.372594040852772e-07, + "loss": 1.0594, + "step": 15853 + }, + { + "epoch": 0.9632419952609514, + "grad_norm": 0.19587081670761108, + "learning_rate": 3.3615033160453444e-07, + "loss": 1.2383, + "step": 15854 + }, + { + "epoch": 0.963302752293578, + "grad_norm": 0.09949034452438354, + "learning_rate": 3.350430795617976e-07, + "loss": 1.0072, + "step": 15855 + }, + { + "epoch": 0.9633635093262045, + "grad_norm": 0.1357887089252472, + "learning_rate": 3.339376479976619e-07, + "loss": 1.0904, + "step": 15856 + }, + { + "epoch": 0.963424266358831, + "grad_norm": 0.2644502520561218, + "learning_rate": 3.328340369526395e-07, + "loss": 1.1511, + "step": 15857 + }, + { + "epoch": 0.9634850233914576, + "grad_norm": 0.5562261343002319, + "learning_rate": 3.317322464671868e-07, + "loss": 1.1867, + "step": 15858 + }, + { + "epoch": 0.9635457804240841, + "grad_norm": 0.16169697046279907, + "learning_rate": 3.3063227658168827e-07, + "loss": 1.1308, + "step": 15859 + }, + { + "epoch": 0.9636065374567107, + "grad_norm": 0.11124769598245621, + "learning_rate": 3.295341273364616e-07, + "loss": 1.0593, + "step": 15860 + }, + { + "epoch": 0.9636672944893372, + "grad_norm": 0.16454099118709564, + "learning_rate": 3.2843779877176904e-07, + "loss": 1.0574, + "step": 15861 + }, + { + "epoch": 0.9637280515219636, + "grad_norm": 0.6228294968605042, + "learning_rate": 3.273432909277896e-07, + "loss": 1.0999, + "step": 15862 + }, + { + "epoch": 0.9637888085545901, + "grad_norm": 0.17999854683876038, + "learning_rate": 3.262506038446356e-07, + "loss": 1.1772, + "step": 15863 + }, + { + "epoch": 0.9638495655872167, + "grad_norm": 0.20635171234607697, + "learning_rate": 3.2515973756237496e-07, + "loss": 1.0948, + "step": 15864 + }, + { + "epoch": 0.9639103226198432, + "grad_norm": 0.12803366780281067, + "learning_rate": 3.240706921209813e-07, + "loss": 1.0711, + "step": 15865 + }, + { + "epoch": 0.9639710796524698, + "grad_norm": 0.12278302013874054, + "learning_rate": 3.229834675603838e-07, + "loss": 1.0304, + "step": 15866 + }, + { + "epoch": 0.9640318366850963, + "grad_norm": 0.09925037622451782, + "learning_rate": 3.2189806392042275e-07, + "loss": 1.0281, + "step": 15867 + }, + { + "epoch": 0.9640925937177228, + "grad_norm": 0.12721189856529236, + "learning_rate": 3.208144812408942e-07, + "loss": 1.0925, + "step": 15868 + }, + { + "epoch": 0.9641533507503494, + "grad_norm": 0.12747856974601746, + "learning_rate": 3.197327195615163e-07, + "loss": 1.0564, + "step": 15869 + }, + { + "epoch": 0.9642141077829759, + "grad_norm": 0.19266590476036072, + "learning_rate": 3.1865277892193515e-07, + "loss": 1.1516, + "step": 15870 + }, + { + "epoch": 0.9642748648156024, + "grad_norm": 0.127415731549263, + "learning_rate": 3.1757465936174127e-07, + "loss": 1.0182, + "step": 15871 + }, + { + "epoch": 0.964335621848229, + "grad_norm": 0.15527105331420898, + "learning_rate": 3.1649836092045306e-07, + "loss": 1.0965, + "step": 15872 + }, + { + "epoch": 0.9643963788808555, + "grad_norm": 0.16879485547542572, + "learning_rate": 3.1542388363751676e-07, + "loss": 1.0524, + "step": 15873 + }, + { + "epoch": 0.964457135913482, + "grad_norm": 0.18120986223220825, + "learning_rate": 3.143512275523286e-07, + "loss": 1.1234, + "step": 15874 + }, + { + "epoch": 0.9645178929461085, + "grad_norm": 0.16349101066589355, + "learning_rate": 3.132803927041961e-07, + "loss": 1.0923, + "step": 15875 + }, + { + "epoch": 0.964578649978735, + "grad_norm": 0.22919566929340363, + "learning_rate": 3.1221137913237107e-07, + "loss": 1.0207, + "step": 15876 + }, + { + "epoch": 0.9646394070113615, + "grad_norm": 0.1649797111749649, + "learning_rate": 3.111441868760445e-07, + "loss": 1.0874, + "step": 15877 + }, + { + "epoch": 0.9647001640439881, + "grad_norm": 0.22419624030590057, + "learning_rate": 3.100788159743351e-07, + "loss": 1.1291, + "step": 15878 + }, + { + "epoch": 0.9647609210766146, + "grad_norm": 0.1662607342004776, + "learning_rate": 3.090152664662893e-07, + "loss": 1.2389, + "step": 15879 + }, + { + "epoch": 0.9648216781092411, + "grad_norm": 0.10185400396585464, + "learning_rate": 3.0795353839089826e-07, + "loss": 1.0005, + "step": 15880 + }, + { + "epoch": 0.9648824351418677, + "grad_norm": 0.11787091195583344, + "learning_rate": 3.0689363178706964e-07, + "loss": 1.0436, + "step": 15881 + }, + { + "epoch": 0.9649431921744942, + "grad_norm": 0.10902456194162369, + "learning_rate": 3.0583554669366687e-07, + "loss": 1.0051, + "step": 15882 + }, + { + "epoch": 0.9650039492071207, + "grad_norm": 0.2078927606344223, + "learning_rate": 3.047792831494589e-07, + "loss": 1.0755, + "step": 15883 + }, + { + "epoch": 0.9650647062397473, + "grad_norm": 0.15197834372520447, + "learning_rate": 3.0372484119318143e-07, + "loss": 1.0498, + "step": 15884 + }, + { + "epoch": 0.9651254632723738, + "grad_norm": 0.11288712918758392, + "learning_rate": 3.0267222086347026e-07, + "loss": 1.0557, + "step": 15885 + }, + { + "epoch": 0.9651862203050003, + "grad_norm": 1.0391411781311035, + "learning_rate": 3.0162142219891663e-07, + "loss": 1.0546, + "step": 15886 + }, + { + "epoch": 0.9652469773376269, + "grad_norm": 0.23355908691883087, + "learning_rate": 3.005724452380343e-07, + "loss": 1.1244, + "step": 15887 + }, + { + "epoch": 0.9653077343702534, + "grad_norm": 0.21837641298770905, + "learning_rate": 2.995252900192813e-07, + "loss": 1.1245, + "step": 15888 + }, + { + "epoch": 0.9653684914028798, + "grad_norm": 0.33931583166122437, + "learning_rate": 2.984799565810326e-07, + "loss": 1.1074, + "step": 15889 + }, + { + "epoch": 0.9654292484355064, + "grad_norm": 0.5034958124160767, + "learning_rate": 2.974364449616074e-07, + "loss": 1.0286, + "step": 15890 + }, + { + "epoch": 0.9654900054681329, + "grad_norm": 0.11125493794679642, + "learning_rate": 2.9639475519925297e-07, + "loss": 1.0886, + "step": 15891 + }, + { + "epoch": 0.9655507625007594, + "grad_norm": 0.1079137921333313, + "learning_rate": 2.95354887332161e-07, + "loss": 1.0344, + "step": 15892 + }, + { + "epoch": 0.965611519533386, + "grad_norm": 2.671691656112671, + "learning_rate": 2.943168413984454e-07, + "loss": 1.2258, + "step": 15893 + }, + { + "epoch": 0.9656722765660125, + "grad_norm": 0.10957320034503937, + "learning_rate": 2.9328061743614796e-07, + "loss": 1.0921, + "step": 15894 + }, + { + "epoch": 0.965733033598639, + "grad_norm": 0.4058312177658081, + "learning_rate": 2.922462154832606e-07, + "loss": 1.2526, + "step": 15895 + }, + { + "epoch": 0.9657937906312656, + "grad_norm": 0.13628971576690674, + "learning_rate": 2.912136355776973e-07, + "loss": 1.0909, + "step": 15896 + }, + { + "epoch": 0.9658545476638921, + "grad_norm": 0.11478254199028015, + "learning_rate": 2.9018287775730566e-07, + "loss": 1.002, + "step": 15897 + }, + { + "epoch": 0.9659153046965187, + "grad_norm": 0.2863271236419678, + "learning_rate": 2.891539420598721e-07, + "loss": 1.0041, + "step": 15898 + }, + { + "epoch": 0.9659760617291452, + "grad_norm": 0.10050107538700104, + "learning_rate": 2.881268285231054e-07, + "loss": 1.0308, + "step": 15899 + }, + { + "epoch": 0.9660368187617717, + "grad_norm": 0.10459680110216141, + "learning_rate": 2.871015371846586e-07, + "loss": 1.05, + "step": 15900 + }, + { + "epoch": 0.9660975757943983, + "grad_norm": 0.12734566628932953, + "learning_rate": 2.860780680821185e-07, + "loss": 1.004, + "step": 15901 + }, + { + "epoch": 0.9661583328270247, + "grad_norm": 0.11034282296895981, + "learning_rate": 2.850564212529938e-07, + "loss": 1.038, + "step": 15902 + }, + { + "epoch": 0.9662190898596512, + "grad_norm": 0.1617112010717392, + "learning_rate": 2.84036596734738e-07, + "loss": 1.0622, + "step": 15903 + }, + { + "epoch": 0.9662798468922777, + "grad_norm": 0.21670573949813843, + "learning_rate": 2.830185945647268e-07, + "loss": 1.1696, + "step": 15904 + }, + { + "epoch": 0.9663406039249043, + "grad_norm": 0.10613535344600677, + "learning_rate": 2.820024147802802e-07, + "loss": 1.0534, + "step": 15905 + }, + { + "epoch": 0.9664013609575308, + "grad_norm": 0.12204107642173767, + "learning_rate": 2.809880574186463e-07, + "loss": 1.0461, + "step": 15906 + }, + { + "epoch": 0.9664621179901574, + "grad_norm": 1.8109042644500732, + "learning_rate": 2.799755225170064e-07, + "loss": 1.1054, + "step": 15907 + }, + { + "epoch": 0.9665228750227839, + "grad_norm": 0.1148768812417984, + "learning_rate": 2.789648101124753e-07, + "loss": 1.0529, + "step": 15908 + }, + { + "epoch": 0.9665836320554104, + "grad_norm": 0.12113587558269501, + "learning_rate": 2.7795592024209563e-07, + "loss": 1.079, + "step": 15909 + }, + { + "epoch": 0.966644389088037, + "grad_norm": 0.14573897421360016, + "learning_rate": 2.769488529428543e-07, + "loss": 1.0579, + "step": 15910 + }, + { + "epoch": 0.9667051461206635, + "grad_norm": 0.20230604708194733, + "learning_rate": 2.7594360825166644e-07, + "loss": 1.1347, + "step": 15911 + }, + { + "epoch": 0.96676590315329, + "grad_norm": 0.12635920941829681, + "learning_rate": 2.7494018620537466e-07, + "loss": 1.0587, + "step": 15912 + }, + { + "epoch": 0.9668266601859166, + "grad_norm": 0.10484524816274643, + "learning_rate": 2.739385868407607e-07, + "loss": 0.9967, + "step": 15913 + }, + { + "epoch": 0.9668874172185431, + "grad_norm": 0.24736304581165314, + "learning_rate": 2.7293881019454513e-07, + "loss": 1.1609, + "step": 15914 + }, + { + "epoch": 0.9669481742511695, + "grad_norm": 0.12311619520187378, + "learning_rate": 2.719408563033654e-07, + "loss": 1.0413, + "step": 15915 + }, + { + "epoch": 0.9670089312837961, + "grad_norm": 0.1055658683180809, + "learning_rate": 2.709447252038089e-07, + "loss": 1.0791, + "step": 15916 + }, + { + "epoch": 0.9670696883164226, + "grad_norm": 0.15936604142189026, + "learning_rate": 2.6995041693237965e-07, + "loss": 1.0079, + "step": 15917 + }, + { + "epoch": 0.9671304453490491, + "grad_norm": 0.560377836227417, + "learning_rate": 2.689579315255375e-07, + "loss": 1.3305, + "step": 15918 + }, + { + "epoch": 0.9671912023816757, + "grad_norm": 0.17118674516677856, + "learning_rate": 2.679672690196533e-07, + "loss": 1.1252, + "step": 15919 + }, + { + "epoch": 0.9672519594143022, + "grad_norm": 0.3924512565135956, + "learning_rate": 2.669784294510369e-07, + "loss": 1.1713, + "step": 15920 + }, + { + "epoch": 0.9673127164469287, + "grad_norm": 0.1629311889410019, + "learning_rate": 2.659914128559426e-07, + "loss": 1.0323, + "step": 15921 + }, + { + "epoch": 0.9673734734795553, + "grad_norm": 0.1835370510816574, + "learning_rate": 2.6500621927054715e-07, + "loss": 1.1132, + "step": 15922 + }, + { + "epoch": 0.9674342305121818, + "grad_norm": 6.135840892791748, + "learning_rate": 2.6402284873096594e-07, + "loss": 1.038, + "step": 15923 + }, + { + "epoch": 0.9674949875448083, + "grad_norm": 0.21831414103507996, + "learning_rate": 2.630413012732369e-07, + "loss": 1.1087, + "step": 15924 + }, + { + "epoch": 0.9675557445774349, + "grad_norm": 0.11834456771612167, + "learning_rate": 2.6206157693333677e-07, + "loss": 1.0419, + "step": 15925 + }, + { + "epoch": 0.9676165016100614, + "grad_norm": 36.64017868041992, + "learning_rate": 2.610836757471924e-07, + "loss": 1.1122, + "step": 15926 + }, + { + "epoch": 0.9676772586426879, + "grad_norm": 0.6722599864006042, + "learning_rate": 2.601075977506362e-07, + "loss": 1.0483, + "step": 15927 + }, + { + "epoch": 0.9677380156753144, + "grad_norm": 0.3618413507938385, + "learning_rate": 2.5913334297945624e-07, + "loss": 1.1299, + "step": 15928 + }, + { + "epoch": 0.9677987727079409, + "grad_norm": 0.13960571587085724, + "learning_rate": 2.5816091146935176e-07, + "loss": 1.0873, + "step": 15929 + }, + { + "epoch": 0.9678595297405674, + "grad_norm": 0.15450404584407806, + "learning_rate": 2.5719030325597746e-07, + "loss": 1.0672, + "step": 15930 + }, + { + "epoch": 0.967920286773194, + "grad_norm": 0.10349221527576447, + "learning_rate": 2.5622151837490504e-07, + "loss": 1.0249, + "step": 15931 + }, + { + "epoch": 0.9679810438058205, + "grad_norm": 0.22325465083122253, + "learning_rate": 2.5525455686165043e-07, + "loss": 1.1754, + "step": 15932 + }, + { + "epoch": 0.968041800838447, + "grad_norm": 0.09836241602897644, + "learning_rate": 2.54289418751652e-07, + "loss": 1.0064, + "step": 15933 + }, + { + "epoch": 0.9681025578710736, + "grad_norm": 0.2870478928089142, + "learning_rate": 2.533261040802981e-07, + "loss": 1.3061, + "step": 15934 + }, + { + "epoch": 0.9681633149037001, + "grad_norm": 0.11183391511440277, + "learning_rate": 2.5236461288288274e-07, + "loss": 1.0314, + "step": 15935 + }, + { + "epoch": 0.9682240719363266, + "grad_norm": 0.10338148474693298, + "learning_rate": 2.514049451946665e-07, + "loss": 1.0182, + "step": 15936 + }, + { + "epoch": 0.9682848289689532, + "grad_norm": 0.2402750700712204, + "learning_rate": 2.5044710105081027e-07, + "loss": 1.1383, + "step": 15937 + }, + { + "epoch": 0.9683455860015797, + "grad_norm": 0.12980733811855316, + "learning_rate": 2.494910804864359e-07, + "loss": 1.0342, + "step": 15938 + }, + { + "epoch": 0.9684063430342063, + "grad_norm": 0.12378325313329697, + "learning_rate": 2.4853688353658753e-07, + "loss": 1.0294, + "step": 15939 + }, + { + "epoch": 0.9684671000668328, + "grad_norm": 0.26979950070381165, + "learning_rate": 2.475845102362262e-07, + "loss": 1.0667, + "step": 15940 + }, + { + "epoch": 0.9685278570994592, + "grad_norm": 0.2891567647457123, + "learning_rate": 2.466339606202794e-07, + "loss": 1.0716, + "step": 15941 + }, + { + "epoch": 0.9685886141320857, + "grad_norm": 0.12859764695167542, + "learning_rate": 2.4568523472358054e-07, + "loss": 1.0487, + "step": 15942 + }, + { + "epoch": 0.9686493711647123, + "grad_norm": 0.17575380206108093, + "learning_rate": 2.4473833258090715e-07, + "loss": 1.0588, + "step": 15943 + }, + { + "epoch": 0.9687101281973388, + "grad_norm": 0.1738256961107254, + "learning_rate": 2.4379325422696495e-07, + "loss": 1.1585, + "step": 15944 + }, + { + "epoch": 0.9687708852299654, + "grad_norm": 0.10076726227998734, + "learning_rate": 2.4284999969639844e-07, + "loss": 0.9953, + "step": 15945 + }, + { + "epoch": 0.9688316422625919, + "grad_norm": 0.13657309114933014, + "learning_rate": 2.419085690237799e-07, + "loss": 1.0956, + "step": 15946 + }, + { + "epoch": 0.9688923992952184, + "grad_norm": 0.12712503969669342, + "learning_rate": 2.4096896224362617e-07, + "loss": 0.9993, + "step": 15947 + }, + { + "epoch": 0.968953156327845, + "grad_norm": 0.16392002999782562, + "learning_rate": 2.4003117939037647e-07, + "loss": 1.1506, + "step": 15948 + }, + { + "epoch": 0.9690139133604715, + "grad_norm": 0.1706627756357193, + "learning_rate": 2.390952204983976e-07, + "loss": 1.1572, + "step": 15949 + }, + { + "epoch": 0.969074670393098, + "grad_norm": 0.15807820856571198, + "learning_rate": 2.3816108560200111e-07, + "loss": 1.0901, + "step": 15950 + }, + { + "epoch": 0.9691354274257246, + "grad_norm": 0.5251103639602661, + "learning_rate": 2.3722877473543182e-07, + "loss": 1.1488, + "step": 15951 + }, + { + "epoch": 0.9691961844583511, + "grad_norm": 0.3150418996810913, + "learning_rate": 2.362982879328568e-07, + "loss": 0.9907, + "step": 15952 + }, + { + "epoch": 0.9692569414909776, + "grad_norm": 5.522952079772949, + "learning_rate": 2.3536962522838768e-07, + "loss": 1.0944, + "step": 15953 + }, + { + "epoch": 0.969317698523604, + "grad_norm": 0.24826404452323914, + "learning_rate": 2.3444278665606944e-07, + "loss": 1.008, + "step": 15954 + }, + { + "epoch": 0.9693784555562306, + "grad_norm": 0.16055329144001007, + "learning_rate": 2.3351777224986938e-07, + "loss": 1.1287, + "step": 15955 + }, + { + "epoch": 0.9694392125888571, + "grad_norm": 0.12546609342098236, + "learning_rate": 2.3259458204368812e-07, + "loss": 1.0814, + "step": 15956 + }, + { + "epoch": 0.9694999696214837, + "grad_norm": 0.1838313192129135, + "learning_rate": 2.3167321607137637e-07, + "loss": 1.1996, + "step": 15957 + }, + { + "epoch": 0.9695607266541102, + "grad_norm": 0.11834122985601425, + "learning_rate": 2.3075367436670715e-07, + "loss": 1.0623, + "step": 15958 + }, + { + "epoch": 0.9696214836867367, + "grad_norm": 0.11417987942695618, + "learning_rate": 2.2983595696337567e-07, + "loss": 1.09, + "step": 15959 + }, + { + "epoch": 0.9696822407193633, + "grad_norm": 0.15419122576713562, + "learning_rate": 2.2892006389503285e-07, + "loss": 1.1104, + "step": 15960 + }, + { + "epoch": 0.9697429977519898, + "grad_norm": 0.10465330630540848, + "learning_rate": 2.280059951952407e-07, + "loss": 0.9956, + "step": 15961 + }, + { + "epoch": 0.9698037547846163, + "grad_norm": 0.21363714337348938, + "learning_rate": 2.2709375089751127e-07, + "loss": 1.1505, + "step": 15962 + }, + { + "epoch": 0.9698645118172429, + "grad_norm": 0.16582389175891876, + "learning_rate": 2.2618333103528455e-07, + "loss": 1.115, + "step": 15963 + }, + { + "epoch": 0.9699252688498694, + "grad_norm": 0.14295408129692078, + "learning_rate": 2.252747356419227e-07, + "loss": 1.0617, + "step": 15964 + }, + { + "epoch": 0.9699860258824959, + "grad_norm": 0.10888700187206268, + "learning_rate": 2.243679647507435e-07, + "loss": 1.0311, + "step": 15965 + }, + { + "epoch": 0.9700467829151225, + "grad_norm": 0.23514166474342346, + "learning_rate": 2.234630183949704e-07, + "loss": 1.098, + "step": 15966 + }, + { + "epoch": 0.9701075399477489, + "grad_norm": 0.10871703922748566, + "learning_rate": 2.2255989660778798e-07, + "loss": 1.0273, + "step": 15967 + }, + { + "epoch": 0.9701682969803754, + "grad_norm": 0.10922279953956604, + "learning_rate": 2.2165859942229194e-07, + "loss": 1.0241, + "step": 15968 + }, + { + "epoch": 0.970229054013002, + "grad_norm": 0.10100605338811874, + "learning_rate": 2.2075912687152255e-07, + "loss": 1.0537, + "step": 15969 + }, + { + "epoch": 0.9702898110456285, + "grad_norm": 0.10844356566667557, + "learning_rate": 2.1986147898844788e-07, + "loss": 1.0019, + "step": 15970 + }, + { + "epoch": 0.970350568078255, + "grad_norm": 0.15895801782608032, + "learning_rate": 2.1896565580597495e-07, + "loss": 1.0303, + "step": 15971 + }, + { + "epoch": 0.9704113251108816, + "grad_norm": 0.10335024446249008, + "learning_rate": 2.1807165735693857e-07, + "loss": 0.9887, + "step": 15972 + }, + { + "epoch": 0.9704720821435081, + "grad_norm": 0.11725698411464691, + "learning_rate": 2.17179483674107e-07, + "loss": 1.0343, + "step": 15973 + }, + { + "epoch": 0.9705328391761346, + "grad_norm": 1.839383602142334, + "learning_rate": 2.1628913479018186e-07, + "loss": 1.094, + "step": 15974 + }, + { + "epoch": 0.9705935962087612, + "grad_norm": 6.549253463745117, + "learning_rate": 2.1540061073780927e-07, + "loss": 1.0673, + "step": 15975 + }, + { + "epoch": 0.9706543532413877, + "grad_norm": 0.10236018896102905, + "learning_rate": 2.1451391154954093e-07, + "loss": 1.0173, + "step": 15976 + }, + { + "epoch": 0.9707151102740142, + "grad_norm": 0.1534181833267212, + "learning_rate": 2.136290372578953e-07, + "loss": 1.1281, + "step": 15977 + }, + { + "epoch": 0.9707758673066408, + "grad_norm": 0.20116965472698212, + "learning_rate": 2.1274598789529643e-07, + "loss": 1.1008, + "step": 15978 + }, + { + "epoch": 0.9708366243392673, + "grad_norm": 0.16923700273036957, + "learning_rate": 2.1186476349411843e-07, + "loss": 1.0648, + "step": 15979 + }, + { + "epoch": 0.9708973813718937, + "grad_norm": 0.10010872781276703, + "learning_rate": 2.1098536408666325e-07, + "loss": 1.0502, + "step": 15980 + }, + { + "epoch": 0.9709581384045203, + "grad_norm": 0.10427713394165039, + "learning_rate": 2.1010778970516064e-07, + "loss": 1.0487, + "step": 15981 + }, + { + "epoch": 0.9710188954371468, + "grad_norm": 0.09616519510746002, + "learning_rate": 2.092320403817849e-07, + "loss": 1.0517, + "step": 15982 + }, + { + "epoch": 0.9710796524697733, + "grad_norm": 0.18378973007202148, + "learning_rate": 2.0835811614863254e-07, + "loss": 1.0333, + "step": 15983 + }, + { + "epoch": 0.9711404095023999, + "grad_norm": 0.18472549319267273, + "learning_rate": 2.074860170377335e-07, + "loss": 1.0902, + "step": 15984 + }, + { + "epoch": 0.9712011665350264, + "grad_norm": 0.09981843829154968, + "learning_rate": 2.0661574308106224e-07, + "loss": 1.0282, + "step": 15985 + }, + { + "epoch": 0.971261923567653, + "grad_norm": 0.1477714329957962, + "learning_rate": 2.0574729431051543e-07, + "loss": 1.1393, + "step": 15986 + }, + { + "epoch": 0.9713226806002795, + "grad_norm": 0.2482462078332901, + "learning_rate": 2.0488067075792872e-07, + "loss": 1.1187, + "step": 15987 + }, + { + "epoch": 0.971383437632906, + "grad_norm": 0.10056059062480927, + "learning_rate": 2.040158724550656e-07, + "loss": 0.997, + "step": 15988 + }, + { + "epoch": 0.9714441946655326, + "grad_norm": 0.13931965827941895, + "learning_rate": 2.0315289943362292e-07, + "loss": 1.0828, + "step": 15989 + }, + { + "epoch": 0.9715049516981591, + "grad_norm": 0.13172860443592072, + "learning_rate": 2.022917517252365e-07, + "loss": 1.0808, + "step": 15990 + }, + { + "epoch": 0.9715657087307856, + "grad_norm": 0.16991360485553741, + "learning_rate": 2.0143242936147554e-07, + "loss": 1.1247, + "step": 15991 + }, + { + "epoch": 0.9716264657634122, + "grad_norm": 0.38997524976730347, + "learning_rate": 2.005749323738315e-07, + "loss": 1.1852, + "step": 15992 + }, + { + "epoch": 0.9716872227960387, + "grad_norm": 0.19578388333320618, + "learning_rate": 1.9971926079374037e-07, + "loss": 1.1603, + "step": 15993 + }, + { + "epoch": 0.9717479798286651, + "grad_norm": 0.2540781795978546, + "learning_rate": 1.9886541465256593e-07, + "loss": 1.0591, + "step": 15994 + }, + { + "epoch": 0.9718087368612917, + "grad_norm": 0.33513855934143066, + "learning_rate": 1.9801339398160534e-07, + "loss": 1.071, + "step": 15995 + }, + { + "epoch": 0.9718694938939182, + "grad_norm": 0.12066254019737244, + "learning_rate": 1.9716319881208922e-07, + "loss": 1.0337, + "step": 15996 + }, + { + "epoch": 0.9719302509265447, + "grad_norm": 0.11083171516656876, + "learning_rate": 1.9631482917518707e-07, + "loss": 1.0525, + "step": 15997 + }, + { + "epoch": 0.9719910079591713, + "grad_norm": 0.13840889930725098, + "learning_rate": 1.954682851019851e-07, + "loss": 1.0658, + "step": 15998 + }, + { + "epoch": 0.9720517649917978, + "grad_norm": 0.1476316601037979, + "learning_rate": 1.9462356662352522e-07, + "loss": 1.0947, + "step": 15999 + }, + { + "epoch": 0.9721125220244243, + "grad_norm": 1.4096200466156006, + "learning_rate": 1.9378067377076593e-07, + "loss": 1.2981, + "step": 16000 + }, + { + "epoch": 0.9721732790570509, + "grad_norm": 0.12837892770767212, + "learning_rate": 1.929396065745992e-07, + "loss": 1.0293, + "step": 16001 + }, + { + "epoch": 0.9722340360896774, + "grad_norm": 0.16172531247138977, + "learning_rate": 1.9210036506586148e-07, + "loss": 1.0312, + "step": 16002 + }, + { + "epoch": 0.9722947931223039, + "grad_norm": 0.15705092251300812, + "learning_rate": 1.9126294927531153e-07, + "loss": 1.0737, + "step": 16003 + }, + { + "epoch": 0.9723555501549305, + "grad_norm": 0.13786739110946655, + "learning_rate": 1.9042735923364696e-07, + "loss": 1.1181, + "step": 16004 + }, + { + "epoch": 0.972416307187557, + "grad_norm": 0.17052096128463745, + "learning_rate": 1.8959359497149886e-07, + "loss": 1.116, + "step": 16005 + }, + { + "epoch": 0.9724770642201835, + "grad_norm": 0.12219720333814621, + "learning_rate": 1.8876165651942056e-07, + "loss": 1.0122, + "step": 16006 + }, + { + "epoch": 0.97253782125281, + "grad_norm": 1.060339093208313, + "learning_rate": 1.8793154390791545e-07, + "loss": 1.0741, + "step": 16007 + }, + { + "epoch": 0.9725985782854365, + "grad_norm": 0.11641202121973038, + "learning_rate": 1.8710325716740363e-07, + "loss": 1.0312, + "step": 16008 + }, + { + "epoch": 0.972659335318063, + "grad_norm": 0.13792043924331665, + "learning_rate": 1.8627679632825523e-07, + "loss": 1.1228, + "step": 16009 + }, + { + "epoch": 0.9727200923506896, + "grad_norm": 0.1705060452222824, + "learning_rate": 1.8545216142075717e-07, + "loss": 1.1256, + "step": 16010 + }, + { + "epoch": 0.9727808493833161, + "grad_norm": 0.1110953837633133, + "learning_rate": 1.846293524751408e-07, + "loss": 1.0149, + "step": 16011 + }, + { + "epoch": 0.9728416064159426, + "grad_norm": 0.13366954028606415, + "learning_rate": 1.8380836952155979e-07, + "loss": 1.1379, + "step": 16012 + }, + { + "epoch": 0.9729023634485692, + "grad_norm": 0.22752730548381805, + "learning_rate": 1.8298921259011782e-07, + "loss": 1.1169, + "step": 16013 + }, + { + "epoch": 0.9729631204811957, + "grad_norm": 0.27061083912849426, + "learning_rate": 1.8217188171082977e-07, + "loss": 1.1635, + "step": 16014 + }, + { + "epoch": 0.9730238775138222, + "grad_norm": 0.7638154029846191, + "learning_rate": 1.8135637691366613e-07, + "loss": 1.073, + "step": 16015 + }, + { + "epoch": 0.9730846345464488, + "grad_norm": 0.1051676943898201, + "learning_rate": 1.805426982285141e-07, + "loss": 1.0303, + "step": 16016 + }, + { + "epoch": 0.9731453915790753, + "grad_norm": 1.0993504524230957, + "learning_rate": 1.7973084568519982e-07, + "loss": 1.0117, + "step": 16017 + }, + { + "epoch": 0.9732061486117018, + "grad_norm": 0.1313251554965973, + "learning_rate": 1.7892081931347726e-07, + "loss": 1.0436, + "step": 16018 + }, + { + "epoch": 0.9732669056443284, + "grad_norm": 1.02340829372406, + "learning_rate": 1.7811261914304488e-07, + "loss": 1.0913, + "step": 16019 + }, + { + "epoch": 0.9733276626769548, + "grad_norm": 0.12242753058671951, + "learning_rate": 1.7730624520352346e-07, + "loss": 1.0217, + "step": 16020 + }, + { + "epoch": 0.9733884197095813, + "grad_norm": 0.09510639309883118, + "learning_rate": 1.7650169752447265e-07, + "loss": 0.992, + "step": 16021 + }, + { + "epoch": 0.9734491767422079, + "grad_norm": 0.12729017436504364, + "learning_rate": 1.7569897613538555e-07, + "loss": 1.0467, + "step": 16022 + }, + { + "epoch": 0.9735099337748344, + "grad_norm": 0.16152387857437134, + "learning_rate": 1.7489808106568305e-07, + "loss": 1.0752, + "step": 16023 + }, + { + "epoch": 0.973570690807461, + "grad_norm": 0.10730086266994476, + "learning_rate": 1.7409901234471948e-07, + "loss": 1.0293, + "step": 16024 + }, + { + "epoch": 0.9736314478400875, + "grad_norm": 1.1608229875564575, + "learning_rate": 1.7330177000178805e-07, + "loss": 1.1188, + "step": 16025 + }, + { + "epoch": 0.973692204872714, + "grad_norm": 0.12683729827404022, + "learning_rate": 1.7250635406611538e-07, + "loss": 0.9897, + "step": 16026 + }, + { + "epoch": 0.9737529619053406, + "grad_norm": 0.13331612944602966, + "learning_rate": 1.7171276456685037e-07, + "loss": 1.0664, + "step": 16027 + }, + { + "epoch": 0.9738137189379671, + "grad_norm": 3.2299716472625732, + "learning_rate": 1.7092100153308643e-07, + "loss": 1.044, + "step": 16028 + }, + { + "epoch": 0.9738744759705936, + "grad_norm": 0.11321816593408585, + "learning_rate": 1.7013106499383924e-07, + "loss": 1.0706, + "step": 16029 + }, + { + "epoch": 0.9739352330032202, + "grad_norm": 0.11880458146333694, + "learning_rate": 1.6934295497808006e-07, + "loss": 1.0139, + "step": 16030 + }, + { + "epoch": 0.9739959900358467, + "grad_norm": 0.2528494596481323, + "learning_rate": 1.6855667151468024e-07, + "loss": 1.1883, + "step": 16031 + }, + { + "epoch": 0.9740567470684732, + "grad_norm": 0.10102248936891556, + "learning_rate": 1.6777221463246673e-07, + "loss": 0.9729, + "step": 16032 + }, + { + "epoch": 0.9741175041010997, + "grad_norm": 0.1477328985929489, + "learning_rate": 1.6698958436019985e-07, + "loss": 1.1111, + "step": 16033 + }, + { + "epoch": 0.9741782611337262, + "grad_norm": 0.15187117457389832, + "learning_rate": 1.6620878072655664e-07, + "loss": 1.0719, + "step": 16034 + }, + { + "epoch": 0.9742390181663527, + "grad_norm": 0.25257760286331177, + "learning_rate": 1.6542980376016982e-07, + "loss": 1.0492, + "step": 16035 + }, + { + "epoch": 0.9742997751989793, + "grad_norm": 0.14116472005844116, + "learning_rate": 1.6465265348958314e-07, + "loss": 1.0791, + "step": 16036 + }, + { + "epoch": 0.9743605322316058, + "grad_norm": 0.22963206470012665, + "learning_rate": 1.6387732994328496e-07, + "loss": 1.0766, + "step": 16037 + }, + { + "epoch": 0.9744212892642323, + "grad_norm": 0.15372848510742188, + "learning_rate": 1.63103833149697e-07, + "loss": 1.0821, + "step": 16038 + }, + { + "epoch": 0.9744820462968589, + "grad_norm": 0.1959514319896698, + "learning_rate": 1.6233216313717437e-07, + "loss": 1.0473, + "step": 16039 + }, + { + "epoch": 0.9745428033294854, + "grad_norm": 0.15264278650283813, + "learning_rate": 1.615623199339944e-07, + "loss": 1.1159, + "step": 16040 + }, + { + "epoch": 0.9746035603621119, + "grad_norm": 0.09602281451225281, + "learning_rate": 1.6079430356838454e-07, + "loss": 1.0424, + "step": 16041 + }, + { + "epoch": 0.9746643173947385, + "grad_norm": 0.0902513787150383, + "learning_rate": 1.600281140684945e-07, + "loss": 1.0112, + "step": 16042 + }, + { + "epoch": 0.974725074427365, + "grad_norm": 0.1508355736732483, + "learning_rate": 1.5926375146240734e-07, + "loss": 1.1134, + "step": 16043 + }, + { + "epoch": 0.9747858314599915, + "grad_norm": 0.13014133274555206, + "learning_rate": 1.5850121577813959e-07, + "loss": 1.0306, + "step": 16044 + }, + { + "epoch": 0.9748465884926181, + "grad_norm": 0.12807750701904297, + "learning_rate": 1.5774050704364106e-07, + "loss": 1.0856, + "step": 16045 + }, + { + "epoch": 0.9749073455252445, + "grad_norm": 0.17278870940208435, + "learning_rate": 1.5698162528680616e-07, + "loss": 1.1414, + "step": 16046 + }, + { + "epoch": 0.974968102557871, + "grad_norm": 0.18267321586608887, + "learning_rate": 1.5622457053544038e-07, + "loss": 1.0351, + "step": 16047 + }, + { + "epoch": 0.9750288595904976, + "grad_norm": 0.20184537768363953, + "learning_rate": 1.5546934281729375e-07, + "loss": 1.2628, + "step": 16048 + }, + { + "epoch": 0.9750896166231241, + "grad_norm": 0.34459781646728516, + "learning_rate": 1.547159421600608e-07, + "loss": 1.2446, + "step": 16049 + }, + { + "epoch": 0.9751503736557506, + "grad_norm": 0.27078482508659363, + "learning_rate": 1.5396436859134168e-07, + "loss": 1.0954, + "step": 16050 + }, + { + "epoch": 0.9752111306883772, + "grad_norm": 0.11293277144432068, + "learning_rate": 1.5321462213869764e-07, + "loss": 1.0631, + "step": 16051 + }, + { + "epoch": 0.9752718877210037, + "grad_norm": 0.2270057201385498, + "learning_rate": 1.5246670282960674e-07, + "loss": 1.1245, + "step": 16052 + }, + { + "epoch": 0.9753326447536302, + "grad_norm": 0.20857922732830048, + "learning_rate": 1.517206106914859e-07, + "loss": 1.1282, + "step": 16053 + }, + { + "epoch": 0.9753934017862568, + "grad_norm": 0.3938729166984558, + "learning_rate": 1.5097634575167996e-07, + "loss": 1.196, + "step": 16054 + }, + { + "epoch": 0.9754541588188833, + "grad_norm": 0.12796056270599365, + "learning_rate": 1.502339080374726e-07, + "loss": 1.032, + "step": 16055 + }, + { + "epoch": 0.9755149158515098, + "grad_norm": 0.1373816430568695, + "learning_rate": 1.4949329757607544e-07, + "loss": 1.0655, + "step": 16056 + }, + { + "epoch": 0.9755756728841364, + "grad_norm": 0.3361382484436035, + "learning_rate": 1.4875451439463894e-07, + "loss": 1.1071, + "step": 16057 + }, + { + "epoch": 0.9756364299167629, + "grad_norm": 0.14629597961902618, + "learning_rate": 1.4801755852024145e-07, + "loss": 1.102, + "step": 16058 + }, + { + "epoch": 0.9756971869493893, + "grad_norm": 0.11022738367319107, + "learning_rate": 1.4728242997989473e-07, + "loss": 1.1013, + "step": 16059 + }, + { + "epoch": 0.9757579439820159, + "grad_norm": 0.43559399247169495, + "learning_rate": 1.4654912880054382e-07, + "loss": 1.1323, + "step": 16060 + }, + { + "epoch": 0.9758187010146424, + "grad_norm": 0.12420898675918579, + "learning_rate": 1.458176550090784e-07, + "loss": 1.0982, + "step": 16061 + }, + { + "epoch": 0.9758794580472689, + "grad_norm": 0.340138703584671, + "learning_rate": 1.450880086322992e-07, + "loss": 1.1505, + "step": 16062 + }, + { + "epoch": 0.9759402150798955, + "grad_norm": 0.19947384297847748, + "learning_rate": 1.4436018969695708e-07, + "loss": 1.0682, + "step": 16063 + }, + { + "epoch": 0.976000972112522, + "grad_norm": 0.17929403483867645, + "learning_rate": 1.4363419822973067e-07, + "loss": 1.0751, + "step": 16064 + }, + { + "epoch": 0.9760617291451485, + "grad_norm": 0.1395479142665863, + "learning_rate": 1.4291003425722648e-07, + "loss": 1.1095, + "step": 16065 + }, + { + "epoch": 0.9761224861777751, + "grad_norm": 0.14332261681556702, + "learning_rate": 1.4218769780598995e-07, + "loss": 1.1459, + "step": 16066 + }, + { + "epoch": 0.9761832432104016, + "grad_norm": 0.10867652297019958, + "learning_rate": 1.4146718890250543e-07, + "loss": 1.0158, + "step": 16067 + }, + { + "epoch": 0.9762440002430282, + "grad_norm": 0.1907731145620346, + "learning_rate": 1.407485075731796e-07, + "loss": 1.2309, + "step": 16068 + }, + { + "epoch": 0.9763047572756547, + "grad_norm": 0.143650621175766, + "learning_rate": 1.4003165384435246e-07, + "loss": 1.008, + "step": 16069 + }, + { + "epoch": 0.9763655143082812, + "grad_norm": 0.14735780656337738, + "learning_rate": 1.3931662774230304e-07, + "loss": 1.0731, + "step": 16070 + }, + { + "epoch": 0.9764262713409078, + "grad_norm": 0.1586189568042755, + "learning_rate": 1.386034292932381e-07, + "loss": 1.1754, + "step": 16071 + }, + { + "epoch": 0.9764870283735342, + "grad_norm": 0.13290809094905853, + "learning_rate": 1.37892058523309e-07, + "loss": 1.0146, + "step": 16072 + }, + { + "epoch": 0.9765477854061607, + "grad_norm": 0.47854822874069214, + "learning_rate": 1.371825154585782e-07, + "loss": 1.1807, + "step": 16073 + }, + { + "epoch": 0.9766085424387873, + "grad_norm": 0.11864987015724182, + "learning_rate": 1.3647480012506375e-07, + "loss": 1.0684, + "step": 16074 + }, + { + "epoch": 0.9766692994714138, + "grad_norm": 0.15853135287761688, + "learning_rate": 1.357689125487005e-07, + "loss": 1.069, + "step": 16075 + }, + { + "epoch": 0.9767300565040403, + "grad_norm": 2.330995798110962, + "learning_rate": 1.3506485275536772e-07, + "loss": 1.0224, + "step": 16076 + }, + { + "epoch": 0.9767908135366669, + "grad_norm": 0.14817878603935242, + "learning_rate": 1.3436262077087258e-07, + "loss": 1.2449, + "step": 16077 + }, + { + "epoch": 0.9768515705692934, + "grad_norm": 0.14410121738910675, + "learning_rate": 1.3366221662095558e-07, + "loss": 1.0676, + "step": 16078 + }, + { + "epoch": 0.9769123276019199, + "grad_norm": 0.15412510931491852, + "learning_rate": 1.3296364033128506e-07, + "loss": 1.1063, + "step": 16079 + }, + { + "epoch": 0.9769730846345465, + "grad_norm": 0.2766765356063843, + "learning_rate": 1.3226689192747387e-07, + "loss": 1.1065, + "step": 16080 + }, + { + "epoch": 0.977033841667173, + "grad_norm": 0.19024202227592468, + "learning_rate": 1.3157197143506273e-07, + "loss": 1.0934, + "step": 16081 + }, + { + "epoch": 0.9770945986997995, + "grad_norm": 0.10379128903150558, + "learning_rate": 1.3087887887951457e-07, + "loss": 1.0493, + "step": 16082 + }, + { + "epoch": 0.9771553557324261, + "grad_norm": 0.4700990319252014, + "learning_rate": 1.3018761428624237e-07, + "loss": 0.9965, + "step": 16083 + }, + { + "epoch": 0.9772161127650526, + "grad_norm": 0.1554998755455017, + "learning_rate": 1.294981776805815e-07, + "loss": 1.0842, + "step": 16084 + }, + { + "epoch": 0.977276869797679, + "grad_norm": 0.10045026987791061, + "learning_rate": 1.2881056908780608e-07, + "loss": 1.0269, + "step": 16085 + }, + { + "epoch": 0.9773376268303056, + "grad_norm": 0.302529901266098, + "learning_rate": 1.2812478853311826e-07, + "loss": 1.1391, + "step": 16086 + }, + { + "epoch": 0.9773983838629321, + "grad_norm": 0.24632851779460907, + "learning_rate": 1.27440836041659e-07, + "loss": 1.0796, + "step": 16087 + }, + { + "epoch": 0.9774591408955586, + "grad_norm": 0.10734419524669647, + "learning_rate": 1.2675871163849718e-07, + "loss": 1.0819, + "step": 16088 + }, + { + "epoch": 0.9775198979281852, + "grad_norm": 0.31728655099868774, + "learning_rate": 1.260784153486294e-07, + "loss": 1.2579, + "step": 16089 + }, + { + "epoch": 0.9775806549608117, + "grad_norm": 0.48442772030830383, + "learning_rate": 1.2539994719700244e-07, + "loss": 1.0493, + "step": 16090 + }, + { + "epoch": 0.9776414119934382, + "grad_norm": 0.1007699966430664, + "learning_rate": 1.247233072084797e-07, + "loss": 0.987, + "step": 16091 + }, + { + "epoch": 0.9777021690260648, + "grad_norm": 0.29470014572143555, + "learning_rate": 1.2404849540786357e-07, + "loss": 1.087, + "step": 16092 + }, + { + "epoch": 0.9777629260586913, + "grad_norm": 0.12711603939533234, + "learning_rate": 1.2337551181988983e-07, + "loss": 1.0659, + "step": 16093 + }, + { + "epoch": 0.9778236830913178, + "grad_norm": 0.27971151471138, + "learning_rate": 1.2270435646922762e-07, + "loss": 1.2239, + "step": 16094 + }, + { + "epoch": 0.9778844401239444, + "grad_norm": 0.11354578286409378, + "learning_rate": 1.2203502938047952e-07, + "loss": 1.0161, + "step": 16095 + }, + { + "epoch": 0.9779451971565709, + "grad_norm": 0.2748790383338928, + "learning_rate": 1.213675305781814e-07, + "loss": 1.0479, + "step": 16096 + }, + { + "epoch": 0.9780059541891974, + "grad_norm": 0.09628867357969284, + "learning_rate": 1.2070186008679707e-07, + "loss": 1.0198, + "step": 16097 + }, + { + "epoch": 0.978066711221824, + "grad_norm": 0.16216102242469788, + "learning_rate": 1.2003801793072366e-07, + "loss": 1.095, + "step": 16098 + }, + { + "epoch": 0.9781274682544504, + "grad_norm": 0.19075436890125275, + "learning_rate": 1.1937600413430283e-07, + "loss": 1.0632, + "step": 16099 + }, + { + "epoch": 0.9781882252870769, + "grad_norm": 0.22025249898433685, + "learning_rate": 1.1871581872179294e-07, + "loss": 1.0657, + "step": 16100 + }, + { + "epoch": 0.9782489823197035, + "grad_norm": 0.28478437662124634, + "learning_rate": 1.1805746171739685e-07, + "loss": 1.0728, + "step": 16101 + }, + { + "epoch": 0.97830973935233, + "grad_norm": 0.09931562840938568, + "learning_rate": 1.1740093314524525e-07, + "loss": 1.063, + "step": 16102 + }, + { + "epoch": 0.9783704963849565, + "grad_norm": 0.14065153896808624, + "learning_rate": 1.167462330294078e-07, + "loss": 1.092, + "step": 16103 + }, + { + "epoch": 0.9784312534175831, + "grad_norm": 0.24711716175079346, + "learning_rate": 1.1609336139387639e-07, + "loss": 1.1335, + "step": 16104 + }, + { + "epoch": 0.9784920104502096, + "grad_norm": 0.1082901656627655, + "learning_rate": 1.1544231826258745e-07, + "loss": 1.0165, + "step": 16105 + }, + { + "epoch": 0.9785527674828361, + "grad_norm": 0.16817279160022736, + "learning_rate": 1.1479310365939966e-07, + "loss": 1.1308, + "step": 16106 + }, + { + "epoch": 0.9786135245154627, + "grad_norm": 0.13043826818466187, + "learning_rate": 1.1414571760811621e-07, + "loss": 1.096, + "step": 16107 + }, + { + "epoch": 0.9786742815480892, + "grad_norm": 0.14298072457313538, + "learning_rate": 1.1350016013246256e-07, + "loss": 1.0753, + "step": 16108 + }, + { + "epoch": 0.9787350385807158, + "grad_norm": 0.2213154435157776, + "learning_rate": 1.1285643125610867e-07, + "loss": 1.1939, + "step": 16109 + }, + { + "epoch": 0.9787957956133423, + "grad_norm": 0.1052158921957016, + "learning_rate": 1.1221453100264123e-07, + "loss": 1.0351, + "step": 16110 + }, + { + "epoch": 0.9788565526459688, + "grad_norm": 0.1351722925901413, + "learning_rate": 1.1157445939559141e-07, + "loss": 1.1005, + "step": 16111 + }, + { + "epoch": 0.9789173096785952, + "grad_norm": 0.12105369567871094, + "learning_rate": 1.1093621645842378e-07, + "loss": 1.0058, + "step": 16112 + }, + { + "epoch": 0.9789780667112218, + "grad_norm": 0.11711379885673523, + "learning_rate": 1.1029980221453073e-07, + "loss": 1.0504, + "step": 16113 + }, + { + "epoch": 0.9790388237438483, + "grad_norm": 0.12315700203180313, + "learning_rate": 1.0966521668724361e-07, + "loss": 1.0017, + "step": 16114 + }, + { + "epoch": 0.9790995807764749, + "grad_norm": 0.10018137097358704, + "learning_rate": 1.0903245989982714e-07, + "loss": 1.024, + "step": 16115 + }, + { + "epoch": 0.9791603378091014, + "grad_norm": 0.3580493628978729, + "learning_rate": 1.0840153187546276e-07, + "loss": 1.1111, + "step": 16116 + }, + { + "epoch": 0.9792210948417279, + "grad_norm": 0.11206480860710144, + "learning_rate": 1.0777243263728754e-07, + "loss": 1.0214, + "step": 16117 + }, + { + "epoch": 0.9792818518743545, + "grad_norm": 0.137099027633667, + "learning_rate": 1.0714516220835525e-07, + "loss": 1.0701, + "step": 16118 + }, + { + "epoch": 0.979342608906981, + "grad_norm": 0.187575101852417, + "learning_rate": 1.0651972061166415e-07, + "loss": 1.1005, + "step": 16119 + }, + { + "epoch": 0.9794033659396075, + "grad_norm": 0.16693246364593506, + "learning_rate": 1.0589610787013482e-07, + "loss": 1.1508, + "step": 16120 + }, + { + "epoch": 0.9794641229722341, + "grad_norm": 0.14592605829238892, + "learning_rate": 1.0527432400662674e-07, + "loss": 1.0543, + "step": 16121 + }, + { + "epoch": 0.9795248800048606, + "grad_norm": 0.1340041160583496, + "learning_rate": 1.0465436904393277e-07, + "loss": 1.0305, + "step": 16122 + }, + { + "epoch": 0.9795856370374871, + "grad_norm": 0.25268298387527466, + "learning_rate": 1.0403624300477921e-07, + "loss": 1.1417, + "step": 16123 + }, + { + "epoch": 0.9796463940701137, + "grad_norm": 0.16122040152549744, + "learning_rate": 1.0341994591182014e-07, + "loss": 1.0411, + "step": 16124 + }, + { + "epoch": 0.9797071511027401, + "grad_norm": 0.2182396501302719, + "learning_rate": 1.028054777876486e-07, + "loss": 1.0628, + "step": 16125 + }, + { + "epoch": 0.9797679081353666, + "grad_norm": 0.09938058257102966, + "learning_rate": 1.0219283865479101e-07, + "loss": 1.055, + "step": 16126 + }, + { + "epoch": 0.9798286651679932, + "grad_norm": 0.0970664769411087, + "learning_rate": 1.015820285356961e-07, + "loss": 1.0117, + "step": 16127 + }, + { + "epoch": 0.9798894222006197, + "grad_norm": 0.24902427196502686, + "learning_rate": 1.0097304745275704e-07, + "loss": 1.1219, + "step": 16128 + }, + { + "epoch": 0.9799501792332462, + "grad_norm": 0.1190854012966156, + "learning_rate": 1.0036589542829488e-07, + "loss": 1.0786, + "step": 16129 + }, + { + "epoch": 0.9800109362658728, + "grad_norm": 0.09834282100200653, + "learning_rate": 9.97605724845696e-08, + "loss": 1.0342, + "step": 16130 + }, + { + "epoch": 0.9800716932984993, + "grad_norm": 0.15337973833084106, + "learning_rate": 9.915707864376345e-08, + "loss": 1.1267, + "step": 16131 + }, + { + "epoch": 0.9801324503311258, + "grad_norm": 0.10228081047534943, + "learning_rate": 9.855541392800316e-08, + "loss": 1.0664, + "step": 16132 + }, + { + "epoch": 0.9801932073637524, + "grad_norm": 0.11617690324783325, + "learning_rate": 9.795557835933778e-08, + "loss": 1.0676, + "step": 16133 + }, + { + "epoch": 0.9802539643963789, + "grad_norm": 0.10063482820987701, + "learning_rate": 9.735757195975525e-08, + "loss": 1.0431, + "step": 16134 + }, + { + "epoch": 0.9803147214290054, + "grad_norm": 0.10332181304693222, + "learning_rate": 9.676139475117696e-08, + "loss": 1.0398, + "step": 16135 + }, + { + "epoch": 0.980375478461632, + "grad_norm": 0.3813704550266266, + "learning_rate": 9.616704675545762e-08, + "loss": 1.0076, + "step": 16136 + }, + { + "epoch": 0.9804362354942585, + "grad_norm": 0.19592706859111786, + "learning_rate": 9.557452799437982e-08, + "loss": 1.113, + "step": 16137 + }, + { + "epoch": 0.9804969925268849, + "grad_norm": 0.11222069710493088, + "learning_rate": 9.498383848966508e-08, + "loss": 1.013, + "step": 16138 + }, + { + "epoch": 0.9805577495595115, + "grad_norm": 0.22417768836021423, + "learning_rate": 9.439497826296828e-08, + "loss": 1.0343, + "step": 16139 + }, + { + "epoch": 0.980618506592138, + "grad_norm": 0.12241493910551071, + "learning_rate": 9.380794733586107e-08, + "loss": 1.0776, + "step": 16140 + }, + { + "epoch": 0.9806792636247645, + "grad_norm": 0.14164398610591888, + "learning_rate": 9.322274572987621e-08, + "loss": 1.0949, + "step": 16141 + }, + { + "epoch": 0.9807400206573911, + "grad_norm": 0.11311923712491989, + "learning_rate": 9.263937346645769e-08, + "loss": 1.0523, + "step": 16142 + }, + { + "epoch": 0.9808007776900176, + "grad_norm": 0.15364493429660797, + "learning_rate": 9.205783056698835e-08, + "loss": 1.0641, + "step": 16143 + }, + { + "epoch": 0.9808615347226441, + "grad_norm": 0.17510327696800232, + "learning_rate": 9.147811705279008e-08, + "loss": 1.1533, + "step": 16144 + }, + { + "epoch": 0.9809222917552707, + "grad_norm": 0.17928443849086761, + "learning_rate": 9.090023294510697e-08, + "loss": 1.129, + "step": 16145 + }, + { + "epoch": 0.9809830487878972, + "grad_norm": 0.13306649029254913, + "learning_rate": 9.032417826512207e-08, + "loss": 1.0595, + "step": 16146 + }, + { + "epoch": 0.9810438058205238, + "grad_norm": 0.09092667698860168, + "learning_rate": 8.974995303395184e-08, + "loss": 1.0479, + "step": 16147 + }, + { + "epoch": 0.9811045628531503, + "grad_norm": 0.11474576592445374, + "learning_rate": 8.917755727264054e-08, + "loss": 1.0573, + "step": 16148 + }, + { + "epoch": 0.9811653198857768, + "grad_norm": 0.12943261861801147, + "learning_rate": 8.860699100217696e-08, + "loss": 1.028, + "step": 16149 + }, + { + "epoch": 0.9812260769184034, + "grad_norm": 0.10790281742811203, + "learning_rate": 8.803825424347212e-08, + "loss": 0.9781, + "step": 16150 + }, + { + "epoch": 0.9812868339510298, + "grad_norm": 0.1185181587934494, + "learning_rate": 8.747134701737602e-08, + "loss": 1.0507, + "step": 16151 + }, + { + "epoch": 0.9813475909836563, + "grad_norm": 0.10440313816070557, + "learning_rate": 8.690626934466096e-08, + "loss": 1.0813, + "step": 16152 + }, + { + "epoch": 0.9814083480162828, + "grad_norm": 0.11123882979154587, + "learning_rate": 8.634302124604366e-08, + "loss": 1.0761, + "step": 16153 + }, + { + "epoch": 0.9814691050489094, + "grad_norm": 1.1919429302215576, + "learning_rate": 8.578160274217428e-08, + "loss": 1.0604, + "step": 16154 + }, + { + "epoch": 0.9815298620815359, + "grad_norm": 0.1616741269826889, + "learning_rate": 8.522201385362527e-08, + "loss": 1.1023, + "step": 16155 + }, + { + "epoch": 0.9815906191141625, + "grad_norm": 0.2502225339412689, + "learning_rate": 8.466425460091909e-08, + "loss": 1.0837, + "step": 16156 + }, + { + "epoch": 0.981651376146789, + "grad_norm": 0.10444097965955734, + "learning_rate": 8.410832500448385e-08, + "loss": 1.004, + "step": 16157 + }, + { + "epoch": 0.9817121331794155, + "grad_norm": 0.15894034504890442, + "learning_rate": 8.355422508470878e-08, + "loss": 1.0874, + "step": 16158 + }, + { + "epoch": 0.9817728902120421, + "grad_norm": 0.14622347056865692, + "learning_rate": 8.300195486190543e-08, + "loss": 1.0506, + "step": 16159 + }, + { + "epoch": 0.9818336472446686, + "grad_norm": 0.34307006001472473, + "learning_rate": 8.245151435631316e-08, + "loss": 1.0173, + "step": 16160 + }, + { + "epoch": 0.9818944042772951, + "grad_norm": 0.10111016035079956, + "learning_rate": 8.190290358811026e-08, + "loss": 0.9942, + "step": 16161 + }, + { + "epoch": 0.9819551613099217, + "grad_norm": 0.194380521774292, + "learning_rate": 8.135612257740289e-08, + "loss": 1.0556, + "step": 16162 + }, + { + "epoch": 0.9820159183425482, + "grad_norm": 0.5943927764892578, + "learning_rate": 8.08111713442361e-08, + "loss": 1.0523, + "step": 16163 + }, + { + "epoch": 0.9820766753751746, + "grad_norm": 0.11505883932113647, + "learning_rate": 8.026804990858283e-08, + "loss": 1.0105, + "step": 16164 + }, + { + "epoch": 0.9821374324078012, + "grad_norm": 0.08244690299034119, + "learning_rate": 7.972675829036046e-08, + "loss": 0.9533, + "step": 16165 + }, + { + "epoch": 0.9821981894404277, + "grad_norm": 0.1624736487865448, + "learning_rate": 7.91872965093976e-08, + "loss": 1.1099, + "step": 16166 + }, + { + "epoch": 0.9822589464730542, + "grad_norm": 0.1681128591299057, + "learning_rate": 7.86496645854784e-08, + "loss": 1.0665, + "step": 16167 + }, + { + "epoch": 0.9823197035056808, + "grad_norm": 0.13631050288677216, + "learning_rate": 7.811386253829822e-08, + "loss": 1.0573, + "step": 16168 + }, + { + "epoch": 0.9823804605383073, + "grad_norm": 1.784241795539856, + "learning_rate": 7.757989038750801e-08, + "loss": 1.1413, + "step": 16169 + }, + { + "epoch": 0.9824412175709338, + "grad_norm": 0.14992661774158478, + "learning_rate": 7.704774815268101e-08, + "loss": 1.1459, + "step": 16170 + }, + { + "epoch": 0.9825019746035604, + "grad_norm": 0.20285074412822723, + "learning_rate": 7.651743585331273e-08, + "loss": 1.0521, + "step": 16171 + }, + { + "epoch": 0.9825627316361869, + "grad_norm": 0.19562220573425293, + "learning_rate": 7.598895350884871e-08, + "loss": 1.0629, + "step": 16172 + }, + { + "epoch": 0.9826234886688134, + "grad_norm": 4.75029993057251, + "learning_rate": 7.546230113866237e-08, + "loss": 1.1577, + "step": 16173 + }, + { + "epoch": 0.98268424570144, + "grad_norm": 0.15583325922489166, + "learning_rate": 7.493747876204937e-08, + "loss": 1.1043, + "step": 16174 + }, + { + "epoch": 0.9827450027340665, + "grad_norm": 0.1735037863254547, + "learning_rate": 7.441448639826099e-08, + "loss": 1.1572, + "step": 16175 + }, + { + "epoch": 0.982805759766693, + "grad_norm": 2.5373318195343018, + "learning_rate": 7.389332406645966e-08, + "loss": 1.0495, + "step": 16176 + }, + { + "epoch": 0.9828665167993195, + "grad_norm": 0.147075355052948, + "learning_rate": 7.337399178574678e-08, + "loss": 1.0395, + "step": 16177 + }, + { + "epoch": 0.982927273831946, + "grad_norm": 0.10163099318742752, + "learning_rate": 7.285648957515712e-08, + "loss": 1.0222, + "step": 16178 + }, + { + "epoch": 0.9829880308645725, + "grad_norm": 0.13326889276504517, + "learning_rate": 7.234081745366438e-08, + "loss": 1.0021, + "step": 16179 + }, + { + "epoch": 0.9830487878971991, + "grad_norm": 0.22920936346054077, + "learning_rate": 7.182697544017569e-08, + "loss": 1.0977, + "step": 16180 + }, + { + "epoch": 0.9831095449298256, + "grad_norm": 0.13143767416477203, + "learning_rate": 7.131496355351485e-08, + "loss": 1.232, + "step": 16181 + }, + { + "epoch": 0.9831703019624521, + "grad_norm": 0.11583894491195679, + "learning_rate": 7.080478181245576e-08, + "loss": 1.0703, + "step": 16182 + }, + { + "epoch": 0.9832310589950787, + "grad_norm": 0.15936227142810822, + "learning_rate": 7.02964302357001e-08, + "loss": 1.0567, + "step": 16183 + }, + { + "epoch": 0.9832918160277052, + "grad_norm": 0.20767661929130554, + "learning_rate": 6.978990884187742e-08, + "loss": 1.0982, + "step": 16184 + }, + { + "epoch": 0.9833525730603317, + "grad_norm": 0.1629217118024826, + "learning_rate": 6.928521764956176e-08, + "loss": 1.0403, + "step": 16185 + }, + { + "epoch": 0.9834133300929583, + "grad_norm": 0.11175232380628586, + "learning_rate": 6.878235667724386e-08, + "loss": 1.0727, + "step": 16186 + }, + { + "epoch": 0.9834740871255848, + "grad_norm": 0.21810823678970337, + "learning_rate": 6.828132594336456e-08, + "loss": 1.2005, + "step": 16187 + }, + { + "epoch": 0.9835348441582114, + "grad_norm": 0.14823606610298157, + "learning_rate": 6.778212546628138e-08, + "loss": 1.0498, + "step": 16188 + }, + { + "epoch": 0.9835956011908379, + "grad_norm": 0.11760478466749191, + "learning_rate": 6.728475526429634e-08, + "loss": 1.0549, + "step": 16189 + }, + { + "epoch": 0.9836563582234643, + "grad_norm": 0.1506522297859192, + "learning_rate": 6.67892153556393e-08, + "loss": 1.1545, + "step": 16190 + }, + { + "epoch": 0.9837171152560908, + "grad_norm": 0.6151314377784729, + "learning_rate": 6.629550575847354e-08, + "loss": 1.0127, + "step": 16191 + }, + { + "epoch": 0.9837778722887174, + "grad_norm": 0.10732372850179672, + "learning_rate": 6.580362649090122e-08, + "loss": 1.0032, + "step": 16192 + }, + { + "epoch": 0.9838386293213439, + "grad_norm": 0.17160214483737946, + "learning_rate": 6.531357757095235e-08, + "loss": 1.1107, + "step": 16193 + }, + { + "epoch": 0.9838993863539705, + "grad_norm": 0.15070758759975433, + "learning_rate": 6.482535901657927e-08, + "loss": 1.0009, + "step": 16194 + }, + { + "epoch": 0.983960143386597, + "grad_norm": 0.15262217819690704, + "learning_rate": 6.433897084568985e-08, + "loss": 1.0278, + "step": 16195 + }, + { + "epoch": 0.9840209004192235, + "grad_norm": 0.18083415925502777, + "learning_rate": 6.385441307610873e-08, + "loss": 1.0858, + "step": 16196 + }, + { + "epoch": 0.9840816574518501, + "grad_norm": 0.1496964991092682, + "learning_rate": 6.337168572559393e-08, + "loss": 1.1024, + "step": 16197 + }, + { + "epoch": 0.9841424144844766, + "grad_norm": 0.16877521574497223, + "learning_rate": 6.289078881184241e-08, + "loss": 1.1185, + "step": 16198 + }, + { + "epoch": 0.9842031715171031, + "grad_norm": 0.17392559349536896, + "learning_rate": 6.241172235247894e-08, + "loss": 1.0851, + "step": 16199 + }, + { + "epoch": 0.9842639285497297, + "grad_norm": 0.8473336696624756, + "learning_rate": 6.193448636506726e-08, + "loss": 1.0939, + "step": 16200 + }, + { + "epoch": 0.9843246855823562, + "grad_norm": 0.17369872331619263, + "learning_rate": 6.14590808671045e-08, + "loss": 1.0155, + "step": 16201 + }, + { + "epoch": 0.9843854426149827, + "grad_norm": 0.2819485068321228, + "learning_rate": 6.098550587601004e-08, + "loss": 1.1304, + "step": 16202 + }, + { + "epoch": 0.9844461996476093, + "grad_norm": 0.17543594539165497, + "learning_rate": 6.051376140914778e-08, + "loss": 1.076, + "step": 16203 + }, + { + "epoch": 0.9845069566802357, + "grad_norm": 0.12441477179527283, + "learning_rate": 6.004384748379832e-08, + "loss": 1.028, + "step": 16204 + }, + { + "epoch": 0.9845677137128622, + "grad_norm": 0.13013726472854614, + "learning_rate": 5.957576411720345e-08, + "loss": 1.0308, + "step": 16205 + }, + { + "epoch": 0.9846284707454888, + "grad_norm": 0.24910157918930054, + "learning_rate": 5.9109511326504994e-08, + "loss": 1.0836, + "step": 16206 + }, + { + "epoch": 0.9846892277781153, + "grad_norm": 0.1251731514930725, + "learning_rate": 5.864508912880595e-08, + "loss": 1.0699, + "step": 16207 + }, + { + "epoch": 0.9847499848107418, + "grad_norm": 0.269575834274292, + "learning_rate": 5.818249754112603e-08, + "loss": 1.0653, + "step": 16208 + }, + { + "epoch": 0.9848107418433684, + "grad_norm": 0.24665388464927673, + "learning_rate": 5.77217365804128e-08, + "loss": 1.0616, + "step": 16209 + }, + { + "epoch": 0.9848714988759949, + "grad_norm": 0.1910027265548706, + "learning_rate": 5.726280626356939e-08, + "loss": 1.1048, + "step": 16210 + }, + { + "epoch": 0.9849322559086214, + "grad_norm": 0.1944764405488968, + "learning_rate": 5.680570660741014e-08, + "loss": 1.2086, + "step": 16211 + }, + { + "epoch": 0.984993012941248, + "grad_norm": 0.2127663940191269, + "learning_rate": 5.6350437628688303e-08, + "loss": 1.161, + "step": 16212 + }, + { + "epoch": 0.9850537699738745, + "grad_norm": 0.12328021228313446, + "learning_rate": 5.589699934410164e-08, + "loss": 0.9736, + "step": 16213 + }, + { + "epoch": 0.985114527006501, + "grad_norm": 0.4507182836532593, + "learning_rate": 5.544539177025909e-08, + "loss": 1.1743, + "step": 16214 + }, + { + "epoch": 0.9851752840391276, + "grad_norm": 0.1443803310394287, + "learning_rate": 5.499561492371963e-08, + "loss": 1.1052, + "step": 16215 + }, + { + "epoch": 0.9852360410717541, + "grad_norm": 0.09980019181966782, + "learning_rate": 5.454766882097007e-08, + "loss": 1.0177, + "step": 16216 + }, + { + "epoch": 0.9852967981043805, + "grad_norm": 0.1433623880147934, + "learning_rate": 5.410155347843615e-08, + "loss": 1.1324, + "step": 16217 + }, + { + "epoch": 0.9853575551370071, + "grad_norm": 0.15100646018981934, + "learning_rate": 5.3657268912454815e-08, + "loss": 1.0952, + "step": 16218 + }, + { + "epoch": 0.9854183121696336, + "grad_norm": 0.10645327717065811, + "learning_rate": 5.321481513932969e-08, + "loss": 0.9999, + "step": 16219 + }, + { + "epoch": 0.9854790692022601, + "grad_norm": 0.2934175729751587, + "learning_rate": 5.277419217526447e-08, + "loss": 1.1087, + "step": 16220 + }, + { + "epoch": 0.9855398262348867, + "grad_norm": 0.4645816683769226, + "learning_rate": 5.233540003641291e-08, + "loss": 1.0195, + "step": 16221 + }, + { + "epoch": 0.9856005832675132, + "grad_norm": 0.1823212206363678, + "learning_rate": 5.189843873886768e-08, + "loss": 1.1439, + "step": 16222 + }, + { + "epoch": 0.9856613403001397, + "grad_norm": 1.103392243385315, + "learning_rate": 5.1463308298638216e-08, + "loss": 1.0375, + "step": 16223 + }, + { + "epoch": 0.9857220973327663, + "grad_norm": 0.12636372447013855, + "learning_rate": 5.1030008731678406e-08, + "loss": 1.0799, + "step": 16224 + }, + { + "epoch": 0.9857828543653928, + "grad_norm": 0.15905901789665222, + "learning_rate": 5.059854005386444e-08, + "loss": 1.0932, + "step": 16225 + }, + { + "epoch": 0.9858436113980193, + "grad_norm": 0.39884480834007263, + "learning_rate": 5.0168902281017003e-08, + "loss": 1.0032, + "step": 16226 + }, + { + "epoch": 0.9859043684306459, + "grad_norm": 0.17216506600379944, + "learning_rate": 4.974109542888461e-08, + "loss": 1.0447, + "step": 16227 + }, + { + "epoch": 0.9859651254632724, + "grad_norm": 0.1548287570476532, + "learning_rate": 4.931511951314915e-08, + "loss": 1.1883, + "step": 16228 + }, + { + "epoch": 0.986025882495899, + "grad_norm": 0.08968793600797653, + "learning_rate": 4.889097454942037e-08, + "loss": 0.9937, + "step": 16229 + }, + { + "epoch": 0.9860866395285254, + "grad_norm": 0.1462932825088501, + "learning_rate": 4.846866055324695e-08, + "loss": 1.0091, + "step": 16230 + }, + { + "epoch": 0.9861473965611519, + "grad_norm": 0.11029744893312454, + "learning_rate": 4.8048177540116476e-08, + "loss": 1.0655, + "step": 16231 + }, + { + "epoch": 0.9862081535937784, + "grad_norm": 0.2198856770992279, + "learning_rate": 4.7629525525427767e-08, + "loss": 1.1506, + "step": 16232 + }, + { + "epoch": 0.986268910626405, + "grad_norm": 0.1832173764705658, + "learning_rate": 4.721270452454074e-08, + "loss": 1.1574, + "step": 16233 + }, + { + "epoch": 0.9863296676590315, + "grad_norm": 0.09299392253160477, + "learning_rate": 4.679771455272653e-08, + "loss": 1.0169, + "step": 16234 + }, + { + "epoch": 0.986390424691658, + "grad_norm": 0.14054886996746063, + "learning_rate": 4.638455562519517e-08, + "loss": 1.0209, + "step": 16235 + }, + { + "epoch": 0.9864511817242846, + "grad_norm": 0.15262296795845032, + "learning_rate": 4.597322775709567e-08, + "loss": 1.0365, + "step": 16236 + }, + { + "epoch": 0.9865119387569111, + "grad_norm": 0.22205904126167297, + "learning_rate": 4.5563730963499306e-08, + "loss": 1.0497, + "step": 16237 + }, + { + "epoch": 0.9865726957895377, + "grad_norm": 0.09753791987895966, + "learning_rate": 4.515606525942739e-08, + "loss": 1.0243, + "step": 16238 + }, + { + "epoch": 0.9866334528221642, + "grad_norm": 0.3189934194087982, + "learning_rate": 4.475023065981243e-08, + "loss": 1.0419, + "step": 16239 + }, + { + "epoch": 0.9866942098547907, + "grad_norm": 0.09614891558885574, + "learning_rate": 4.434622717953141e-08, + "loss": 1.0347, + "step": 16240 + }, + { + "epoch": 0.9867549668874173, + "grad_norm": 0.5230992436408997, + "learning_rate": 4.394405483340025e-08, + "loss": 1.2699, + "step": 16241 + }, + { + "epoch": 0.9868157239200438, + "grad_norm": 0.18224404752254486, + "learning_rate": 4.3543713636151615e-08, + "loss": 1.1505, + "step": 16242 + }, + { + "epoch": 0.9868764809526702, + "grad_norm": 0.09472563117742538, + "learning_rate": 4.3145203602468207e-08, + "loss": 1.0057, + "step": 16243 + }, + { + "epoch": 0.9869372379852968, + "grad_norm": 0.15544719994068146, + "learning_rate": 4.274852474694946e-08, + "loss": 1.1309, + "step": 16244 + }, + { + "epoch": 0.9869979950179233, + "grad_norm": 0.2506932318210602, + "learning_rate": 4.235367708414484e-08, + "loss": 1.2998, + "step": 16245 + }, + { + "epoch": 0.9870587520505498, + "grad_norm": 6.877426624298096, + "learning_rate": 4.1960660628520554e-08, + "loss": 1.1083, + "step": 16246 + }, + { + "epoch": 0.9871195090831764, + "grad_norm": 0.6385260224342346, + "learning_rate": 4.15694753944873e-08, + "loss": 1.1019, + "step": 16247 + }, + { + "epoch": 0.9871802661158029, + "grad_norm": 0.10798175632953644, + "learning_rate": 4.118012139637806e-08, + "loss": 1.0023, + "step": 16248 + }, + { + "epoch": 0.9872410231484294, + "grad_norm": 0.1646491289138794, + "learning_rate": 4.079259864847029e-08, + "loss": 1.0992, + "step": 16249 + }, + { + "epoch": 0.987301780181056, + "grad_norm": 0.1321839541196823, + "learning_rate": 4.040690716496376e-08, + "loss": 1.0144, + "step": 16250 + }, + { + "epoch": 0.9873625372136825, + "grad_norm": 0.14921942353248596, + "learning_rate": 4.00230469600027e-08, + "loss": 1.0754, + "step": 16251 + }, + { + "epoch": 0.987423294246309, + "grad_norm": 0.16771191358566284, + "learning_rate": 3.9641018047653633e-08, + "loss": 1.0244, + "step": 16252 + }, + { + "epoch": 0.9874840512789356, + "grad_norm": 0.10679901391267776, + "learning_rate": 3.9260820441916477e-08, + "loss": 1.0182, + "step": 16253 + }, + { + "epoch": 0.9875448083115621, + "grad_norm": 0.16618993878364563, + "learning_rate": 3.8882454156735634e-08, + "loss": 1.0964, + "step": 16254 + }, + { + "epoch": 0.9876055653441886, + "grad_norm": 3.1540703773498535, + "learning_rate": 3.850591920597224e-08, + "loss": 1.0817, + "step": 16255 + }, + { + "epoch": 0.9876663223768151, + "grad_norm": 0.1641586869955063, + "learning_rate": 3.8131215603437467e-08, + "loss": 1.0985, + "step": 16256 + }, + { + "epoch": 0.9877270794094416, + "grad_norm": 0.1605738401412964, + "learning_rate": 3.775834336285367e-08, + "loss": 1.072, + "step": 16257 + }, + { + "epoch": 0.9877878364420681, + "grad_norm": 0.14778538048267365, + "learning_rate": 3.738730249789879e-08, + "loss": 1.1756, + "step": 16258 + }, + { + "epoch": 0.9878485934746947, + "grad_norm": 0.16662347316741943, + "learning_rate": 3.701809302216752e-08, + "loss": 1.1221, + "step": 16259 + }, + { + "epoch": 0.9879093505073212, + "grad_norm": 0.11811026930809021, + "learning_rate": 3.665071494919903e-08, + "loss": 1.04, + "step": 16260 + }, + { + "epoch": 0.9879701075399477, + "grad_norm": 0.14614927768707275, + "learning_rate": 3.6285168292454766e-08, + "loss": 1.0863, + "step": 16261 + }, + { + "epoch": 0.9880308645725743, + "grad_norm": 0.10567299276590347, + "learning_rate": 3.592145306533512e-08, + "loss": 1.0259, + "step": 16262 + }, + { + "epoch": 0.9880916216052008, + "grad_norm": 0.11595337837934494, + "learning_rate": 3.555956928117388e-08, + "loss": 0.975, + "step": 16263 + }, + { + "epoch": 0.9881523786378273, + "grad_norm": 0.106806680560112, + "learning_rate": 3.519951695323265e-08, + "loss": 1.0369, + "step": 16264 + }, + { + "epoch": 0.9882131356704539, + "grad_norm": 0.19005513191223145, + "learning_rate": 3.4841296094711984e-08, + "loss": 1.0304, + "step": 16265 + }, + { + "epoch": 0.9882738927030804, + "grad_norm": 0.236617311835289, + "learning_rate": 3.4484906718740276e-08, + "loss": 1.0201, + "step": 16266 + }, + { + "epoch": 0.988334649735707, + "grad_norm": 0.17361678183078766, + "learning_rate": 3.413034883838484e-08, + "loss": 1.1204, + "step": 16267 + }, + { + "epoch": 0.9883954067683335, + "grad_norm": 0.16652122139930725, + "learning_rate": 3.377762246663529e-08, + "loss": 1.1523, + "step": 16268 + }, + { + "epoch": 0.9884561638009599, + "grad_norm": 0.16175909340381622, + "learning_rate": 3.3426727616431286e-08, + "loss": 1.0317, + "step": 16269 + }, + { + "epoch": 0.9885169208335864, + "grad_norm": 0.09417491406202316, + "learning_rate": 3.3077664300623645e-08, + "loss": 1.0093, + "step": 16270 + }, + { + "epoch": 0.988577677866213, + "grad_norm": 0.12074661254882812, + "learning_rate": 3.273043253201324e-08, + "loss": 0.9837, + "step": 16271 + }, + { + "epoch": 0.9886384348988395, + "grad_norm": 0.12248991429805756, + "learning_rate": 3.238503232333434e-08, + "loss": 1.0563, + "step": 16272 + }, + { + "epoch": 0.988699191931466, + "grad_norm": 0.1273757815361023, + "learning_rate": 3.2041463687232374e-08, + "loss": 0.9905, + "step": 16273 + }, + { + "epoch": 0.9887599489640926, + "grad_norm": 0.2450380176305771, + "learning_rate": 3.169972663631393e-08, + "loss": 1.2046, + "step": 16274 + }, + { + "epoch": 0.9888207059967191, + "grad_norm": 0.3407484292984009, + "learning_rate": 3.135982118309677e-08, + "loss": 1.1031, + "step": 16275 + }, + { + "epoch": 0.9888814630293457, + "grad_norm": 0.11378134042024612, + "learning_rate": 3.10217473400487e-08, + "loss": 1.0558, + "step": 16276 + }, + { + "epoch": 0.9889422200619722, + "grad_norm": 0.11455726623535156, + "learning_rate": 3.068550511955426e-08, + "loss": 1.0424, + "step": 16277 + }, + { + "epoch": 0.9890029770945987, + "grad_norm": 0.10960632562637329, + "learning_rate": 3.035109453393692e-08, + "loss": 1.0592, + "step": 16278 + }, + { + "epoch": 0.9890637341272253, + "grad_norm": 0.10926475375890732, + "learning_rate": 3.0018515595464645e-08, + "loss": 1.0339, + "step": 16279 + }, + { + "epoch": 0.9891244911598518, + "grad_norm": 0.14063212275505066, + "learning_rate": 2.968776831632214e-08, + "loss": 1.0296, + "step": 16280 + }, + { + "epoch": 0.9891852481924783, + "grad_norm": 0.13234879076480865, + "learning_rate": 2.9358852708633032e-08, + "loss": 1.0527, + "step": 16281 + }, + { + "epoch": 0.9892460052251048, + "grad_norm": 0.1297687143087387, + "learning_rate": 2.903176878445435e-08, + "loss": 1.0492, + "step": 16282 + }, + { + "epoch": 0.9893067622577313, + "grad_norm": 0.301035612821579, + "learning_rate": 2.8706516555776497e-08, + "loss": 1.0035, + "step": 16283 + }, + { + "epoch": 0.9893675192903578, + "grad_norm": 0.11623239517211914, + "learning_rate": 2.838309603452327e-08, + "loss": 1.0135, + "step": 16284 + }, + { + "epoch": 0.9894282763229844, + "grad_norm": 0.5636290311813354, + "learning_rate": 2.806150723254075e-08, + "loss": 1.1364, + "step": 16285 + }, + { + "epoch": 0.9894890333556109, + "grad_norm": 0.14663462340831757, + "learning_rate": 2.7741750161625057e-08, + "loss": 1.0931, + "step": 16286 + }, + { + "epoch": 0.9895497903882374, + "grad_norm": 0.24398909509181976, + "learning_rate": 2.7423824833494593e-08, + "loss": 1.0783, + "step": 16287 + }, + { + "epoch": 0.989610547420864, + "grad_norm": 0.12663604319095612, + "learning_rate": 2.71077312598067e-08, + "loss": 1.0267, + "step": 16288 + }, + { + "epoch": 0.9896713044534905, + "grad_norm": 0.19128142297267914, + "learning_rate": 2.6793469452141007e-08, + "loss": 1.0423, + "step": 16289 + }, + { + "epoch": 0.989732061486117, + "grad_norm": 0.18244168162345886, + "learning_rate": 2.6481039422021626e-08, + "loss": 1.0873, + "step": 16290 + }, + { + "epoch": 0.9897928185187436, + "grad_norm": 0.18155498802661896, + "learning_rate": 2.6170441180900506e-08, + "loss": 1.1018, + "step": 16291 + }, + { + "epoch": 0.9898535755513701, + "grad_norm": 0.11443190276622772, + "learning_rate": 2.586167474015744e-08, + "loss": 1.0669, + "step": 16292 + }, + { + "epoch": 0.9899143325839966, + "grad_norm": 0.1291109025478363, + "learning_rate": 2.5554740111116693e-08, + "loss": 1.0626, + "step": 16293 + }, + { + "epoch": 0.9899750896166232, + "grad_norm": 0.11472375690937042, + "learning_rate": 2.5249637305030383e-08, + "loss": 1.0227, + "step": 16294 + }, + { + "epoch": 0.9900358466492496, + "grad_norm": 0.15321263670921326, + "learning_rate": 2.49463663330729e-08, + "loss": 1.0819, + "step": 16295 + }, + { + "epoch": 0.9900966036818761, + "grad_norm": 0.3910882771015167, + "learning_rate": 2.4644927206368685e-08, + "loss": 1.0588, + "step": 16296 + }, + { + "epoch": 0.9901573607145027, + "grad_norm": 0.20071378350257874, + "learning_rate": 2.434531993596445e-08, + "loss": 1.1416, + "step": 16297 + }, + { + "epoch": 0.9902181177471292, + "grad_norm": 0.7159701585769653, + "learning_rate": 2.4047544532845857e-08, + "loss": 1.1266, + "step": 16298 + }, + { + "epoch": 0.9902788747797557, + "grad_norm": 0.19287821650505066, + "learning_rate": 2.3751601007920842e-08, + "loss": 1.0753, + "step": 16299 + }, + { + "epoch": 0.9903396318123823, + "grad_norm": 0.12025300413370132, + "learning_rate": 2.345748937204184e-08, + "loss": 1.0383, + "step": 16300 + }, + { + "epoch": 0.9904003888450088, + "grad_norm": 0.19872963428497314, + "learning_rate": 2.3165209635989116e-08, + "loss": 1.0726, + "step": 16301 + }, + { + "epoch": 0.9904611458776353, + "grad_norm": 0.11132209002971649, + "learning_rate": 2.287476181047632e-08, + "loss": 1.0227, + "step": 16302 + }, + { + "epoch": 0.9905219029102619, + "grad_norm": 0.10098928958177567, + "learning_rate": 2.258614590615049e-08, + "loss": 1.0289, + "step": 16303 + }, + { + "epoch": 0.9905826599428884, + "grad_norm": 0.17251595854759216, + "learning_rate": 2.2299361933586505e-08, + "loss": 1.0801, + "step": 16304 + }, + { + "epoch": 0.990643416975515, + "grad_norm": 0.221372589468956, + "learning_rate": 2.2014409903303724e-08, + "loss": 1.1968, + "step": 16305 + }, + { + "epoch": 0.9907041740081415, + "grad_norm": 0.15848013758659363, + "learning_rate": 2.1731289825738245e-08, + "loss": 1.0452, + "step": 16306 + }, + { + "epoch": 0.990764931040768, + "grad_norm": 0.5078273415565491, + "learning_rate": 2.14500017112762e-08, + "loss": 1.0953, + "step": 16307 + }, + { + "epoch": 0.9908256880733946, + "grad_norm": 0.09761633723974228, + "learning_rate": 2.117054557022602e-08, + "loss": 1.0434, + "step": 16308 + }, + { + "epoch": 0.990886445106021, + "grad_norm": 0.1328747570514679, + "learning_rate": 2.0892921412829504e-08, + "loss": 1.0404, + "step": 16309 + }, + { + "epoch": 0.9909472021386475, + "grad_norm": 0.1075906828045845, + "learning_rate": 2.0617129249261845e-08, + "loss": 1.0375, + "step": 16310 + }, + { + "epoch": 0.991007959171274, + "grad_norm": 0.13317234814167023, + "learning_rate": 2.0343169089637182e-08, + "loss": 1.0308, + "step": 16311 + }, + { + "epoch": 0.9910687162039006, + "grad_norm": 0.09540808945894241, + "learning_rate": 2.0071040943991926e-08, + "loss": 1.0466, + "step": 16312 + }, + { + "epoch": 0.9911294732365271, + "grad_norm": 0.09854982048273087, + "learning_rate": 1.980074482230698e-08, + "loss": 1.0191, + "step": 16313 + }, + { + "epoch": 0.9911902302691536, + "grad_norm": 0.2078581005334854, + "learning_rate": 1.953228073447999e-08, + "loss": 1.063, + "step": 16314 + }, + { + "epoch": 0.9912509873017802, + "grad_norm": 0.11335760354995728, + "learning_rate": 1.926564869036418e-08, + "loss": 1.0425, + "step": 16315 + }, + { + "epoch": 0.9913117443344067, + "grad_norm": 0.42150208353996277, + "learning_rate": 1.9000848699723962e-08, + "loss": 1.1199, + "step": 16316 + }, + { + "epoch": 0.9913725013670333, + "grad_norm": 0.12567904591560364, + "learning_rate": 1.8737880772273786e-08, + "loss": 1.0938, + "step": 16317 + }, + { + "epoch": 0.9914332583996598, + "grad_norm": 0.17655733227729797, + "learning_rate": 1.847674491763929e-08, + "loss": 1.1089, + "step": 16318 + }, + { + "epoch": 0.9914940154322863, + "grad_norm": 0.19671276211738586, + "learning_rate": 1.8217441145407243e-08, + "loss": 1.1662, + "step": 16319 + }, + { + "epoch": 0.9915547724649129, + "grad_norm": 0.24333301186561584, + "learning_rate": 1.795996946507561e-08, + "loss": 1.0904, + "step": 16320 + }, + { + "epoch": 0.9916155294975394, + "grad_norm": 0.15849147737026215, + "learning_rate": 1.7704329886081282e-08, + "loss": 1.0899, + "step": 16321 + }, + { + "epoch": 0.9916762865301658, + "grad_norm": 0.10567186027765274, + "learning_rate": 1.7450522417794546e-08, + "loss": 1.0363, + "step": 16322 + }, + { + "epoch": 0.9917370435627924, + "grad_norm": 0.23922015726566315, + "learning_rate": 1.719854706952462e-08, + "loss": 1.1539, + "step": 16323 + }, + { + "epoch": 0.9917978005954189, + "grad_norm": 0.9320358037948608, + "learning_rate": 1.6948403850503003e-08, + "loss": 1.0789, + "step": 16324 + }, + { + "epoch": 0.9918585576280454, + "grad_norm": 0.24150465428829193, + "learning_rate": 1.6700092769894593e-08, + "loss": 1.0223, + "step": 16325 + }, + { + "epoch": 0.991919314660672, + "grad_norm": 0.13452938199043274, + "learning_rate": 1.645361383680877e-08, + "loss": 0.9891, + "step": 16326 + }, + { + "epoch": 0.9919800716932985, + "grad_norm": 0.12692850828170776, + "learning_rate": 1.6208967060277193e-08, + "loss": 1.043, + "step": 16327 + }, + { + "epoch": 0.992040828725925, + "grad_norm": 0.1624302864074707, + "learning_rate": 1.5966152449264916e-08, + "loss": 1.1034, + "step": 16328 + }, + { + "epoch": 0.9921015857585516, + "grad_norm": 0.1466129571199417, + "learning_rate": 1.5725170012681478e-08, + "loss": 1.313, + "step": 16329 + }, + { + "epoch": 0.9921623427911781, + "grad_norm": 0.20591171085834503, + "learning_rate": 1.5486019759347603e-08, + "loss": 1.1587, + "step": 16330 + }, + { + "epoch": 0.9922230998238046, + "grad_norm": 0.10770957916975021, + "learning_rate": 1.5248701698039604e-08, + "loss": 1.0059, + "step": 16331 + }, + { + "epoch": 0.9922838568564312, + "grad_norm": 0.16154597699642181, + "learning_rate": 1.5013215837450522e-08, + "loss": 1.0685, + "step": 16332 + }, + { + "epoch": 0.9923446138890577, + "grad_norm": 0.18607288599014282, + "learning_rate": 1.4779562186212348e-08, + "loss": 1.1741, + "step": 16333 + }, + { + "epoch": 0.9924053709216842, + "grad_norm": 0.133961021900177, + "learning_rate": 1.4547740752890449e-08, + "loss": 1.0493, + "step": 16334 + }, + { + "epoch": 0.9924661279543107, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.4317751545983582e-08, + "loss": 1.0907, + "step": 16335 + }, + { + "epoch": 0.9925268849869372, + "grad_norm": 0.11344313621520996, + "learning_rate": 1.4089594573923893e-08, + "loss": 1.0242, + "step": 16336 + }, + { + "epoch": 0.9925876420195637, + "grad_norm": 0.14299818873405457, + "learning_rate": 1.3863269845071359e-08, + "loss": 1.1523, + "step": 16337 + }, + { + "epoch": 0.9926483990521903, + "grad_norm": 0.12203660607337952, + "learning_rate": 1.3638777367724897e-08, + "loss": 1.1053, + "step": 16338 + }, + { + "epoch": 0.9927091560848168, + "grad_norm": 0.39186519384384155, + "learning_rate": 1.3416117150105711e-08, + "loss": 1.1477, + "step": 16339 + }, + { + "epoch": 0.9927699131174433, + "grad_norm": 0.1688614934682846, + "learning_rate": 1.3195289200385042e-08, + "loss": 1.1292, + "step": 16340 + }, + { + "epoch": 0.9928306701500699, + "grad_norm": 0.1663534939289093, + "learning_rate": 1.2976293526656414e-08, + "loss": 1.1446, + "step": 16341 + }, + { + "epoch": 0.9928914271826964, + "grad_norm": 0.1755792796611786, + "learning_rate": 1.2759130136935638e-08, + "loss": 1.0967, + "step": 16342 + }, + { + "epoch": 0.9929521842153229, + "grad_norm": 0.10801321268081665, + "learning_rate": 1.2543799039199666e-08, + "loss": 1.0146, + "step": 16343 + }, + { + "epoch": 0.9930129412479495, + "grad_norm": 0.339424729347229, + "learning_rate": 1.2330300241325531e-08, + "loss": 1.08, + "step": 16344 + }, + { + "epoch": 0.993073698280576, + "grad_norm": 0.11286205053329468, + "learning_rate": 1.211863375115141e-08, + "loss": 1.0761, + "step": 16345 + }, + { + "epoch": 0.9931344553132025, + "grad_norm": 0.1963888704776764, + "learning_rate": 1.1908799576432207e-08, + "loss": 1.0592, + "step": 16346 + }, + { + "epoch": 0.9931952123458291, + "grad_norm": 0.17381581664085388, + "learning_rate": 1.1700797724856217e-08, + "loss": 1.1608, + "step": 16347 + }, + { + "epoch": 0.9932559693784555, + "grad_norm": 0.16749228537082672, + "learning_rate": 1.1494628204050672e-08, + "loss": 1.0566, + "step": 16348 + }, + { + "epoch": 0.993316726411082, + "grad_norm": 0.10370411723852158, + "learning_rate": 1.1290291021570643e-08, + "loss": 0.9783, + "step": 16349 + }, + { + "epoch": 0.9933774834437086, + "grad_norm": 0.15005536377429962, + "learning_rate": 1.1087786184910132e-08, + "loss": 1.0797, + "step": 16350 + }, + { + "epoch": 0.9934382404763351, + "grad_norm": 0.4708842933177948, + "learning_rate": 1.088711370148543e-08, + "loss": 1.1091, + "step": 16351 + }, + { + "epoch": 0.9934989975089616, + "grad_norm": 0.09188425540924072, + "learning_rate": 1.0688273578657315e-08, + "loss": 1.0199, + "step": 16352 + }, + { + "epoch": 0.9935597545415882, + "grad_norm": 0.18060752749443054, + "learning_rate": 1.0491265823714403e-08, + "loss": 1.0648, + "step": 16353 + }, + { + "epoch": 0.9936205115742147, + "grad_norm": 0.18418973684310913, + "learning_rate": 1.0296090443878691e-08, + "loss": 1.0846, + "step": 16354 + }, + { + "epoch": 0.9936812686068413, + "grad_norm": 0.13063186407089233, + "learning_rate": 1.010274744630002e-08, + "loss": 1.0257, + "step": 16355 + }, + { + "epoch": 0.9937420256394678, + "grad_norm": 0.14973171055316925, + "learning_rate": 9.911236838067162e-09, + "loss": 1.1194, + "step": 16356 + }, + { + "epoch": 0.9938027826720943, + "grad_norm": 0.10031884908676147, + "learning_rate": 9.721558626207827e-09, + "loss": 1.0268, + "step": 16357 + }, + { + "epoch": 0.9938635397047209, + "grad_norm": 1.4674253463745117, + "learning_rate": 9.533712817660912e-09, + "loss": 1.2043, + "step": 16358 + }, + { + "epoch": 0.9939242967373474, + "grad_norm": 0.1417066901922226, + "learning_rate": 9.347699419320898e-09, + "loss": 1.0582, + "step": 16359 + }, + { + "epoch": 0.9939850537699739, + "grad_norm": 0.1254492700099945, + "learning_rate": 9.163518437999009e-09, + "loss": 1.0635, + "step": 16360 + }, + { + "epoch": 0.9940458108026003, + "grad_norm": 0.1302124261856079, + "learning_rate": 8.981169880456497e-09, + "loss": 1.0724, + "step": 16361 + }, + { + "epoch": 0.9941065678352269, + "grad_norm": 0.15401579439640045, + "learning_rate": 8.800653753371357e-09, + "loss": 1.0717, + "step": 16362 + }, + { + "epoch": 0.9941673248678534, + "grad_norm": 0.2836470901966095, + "learning_rate": 8.621970063360518e-09, + "loss": 1.091, + "step": 16363 + }, + { + "epoch": 0.99422808190048, + "grad_norm": 0.17619681358337402, + "learning_rate": 8.445118816979846e-09, + "loss": 1.0607, + "step": 16364 + }, + { + "epoch": 0.9942888389331065, + "grad_norm": 0.1280762106180191, + "learning_rate": 8.270100020701943e-09, + "loss": 1.0516, + "step": 16365 + }, + { + "epoch": 0.994349595965733, + "grad_norm": 0.1501254439353943, + "learning_rate": 8.096913680943896e-09, + "loss": 1.0818, + "step": 16366 + }, + { + "epoch": 0.9944103529983596, + "grad_norm": 0.10833926498889923, + "learning_rate": 7.925559804061733e-09, + "loss": 1.0294, + "step": 16367 + }, + { + "epoch": 0.9944711100309861, + "grad_norm": 0.14364440739154816, + "learning_rate": 7.756038396328214e-09, + "loss": 1.0696, + "step": 16368 + }, + { + "epoch": 0.9945318670636126, + "grad_norm": 0.1562419831752777, + "learning_rate": 7.588349463960587e-09, + "loss": 1.1501, + "step": 16369 + }, + { + "epoch": 0.9945926240962392, + "grad_norm": 0.16588133573532104, + "learning_rate": 7.422493013103937e-09, + "loss": 1.0214, + "step": 16370 + }, + { + "epoch": 0.9946533811288657, + "grad_norm": 0.11225719749927521, + "learning_rate": 7.258469049842287e-09, + "loss": 1.0363, + "step": 16371 + }, + { + "epoch": 0.9947141381614922, + "grad_norm": 0.21635957062244415, + "learning_rate": 7.0962775801874935e-09, + "loss": 1.1032, + "step": 16372 + }, + { + "epoch": 0.9947748951941188, + "grad_norm": 0.30810874700546265, + "learning_rate": 6.9359186100737e-09, + "loss": 1.0764, + "step": 16373 + }, + { + "epoch": 0.9948356522267452, + "grad_norm": 0.09591729193925858, + "learning_rate": 6.777392145396188e-09, + "loss": 1.0214, + "step": 16374 + }, + { + "epoch": 0.9948964092593717, + "grad_norm": 0.11895853281021118, + "learning_rate": 6.620698191950325e-09, + "loss": 0.9957, + "step": 16375 + }, + { + "epoch": 0.9949571662919983, + "grad_norm": 0.1277838796377182, + "learning_rate": 6.4658367554870644e-09, + "loss": 1.0188, + "step": 16376 + }, + { + "epoch": 0.9950179233246248, + "grad_norm": 1.3013290166854858, + "learning_rate": 6.312807841685198e-09, + "loss": 1.1323, + "step": 16377 + }, + { + "epoch": 0.9950786803572513, + "grad_norm": 12.565542221069336, + "learning_rate": 6.16161145615135e-09, + "loss": 1.053, + "step": 16378 + }, + { + "epoch": 0.9951394373898779, + "grad_norm": 0.12348482757806778, + "learning_rate": 6.012247604425536e-09, + "loss": 1.0495, + "step": 16379 + }, + { + "epoch": 0.9952001944225044, + "grad_norm": 0.16675661504268646, + "learning_rate": 5.864716291981154e-09, + "loss": 1.1276, + "step": 16380 + }, + { + "epoch": 0.9952609514551309, + "grad_norm": 0.1771889328956604, + "learning_rate": 5.719017524236092e-09, + "loss": 1.1399, + "step": 16381 + }, + { + "epoch": 0.9953217084877575, + "grad_norm": 0.1173153892159462, + "learning_rate": 5.575151306519421e-09, + "loss": 1.0114, + "step": 16382 + }, + { + "epoch": 0.995382465520384, + "grad_norm": 3.2511708736419678, + "learning_rate": 5.433117644110252e-09, + "loss": 1.0745, + "step": 16383 + }, + { + "epoch": 0.9954432225530105, + "grad_norm": 0.10906144231557846, + "learning_rate": 5.29291654220998e-09, + "loss": 1.0691, + "step": 16384 + }, + { + "epoch": 0.9955039795856371, + "grad_norm": 0.2170865833759308, + "learning_rate": 5.154548005964488e-09, + "loss": 1.0627, + "step": 16385 + }, + { + "epoch": 0.9955647366182636, + "grad_norm": 0.18898631632328033, + "learning_rate": 5.018012040447495e-09, + "loss": 1.0766, + "step": 16386 + }, + { + "epoch": 0.99562549365089, + "grad_norm": 0.12484834343194962, + "learning_rate": 4.883308650655005e-09, + "loss": 1.0526, + "step": 16387 + }, + { + "epoch": 0.9956862506835166, + "grad_norm": 0.1729438453912735, + "learning_rate": 4.750437841527511e-09, + "loss": 1.0582, + "step": 16388 + }, + { + "epoch": 0.9957470077161431, + "grad_norm": 0.16722114384174347, + "learning_rate": 4.619399617938891e-09, + "loss": 1.0758, + "step": 16389 + }, + { + "epoch": 0.9958077647487696, + "grad_norm": 0.768099844455719, + "learning_rate": 4.49019398469086e-09, + "loss": 1.2121, + "step": 16390 + }, + { + "epoch": 0.9958685217813962, + "grad_norm": 0.1186041310429573, + "learning_rate": 4.362820946512969e-09, + "loss": 1.0011, + "step": 16391 + }, + { + "epoch": 0.9959292788140227, + "grad_norm": 0.18163001537322998, + "learning_rate": 4.2372805080848065e-09, + "loss": 1.0594, + "step": 16392 + }, + { + "epoch": 0.9959900358466492, + "grad_norm": 0.15095829963684082, + "learning_rate": 4.113572674002697e-09, + "loss": 1.0616, + "step": 16393 + }, + { + "epoch": 0.9960507928792758, + "grad_norm": 0.4118235409259796, + "learning_rate": 3.991697448801901e-09, + "loss": 1.0738, + "step": 16394 + }, + { + "epoch": 0.9961115499119023, + "grad_norm": 0.3481995761394501, + "learning_rate": 3.871654836951066e-09, + "loss": 1.0326, + "step": 16395 + }, + { + "epoch": 0.9961723069445289, + "grad_norm": 0.09429150074720383, + "learning_rate": 3.753444842841125e-09, + "loss": 0.9885, + "step": 16396 + }, + { + "epoch": 0.9962330639771554, + "grad_norm": 0.09320554882287979, + "learning_rate": 3.6370674708186005e-09, + "loss": 1.0453, + "step": 16397 + }, + { + "epoch": 0.9962938210097819, + "grad_norm": 0.1572645604610443, + "learning_rate": 3.5225227251467484e-09, + "loss": 1.0304, + "step": 16398 + }, + { + "epoch": 0.9963545780424085, + "grad_norm": 0.0966515839099884, + "learning_rate": 3.4098106100166618e-09, + "loss": 1.0741, + "step": 16399 + }, + { + "epoch": 0.9964153350750349, + "grad_norm": 0.2818453311920166, + "learning_rate": 3.29893112956392e-09, + "loss": 1.0589, + "step": 16400 + }, + { + "epoch": 0.9964760921076614, + "grad_norm": 0.09154333174228668, + "learning_rate": 3.189884287851941e-09, + "loss": 0.9976, + "step": 16401 + }, + { + "epoch": 0.996536849140288, + "grad_norm": 0.09504634886980057, + "learning_rate": 3.0826700888830773e-09, + "loss": 1.0373, + "step": 16402 + }, + { + "epoch": 0.9965976061729145, + "grad_norm": 0.09509482234716415, + "learning_rate": 2.9772885365764168e-09, + "loss": 1.0066, + "step": 16403 + }, + { + "epoch": 0.996658363205541, + "grad_norm": 0.2784222960472107, + "learning_rate": 2.873739634806638e-09, + "loss": 1.1793, + "step": 16404 + }, + { + "epoch": 0.9967191202381676, + "grad_norm": 0.1504630595445633, + "learning_rate": 2.7720233873651523e-09, + "loss": 1.1545, + "step": 16405 + }, + { + "epoch": 0.9967798772707941, + "grad_norm": 0.17525725066661835, + "learning_rate": 2.672139797976758e-09, + "loss": 1.0201, + "step": 16406 + }, + { + "epoch": 0.9968406343034206, + "grad_norm": 0.13617132604122162, + "learning_rate": 2.5740888703051915e-09, + "loss": 1.0824, + "step": 16407 + }, + { + "epoch": 0.9969013913360472, + "grad_norm": 0.08649192005395889, + "learning_rate": 2.4778706079475745e-09, + "loss": 0.9728, + "step": 16408 + }, + { + "epoch": 0.9969621483686737, + "grad_norm": 0.15265050530433655, + "learning_rate": 2.3834850144233143e-09, + "loss": 1.0927, + "step": 16409 + }, + { + "epoch": 0.9970229054013002, + "grad_norm": 0.12428773194551468, + "learning_rate": 2.2909320932018586e-09, + "loss": 1.0532, + "step": 16410 + }, + { + "epoch": 0.9970836624339268, + "grad_norm": 0.1048312559723854, + "learning_rate": 2.2002118476638355e-09, + "loss": 1.0563, + "step": 16411 + }, + { + "epoch": 0.9971444194665533, + "grad_norm": 0.14031675457954407, + "learning_rate": 2.111324281145466e-09, + "loss": 1.0772, + "step": 16412 + }, + { + "epoch": 0.9972051764991798, + "grad_norm": 0.1768234372138977, + "learning_rate": 2.024269396899703e-09, + "loss": 1.0697, + "step": 16413 + }, + { + "epoch": 0.9972659335318063, + "grad_norm": 0.16252635419368744, + "learning_rate": 1.939047198123989e-09, + "loss": 1.1371, + "step": 16414 + }, + { + "epoch": 0.9973266905644328, + "grad_norm": 0.22938312590122223, + "learning_rate": 1.8556576879324993e-09, + "loss": 1.109, + "step": 16415 + }, + { + "epoch": 0.9973874475970593, + "grad_norm": 0.1136816069483757, + "learning_rate": 1.774100869383899e-09, + "loss": 0.9748, + "step": 16416 + }, + { + "epoch": 0.9974482046296859, + "grad_norm": 0.09893430024385452, + "learning_rate": 1.6943767454757897e-09, + "loss": 1.0262, + "step": 16417 + }, + { + "epoch": 0.9975089616623124, + "grad_norm": 0.12094129621982574, + "learning_rate": 1.616485319122507e-09, + "loss": 1.0346, + "step": 16418 + }, + { + "epoch": 0.9975697186949389, + "grad_norm": 0.12108869105577469, + "learning_rate": 1.5404265931773243e-09, + "loss": 1.0142, + "step": 16419 + }, + { + "epoch": 0.9976304757275655, + "grad_norm": 0.10918483138084412, + "learning_rate": 1.4662005704380033e-09, + "loss": 1.0577, + "step": 16420 + }, + { + "epoch": 0.997691232760192, + "grad_norm": 0.1524430811405182, + "learning_rate": 1.3938072536134884e-09, + "loss": 1.0922, + "step": 16421 + }, + { + "epoch": 0.9977519897928185, + "grad_norm": 0.1589829921722412, + "learning_rate": 1.3232466453683146e-09, + "loss": 1.1393, + "step": 16422 + }, + { + "epoch": 0.9978127468254451, + "grad_norm": 0.16543225944042206, + "learning_rate": 1.2545187482837505e-09, + "loss": 1.095, + "step": 16423 + }, + { + "epoch": 0.9978735038580716, + "grad_norm": 0.21794280409812927, + "learning_rate": 1.1876235648800027e-09, + "loss": 1.1503, + "step": 16424 + }, + { + "epoch": 0.9979342608906981, + "grad_norm": 0.12918835878372192, + "learning_rate": 1.1225610976051126e-09, + "loss": 1.05, + "step": 16425 + }, + { + "epoch": 0.9979950179233247, + "grad_norm": 0.1320475935935974, + "learning_rate": 1.059331348851611e-09, + "loss": 1.0702, + "step": 16426 + }, + { + "epoch": 0.9980557749559511, + "grad_norm": 0.10265633463859558, + "learning_rate": 9.979343209287618e-10, + "loss": 1.0249, + "step": 16427 + }, + { + "epoch": 0.9981165319885776, + "grad_norm": 0.1497434675693512, + "learning_rate": 9.383700160958686e-10, + "loss": 1.1647, + "step": 16428 + }, + { + "epoch": 0.9981772890212042, + "grad_norm": 0.14027445018291473, + "learning_rate": 8.806384365289688e-10, + "loss": 1.0638, + "step": 16429 + }, + { + "epoch": 0.9982380460538307, + "grad_norm": 0.20440469682216644, + "learning_rate": 8.247395843430372e-10, + "loss": 1.0517, + "step": 16430 + }, + { + "epoch": 0.9982988030864572, + "grad_norm": 0.13694840669631958, + "learning_rate": 7.706734615975375e-10, + "loss": 1.0594, + "step": 16431 + }, + { + "epoch": 0.9983595601190838, + "grad_norm": 0.1576005518436432, + "learning_rate": 7.184400702631156e-10, + "loss": 1.0848, + "step": 16432 + }, + { + "epoch": 0.9984203171517103, + "grad_norm": 1.2473230361938477, + "learning_rate": 6.680394122604572e-10, + "loss": 1.1231, + "step": 16433 + }, + { + "epoch": 0.9984810741843368, + "grad_norm": 0.09906170517206192, + "learning_rate": 6.194714894380838e-10, + "loss": 0.9965, + "step": 16434 + }, + { + "epoch": 0.9985418312169634, + "grad_norm": 0.2739318311214447, + "learning_rate": 5.727363035668009e-10, + "loss": 1.1276, + "step": 16435 + }, + { + "epoch": 0.9986025882495899, + "grad_norm": 0.3435087203979492, + "learning_rate": 5.278338563730056e-10, + "loss": 1.1132, + "step": 16436 + }, + { + "epoch": 0.9986633452822165, + "grad_norm": 0.0959831178188324, + "learning_rate": 4.847641494942767e-10, + "loss": 1.0131, + "step": 16437 + }, + { + "epoch": 0.998724102314843, + "grad_norm": 1.0217487812042236, + "learning_rate": 4.4352718451268204e-10, + "loss": 1.0946, + "step": 16438 + }, + { + "epoch": 0.9987848593474695, + "grad_norm": 0.15110178291797638, + "learning_rate": 4.041229629325738e-10, + "loss": 1.1102, + "step": 16439 + }, + { + "epoch": 0.998845616380096, + "grad_norm": 0.10901308059692383, + "learning_rate": 3.6655148620834413e-10, + "loss": 1.0399, + "step": 16440 + }, + { + "epoch": 0.9989063734127225, + "grad_norm": 0.1671479493379593, + "learning_rate": 3.308127557111185e-10, + "loss": 1.0423, + "step": 16441 + }, + { + "epoch": 0.998967130445349, + "grad_norm": 0.23347000777721405, + "learning_rate": 2.9690677275651114e-10, + "loss": 1.1833, + "step": 16442 + }, + { + "epoch": 0.9990278874779756, + "grad_norm": 0.2965988218784332, + "learning_rate": 2.648335385824208e-10, + "loss": 1.1272, + "step": 16443 + }, + { + "epoch": 0.9990886445106021, + "grad_norm": 0.18708714842796326, + "learning_rate": 2.345930543601327e-10, + "loss": 1.1634, + "step": 16444 + }, + { + "epoch": 0.9991494015432286, + "grad_norm": 0.12446684390306473, + "learning_rate": 2.0618532121097213e-10, + "loss": 1.0628, + "step": 16445 + }, + { + "epoch": 0.9992101585758552, + "grad_norm": 0.17894360423088074, + "learning_rate": 1.7961034016189538e-10, + "loss": 1.203, + "step": 16446 + }, + { + "epoch": 0.9992709156084817, + "grad_norm": 0.24358119070529938, + "learning_rate": 1.5486811220100096e-10, + "loss": 1.102, + "step": 16447 + }, + { + "epoch": 0.9993316726411082, + "grad_norm": 0.0997956171631813, + "learning_rate": 1.3195863822201838e-10, + "loss": 1.061, + "step": 16448 + }, + { + "epoch": 0.9993924296737348, + "grad_norm": 0.09750929474830627, + "learning_rate": 1.1088191907426825e-10, + "loss": 1.0389, + "step": 16449 + }, + { + "epoch": 0.9994531867063613, + "grad_norm": 0.1518574357032776, + "learning_rate": 9.163795552935561e-11, + "loss": 1.1509, + "step": 16450 + }, + { + "epoch": 0.9995139437389878, + "grad_norm": 0.108207106590271, + "learning_rate": 7.422674828672093e-11, + "loss": 1.0395, + "step": 16451 + }, + { + "epoch": 0.9995747007716144, + "grad_norm": 0.1269640177488327, + "learning_rate": 5.864829798474247e-11, + "loss": 1.0621, + "step": 16452 + }, + { + "epoch": 0.9996354578042408, + "grad_norm": 0.10491638630628586, + "learning_rate": 4.490260520628731e-11, + "loss": 1.0369, + "step": 16453 + }, + { + "epoch": 0.9996962148368673, + "grad_norm": 0.11752914637327194, + "learning_rate": 3.298967043985357e-11, + "loss": 1.0706, + "step": 16454 + }, + { + "epoch": 0.9997569718694939, + "grad_norm": 0.14386291801929474, + "learning_rate": 2.2909494129530474e-11, + "loss": 1.1535, + "step": 16455 + }, + { + "epoch": 0.9998177289021204, + "grad_norm": 0.10731829702854156, + "learning_rate": 1.466207664724273e-11, + "loss": 1.0542, + "step": 16456 + }, + { + "epoch": 0.9998784859347469, + "grad_norm": 0.24370910227298737, + "learning_rate": 8.247418292750553e-12, + "loss": 1.0762, + "step": 16457 + }, + { + "epoch": 0.9999392429673735, + "grad_norm": 0.14225801825523376, + "learning_rate": 3.665519293649666e-12, + "loss": 1.0693, + "step": 16458 + }, + { + "epoch": 1.0, + "grad_norm": 0.16426825523376465, + "learning_rate": 9.16379833126868e-13, + "loss": 1.1252, + "step": 16459 + }, + { + "epoch": 1.0, + "step": 16459, + "total_flos": 1.4485486633586629e+20, + "train_loss": 1.1091928682934797, + "train_runtime": 49071.5718, + "train_samples_per_second": 171.723, + "train_steps_per_second": 0.335 + } + ], + "logging_steps": 1.0, + "max_steps": 16459, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4485486633586629e+20, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}